ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/go_db_update.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 3232 byte(s)
Log Message:
Line File contents
#!/usr/bin/perl
#
# go_db_update.pl
#
# Reads a gzip-compressed SQL dump (INSERT INTO ... VALUES (...),(...) lines)
# through a "gzip -cd" pipe and rewrites the rows of the tables listed in
# %go_tables as tab-delimited <table>.bcp files in the current directory.
# Rows of 'term' and 'term_definition' are held in memory and merged into a
# single go_term.bcp (term columns followed by the definition text).
#
use strict;
use warnings;
use Getopt::Std;
use FindBin;use lib $FindBin::Bin;

my $usage = q{Usage:
go_db_update.pl [-f <path/to/go_daily-termdb-data.gz>]

Prepares data from go_daily-termdb-data.gz in "bcp" tab delimited
format for the following tables:

go_term
go_term2term
go_synonym
go_xdb
go_dbxref
go_xref
go_rel_prop
go_rel_comp
};
umask 0002;

# -o <arg> is still accepted so existing command lines keep working, but
# nothing in this script reads it.
my %opt;
getopts('f:o:', \%opt) or die($usage."\n");
my $infile = $opt{f} || 'go_daily-termdb-data.gz';
die "$usage Error: cannot locate input file $infile\n" unless -f $infile;
# --
# - plan of action: parse term and term_definition data in memory so we can link them
# - parse the other tables as they are (no changes)
# --
my %go_tables=('term'=>'go_term', # special case - schema changed
 'term_definition'=>'definition', # special case: folded into go_term (value is only a "known table" marker)
 'term2term'=>'go_term2term',
 'term_synonym'=>'go_synonym',
 'relation_properties'=>'go_rel_prop',
 'relation_composition'=>'go_rel_comp',
 'db'=>'go_xdb',
 'dbxref'=>'go_dbxref',
 'term_dbxref'=>'go_xref'
 );
my %bcp_fh;   # output table name => open write handle for <name>.bcp
my @go_terms; # list of [id, name, term_type, acc, is_obsolete, is_root, is_relation]
my %go_defs;  # id=>definition

# Replacement for the character following a backslash in the dump.
# Escaped quotes become placeholder bytes so that they are not mistaken for
# field delimiters; parse_row() turns them back into real quotes at the end.
# \x01/\x02 are used because, unlike \x81/\x82, they can never be part of a
# multi-byte UTF-8 character.
# Escaped newline/CR/tab become a space (field data must not contain tabs or
# line breaks in the tab-delimited output). An escaped backslash is dropped,
# which is what the previous "delete every backslash" step did as well.
# Any escaped character not listed here is replaced by the character itself.
my %unescape = (
    "'"  => "\x01",
    '"'  => "\x02",
    'n'  => ' ',
    'r'  => ' ',
    't'  => ' ',
    '\\' => '',
);

# List-form pipe open: the file name is passed to gzip as a single argument
# and never goes through a shell, so spaces or metacharacters are harmless.
open(my $gz, '-|', 'gzip', '-cd', '--', $infile)
    or die("Error starting gzip -cd $infile pipe: $!\n");

while (my $line = <$gz>) {
    next unless $line =~ m/^\s*INSERT\s+INTO\s+\`?([\-\.\w]+)/i;
    my $table   = $1;
    my $mytable = $go_tables{$table};
    next unless $mytable;
    chomp $line;
    # Strip everything up to and including the "(" that opens the first row.
    # A statement that is not in the plain "INSERT INTO t VALUES (" form
    # cannot be split into rows reliably, so stop instead of emitting junk.
    unless ($line =~ s/^\s*INSERT\s+INTO\s+\`?[\-\.\w]+\`?\s+VALUES\s*\(//i) {
        die "Error: unsupported INSERT syntax for table $table (input line $.)\n";
    }
    # NOTE(review): rows are split on the literal text "),(" - a quoted
    # value that itself contains "),(" would be split in the wrong place.
    my @rows = split(/\),\(/, $line);
    next unless @rows;
    $rows[-1] =~ tr/\n\r//d;
    # Drop the ")" that closes the last row, together with the statement
    # terminator ";" when present.
    $rows[-1] =~ s/\)\s*;?\s*$//;

    my $bcpwrite = ($table ne 'term' && $table ne 'term_definition');
    my $out;
    if ($bcpwrite) {
        # Each output file is created on first use and kept open afterwards,
        # so later INSERT lines for the same table simply append to it.
        $out = $bcp_fh{$mytable};
        unless ($out) {
            open($out, '>', "$mytable.bcp")
                or die "Error creating $mytable.bcp: $!\n";
            $bcp_fh{$mytable} = $out;
        }
    }
    foreach my $rd (@rows) {
        my @f = parse_row($rd);
        if ($table eq 'term') {
            push(@go_terms, [@f]);
        }
        elsif ($table eq 'term_definition') {
            $go_defs{$f[0]} = $f[1] if @f;
        }
        else {
            print {$out} join("\t", @f)."\n"
                or die "Error writing to $mytable.bcp: $!\n";
        }
    } #for each row data
} #while GZ lines

# close() on a pipe also reports the exit status of the child process.
# gzip exits with 1 on error and 2 on a warning.
unless (close($gz)) {
    die "Error reading from gzip -cd $infile pipe: $!\n" if $!;
    my $exit_code = $? >> 8;
    my $signal    = $? & 127;
    if ($signal || $exit_code != 2) {
        die "Error: gzip -cd $infile failed (exit status $exit_code, signal $signal)\n";
    }
    warn "Warning: gzip -cd $infile finished with a warning (exit status 2)\n";
}

# Buffered write errors only show up at close time, so check every file.
foreach my $name (sort keys %bcp_fh) {
    close($bcp_fh{$name}) or die "Error writing to $name.bcp: $!\n";
}

# write go_term table
open(my $term_out, '>', 'go_term.bcp')
    or die("Error creating file go_term.bcp: $!\n");
foreach my $td (@go_terms) {
    my $tdef = $go_defs{$$td[0]} || '';
    print {$term_out} join("\t", @$td, $tdef)."\n"
        or die "Error writing to go_term.bcp: $!\n";
}
close($term_out) or die "Error writing to go_term.bcp: $!\n";
# --
#************ Subroutines **************

# parse_row($row_text)
# Splits the text of one VALUES row (without its enclosing parentheses) into
# a list of field values: escapes are resolved, whitespace runs are collapsed
# to a single space, and the quotes around string values are removed.
# Unquoted values (numbers, NULL) are returned as they appear in the dump.
sub parse_row {
    my ($rd) = @_;
    # Resolve backslash escapes in one left-to-right pass so that an escaped
    # backslash directly before a closing quote is not read as an escaped quote.
    $rd =~ s/\\(.)/exists $unescape{$1} ? $unescape{$1} : $1/ge;
    $rd =~ tr/\\//d;       # a lone backslash at the end of the row
    $rd =~ tr/\n\t\r / /s; #WARNING: tabs are clearly not allowed within field data!
    $rd =~ s/\,\s+'/,'/g;
    $rd =~ s/'\s+\,/',/g;
    my @f = ();
    # A field is either a quoted string ($1) or a run of non-comma text ($3).
    while ($rd =~ m/'([^'\\]*(\\.[^'\\]*)*)'|([^,]+)/g) {
        push(@f, defined($1) ? $1 : $3);
    }
    foreach my $s (@f) {
        $s =~ s/^['"]//;
        $s =~ s/['"]$//;
        $s =~ tr/\x01\x02/'"/; # restore the quotes hidden by %unescape
    }
    return @f;
}

Properties

Name Value
svn:executable *