ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/tbl2gff
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 13595 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 tbl2gff [-F] [-g <chr_table>] [-t <track>] -p <prot2nuc.tab> <ncbi_tbl_file>
8
9 Convert the given NCBI tbl file to GFF3
10
11 Options:
12 -p protein_id to transcript_id mapping file (required)
13 this file has 2 columns:
14 GenPept accession and (nucleotide) accession
15 it is required in order to assign CDS to their
16 corresponding mRNA exons
17 -t use given track name instead of 'gb_tbl'
18 -g replace the genomic sequence names according
19 to the translation table <chr_table>
20 (only the first 2 columns in that file are used)
21 -M only outputs mRNA entries (without gene lines)
22 -F preserve all notes and comments
23 /;
24 umask 0002;
25 my %kattrs;
26 #key attributes to always keep
27 @kattrs{'gene', 'product', 'transcript_id'}=();
28 getopts('FMt:g:p:') || die($usage."\n");
29 my $gxfile=$Getopt::Std::opt_g;
30 my $p2nfile=$Getopt::Std::opt_p;
31 my $mrnaonly=$Getopt::Std::opt_M;
32 die("Error: -p is required -- to map protein_id to transcript_id\n") unless $p2nfile && -f $p2nfile;
33 my %pmap; # protein_id => transcript_id
34 open(PROTMAP, $p2nfile) || die("Error opening $p2nfile!\n");
35 while (<PROTMAP>) {
36 chomp;
37 my ($p, $n)=split;
38 next unless $n;
39 $p=~s/\.\d+$//;
40 $n=~s/\.\d+$//; # discard versioning
41 $pmap{$p}=$n;
42 }
43 close(PROTMAP);
44
45 my $track=$Getopt::Std::opt_t || 'gb_tbl';
46 my $allattrs=$Getopt::Std::opt_F;
47 my $last_line;
48 my %gx; # translation table for genomic sequence names ('>Feature' line)
49 if ($gxfile) {
50 open(GX, $gxfile) || die("Error opening $gxfile\n");
51 while(<GX>) {
52 chomp;
53 my @t=split;
54 if ($t[0] && $t[1]) {
55 $gx{$t[0]}=$t[1];
56 $gx{$t[1]}=$t[0];
57 }
58 }
59 close(GX);
60 }
61
62 my $tblfile=$ARGV[0];
63 die($usage."\n") if $tblfile eq '-h' || ! -f $tblfile;
64
65 my %gids; #ensure unique gene IDs
66 #allow for duplication $gids{$gene}=[ [gstart1, gend1], [gstart2, gend2] ]
67 my %tids; # and transcript IDs
68
69 # -- per gbase:
70
71 my %genes; # genes{$gene}=[$gbase, $strand, $gstart, $gend, [@gxrefs], \%gattrs, \%rnas, isPseudo]
72 # 0 1 2 3 4 5 6 7
73 #my %rnas;
74 # %rnas hash holds all mRNA data for a single gene (i.e. all isoforms)
75 # 0 1 2 3 4 5 6
76 # $rnas{transcript_id}=[ $featname, [@exon], [@cds], \%tattrs, [@txrefs], $partial3, $partial5 ]
77 #current values:
78 my ($gbase, $curfeat, @exon, %cattrs, $gpseudo, $gstart, $gend, $partial5, $partial3);
79 my $strand='+';
80
81 # product and transcript_id are found in %tattrs
82 while (<>) {
83 $last_line=$_;
84 if (m/^>Feature\s+(\S+)/) { # misnomer - in fact this is the genomic sequence name
85 my $newbase=$1;
86 flushGSeq(); #write previous gene feature
87 $gbase='';
88 if ($gxfile) {
89 $gbase=$gx{$newbase};
90 unless ($gbase) {
91 if ($newbase=~m/^[a-z]+\|([\w\.]+)/) {
92 $newbase=$1;
93 $gbase=$gx{$newbase};
94 unless ($gbase) {
95 $newbase=~s/\.\d+$//;
96 $gbase=$gx{$newbase};
97 }
98 }
99 }
100 } #convert contig name
101 $gbase=$newbase unless $gbase;
102 next;
103 } # base genomic sequence (contig/chromosome) established
104 chomp;
105 tr/\n\r//d;
106 my @t=split(/\t/);
107 my ($start, $end)=@t[0,1];
108 if ($start && $end) { # segment coordinates
109 $partial5=1 if ($start=~s/^<//);
110 $partial3=1 if ($end=~s/^>//);
111 my $newstrand='+';
112 if ($end<$start) {
113 $newstrand='-';
114 ($start, $end)=($end, $start);
115 }
116 if ($t[2]) { # feature start (gene/mRNA/CDS)
117 flushFeature() unless $t[2] eq 'exon'; #cleanup current gene/transcript data
118 $strand=$newstrand;
119 if ($t[2] eq 'gene') {
120 #new gene feature starts
121 $curfeat='gene';
122 $gstart=$start;
123 $gend=$end;
124 } # gene start
125 else { #non-gene features starting
126 if ($t[2] eq 'exon') { # special case, exons given one by one
127 if ($curfeat ne 'RNA_exon') { #first exon
128 flushFeature(); #just in case there was another mRNA (exons) given before
129 }
130 $curfeat='RNA_exon';
131 push(@exon, [$start, $end]);
132 }
133 elsif ($t[2] eq 'CDS' || $t[2]=~m/RNA/) {
134 $curfeat=$t[2];
135 push(@exon, [$start, $end]);
136 }
137 else { $curfeat=''; next; }
138 # -- skipping any other features here
139 } #non-gene features starting
140 } # feature start
141 else { # feature continuation
142 $strand=$newstrand;
143 if ($curfeat eq 'CDS' || $curfeat=~/RNA/) {
144 push(@exon, [$start, $end]);
145 }
146 else {
147 die("Error: unexpected feature '$curfeat' continuation\n$last_line");
148 }
149 }
150 } # had segment coordinates
151 else { # no segment coords => attributes for current feature
152 my ($attr, $value)=@t[3,4];
153 $value=~tr/;/,/;
154 if ($attr eq 'db_xref' || $attr eq 'gene_syn') {
155 push(@{$cattrs{$attr}}, $value);
156 }
157 else {
158 $cattrs{$attr}=$value if $attr ne 'number';
159 }
160 } # attributes block
161 } #while input lines
162
163 flushGSeq();
164
165 #================ SUBROUTINES ============
166
167 sub writeGene {
168 my ($gene, $gdata)=@_;
169 my $geneid=$gene;
170 my ($gene_name)=($geneid=~m/^([^~]+)/);
171 $geneid=~tr/~/_/;
172 ($gbase, $strand, $gstart, $gend, $gpseudo)=
173 ($gdata->[0], $gdata->[1], $gdata->[2], $gdata->[3], $gdata->[7]);
174 my $gfeature=($gpseudo)?'pseudogene' : 'gene';
175 #makeUniqID(\$gene, \%gids); # $gene will be updated if necessary
176 #my $geneid=$gene;
177 #$geneid=~tr/~/_/;
178 my $attrs="ID=$geneid;Name=$gene_name";
179 my @gxrefs=@{$gdata->[4]};
180 $attrs.=';xrefs='.join(',', @gxrefs) if (@gxrefs>0);
181 my $gattrs=$gdata->[5];
182 unless ($mrnaonly) {
183 foreach my $k (keys(%$gattrs)) {
184 $attrs.=";$k=".$gattrs->{$k};
185 }
186 print join("\t", $gbase, $track, $gfeature, $gstart, $gend, '.', $strand, '.', $attrs)."\n";
187 }
188
189 my $rnas=$gdata->[6];
190 foreach my $tid (keys(%$rnas)) {
191 #die("Error: no transcript_id found at write_RNA for gene $gene ($geneid)\n$last_line\n") unless $tid;
192 my $tdata=$rnas->{$tid};
193 my $rnafeat=$tdata->[0];
194 if ($rnafeat eq 'RNA_exon') {
195 $rnafeat = $gpseudo ? 'pseudo_RNA' : 'misc_RNA';
196 }
197 if ($mrnaonly) {
198 $rnafeat=~s/misc_RNA/mRNA/;
199 }
200
201 next if $mrnaonly && $rnafeat ne 'mRNA';
202 my $texons=$tdata->[1];
203 my $tcds=$tdata->[2];
204 my $tahref=$tdata->[3]; #attributes
205 my $txar=$tdata->[4]; # db_xrefs
206 my ($part3, $part5)=($tdata->[5], $tdata->[6]);
207 makeUniqID(\$tid, \%tids);
208 @exon=sort { $main::a->[0] <=> $main::b->[0] } @$texons;
209 my ($tstart, $tend)=($exon[0]->[0], $exon[-1]->[1]);
210 my $attrs="ID=$tid";
211 $attrs.=";Parent=$geneid" unless $mrnaonly;
212 $attrs.="Name=$gene_name;gene_name=$gene_name";
213 $attrs.=';partial=1' if ($part5 || $part3);
214 my $product=$tahref->{'product'};
215 if ($product) {
216 $product=~tr/"/'/; #"
217 $attrs.=';product="'.$product.'"';
218 }
219 if ($allattrs) {
220 $attrs.=';xrefs='.join(',', @$txar) if (@$txar > 0);
221 foreach my $k (keys(%$tahref)) {
222 next if exists($kattrs{$k});
223 $attrs.=";$k=".$tahref->{$k};
224 }
225 }
226 print join("\t", $gbase, $track, $rnafeat, $tstart, $tend, '.', $strand, '.', $attrs)."\n";
227 foreach my $ex (@exon) {
228 print join("\t", $gbase, $track, 'exon', $$ex[0], $$ex[1], '.', $strand, '.', "Parent=$tid")."\n";
229 }
230 @exon=();
231 #write_CDS();
232 my @cds=@$tcds;
233 if (@cds>0) { #have CDS, write it
234 @cds=sort { $main::a->[0] <=> $main::b->[0] } @cds;
235 my $codonstart=$tahref->{'codon_start'};
236 if ($codonstart>1) {
237 $codonstart--;
238 if ($strand eq '-') {
239 $cds[-1]->[1]-=$codonstart;
240 }
241 else {
242 $cds[0]->[0]+=$codonstart;
243 }
244 }
245 else { $codonstart='0'; }
246
247 foreach my $cd (@cds) { #TODO: calculate phase ?
248 print join("\t", $gbase, $track, 'CDS', $$cd[0], $$cd[1], '.', $strand, '.', "Parent=$tid")."\n";
249 }
250 }
251 } #for each transcript
252 }
253
254 sub makeUniqID {
255 my ($idref, $href)=@_;
256 my $id=$$idref;
257 $id=~s/~\d+$//;
258 my $c = ++$$href{$id};
259 if ($c>1) {
260 $c--;
261 $$idref=$id.'.m'.$c;
262 }
263 return $$idref;
264 }
265
266 sub flushFeature { #called after an attribute block has been parsed into %cattrs
267 return unless $curfeat && keys(%cattrs)>0;
268 my $gene=$cattrs{'gene'};
269 die("Error: no gene attribute found for current feature $curfeat\n")
270 unless $gene;
271 if ($curfeat eq 'gene') { # gene attributes processing
272 #make sure it's unique
273 push(@{$gids{$gene}},[$gbase, $gstart, $gend]);
274 my $gno=scalar(@{$gids{$gene}})-1;
275 $gene.='~'.$gno if $gno>0;
276 if (exists($genes{$gene})) { #should never happen, we just made sure it's unique!
277 my $prev_gstart=$genes{$gene}->[2];
278 my $prev_gend=$genes{$gene}->[3];
279 # if (abs($prev_gstart-$gstart)>30000) {
280 die("Error -- gene $gene is duplicated ($prev_gstart vs $gstart) !?\n");
281 # }
282 # else { #update min..max coordinates
283 # $genes{$gene}->[2]=$gstart if $gstart<$prev_gstart;
284 # $genes{$gene}->[3]=$gend if $gend>$prev_gend;
285 # ${$genes{$gene}->[5]}{'fragmented'}=1;
286 # }
287 }
288 else
289 { # create a new gene entry here
290 my $pseudo= delete($cattrs{'pseudo'}) ? 1 : 0;
291 # 0 1 2 3 4 5 6 7
292 $genes{$gene}=[$gbase, $strand, $gstart, $gend, [], {}, {}, $pseudo];
293 }
294 delete $cattrs{'gene'};
295 if ($allattrs) {
296 if (exists($cattrs{'db_xref'})) {
297 $genes{$gene}->[4]=[@{$cattrs{'db_xref'}}];
298 delete $cattrs{'db_xref'};
299 }
300 #another special case: gene_syn
301 if (exists($cattrs{'gene_syn'})) {
302 #convert [list] to a string with comma delimited names
303 $cattrs{'gene_syn'}=join(',',@{$cattrs{'gene_syn'}});
304 }
305 my $garef=$genes{$gene}->[5];
306 my @k=keys(%cattrs);
307 @{$garef}{@k}=(values(%cattrs));
308 } #all attributes are kept
309 @exon=();
310 %cattrs=();
311 ($partial3, $partial5)=();
312 return; #gene data added
313 } # stored gene attributes
314 # else
315 # - storing RNA attributes
316 # now make sure we place this RNA within the right gene - in case of gene duplications
317 # locate the matching $gene
318 @exon= sort { $main::a->[0] <=> $main::b->[0] } @exon;
319 my ($xstart, $xend)=($exon[0]->[0], $exon[-1]->[1]);
320 my $gxdata=$gids{$gene};
321 die("Error: gene $gene not found in \%gids !\n") unless $gxdata; #should never happen
322 if (@$gxdata>1) {
323 my $i=0;
324 my $found;
325 foreach my $gd (@$gxdata) {
326 if ($gd->[0] eq $gbase && $gd->[1]<=$xstart && $gd->[2]>=$xend) {
327 $found=1;
328 last;
329 }
330 $i++;
331 }
332 die("Error: cannot find gene enclosing RNA data ".
333 $cattrs{'transcript_id'}." ".$cattrs{'protein_id'}." (on $gbase, $xstart..$xend)\n")
334 unless $found;
335 $gene.='~'.$i if $i>0;
336 }
337
338 my $geneid=$gene;
339 my $gdata=$genes{$gene};
340 die("Error: \$genes data for gene '$gene' cannot be found!\n") unless $gdata;
341 my $rnas=$gdata->[6];
342 $geneid=~tr/~/_/;
343 my ($tid, $pid);
344 if ($curfeat eq 'CDS') { # just finished a CDS parsing
345 my $pid=$cattrs{'protein_id'};
346 die("Error: no protein_id found for CDS feature (gene $geneid)!\n") unless $pid;
347 #$pid=~s/^[a-z]+\|//i; # ref|NM_002314.1|
348 #$pid=~s/\|(\w*)$//; # some entries have a suffix - e.g. ref|NM_006824.1|P40
349 my @t=split(/\|/,$pid);
350 $pid=$t[1] if (@t>1);
351 $pid=~s/\.\d+$//; #remove version
352 $tid=$pmap{$pid};
353 die("Error: cannot get transcript for protein id $pid (gene $geneid)!\n") unless $tid;
354 unless (exists($rnas->{$tid})) { #this should never happen - CDS without mRNA definition?
355 # 0 1 2 3 4 5 6
356 #$rnas->{$tid}=['mRNA', [@exons], [@cds], {}, [], 0, 0];
357 $tid=~s/\.\d+$//;
358 die("Error: cannot find mRNA entry for protein $pid (reported as coming from transcript $tid)\n")
359 unless exists($rnas->{$tid});
360 }
361 # #just add the CDS
362 push(@{$rnas->{$tid}->[2]}, @exon);
363 # }
364 } #CDS
365 else { # exons (RNA feature)
366 $tid=$cattrs{'transcript_id'};
367 if ($tid) {
368 my @t=split(/\|/,$tid);
369 $tid=$t[1] if (@t>1);
370 #$tid=~s/^[a-z]+\|//i;
371 #$tid=~s/\|\w*$//; # some entries have a suffix - e.g. ref|NM_006824.1|P40
372 $tid=~s/\.\d+$//; #remove version!
373 }
374 else {
375 $tid=$geneid.'_t'; #some special cases with 'exon' features for pseudo genes
376 }
377 unless (exists($rnas->{$tid})) {
378 # 0 1 2 3 4 5 6
379 $rnas->{$tid}=[$curfeat, [@exon], [], {}, [], 0, 0];
380 }
381 else { #just add the exons
382 push(@{$rnas->{$tid}->[1]}, @exon);
383 }
384 } #exons
385 my $tdata=$rnas->{$tid};
386 $tdata->[5]=1 if $partial5;
387 $tdata->[6]=1 if $partial3;
388 # add attributes
389 delete @cattrs{'gene', 'transcript_id', 'protein_id'};
390 my $product = delete $cattrs{'product'};
391 my $ahref=$tdata->[3];
392 $ahref->{'product'}=$product if $product;
393 if ($allattrs) {
394 my %xrefs;
395 my @cx;
396 @cx=@{$cattrs{'db_xref'}} if exists $cattrs{'db_xref'};
397 push(@cx, @{$tdata->[4]});
398 @xrefs{@cx}=();
399 @cx=keys(%xrefs);
400 $tdata->[4]=[@cx];
401 delete $cattrs{'db_xref'};
402 #add new attributes
403 foreach my $k (keys %cattrs) {
404 $ahref->{$k}=$cattrs{$k};
405 }
406 }
407 #--> clean up
408 @exon=();
409 %cattrs=();
410 ($partial3, $partial5)=();
411 }
412
413 sub flushGSeq {
414 return unless keys(%genes);
415 flushFeature(); # write last RNA feature read
416 my $gcount=keys(%genes);
417 while (my ($g,$gd)=each(%genes)) {
418 writeGene($g,$gd);
419 }
420 ($gbase, $curfeat, @exon, %cattrs, $gpseudo, $gstart, $gend,
421 $partial5, $partial3)=();
422 $strand='+';
423 @exon=();
424 %cattrs=();
425 %genes=();
426 #-- debug only:
427 #exit;
428 }

Properties

Name Value
svn:executable *