ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/jgff2tbl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 2 months ago) by gpertea
File size: 22625 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5 use dbSession;
6
7 my $usage = q/Usage:
8 jgff2tbl -f <fasta_file> [-a <acc2version_table>] [-A] [-N] [-t <track>]
9 [-o <out_prefix>] [-c 'organism_code'] [-n '<organism_name>'] <gff_file>
10
11 Default <track> used for features is 'jigsaw'.
12
13 Creates a .fsa and .tbl files to be used by tbl2asn
14
15 /;
16 umask 0002;
17 getopts('NDn:c:Aa:f:t:o:') || die($usage."\n");
18 my %stop=('TAA'=>1,'TAG'=>1,'TGA'=>1, 'TAR'=>1,'TRA'=>1);
19
20 my $fastafile=$Getopt::Std::opt_f || die("$usage A fasta file is required!\n");
21 my $defpre=$fastafile;
22 if ($defpre=~m/[\/\\]([\.\,\; \(\)\$\~\-\w]+)$/) {
23 $defpre=$1; #remove path components
24 }
25 ($defpre)=($defpre=~m/^(\w+)/);#keep only the base part of the file name
26 my $nodb=$Getopt::Std::opt_N;
27 my @tbl;
28 my $orgcode=$Getopt::Std::opt_c;
29 my $debug=$Getopt::Std::opt_D;
30 unless ($orgcode) {
31 ($orgcode)=($defpre=~m/([a-z,_]+)\d+$/i);
32 $orgcode=~tr/-_//d;
33 $orgcode=uc($orgcode);
34 $orgcode='BB' if $orgcode eq 'BS';
35 }
36 my $accfile=$Getopt::Std::opt_a;
37 my $accparse=$Getopt::Std::opt_A;
38
39 my $outprefix=$Getopt::Std::opt_o || $defpre;
40 my $orgname=$Getopt::Std::opt_n;
41 my $track=$Getopt::Std::opt_t || 'jigsaw';
42 # die("$usage A feature track must be specified!\n");
43 my $gff=shift(@ARGV) || die("$usage An input gff3 file must be given!\n");
44 die("$gff: nothing to do.") if (-e $gff && (-s $gff<10));
45 my %acc;
46 if ($accfile) {
47 open(ACCFILE, $accfile) || die ("Error opening $accfile!\n");
48 chomp;
49 while (<ACCFILE>) {
50 my @a=split;
51 $acc{$a[0]}=$a[1];
52 }
53 close(ACCFILE);
54 }
55 open(FSA, '>'.$outprefix.'.fsa')
56 || die ("Error creating file $outprefix.fsa !\n");
57
58 open(FA, $fastafile)||die("Error opening $fastafile !\n");
59 my ($seqid, $seqlen, $ntseq, $encname);
60 while (<FA>) {
61 if (m/^>(\S+)\s*(.*)/) {
62 last if $seqid;#only one sequence !
63 ($seqid, my $rest)=($1, $2);
64 if ($accfile) {
65 my $accver=$acc{$seqid};
66 $seqid=$accver if $accver;
67 }
68 my ($enc)=($rest=~m/EN([mr]\d+)/);
69 if ($enc) {
70 $encname='EN'.$enc;
71 $orgcode.='_'.$enc
72 }
73 if ($accparse) {
74 my ($short, $acc)=split(/\|/,$seqid);
75 $acc=$short unless $acc;
76 if ($acc=~m/(\w+)v(\d+)$/) {
77 $seqid=$1.'.'.$2;
78 }
79 }
80 print FSA '>'.$seqid;
81 print FSA " [organism=$orgname]" if $orgname;
82 print FSA " [gcode=1] [primary=$seqid] $rest\n";
83 next;
84 }
85 print FSA $_;
86 tr/\n\r\t //d;
87 $ntseq.=uc($_); # for large sequences, better write this into a memory mappable file
88 $seqlen+=length($_);
89 }
90 close(FSA);
91
92 open(GFF, $gff) || die("Error opening the input gff file: $gff !\n");
93 #print FTBL join("\t", '', '', '','note', $credits)."\n";
94
95 my %gn; #unique gene names
96 {
97 my $cparent; #current parent - ASSUMES lines are ordered properly (children following parent)
98 my $cf; #current parent feature name (e.g. 'mRNA' or 'gene')
99 my $cxf; # child feature type: 'exon' or 'CDS'
100
101 my $cstrand;
102 my ($cstart, $cend);
103 #TODO: also use @ex for mRNA fkey (when UTRs are available)
104 my @cx; #children intervals
105 my ($cdescr, $gene_name, $mcov, $pxgap, $pnostart, $pnostop, $qxgap);
106 while (<GFF>) {
107 next if m/^\s*#/;
108 chomp;
109 my ($chr, $ftrack, $f, $fstart, $fend, $fscore, $strand, $frame, $attrs)=split(/\t/);
110 next unless lc($track) eq lc($ftrack);
111 ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
112 $f='exon' if ($f =~m/\-exon$/);
113 my ($fid)=($attrs=~m/ID\s*=\s*"?([^;"]+)/);
114 my ($fp)=($attrs=~m/Parent\s*=\s*"?([^;"]+)/);
115 unless ($fid || $fp) {
116 print STDERR "Warning ($defpre, $encname): feature line $_ has neither ID nor Parent! skipping..\n";
117 next;
118 }
119 if ($fp) { #child feature ('exon' or 'CDS')
120 if ($cparent && $cparent ne $fp) {
121 die("Error ($defpre): invalid order of input lines (unknown parent $fp while still processing $cparent\n$_\n");
122 }
123 if ($cxf && $cxf ne $f) {
124 # TODO: will permit both 'exon' and 'CDS' subfeatures later
125 die("Error ($defpre): multiple subfeatures are not accepted (parent $fp, $cf has subfeature $cxf already)\n");
126 }
127 $cxf=$f;
128 my ($mevcount)=($attrs=~m/mappingEvCount=(\d+)/i);
129 # 0 1 2 3
130 push(@cx, [$fstart, $fend, $frame, $mevcount]);
131 }
132 else { #parent (mRNA) line
133 if ($cparent ne $fid) {
134 storeFeature($cparent, $cstrand, $cstart, $cend, $cdescr, $mcov,
135 $gene_name, [@cx], $pxgap, $pnostart, $pnostop, $qxgap) if @cx>0;
136 $cparent=$fid;
137 $cstrand=$strand;
138 $cf=$f;
139 $cxf='';
140 ($cstart, $cend)=($fstart, $fend);
141 @cx=();
142 $cdescr='';
143 ($pxgap, $pnostart, $pnostop, $qxgap)=();
144 }
145 else {
146 die("Error ($defpre): duplicate parent entry for $fid\n");
147 }
148 # try to parse the description/annotation, if any
149 ($mcov)=($attrs=~m/MCov=([\d\.]+)/i);
150 ($gene_name)=($attrs=~m/GeneId=([^\;]+)/i);
151 $gene_name=uc($gene_name);
152 ($pxgap)=($attrs=~m/ExonGap=(\d+:\d+)/i);
153 ($qxgap)=($attrs=~m/QXGap=([\d+\-\,]+)/i);
154 $pnostart=1 if $attrs=~m/noStart/i;
155 $pnostop=1 if $attrs=~m/noStop/i;
156 if ($attrs=~m/TopHit\s*=\s*"?([^;"]+)/i) {
157 $cdescr=$1;
158 }
159 elsif ($attrs=~m/descr\s*=\s*"?([^;"]+)/i) {
160 $cdescr=$1;
161 }
162 elsif ($attrs=~m/Info\s*=\s*"?([^;"]+)/i) {
163 $cdescr=$1;
164 }
165 elsif ($attrs=~m/BestHit\s*=\s*"?([^;"]+)/i) {
166 $cdescr=$1;
167 }
168 elsif ($attrs=~m/Name\s*=\s*"?([^;"]+)/) {
169 $cdescr=$1;
170 }
171 if ($cdescr) {
172 $cdescr=~s/^\s+//;$cdescr=~s/\s+$//;
173 $cdescr='' if index(lc($fid),lc($cdescr))>=0;
174 $cdescr=~s/\x01.+//;
175 }
176 } #parent
177
178 }
179
180 storeFeature($cparent, $cstrand, $cstart, $cend, $cdescr, $mcov,
181 $gene_name, [@cx], $pxgap, $pnostart, $pnostop, $qxgap) if @cx>0;
182 } #input parsing block
183 close(GFF);
184
185 my ($ds, $sth);
186 my $sql='select xref_data from uniprot_xref where xrefdb in ("GenBank", "EMBL") '.
187 'and xref_data not like "NOT_ANNOTATED%" and up_id=?';
188
189 unless ($nodb) {
190 # $ds = dbSession->new('geanno@NEOSYBASE');
191 #$ds = dbSession->new('common@EVERGREEN');
192 $ds = dbSession->new('common@CBCBMYSQL');
193 $sth=$ds->prep($sql);
194 }
195 my %gdup;
196
197
198 open(FTBL, '>'.$outprefix.'.tbl')
199 || die("Error creating $outprefix.tbl file!\n");
200 open(GFFOUT, ">$outprefix.tbl.gff3")
201 || die("Error creating $outprefix.tbl.gff3 file!\n");
202 print FTBL ">Feature\t$seqid\n";
203 #print FTBL join("\t", 1, $seqlen, 'misc_feature')."\n";
204 my $credits='Predicted annotation generated by JIGSAW and other methods, and provided'.
205 ' by Geo Pertea, Mihaela Pertea, Jonathan Allen and Steven Salzberg, Center for'.
206 ' Bioinformatics and Computational Biology, University of Maryland';
207
208
209 # -- write sequencing gaps track to gff file;
210 #---
211 my @sgaps=getSequencingGaps();
212 my $sgnum=0;
213 foreach my $d (@sgaps) {
214 $sgnum++;
215 print GFFOUT join("\t", $seqid, 'seqgap', 'gap', $d->[0], $d->[1], '.', '+',
216 '.', "ID=Seqgap_$sgnum;Name=Seqgap_$sgnum\n");
217 }
218
219 my $re;
220 $re = qr/\s*\(([^\)\(]+|(??{$re}))+\)/x;
221
222 foreach my $t (@tbl) {
223 writeFeature($t);
224 }
225
226 close(FTBL);
227 close(GFFOUT);
228
229 if ($sth) {
230 $sth->finish();
231 $ds->logout();
232 }
233
234
235 sub storeFeature {
236 my ($cparent, $cstrand, $cstart, $cend, $cdescr, $mcov,
237 $gene_name, $cxref, $xgap, $nostart, $nostop, $qgap)=@_;
238 $cparent=~tr/.,:/___/s;
239
240 unless($mcov>50 && $cdescr) {
241 $gene_name='';
242 }
243 $gn{$gene_name}++ if $gene_name;
244 push(@tbl, [$cparent, $cstrand, $cstart, $cend, $cdescr, $mcov,
245
246 $gene_name, $cxref, $xgap, $nostart, $nostop, $qgap]),
247 }
248
249 sub writeFeature {
250 my $d=$_[0];
251 my ($parent, $gstrand, $gstart, $gend, $gdescr, $acov, $gname, $ex,
252 $xgap, $nostart, $nostop, $qgap)=@$d;
253 print STDERR "writeFeature($parent, $gdescr, $gname)..\n" if $debug;
254 my @cx= sort { $main::a->[0]<=>$main::b->[0] } @$ex;
255 my @gffx = @cx;
256 #if ($gstrand eq '-') { #reverse complement features
257 # ($gstart, $gend)=($gend, $gstart);
258 # @cx = map { $_=[$_->[1], $_->[0], $_->[2]] } @cx;
259 # @cx= reverse(@cx);
260 # }
261 my $s="\t";
262 my $cid='';
263 my $infrsim;
264 my $unipr=0;
265 if ($gdescr) {
266 my $hdescr;
267 ($cid, $hdescr)=split(/ /,$gdescr,2);
268 $gdescr=$hdescr if $hdescr;
269 $gdescr=~s/^gid:\S+\s+//;
270 $gdescr=~s/^CDS:\d+\-\d+\s+//;
271 $gdescr=~s/^CDS:join\([\d\,\-]+\)\s+//;
272 $gdescr=~s/UniRef100_\S+\s+//;
273 if ($cid=~m/^UPr?\|(\w+)\|acc\|(\w+)/) {
274 $unipr=1;
275 my ($upid, $upacc)=($1,$2);
276 $cid=up2gb($upid);
277 if ($cid) {
278 $infrsim='AA sequence:INSD:'.$cid;
279 $cid.=' ';
280 }
281 else { #no native GB entry, use SwissProt accession
282 $infrsim='AA sequence:SP:'.$upacc;
283 $cid=$upacc;
284 $cid.=' ';
285 }
286 }
287 elsif ($cid=~m/[modinfpxvr]+\|([\w\.]+)/) { #refseq entry
288 $cid=$1;
289 $cid =~ s/\.[24a-z]+\d+$//;
290 my $refacc=$cid;
291 $refacc.='.1' unless $refacc=~m/\.\d+$/;
292 $infrsim='RNA sequence:RefSeq:'.$refacc;
293 $cid.=' ';
294 }
295 else {
296 $cid='';
297 }
298 }
299 ##print FTBL join($s, '', '', '','locus_tag', $parent)."\n";
300 my $note;
301 my $product='hypothetical mRNA';
302 #my $product='hypothetical protein';
303 my $prot_desc;
304 my $cdsproduct='hypothetical protein';
305 my $gproduct='';
306 my $gffnote='';
307 if ($gdescr) {
308 #print STDERR " re processing start ($gdescr)..\n" if $debug;
309 # my ($fp)=($gdescr=~m/($re)/); #first note within parentheses
310 #print STDERR " re m/ processing end.\n" if $debug;
311 # $fp=~s/^\s*\(\s*//;
312 # $fp=~s/\s*\)\s*$//;
313 $gdescr=~tr/[]/()/;
314 $gdescr=~s/$re//g; #remove all parenthetical constructs
315 $gdescr=~tr/ / /s; #squash duplicated spaces
316 if ($acov>50) {
317 my $d=$gdescr;
318 $d=~s/UniRef100_\S+\s+//;
319 my $species;
320 #remove organism name
321 $species=$1 if ($d=~s/\s+\{([ \.\,\-\"\'\w]+)\}$//);
322 # is the product provided?
323 if ($d=~/\bproduct:\s*(\S[^\:\{\;]+)/i) {
324 my $v=$1;
325 if ($v=~m/\bhypothetical\b/ || $v=~m/\bLOC\d+/ || $v=~m/\bRIKEN\b/) {
326 $d=~s/\s*product:[^\:\{;]+//;
327 }
328 else { $d=$v; }
329 }
330 # make words start with lowercase: (proper names will mess this up)
331 $d =~ s/([A-Z][a-z]{5,})\b/\L$1/g;
332 #-- remove/convert isoform info from product descriptions
333 $d=~s/(?:transcript|splice)\s+variant\s+([\w\.\-\;]{1,5})$/isoform $1/i;
334 $d=~s/\,?\s*full insert(?: sequence)//i;
335 $d=~s/\,?\s*full (?:insert|sequence)//i;
336 $d=~s/\,?\s*\bform\d+//i;
337 $d=~s/phosphorylase\, glycogen[\;\.]\s*muscle/muscle glycogen phosphorylase/i;
338 $d=~s/^\s+//;$d=~s/[\;\:\,\. ]*$//;
339 if ($d=~s/^\s*similar\s+(to|with)\s+//i) {
340 $d=~s/homolog\w*$//i;
341 $d.=' homolog';
342 }
343 $note='similar to '.$cid.$d;
344 $note.=' ('.$species.')' if $species;
345 $gffnote=$note.", $acov\% coverage";
346 $d=~s/\s*\bLOC\d+//;
347 # $d=~s/\s*\bproduct\b//i;
348 # $d=~s/\s*\bC[\dxy]+orf\d+//i;
349 if ($d =~m/\bhypothetical\b/ || $d=~m/_predicted\b/i || $d=~m/\bRIKEN\b/ || $d=~m/\d+RIK\b/i
350 || $d=~m/predicted gene/i || $d=~m/open reading frame\s+\d+/i || $d=~m/uncharacterized/i
351 || $d=~m/with sequence similarity/i || $d=~m/conserved gene/i
352 || $d=~m/on chr/i || $d=~m/\bchromosome\b/i || $d=~m/\s*\bC[\dxy]+orf\d+/i
353 || $d=~m/\b[xy\d]+[pq]\d+\b/i || $d=~m/telomeric to/i || $d=~m/neighbor (?:of|to)/i
354 || $d=~m/^\s*(?:protein|product)\s*$/i
355 || ($unipr && $d=~m/^[A-Z]+\d{9,}$/)
356 ) {
357 $d='';
358 }
359 else {
360 $d=~s/\,?\s*nuclear gene encoding mitochondrial protein//i;
361 $d=~s/^\s+//;$d=~s/[\;\:\,\. ]*$//;
362 $d=~s/\,?\s*\d+\s*kDa\b//i;
363 $d=~s/gene$/protein/;
364 $d=~tr/ / /s;
365 if ($d) {
366 $product=$d.' (predicted)'; #unless $d =~ m/hypothetical/;
367 $cdsproduct=$product;
368 }
369 }
370 $gname='' unless $d;
371 $gname='' if ($gname=~m/LOC\d+/ || $gname=~m/\s*\bC[\dxy]+orf\d+/i || $gname=~m/\d+RIK$/i);
372 }
373 else {
374 $gname='';
375 }
376 }
377 my $lpid; #local protein id
378 my $cdsnote;
379 if ($gname) {
380 if ($gn{$gname}>1) {
381 my $gnum = ++$gdup{$gname};
382 $gname=~s/[ \-_]?predicted$//i;
383 $cdsnote='distinct genes were predicted for gene locus '.$gname;
384 $gname.='_'.$gnum;
385 }
386 $gname=~s/[ \-_]?predicted$//i;
387 $lpid=$orgcode.'_'.$gname;
388 }
389 else {
390 my $jname=$parent;
391 $jname=~tr/_//d;
392 $jname=~s/tjsm/jsm/;
393 $lpid=$orgcode.'_jsm'.uc(sprintf('%x',$gstart)).($gstrand eq '+' ? 'f':'r');
394 $gname=$lpid;
395 }
396
397 my @xsplit=( 0 );
398 #--
399 $gffnote = $product unless $gffnote;
400 print GFFOUT join("\t", $seqid, 'jigsaw', 'mRNA', $gffx[0]->[0], $gffx[-1]->[1], '.',
401 $gstrand, '.', "ID=$parent;Name=$gname;descr=\"$gffnote\"\n");
402 foreach my $gex (@gffx) {
403 print GFFOUT join("\t", $seqid, 'jigsaw', 'CDS', $gex->[0], $gex->[1], '.',
404 $gstrand, $gex->[2], "Parent=$parent\n");
405 if ($gex->[3]==0) {
406 print STDERR "Warning ($defpre, $encname): no mapping evidence for exon $$gex[0]-$$gex[1] of $parent|$gname;\n";
407 }
408 }
409 #--
410 my $stopErr="ERROR ($defpre): in-frame stop codon found for $parent|$gname ($product)\n";
411 my $revstrand=($gstrand eq '-');
412 my $lastCodon='';
413 if ($xgap) { # exon gap found
414 my ($splstart, $splend)=split(/:/,$xgap);
415 my @sa = grep { $_->[0]<=$splstart } @cx; #lower part, affected by gap
416 @sa= map { [ @$_ ] } @sa; #convert @sa into a copy, not references to exon data
417 #print STDERR "lower half: ".join(',', ( map { $_->[0]."-".$_->[1] } @sa ))."\n";
418 #print STDERR "$sa[-1]->[1] vs $splend \n";
419 if ($sa[-1]->[1]>=$splstart) {
420 $sa[-1]->[1]=$splstart-1; # frame will be fixed later?
421 }
422 my @sb = grep { $_->[1]>=$splend } @cx; #upper half
423 @sb= map { [ @$_ ] } @sb;
424 #print STDERR "upper half: ".join(',', ( map { $_->[0]."-".$_->[1] } @sb ))."\n";
425
426 my @sbr= map { $_->[0].'-'.$_->[1] } @sb;
427 if ($sb[0]->[0]<$splend) {
428 $sb[0]->[0]=$splend+1;
429 }
430 my @cxr= map { $_->[0].'-'.$_->[1] } @cx;
431
432 if ($sa[-1]->[1]-$sa[0]->[0]<8) { # too short a fragment
433 @cx=@sb;
434 $xgap='';
435 $gstart=$sb[0]->[0];
436 print STDERR "WARNING: dropping short split end $sa[-1]->[1]-$sa[0]->[0] of $parent|$gname\n";
437 goto SHORT_SPLIT;
438 }
439 if ($sb[-1]->[1]-$sb[0]->[0]<8) {
440 @cx=@sa;
441 $xgap='';
442 $gend=$sa[-1]->[1];
443 print STDERR "WARNING: dropping too short split end $sb[-1]->[1]-$sb[0]->[0] of $parent|$gname\n" if $sb[-1]->[1];
444 goto SHORT_SPLIT;
445 }
446 @xsplit=();
447 my $frameCheck=0;
448 if ($revstrand) {
449 @cx = map { $_=[$_->[1], $_->[0], $_->[2]] } @cx;
450 @cx= reverse(@cx);
451 @sa = map { $_=[$_->[1], $_->[0], $_->[2]] } @sa;
452 @sa= reverse(@sa);
453 @sb = map { $_=[$_->[1], $_->[0], $_->[2]] } @sb;
454 @sb= reverse(@sb);
455 #print STDERR "checking Frame for \@sa, stops for \@b\n";
456 $frameCheck=&checkFrame(\@sa, 1, \$lastCodon); # this will also "fix" the frame if needed
457 print STDERR $stopErr if &checkStop(\@sb, 1);
458 push(@xsplit, [[@sb], '_5p',', 5 prime'],[[@sa], '_3p',', 3 prime']);
459 }
460 else { #forward strand
461 $frameCheck=&checkFrame(\@sb,0,\$lastCodon);
462 print STDERR $stopErr if &checkStop(\@sa);
463 push(@xsplit, [[@sa], '_5p',', 5 prime'],[[@sb], '_3p',', 3 prime']);
464 }
465 if ($frameCheck==0) {
466 print STDERR "ERROR ($defpre): in-frame stop found for $parent|$gname ($product) and frame shift fix failed.\n";
467 }
468 }
469 else { #no intra-exon gaps detected (no split)
470 SHORT_SPLIT:
471 if ($revstrand) {
472 @cx = map { $_=[$_->[1], $_->[0], $_->[2]] } @cx;
473 @cx= reverse(@cx);
474 }
475 print STDERR $stopErr if &checkStop(\@cx, $revstrand, \$lastCodon);
476 }
477 #-------------- gene ---------------------
478 my @hasNs = checkNs($gstart, $gend);
479 if (@hasNs>0 || $qgap) {
480 my $wmsg="Warning ($defpre, $encname): possible gap in model $parent|$gname : ";
481 my @ev;
482 push(@ev, "sequencing gap at ".join(',',@hasNs)) if @hasNs>0;
483 push(@ev, "mapping-suggested gaps at $qgap") if $qgap;
484 $wmsg.=join("; ",@ev);
485 print STDERR $wmsg."\n";
486 }
487 if ($revstrand) { #reverse complement features
488 ($gstart, $gend)=($gend, $gstart);
489 }
490 print FTBL join($s, '<'.$gstart, '>'.$gend, 'gene')."\n";
491 print FTBL join($s, '', '', '','gene', $gname)."\n";
492 my $jspredline=join($s,'','','','inference','ab initio prediction:JIGSAW:3.2')."\n";
493 print FTBL $jspredline;
494 if ($infrsim) {
495 print FTBL join($s,'','','','inference','similar to '.$infrsim)."\n";
496 }
497 #---
498
499 $nostop=1 unless $stop{uc($lastCodon)};
500 foreach my $xspl (@xsplit) {
501 #
502 # ------------- mRNA --------------------
503 my ($cdspl_id, $cdspl_prod, $cdspl_note)=('','','');
504 my ($part5, $part3)=($nostart, $nostop); #partial at 5' and/or at 3'
505
506 if ($xspl) { # if we have split data
507 @cx=@{$xspl->[0]};
508 $cdspl_id=$xspl->[1]; # '_a' or '_b'
509 $cdspl_prod=$xspl->[2]; # ", 5 prime" or ", 3 prime"
510 $cdspl_note='; sequencing gap found within coding sequence';
511 $part5=1 if index($cdspl_prod, '3')>0;
512 $part3=1 if index($cdspl_prod, '5')>0;
513 }
514 my $endrna= $cx[-1]->[1];
515 $cx[-1]->[1]='>'.$endrna;
516 print FTBL join($s, '<'.$cx[0]->[0], $cx[0]->[1], 'mRNA')."\n";
517 my @crx=@cx;shift(@crx);
518 foreach my $xc (@crx) {
519 print FTBL join($s, $$xc[0], $$xc[1])."\n";
520 }
521 print FTBL $jspredline;
522 if ($infrsim) {
523 print FTBL join($s,'','','','inference','similar to '.$infrsim)."\n";
524 }
525 print FTBL join($s, '', '', '','product', $product.$cdspl_prod)."\n";
526
527 #-------------- CDS ---------------------
528 $cx[-1]->[1]=$endrna;
529 my $cdstart=$cx[0]->[2]+1;
530 #@cx is sorted such that $cx[0] is always the 5' exon
531 $cx[0]->[0]='<'.$cx[0]->[0] if $part5;
532 $cx[-1]->[1]='>'.$cx[-1]->[1] if $part3;
533 print FTBL join($s, $cx[0]->[0], $cx[0]->[1], 'CDS')."\n";
534 @crx=@cx;shift(@crx);
535 foreach my $xc (@crx) {
536 print FTBL join($s, $$xc[0], $$xc[1])."\n";
537 }
538 print FTBL $jspredline;
539 if ($infrsim) {
540 print FTBL join($s,'','','','inference','similar to '.$infrsim)."\n";
541 }
542
543 print FTBL join($s, '', '', '','codon_start', $cdstart)."\n" if $cdstart>1 || $part5 || $part3;
544 print FTBL join($s, '', '', '','protein_id', 'gnl|NISC-CON|'.$lpid.$cdspl_id)."\n";
545 print FTBL join($s, '', '', '','product', $cdsproduct.$cdspl_prod)."\n";
546 print FTBL join($s, '', '', '','prot_desc', $prot_desc)."\n" if ($prot_desc);
547 my $cnote=$note;
548 if ($cdsnote) {
549 $cnote=$note ? $note.'; '.$cdsnote : $cdsnote;
550 }
551 print FTBL join($s, '', '', '','note', $cnote.$cdspl_note)."\n" if $cnote;
552 } # - for each gap split -------------
553
554 print STDERR " => done $gname\n" if $debug;
555 }
556
557 sub checkFrame {
558 #return 1;
559 my ($rx, $rev, $rlastc)=@_; # $rx = [ [$exonstart, $exonend, $frame], ... ]
560 my ($s, $f);
561 my @xrange= map { $_->[0].'-'.$_->[1] } @$rx;
562 #print STDERR "Checking FRAME for exons: ".join(',',@xrange)."\n";
563
564 # print STDERR "checking frame for ".scalar(@$rx). " exons..\n";
565 if ($rev) {
566 map { $s.=substr($ntseq, $_->[1]-1, $_->[0]-$_->[1]+1) } (reverse(@$rx));
567 $s=reverseComplement($s);
568 # $f=$$rx[-1]->[2];
569 }
570 else {
571 map { $s.=substr($ntseq, $_->[0]-1, $_->[1]-$_->[0]+1) } @$rx;
572 #first check if the given frame is OK
573 }
574 # print STDERR join("\n", unpack("(A80)*",$s))."\n";
575 my %fr=( 0=>1, 1=>1, 2=>1 );
576 my @frames; # frames to try, in the order of priority:
577 # 1st the frame that would cover the end precisely
578
579 push(@frames,length($s)%3);
580 delete $fr{$frames[0]};
581
582 # 2nd - is the originally declared frame
583 $f=$$rx[0]->[2];
584 $f=0 unless $f>0;
585 if ($f!=$frames[0]) {
586 push(@frames, $f);
587 delete $fr{$f};
588 }
589 #add the rest of the frames to check
590 my @frest=keys(%fr);
591 push(@frames, @frest);
592
593 my ($stopfound, $newf);
594 my %lastcodon; # last codons in each frame
595 foreach my $frame (@frames) {
596 # print STDERR ".. trying frame $frame\n";
597 my @c=unpack('A'.$frame.'(A3)*',$s);
598 shift(@c);$lastcodon{$frame}=pop(@c);
599 $stopfound=0;
600 foreach my $codon (@c) {
601 if ($stop{$codon}) {
602 $stopfound=1;
603 # print STDERR " STOP found.\n";
604 last;
605 }
606 }
607 $newf=$frame;
608 last unless $stopfound;
609 }
610 return 0 if ($stopfound); # Error: cannot find a "clean" frame!
611 #-- new frame suggested
612 $$rx[0]->[2]=$newf;
613 $$rlastc=$lastcodon{$newf} if $rlastc;
614 return 1;
615 }
616
617 sub checkNs {
618 my ($gs, $ge)=@_;
619 if ($ge<$gs) {
620 print STDERR "Warning ($defpre, $encname): genestart>geneend coordinates!\n";
621 ($ge,$gs)=($gs,$ge);
622 }
623 my $gseq=substr($ntseq, $gs-1, $ge-$gs+1);
624 my @sgaps;
625 my $from=0;
626 my $r;
627 my @nregs=($gseq=~m/(N{2,})/g);
628 my $ni=0;
629 while (($r=index($gseq, 'NN',$from))>=0) {
630 $from=$r+length($nregs[$ni]);
631 push(@sgaps, ($gs+$r).'-'.($gs+$from-1));
632 $ni++;
633 }
634 return @sgaps;
635 }
636
637 sub getSequencingGaps {
638 my @res;
639 my @nregs=($ntseq=~m/(N{2,})/g);
640 my $from=0;
641 my $r;
642 my $ni=0;
643 while (($r=index($ntseq, 'NN',$from))>=0) {
644 $from=$r+length($nregs[$ni]);
645 push(@res, [$r+1, $from]);
646 $ni++;
647 }
648 return @res;
649 }
650
651 sub checkStop {
652 my ($rx, $rev, $rlastc)=@_; # ref to list of [$exonstart, $exonend, $frame]
653 #my @xrange= map { $_->[0].'-'.$_->[1] } @$rx;
654 #print STDERR "..checking ".join(',',@xrange)." for stop codons..\n";
655 my ($s, $f);
656 if ($rev) {
657 map { $s.=substr($ntseq, $_->[1]-1, $_->[0]-$_->[1]+1) } (reverse(@$rx));
658 $s=reverseComplement($s);
659 $f=$$rx[-1]->[2];
660 }
661 else {
662 map { $s.=substr($ntseq, $_->[0]-1, $_->[1]-$_->[0]+1) } @$rx;
663 #first check if the given frame is OK
664 }
665 print STDERR "ERROR: invalid sequence length at $_[0]->[0]-$_[0]->[1]\n" if length($s)<=3;
666 $f=$$rx[0]->[2];
667 $f=0 unless $f>0;
668 my @c=unpack('A'.$f.'(A3)*',$s);
669 shift(@c);
670 my $lastcodon=pop(@c);
671 $$rlastc=$lastcodon if $rlastc;
672 my $stopfound;
673 #print STDERR ".. trying original frame: $f\n";
674 foreach my $codon (@c) { if ($stop{$codon}) {
675 $stopfound=1;
676 #print STDERR " STOP found.\n";
677 last;}
678 }
679 return $stopfound;
680 }
681
682 sub reverseComplement {
683 my $s=reverse($_[0]);
684 $s =~ tr/AaCcTtGgUuMmRrWwSsYyKkVvHhDdBb/TtGgAaCcAaKkYyWwSsRrMmBbDdHhVv/;
685 return $s;
686 }
687
688
689 sub up2gb {
690 my $uid=shift(@_);
691 my $first;
692 if ($nodb) {
693 $first='UP2GB_'.$uid;
694 }
695 else {
696 $ds->exec($sth, $uid);
697 print STDERR "querying for $uid..\n" if $debug;
698 while (my $r=$ds->fetch($sth)) {
699 $first=$$r[0] unless $first;
700 }
701 }
702 ($first)=($first=~m/^([\w\.]+)/) if $first;
703 print STDERR " up2gb: $uid -> '$first'\n" if $debug;
704 #unless ($first) {
705 # print STDERR "WARNING: no GB accession returned for UniProt $uid!".
706 # "($sql)\n";
707 # }
708 return $first;
709 }

Properties

Name Value
svn:executable *