ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/recon_jigsaw.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 13014 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Reconcile jigsaw results on separate strands.
7 Usage:
8 recon_jigsaw.pl [-P <gseq.fa>] [-o <output.gff3>] <both_strands_jigsaw.jgff> \
9 <forward_strand.jgff> <reverse_strand.jgff>
10
11 Requires the raw jigsaw output from the 3 jigsaw runs on the same
12 genomic sequence.
13 Outputs a gff3 file with the resulting "reconciliation".
14
15 Use the -P option to discard any partial gene predictions made by jigsaw.
16 <gseq.fa> is a fasta providing the genomic sequence required for checking
17 start&stop codons.
18 /;
19 umask 0002;
20 getopts('P:Do:') || die($usage."\n");
21 my $seqfile=$Getopt::Std::opt_P; #remove partial predictions
22 my $seq='';
23 my $outfile=$Getopt::Std::opt_o;
24 my $debug=$Getopt::Std::opt_D;
25 die($usage."\n") unless @ARGV==3;
26
27 my ($bfile, $ffile, $rfile)=@ARGV;
28 my %stopCodons=('TAA'=>1,'TAG'=>1,'TAR'=>1, 'TGA'=>1, 'TRA'=>1);
29 my $startCodon='ATG';
30
31 #foreach ($bfile, $ffile, $rfile) {
32 die ($usage."File $_ not found or empty! \n") unless -s $bfile;
33 # }
34
35 if ($seqfile) { #load sequence in memory
36 open(SEQFILE, $seqfile) || die ("Error opening fasta file '$seqfile'!\n");
37 while (<SEQFILE>) {
38 if (m/^>/) {
39 last if length($seq)>10;
40 next;
41 }
42 tr/\n\r \t//d;
43 $seq.=$_;
44 }
45 close(SEQFILE);
46 }
47
48 my (@bgenes, @fgenes, @rgenes);
49 ##
50 ## @genes= (generef1, generef2, ... )
51 ## where a generef := [ predset, strand, start, end, \@exons, cluster, total_exonlen, score]
52 ## 0 1 2 3 4 5 6 7
53 ## where @exons := list of [exonstart, exend, frameshift]
54 ## and predset := jigsaw prediction set code ( represented as one of
55 ## the letters 'b', 'f', 'r', meaning one of:
56 ## both strands at once, forward-only, reverse-only
57 #
58 ## a "cluster" is a reference to a list of coordinates followed by overlapping loci:
59 ## $cluster := [ $cstart, $cend, $generef1, $generef2 ... ]
60
61 my %clusters; #just a set of references to formed clusters
62
63 my $genomicseq;
64
65 $genomicseq=&readJigsaw($bfile, \@bgenes, 'b');
66 &readJigsaw($ffile, \@fgenes, 'f');
67 &readJigsaw($rfile, \@rgenes, 'r');
68
69
70
71 ## -- now cluster the loci and check them for conflicts:
72 ## for now only consider predictions which overlap
73 ## the "both-strands" prediction set
74 for (my $bl=0;$bl<@bgenes;$bl++) {
75 for (my $fl=0;$fl<@fgenes;$fl++) {
76 last if $fgenes[$fl]->[2]>$bgenes[$bl]->[3];
77 # check Gene overlap and update cluster if so
78 &processGovl($bgenes[$bl], $fgenes[$fl]);
79 }
80 for (my $rl=0;$rl<@rgenes;$rl++) {
81 last if $rgenes[$rl]->[2]>$bgenes[$bl]->[3];
82 # check Gene overlap and update cluster if so
83 &processGovl($bgenes[$bl], $rgenes[$rl]);
84 }
85 }
86
87
88 # now %clusters contains the loci clusters (except for 'f' or 'r' singletons we don't care about)
89 my @cls=values(%clusters);
90 @cls=sort { $main::a->[0] <=> $main::b->[0] } @cls;
91 my $clno=1;
92 my $gno=1;
93 if ($outfile) {
94 open(OUTF, '>'.$outfile) || die ("Error creating file $outfile !\n");
95 select(OUTF);
96 }
97 foreach my $c (@cls) {
98 $c=&processCluster($c);
99
100 my @cl=@$c;
101 my $clstart=shift(@cl);
102 my $clend=shift(@cl);
103 #only generefs in @cl now:
104
105 print STDERR "Locus Cluster #$clno ($clstart-$clend)\n" if $debug;
106 foreach my $gr (@cl) {
107 print STDERR "gene #$gno:\t".join("\t",@{$gr}[0..3], $$gr[7])."\n" if $debug;
108 #if ($$gr[7]>0) {
109 printGff($genomicseq, $gno, $gr);
110 $gno++;
111 # }
112 }
113 $clno++;
114 }
115
116 if ($outfile) {
117 close(OUTF);
118 select(STDOUT);
119 }
120
121 ##### ============== SUBROUTINES ============ #####
122
123
124 sub printGff {
125 my ($srcseq, $gno, $g)=@_;
126 my $curmodel="tjsm.$gno";
127 my @xflags;
128 push(@xflags, 'NoStartCodon') if $$g[8];
129 push(@xflags, 'NoStopCodon') if $$g[9];
130 my @xsplit;
131 if ($$g[10]) {
132 push(@xflags, 'ExonGap='.$$g[10]);
133 @xsplit=split(/:/,$$g[10]);
134 }
135 my $xfl='';
136 if (@xflags) {
137 $xfl=';'.join(';',@xflags);
138 }
139 print join("\t", $srcseq, 'jigsaw', 'mRNA', $$g[2], $$g[3], '.', $$g[1], '.',
140 "ID=$curmodel;Name=$curmodel".$xfl)."\n";
141
142 my $exons=$g->[4];
143 my $i=1;
144 foreach my $exon (@$exons) {
145 my ($estart, $eend, $eframe, $splitframe)=@$exon;
146 my $xattr="Parent=$curmodel";
147 $xattr.=';ExonSplit' if ($estart==$xsplit[1]);
148 $xattr.=';SplitFrame' if $splitframe;
149 print join("\t",$srcseq, 'jigsaw', 'CDS', $estart, $eend, '.',
150 $$g[1], $eframe, $xattr)."\n";
151 $i++;
152 }
153
154 }
155
156
157 sub processCluster {
158 my $c=shift(@_);
159 my @cl=@$c;
160 my $clstart=shift(@cl);
161 my $clend=shift(@cl);
162 # check for exon collisions
163 #my %removal; # list of genes to remove due to exon conflicts
164 #my %score; # $generef => score based on overlap with other model on the same strand
165 foreach my $g (@cl) {
166 $$g[7]+=$$g[6] if length($$g[0])>1;
167 }
168 my %removal; # list of genes to remove due to exon conflicts
169 for (my $i=0;$i<@cl;$i++) {
170 my $gx=$cl[$i];
171 for (my $j=$i+1;$j<@cl;$j++) {
172 my $gy=$cl[$j];
173 # CDS lengths and exon overlap bps
174 my $exovl=&exonOverlap($gx, $gy);
175 if ($exovl>=0 && ($gx->[1] eq $gy->[1])) { #overlapping, on the same strand: must choose one!
176 if ($$gx[6]+$$gx[7] > $$gy[6] +$$gy[7]) {
177 # gx "better" than gy
178 $$gx[7]+=$exovl;
179 $removal{$gy}=$gy;
180 }
181 else {
182 $$gy[7]+=$exovl;
183 $removal{$gx}=$gx;
184 }
185 } #overlapping on the same strand
186 } #inner loop
187 } #outer loop
188 # second pass, to remove conflicting models:
189 for (my $i=0;$i<@cl;$i++) {
190 my $gx=$cl[$i];
191 next if $removal{$gx};
192 for (my $j=$i+1;$j<@cl;$j++) {
193 my $gy=$cl[$j];
194 next if $removal{$gy};
195 # CDS lengths and exon overlap bps
196 if ($gx->[1] ne $gy->[1] && &exonOverlap($gx, $gy)) { #opposite strands
197 #opposite strands -- one of them should be thrown away
198 my $grm = ($$gx[6]+$$gx[7]>$$gy[6]+$$gy[7]) ? $gy : $gx;
199 $removal{$grm}=$grm;
200 } #opposite strands
201 } #inner for loop
202 } #outer for loop
203 my @newcl= grep { ($_->[7]>0 || ($_->[7]==0 && $_->[0] eq 'b')) && not exists($removal{$_}) } @cl;
204 #my @newcl= grep { ($_->[7]>0 && not exists($removal{$_}) } @cl;
205 #DBG#:
206 foreach my $g (values(%removal)) {
207 if ($$g[0] =~ m/b/) {
208 print STDERR "W: eliminated base model at $$g[2]-$$g[3]($$g[1]) from $bfile\n";
209 }
210 }
211 return [$clstart, $clend, @newcl]
212 }
213
214
215 sub exonOverlap { #returns the total length of overlapping exonic regions
216 my ($ga, $gb)=@_;
217 return -1 unless ( $$ga[2]<=$$gb[2] ? $$gb[2]<=$$ga[3] : $$ga[2] <= $$gb[3] );
218 my ($ea, $eb)=($ga->[4], $gb->[4]);
219 my $ovl=0;
220 foreach my $a (@$ea) {
221 foreach my $b (@$eb) {
222 if ($$a[0]<=$$b[0]) {
223 if ($$b[0]<=$$a[1]) {
224 $ovl += ( $$b[1]<$$a[1] ? $$b[1]-$$b[0]+1 : $$a[1]-$$b[0]+1 );
225 }
226 }
227 else {
228 if ($$a[0]<=$$b[1]) {
229 $ovl += ( $$b[1]<$$a[1] ? $$b[1]-$$a[0]+1 : $$a[1]-$$a[0]+1 );
230 }
231 }
232 } #for b
233 } # for a
234 return $ovl;
235 }
236
237 sub processGovl {
238 my ($ga, $gb)=@_;
239 my ($ovl, $min, $max);
240 #in fact, check if their cluster ends overlap
241 my ($ca, $cb)=($ga->[5], $gb->[5]);
242 return if ($ca eq $cb); # already in the same cluster
243 if ($$ca[0]<=$$cb[0]) {
244 $min = $$ca[0] if ($ovl=($$cb[0]<=$$ca[1]));
245 }
246 else {
247 $min = $$cb[0] if ($ovl = ($$ca[0]<=$$cb[1]));
248 }
249 return unless $ovl;
250 #overlap: update cluster data
251 $max = ($$ca[1]>$$cb[1]) ? $$ca[1] : $$cb[1];
252 $ca->[0]=$min;
253 $ca->[1]=$max;
254
255 #we know that one of them is SURELY 'b'
256 #collapse them if that is the case
257
258 # add the $cb's loci here to this cluster and update their cluster pointer
259 my @bg=@$cb;shift(@bg);shift(@bg);
260 my $sameLocus=&sameLocus($ga, $gb);
261 $ga->[0].=$gb->[0] if $sameLocus;
262 delete($clusters{$cb}); #-- only 'b'-based clusters were stored, but they could get merged
263 foreach my $g (@bg) {
264 unless ($sameLocus && $g eq $gb) {
265 push(@$ca, $g);
266 $g->[5] = $ca;
267 }
268 }
269 }
270
271
272 sub sameLocus {
273 my ($ga, $gb)=@_;
274 return 0 if $$ga[1]!=$$gb[1] || $$ga[2]!=$$gb[2] || $$ga[3]!=$$gb[3];
275 my $warn="WARNING ($bfile): $$ga[0] and $$gb[0] predictions at $$ga[2]-$$ga[3] do not have the same exon structure!\n";
276 my ($ea, $eb)=($$ga[4], $$gb[4]);
277 if (@$ea!=@$eb) {
278 print STDERR $warn;
279 return 0;
280 }
281 for (my $i=1;$i<@$ea;$i++) {
282 if ($$ea[$i]->[0]!=$$eb[$i]->[0] ||
283 $$ea[$i]->[1]!=$$eb[$i]->[1]) {
284 print STDERR $warn;
285 return 0
286 }
287 }
288 return 1;
289 }
290
291 sub readJigsaw {
292 my ($file, $aref, $predset)=@_;
293 return unless -s $file;
294 local *FIN;
295 open(FIN, $file) || die ("Error opening file $file\n");
296
297 my @exd;
298 my ($curgnum, $cstrand);
299 my $curchr;
300 while (<FIN>) {
301 next if m/^\s*#/;
302 chomp;
303 next unless length($_)>4;
304 my ($chr, $jsver, $ftype, $exonstart, $exonend, $jscore,
305 $strand, $frame, $lnum)=split(/\t/);
306
307 my $gnum;
308 if ($lnum=~m/^(\d+)$/) { #old version
309 $gnum=$1;
310 }
311 else {
312 my @a=split(/\s*;\s*/, $lnum);
313 ($gnum)=($a[0]=~m/\.(\d+)$/);
314 }
315
316 die("Invalid strand ($strand) found for expected '$predset' type locus")
317 if ($predset eq 'f' && $strand ne '+') || ($predset eq 'r' && $strand ne '-');
318 ($exonstart, $exonend) = ($exonend, $exonstart) if $exonstart>$exonend;
319 die("Error at parsing jigsaw line: $_\n") unless ($jsver=~/^jigsaw/ && $gnum);
320 if ($curgnum ne $gnum) {
321 &storeGene($aref, \@exd, $predset, $cstrand, $curgnum, $file) if $curgnum;
322 $curgnum=$gnum;
323 $cstrand=$strand;
324 $curchr=$chr;
325 @exd=();
326 }
327 push(@exd, [$exonstart, $exonend, $frame]);
328 }#while reading jgff lines
329 close(FIN);
330 &storeGene($aref, \@exd, $predset, $cstrand, $curgnum, $file) if $curgnum;
331 # sort genes by coordinates
332 @$aref = sort { $main::a->[2] <=> $main::b->[2] } @$aref;
333 return $curchr;
334 }
335
336
337 sub storeGene {
338 my ($gr, $er, $pset, $strand, $mgnum, $mfile)=@_;
339 my @ex= sort { $main::a->[0] <=> $main::b->[0] } @$er;
340 my ($nostart, $nostop);
341 if ($seq) { #validate that it has start and stop codons
342 my ($cstart, $cend) = ($strand eq '-') ?
343 ( uc(revCompl(substr($seq, $ex[-1]->[1]-3,3))), uc(revCompl(substr($seq, $ex[0]->[0]-1,3))) ) :
344 ( uc(substr($seq, $ex[0]->[0]-1,3)), uc(substr($seq, $ex[-1]->[1]-3,3)) );
345 print STDERR "$mgnum ($mfile) : start codon: $cstart | stop codon: $cend\n" if $debug;
346 if ($cstart ne 'ATG') {
347 my $discard = (@ex<3) ? 'discarded ':'';
348 print STDERR "..${discard}partial gene $mgnum ($mfile): lacking START codon ($cstart)\n";
349 return if $discard;
350 $nostart=1;
351 }
352 unless ($stopCodons{$cend}) {
353 my $discard = (@ex<3) ? 'discarded ':'';
354 print STDERR "..${discard}partial gene $mgnum ($mfile): lacking STOP codon ($cend)\n";
355 return if $discard;
356 $nostop=1;
357 }
358 }
359 my $exlen=0;
360
361 my ($xgap, $nex) = exonGapCheck(\@ex, $strand, $mgnum, $mfile) if $seq;
362 if ($xgap) {
363 @ex=@$nex;
364 }
365 map { $exlen += $_->[1]-$_->[0]+1 } @ex;
366
367 # 0 1 2 3 4 5 6 7 8 9 10
368 my $gref=[$pset, $strand, $ex[0]->[0], $ex[-1]->[1], [@ex], undef, $exlen, 0, $nostart, $nostop, $xgap];
369 my $selfcl=[$ex[0]->[0], $ex[-1]->[1], $gref];
370 $gref->[5]=$selfcl;
371 $clusters{$selfcl}=$selfcl if $pset eq 'b'; #only store "both-strands" clusters
372 push(@$gr, $gref);
373
374 }
375
376
377 sub exonGapCheck {
378 my ($er, $strand, $mgnum, $mfile)=@_;
379 #check each exon for N-gaps;
380 my @xce; #exons, possibly split, will be stored here
381 my @gapdata; # gapstart_gapend
382 for (my $i=0;$i<@$er;$i++) {
383 my ($x0, $x1, $xfr) = @{$$er[$i]};
384 my $xseq=uc(substr($seq, $x0-1, $x1-$x0+1));
385 my $gapstart=index($xseq,'NN');
386 my $gapend=rindex($xseq, 'NN');
387 if ($gapstart>=0) { # Houston, we have a problem: exon gap
388 my ($x0e, $x1b)=($x0+$gapstart-1,$x0+$gapend+2);
389 push(@gapdata, ($x0e+1).':'.($x1b-1));
390 #fix the frame:
391 my $xf;
392 if ($strand eq '-') {
393 #$xf=(($x1-$x0e)+$xfr)%3;
394 $xf=(($x1-$x1b+1)+$xfr)%3;
395 # print STDERR "exon gap for ($x0,$x1,$xfr) at $x0e .. $x1b\n";
396 if ($x0e>$x0) {
397 push(@xce, [$x0, $x0e, $xf, 1]);
398 #print STDERR " added split exon: ($x0, $x0e, $xf)\n";
399 }
400 if ($x1>$x1b) {
401 push(@xce, [$x1b, $x1, $xfr]);
402 #print STDERR " added split exon: ($x1b, $x1, $xfr)\n";
403 }
404 }
405 else {
406 $xf=(($x1b-$x0+1)+$xfr)%3;
407 #print STDERR "exon gap for ($x0,$x1,$xfr) at $x0e .. $x1b (new frame: $xf)\n";
408 push(@xce, [$x0, $x0e, $xfr]) if $x0e>$x0;
409 push(@xce, [$x1b, $x1, $xf, 1]) if $x1>$x1b;
410 }
411 next;
412 }
413 push(@xce, [$x0, $x1, $xfr]);
414 }
415 my $gapds=$gapdata[0]; #should probably be: join(',', @gapdata)
416 if (@gapdata>1) {
417 print STDERR "WARNING: multiple exons with sequencing gaps found for $mgnum ($mfile)!\n";
418 }
419 #print STDERR " >> gapdata returned: $gapds\n" if $gapds;
420 return ($gapds, \@xce);
421 }
422
423 sub revCompl {
424 my $s=reverse($_[0]);
425 $s =~ tr/AaCcTtGgUuMmRrWwSsYyKkVvHhDdBb/TtGgAaCcAaKkYyWwSsRrMmBbDdHhVv/;
426 return $s;
427 }

Properties

Name Value
svn:executable *