ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gff2gview
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 2 months ago) by gpertea
File size: 13585 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     use strict;
3     use Getopt::Std;
4     # -- we should also write the minimap image:
5     use GD;
6    
7     my $usage = q/Usage:
8     gff2gview [-d <output_dir>] [-o <output_basename>] [-i 'proj_description']
9     [-w <img_width>] [-s <sep_dist>] [-l <reservedTracks>] [-T] [-I] \
10     <tracks.def> <genomic_seq.fa> <input_gff>
11    
12     Processes the input files and produces a set of "project" files
13     suitable for web display within the annotation viewer (gview)
14    
15     Use the -I option if the first column in the gff input contains a special
16     "node ID" that should be used for URL redirection.
17    
18     Use the -T option to avoid generating the tracks file (useful when
19     the track definition file is shared).
20    
21     Requirements:
22    
23     <tracks.def> contains ordered track info, each line with the space
24     delimited fields: <gff_trackID> <hex_color> <display_name> <redirect URL>
25    
26     The <input_gff> file should specify in the second column the track ID
27     for each feature (and it must match a <gff_trackID> from the
28     <tracks.def> file).
29    
30     Overlapping features of the same type (i.e. from the same track) are
31     limited to <max_ovl> entries per track (default <max_ovl>=5, i.e.
32     at most 5 overlapping features of the same type will be kept for a locus)
33     /;
34    
35     umask 0002;
36     getopts('TId:l:s:i:w:m:o:') || die($usage."\n");
37     my $outdir=$Getopt::Std::opt_d || '.';
38     my $maxOvl=$Getopt::Std::opt_m || 5;
39     my $useNodeID=$Getopt::Std::opt_I;
40     my $notrackout=$Getopt::Std::opt_T;
41     my $resTracks=$Getopt::Std::opt_l;
42     my $imgW=$Getopt::Std::opt_w || 648;
43     my $sepDist=$Getopt::Std::opt_s || 80; # min nt distance to consider "non-overlapping" mappings
44    
45     my $pjinfo=$Getopt::Std::opt_i;
46     $outdir=~s/\/+$//;
47     #my $gene_type=$Getopt::Std::opt_g || 'mRNA';
48     if ($outdir ne '.' && !-d $outdir) {
49     mkdir($outdir, 0775) || die("Error creating directory $outdir\n");
50     }
51    
52     my $ftrackdef=shift(@ARGV);
53    
54     my $fasta=shift(@ARGV);
55     die("$usage\nError gettting the input files. Please check parameters.\n")
56     unless $ARGV[0] && -f $fasta && -f $ftrackdef;
57     my $defoutname;;
58     if ($ARGV[0] eq '-') {
59     shift(@ARGV);
60     $defoutname='gff2gview_stdin';
61     }
62     else {
63     $defoutname=$ARGV[0];
64     $defoutname=~s/\.\w+$//; #remove extension if any
65     }
66     my $outbase=$Getopt::Std::opt_o || $defoutname;
67    
68     #-- read tracks:
69    
70     open(TRDEF, $ftrackdef) || die ("Error opening track file $ftrackdef ($!)\n");
71     my %trackOrd;
72     my %trackCol; #track colors
73     my $tord=0;
74     my @tlines;
75     while (<TRDEF>) {
76     next if m/^\s*#/;
77     push(@tlines, $_);
78     chomp;
79     $tord++;
80     my @t=split;
81     $trackOrd{$t[0]}=$tord;
82     $trackCol{$t[0]}=$t[1];
83     }
84     close(TRDEF);
85     unless ($notrackout) {
86     #unless (-s "$outdir/$outbase.tracks") {
87     open(TRW, ">$outdir/$outbase.tracks") || die("Error creating file $outdir/$outbase.tracks");
88     print TRW join('',@tlines);
89     close(TRW);
90     # }
91     }
92     #--- write the sequence file:
93     open(FSEQ, $fasta) || die("Error opening $fasta\n");
94    
95     open(GSEQ, ">$outdir/$outbase.seq") || die("Error creating file $outdir/$outbase.seq");
96     my $gseqlen=0;
97     my $gseqid;
98     while (<FSEQ>) {
99     next if m/^\s*#/;
100     if (m/^>(\S+)/) {
101     $gseqid=$1;
102     if (!$pjinfo && m/^>\S+\s+(.+)/) {
103     $pjinfo=$1;
104     }
105     next;
106     }
107     chomp;
108     s/[\r\s]+$//s;
109     $gseqlen+=length($_);
110     print GSEQ $_;
111     }
112     close(GSEQ);
113     close(FSEQ);
114    
115     #-------------
116    
117    
118     # 0 1 2 3 4 5 6 7 8
119     my %map; # mapID => [ $track, $strand, $start, $end, $fname, $descr, $subf_name, $subf_listref, $cluster ]
120    
121     my $mcount=0;
122     # 0 1 2 3 4
123     my @mlist; # list of [ $mapID, $tracknum, $start, $end, $stackpos ];
124     my $gdescr;
125     my $firstfeat;
126    
127     while (<>) {
128     next if m/^\s*#/;
129     chomp;
130     my ($chr, $ftrack, $f, $fstart, $fend, $fscore, $strand, $frame, $attrs)=split(/\t/);
131     next unless $attrs;
132     $f='exon' if ($f =~m/\-exon$/);
133     my ($fid)=($attrs=~m/ID\s*=\s*"?([^;"]+)/);
134     my ($fp)=($attrs=~m/Parent\s*=\s*"?([^;"]+)/);
135     next unless $fid || $fp;
136     ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
137     if ($fend>$gseqlen) {
138     die("Error: coordinate of feature $fid ($fend) is beyond the length of genomic sequence ($gseqlen)\n");
139     }
140     my $tracknum=$trackOrd{$ftrack};
141     unless ($tracknum) {
142     print STDERR "Warning: skipping feature $fid with unknown track '$ftrack'\n";
143     next;
144     }
145     my $m=$map{$fid};
146     ##----------- child
147     if ($fp) { #child feature
148     my $pm=$map{$fp};
149     if (!$pm || ($pm->[6] && $pm->[6] ne $f)) {
150     print STDERR "Warning: subfeature $f ($fstart-$fend) of $fp discarded.\n";
151     next;
152     }
153     $pm->[6]=$f;
154     push (@{$pm->[7]}, [$fstart, $fend, $fscore]);
155     }
156     ##----------- parent
157     else { # 1st level parent feature
158     if ($m) { #existing parent feature.. duplicates not allowed
159     print STDERR "Warning: skipping repeat of parent feature $fid (why non-unique?)\n";
160     next;
161     }
162     my $gdescr;
163     # try to parse the description/annotation, if any
164     if ($attrs=~m/Info\s*=\s*"?([^;"]+)/i) {
165     $gdescr=$1;
166     }
167     elsif ($attrs=~m/Descr?\s*=\s*"?([^;"]+)/i) {
168     $gdescr=$1;
169     }
170     elsif ($attrs=~m/TopHit\s*=\s*"?([^;"]+)/i) {
171     $gdescr=$1;
172     }
173     elsif ($attrs=~m/BestHit\s*=\s*"?([^;"]+)/i) {
174     $gdescr=$1;
175     }
176     elsif ($attrs=~m/Name\s*=\s*"?([^;"]+)/) {
177     $gdescr=$1;
178     }
179     if ($gdescr) {
180     $gdescr=~s/^\s+//;$gdescr=~s/\s+$//;
181     $gdescr='' if index(lc($fid),lc($gdescr))>=0;
182     }
183    
184     my $fullID = ($useNodeID) ? $fid.'|nid:'.$chr : $fid;
185     # 0 1 2 3 4 5 6 7 8 9 10 11
186     $map{$fid}= [ $ftrack, $strand, $fstart, $fend, $f, $gdescr, '', [], [$fid], 0, $fscore, $fullID ];
187    
188     # 0 1 2 3 4
189     push(@mlist, [ $fid, $tracknum, $fstart, $fend, 0 ]);
190     $mcount++;
191     } #parent feature
192     } #while <GFF>
193    
194     # sort mlist by fstart coordinate
195     #print STDERR "Sorting features by start coordinate..\n";
196    
197     @mlist=sort { $main::a->[2]<=>$main::b->[2] } @mlist;
198    
199     #---- cluster mappings by overlap
200     #print STDERR "Clustering features by virtual overlap..\n";
201    
202     my %clusters; #cl => [mapID1, mapID2, ...]
203     for (my $a=0;$a<$mcount; $a++) {
204     for (my $b=$a+1;$b<$mcount;$b++) {
205     my ($ma, $mb)=($map{$mlist[$a]->[0]}, $map{$mlist[$b]->[0]});
206     my ($macl, $mbcl)=($ma->[8], $mb->[8]);
207     last if ($$mb[2] > $$ma[3]+$sepDist);
208     next if ($mbcl==$macl);
209     if (hasOverlap($$ma[2], $$ma[3], $$mb[2], $$mb[3])) {
210     if ($macl && $mbcl) {
211     my $mergecl=mergeClusters($macl, $mbcl);
212     }
213     }
214     }
215     }
216     my $ncl=keys(%clusters);
217     #print STDERR "Computing stack ordering for each of the $ncl clusters..\n";
218    
219     my $clnum=0;
220     foreach my $cl (values(%clusters)) { # for each cluster
221     # 0 1 2 3 4
222     # id trackNo startpos endpos stackOrd
223     my @clist = map( [$_, $trackOrd{$map{$_}->[0]}, $map{$_}->[2],$map{$_}->[3], 0, ], @$cl);
224     # debugShowCl(\@clist, $clnum);
225     # by track priority AND by start coordinate
226     @clist = sort byTrackOrd @clist;
227     my @rows; #$list[i] = [ list of [start, end]] - feature coordinates placed on the same row
228     # array of maximum (rightmost) end and minimum (leftmost) start coord for each stacklevel
229     for (my $i=0;$i<@clist;$i++) {
230     #my $curtrack=$clist[$i]->[1];
231     # we can reset @rows on track change..
232     # and set stackOrd to $lastTrackStack + $r
233     my $iseq=$clist[$i];
234     #search back for possible room
235     my $placed;
236     #print STDOUT join("\t",$$iseq[0], $curtrack, $$iseq[2])."\n";
237     #reserve rows 0 and 1 for trackNo 0 and 1
238     #reserve row 0 for track 0 (sequence gaps)
239     #my $numlocked
240     my $rstart=0;
241     if ($resTracks) {
242     $rstart= $$iseq[1]>$resTracks ? $resTracks : ($$iseq[1]>0 ? $$iseq[1]-1 : 0);
243     }
244     #my $rstart=$$iseq[1]>2 ? 2 : ($$iseq[1]==2 ? 1 : 0);
245     for (my $res=0;$res<$resTracks;$res++) { push(@rows,[]); } #reserve tracks if requested
246     for (my $j=$rstart;$j<@rows;$j++) {
247     if (canPlace($iseq, $rows[$j])) {
248     $$iseq[4]=$j;
249     $placed=1;
250     last;
251     }
252     }# for each row
253     unless ($placed) {# no more room on previous rows, create a new row
254     $$iseq[4]=scalar(@rows);
255     push(@rows, [[$$iseq[2], $$iseq[3]]]);
256     }
257     #----------------
258     #-- stack order determined for this mapping, store it!
259     $map{$$iseq[0]}->[9]=$$iseq[4];
260     } #for each gff record
261     $clnum++;
262     @clist=();
263     }
264    
265    
266     #----------------
267    
268     my $maxstackpos=1;
269     my %gifCol;
270     open(IFA, ">$outdir/$outbase.ifa") || die ("Error creating $outdir/$outbase.ifa!\n");
271     my $gifmap=new GD::Image($imgW,30);
272     my $gifbkg=$gifmap->colorAllocate(unpackRGB('EAEAEA'));
273     $gifmap->transparent($gifbkg);
274     foreach my $t (keys(%trackCol)) {
275     my $cval=$trackCol{$t};
276     $gifCol{$t}=$gifmap->colorAllocate(unpackRGB($cval));
277     }
278     my $iW=$imgW-2;
279     foreach my $l (@mlist) {
280     my $m=$map{$$l[0]};
281     # 0 1 2 3 4 5 6 7 8 9 10 11
282     my ($track, $strand, $start, $end, $f, $descr, $subfname, $subf, $cl, $stackpos, $score, $wID)= @$m;
283     if (@$subf==0) {
284     $subf=[[$start, $end, $score]];
285     $subfname=$f;
286     }
287     my @segs=sort { $main::a->[0] <=> $main::b->[0] } @$subf;
288     my @wsegs = map { $_->[0].'-'.$_->[1].
289     (length($_->[2])>1 ? ':'.substr($_->[2],0,4) : '') } @segs;
290     $stackpos++;
291     my $tc=$trackCol{$track};
292     my $ilen=sprintf('%d',((($end-$start)*$iW)/$gseqlen)); #length of this mapping in the minimap
293     #$ilen=1 if ($ilen<1);
294     my $ipos=sprintf('%d',(($start*$iW)/$gseqlen)+1);
295     my $ypos=$stackpos+1;
296     $gifmap->line($ipos, $ypos, $ipos+$ilen, $ypos, $gifCol{$track});
297     #print IFA ">$$l[0] $start $end $strand $track $stackpos $f\n";
298     print IFA ">$wID $start $end $strand $track $stackpos $f\n";
299     print IFA "$subfname|".join(' ',@wsegs)."\n";
300     if ($descr) {
301     print IFA 'i:'.$descr."\n";
302     # if ($srchFile) {
303     # print SRCH ">$wID $descr\n";
304     # print SRCH "$track $strand $start $end"
305     # }
306     }
307     $maxstackpos=$stackpos if ($stackpos>$maxstackpos);
308     }
309    
310     close(IFA);
311     open(GIFMAP,">$outdir/$outbase.gif")
312     || die ("Error creating $outdir/$outbase.gif !\n");
313     binmode GIFMAP;
314     {
315     my $gifimg=$gifmap->gif();
316     print GIFMAP $gifimg;
317     }
318     close(GIFMAP);
319     #print STDERR "iit_store into $outdir/$outbase.iit..\n";
320     system("iit_store -o $outdir/$outbase.iit < $outdir/$outbase.ifa");
321    
322     open(DTA, ">$outdir/$outbase.dta") || die ("Error creating file $outdir/$outbase.dta\n");
323     print DTA join("\t",$gseqid, $gseqlen, $maxstackpos, $pjinfo)."\n";
324     close(DTA);
325     #print STDERR "Done.\n";
326    
327     #=============================================================
328    
329    
330     sub debugShowCl {
331     #print STDERR ">Cluster".$_[1]."\n";
332     foreach my $cd (@{$_[0]}) {
333     my @d=@$cd;
334     my $id=shift(@d);
335     #print STDERR join(' ',sprintf('%40s',$id),(map(sprintf('%9s',$_), @d)))."\n";
336     }
337     }
338    
339     sub mergeClusters {
340     my ($macl, $mbcl)=@_;
341     delete $clusters{$macl};
342     delete $clusters{$mbcl};
343     my $newcl=[@$macl, @$mbcl];
344     $clusters{$newcl}=$newcl;
345     foreach my $id (@$newcl) {
346     $map{$id}->[8]=$newcl;
347     }
348     }
349    
350     sub byTrackOrd { #sort by track priority AND by start coordinate
351     if ($main::a->[1]==$main::b->[1]) {
352     return $main::a->[2]==$main::b->[2] ? $main::b->[3]-$main::a->[3] :
353     $main::a->[2]-$main::b->[2]
354     }
355     else {
356     return $main::a->[1]-$main::b->[1];
357     }
358     }
359    
360     sub printRow {
361     my ($r)=@_;
362     my @p;
363     foreach my $s (@$r) { push(@p,"[$$s[0]-$$s[1]]") }
364     return join(' ',@p);
365     }
366    
367     sub canPlace {
368     my ($g, $r)=@_; #gene feature, row data
369     #$r is a ref to a list of [start, end] coordinates, sorted by $start
370     if (@$r==0 || $$g[2]-$sepDist > $$r[-1]->[1]) {
371     #room to the right
372     push(@$r, [$$g[2], $$g[3]]);
373     return 1
374     }
375     if ($$g[3]+$sepDist < $$r[0]->[0]) {
376     #room to the left
377     unshift(@$r, [$$g[2], $$g[3]] );
378     return 1
379     }
380     my $rmax=@$r-1;
381     for (my $i=0;$i<$rmax;$i++) {
382     #check for room between this and next gf
383     next unless $$g[2]>$$r[$i]->[0];
384     if ( $$g[2]-$sepDist > $$r[$i]->[1] &&
385     $$g[3]+$sepDist < $$r[$i+1]->[0]) {
386     splice(@$r, $i+1, 0, [$$g[2], $$g[3]]); #insert interval in this array
387     return 1
388     }
389     last if $$r[$i+1]->[0]>$$g[3];
390     }#for each inter-gf space
391     return 0;
392     }
393    
394    
395     sub hasOverlap {
396     my ($a1, $a2, $b1, $b2)=@_;
397     return ($a1<$b1) ? ($b1-$a2<=$sepDist) : ($a1-$b2<=$sepDist);
398     }
399    
400    
401     sub unpackRGB {
402     my $v=hex($_[0]);
403     return ( (($v & 0xFF0000) >> 16),
404     (($v & 0x00FF00) >> 8),
405     ($v & 0x0000FF) );
406     }
407    
408    
409     sub writeModels {
410     foreach my $l (@mlist) {
411     my $ml=$map{$$l[0]} || die ("Error: mapping $$l[0] not in hash!?\n");
412     my ($tag, $trck, $er, $cr, $descr) = @$ml;
413     my $wID=$$ml[11];
414     my ($mstart, $mend, @ex, @cds);
415     if (@$er>0) {
416     @ex= sort { $main::a->[0] <=> $main::b->[0] } @$er;
417     ($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]);
418     }
419     if (@$cr>0) {
420     @cds= sort { $main::a->[0] <=> $main::b->[0] } @$cr;
421     ($mstart, $mend) = ($cds[0]->[0], $cds[-1]->[1]) unless $mstart;
422     }
423    
424     #print IFA ">$l $mstart $mend $tag";
425     print IFA ">$wID $mstart $mend $tag";
426     print IFA " $trck" if $trck;
427     print IFA "\n";
428     if (@ex) {
429     my @wlst = map { $_->[0].'-'.$_->[1].':'.$_->[2] } @ex;
430     print IFA join(',',@wlst)."\n";
431     }
432     if (@cds) {
433     my @wlst = map { $_->[0].'-'.$_->[1].':'.$_->[2] } @cds;
434     print IFA 'C:'.join(',',@wlst)."\n";
435     }
436     if ($descr) {
437     print IFA 'i:'.$descr."\n";
438     }
439     } #for each stored locus
440     }

Properties

Name Value
svn:executable *