ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gff2gview
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 13585 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 # -- we should also write the minimap image:
5 use GD;
6
7 my $usage = q/Usage:
8 gff2gview [-d <output_dir>] [-o <output_basename>] [-i 'proj_description']
9 [-w <img_width>] [-s <sep_dist>] [-l <reservedTracks>] [-T] [-I] \
10 <tracks.def> <genomic_seq.fa> <input_gff>
11
12 Processes the input files and produces a set of "project" files
13 suitable for web display within the annotation viewer (gview)
14
15 Use the -I option if the first column in the gff input contains a special
16 "node ID" that should be used for URL redirection.
17
18 Use the -T option to avoid generating the tracks file (useful when
19 the track definition file is shared).
20
21 Requirements:
22
23 <tracks.def> contains ordered track info, each line with the space
24 delimited fields: <gff_trackID> <hex_color> <display_name> <redirect URL>
25
26 The <input_gff> file should specify in the second column the track ID
27 for each feature (and it must match a <gff_trackID> from the
28 <tracks.def> file).
29
30 Overlapping features of the same type (i.e. from the same track) are
31 limited to <max_ovl> entries per track (default <max_ovl>=5, i.e.
32 at most 5 overlapping features of the same type will be kept for a locus)
33 /;
34
35 umask 0002;
36 getopts('TId:l:s:i:w:m:o:') || die($usage."\n");
37 my $outdir=$Getopt::Std::opt_d || '.';
38 my $maxOvl=$Getopt::Std::opt_m || 5;
39 my $useNodeID=$Getopt::Std::opt_I;
40 my $notrackout=$Getopt::Std::opt_T;
41 my $resTracks=$Getopt::Std::opt_l;
42 my $imgW=$Getopt::Std::opt_w || 648;
43 my $sepDist=$Getopt::Std::opt_s || 80; # min nt distance to consider "non-overlapping" mappings
44
45 my $pjinfo=$Getopt::Std::opt_i;
46 $outdir=~s/\/+$//;
47 #my $gene_type=$Getopt::Std::opt_g || 'mRNA';
48 if ($outdir ne '.' && !-d $outdir) {
49 mkdir($outdir, 0775) || die("Error creating directory $outdir\n");
50 }
51
52 my $ftrackdef=shift(@ARGV);
53
54 my $fasta=shift(@ARGV);
55 die("$usage\nError gettting the input files. Please check parameters.\n")
56 unless $ARGV[0] && -f $fasta && -f $ftrackdef;
57 my $defoutname;;
58 if ($ARGV[0] eq '-') {
59 shift(@ARGV);
60 $defoutname='gff2gview_stdin';
61 }
62 else {
63 $defoutname=$ARGV[0];
64 $defoutname=~s/\.\w+$//; #remove extension if any
65 }
66 my $outbase=$Getopt::Std::opt_o || $defoutname;
67
68 #-- read tracks:
69
70 open(TRDEF, $ftrackdef) || die ("Error opening track file $ftrackdef ($!)\n");
71 my %trackOrd;
72 my %trackCol; #track colors
73 my $tord=0;
74 my @tlines;
75 while (<TRDEF>) {
76 next if m/^\s*#/;
77 push(@tlines, $_);
78 chomp;
79 $tord++;
80 my @t=split;
81 $trackOrd{$t[0]}=$tord;
82 $trackCol{$t[0]}=$t[1];
83 }
84 close(TRDEF);
85 unless ($notrackout) {
86 #unless (-s "$outdir/$outbase.tracks") {
87 open(TRW, ">$outdir/$outbase.tracks") || die("Error creating file $outdir/$outbase.tracks");
88 print TRW join('',@tlines);
89 close(TRW);
90 # }
91 }
92 #--- write the sequence file:
93 open(FSEQ, $fasta) || die("Error opening $fasta\n");
94
95 open(GSEQ, ">$outdir/$outbase.seq") || die("Error creating file $outdir/$outbase.seq");
96 my $gseqlen=0;
97 my $gseqid;
98 while (<FSEQ>) {
99 next if m/^\s*#/;
100 if (m/^>(\S+)/) {
101 $gseqid=$1;
102 if (!$pjinfo && m/^>\S+\s+(.+)/) {
103 $pjinfo=$1;
104 }
105 next;
106 }
107 chomp;
108 s/[\r\s]+$//s;
109 $gseqlen+=length($_);
110 print GSEQ $_;
111 }
112 close(GSEQ);
113 close(FSEQ);
114
115 #-------------
116
117
118 # 0 1 2 3 4 5 6 7 8
119 my %map; # mapID => [ $track, $strand, $start, $end, $fname, $descr, $subf_name, $subf_listref, $cluster ]
120
121 my $mcount=0;
122 # 0 1 2 3 4
123 my @mlist; # list of [ $mapID, $tracknum, $start, $end, $stackpos ];
124 my $gdescr;
125 my $firstfeat;
126
127 while (<>) {
128 next if m/^\s*#/;
129 chomp;
130 my ($chr, $ftrack, $f, $fstart, $fend, $fscore, $strand, $frame, $attrs)=split(/\t/);
131 next unless $attrs;
132 $f='exon' if ($f =~m/\-exon$/);
133 my ($fid)=($attrs=~m/ID\s*=\s*"?([^;"]+)/);
134 my ($fp)=($attrs=~m/Parent\s*=\s*"?([^;"]+)/);
135 next unless $fid || $fp;
136 ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
137 if ($fend>$gseqlen) {
138 die("Error: coordinate of feature $fid ($fend) is beyond the length of genomic sequence ($gseqlen)\n");
139 }
140 my $tracknum=$trackOrd{$ftrack};
141 unless ($tracknum) {
142 print STDERR "Warning: skipping feature $fid with unknown track '$ftrack'\n";
143 next;
144 }
145 my $m=$map{$fid};
146 ##----------- child
147 if ($fp) { #child feature
148 my $pm=$map{$fp};
149 if (!$pm || ($pm->[6] && $pm->[6] ne $f)) {
150 print STDERR "Warning: subfeature $f ($fstart-$fend) of $fp discarded.\n";
151 next;
152 }
153 $pm->[6]=$f;
154 push (@{$pm->[7]}, [$fstart, $fend, $fscore]);
155 }
156 ##----------- parent
157 else { # 1st level parent feature
158 if ($m) { #existing parent feature.. duplicates not allowed
159 print STDERR "Warning: skipping repeat of parent feature $fid (why non-unique?)\n";
160 next;
161 }
162 my $gdescr;
163 # try to parse the description/annotation, if any
164 if ($attrs=~m/Info\s*=\s*"?([^;"]+)/i) {
165 $gdescr=$1;
166 }
167 elsif ($attrs=~m/Descr?\s*=\s*"?([^;"]+)/i) {
168 $gdescr=$1;
169 }
170 elsif ($attrs=~m/TopHit\s*=\s*"?([^;"]+)/i) {
171 $gdescr=$1;
172 }
173 elsif ($attrs=~m/BestHit\s*=\s*"?([^;"]+)/i) {
174 $gdescr=$1;
175 }
176 elsif ($attrs=~m/Name\s*=\s*"?([^;"]+)/) {
177 $gdescr=$1;
178 }
179 if ($gdescr) {
180 $gdescr=~s/^\s+//;$gdescr=~s/\s+$//;
181 $gdescr='' if index(lc($fid),lc($gdescr))>=0;
182 }
183
184 my $fullID = ($useNodeID) ? $fid.'|nid:'.$chr : $fid;
185 # 0 1 2 3 4 5 6 7 8 9 10 11
186 $map{$fid}= [ $ftrack, $strand, $fstart, $fend, $f, $gdescr, '', [], [$fid], 0, $fscore, $fullID ];
187
188 # 0 1 2 3 4
189 push(@mlist, [ $fid, $tracknum, $fstart, $fend, 0 ]);
190 $mcount++;
191 } #parent feature
192 } #while <GFF>
193
194 # sort mlist by fstart coordinate
195 #print STDERR "Sorting features by start coordinate..\n";
196
197 @mlist=sort { $main::a->[2]<=>$main::b->[2] } @mlist;
198
199 #---- cluster mappings by overlap
200 #print STDERR "Clustering features by virtual overlap..\n";
201
202 my %clusters; #cl => [mapID1, mapID2, ...]
203 for (my $a=0;$a<$mcount; $a++) {
204 for (my $b=$a+1;$b<$mcount;$b++) {
205 my ($ma, $mb)=($map{$mlist[$a]->[0]}, $map{$mlist[$b]->[0]});
206 my ($macl, $mbcl)=($ma->[8], $mb->[8]);
207 last if ($$mb[2] > $$ma[3]+$sepDist);
208 next if ($mbcl==$macl);
209 if (hasOverlap($$ma[2], $$ma[3], $$mb[2], $$mb[3])) {
210 if ($macl && $mbcl) {
211 my $mergecl=mergeClusters($macl, $mbcl);
212 }
213 }
214 }
215 }
216 my $ncl=keys(%clusters);
217 #print STDERR "Computing stack ordering for each of the $ncl clusters..\n";
218
219 my $clnum=0;
220 foreach my $cl (values(%clusters)) { # for each cluster
221 # 0 1 2 3 4
222 # id trackNo startpos endpos stackOrd
223 my @clist = map( [$_, $trackOrd{$map{$_}->[0]}, $map{$_}->[2],$map{$_}->[3], 0, ], @$cl);
224 # debugShowCl(\@clist, $clnum);
225 # by track priority AND by start coordinate
226 @clist = sort byTrackOrd @clist;
227 my @rows; #$list[i] = [ list of [start, end]] - feature coordinates placed on the same row
228 # array of maximum (rightmost) end and minimum (leftmost) start coord for each stacklevel
229 for (my $i=0;$i<@clist;$i++) {
230 #my $curtrack=$clist[$i]->[1];
231 # we can reset @rows on track change..
232 # and set stackOrd to $lastTrackStack + $r
233 my $iseq=$clist[$i];
234 #search back for possible room
235 my $placed;
236 #print STDOUT join("\t",$$iseq[0], $curtrack, $$iseq[2])."\n";
237 #reserve rows 0 and 1 for trackNo 0 and 1
238 #reserve row 0 for track 0 (sequence gaps)
239 #my $numlocked
240 my $rstart=0;
241 if ($resTracks) {
242 $rstart= $$iseq[1]>$resTracks ? $resTracks : ($$iseq[1]>0 ? $$iseq[1]-1 : 0);
243 }
244 #my $rstart=$$iseq[1]>2 ? 2 : ($$iseq[1]==2 ? 1 : 0);
245 for (my $res=0;$res<$resTracks;$res++) { push(@rows,[]); } #reserve tracks if requested
246 for (my $j=$rstart;$j<@rows;$j++) {
247 if (canPlace($iseq, $rows[$j])) {
248 $$iseq[4]=$j;
249 $placed=1;
250 last;
251 }
252 }# for each row
253 unless ($placed) {# no more room on previous rows, create a new row
254 $$iseq[4]=scalar(@rows);
255 push(@rows, [[$$iseq[2], $$iseq[3]]]);
256 }
257 #----------------
258 #-- stack order determined for this mapping, store it!
259 $map{$$iseq[0]}->[9]=$$iseq[4];
260 } #for each gff record
261 $clnum++;
262 @clist=();
263 }
264
265
266 #----------------
267
268 my $maxstackpos=1;
269 my %gifCol;
270 open(IFA, ">$outdir/$outbase.ifa") || die ("Error creating $outdir/$outbase.ifa!\n");
271 my $gifmap=new GD::Image($imgW,30);
272 my $gifbkg=$gifmap->colorAllocate(unpackRGB('EAEAEA'));
273 $gifmap->transparent($gifbkg);
274 foreach my $t (keys(%trackCol)) {
275 my $cval=$trackCol{$t};
276 $gifCol{$t}=$gifmap->colorAllocate(unpackRGB($cval));
277 }
278 my $iW=$imgW-2;
279 foreach my $l (@mlist) {
280 my $m=$map{$$l[0]};
281 # 0 1 2 3 4 5 6 7 8 9 10 11
282 my ($track, $strand, $start, $end, $f, $descr, $subfname, $subf, $cl, $stackpos, $score, $wID)= @$m;
283 if (@$subf==0) {
284 $subf=[[$start, $end, $score]];
285 $subfname=$f;
286 }
287 my @segs=sort { $main::a->[0] <=> $main::b->[0] } @$subf;
288 my @wsegs = map { $_->[0].'-'.$_->[1].
289 (length($_->[2])>1 ? ':'.substr($_->[2],0,4) : '') } @segs;
290 $stackpos++;
291 my $tc=$trackCol{$track};
292 my $ilen=sprintf('%d',((($end-$start)*$iW)/$gseqlen)); #length of this mapping in the minimap
293 #$ilen=1 if ($ilen<1);
294 my $ipos=sprintf('%d',(($start*$iW)/$gseqlen)+1);
295 my $ypos=$stackpos+1;
296 $gifmap->line($ipos, $ypos, $ipos+$ilen, $ypos, $gifCol{$track});
297 #print IFA ">$$l[0] $start $end $strand $track $stackpos $f\n";
298 print IFA ">$wID $start $end $strand $track $stackpos $f\n";
299 print IFA "$subfname|".join(' ',@wsegs)."\n";
300 if ($descr) {
301 print IFA 'i:'.$descr."\n";
302 # if ($srchFile) {
303 # print SRCH ">$wID $descr\n";
304 # print SRCH "$track $strand $start $end"
305 # }
306 }
307 $maxstackpos=$stackpos if ($stackpos>$maxstackpos);
308 }
309
310 close(IFA);
311 open(GIFMAP,">$outdir/$outbase.gif")
312 || die ("Error creating $outdir/$outbase.gif !\n");
313 binmode GIFMAP;
314 {
315 my $gifimg=$gifmap->gif();
316 print GIFMAP $gifimg;
317 }
318 close(GIFMAP);
319 #print STDERR "iit_store into $outdir/$outbase.iit..\n";
320 system("iit_store -o $outdir/$outbase.iit < $outdir/$outbase.ifa");
321
322 open(DTA, ">$outdir/$outbase.dta") || die ("Error creating file $outdir/$outbase.dta\n");
323 print DTA join("\t",$gseqid, $gseqlen, $maxstackpos, $pjinfo)."\n";
324 close(DTA);
325 #print STDERR "Done.\n";
326
327 #=============================================================
328
329
330 sub debugShowCl {
331 #print STDERR ">Cluster".$_[1]."\n";
332 foreach my $cd (@{$_[0]}) {
333 my @d=@$cd;
334 my $id=shift(@d);
335 #print STDERR join(' ',sprintf('%40s',$id),(map(sprintf('%9s',$_), @d)))."\n";
336 }
337 }
338
339 sub mergeClusters {
340 my ($macl, $mbcl)=@_;
341 delete $clusters{$macl};
342 delete $clusters{$mbcl};
343 my $newcl=[@$macl, @$mbcl];
344 $clusters{$newcl}=$newcl;
345 foreach my $id (@$newcl) {
346 $map{$id}->[8]=$newcl;
347 }
348 }
349
350 sub byTrackOrd { #sort by track priority AND by start coordinate
351 if ($main::a->[1]==$main::b->[1]) {
352 return $main::a->[2]==$main::b->[2] ? $main::b->[3]-$main::a->[3] :
353 $main::a->[2]-$main::b->[2]
354 }
355 else {
356 return $main::a->[1]-$main::b->[1];
357 }
358 }
359
360 sub printRow {
361 my ($r)=@_;
362 my @p;
363 foreach my $s (@$r) { push(@p,"[$$s[0]-$$s[1]]") }
364 return join(' ',@p);
365 }
366
367 sub canPlace {
368 my ($g, $r)=@_; #gene feature, row data
369 #$r is a ref to a list of [start, end] coordinates, sorted by $start
370 if (@$r==0 || $$g[2]-$sepDist > $$r[-1]->[1]) {
371 #room to the right
372 push(@$r, [$$g[2], $$g[3]]);
373 return 1
374 }
375 if ($$g[3]+$sepDist < $$r[0]->[0]) {
376 #room to the left
377 unshift(@$r, [$$g[2], $$g[3]] );
378 return 1
379 }
380 my $rmax=@$r-1;
381 for (my $i=0;$i<$rmax;$i++) {
382 #check for room between this and next gf
383 next unless $$g[2]>$$r[$i]->[0];
384 if ( $$g[2]-$sepDist > $$r[$i]->[1] &&
385 $$g[3]+$sepDist < $$r[$i+1]->[0]) {
386 splice(@$r, $i+1, 0, [$$g[2], $$g[3]]); #insert interval in this array
387 return 1
388 }
389 last if $$r[$i+1]->[0]>$$g[3];
390 }#for each inter-gf space
391 return 0;
392 }
393
394
395 sub hasOverlap {
396 my ($a1, $a2, $b1, $b2)=@_;
397 return ($a1<$b1) ? ($b1-$a2<=$sepDist) : ($a1-$b2<=$sepDist);
398 }
399
400
401 sub unpackRGB {
402 my $v=hex($_[0]);
403 return ( (($v & 0xFF0000) >> 16),
404 (($v & 0x00FF00) >> 8),
405 ($v & 0x0000FF) );
406 }
407
408
409 sub writeModels {
410 foreach my $l (@mlist) {
411 my $ml=$map{$$l[0]} || die ("Error: mapping $$l[0] not in hash!?\n");
412 my ($tag, $trck, $er, $cr, $descr) = @$ml;
413 my $wID=$$ml[11];
414 my ($mstart, $mend, @ex, @cds);
415 if (@$er>0) {
416 @ex= sort { $main::a->[0] <=> $main::b->[0] } @$er;
417 ($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]);
418 }
419 if (@$cr>0) {
420 @cds= sort { $main::a->[0] <=> $main::b->[0] } @$cr;
421 ($mstart, $mend) = ($cds[0]->[0], $cds[-1]->[1]) unless $mstart;
422 }
423
424 #print IFA ">$l $mstart $mend $tag";
425 print IFA ">$wID $mstart $mend $tag";
426 print IFA " $trck" if $trck;
427 print IFA "\n";
428 if (@ex) {
429 my @wlst = map { $_->[0].'-'.$_->[1].':'.$_->[2] } @ex;
430 print IFA join(',',@wlst)."\n";
431 }
432 if (@cds) {
433 my @wlst = map { $_->[0].'-'.$_->[1].':'.$_->[2] } @cds;
434 print IFA 'C:'.join(',',@wlst)."\n";
435 }
436 if ($descr) {
437 print IFA 'i:'.$descr."\n";
438 }
439 } #for each stored locus
440 }

Properties

Name Value
svn:executable *