ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gff2iit
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 2 months ago) by gpertea
File size: 12573 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     use strict;
3     use Getopt::Std;
4    
5     my $usage = q/Usage:
6     gff2itt [-o <output_name>] [-STIQGBKNCPL] [-t <track_label>] [-c <CDS.gtf>] <input_gff>
7    
8     Options:
9     -X generic GFF storage using new iit_store chr: format
10     -T do not include the track tag in the output
11     -I same as -T, but append the track label to the info line (i:...)
12     -K add the track as a t: line for each record
13     -N add the gene name, if found, as a g: line
14     -C format the record ID as t|<transcript>|g|<gene> for cdbfasta indexing
15     -Q do not include query information data in the output
16     -G the input is GFF annotation instead of scored mappings (implies -Q,
17     -C, -N and uses the new iit_store with -F option)
18     -B the genomic sequence tag will not include the strand character
19     (so intervals on both strands will be returned by a iit_get query
20     and a 's:' line will be added instead)
21     -A write any other GFF\/GTF attributes as a comma delimited list of
22     <attrname>=<value> pairs in the i: line
23     -P skip pseudo* features (e.g. pseudogenes, pseudoRNA etc)
24     -L skip 'locus' features (e.g. those created by annbygff)
25     -c the provided <CDS.gtf> file contains valid CDS for some of entries in
26     <input_gff>, so a C: line will be added for those
27    
28     If query coordonate info is found in the gff file, a line with query
29     alignment coordinates for each exon will be added:
30    
31     q:[<qrylen>]:<qexon1start>-<qexon1end>,<qexon2start>-<qexon2end>,...
32     (<qrylen> may not be available)
33     x:<coord1-coord2>,...
34    
35     The "x:..." line is added if the query coordinates show an internal gap (possible
36     exon alignments missing).
37     /;
38     umask 0002;
39     getopts('ATIQBGKNCXPLt:g:o:c:') || die($usage."\n");
40     my $cdbformat=$Getopt::Std::opt_C;
41     my $gffAttrs=$Getopt::Std::opt_A;
42     my $trackline=$Getopt::Std::opt_K;
43     my $geneline=$Getopt::Std::opt_N;
44     my $outfile=$Getopt::Std::opt_o;
45     my $plainGff=$Getopt::Std::opt_G;
46     my $newDivFmt=$Getopt::Std::opt_X;;
47     $plainGff=1 if $newDivFmt;
48     my $track=$Getopt::Std::opt_t;
49     my $skiploci=$Getopt::Std::opt_L;
50     my $nopseudo=$Getopt::Std::opt_P;
51     my $noTrack=$Getopt::Std::opt_T || $Getopt::Std::opt_I || $trackline;
52     my $track2info=$Getopt::Std::opt_I;
53     my $noStrand=$Getopt::Std::opt_B;
54     my $noQdata = $plainGff || $Getopt::Std::opt_Q;
55    
56     if ($plainGff) {
57     #$cdbformat=1; # use -C to enable this
58     $trackline=1 unless $noTrack;
59     $noTrack=1;
60     $geneline=1;
61     $noStrand=1;
62     #$skiploci=1;
63     }
64    
65     my $cdsfile=$Getopt::Std::opt_c;
66     #my $gene_type=$Getopt::Std::opt_g || 'mRNA';
67    
68     my $infile = shift(@ARGV) || die("$usage\nNo input file/stream provided!\n");
69    
70     my $inh;
71     my %vcds; # transcript_id => [ cds1, cds2, cds3 ... ]
72     if ($cdsfile) {
73     open(CDSF, $cdsfile) || die ("Error opening $cdsfile!\n");
74     while (<CDSF>) {
75     next if m/^\s*#/;
76     chomp;
77     my ($chr, $v, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
78     next unless $f eq 'CDS';
79     my ($tid)=($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/);
80     die ("Error parsing transcript_id for CDS!\n$_\n") unless $tid;
81     $tid=~tr/"//d; #"
82     ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
83     push(@{$vcds{$tid}}, [$fstart, $fend, $fscore]);
84     }
85     close(CDSF);
86     }
87    
88    
89     if ($infile eq '-') {
90     $inh=\*STDIN;
91     $outfile='gff2iit_stdin' unless $outfile;
92     }
93     else {
94     open(INFILE, $infile) || die("Error opening input file $infile!\n");
95     $inh=\*INFILE;
96     unless ($outfile) {
97     $outfile=$infile;
98     $outfile=~s/\.\w+$//; #remove extension if any
99     }
100     }
101    
102    
103     # 0 1 2 3 4 5 6 7 8 9 10 11 12 13
104     my %recs; # recID => [ tag, track, [@exons], [@cds], lstart, lend, $strand, $tdescr, $qlen, $mcov, $mpid, $gname, $ftype, $attrline ]
105     #
106     my @reclist; # just a plain list of record IDs, in the input order
107     my $curtag; #chromosome and strand for current model
108    
109     my @exd; #exons for current model
110     my @cds; #cds segments if -C flag was used
111     my ($qlen, $hasQregs, $qxgap);
112     open(IFA, ">$outfile.ifa")
113     || die("Error creating file $outfile.ifa ($!)!\n");
114     while (<$inh>) {
115     next if m/^\s*#/;
116     chomp;
117     my ($chr, $v, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
118     my ($mcov, $mpid, $gname, $tdescr);
119     $mcov='89.89' unless $plainGff;
120     my $curtrack=$track || $v;
121     next unless $fstart>1 && $lnum;
122     next if $f eq 'gene'; # Warning: skipping any 'gene' features, unconditionally
123     next if $skiploci && $f eq 'locus';
124     next if $nopseudo && $f=~m/^pseudo/i;
125     my $gff3_ID;
126     my $gff3_Parent;
127     ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
128     ($gff3_ID)=($lnum=~m/\bID=([^;]+)/);
129     ($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/);
130     my $isGff3=($gff3_ID || $gff3_Parent);
131     #($tdescr, $gname, $mcov, $mpid)=('','','',0);
132     if ($isGff3) { # GFF format
133     $gff3_ID=~tr/"//d; #"
134     $gff3_Parent=~tr/"//d; #"
135     $gff3_Parent='' if ($f eq 'mRNA');
136     if ($gff3_ID && !$gff3_Parent) { #top level feature starting
137     if ($f=~m/RNA/i) {
138     # try to parse the description, if any
139     if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) {
140     $tdescr=$1;
141     }
142     elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
143     $tdescr=$1;
144     }
145     if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) {
146     $gname=$1;
147     }
148     elsif ($lnum=~m/gene\s*=\s*"?([^;"]+)/) {
149     $gname=$1;
150     }
151     $tdescr='' if ($tdescr eq $gname);
152     if ($lnum=~m/\bcov=([\.\d]+)/i) {
153     $mcov=$1;
154     }
155     if ($lnum=~m/\b(?:pid|Identity|Similarity)=([\.\d]+)/i) {
156     $mpid=$1;
157     }
158     unless ($mpid) {
159     $mpid = ($fscore>40 && $fscore<=100) ? $fscore : '89.89';
160     }
161     $qlen='';
162     $hasQregs=0;
163     $qxgap='';
164     if ($lnum=~m/qxgap=([\-\, \d]+)/) {
165     $qxgap=$1;
166     }
167     if ($lnum=~m/\bqreg=\d+\-\d+\|(\d+)/i) {
168     $qlen=$1;
169     }
170     elsif ($lnum=~m/\bqlen=(\d+)/i) {
171     $qlen=$1;
172     }
173     }
174     if ($f eq 'locus' && $lnum=~m/transcripts=([^;]+)/) {
175     $tdescr.=' ' if $tdescr;
176     $tdescr.='['.$1.']';
177     }
178     my $recID=$chr."\x80".$gff3_ID;
179     die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs{$recID}));
180     $curtag=$chr;
181     $curtag.=$strand unless $noStrand;
182     $curtrack=$track || $v;
183     push(@reclist, $recID);
184     my $attrline=parseAttrs($lnum, 1) if $gffAttrs;
185     $recs{$recID} = [$curtag, $curtrack, [], [] , $fstart, $fend, $strand, $tdescr, $qlen, $mcov, $mpid, $gname, $f, $attrline];
186     #($tdescr, $gname, $mcov, $mpid)=('','','',0);
187     next;
188     } # parent/top-level feature
189     } #GFF3 format
190     else { #GTF format
191     next if ($f eq 'transcript'); #exception: GTF with parent 'transcript' feature
192     }
193     #my ($gname)=($descr=~m/Name=(\w+)/);
194     #---------------------------
195     # -------------- exon/CDS line here:
196     my $recID;
197     my ($qstart, $qend);
198     unless ($noQdata) {
199     if ($lnum=~m/\bQ[regmatch]+=(\d+)[\- ]+(\d+)/i) {
200     ($qstart, $qend)=($1,$2);
201     ($qstart,$qend)=($qend,$qstart) if $qend<$qstart;
202     $hasQregs++;
203     }
204     }
205     if ($v=~/^jigsaw/ && $lnum=~m/^\d+$/) {
206     $recID=$chr.'.jsm.'.$lnum;
207     $curtrack='jigsaw' unless $track;
208     }
209     elsif ($lnum=~m/Parent=(['"\:\w\|\-\.]+)/) {
210     $recID=$1;
211     $recID=~tr/"//d; #"
212     }
213     elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) {
214     $recID=$1;
215     $recID=~tr/"//d; #"
216     }
217     else {
218     die("Error: cannot parse locus/transcript name from input line:\n$_\n");
219     }
220     if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) {
221     #$recID=$1;
222     #$recID=~tr/"//d; #"
223     $gname=$1;
224     $gname=~tr/"//d; #"
225     }
226     $tdescr='' if index($recID, $tdescr)>=0;
227     $curtag=$chr;
228     $curtag.=$strand unless $noStrand;
229     $recID=$chr."\x80".$recID;
230     my $ld = $recs{$recID};
231     $fscore=$mpid if ($fscore eq '.');
232     if ($ld) { #existing entry
233     my $i=($f eq 'CDS') ? 3 : 2;
234     my ($lstart, $lend)=($$ld[4], $$ld[5]);
235     $$ld[4]=$fstart if $fstart<$lstart;
236     $$ld[5]=$fend if $fend>$lend;
237     push(@{$$ld[$i]}, [$fstart, $fend, $fscore, $qstart, $qend]);
238     }
239     else { # first time seeing this locus/gene
240     push(@reclist, $recID);
241     my $attrline=parseAttrs($lnum, $isGff3) if $gffAttrs;
242     $recs{$recID} = ($f eq 'CDS') ?
243     [$curtag, $curtrack, [], [[$fstart, $fend, $fscore, $qstart, $qend]] , $fstart, $fend, $strand, $tdescr,
244     $qlen, $mcov, $mpid, $gname, $f, $attrline ] :
245     [$curtag, $curtrack, [[$fstart, $fend, $fscore, $qstart, $qend]], [ ], $fstart, $fend, $strand, $tdescr,
246     $qlen, $mcov, $mpid, $gname, $f, $attrline ] ;
247     }
248     } #while <>
249    
250     writeModels();
251    
252     close(INFILE) unless $infile eq '-';
253     close(IFA);
254     system("iit_store -o $outfile.iit < $outfile.ifa");
255    
256    
257     #=============================================================
258    
259     sub writeModels {
260     foreach my $l (@reclist) {
261     my $ld=$recs{$l} || die ("Error: locus $l found in list but not in hash!\n");
262     my ($a,$b)=split(/\x80/,$l);
263     $l=$b if $b; # was formatted as chr[0x80]ID, so we peel off chr prefix
264     # 0 1 2 3 4 5 6 7 8 9 10 11
265     my ($tag, $trck, $er, $cr, $lstart, $lend, $strnd, $descr, $ql, $qcov, $qpid, $gn,
266     $ftype, $attrline) = @$ld;
267     my ($mstart,$mend)=($lstart, $lend);
268     my (@ex, @cds);
269     if (@$er>0) {
270     @ex= sort { $main::a->[0] <=> $main::b->[0] } @$er;
271     ($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]);
272     }
273    
274     if (@$cr==0 && $cdsfile) {
275     $cr=$vcds{$l};
276     }
277     if ($cr && @$cr>0) {
278     @cds= sort { $main::a->[0] <=> $main::b->[0] } @$cr;
279     ($mstart, $mend) = ($cds[0]->[0], $cds[-1]->[1]) unless $mstart;
280     }
281     my $intv_tag="$mstart $mend $tag";
282     if ($newDivFmt) {
283     my $chr=$tag;
284     $chr=~s/[\-\+]$//;
285     $intv_tag="$chr:$mstart..$mend $strnd";
286     }
287     if ($cdbformat) {
288     print IFA ">t|$l";
289     print IFA "|g|$gn" if $gn;
290     print IFA " $intv_tag";
291     }
292     else {
293     print IFA ">$l $intv_tag";
294     }
295     print IFA " $trck" unless $noTrack;
296     print IFA "\n";
297     my @qregs;
298     if (@ex) {
299     my @wlst;
300     if ($plainGff) {
301     @wlst = map { $_->[0].'-'.$_->[1] } @ex;
302     }
303     else {
304     @wlst = map { $_->[0].'-'.$_->[1].':'.$_->[2] } @ex;
305     }
306     print IFA join(',',@wlst)."\n";
307     if ($hasQregs) {
308     @qregs = map { [$_->[3], $_->[4], $_->[0], $_->[1] ] } @ex;
309     }
310     }
311     if ($track2info) {
312     $descr=$descr ? $descr.' '.$trck : $trck;
313     $descr='' if $descr eq ' ';
314     }
315     if (@cds) {
316     my @wlst;
317     if ($plainGff) {
318     @wlst = map { $_->[0].'-'.$_->[1] } @cds;
319     }
320     else {
321     @wlst = map { $_->[0].'-'.$_->[1].':'.$_->[2] } @cds;
322     }
323     print IFA 'C:'.join(',',@wlst)."\n";
324     if ($hasQregs && (@qregs==0)) {
325     @qregs = map { [$_->[3], $_->[4], $_->[0], $_->[1] ] } @cds;
326     }
327     }
328     if ($noStrand && !$newDivFmt) {
329     print IFA 's:'.$strnd."\n";
330     }
331     if ($ftype && $geneline) {
332     print IFA 'f:'.$ftype."\n" unless $ftype eq 'exon' || $ftype eq 'CDS';
333     }
334     if ($trackline) {
335     print IFA 't:'.$trck."\n";
336     }
337     if ($gffAttrs && $attrline) {
338     print IFA 'i:'.$attrline."\n";
339     }
340     else {
341     if ($gn && $geneline) {
342     print IFA 'g:'.$gn."\n";
343     }
344     if ($descr) {
345     print IFA 'i:'.$descr."\n";
346     }
347     }
348     if ($qcov) {
349     print IFA 'v:'.$qcov."\n";
350     }
351     next if ($noQdata);
352     if ($hasQregs) {
353     print IFA 'q:'.$ql.':'.join(',', ( map { $_->[0].'-'.$_->[1] } @qregs ))."\n";
354     my @xgaps;
355     for (my $i=1;$i<@qregs;$i++) {
356     #check for gaps between query exons
357     my ($qstart, $qend)= ($qregs[$i-1]->[0]>$qregs[$i]->[0]) ?
358     ($qregs[$i]->[1], $qregs[$i-1]->[0]) : ($qregs[$i-1]->[1], $qregs[$i]->[0]);
359     push(@xgaps,$qregs[$i-1]->[3].'-'.$qregs[$i]->[2]) if ($qend-$qstart>4);
360     }
361     print IFA 'x:'.join(',',@xgaps)."\n" if (@xgaps>0);
362     }
363     elsif ($qxgap) {
364     print IFA 'x:'.$qxgap."\n";
365     }
366     } #for each stored locus
367     }
368    
369     sub parseAttrs {
370     my ($l, $isGff3)=@_;
371     my $attrline;
372     $l=~s/(Parent|ID|transcript_id|gene_id|exon|exon_number)[= ]+[^;]+;?//g;
373     $l=~s/^[\s\;]+//;
374     $l=~s/; /;/g;
375     $l=~s/[\s\;]+$//;
376     if (!$isGff3) {
377     $l=~s/\s+\"([^"]+)\"/=$1/g;
378     }
379     return $l;
380     }

Properties

Name Value
svn:executable *