1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use Getopt::Std; |
4 |
|
5 |
my $usage = q/Usage: |
6 |
gff2itt [-o <output_name>] [-STIQGBKNCPL] [-t <track_label>] [-c <CDS.gtf>] <input_gff> |
7 |
|
8 |
Options: |
9 |
-X generic GFF storage using new iit_store chr: format |
10 |
-T do not include the track tag in the output |
11 |
-I same as -T, but append the track label to the info line (i:...) |
12 |
-K add the track as a t: line for each record |
13 |
-N add the gene name, if found, as a g: line |
14 |
-C format the record ID as t|<transcript>|g|<gene> for cdbfasta indexing |
15 |
-Q do not include query information data in the output |
16 |
-G the input is GFF annotation instead of scored mappings (implies -Q, |
17 |
-C, -N and uses the new iit_store with -F option) |
18 |
-B the genomic sequence tag will not include the strand character |
19 |
(so intervals on both strands will be returned by a iit_get query |
20 |
and a 's:' line will be added instead) |
21 |
-A write any other GFF\/GTF attributes as a comma delimited list of |
22 |
<attrname>=<value> pairs in the i: line |
23 |
-P skip pseudo* features (e.g. pseudogenes, pseudoRNA etc) |
24 |
-L skip 'locus' features (e.g. those created by annbygff) |
25 |
-c the provided <CDS.gtf> file contains valid CDS for some of entries in |
26 |
<input_gff>, so a C: line will be added for those |
27 |
|
28 |
If query coordonate info is found in the gff file, a line with query |
29 |
alignment coordinates for each exon will be added: |
30 |
|
31 |
q:[<qrylen>]:<qexon1start>-<qexon1end>,<qexon2start>-<qexon2end>,... |
32 |
(<qrylen> may not be available) |
33 |
x:<coord1-coord2>,... |
34 |
|
35 |
The "x:..." line is added if the query coordinates show an internal gap (possible |
36 |
exon alignments missing). |
37 |
/; |
38 |
umask 0002; |
39 |
getopts('ATIQBGKNCXPLt:g:o:c:') || die($usage."\n"); |
40 |
my $cdbformat=$Getopt::Std::opt_C; |
41 |
my $gffAttrs=$Getopt::Std::opt_A; |
42 |
my $trackline=$Getopt::Std::opt_K; |
43 |
my $geneline=$Getopt::Std::opt_N; |
44 |
my $outfile=$Getopt::Std::opt_o; |
45 |
my $plainGff=$Getopt::Std::opt_G; |
46 |
my $newDivFmt=$Getopt::Std::opt_X;; |
47 |
$plainGff=1 if $newDivFmt; |
48 |
my $track=$Getopt::Std::opt_t; |
49 |
my $skiploci=$Getopt::Std::opt_L; |
50 |
my $nopseudo=$Getopt::Std::opt_P; |
51 |
my $noTrack=$Getopt::Std::opt_T || $Getopt::Std::opt_I || $trackline; |
52 |
my $track2info=$Getopt::Std::opt_I; |
53 |
my $noStrand=$Getopt::Std::opt_B; |
54 |
my $noQdata = $plainGff || $Getopt::Std::opt_Q; |
55 |
|
56 |
if ($plainGff) { |
57 |
#$cdbformat=1; # use -C to enable this |
58 |
$trackline=1 unless $noTrack; |
59 |
$noTrack=1; |
60 |
$geneline=1; |
61 |
$noStrand=1; |
62 |
#$skiploci=1; |
63 |
} |
64 |
|
65 |
my $cdsfile=$Getopt::Std::opt_c; |
66 |
#my $gene_type=$Getopt::Std::opt_g || 'mRNA'; |
67 |
|
68 |
my $infile = shift(@ARGV) || die("$usage\nNo input file/stream provided!\n"); |
69 |
|
70 |
my $inh; |
71 |
my %vcds; # transcript_id => [ cds1, cds2, cds3 ... ] |
72 |
if ($cdsfile) { |
73 |
open(CDSF, $cdsfile) || die ("Error opening $cdsfile!\n"); |
74 |
while (<CDSF>) { |
75 |
next if m/^\s*#/; |
76 |
chomp; |
77 |
my ($chr, $v, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/); |
78 |
next unless $f eq 'CDS'; |
79 |
my ($tid)=($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/); |
80 |
die ("Error parsing transcript_id for CDS!\n$_\n") unless $tid; |
81 |
$tid=~tr/"//d; #" |
82 |
($fstart, $fend)=($fend, $fstart) if $fend<$fstart; |
83 |
push(@{$vcds{$tid}}, [$fstart, $fend, $fscore]); |
84 |
} |
85 |
close(CDSF); |
86 |
} |
87 |
|
88 |
|
89 |
if ($infile eq '-') { |
90 |
$inh=\*STDIN; |
91 |
$outfile='gff2iit_stdin' unless $outfile; |
92 |
} |
93 |
else { |
94 |
open(INFILE, $infile) || die("Error opening input file $infile!\n"); |
95 |
$inh=\*INFILE; |
96 |
unless ($outfile) { |
97 |
$outfile=$infile; |
98 |
$outfile=~s/\.\w+$//; #remove extension if any |
99 |
} |
100 |
} |
101 |
|
102 |
|
103 |
# 0 1 2 3 4 5 6 7 8 9 10 11 12 13 |
104 |
my %recs; # recID => [ tag, track, [@exons], [@cds], lstart, lend, $strand, $tdescr, $qlen, $mcov, $mpid, $gname, $ftype, $attrline ] |
105 |
# |
106 |
my @reclist; # just a plain list of record IDs, in the input order |
107 |
my $curtag; #chromosome and strand for current model |
108 |
|
109 |
my @exd; #exons for current model |
110 |
my @cds; #cds segments if -C flag was used |
111 |
my ($qlen, $hasQregs, $qxgap); |
112 |
open(IFA, ">$outfile.ifa") |
113 |
|| die("Error creating file $outfile.ifa ($!)!\n"); |
114 |
while (<$inh>) { |
115 |
next if m/^\s*#/; |
116 |
chomp; |
117 |
my ($chr, $v, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/); |
118 |
my ($mcov, $mpid, $gname, $tdescr); |
119 |
$mcov='89.89' unless $plainGff; |
120 |
my $curtrack=$track || $v; |
121 |
next unless $fstart>1 && $lnum; |
122 |
next if $f eq 'gene'; # Warning: skipping any 'gene' features, unconditionally |
123 |
next if $skiploci && $f eq 'locus'; |
124 |
next if $nopseudo && $f=~m/^pseudo/i; |
125 |
my $gff3_ID; |
126 |
my $gff3_Parent; |
127 |
($fstart, $fend)=($fend, $fstart) if $fend<$fstart; |
128 |
($gff3_ID)=($lnum=~m/\bID=([^;]+)/); |
129 |
($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/); |
130 |
my $isGff3=($gff3_ID || $gff3_Parent); |
131 |
#($tdescr, $gname, $mcov, $mpid)=('','','',0); |
132 |
if ($isGff3) { # GFF format |
133 |
$gff3_ID=~tr/"//d; #" |
134 |
$gff3_Parent=~tr/"//d; #" |
135 |
$gff3_Parent='' if ($f eq 'mRNA'); |
136 |
if ($gff3_ID && !$gff3_Parent) { #top level feature starting |
137 |
if ($f=~m/RNA/i) { |
138 |
# try to parse the description, if any |
139 |
if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) { |
140 |
$tdescr=$1; |
141 |
} |
142 |
elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) { |
143 |
$tdescr=$1; |
144 |
} |
145 |
if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) { |
146 |
$gname=$1; |
147 |
} |
148 |
elsif ($lnum=~m/gene\s*=\s*"?([^;"]+)/) { |
149 |
$gname=$1; |
150 |
} |
151 |
$tdescr='' if ($tdescr eq $gname); |
152 |
if ($lnum=~m/\bcov=([\.\d]+)/i) { |
153 |
$mcov=$1; |
154 |
} |
155 |
if ($lnum=~m/\b(?:pid|Identity|Similarity)=([\.\d]+)/i) { |
156 |
$mpid=$1; |
157 |
} |
158 |
unless ($mpid) { |
159 |
$mpid = ($fscore>40 && $fscore<=100) ? $fscore : '89.89'; |
160 |
} |
161 |
$qlen=''; |
162 |
$hasQregs=0; |
163 |
$qxgap=''; |
164 |
if ($lnum=~m/qxgap=([\-\, \d]+)/) { |
165 |
$qxgap=$1; |
166 |
} |
167 |
if ($lnum=~m/\bqreg=\d+\-\d+\|(\d+)/i) { |
168 |
$qlen=$1; |
169 |
} |
170 |
elsif ($lnum=~m/\bqlen=(\d+)/i) { |
171 |
$qlen=$1; |
172 |
} |
173 |
} |
174 |
if ($f eq 'locus' && $lnum=~m/transcripts=([^;]+)/) { |
175 |
$tdescr.=' ' if $tdescr; |
176 |
$tdescr.='['.$1.']'; |
177 |
} |
178 |
my $recID=$chr."\x80".$gff3_ID; |
179 |
die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs{$recID})); |
180 |
$curtag=$chr; |
181 |
$curtag.=$strand unless $noStrand; |
182 |
$curtrack=$track || $v; |
183 |
push(@reclist, $recID); |
184 |
my $attrline=parseAttrs($lnum, 1) if $gffAttrs; |
185 |
$recs{$recID} = [$curtag, $curtrack, [], [] , $fstart, $fend, $strand, $tdescr, $qlen, $mcov, $mpid, $gname, $f, $attrline]; |
186 |
#($tdescr, $gname, $mcov, $mpid)=('','','',0); |
187 |
next; |
188 |
} # parent/top-level feature |
189 |
} #GFF3 format |
190 |
else { #GTF format |
191 |
next if ($f eq 'transcript'); #exception: GTF with parent 'transcript' feature |
192 |
} |
193 |
#my ($gname)=($descr=~m/Name=(\w+)/); |
194 |
#--------------------------- |
195 |
# -------------- exon/CDS line here: |
196 |
my $recID; |
197 |
my ($qstart, $qend); |
198 |
unless ($noQdata) { |
199 |
if ($lnum=~m/\bQ[regmatch]+=(\d+)[\- ]+(\d+)/i) { |
200 |
($qstart, $qend)=($1,$2); |
201 |
($qstart,$qend)=($qend,$qstart) if $qend<$qstart; |
202 |
$hasQregs++; |
203 |
} |
204 |
} |
205 |
if ($v=~/^jigsaw/ && $lnum=~m/^\d+$/) { |
206 |
$recID=$chr.'.jsm.'.$lnum; |
207 |
$curtrack='jigsaw' unless $track; |
208 |
} |
209 |
elsif ($lnum=~m/Parent=(['"\:\w\|\-\.]+)/) { |
210 |
$recID=$1; |
211 |
$recID=~tr/"//d; #" |
212 |
} |
213 |
elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) { |
214 |
$recID=$1; |
215 |
$recID=~tr/"//d; #" |
216 |
} |
217 |
else { |
218 |
die("Error: cannot parse locus/transcript name from input line:\n$_\n"); |
219 |
} |
220 |
if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) { |
221 |
#$recID=$1; |
222 |
#$recID=~tr/"//d; #" |
223 |
$gname=$1; |
224 |
$gname=~tr/"//d; #" |
225 |
} |
226 |
$tdescr='' if index($recID, $tdescr)>=0; |
227 |
$curtag=$chr; |
228 |
$curtag.=$strand unless $noStrand; |
229 |
$recID=$chr."\x80".$recID; |
230 |
my $ld = $recs{$recID}; |
231 |
$fscore=$mpid if ($fscore eq '.'); |
232 |
if ($ld) { #existing entry |
233 |
my $i=($f eq 'CDS') ? 3 : 2; |
234 |
my ($lstart, $lend)=($$ld[4], $$ld[5]); |
235 |
$$ld[4]=$fstart if $fstart<$lstart; |
236 |
$$ld[5]=$fend if $fend>$lend; |
237 |
push(@{$$ld[$i]}, [$fstart, $fend, $fscore, $qstart, $qend]); |
238 |
} |
239 |
else { # first time seeing this locus/gene |
240 |
push(@reclist, $recID); |
241 |
my $attrline=parseAttrs($lnum, $isGff3) if $gffAttrs; |
242 |
$recs{$recID} = ($f eq 'CDS') ? |
243 |
[$curtag, $curtrack, [], [[$fstart, $fend, $fscore, $qstart, $qend]] , $fstart, $fend, $strand, $tdescr, |
244 |
$qlen, $mcov, $mpid, $gname, $f, $attrline ] : |
245 |
[$curtag, $curtrack, [[$fstart, $fend, $fscore, $qstart, $qend]], [ ], $fstart, $fend, $strand, $tdescr, |
246 |
$qlen, $mcov, $mpid, $gname, $f, $attrline ] ; |
247 |
} |
248 |
} #while <> |
249 |
|
250 |
writeModels(); |
251 |
|
252 |
close(INFILE) unless $infile eq '-'; |
253 |
close(IFA); |
254 |
system("iit_store -o $outfile.iit < $outfile.ifa"); |
255 |
|
256 |
|
257 |
#============================================================= |
258 |
|
259 |
sub writeModels { |
260 |
foreach my $l (@reclist) { |
261 |
my $ld=$recs{$l} || die ("Error: locus $l found in list but not in hash!\n"); |
262 |
my ($a,$b)=split(/\x80/,$l); |
263 |
$l=$b if $b; # was formatted as chr[0x80]ID, so we peel off chr prefix |
264 |
# 0 1 2 3 4 5 6 7 8 9 10 11 |
265 |
my ($tag, $trck, $er, $cr, $lstart, $lend, $strnd, $descr, $ql, $qcov, $qpid, $gn, |
266 |
$ftype, $attrline) = @$ld; |
267 |
my ($mstart,$mend)=($lstart, $lend); |
268 |
my (@ex, @cds); |
269 |
if (@$er>0) { |
270 |
@ex= sort { $main::a->[0] <=> $main::b->[0] } @$er; |
271 |
($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]); |
272 |
} |
273 |
|
274 |
if (@$cr==0 && $cdsfile) { |
275 |
$cr=$vcds{$l}; |
276 |
} |
277 |
if ($cr && @$cr>0) { |
278 |
@cds= sort { $main::a->[0] <=> $main::b->[0] } @$cr; |
279 |
($mstart, $mend) = ($cds[0]->[0], $cds[-1]->[1]) unless $mstart; |
280 |
} |
281 |
my $intv_tag="$mstart $mend $tag"; |
282 |
if ($newDivFmt) { |
283 |
my $chr=$tag; |
284 |
$chr=~s/[\-\+]$//; |
285 |
$intv_tag="$chr:$mstart..$mend $strnd"; |
286 |
} |
287 |
if ($cdbformat) { |
288 |
print IFA ">t|$l"; |
289 |
print IFA "|g|$gn" if $gn; |
290 |
print IFA " $intv_tag"; |
291 |
} |
292 |
else { |
293 |
print IFA ">$l $intv_tag"; |
294 |
} |
295 |
print IFA " $trck" unless $noTrack; |
296 |
print IFA "\n"; |
297 |
my @qregs; |
298 |
if (@ex) { |
299 |
my @wlst; |
300 |
if ($plainGff) { |
301 |
@wlst = map { $_->[0].'-'.$_->[1] } @ex; |
302 |
} |
303 |
else { |
304 |
@wlst = map { $_->[0].'-'.$_->[1].':'.$_->[2] } @ex; |
305 |
} |
306 |
print IFA join(',',@wlst)."\n"; |
307 |
if ($hasQregs) { |
308 |
@qregs = map { [$_->[3], $_->[4], $_->[0], $_->[1] ] } @ex; |
309 |
} |
310 |
} |
311 |
if ($track2info) { |
312 |
$descr=$descr ? $descr.' '.$trck : $trck; |
313 |
$descr='' if $descr eq ' '; |
314 |
} |
315 |
if (@cds) { |
316 |
my @wlst; |
317 |
if ($plainGff) { |
318 |
@wlst = map { $_->[0].'-'.$_->[1] } @cds; |
319 |
} |
320 |
else { |
321 |
@wlst = map { $_->[0].'-'.$_->[1].':'.$_->[2] } @cds; |
322 |
} |
323 |
print IFA 'C:'.join(',',@wlst)."\n"; |
324 |
if ($hasQregs && (@qregs==0)) { |
325 |
@qregs = map { [$_->[3], $_->[4], $_->[0], $_->[1] ] } @cds; |
326 |
} |
327 |
} |
328 |
if ($noStrand && !$newDivFmt) { |
329 |
print IFA 's:'.$strnd."\n"; |
330 |
} |
331 |
if ($ftype && $geneline) { |
332 |
print IFA 'f:'.$ftype."\n" unless $ftype eq 'exon' || $ftype eq 'CDS'; |
333 |
} |
334 |
if ($trackline) { |
335 |
print IFA 't:'.$trck."\n"; |
336 |
} |
337 |
if ($gffAttrs && $attrline) { |
338 |
print IFA 'i:'.$attrline."\n"; |
339 |
} |
340 |
else { |
341 |
if ($gn && $geneline) { |
342 |
print IFA 'g:'.$gn."\n"; |
343 |
} |
344 |
if ($descr) { |
345 |
print IFA 'i:'.$descr."\n"; |
346 |
} |
347 |
} |
348 |
if ($qcov) { |
349 |
print IFA 'v:'.$qcov."\n"; |
350 |
} |
351 |
next if ($noQdata); |
352 |
if ($hasQregs) { |
353 |
print IFA 'q:'.$ql.':'.join(',', ( map { $_->[0].'-'.$_->[1] } @qregs ))."\n"; |
354 |
my @xgaps; |
355 |
for (my $i=1;$i<@qregs;$i++) { |
356 |
#check for gaps between query exons |
357 |
my ($qstart, $qend)= ($qregs[$i-1]->[0]>$qregs[$i]->[0]) ? |
358 |
($qregs[$i]->[1], $qregs[$i-1]->[0]) : ($qregs[$i-1]->[1], $qregs[$i]->[0]); |
359 |
push(@xgaps,$qregs[$i-1]->[3].'-'.$qregs[$i]->[2]) if ($qend-$qstart>4); |
360 |
} |
361 |
print IFA 'x:'.join(',',@xgaps)."\n" if (@xgaps>0); |
362 |
} |
363 |
elsif ($qxgap) { |
364 |
print IFA 'x:'.$qxgap."\n"; |
365 |
} |
366 |
} #for each stored locus |
367 |
} |
368 |
|
369 |
sub parseAttrs { |
370 |
my ($l, $isGff3)=@_; |
371 |
my $attrline; |
372 |
$l=~s/(Parent|ID|transcript_id|gene_id|exon|exon_number)[= ]+[^;]+;?//g; |
373 |
$l=~s/^[\s\;]+//; |
374 |
$l=~s/; /;/g; |
375 |
$l=~s/[\s\;]+$//; |
376 |
if (!$isGff3) { |
377 |
$l=~s/\s+\"([^"]+)\"/=$1/g; |
378 |
} |
379 |
return $l; |
380 |
} |