1 |
gpertea |
23 |
#!/usr/bin/perl |
2 |
|
|
use strict; |
3 |
|
|
use Getopt::Std; |
4 |
|
|
|
5 |
|
|
my $usage = q/Usage: |
6 |
|
|
gff2itt [-o <output_name>] [-STIQGBKNCPL] [-t <track_label>] [-c <CDS.gtf>] <input_gff> |
7 |
|
|
|
8 |
|
|
Options: |
9 |
|
|
-X generic GFF storage using new iit_store chr: format |
10 |
|
|
-T do not include the track tag in the output |
11 |
|
|
-I same as -T, but append the track label to the info line (i:...) |
12 |
|
|
-K add the track as a t: line for each record |
13 |
|
|
-N add the gene name, if found, as a g: line |
14 |
|
|
-C format the record ID as t|<transcript>|g|<gene> for cdbfasta indexing |
15 |
|
|
-Q do not include query information data in the output |
16 |
|
|
-G the input is GFF annotation instead of scored mappings (implies -Q, |
17 |
|
|
-C, -N and uses the new iit_store with -F option) |
18 |
|
|
-B the genomic sequence tag will not include the strand character |
19 |
|
|
(so intervals on both strands will be returned by a iit_get query |
20 |
|
|
and a 's:' line will be added instead) |
21 |
|
|
-A write any other GFF\/GTF attributes as a comma delimited list of |
22 |
|
|
<attrname>=<value> pairs in the i: line |
23 |
|
|
-P skip pseudo* features (e.g. pseudogenes, pseudoRNA etc) |
24 |
|
|
-L skip 'locus' features (e.g. those created by annbygff) |
25 |
|
|
-c the provided <CDS.gtf> file contains valid CDS for some of entries in |
26 |
|
|
<input_gff>, so a C: line will be added for those |
27 |
|
|
|
28 |
|
|
If query coordonate info is found in the gff file, a line with query |
29 |
|
|
alignment coordinates for each exon will be added: |
30 |
|
|
|
31 |
|
|
q:[<qrylen>]:<qexon1start>-<qexon1end>,<qexon2start>-<qexon2end>,... |
32 |
|
|
(<qrylen> may not be available) |
33 |
|
|
x:<coord1-coord2>,... |
34 |
|
|
|
35 |
|
|
The "x:..." line is added if the query coordinates show an internal gap (possible |
36 |
|
|
exon alignments missing). |
37 |
|
|
/; |
38 |
|
|
umask 0002; |
39 |
|
|
getopts('ATIQBGKNCXPLt:g:o:c:') || die($usage."\n"); |
40 |
|
|
my $cdbformat=$Getopt::Std::opt_C; |
41 |
|
|
my $gffAttrs=$Getopt::Std::opt_A; |
42 |
|
|
my $trackline=$Getopt::Std::opt_K; |
43 |
|
|
my $geneline=$Getopt::Std::opt_N; |
44 |
|
|
my $outfile=$Getopt::Std::opt_o; |
45 |
|
|
my $plainGff=$Getopt::Std::opt_G; |
46 |
|
|
my $newDivFmt=$Getopt::Std::opt_X;; |
47 |
|
|
$plainGff=1 if $newDivFmt; |
48 |
|
|
my $track=$Getopt::Std::opt_t; |
49 |
|
|
my $skiploci=$Getopt::Std::opt_L; |
50 |
|
|
my $nopseudo=$Getopt::Std::opt_P; |
51 |
|
|
my $noTrack=$Getopt::Std::opt_T || $Getopt::Std::opt_I || $trackline; |
52 |
|
|
my $track2info=$Getopt::Std::opt_I; |
53 |
|
|
my $noStrand=$Getopt::Std::opt_B; |
54 |
|
|
my $noQdata = $plainGff || $Getopt::Std::opt_Q; |
55 |
|
|
|
56 |
|
|
if ($plainGff) { |
57 |
|
|
#$cdbformat=1; # use -C to enable this |
58 |
|
|
$trackline=1 unless $noTrack; |
59 |
|
|
$noTrack=1; |
60 |
|
|
$geneline=1; |
61 |
|
|
$noStrand=1; |
62 |
|
|
#$skiploci=1; |
63 |
|
|
} |
64 |
|
|
|
65 |
|
|
my $cdsfile=$Getopt::Std::opt_c; |
66 |
|
|
#my $gene_type=$Getopt::Std::opt_g || 'mRNA'; |
67 |
|
|
|
68 |
|
|
my $infile = shift(@ARGV) || die("$usage\nNo input file/stream provided!\n"); |
69 |
|
|
|
70 |
|
|
my $inh; |
71 |
|
|
my %vcds; # transcript_id => [ cds1, cds2, cds3 ... ] |
72 |
|
|
if ($cdsfile) { |
73 |
|
|
open(CDSF, $cdsfile) || die ("Error opening $cdsfile!\n"); |
74 |
|
|
while (<CDSF>) { |
75 |
|
|
next if m/^\s*#/; |
76 |
|
|
chomp; |
77 |
|
|
my ($chr, $v, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/); |
78 |
|
|
next unless $f eq 'CDS'; |
79 |
|
|
my ($tid)=($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/); |
80 |
|
|
die ("Error parsing transcript_id for CDS!\n$_\n") unless $tid; |
81 |
|
|
$tid=~tr/"//d; #" |
82 |
|
|
($fstart, $fend)=($fend, $fstart) if $fend<$fstart; |
83 |
|
|
push(@{$vcds{$tid}}, [$fstart, $fend, $fscore]); |
84 |
|
|
} |
85 |
|
|
close(CDSF); |
86 |
|
|
} |
87 |
|
|
|
88 |
|
|
|
89 |
|
|
if ($infile eq '-') { |
90 |
|
|
$inh=\*STDIN; |
91 |
|
|
$outfile='gff2iit_stdin' unless $outfile; |
92 |
|
|
} |
93 |
|
|
else { |
94 |
|
|
open(INFILE, $infile) || die("Error opening input file $infile!\n"); |
95 |
|
|
$inh=\*INFILE; |
96 |
|
|
unless ($outfile) { |
97 |
|
|
$outfile=$infile; |
98 |
|
|
$outfile=~s/\.\w+$//; #remove extension if any |
99 |
|
|
} |
100 |
|
|
} |
101 |
|
|
|
102 |
|
|
|
103 |
|
|
# 0 1 2 3 4 5 6 7 8 9 10 11 12 13 |
104 |
|
|
my %recs; # recID => [ tag, track, [@exons], [@cds], lstart, lend, $strand, $tdescr, $qlen, $mcov, $mpid, $gname, $ftype, $attrline ] |
105 |
|
|
# |
106 |
|
|
my @reclist; # just a plain list of record IDs, in the input order |
107 |
|
|
my $curtag; #chromosome and strand for current model |
108 |
|
|
|
109 |
|
|
my @exd; #exons for current model |
110 |
|
|
my @cds; #cds segments if -C flag was used |
111 |
|
|
my ($qlen, $hasQregs, $qxgap); |
112 |
|
|
open(IFA, ">$outfile.ifa") |
113 |
|
|
|| die("Error creating file $outfile.ifa ($!)!\n"); |
114 |
|
|
while (<$inh>) { |
115 |
|
|
next if m/^\s*#/; |
116 |
|
|
chomp; |
117 |
|
|
my ($chr, $v, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/); |
118 |
|
|
my ($mcov, $mpid, $gname, $tdescr); |
119 |
|
|
$mcov='89.89' unless $plainGff; |
120 |
|
|
my $curtrack=$track || $v; |
121 |
|
|
next unless $fstart>1 && $lnum; |
122 |
|
|
next if $f eq 'gene'; # Warning: skipping any 'gene' features, unconditionally |
123 |
|
|
next if $skiploci && $f eq 'locus'; |
124 |
|
|
next if $nopseudo && $f=~m/^pseudo/i; |
125 |
|
|
my $gff3_ID; |
126 |
|
|
my $gff3_Parent; |
127 |
|
|
($fstart, $fend)=($fend, $fstart) if $fend<$fstart; |
128 |
|
|
($gff3_ID)=($lnum=~m/\bID=([^;]+)/); |
129 |
|
|
($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/); |
130 |
|
|
my $isGff3=($gff3_ID || $gff3_Parent); |
131 |
|
|
#($tdescr, $gname, $mcov, $mpid)=('','','',0); |
132 |
|
|
if ($isGff3) { # GFF format |
133 |
|
|
$gff3_ID=~tr/"//d; #" |
134 |
|
|
$gff3_Parent=~tr/"//d; #" |
135 |
|
|
$gff3_Parent='' if ($f eq 'mRNA'); |
136 |
|
|
if ($gff3_ID && !$gff3_Parent) { #top level feature starting |
137 |
|
|
if ($f=~m/RNA/i) { |
138 |
|
|
# try to parse the description, if any |
139 |
|
|
if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) { |
140 |
|
|
$tdescr=$1; |
141 |
|
|
} |
142 |
|
|
elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) { |
143 |
|
|
$tdescr=$1; |
144 |
|
|
} |
145 |
|
|
if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) { |
146 |
|
|
$gname=$1; |
147 |
|
|
} |
148 |
|
|
elsif ($lnum=~m/gene\s*=\s*"?([^;"]+)/) { |
149 |
|
|
$gname=$1; |
150 |
|
|
} |
151 |
|
|
$tdescr='' if ($tdescr eq $gname); |
152 |
|
|
if ($lnum=~m/\bcov=([\.\d]+)/i) { |
153 |
|
|
$mcov=$1; |
154 |
|
|
} |
155 |
|
|
if ($lnum=~m/\b(?:pid|Identity|Similarity)=([\.\d]+)/i) { |
156 |
|
|
$mpid=$1; |
157 |
|
|
} |
158 |
|
|
unless ($mpid) { |
159 |
|
|
$mpid = ($fscore>40 && $fscore<=100) ? $fscore : '89.89'; |
160 |
|
|
} |
161 |
|
|
$qlen=''; |
162 |
|
|
$hasQregs=0; |
163 |
|
|
$qxgap=''; |
164 |
|
|
if ($lnum=~m/qxgap=([\-\, \d]+)/) { |
165 |
|
|
$qxgap=$1; |
166 |
|
|
} |
167 |
|
|
if ($lnum=~m/\bqreg=\d+\-\d+\|(\d+)/i) { |
168 |
|
|
$qlen=$1; |
169 |
|
|
} |
170 |
|
|
elsif ($lnum=~m/\bqlen=(\d+)/i) { |
171 |
|
|
$qlen=$1; |
172 |
|
|
} |
173 |
|
|
} |
174 |
|
|
if ($f eq 'locus' && $lnum=~m/transcripts=([^;]+)/) { |
175 |
|
|
$tdescr.=' ' if $tdescr; |
176 |
|
|
$tdescr.='['.$1.']'; |
177 |
|
|
} |
178 |
|
|
my $recID=$chr."\x80".$gff3_ID; |
179 |
|
|
die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs{$recID})); |
180 |
|
|
$curtag=$chr; |
181 |
|
|
$curtag.=$strand unless $noStrand; |
182 |
|
|
$curtrack=$track || $v; |
183 |
|
|
push(@reclist, $recID); |
184 |
|
|
my $attrline=parseAttrs($lnum, 1) if $gffAttrs; |
185 |
|
|
$recs{$recID} = [$curtag, $curtrack, [], [] , $fstart, $fend, $strand, $tdescr, $qlen, $mcov, $mpid, $gname, $f, $attrline]; |
186 |
|
|
#($tdescr, $gname, $mcov, $mpid)=('','','',0); |
187 |
|
|
next; |
188 |
|
|
} # parent/top-level feature |
189 |
|
|
} #GFF3 format |
190 |
|
|
else { #GTF format |
191 |
|
|
next if ($f eq 'transcript'); #exception: GTF with parent 'transcript' feature |
192 |
|
|
} |
193 |
|
|
#my ($gname)=($descr=~m/Name=(\w+)/); |
194 |
|
|
#--------------------------- |
195 |
|
|
# -------------- exon/CDS line here: |
196 |
|
|
my $recID; |
197 |
|
|
my ($qstart, $qend); |
198 |
|
|
unless ($noQdata) { |
199 |
|
|
if ($lnum=~m/\bQ[regmatch]+=(\d+)[\- ]+(\d+)/i) { |
200 |
|
|
($qstart, $qend)=($1,$2); |
201 |
|
|
($qstart,$qend)=($qend,$qstart) if $qend<$qstart; |
202 |
|
|
$hasQregs++; |
203 |
|
|
} |
204 |
|
|
} |
205 |
|
|
if ($v=~/^jigsaw/ && $lnum=~m/^\d+$/) { |
206 |
|
|
$recID=$chr.'.jsm.'.$lnum; |
207 |
|
|
$curtrack='jigsaw' unless $track; |
208 |
|
|
} |
209 |
|
|
elsif ($lnum=~m/Parent=(['"\:\w\|\-\.]+)/) { |
210 |
|
|
$recID=$1; |
211 |
|
|
$recID=~tr/"//d; #" |
212 |
|
|
} |
213 |
|
|
elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) { |
214 |
|
|
$recID=$1; |
215 |
|
|
$recID=~tr/"//d; #" |
216 |
|
|
} |
217 |
|
|
else { |
218 |
|
|
die("Error: cannot parse locus/transcript name from input line:\n$_\n"); |
219 |
|
|
} |
220 |
|
|
if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) { |
221 |
|
|
#$recID=$1; |
222 |
|
|
#$recID=~tr/"//d; #" |
223 |
|
|
$gname=$1; |
224 |
|
|
$gname=~tr/"//d; #" |
225 |
|
|
} |
226 |
|
|
$tdescr='' if index($recID, $tdescr)>=0; |
227 |
|
|
$curtag=$chr; |
228 |
|
|
$curtag.=$strand unless $noStrand; |
229 |
|
|
$recID=$chr."\x80".$recID; |
230 |
|
|
my $ld = $recs{$recID}; |
231 |
|
|
$fscore=$mpid if ($fscore eq '.'); |
232 |
|
|
if ($ld) { #existing entry |
233 |
|
|
my $i=($f eq 'CDS') ? 3 : 2; |
234 |
|
|
my ($lstart, $lend)=($$ld[4], $$ld[5]); |
235 |
|
|
$$ld[4]=$fstart if $fstart<$lstart; |
236 |
|
|
$$ld[5]=$fend if $fend>$lend; |
237 |
|
|
push(@{$$ld[$i]}, [$fstart, $fend, $fscore, $qstart, $qend]); |
238 |
|
|
} |
239 |
|
|
else { # first time seeing this locus/gene |
240 |
|
|
push(@reclist, $recID); |
241 |
|
|
my $attrline=parseAttrs($lnum, $isGff3) if $gffAttrs; |
242 |
|
|
$recs{$recID} = ($f eq 'CDS') ? |
243 |
|
|
[$curtag, $curtrack, [], [[$fstart, $fend, $fscore, $qstart, $qend]] , $fstart, $fend, $strand, $tdescr, |
244 |
|
|
$qlen, $mcov, $mpid, $gname, $f, $attrline ] : |
245 |
|
|
[$curtag, $curtrack, [[$fstart, $fend, $fscore, $qstart, $qend]], [ ], $fstart, $fend, $strand, $tdescr, |
246 |
|
|
$qlen, $mcov, $mpid, $gname, $f, $attrline ] ; |
247 |
|
|
} |
248 |
|
|
} #while <> |
249 |
|
|
|
250 |
|
|
writeModels(); |
251 |
|
|
|
252 |
|
|
close(INFILE) unless $infile eq '-'; |
253 |
|
|
close(IFA); |
254 |
|
|
system("iit_store -o $outfile.iit < $outfile.ifa"); |
255 |
|
|
|
256 |
|
|
|
257 |
|
|
#============================================================= |
258 |
|
|
|
259 |
|
|
sub writeModels { |
260 |
|
|
foreach my $l (@reclist) { |
261 |
|
|
my $ld=$recs{$l} || die ("Error: locus $l found in list but not in hash!\n"); |
262 |
|
|
my ($a,$b)=split(/\x80/,$l); |
263 |
|
|
$l=$b if $b; # was formatted as chr[0x80]ID, so we peel off chr prefix |
264 |
|
|
# 0 1 2 3 4 5 6 7 8 9 10 11 |
265 |
|
|
my ($tag, $trck, $er, $cr, $lstart, $lend, $strnd, $descr, $ql, $qcov, $qpid, $gn, |
266 |
|
|
$ftype, $attrline) = @$ld; |
267 |
|
|
my ($mstart,$mend)=($lstart, $lend); |
268 |
|
|
my (@ex, @cds); |
269 |
|
|
if (@$er>0) { |
270 |
|
|
@ex= sort { $main::a->[0] <=> $main::b->[0] } @$er; |
271 |
|
|
($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]); |
272 |
|
|
} |
273 |
|
|
|
274 |
|
|
if (@$cr==0 && $cdsfile) { |
275 |
|
|
$cr=$vcds{$l}; |
276 |
|
|
} |
277 |
|
|
if ($cr && @$cr>0) { |
278 |
|
|
@cds= sort { $main::a->[0] <=> $main::b->[0] } @$cr; |
279 |
|
|
($mstart, $mend) = ($cds[0]->[0], $cds[-1]->[1]) unless $mstart; |
280 |
|
|
} |
281 |
|
|
my $intv_tag="$mstart $mend $tag"; |
282 |
|
|
if ($newDivFmt) { |
283 |
|
|
my $chr=$tag; |
284 |
|
|
$chr=~s/[\-\+]$//; |
285 |
|
|
$intv_tag="$chr:$mstart..$mend $strnd"; |
286 |
|
|
} |
287 |
|
|
if ($cdbformat) { |
288 |
|
|
print IFA ">t|$l"; |
289 |
|
|
print IFA "|g|$gn" if $gn; |
290 |
|
|
print IFA " $intv_tag"; |
291 |
|
|
} |
292 |
|
|
else { |
293 |
|
|
print IFA ">$l $intv_tag"; |
294 |
|
|
} |
295 |
|
|
print IFA " $trck" unless $noTrack; |
296 |
|
|
print IFA "\n"; |
297 |
|
|
my @qregs; |
298 |
|
|
if (@ex) { |
299 |
|
|
my @wlst; |
300 |
|
|
if ($plainGff) { |
301 |
|
|
@wlst = map { $_->[0].'-'.$_->[1] } @ex; |
302 |
|
|
} |
303 |
|
|
else { |
304 |
|
|
@wlst = map { $_->[0].'-'.$_->[1].':'.$_->[2] } @ex; |
305 |
|
|
} |
306 |
|
|
print IFA join(',',@wlst)."\n"; |
307 |
|
|
if ($hasQregs) { |
308 |
|
|
@qregs = map { [$_->[3], $_->[4], $_->[0], $_->[1] ] } @ex; |
309 |
|
|
} |
310 |
|
|
} |
311 |
|
|
if ($track2info) { |
312 |
|
|
$descr=$descr ? $descr.' '.$trck : $trck; |
313 |
|
|
$descr='' if $descr eq ' '; |
314 |
|
|
} |
315 |
|
|
if (@cds) { |
316 |
|
|
my @wlst; |
317 |
|
|
if ($plainGff) { |
318 |
|
|
@wlst = map { $_->[0].'-'.$_->[1] } @cds; |
319 |
|
|
} |
320 |
|
|
else { |
321 |
|
|
@wlst = map { $_->[0].'-'.$_->[1].':'.$_->[2] } @cds; |
322 |
|
|
} |
323 |
|
|
print IFA 'C:'.join(',',@wlst)."\n"; |
324 |
|
|
if ($hasQregs && (@qregs==0)) { |
325 |
|
|
@qregs = map { [$_->[3], $_->[4], $_->[0], $_->[1] ] } @cds; |
326 |
|
|
} |
327 |
|
|
} |
328 |
|
|
if ($noStrand && !$newDivFmt) { |
329 |
|
|
print IFA 's:'.$strnd."\n"; |
330 |
|
|
} |
331 |
|
|
if ($ftype && $geneline) { |
332 |
|
|
print IFA 'f:'.$ftype."\n" unless $ftype eq 'exon' || $ftype eq 'CDS'; |
333 |
|
|
} |
334 |
|
|
if ($trackline) { |
335 |
|
|
print IFA 't:'.$trck."\n"; |
336 |
|
|
} |
337 |
|
|
if ($gffAttrs && $attrline) { |
338 |
|
|
print IFA 'i:'.$attrline."\n"; |
339 |
|
|
} |
340 |
|
|
else { |
341 |
|
|
if ($gn && $geneline) { |
342 |
|
|
print IFA 'g:'.$gn."\n"; |
343 |
|
|
} |
344 |
|
|
if ($descr) { |
345 |
|
|
print IFA 'i:'.$descr."\n"; |
346 |
|
|
} |
347 |
|
|
} |
348 |
|
|
if ($qcov) { |
349 |
|
|
print IFA 'v:'.$qcov."\n"; |
350 |
|
|
} |
351 |
|
|
next if ($noQdata); |
352 |
|
|
if ($hasQregs) { |
353 |
|
|
print IFA 'q:'.$ql.':'.join(',', ( map { $_->[0].'-'.$_->[1] } @qregs ))."\n"; |
354 |
|
|
my @xgaps; |
355 |
|
|
for (my $i=1;$i<@qregs;$i++) { |
356 |
|
|
#check for gaps between query exons |
357 |
|
|
my ($qstart, $qend)= ($qregs[$i-1]->[0]>$qregs[$i]->[0]) ? |
358 |
|
|
($qregs[$i]->[1], $qregs[$i-1]->[0]) : ($qregs[$i-1]->[1], $qregs[$i]->[0]); |
359 |
|
|
push(@xgaps,$qregs[$i-1]->[3].'-'.$qregs[$i]->[2]) if ($qend-$qstart>4); |
360 |
|
|
} |
361 |
|
|
print IFA 'x:'.join(',',@xgaps)."\n" if (@xgaps>0); |
362 |
|
|
} |
363 |
|
|
elsif ($qxgap) { |
364 |
|
|
print IFA 'x:'.$qxgap."\n"; |
365 |
|
|
} |
366 |
|
|
} #for each stored locus |
367 |
|
|
} |
368 |
|
|
|
369 |
|
|
sub parseAttrs { |
370 |
|
|
my ($l, $isGff3)=@_; |
371 |
|
|
my $attrline; |
372 |
|
|
$l=~s/(Parent|ID|transcript_id|gene_id|exon|exon_number)[= ]+[^;]+;?//g; |
373 |
|
|
$l=~s/^[\s\;]+//; |
374 |
|
|
$l=~s/; /;/g; |
375 |
|
|
$l=~s/[\s\;]+$//; |
376 |
|
|
if (!$isGff3) { |
377 |
|
|
$l=~s/\s+\"([^"]+)\"/=$1/g; |
378 |
|
|
} |
379 |
|
|
return $l; |
380 |
|
|
} |