ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gff2iit
Revision: 23
Committed: Tue Jul 26 21:44:38 2011 UTC (8 years, 2 months ago) by gpertea
Original Path: ann_bin/gff2iit
File size: 12573 byte(s)
Log Message:
adding misc scripts

Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4
5 my $usage = q/Usage:
6 gff2itt [-o <output_name>] [-STIQGBKNCPL] [-t <track_label>] [-c <CDS.gtf>] <input_gff>
7
8 Options:
9 -X generic GFF storage using new iit_store chr: format
10 -T do not include the track tag in the output
11 -I same as -T, but append the track label to the info line (i:...)
12 -K add the track as a t: line for each record
13 -N add the gene name, if found, as a g: line
14 -C format the record ID as t|<transcript>|g|<gene> for cdbfasta indexing
15 -Q do not include query information data in the output
16 -G the input is GFF annotation instead of scored mappings (implies -Q,
17 -C, -N and uses the new iit_store with -F option)
18 -B the genomic sequence tag will not include the strand character
19 (so intervals on both strands will be returned by a iit_get query
20 and a 's:' line will be added instead)
21 -A write any other GFF\/GTF attributes as a comma delimited list of
22 <attrname>=<value> pairs in the i: line
23 -P skip pseudo* features (e.g. pseudogenes, pseudoRNA etc)
24 -L skip 'locus' features (e.g. those created by annbygff)
25 -c the provided <CDS.gtf> file contains valid CDS for some of entries in
26 <input_gff>, so a C: line will be added for those
27
28 If query coordonate info is found in the gff file, a line with query
29 alignment coordinates for each exon will be added:
30
31 q:[<qrylen>]:<qexon1start>-<qexon1end>,<qexon2start>-<qexon2end>,...
32 (<qrylen> may not be available)
33 x:<coord1-coord2>,...
34
35 The "x:..." line is added if the query coordinates show an internal gap (possible
36 exon alignments missing).
37 /;
38 umask 0002;
39 getopts('ATIQBGKNCXPLt:g:o:c:') || die($usage."\n");
40 my $cdbformat=$Getopt::Std::opt_C;
41 my $gffAttrs=$Getopt::Std::opt_A;
42 my $trackline=$Getopt::Std::opt_K;
43 my $geneline=$Getopt::Std::opt_N;
44 my $outfile=$Getopt::Std::opt_o;
45 my $plainGff=$Getopt::Std::opt_G;
46 my $newDivFmt=$Getopt::Std::opt_X;;
47 $plainGff=1 if $newDivFmt;
48 my $track=$Getopt::Std::opt_t;
49 my $skiploci=$Getopt::Std::opt_L;
50 my $nopseudo=$Getopt::Std::opt_P;
51 my $noTrack=$Getopt::Std::opt_T || $Getopt::Std::opt_I || $trackline;
52 my $track2info=$Getopt::Std::opt_I;
53 my $noStrand=$Getopt::Std::opt_B;
54 my $noQdata = $plainGff || $Getopt::Std::opt_Q;
55
56 if ($plainGff) {
57 #$cdbformat=1; # use -C to enable this
58 $trackline=1 unless $noTrack;
59 $noTrack=1;
60 $geneline=1;
61 $noStrand=1;
62 #$skiploci=1;
63 }
64
65 my $cdsfile=$Getopt::Std::opt_c;
66 #my $gene_type=$Getopt::Std::opt_g || 'mRNA';
67
68 my $infile = shift(@ARGV) || die("$usage\nNo input file/stream provided!\n");
69
70 my $inh;
71 my %vcds; # transcript_id => [ cds1, cds2, cds3 ... ]
72 if ($cdsfile) {
73 open(CDSF, $cdsfile) || die ("Error opening $cdsfile!\n");
74 while (<CDSF>) {
75 next if m/^\s*#/;
76 chomp;
77 my ($chr, $v, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
78 next unless $f eq 'CDS';
79 my ($tid)=($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/);
80 die ("Error parsing transcript_id for CDS!\n$_\n") unless $tid;
81 $tid=~tr/"//d; #"
82 ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
83 push(@{$vcds{$tid}}, [$fstart, $fend, $fscore]);
84 }
85 close(CDSF);
86 }
87
88
89 if ($infile eq '-') {
90 $inh=\*STDIN;
91 $outfile='gff2iit_stdin' unless $outfile;
92 }
93 else {
94 open(INFILE, $infile) || die("Error opening input file $infile!\n");
95 $inh=\*INFILE;
96 unless ($outfile) {
97 $outfile=$infile;
98 $outfile=~s/\.\w+$//; #remove extension if any
99 }
100 }
101
102
103 # 0 1 2 3 4 5 6 7 8 9 10 11 12 13
104 my %recs; # recID => [ tag, track, [@exons], [@cds], lstart, lend, $strand, $tdescr, $qlen, $mcov, $mpid, $gname, $ftype, $attrline ]
105 #
106 my @reclist; # just a plain list of record IDs, in the input order
107 my $curtag; #chromosome and strand for current model
108
109 my @exd; #exons for current model
110 my @cds; #cds segments if -C flag was used
111 my ($qlen, $hasQregs, $qxgap);
112 open(IFA, ">$outfile.ifa")
113 || die("Error creating file $outfile.ifa ($!)!\n");
114 while (<$inh>) {
115 next if m/^\s*#/;
116 chomp;
117 my ($chr, $v, $f, $fstart, $fend, $fscore, $strand, $frame, $lnum)=split(/\t/);
118 my ($mcov, $mpid, $gname, $tdescr);
119 $mcov='89.89' unless $plainGff;
120 my $curtrack=$track || $v;
121 next unless $fstart>1 && $lnum;
122 next if $f eq 'gene'; # Warning: skipping any 'gene' features, unconditionally
123 next if $skiploci && $f eq 'locus';
124 next if $nopseudo && $f=~m/^pseudo/i;
125 my $gff3_ID;
126 my $gff3_Parent;
127 ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
128 ($gff3_ID)=($lnum=~m/\bID=([^;]+)/);
129 ($gff3_Parent)=($lnum=~m/\bParent=([^;]+)/);
130 my $isGff3=($gff3_ID || $gff3_Parent);
131 #($tdescr, $gname, $mcov, $mpid)=('','','',0);
132 if ($isGff3) { # GFF format
133 $gff3_ID=~tr/"//d; #"
134 $gff3_Parent=~tr/"//d; #"
135 $gff3_Parent='' if ($f eq 'mRNA');
136 if ($gff3_ID && !$gff3_Parent) { #top level feature starting
137 if ($f=~m/RNA/i) {
138 # try to parse the description, if any
139 if ($lnum=~m/\b(?:descr|tophit|info|product)\s*=\s*"?([^;"]+)/i) {
140 $tdescr=$1;
141 }
142 elsif ($lnum=~m/Name\s*=\s*"?([^;"]+)/) {
143 $tdescr=$1;
144 }
145 if ($lnum=~m/\bgene_name[\s=]+"?([^;"]+)/i) {
146 $gname=$1;
147 }
148 elsif ($lnum=~m/gene\s*=\s*"?([^;"]+)/) {
149 $gname=$1;
150 }
151 $tdescr='' if ($tdescr eq $gname);
152 if ($lnum=~m/\bcov=([\.\d]+)/i) {
153 $mcov=$1;
154 }
155 if ($lnum=~m/\b(?:pid|Identity|Similarity)=([\.\d]+)/i) {
156 $mpid=$1;
157 }
158 unless ($mpid) {
159 $mpid = ($fscore>40 && $fscore<=100) ? $fscore : '89.89';
160 }
161 $qlen='';
162 $hasQregs=0;
163 $qxgap='';
164 if ($lnum=~m/qxgap=([\-\, \d]+)/) {
165 $qxgap=$1;
166 }
167 if ($lnum=~m/\bqreg=\d+\-\d+\|(\d+)/i) {
168 $qlen=$1;
169 }
170 elsif ($lnum=~m/\bqlen=(\d+)/i) {
171 $qlen=$1;
172 }
173 }
174 if ($f eq 'locus' && $lnum=~m/transcripts=([^;]+)/) {
175 $tdescr.=' ' if $tdescr;
176 $tdescr.='['.$1.']';
177 }
178 my $recID=$chr."\x80".$gff3_ID;
179 die("Error: duplicate feature $gff3_ID on $chr\n") if (exists($recs{$recID}));
180 $curtag=$chr;
181 $curtag.=$strand unless $noStrand;
182 $curtrack=$track || $v;
183 push(@reclist, $recID);
184 my $attrline=parseAttrs($lnum, 1) if $gffAttrs;
185 $recs{$recID} = [$curtag, $curtrack, [], [] , $fstart, $fend, $strand, $tdescr, $qlen, $mcov, $mpid, $gname, $f, $attrline];
186 #($tdescr, $gname, $mcov, $mpid)=('','','',0);
187 next;
188 } # parent/top-level feature
189 } #GFF3 format
190 else { #GTF format
191 next if ($f eq 'transcript'); #exception: GTF with parent 'transcript' feature
192 }
193 #my ($gname)=($descr=~m/Name=(\w+)/);
194 #---------------------------
195 # -------------- exon/CDS line here:
196 my $recID;
197 my ($qstart, $qend);
198 unless ($noQdata) {
199 if ($lnum=~m/\bQ[regmatch]+=(\d+)[\- ]+(\d+)/i) {
200 ($qstart, $qend)=($1,$2);
201 ($qstart,$qend)=($qend,$qstart) if $qend<$qstart;
202 $hasQregs++;
203 }
204 }
205 if ($v=~/^jigsaw/ && $lnum=~m/^\d+$/) {
206 $recID=$chr.'.jsm.'.$lnum;
207 $curtrack='jigsaw' unless $track;
208 }
209 elsif ($lnum=~m/Parent=(['"\:\w\|\-\.]+)/) {
210 $recID=$1;
211 $recID=~tr/"//d; #"
212 }
213 elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) {
214 $recID=$1;
215 $recID=~tr/"//d; #"
216 }
217 else {
218 die("Error: cannot parse locus/transcript name from input line:\n$_\n");
219 }
220 if (!$gname && $lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) {
221 #$recID=$1;
222 #$recID=~tr/"//d; #"
223 $gname=$1;
224 $gname=~tr/"//d; #"
225 }
226 $tdescr='' if index($recID, $tdescr)>=0;
227 $curtag=$chr;
228 $curtag.=$strand unless $noStrand;
229 $recID=$chr."\x80".$recID;
230 my $ld = $recs{$recID};
231 $fscore=$mpid if ($fscore eq '.');
232 if ($ld) { #existing entry
233 my $i=($f eq 'CDS') ? 3 : 2;
234 my ($lstart, $lend)=($$ld[4], $$ld[5]);
235 $$ld[4]=$fstart if $fstart<$lstart;
236 $$ld[5]=$fend if $fend>$lend;
237 push(@{$$ld[$i]}, [$fstart, $fend, $fscore, $qstart, $qend]);
238 }
239 else { # first time seeing this locus/gene
240 push(@reclist, $recID);
241 my $attrline=parseAttrs($lnum, $isGff3) if $gffAttrs;
242 $recs{$recID} = ($f eq 'CDS') ?
243 [$curtag, $curtrack, [], [[$fstart, $fend, $fscore, $qstart, $qend]] , $fstart, $fend, $strand, $tdescr,
244 $qlen, $mcov, $mpid, $gname, $f, $attrline ] :
245 [$curtag, $curtrack, [[$fstart, $fend, $fscore, $qstart, $qend]], [ ], $fstart, $fend, $strand, $tdescr,
246 $qlen, $mcov, $mpid, $gname, $f, $attrline ] ;
247 }
248 } #while <>
249
250 writeModels();
251
252 close(INFILE) unless $infile eq '-';
253 close(IFA);
254 system("iit_store -o $outfile.iit < $outfile.ifa");
255
256
257 #=============================================================
258
259 sub writeModels {
260 foreach my $l (@reclist) {
261 my $ld=$recs{$l} || die ("Error: locus $l found in list but not in hash!\n");
262 my ($a,$b)=split(/\x80/,$l);
263 $l=$b if $b; # was formatted as chr[0x80]ID, so we peel off chr prefix
264 # 0 1 2 3 4 5 6 7 8 9 10 11
265 my ($tag, $trck, $er, $cr, $lstart, $lend, $strnd, $descr, $ql, $qcov, $qpid, $gn,
266 $ftype, $attrline) = @$ld;
267 my ($mstart,$mend)=($lstart, $lend);
268 my (@ex, @cds);
269 if (@$er>0) {
270 @ex= sort { $main::a->[0] <=> $main::b->[0] } @$er;
271 ($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]);
272 }
273
274 if (@$cr==0 && $cdsfile) {
275 $cr=$vcds{$l};
276 }
277 if ($cr && @$cr>0) {
278 @cds= sort { $main::a->[0] <=> $main::b->[0] } @$cr;
279 ($mstart, $mend) = ($cds[0]->[0], $cds[-1]->[1]) unless $mstart;
280 }
281 my $intv_tag="$mstart $mend $tag";
282 if ($newDivFmt) {
283 my $chr=$tag;
284 $chr=~s/[\-\+]$//;
285 $intv_tag="$chr:$mstart..$mend $strnd";
286 }
287 if ($cdbformat) {
288 print IFA ">t|$l";
289 print IFA "|g|$gn" if $gn;
290 print IFA " $intv_tag";
291 }
292 else {
293 print IFA ">$l $intv_tag";
294 }
295 print IFA " $trck" unless $noTrack;
296 print IFA "\n";
297 my @qregs;
298 if (@ex) {
299 my @wlst;
300 if ($plainGff) {
301 @wlst = map { $_->[0].'-'.$_->[1] } @ex;
302 }
303 else {
304 @wlst = map { $_->[0].'-'.$_->[1].':'.$_->[2] } @ex;
305 }
306 print IFA join(',',@wlst)."\n";
307 if ($hasQregs) {
308 @qregs = map { [$_->[3], $_->[4], $_->[0], $_->[1] ] } @ex;
309 }
310 }
311 if ($track2info) {
312 $descr=$descr ? $descr.' '.$trck : $trck;
313 $descr='' if $descr eq ' ';
314 }
315 if (@cds) {
316 my @wlst;
317 if ($plainGff) {
318 @wlst = map { $_->[0].'-'.$_->[1] } @cds;
319 }
320 else {
321 @wlst = map { $_->[0].'-'.$_->[1].':'.$_->[2] } @cds;
322 }
323 print IFA 'C:'.join(',',@wlst)."\n";
324 if ($hasQregs && (@qregs==0)) {
325 @qregs = map { [$_->[3], $_->[4], $_->[0], $_->[1] ] } @cds;
326 }
327 }
328 if ($noStrand && !$newDivFmt) {
329 print IFA 's:'.$strnd."\n";
330 }
331 if ($ftype && $geneline) {
332 print IFA 'f:'.$ftype."\n" unless $ftype eq 'exon' || $ftype eq 'CDS';
333 }
334 if ($trackline) {
335 print IFA 't:'.$trck."\n";
336 }
337 if ($gffAttrs && $attrline) {
338 print IFA 'i:'.$attrline."\n";
339 }
340 else {
341 if ($gn && $geneline) {
342 print IFA 'g:'.$gn."\n";
343 }
344 if ($descr) {
345 print IFA 'i:'.$descr."\n";
346 }
347 }
348 if ($qcov) {
349 print IFA 'v:'.$qcov."\n";
350 }
351 next if ($noQdata);
352 if ($hasQregs) {
353 print IFA 'q:'.$ql.':'.join(',', ( map { $_->[0].'-'.$_->[1] } @qregs ))."\n";
354 my @xgaps;
355 for (my $i=1;$i<@qregs;$i++) {
356 #check for gaps between query exons
357 my ($qstart, $qend)= ($qregs[$i-1]->[0]>$qregs[$i]->[0]) ?
358 ($qregs[$i]->[1], $qregs[$i-1]->[0]) : ($qregs[$i-1]->[1], $qregs[$i]->[0]);
359 push(@xgaps,$qregs[$i-1]->[3].'-'.$qregs[$i]->[2]) if ($qend-$qstart>4);
360 }
361 print IFA 'x:'.join(',',@xgaps)."\n" if (@xgaps>0);
362 }
363 elsif ($qxgap) {
364 print IFA 'x:'.$qxgap."\n";
365 }
366 } #for each stored locus
367 }
368
369 sub parseAttrs {
370 my ($l, $isGff3)=@_;
371 my $attrline;
372 $l=~s/(Parent|ID|transcript_id|gene_id|exon|exon_number)[= ]+[^;]+;?//g;
373 $l=~s/^[\s\;]+//;
374 $l=~s/; /;/g;
375 $l=~s/[\s\;]+$//;
376 if (!$isGff3) {
377 $l=~s/\s+\"([^"]+)\"/=$1/g;
378 }
379 return $l;
380 }

Properties

Name Value
svn:executable *