ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gtf2gff
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 13177 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 my $usage = q/Usage:
5 gtf2gff [-f <gff3_to_filter>] [-C] [-F] [-s <sel_gff_feature>] [-t <tracklabel>] <input_gtf>
6
7 Convert the provided <input_gtf> file into a gff3 file, optionally
8 setting the second column to <tracklabel> (if -t option was used).
9
10 If -f option is given, gtf2gff will work in "filtering mode" --
11 the file <gff3_to_filter> will be filtered such that only transcripts
12 found in the <input_gtf> file will be included.
13 Other options:
14 -s specify what subfeatures to keep (default: both '*exon' and 'CDS'
15 subfeatures are kept (e.g. use '-s CDS' if you want to keep only
16 'CDS' features)
17 -C input is assumed to be in NCBI annotation format so some cleaning\/renaming will be
18 performed while most other attributes will be preserved (including the
19 "gene" entries)
20 -F fix the CDS phase field (for erroneous gtf\/gff confusing phase with frame)
21 /;
22 umask 0002;
23 getopts('FCNf:t:s:') || die($usage."\n");
24 my %qtrim;
25 @qtrim{'gbkey','db_xref', 'protein_id'}=(); #remove quotes for these attributes
26 my ($gfflt, $ctrack)=($Getopt::Std::opt_f, $Getopt::Std::opt_t);
27 my $ncbigtf=$Getopt::Std::opt_C;
28 my $flipphase=$Getopt::Std::opt_F;
29 my $selfeature=uc($Getopt::Std::opt_s);
30 if ($gfflt) {
31 open(INGFF, $gfflt) || die("Error opening $gfflt!\n");
32 }
33 my %mrnas; # mrna => [ $ctg, $strand, $geneid, $track, [@exons], [@cds], gffattr]
34 my @mrnalst; # list of transcript IDs in the order they were found
35 my $namedOnly=$Getopt::Std::opt_N;
36 my $curID; #current "contig|geneID|transcript_id" combo
37 #my @curData; # [ $ctg, $strand, $geneid, $track, [@exons], [@cds], gffattr ]
38 my $lastGeneID; # last 'gene' line gene_id
39 my $prevGeneID;
40 my ($lastgstart, $lastgend);
41 my $curChr; # current chromosome ID, flush output when this changes
42 #my $myGeneID;
43 my $myTrID;
44 my %gids; # $geneid=> counter to check the uniqueness of gene IDs
45 my %gregs; # $geneid=> [start, end]
46 my %trids; # to check/correct the uniqueness of transcript IDs
47 my %loci; #gene lines to print on a contig
48 my %trseen; # transcript id seen at least once in this locus
49 my $isGtf;
50 $isGtf=1 if $ncbigtf;
51 while (<>) {
52 next if m/^\s*#/;
53 chomp;
54 my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $attrs)=split(/\t/);
55 $track=$ctrack if $ctrack;
56 $f = 'exon' if $f=~m/\-?exon$/;
57 if ($f eq 'mRNA') { #gff record start
58 my ($transcript)=($attrs=~m/ID=([^;]+)/);
59 if ($transcript) {
60 $mrnas{$transcript}=[$chr, $strand, '', $track, [], [], $attrs];
61 }
62 }
63 next unless ($attrs && ($f eq 'exon' || $f eq 'CDS' || ($ncbigtf && $f eq 'gene')));
64 ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
65 my ($gid)=($attrs=~m/gene_id\s+"([^"]+)/);
66 my ($trid)=($attrs=~m/transcript_id\s+"([^"]+)/);
67 if ($ncbigtf) {
68 $trid=~s/^\Q$chr://;
69 $chr=~s/\.\d+$//;
70 $gid=~s/\.\d+\:/:/;
71 if ($curChr ne $chr) { #base contig change, the input MUST be grouped at least by contig
72 writeModels() if $curChr;
73 $curChr=$chr;
74 }
75 if ($trid) { #'exon' or 'CDS' feature
76 die("Unexpected transcript id $trid for non-exon/CDS feature!\n$_\n")
77 unless $f eq 'CDS' || $f eq 'exon';
78 $trid=$chr.':'.$trid;
79 my $mrnadata=$mrnas{$trid};
80 my $gidnum=$gids{$gid};
81 my $trid0=$trid;
82 if ($gidnum>1) { #could be from a different transcript in fact, with the same geneID
83 #must be from the last one announced here
84 $gid.='.gn'.$gidnum;
85 }
86 $trid=getXId($trid, $gid);
87 $mrnadata=$mrnas{$trid} if ($trid ne $trid0);
88 if ($curID ne $trid) { #transcript ID change
89 $curID=$trid0;
90 $myTrID=$trid;
91 if (!$mrnadata) { # new transcript ID for this contig
92 #my $tridn=++$trids{$trid};
93 #$myTrID.=".tm$tridn" if $tridn>1;
94 my $myattrs="; ".$attrs;
95 #parse all attributes with their values
96 my %a;
97 foreach my $av (($myattrs=~m/\; (\w+ "[^"]*")/g)) {
98 my ($atr, $val)=split(' ',$av,2);
99 #$val=~s/;?\s*Derived by automated [\w :\.]+//;
100 $val=~tr/;/./s;
101 $val=~s/[\,;\.:]+"$/"/;
102 next if $atr eq 'note' && length($val)<4;
103 $val=~tr/"//d #"
104 if exists($qtrim{$atr});
105 $a{$atr}=$val;
106 }
107 delete @a{'gene_id','transcript_id','exon_number'};
108 my %notes;
109 my $note=delete $a{'note'};
110 if ($note) {
111 my @n=split(/;\s+/);
112 @notes{@n}=();
113 }
114 $myattrs=join(";", map { "$_=".$a{$_} } keys(%a));
115 $myattrs=~s/=""/=1/g;
116 $myattrs="Parent=$gid;".$myattrs;
117 # 0 1 2 3 4=exons 5=CDS 6 7
118 $mrnadata=[$chr, $strand, $gid, $track, [], [], $myattrs, \%notes];
119 $mrnas{$myTrID}=$mrnadata;
120 push(@mrnalst, $myTrID);
121 }
122 else { #has $mrnadata, seen before
123 $gid=$mrnadata->[2];
124 }
125 }
126 # -- now add this segment to the data
127 if ($f eq 'CDS') {
128 if ($flipphase && $frame>0) { $frame= ($frame==1) ? 2 : 1; }
129 push(@{$mrnadata->[5]}, [$fstart, $fend, $fscore, $frame]);
130 #
131 }
132 else { push(@{$mrnadata->[4]}, [$fstart, $fend, $fscore, $frame]); } #exon
133 } # 'exon' or 'CDS'
134 else { # 'gene' feature
135 die ("Unexpected non-gene entry lacking transcript_id!\n$_\n") unless $f eq 'gene';
136 $prevGeneID=$lastGeneID;
137 $lastGeneID=$gid;
138 my $newgene=1;
139 my $gidn=++$gids{$gid};
140 if ($gidn>1) { # && !geneovlExtend($gid, $fstart, $fend)) {
141 if (!geneOvl($gid, $fstart, $fend)) { #non-overlapping duplicate
142 $lastGeneID.='.gn'.$gidn; #separate geneID if it doesn't overlap previous
143 }
144 else { #overlapping duplicate, skip it
145 $gidn=--$gids{$gid};
146 $newgene=0;
147 if ($gidn>1) {
148 $lastGeneID.='.gn'.$gidn;
149 }
150 $loci{$lastGeneID}[3]=$gregs{$gid}[$gidn-1]->[0];
151 $loci{$lastGeneID}[4]=$gregs{$gid}[$gidn-1]->[1];
152 }
153 }
154 else { #first time this gene id is seen
155 $gregs{$gid}=[[$fstart,$fend]];
156 }
157 $lastgstart=$fstart;
158 $lastgend=$fend;
159 if ($newgene) {
160 my $gname=$gid;
161 $gname=~s/\Q$chr://;
162 my $myattrs="ID=$lastGeneID;Name=$gname";
163 $attrs=~s/gene_id\s+"[^;]+//;
164 $attrs=~s/\;\s*(\w+)\s+"/;$1="/g;
165 $attrs=~s/;?\s*Derived by automated [\w :\.]+//;
166 $attrs=~s/note="";//g;
167 $attrs=~s/db_xref="([^"]+)"/db_xref=$1/g;
168 $attrs=~s/=""/=1/g;
169 $attrs=~s/\.";/";/;
170 { local $/=';';chomp($attrs); }
171 $myattrs.=$attrs;
172 #print the 'gene' line:
173 $loci{$lastGeneID}=[$chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $myattrs];
174 #print join("\t", $chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $myattrs)."\n";
175 }
176 } #gene line
177 next;
178 } #-- NCBI gtf processing only
179 # ------------- non NCBI gtf from here on ----
180 my ($transcript, $geneid);
181 if ($attrs=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) {
182 $transcript=$1;
183 $transcript=~tr/"//d; #"
184 $isGtf=1;
185 }
186 elsif ($attrs=~m/Parent=(['"\:\w\|\-\.]+)/) {
187 $transcript=$1;
188 $transcript=~tr/"//d; #"
189 $isGtf=0;
190 }
191 if ($attrs=~m/gene_?id[= ]+(['"\:\w\.\|\-]+)/i) {
192 $geneid=$1;
193 $geneid=~tr/"//d; #"
194 $isGtf=1;
195 $transcript=$geneid unless $transcript;
196 }
197
198 if (!$transcript) {
199 die("Error: cannot parse transcript name from input line:\n$_\n");
200 }
201 #die("Errror: only one type of feature is expected in the GTF input!\n")
202 # if ($isGtf && $curfeat && $curfeat ne $f);
203 #$curfeat=$f unless $curfeat;
204 if ($gfflt) {
205 $mrnas{$transcript}=1;
206 }
207 else { #store all data as needed;
208 next if ($selfeature && $f ne $selfeature); #for GFF files, only consider selected feature (i.e. exons, not CDS)
209 my $ld = $mrnas{$transcript};
210 if ($ld) { #existing entry
211 my $lidx=($f eq 'CDS')? 5 : 4;
212 push(@{$$ld[$lidx]}, [$fstart, $fend, $fscore, $frame]);
213 if ($geneid) {
214 $mrnas{$transcript}[2]=$geneid;
215 }
216 }
217 else { # first time seeing this locus/gene
218 #push(@loc, $transcript);
219 # 0 1 2 3 4=exon lst 5 = CDS list
220 #
221 my $tr=($ctrack || $track);
222 $mrnas{$transcript} = ($f eq 'CDS') ? [$chr, $strand, $geneid, $tr, [], [[$fstart, $fend, $fscore, $frame]]] :
223 [$chr, $strand, $geneid, $tr, [[$fstart, $fend, $fscore, $frame]],[]];
224 push(@mrnalst,$transcript);
225 }
226 }
227 } # while input lines
228
229 #write_mRNA($myTrID, \@curData) if $ncbigtf && $myTrID;
230
231 if ($gfflt) {
232 while (<INGFF>) {
233 next if m/^##/;
234 my @t=split(/\t/);
235 my ($id)=($t[8]=~m/ID=(['"\:\w\|\-\.]+)/);
236 my ($parent)=($t[8]=~m/Parent=(['"\:\w\|\-\.]+)/);
237 #if ($parent)
238 #print STDERR "id='$id';parent='$parent'\n";
239 $t[1]=$ctrack if $ctrack;
240 if ($t[2] eq 'exon' || $t[2] eq 'CDS') {
241 $t[8]=~s/ID=(['"\:\w\|\-\.]+);?//;
242 $t[8]=~s/Name=(['"\:\w\|\-\.]+);?//;
243 }
244 $t[8]=~s/;?Target=.+$//;
245 $t[8]=~s/;Parent=['"\:\w\|\-\.]+// if ($t[2] eq 'mRNA');
246 print join("\t",@t) if exists($mrnas{$id}) || exists($mrnas{$parent});
247 }
248 }
249 else {
250 writeModels();
251 }
252
253 #================================
254 sub geneOvl {
255 my ($gid, $nstart, $nend)=@_;
256 my ($pstart, $pend)=@{$gregs{$gid}[-1]}; #last non-ovl region for this gene id
257 #if ($gid eq 'GPC_000000049:COL18A1') {
258 # print STDERR ">>>>>> (new) [$nstart-$nend] vs [$pstart-$pend] (prev)>>>>>>>>>>>>\n";
259 # }
260 if ($pstart>$nend || $nstart>$pend) {
261 #non-overlapping regions
262 push(@{$gregs{$gid}},[$nstart, $nend]);
263 return 0;
264 }
265 # else {
266 #overlapping, merge into the last interval
267 $gregs{$gid}[-1]->[0]=$nstart if $nstart<$pstart;
268 $gregs{$gid}[-1]->[1]=$nend if $nend>$pend;
269 return 1;
270 }
271
272
273 sub getXId {
274 my ($tid, $gid)=@_;
275 if ((++$trseen{$tid.'~'.$gid})==1) { #very first time this $tid is seen in this locus
276 $trids{$tid}++;#
277 }
278 my $tn=$trids{$tid};
279 $tid.='.tn'.$tn if ($tn>1);
280 return $tid;
281 }
282
283 sub writeModels {
284 foreach my $m (@mrnalst) {
285 my $md=$mrnas{$m} || die ("Error: transcript $m found in list but not in hash!\n");
286 write_mRNA($m, $md);
287 } #for each stored transcript
288 %mrnas=();
289 @mrnalst=();
290 # any other extra non-mRNA loci will be written here:
291 foreach my $gid (keys(%loci)) {
292 my $gdata=$loci{$gid};
293 $gdata->[1]='PSEUDO';
294 print join("\t",@$gdata)."\n";
295 delete $loci{$gid};
296 }
297 }
298
299
300
301
302 sub write_mRNA {
303 my ($l, $ld)=@_;
304 my ($ctg, $strand, $geneid, $track, $er, $cr, $gffattr) = @$ld;
305
306 next if $namedOnly && $l eq $geneid;
307
308 my (@ex, $mstart, $mend);
309 my (@cx, $cstart, $cend);
310 if (@$er>0) {
311 @ex= sort { $main::a->[0] <=> $main::b->[0] } @$er;
312 ($mstart, $mend) = ($ex[0]->[0], $ex[-1]->[1]);
313 }
314 if (@$cr>0) {
315 @cx= sort { $main::a->[0] <=> $main::b->[0] } @$cr;
316 ($cstart, $cend) = ($cx[0]->[0], $cx[-1]->[1]);
317 }
318 my ($tstart, $tend)= ( ($mstart<$cstart) ? ($mstart||$cstart) : ($cstart||$mstart) , ($mend>$cend)?$mend : $cend );
319 my $gname=$geneid || $l;
320 my $ginfo="ID=$l;Name=$gname";
321 if ($gffattr) {
322 $gffattr=~s/ID=[^;]+;?//;
323 $gffattr=~s/Parent=[^;]+;?// unless $ncbigtf;
324 $gffattr=~s/Name=[^;]+;?//;
325 $ginfo.=';'.$gffattr if length($gffattr)>1;
326 }
327
328 my $gdata=$loci{$geneid};
329 if ($gdata) {
330 print join("\t",@$gdata)."\n";
331 delete $loci{$geneid};
332 }
333 print join("\t", $ctg, $track, 'mRNA', $tstart, $tend, '.', $strand,
334 '.', $ginfo)."\n";
335
336 unless ($selfeature && $selfeature eq 'CDS') {
337 foreach my $x (@ex) {
338 print join("\t", $ctg, $track, 'exon', $x->[0], $x->[1], $x->[2], $strand,
339 $x->[3], "Parent=$l")."\n";
340 }
341 }
342
343 unless ($selfeature && $selfeature ne 'CDS') {
344 foreach my $x (@cx) {
345 print join("\t", $ctg, $track, 'CDS', $x->[0], $x->[1], $x->[2], $strand,
346 $x->[3], "Parent=$l")."\n";
347 }
348 }
349 }

Properties

Name Value
svn:executable *