ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gff2fasta
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 8184 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4
5 my $usage=qq/
6 $0 [-f <feature>] [-U | -u <xb>] [-o <outfa>] <genomic_seq.fa> <gff_file>
7
8 The sequence of each mRNA entities found in <gff_file> is extracted
9 in a multifasta output file (written to STDOUT unless -o option is
10 provided)
11
12 Only the first genomic sequence in <genomic_seq.fa> is considered,
13 and all coordinates in the GFF input file will be applied to it.
14
15 Options:
16
17 -f : the segments corresponding to features of type <feature>
18 will be extracted instead of the default "exon" features
19 (e.g. -f CDS )
20 -U : include the "introns" too, so for each mRNA the whole
21 genomic region will be extracted, with unspliced introns
22 -u : like -U but also add <xb> extra bases to both mRNA ends
23 (upstream and downstream)
24 -o : write the multi-fasta output to file <outfa>
25
26 For jigsaw gff files, the -f option is ignored (as all exons
27 will be considered). /;
28 umask 0002;
29 getopts('Uu:f:o:') || die($usage."\n");
30 my ($unspliced, $feature, $outfile, $uxtra)=($Getopt::Std::opt_U, $Getopt::Std::opt_f,
31 $Getopt::Std::opt_o,$Getopt::Std::opt_u);
32 #my ($fgff, $fa, $feature, $opt)=@ARGV;
33 #$feature = 'exon' unless $feature;
34 $unspliced=1 if $uxtra>0;
35 my ($fa, $fgff)=@ARGV;
36 die ("$usage\n") unless -f $fgff && -f $fa;
37
38 my %tex; # transcript_id => [start, strand, name, geneid, descr, \@exons, \@cds]
39 # 0 1 2 3 4 5 8
40 my %gid; # geneId=>[transcript_id1, transcript_id2, ..]
41 my $isgtf=1;
42 open(FASTA, $fa) || die ("Error opening fasta file $fa!\n");
43 open(FGFF, $fgff) || die ("Error opening $fgff!\n");
44 while (<FGFF>) {
45 #next if m/^\s*#/;
46 chomp;
47 my ($chr, $v, $f, $fstart, $fend, $fscore, $fstrand, $frame, $lnum)=split(/\t/);
48 next unless $lnum;
49 #next unless ($lnum && $f eq $feature || ($v=~/^jigsaw/ && $f=~/exon/));
50 ($fstart, $fend)=($fend,$fstart) if $fstart>$fend;
51
52 if (uc($f) eq 'MRNA' && $lnum=~m/ID=([^;]+)/) { #parent feature in GFF
53 my $tid=$1;
54 $isgtf=0;
55 my ($name)=($lnum=~m/Name\s*=\s*"?([^;"]+)/);
56 my ($descr, $geneid);
57 $descr=$1 if ($lnum=~m/(?:Info|Descr\w*|Product\w*)\s*=\s*"?([^;"]+)/i);
58 $geneid=$1 if ($lnum=~m/Gene\w*\s*=\s*"?([^;"]+)/);
59 #print STDERR ">>>> found transcript $tid with geneid=$geneid and descr='$descr'\n";
60 $tex{$tid}=[$fstart, $fstrand, $name, $geneid, $descr, [], []]
61 unless exists($tex{$tid});
62 push(@{$gid{$tid}}, $geneid) if $geneid;
63 next;
64 }
65
66 # -- non-parent features (or GTF lines)
67 #my ($gname)=($descr=~m/Name=(\w+)/);
68 my ($tid, $geneid);
69 if ($v=~/jigsaw/i && index($lnum, $chr)==0) {
70 ($tid)=($lnum=~m/^([^;]+)/);
71 $tid=~s/\.(\d+)$/.jsm.$1/;
72 }
73 elsif ($lnum=~m/Parent=([^;]+)/) {
74 $tid=$1;
75 #$tid=~tr/"'//d; #"
76 }
77 elsif ($lnum=~m/transcript_id[= ]+(['"\:\w\.\|\-]+)/) {
78 $tid=$1;
79 $tid=~tr/"'//d; #"
80 }
81 #elsif ($lnum=~m/^\w+$/) { } # UCSC gff ?
82 # - gtf
83 if ($lnum=~m/gene_id[= ]+(['"\:\w\.\|\-]+)/) {
84 $geneid=$1;
85 $geneid=~tr/"'//d; #"
86 }
87
88 unless ($tid) {
89 die("Error: cannot parse transcript ID from input line:\n$_\n");
90 }
91 # my ($tname)=($tid=~/(\w+)/);
92
93 # [start, strand, name, geneid, descr, \@exons, \@cds]
94 # 0 1 2 3 4 5 6
95 if (lc($f) eq 'cds') { #CDS features are ALWAYS stored
96 if (exists($tex{$tid})) { #
97 push(@{$tex{$tid}[6]}, [$fstart, $fend, $frame]);
98 my $s=$tex{$tid}[0];
99 $tex{$tid}[0]=$fstart if $fstart<$s;
100 }
101 else {
102 $tex{$tid}=[$fstart, $fstrand, $geneid, $geneid, '', [], [$fstart, $fend, $frame]];
103 }
104 }
105 elsif (!$feature || lc($feature) eq lc($f)) { #exon or exclusively requested
106 if (exists($tex{$tid})) { #
107 push(@{$tex{$tid}[5]}, [$fstart, $fend, $frame]);
108 }
109 else {
110 $tex{$tid}=[$fstart, $fstrand, $geneid, $geneid, '', [$fstart, $fend], []];
111 my $s=$tex{$tid}[0];
112 $tex{$tid}[0]=$fstart if $fstart<$s;
113 }
114 }
115 } # while <FGFF>
116
117 close(FGFF);
118 # my $gfftype = $isgtf ? 'GTF' : 'GFF';
119 # print STDERR "$gfftype data loaded, now loading sequence..\n";
120 my ($gseq, $gseqlen); #base genomic sequence
121 while (<FASTA>) {
122 next if /^>/;
123 tr/ \n\t//d;
124 $gseq.=$_;
125 $gseqlen+=length($_);
126 }
127 close(FASTA);
128 # print STDERR "writing transcript sequences..\n";
129 if ($outfile) {
130 open(OUTF, '>'.$outfile) || die "Error creating file $outfile\n";
131 select OUTF;
132 }
133 #if ($unspliced) {
134 # open(OUTFU, '>'.$outfname.'.transcripts.unspliced.fa') || die "Error creating file $outfname\n";
135 # }
136 my %gc; # geneId => num_tid
137
138 foreach my $tid (sort { $tex{$main::a}[0] <=> $tex{$main::b}[0] } (keys(%tex))) {
139 my $td=$tex{$tid};
140 # [start, strand, name, geneid, descr, \@exons, \@cds]
141 # 0 1 2 3 4 5 6
142
143 my @ex = sort { $main::a->[0] <=> $main::b->[0] } @{$$td[5]};
144 my @cds = sort { $main::a->[0] <=> $main::b->[0] } @{$$td[6]};
145 my ($strand, $name, $geneid, $descr)=($$td[1], $$td[2], $$td[3], $$td[4]);
146 my ($cstart, $cend);
147 my $reverse=($strand eq '-');
148 if (@cds>0) {
149 if ($reverse) {
150 $cds[-1]->[1]-=int($cds[-1]->[2]); # adjust for initial frame
151 }
152 else {
153 $cds[0]->[0]+=int($cds[0]->[2]); # adjust for initial frame
154 }
155 ($cstart, $cend)=($cds[0]->[0], $cds[-1]->[1]);
156 }
157
158 my @seg; #segment coordinates to extract
159 my $cdsonly=0;
160 if (lc($feature) eq 'cds' || @ex==0) {
161 @seg=@cds;
162 $cdsonly=1;
163 @ex=(); #discard exons if just CDS requested
164 }
165 if (@ex>0) { @seg=@ex; }
166 next unless @seg>0; #nothing to do!
167 my $startpos=$seg[0]->[0];
168 my $endpos=$seg[-1]->[1];
169 # consistency check:
170 #die("Error: startpos $startpos for $tid ($geneid) does not match stored $$td[0]!\n")
171 # unless $startpos==$$td[0];
172 #check for overlaps just in case
173 for (my $i=0;$i<@seg; $i++) {
174 if ($i>0 && hasOverlap($seg[$i-1], $seg[$i])) {
175 die "ERROR: $tid exons have overlaps!\n";
176 }
177 }
178 my $tdefline=$tid;
179 my $tseq='';
180 my $tseqlen;
181 my ($cds_start, $cds_end);
182 if ($unspliced) { # raw region
183 $tdefline.=" [unspliced region $strand:$startpos-$endpos]";
184 $tdefline.=" gid:$geneid" if $geneid;
185 $tdefline.=" $descr" if $descr;
186 if ($uxtra) {
187 $startpos-=$uxtra;
188 $startpos=1 if $startpos<1;
189 $endpos+=$uxtra;
190 $endpos=$gseqlen if $endpos>$gseqlen;
191 }
192 $tseqlen=$endpos-$startpos+1;
193 $tseq=substr($gseq, $startpos-1, $tseqlen);
194 }
195 else {
196 $tdefline.=" gid:$geneid" if $geneid;
197 # if ($cstart>0) {
198 # ($cstart, $cend)=($strand eq '-') ? ($endpos-$cend+1, $endpos-$cstart+1) :
199 # ($cstart-$startpos+1, $cend-$startpos+1);
200 # }
201 # $tdefline.=" CDS:$cstart-$cend";
202 foreach my $s (@seg) {
203 if ($cstart) {
204 if (!$cds_start && $cstart>=$$s[0] && $cstart<=$$s[1]) {
205 $cds_start=$tseqlen+($cstart-$$s[0])+1;
206 }
207 if (!$cds_end && $cend>=$$s[0] && $cend<=$$s[1]) { #CDS end within this exon
208 $cds_end=$tseqlen+($cend-$$s[0])+1;
209 }
210 } #CDS info available
211 my $seglen=$$s[1]-$$s[0]+1;
212 my $segseq=substr($gseq, $$s[0]-1, $seglen);
213 $tseq.=$segseq;
214 $tseqlen+=$seglen;
215 } #for each seq segment
216 }# -- exon/CDS splicing
217 if ($cstart) {
218 ($cds_start, $cds_end)=($tseqlen-$cds_end+1, $tseqlen-$cds_start+1) if ($reverse);
219 $tdefline.=" CDS:$cds_start-$cds_end";
220 }
221 $tdefline.=" $descr" if $descr;
222 $tseq=reverseComplement($tseq) if $reverse;
223
224 print &fastafmt(\$tseq, $tdefline);
225 }
226
227 select(STDOUT);
228 close(OUTF) if $outfile;
229
230 #-------------------------
231
232 sub reverseComplement {
233 my $s=reverse($_[0]);
234 $s =~ tr/AaCcTtGgUuMmRrWwSsYyKkVvHhDdBb/TtGgAaCcAaKkYyWwSsRrMmBbDdHhVv/;
235 return $s;
236 }
237
238 sub fastafmt {
239 my $seqr=shift(@_);
240 my $defline=shift(@_);
241 my $linelen=shift(@_) || 70;;
242 my $rec;
243 $rec=">$defline\n" if $defline;
244 my $seqlen=length($$seqr);
245 my $pos=0;
246 while ($pos<$seqlen) {
247 $rec.= substr($$seqr,$pos,$linelen)."\n";
248 $pos+=$linelen;
249 }
250 return $rec;
251 }
252
253 sub hasOverlap {
254 my ($a, $b)=@_;
255 return !($$a[0]>$$b[1] || $$b[0] > $$a[1]);
256 }

Properties

Name Value
svn:executable *