ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/seqmanip
Revision: 160
Committed: Fri Feb 3 15:31:57 2012 UTC (7 years, 7 months ago) by gpertea
File size: 26499 byte(s)
Log Message:
added fq_extract script

Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use Fcntl qw(SEEK_SET SEEK_END SEEK_CUR); # for seek
5
6 my $usage=q/
7 A simple fasta sequence(s) manipulation tool.
8 Usage:
9 seqmanip [-C] [-r<ranges>] [-L] [-D|-X] [-I] [-G] [-T|-t <frame>|-Z]
10 [-M|-m <ncount>] [-f <ranges_file>] [fasta file(s)...]
11 The input files should be one or more nucleotide or aminoacid sequences
12 in FASTA or raw format; if no files are given, input is expected at STDIN
13
14 Options:
15 -C reverse complement (is -r is given, first the subsequence is extracted
16 and only then reverse-complemented)
17 -l the line length for the output fasta record (default 70)
18 -r extract a range (subsequence) from the first input fasta sequence;
19 <range> can have one of the formats:
20 <start>..<end> or <start>-<end> or <start>:<len>
21 Multiple such intervals can be given after the -r option (comma or space
22 delimited) and they will be spliced together
23 This option doesn't work with -M or -n options.
24 -f same as -r but for multi-fasta\/seq input; expects a file containing
25 lines of this format:
26 <seqname> <range>
27 The format of <range> is the same with the -r above or it may be just
28 space delimited <start> <end> numbers; the output will be a multi-fasta
29 file with one subsequence for each line in the <ranges_file>
30 -M merge (concatenate) all input sequences into a single sequence
31 -m merge all input sequences separating them by <ncount> Ns
32 -d provides a custom defline for -M and -N options
33 -D add basic defline if the input doesn't provide one
34 -X exclude defline from the output (just print the sequence)
35 -L provide one-line output for each sequence
36 -T first frame translation (DNA to aminoacid)
37 -t <frame>th frame translation; <frame> could be 1,2,3,-1,-2,-3
38 -E show exon\/splice-sites\/start\/stop info (requires -r)
39 -G show GFF output for the given range(s) (requires -r)
40 -Z raw reverse translate the output (aminoacid to DNA, rather useless)
41 -V only show a sequence if it's a valid, complete CDS
42 -q only show a sequence if it's longer than <minseqlen>
43 -n discard sequences with a percentage of Ns (or Xs) higher than <maxpercN>
44 -y remove spans of Ns longer than <maxnspan> replacing them by <maxnspan> Ns
45 -Y trim Ns from either end of sequence
46 -U cleanup\/remove extraneous characters(e.g. digits, dashes, stars)
47 from every input sequence
48 -R convert RNA to DNA (simply replacing U wih T)
49 -P convert all sequence letters to uppercase
50 -A use ANSI colors to highlight features in terminal (for -E option)
51 -O return longest ORF sequence for each input sequence (start..stop)
52 -Q same as -O but doesn't require a start codon
53 /;
54
55 my %codons=(
56 'AAA'=>'K', 'AAC'=>'N', 'AAG'=>'K', 'AAR'=>'K', 'AAT'=>'N',
57 'AAY'=>'N', 'ACA'=>'T', 'ACB'=>'T', 'ACC'=>'T', 'ACD'=>'T',
58 'ACG'=>'T', 'ACH'=>'T', 'ACK'=>'T', 'ACM'=>'T', 'ACN'=>'T',
59 'ACR'=>'T', 'ACS'=>'T', 'ACT'=>'T', 'ACV'=>'T', 'ACW'=>'T',
60 'ACY'=>'T', 'AGA'=>'R', 'AGC'=>'S', 'AGG'=>'R', 'AGR'=>'R',
61 'AGT'=>'S', 'AGY'=>'S', 'ATA'=>'I', 'ATC'=>'I', 'ATG'=>'M',
62 'ATH'=>'I', 'ATM'=>'I', 'ATT'=>'I', 'ATW'=>'I', 'ATY'=>'I',
63 'CAA'=>'Q', 'CAC'=>'H', 'CAG'=>'Q', 'CAR'=>'Q', 'CAT'=>'H',
64 'CAY'=>'H', 'CCA'=>'P', 'CCB'=>'P', 'CCC'=>'P', 'CCD'=>'P',
65 'CCG'=>'P', 'CCH'=>'P', 'CCK'=>'P', 'CCM'=>'P', 'CCN'=>'P',
66 'CCR'=>'P', 'CCS'=>'P', 'CCT'=>'P', 'CCV'=>'P', 'CCW'=>'P',
67 'CCY'=>'P', 'CGA'=>'R', 'CGB'=>'R', 'CGC'=>'R', 'CGD'=>'R',
68 'CGG'=>'R', 'CGH'=>'R', 'CGK'=>'R', 'CGM'=>'R', 'CGN'=>'R',
69 'CGR'=>'R', 'CGS'=>'R', 'CGT'=>'R', 'CGV'=>'R', 'CGW'=>'R',
70 'CGY'=>'R', 'CTA'=>'L', 'CTB'=>'L', 'CTC'=>'L', 'CTD'=>'L',
71 'CTG'=>'L', 'CTH'=>'L', 'CTK'=>'L', 'CTM'=>'L', 'CTN'=>'L',
72 'CTR'=>'L', 'CTS'=>'L', 'CTT'=>'L', 'CTV'=>'L', 'CTW'=>'L',
73 'CTY'=>'L', 'GAA'=>'E', 'GAC'=>'D', 'GAG'=>'E', 'GAR'=>'E',
74 'GAT'=>'D', 'GAY'=>'D', 'GCA'=>'A', 'GCB'=>'A', 'GCC'=>'A',
75 'GCD'=>'A', 'GCG'=>'A', 'GCH'=>'A', 'GCK'=>'A', 'GCM'=>'A',
76 'GCN'=>'A', 'GCR'=>'A', 'GCS'=>'A', 'GCT'=>'A', 'GCV'=>'A',
77 'GCW'=>'A', 'GCY'=>'A', 'GGA'=>'G', 'GGB'=>'G', 'GGC'=>'G',
78 'GGD'=>'G', 'GGG'=>'G', 'GGH'=>'G', 'GGK'=>'G', 'GGM'=>'G',
79 'GGN'=>'G', 'GGR'=>'G', 'GGS'=>'G', 'GGT'=>'G', 'GGV'=>'G',
80 'GGW'=>'G', 'GGY'=>'G', 'GTA'=>'V', 'GTB'=>'V', 'GTC'=>'V',
81 'GTD'=>'V', 'GTG'=>'V', 'GTH'=>'V', 'GTK'=>'V', 'GTM'=>'V',
82 'GTN'=>'V', 'GTR'=>'V', 'GTS'=>'V', 'GTT'=>'V', 'GTV'=>'V',
83 'GTW'=>'V', 'GTY'=>'V', 'MGA'=>'R', 'MGG'=>'R', 'MGR'=>'R',
84 'NNN'=>'X', 'RAY'=>'B', 'SAR'=>'Z', 'TAA'=>'.', 'TAC'=>'Y',
85 'TAG'=>'.', 'TAR'=>'.', 'TAT'=>'Y', 'TAY'=>'Y', 'TCA'=>'S',
86 'TCB'=>'S', 'TCC'=>'S', 'TCD'=>'S', 'TCG'=>'S', 'TCH'=>'S',
87 'TCK'=>'S', 'TCM'=>'S', 'TCN'=>'S', 'TCR'=>'S', 'TCS'=>'S',
88 'TCT'=>'S', 'TCV'=>'S', 'TCW'=>'S', 'TCY'=>'S', 'TGA'=>'.',
89 'TGC'=>'C', 'TGG'=>'W', 'TGT'=>'C', 'TGY'=>'C', 'TRA'=>'.',
90 'TTA'=>'L', 'TTC'=>'F', 'TTG'=>'L', 'TTR'=>'L', 'TTT'=>'F',
91 'TTY'=>'F', 'XXX'=>'X', 'YTA'=>'L', 'YTG'=>'L', 'YTR'=>'L'
92 );
93
94
95 getopts('AZTUPMEGNRXDVCLYOQr:d:m:q:l:t:f:y:n:') || die "$usage\n";
96 #my %ranges; # seqname => [offset, len] only populated by -r or -f option;
97 my @ranges; # list of [chr, strand, defline, [[offset1,len1], [offset1,len2], ..]]
98 # 0 1 2 3
99 #my @range_chrs; # just to keep track of the order of ranges requested
100 my $fai_loaded=0;
101 my %fai_h; # seqID => [fpos, linelen, delimlen, seqlen]
102
103 my $m_linerest; #for the merge case (-M)
104 my $linelen=$Getopt::Std::opt_l || 70;
105 my $user_defline=$Getopt::Std::opt_d || 'dummyname';
106 my $ansicolors=$Getopt::Std::opt_A;
107 #some ansi colors:
108 my ($clred, $clgreen, $clreset)=("\e[1;33;41m", # bright yellow on red
109 "\e[0;32;40m", # green on black
110 "\e[0m") #reset colors
111 if $ansicolors;
112 my %toDNA=reverse(%codons);
113 @toDNA{'P','F','K','G'}=('CCC','TTT','AAA','GGG');
114 my ($getOrf, $getOrfAny)=($Getopt::Std::opt_O, $Getopt::Std::opt_Q);
115 $getOrf=1 if $getOrfAny;
116 my $complement=$Getopt::Std::opt_C;
117 my $validateCDS=$Getopt::Std::opt_V;
118 my $exoninfo=$Getopt::Std::opt_E;
119 my $gffout=$Getopt::Std::opt_G;
120 my $rna2dna=$Getopt::Std::opt_R;
121 my $mergedist=$Getopt::Std::opt_m;
122 my $minseqlen=$Getopt::Std::opt_q;
123 my $maxnspan=$Getopt::Std::opt_y;
124 my $trimNs=$Getopt::Std::opt_Y;
125 my $maxnrepl='';
126 if ($maxnspan) {
127 $maxnrepl='N' x $maxnspan;
128 $maxnspan++;
129 }
130 my $maxpercn=$Getopt::Std::opt_n;
131 my $mergesep;
132 if ($mergedist>0) {
133 $Getopt::Std::opt_M=1;
134 $mergesep='N' x $mergedist;
135 }
136 my $rangetarget; #if a specific seq target was provided
137 # if this is not set, then the first fasta sequence given as input
138 # will be used for range queries
139 # parse range information if available
140 if ($Getopt::Std::opt_r) {
141 my $rangetext=$Getopt::Std::opt_r;
142 my $rstrand='+';
143 #if ($rangetext=~s/([\-\+])$//) { $rstrand=$2;}
144 if ($rangetext=~s/(\d+[\.\-\:_]+\d+)([\-\+])$/$1/) { $rstrand=$2; }
145 my @ar=split(/[\,\s]+/,$rangetext); #allow multiple ranges
146 my $seqtarget;
147 unless ($ar[0]=~m/^\d+\:\d+$/) {
148 if ($ar[0]=~s/^([\w\|\-\+\.]+)\://) {
149 # target chromosome given
150 $seqtarget=$1;
151 if ($seqtarget=~s/([\-\+])$//) { $rstrand=$1;}
152 $rangetarget=1;
153 }
154 }
155 if ($seqtarget) { # 0 1 2 3
156 push(@ranges, [$seqtarget,$rstrand,'', []]);
157 }
158 else {
159 push(@ranges, ['-',$rstrand,'',[]]);
160 }
161 my $rdata=$ranges[0]->[3]; # [ ]
162 foreach my $rt (@ar) {
163 my ($rStart, $rLen)=&parseRange($rt);
164 push(@$rdata, [$rStart, $rLen]);
165 }
166 }
167 elsif ($Getopt::Std::opt_f) { # file with ranges to pull
168 my $frname=$Getopt::Std::opt_f;
169 my $using_stdin;
170 if ($frname eq '-' || $frname eq 'stdin') {
171 #open(FRNG, <&STDIN);
172 open(FRNG, "<&=STDIN") or die "Couldn't alias STDIN : $!";
173 $using_stdin=1;
174 }
175 else {
176 open (FRNG, $frname) || die ("Error: cannot open seq ranges file '$frname'!\n");
177 }
178 while (<FRNG>) {
179 chomp;
180 my $rstrand='+';
181 my ($seqname, $rangetxt)=split(/\s+/,$_,2);
182 my $defline;
183 if ($seqname=~m/^([\w\|\-\+\.]+)\:(\d+[\.\-_]+\d*[\-\+]?)/) {
184 #chr:start-end format
185 # strand may follow chr or end
186 $seqname=$1;
187 $defline=$rangetxt;
188 $rangetxt=$2;
189 if ($seqname=~s/([\-\+])$//) { $rstrand=$1;}
190 elsif ($rangetxt=~s/(\d+[\.\-_]+\d+)([\-\+])$/$1/) { $rstrand=$2; }
191 }
192 else { #1st column is chr(strand), 2nd column is the interval
193 if ($seqname=~s/([\-\+])$//) { $rstrand=$1; }
194 $rangetxt=~s/(\d+)[\t ]+(\d+)/$1-$2/g;
195 my @rt=split(/\s+/,$rangetxt,2);
196 if ($rt[1]) {
197 $defline=$rt[1];
198 $rangetxt=$rt[0];
199 }
200 }
201 next unless $rangetxt;
202 push(@ranges, [$seqname, $rstrand, $defline, []]);
203 my $rdata=$ranges[-1]->[3];
204 #$rangetxt=~s/(\d+)[\t ]+(\d+)/$1-$2/g; #could be space delimited
205 my @ar=split(/\,/,$rangetxt);
206 foreach my $rt (@ar) {
207 my ($rStart, $rLen)=&parseRange($rt);
208 push(@$rdata, [$rStart, $rLen]);
209 }
210 }
211 $rangetarget=1 if @ranges>0;
212 close(FRNG) unless $using_stdin;
213 }
214 my $seq;
215 my $seq_len;
216 my $count=0;
217 my $defline;
218 my $curseq;
219 my $seqcount;
220 print ">$user_defline\n" if ($Getopt::Std::opt_M && !$Getopt::Std::opt_X);
221 die("Error: target sequence(s) given but no input file!\n")
222 if ($rangetarget && (@ARGV==0 || ! -f $ARGV[0]));
223 my $qrange=(@ranges>0 && @ARGV>0 && -f $ARGV[0]);
224 if ($qrange) { #target
225 if (@ranges==1 && $ranges[0]->[0] eq '-') { #single fasta file
226 processRanges($ARGV[0], $ranges[0]);
227 } # single-fasta file
228 else {
229 #actual sequence names provided, need cdbfasta or samtools index
230 my $fasta=$ARGV[0];
231 #die("Error: no fasta file $fasta!\n") unless -f $fasta;
232 my $faidx;
233 if ($fasta=~s/\.fai$//) {
234 $faidx=$fasta.'.fai';
235 }
236 else {
237 $faidx=$fasta.'.fai' if -f $fasta.'.fai';
238 }
239 unless ($faidx) {
240 if ($fasta=~s/\.cidx$//) {
241 $faidx=$fasta.'.cidx';
242 }
243 else {
244 $faidx=$fasta.'.cidx' if -f $fasta.'.cidx';
245 }
246 }
247 #print STDERR "info: using fasta index $faidx for $fasta.\n";
248 die("Error: no index file for fasta file $fasta") unless $faidx;
249 die("Error: no fasta file $fasta!\n") unless -f $fasta;
250 foreach my $rd (@ranges) {
251 processRanges($fasta, $rd, $faidx);
252 }
253 } # cidx file needed
254 } # fast[er] range queries
255 else { # normal (serial) stream processing, very slow range processing for large sequences
256 while (<>) {
257 if (m/^(>.+)/) {
258 process_seq() if $curseq; #sequence is in $seq variable
259 $defline=$1;
260 ($curseq)=($defline=~m/^>(\S+)/);
261 $seq_len=0;
262 next;
263 }
264 chomp;
265 tr/ \t\r\n//d;
266 $seq.=$_;
267 $seq_len+=length();
268 process_seq() unless $curseq; #no defline = raw input: one sequence per line
269 }
270 process_seq() if $curseq; #sequence is in $seq variable
271 }
272
273 print "$m_linerest\n" if ($Getopt::Std::opt_M && $m_linerest);
274
275 #============================
276
277 sub fai_load {
278 open(FAI, $_[0]) || die ("Error opening index file $_[0]!\n");
279 while (<FAI>) {
280 my @t=split(/\t/);# 0=seqID 1=seqlen 2=fpos 3=linelen 4=linelen+dlen
281 next if m/^#/ && @t<5;
282 $fai_h{$t[0]}=[$t[2], $t[3], $t[4]-$t[3], $t[1]];
283 # seqID => [fpos, linelen, delimlen, seqlen]
284 }
285 close(FAI);
286 $fai_loaded=1;
287 }
288
289 sub processRanges {
290 #my ($fasta, $chr, $chr_rdata, $faidx, $udefline)=@_;
291 my ($fasta, $r_data, $faidx)=@_;
292 my $chr=$$r_data[0];
293 $chr='' if $chr eq '-';
294 my $chr_strand=$$r_data[1];
295 my $udefline=$$r_data[2];
296 my $chr_rdata=$$r_data[3];
297 my @intvs = sort { $main::a->[0] <=> $main::b->[0] } @$chr_rdata; #range intervals
298 my ($fpos, $seqlen, $dlen, $flinelen);
299 open(FA, $fasta) || die("Error opening fasta file $fasta\n");
300 $fpos=0;
301 my $iscdb=($faidx=~m/\.cidx$/);
302 if ($chr) {
303 die("Error: target sequence given, but no fasta index\n") unless $faidx;
304 if ($iscdb) {
305 my $r=`cdbyank -a '$chr' -P $faidx`;
306 my $syserr=$?;
307 chomp($r);
308 die("Error at cdbyank -a '$chr' -P $faidx (exit code $syserr)\n") if length($r)==0 || $syserr;
309 $fpos=int($r);
310 seek(FA, $fpos, SEEK_SET);
311 }
312 else { #fasta index
313 #only load the first time
314 fai_load($faidx) unless ($fai_loaded);
315 my $cd=$fai_h{$chr};
316 die("Error retrieving $chr data from fasta index!\n") unless $cd;
317 ($fpos, $flinelen, $dlen, $seqlen)=@$cd;
318 $intvs[-1]->[1]=$seqlen-$intvs[-1]->[0]+1 unless $intvs[-1]->[1];
319 if ($intvs[-1]->[1]<=0) {
320 #invalid range
321 $seq='';
322 $defline='';
323 return;
324 }
325 $defline = $udefline ? '>'.$udefline : ">$chr";
326 seek(FA, $fpos, SEEK_SET);
327 } #samtools fai
328 } #indexed access
329 if ($iscdb || !$chr) {
330 #for cdb or plain fasta we have to determine line length by reading the first 1k bytes
331 my $rbuf;
332 read(FA, $rbuf, 1024);
333 my @lines=split(/[\n\r]+/,$rbuf);
334 my @ldelim=($rbuf=~m/([\n\r]+)/gs);
335 if (@ldelim<2) { #we need at least 2 full lines read
336 my $radd;
337 read(FA, $radd, 4096);
338 $rbuf.=$radd;
339 @lines=split(/[\n\r]+/,$rbuf);
340 @ldelim=($rbuf=~m/([\n\r]+)/gs);
341 }
342 $dlen=length($ldelim[0]);
343 die("Error determining line ending type for $fasta!\n") if ($dlen==0);
344 $defline = $udefline ? ">$udefline" : $lines[0];
345 $flinelen=length($lines[1]); #line length in fasta record
346 $fpos+=length($lines[0])+$dlen;
347 seek(FA, $fpos, SEEK_SET); #reposition at the beginning of the sequence
348 }
349 #now we are positioned at the beginning of the fasta record sequence
350 #seek(FA, $fpos+length($lines[0])+$dlen, SEEK_SET);
351 $seq='';
352 my $revC=1 if $chr_strand eq '-';
353 my $rstart=$intvs[0]->[0]-1; #first range start position, 0-based
354 my $rstartpos= ($rstart<$flinelen) ? $rstart : ($rstart+$dlen*(int($rstart/$flinelen)));
355 my $fstartpos=$fpos+$rstartpos;
356 seek(FA, $fstartpos, SEEK_SET);
357 if ($intvs[-1]->[1]) { #ending for last interval is known
358 my $rend=$intvs[-1]->[0]+$intvs[-1]->[1]-2;
359 #$seq_len=$rend-$rstart+1;
360 my $rendpos=($rend<$flinelen) ? $rend : ($rend+$dlen*(int($rend/$flinelen)));
361 #print STDERR "pulling from $fstartpos to $rendpos\n";
362 my $readlen=$rendpos-$rstartpos+1;
363 #my $fstartpos=$fpos+length($lines[0])+$dlen+$rstartpos;
364 read(FA, $seq, $readlen) || die("Error reading $readlen bytes from fasta $fasta at offset $fstartpos\n");
365 close(FA);
366 }
367 else { #last interval goes to the end of sequence, but seqlen is not known
368 my $rend=$intvs[-1]->[0]+$intvs[-1]->[1]-2;
369 #$seq_len=$rend-$rstart+1;
370 local $_;
371 while (<FA> && !m/^>/) {
372 $seq.=$_;
373 }
374 close(FA);
375 }
376 $seq=~tr/\n\r//d;
377 $seq_len=length($seq);
378 #now extract the ranges:
379 my @rdata;
380 my $txt_intvs;
381 foreach my $r (@intvs) {
382 push(@rdata, [$$r[0]-$rstart, $$r[1]]);
383 #$spliceseq.=substr($subseq, $cstart, $clen);
384 $txt_intvs.='|'.$$r[0].'-'.($$r[0]+$$r[1]-1);
385 }
386 $defline.=$txt_intvs.$chr_strand unless $udefline;
387 process_seq(\@rdata, $rstart, $revC);
388 }
389
390 sub process_seq {
391 #my $range = $rangekey ? $ranges{$rangekey} : $ranges{$_[0]};
392 return unless $seq;
393 my ($range, $basepos, $revC)=@_;
394 if ($maxnspan) {
395 $seq=~s/[NX]{$maxnspan,}/$maxnrepl/ig;
396 $seq_len=length($seq);
397 }
398 if ($Getopt::Std::opt_U) {
399 $seq=~s/[<>\d\s\-\*\.\,\;\:,\x7B-\xFF]+//sg;
400 $seq_len=length($seq);
401 }
402 if ($rna2dna) {
403 $seq =~ tr/Uu/Tt/;
404 }
405 $seq=uc($seq) if $Getopt::Std::opt_P;
406 if ($trimNs) {
407 $seq=~s/^[NX]+//;
408 $seq=~s/[NX]+$//;
409 $seq_len=length($seq);
410 }
411 if ($minseqlen && $seq_len<$minseqlen) {
412 $seq='';
413 $seq_len=0;
414 return;
415 }
416 if ($maxpercn) {
417 my $nN=($seq=~tr/AaCcGgTtUu//c);
418 if ((($nN*100.0)/$seq_len)>$maxpercn) {
419 $seq='';
420 $seq_len=0;
421 return;
422 }
423 }
424 # basepos is a 0 based offset
425 unless ($range || @ranges==0) {
426 $range=$ranges[0]->[3];
427 #print STDERR "using default, first ranges!\n"; #DEBUG
428 }
429 #print STDERR "key=$_[0], range=$$range[0] ($$range[1])\n";
430 #print STDERR "$range\t$seq\n";
431
432 $seqcount++;
433 if ($Getopt::Std::opt_M) { #merge only;
434 #merge all input
435 if ($mergedist>0 && $seqcount>1) {
436 $seq=$m_linerest.$mergesep.$seq;
437 }
438 else {
439 $seq=$m_linerest.$seq;
440 }
441 $m_linerest='';
442 $seq_len=length($seq);
443 for (my $p=0;$p<$seq_len;$p+=$linelen) {
444 my $sline=substr($seq, $p, $linelen);
445 if (length($sline)==$linelen) {
446 print $sline."\n";
447 }
448 else {
449 $m_linerest=$sline;
450 last;
451 }
452 }
453 $seq='';
454 $seq_len=0;
455 return;
456 }
457 my @seqranges;
458 my $intr_ends=''; # 0 1 2 3
459 my @gfdata; #array of [exon_start, exon_len, cdsphase, diagram]
460 if ($range) { #extract range
461 my @intvs = sort { $main::a->[0] <=> $main::b->[0] } @$range;
462 #print STDERR length($seq)." (seq_len=$seq_len)\n";
463 #print STDERR "beginning: ".substr($seq,0,10)."\n";
464 #print STDERR " ending: ".substr($seq,-10)."\n";
465 my $i=0;
466 foreach my $rng (@intvs) {
467 $i++;
468 if ($$rng[1]>0) {
469 push(@seqranges, substr($seq, $$rng[0]-1, $$rng[1]));
470 my $ss_before=substr($seq, $$rng[0]-3,2);
471 $ss_before='NN' unless length($ss_before)==2;
472 my $ss_after=substr($seq, $$rng[0]+$$rng[1]-1,2);
473 $ss_after='NN' unless length($ss_after)==2;
474 $intr_ends.=$ss_before.$ss_after;
475 push(@gfdata, [$$rng[0], $$rng[1], 0,'']);
476 #print STDERR ">>$seq<<\n";
477 }
478 #elsif ($$rng[1]<0) { # DON"T use this or mix ranges it's not consistent!
479 # my $rseq=substr($seq, $$rng[0]-1, -$$rng[1]);
480 # push(@seqranges, reverseComplement($rseq));
481 # push(@gfdata, [$$rng[0], $$rng[1], 0, '']);
482 # }
483 else { #0 length means to the end of sequence
484 push(@seqranges, substr($seq, $$rng[0]-1));
485 push(@gfdata, [$$rng[0], length($seqranges[-1]), 0,'']);
486 }
487 }#for each range
488 #print STDERR "intr_ends=$intr_ends\n";
489 }
490 else { #no ranges requested, create a dummy one with the whole sequence
491 push(@seqranges, $seq);
492 push(@gfdata, [1, length($seqranges[-1]),0, '']);
493 }
494 # -- first and last codons
495 #there is that rare case when the start/stop codons can be
496 #broken/split by an intron
497 my ($fcodon, $lcodon);
498 my $num_exons=scalar(@seqranges);
499 if ($num_exons>1) {
500 my $l=$#seqranges;
501 ($fcodon, $lcodon)=( uc(substr($seqranges[0].$seqranges[1],0,3)),
502 uc(substr($seqranges[$l-1].$seqranges[$l],-3)) );
503 }
504 else { # single-exon
505 ($fcodon, $lcodon)=( uc(substr($seqranges[0],0,3)),
506 uc(substr($seqranges[-1],-3)) );
507 }
508 my $trframe=$Getopt::Std::opt_t || $Getopt::Std::opt_T;
509 my $splseq;
510 $revC=($revC || $complement);
511 foreach my $sq (@seqranges) {
512 # -- complement requested?
513 if ($revC) {
514 $sq=reverseComplement($sq);
515 #complement the range coordinates too!
516 #$rStart=length($seq)-($rStart+$rLen)+2 if $range;
517 $splseq=$sq.$splseq;
518 }
519 else {
520 $splseq.=$sq;
521 }
522 } # for each range
523
524 if (length($intr_ends)>4) { # have introns
525 $intr_ends=substr($intr_ends,2,-2);
526 if ($revC) {
527 $intr_ends=reverseComplement($intr_ends);
528 }
529 }
530
531 if ($revC) {
532 @gfdata=reverse(@gfdata);
533 ($fcodon, $lcodon)=(reverseComplement($lcodon),
534 reverseComplement($fcodon));
535 }
536 # determine CDS phases
537 my $initphase=0; #could be provided
538 my $acclen=3-$initphase;
539 foreach my $x (@gfdata) {
540 #$$x[2]=($initframe+$acclen) % 3;
541 $$x[2]=(3-$acclen%3) % 3;
542 $acclen+=$$x[1];
543 }
544 my $printflag=1; #should it be printed or not?
545 # $splseq has the actual sequence to be processed
546 # ========== vvvv - any other seq processing/trimming should be applied here:
547 if ($getOrf) {
548 my ($orf_start, $orf_end);
549 ($splseq, $orf_start, $orf_end)=getLongestOrf($splseq);
550 if ($orf_start<=0) {
551 $defline.=' ORF:none';
552 $orf_start=0;$orf_end=0;
553 $splseq='';
554 $printflag=0;
555 }
556 else {
557 $defline.=" ORF:$orf_start-$orf_end" if $defline;
558 }
559 }
560 my $transl;
561 if ($trframe) {
562 $transl=&dna2prot($splseq, $trframe); #<trframe>th frame translate request
563 $splseq=$transl;
564 }
565 elsif ($Getopt::Std::opt_Z) {
566 $splseq=&prot2dna($splseq); #back-translate request
567 }
568 # ========== ^^^^
569 #------ output processing here:
570 if ($validateCDS) {
571 $transl=&dna2prot($splseq, $trframe) unless $transl;
572 $printflag= ($splseq%3 == 0 && $codons{$fcodon} eq 'M'
573 && index($transl, '.')==length($transl)-1)
574 }
575 unless ($printflag) {
576 $seq=''; # signal that it was processed
577 $seq_len=0;
578 return;
579 }
580 unless ($gffout) {
581 if ($defline) { #format back to fasta
582 print $defline."\n" unless ($Getopt::Std::opt_X);
583 }
584 else { #no defline found in the original file
585 if ($Getopt::Std::opt_D && !$Getopt::Std::opt_X) {
586 #my $defl='>SEQ'.$seqcount.'_'.length($seq);
587 my $defl='>SEQ'.$seqcount.'_'.$seq_len;
588 print "$defl\n";
589 }
590 }
591 } #print defline here
592 unless ($gffout) {
593 if ($Getopt::Std::opt_L) { #one line printing
594 print $splseq."\n";
595 }
596 else { # fasta multi-line printing
597 for (my $p=0;$p<length($splseq);$p+=$linelen) {
598 my $pl=substr($splseq, $p, $linelen)."\n";
599 print( ($trframe && $ansicolors) ? &colorize($pl, '.') : $pl );
600 }
601 }
602 }
603 my @wmsg;
604 if ($exoninfo || $gffout) {
605 $transl=&dna2prot($splseq, $trframe) unless $transl;
606 push(@wmsg,'NoStartCodon') if $codons{$fcodon} ne 'M';
607 #$wmsg.=';NoStopCodon' if $codons{$lcodon} ne '.';
608 push(@wmsg,'NoStopCodon') if substr($transl,-1) ne '.';
609 }
610
611 if ($exoninfo) { #show exon info:
612 print STDERR '----> Exon/CDS info ('.$num_exons." exons):\n";
613 my $afcodon=$fcodon;
614 $afcodon=$clgreen.$fcodon.$clreset if ($fcodon eq 'ATG');
615 my @gfd;;
616 my $pstrand=' ';
617 if ($revC) {
618 @gfd=map { $_->[0]+$_->[1]-1 } (@gfdata);
619 $pstrand='-';
620 }
621 else { @gfd=map { $_->[0] } @gfdata; }
622 printf STDERR '%8d%s [%s~~', $basepos+$gfd[0],$pstrand,$afcodon;
623 #print STDERR ' '.($basepos+int($gfdata[0]->[2])).' ['.$fcodon.'~~';
624 if (@gfdata>1) { # multi-exon
625 my @splsites=unpack('(A4)*',$intr_ends);
626 my $xn=1;
627 foreach my $pair (@splsites) {
628 my $don=uc(substr($pair,0,2));
629 $don=$clred.$don.$clreset if ($ansicolors && $don ne 'GT');
630 my $acc=uc(substr($pair,-2));
631 $acc=$clred.$acc.$clreset if ($ansicolors && $acc ne 'AG');
632 print STDERR '~~~]'.$don."\n";
633 #print STDERR ' '.($basepos+int($gfdata[$xn]->[2])).' '.
634 # $acc. '[~~~~~';
635 printf STDERR '%8d%s %s[~~~~~', $basepos+$gfd[$xn], $pstrand,$acc;
636 $xn++;
637 }
638 } #multi-exon
639 my $alcodon=$lcodon;
640 $alcodon=$clgreen.$lcodon.$clreset if $codons{uc($lcodon)} eq '.';
641 print STDERR $lcodon."]\n----------------------------";
642 if (@wmsg>0) {
643 print STDERR "Warning: ".$clred.join($clreset.', '.$clred,@wmsg).$clreset;
644 }
645 print STDERR "\n";
646 }
647 if ($gffout) {
648 #show GFF output
649 my $strand='+';
650 if ($revC) {
651 @gfdata=reverse(@gfdata); #restore in fact, because we reversed it earlier
652 $strand='-';
653 }
654 my ($seqid)=($defline=~m/^>(\S+)/);
655 my $warns='';
656 $warns=';'.join(';',@wmsg) if (@wmsg>0);
657 print join("\t", $seqid, 'jigsaw', 'mRNA', $basepos+$gfdata[0]->[0],
658 $basepos+$gfdata[-1]->[0]+$gfdata[-1]->[1]-1,'.',$strand,'.', 'ID=tjsm.XX;Name=tjsm.XX'.$warns)."\n";
659 foreach my $gfl (@gfdata) {
660 print join("\t", $seqid, 'jigsaw', 'CDS', $basepos+$$gfl[0],
661 $basepos+$$gfl[0]+$$gfl[1]-1,'.',$strand,$$gfl[2], 'Parent=tjsm.XX')."\n";
662 }
663 }
664 $seq=''; #processed!
665 $seq_len=0;
666 }
667
668
669 sub parseRange {
670 my $r=$_[0];
671 my ($rStart, $sep, $rLen)= ($r =~ m/^(\d+)([ \t\.\-\:_]+)(\d*)/);
672 if ($rLen) {
673 if ($sep ne ':') { # start-to-end format
674 ($rLen, $rStart) = ($rLen<$rStart) ? ($rStart-$rLen+1, $rLen) : ($rLen-$rStart+1, $rStart);
675 }
676 }
677 #$rLen=-$rLen if ($rLen<0 && $r=~m/\d+\s*\-\s*$/);
678 $rLen=-$rLen if ($rLen<0);
679 return ($rStart, $rLen); #$rLen could be empty
680 }
681
682
683 sub colorize {
684 #colorize all $p strings in $s
685 my ($s, $p)=@_;
686 #$red="\e[1;33;41m";
687 #$normal="\e[0m";
688 my $r="\e[1;33;41m"; # bright yellow on red
689 my $b="\e[0m"; #reset colors
690 $p=~s/\./\\./g;
691 $s=~s/($p)/$r$1$b/g;
692 return $s;
693 }
694
695 sub reverseComplement {
696 my $s=reverse($_[0]);
697 $s =~ tr/AaCcTtGgUuMmRrWwSsYyKkVvHhDdBb/TtGgAaCcAaKkYyWwSsRrMmBbDdHhVv/;
698 return $s;
699 }
700
701 sub dna2prot {
702 my $frame=$_[1] || 1;
703 my $s;
704 if ($frame<0) {
705 $s=reverseComplement($_[0]);
706 $frame=-$frame;
707 }
708 else { $s=$_[0]; }
709 if ($frame==1) { $s= uc($s); }
710 else { $s = uc(substr($s, $frame-1)); }
711 #my @cods = ($s =~ m/([A-Z][A-Z][A-Z])/g);
712 my @cods = unpack('(A3)*', $s);
713 my $r;
714 foreach my $c (@cods) {
715 my $aa=$codons{$c} || 'X';
716 $r.=$aa;
717 }
718 return $r;
719 }
720
721 sub prot2dna {
722 my $s= uc($_[0]);
723 my @aa = ($s =~ m/([A-Z])/g);
724 my $r;
725 foreach my $a (@aa) {
726 my $codon=$toDNA{$a} || 'N';
727 $r.=$codon;
728 }
729 return $r;
730 }
731
732 sub getLongestOrf {
733 my ($seq)=@_;
734 my $best=0;
735 my ($bests,$beste)=(-1,-1);
736 #my $bestorf="";
737 my $seqlen=length($seq);
738 my @starts=();
739 my @ends=();
740 if ($getOrfAny) { # include start frames if not a stop codon directly
741 for (my $frame=0;$frame<3;$frame++) {
742 unless ($seq=~m/^.{$frame}(taa|tga|tag)/i) {
743 push @starts,$frame+1;
744 }
745 }
746 }
747 while ($seq=~m/(atg)/gi) {
748 push @starts,pos($seq)-2;
749 }
750
751 while ($seq=~m/(taa|tga|tag)/gi) {
752 push @ends,pos($seq)-2;
753 }
754 push @ends,($seqlen-2,$seqlen-1,$seqlen);
755 for my $s (@starts) {
756 for my $e (@ends) {
757 if ($e%3==$s%3 and $e>$s) {
758 if ($e-$s>$best) {
759 $best=$e-$s;
760 ($bests,$beste)=($s,$e);
761 #$bestorf=substr($seq,$s-1,$e-$s+1);
762 }
763 last
764 }
765 else {
766 next
767 }
768 } #for each end
769 } #for each start
770 if ($bests<=0) {
771 return ('',0,0);
772 }
773 $beste+=2 if $beste<=length($seq)-2; #make it so it includes the stop codon!
774 return (substr($seq,$bests-1,$beste-$bests+1), $bests, $beste);
775 }

Properties

Name Value
svn:executable *