ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/seqmanip
Revision: 160
Committed: Fri Feb 3 15:31:57 2012 UTC (7 years, 8 months ago) by gpertea
File size: 26499 byte(s)
Log Message:
added fq_extract script

Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     use strict;
3     use Getopt::Std;
4     use Fcntl qw(SEEK_SET SEEK_END SEEK_CUR); # for seek
5    
6     my $usage=q/
7     A simple fasta sequence(s) manipulation tool.
8     Usage:
9     seqmanip [-C] [-r<ranges>] [-L] [-D|-X] [-I] [-G] [-T|-t <frame>|-Z]
10     [-M|-m <ncount>] [-f <ranges_file>] [fasta file(s)...]
11     The input files should be one or more nucleotide or aminoacid sequences
12     in FASTA or raw format; if no files are given, input is expected at STDIN
13    
14     Options:
15     -C reverse complement (is -r is given, first the subsequence is extracted
16     and only then reverse-complemented)
17     -l the line length for the output fasta record (default 70)
18     -r extract a range (subsequence) from the first input fasta sequence;
19     <range> can have one of the formats:
20     <start>..<end> or <start>-<end> or <start>:<len>
21     Multiple such intervals can be given after the -r option (comma or space
22     delimited) and they will be spliced together
23     This option doesn't work with -M or -n options.
24     -f same as -r but for multi-fasta\/seq input; expects a file containing
25     lines of this format:
26     <seqname> <range>
27     The format of <range> is the same with the -r above or it may be just
28     space delimited <start> <end> numbers; the output will be a multi-fasta
29     file with one subsequence for each line in the <ranges_file>
30     -M merge (concatenate) all input sequences into a single sequence
31     -m merge all input sequences separating them by <ncount> Ns
32     -d provides a custom defline for -M and -N options
33     -D add basic defline if the input doesn't provide one
34     -X exclude defline from the output (just print the sequence)
35     -L provide one-line output for each sequence
36     -T first frame translation (DNA to aminoacid)
37     -t <frame>th frame translation; <frame> could be 1,2,3,-1,-2,-3
38     -E show exon\/splice-sites\/start\/stop info (requires -r)
39     -G show GFF output for the given range(s) (requires -r)
40     -Z raw reverse translate the output (aminoacid to DNA, rather useless)
41     -V only show a sequence if it's a valid, complete CDS
42     -q only show a sequence if it's longer than <minseqlen>
43     -n discard sequences with a percentage of Ns (or Xs) higher than <maxpercN>
44     -y remove spans of Ns longer than <maxnspan> replacing them by <maxnspan> Ns
45     -Y trim Ns from either end of sequence
46     -U cleanup\/remove extraneous characters(e.g. digits, dashes, stars)
47     from every input sequence
48     -R convert RNA to DNA (simply replacing U wih T)
49     -P convert all sequence letters to uppercase
50     -A use ANSI colors to highlight features in terminal (for -E option)
51     -O return longest ORF sequence for each input sequence (start..stop)
52     -Q same as -O but doesn't require a start codon
53     /;
54    
55     my %codons=(
56     'AAA'=>'K', 'AAC'=>'N', 'AAG'=>'K', 'AAR'=>'K', 'AAT'=>'N',
57     'AAY'=>'N', 'ACA'=>'T', 'ACB'=>'T', 'ACC'=>'T', 'ACD'=>'T',
58     'ACG'=>'T', 'ACH'=>'T', 'ACK'=>'T', 'ACM'=>'T', 'ACN'=>'T',
59     'ACR'=>'T', 'ACS'=>'T', 'ACT'=>'T', 'ACV'=>'T', 'ACW'=>'T',
60     'ACY'=>'T', 'AGA'=>'R', 'AGC'=>'S', 'AGG'=>'R', 'AGR'=>'R',
61     'AGT'=>'S', 'AGY'=>'S', 'ATA'=>'I', 'ATC'=>'I', 'ATG'=>'M',
62     'ATH'=>'I', 'ATM'=>'I', 'ATT'=>'I', 'ATW'=>'I', 'ATY'=>'I',
63     'CAA'=>'Q', 'CAC'=>'H', 'CAG'=>'Q', 'CAR'=>'Q', 'CAT'=>'H',
64     'CAY'=>'H', 'CCA'=>'P', 'CCB'=>'P', 'CCC'=>'P', 'CCD'=>'P',
65     'CCG'=>'P', 'CCH'=>'P', 'CCK'=>'P', 'CCM'=>'P', 'CCN'=>'P',
66     'CCR'=>'P', 'CCS'=>'P', 'CCT'=>'P', 'CCV'=>'P', 'CCW'=>'P',
67     'CCY'=>'P', 'CGA'=>'R', 'CGB'=>'R', 'CGC'=>'R', 'CGD'=>'R',
68     'CGG'=>'R', 'CGH'=>'R', 'CGK'=>'R', 'CGM'=>'R', 'CGN'=>'R',
69     'CGR'=>'R', 'CGS'=>'R', 'CGT'=>'R', 'CGV'=>'R', 'CGW'=>'R',
70     'CGY'=>'R', 'CTA'=>'L', 'CTB'=>'L', 'CTC'=>'L', 'CTD'=>'L',
71     'CTG'=>'L', 'CTH'=>'L', 'CTK'=>'L', 'CTM'=>'L', 'CTN'=>'L',
72     'CTR'=>'L', 'CTS'=>'L', 'CTT'=>'L', 'CTV'=>'L', 'CTW'=>'L',
73     'CTY'=>'L', 'GAA'=>'E', 'GAC'=>'D', 'GAG'=>'E', 'GAR'=>'E',
74     'GAT'=>'D', 'GAY'=>'D', 'GCA'=>'A', 'GCB'=>'A', 'GCC'=>'A',
75     'GCD'=>'A', 'GCG'=>'A', 'GCH'=>'A', 'GCK'=>'A', 'GCM'=>'A',
76     'GCN'=>'A', 'GCR'=>'A', 'GCS'=>'A', 'GCT'=>'A', 'GCV'=>'A',
77     'GCW'=>'A', 'GCY'=>'A', 'GGA'=>'G', 'GGB'=>'G', 'GGC'=>'G',
78     'GGD'=>'G', 'GGG'=>'G', 'GGH'=>'G', 'GGK'=>'G', 'GGM'=>'G',
79     'GGN'=>'G', 'GGR'=>'G', 'GGS'=>'G', 'GGT'=>'G', 'GGV'=>'G',
80     'GGW'=>'G', 'GGY'=>'G', 'GTA'=>'V', 'GTB'=>'V', 'GTC'=>'V',
81     'GTD'=>'V', 'GTG'=>'V', 'GTH'=>'V', 'GTK'=>'V', 'GTM'=>'V',
82     'GTN'=>'V', 'GTR'=>'V', 'GTS'=>'V', 'GTT'=>'V', 'GTV'=>'V',
83     'GTW'=>'V', 'GTY'=>'V', 'MGA'=>'R', 'MGG'=>'R', 'MGR'=>'R',
84     'NNN'=>'X', 'RAY'=>'B', 'SAR'=>'Z', 'TAA'=>'.', 'TAC'=>'Y',
85     'TAG'=>'.', 'TAR'=>'.', 'TAT'=>'Y', 'TAY'=>'Y', 'TCA'=>'S',
86     'TCB'=>'S', 'TCC'=>'S', 'TCD'=>'S', 'TCG'=>'S', 'TCH'=>'S',
87     'TCK'=>'S', 'TCM'=>'S', 'TCN'=>'S', 'TCR'=>'S', 'TCS'=>'S',
88     'TCT'=>'S', 'TCV'=>'S', 'TCW'=>'S', 'TCY'=>'S', 'TGA'=>'.',
89     'TGC'=>'C', 'TGG'=>'W', 'TGT'=>'C', 'TGY'=>'C', 'TRA'=>'.',
90     'TTA'=>'L', 'TTC'=>'F', 'TTG'=>'L', 'TTR'=>'L', 'TTT'=>'F',
91     'TTY'=>'F', 'XXX'=>'X', 'YTA'=>'L', 'YTG'=>'L', 'YTR'=>'L'
92     );
93    
94    
95     getopts('AZTUPMEGNRXDVCLYOQr:d:m:q:l:t:f:y:n:') || die "$usage\n";
96     #my %ranges; # seqname => [offset, len] only populated by -r or -f option;
97     my @ranges; # list of [chr, strand, defline, [[offset1,len1], [offset1,len2], ..]]
98     # 0 1 2 3
99     #my @range_chrs; # just to keep track of the order of ranges requested
100     my $fai_loaded=0;
101     my %fai_h; # seqID => [fpos, linelen, delimlen, seqlen]
102    
103     my $m_linerest; #for the merge case (-M)
104     my $linelen=$Getopt::Std::opt_l || 70;
105     my $user_defline=$Getopt::Std::opt_d || 'dummyname';
106     my $ansicolors=$Getopt::Std::opt_A;
107     #some ansi colors:
108     my ($clred, $clgreen, $clreset)=("\e[1;33;41m", # bright yellow on red
109     "\e[0;32;40m", # green on black
110     "\e[0m") #reset colors
111     if $ansicolors;
112     my %toDNA=reverse(%codons);
113     @toDNA{'P','F','K','G'}=('CCC','TTT','AAA','GGG');
114     my ($getOrf, $getOrfAny)=($Getopt::Std::opt_O, $Getopt::Std::opt_Q);
115     $getOrf=1 if $getOrfAny;
116     my $complement=$Getopt::Std::opt_C;
117     my $validateCDS=$Getopt::Std::opt_V;
118     my $exoninfo=$Getopt::Std::opt_E;
119     my $gffout=$Getopt::Std::opt_G;
120     my $rna2dna=$Getopt::Std::opt_R;
121     my $mergedist=$Getopt::Std::opt_m;
122     my $minseqlen=$Getopt::Std::opt_q;
123     my $maxnspan=$Getopt::Std::opt_y;
124     my $trimNs=$Getopt::Std::opt_Y;
125     my $maxnrepl='';
126     if ($maxnspan) {
127     $maxnrepl='N' x $maxnspan;
128     $maxnspan++;
129     }
130     my $maxpercn=$Getopt::Std::opt_n;
131     my $mergesep;
132     if ($mergedist>0) {
133     $Getopt::Std::opt_M=1;
134     $mergesep='N' x $mergedist;
135     }
136     my $rangetarget; #if a specific seq target was provided
137     # if this is not set, then the first fasta sequence given as input
138     # will be used for range queries
139     # parse range information if available
140     if ($Getopt::Std::opt_r) {
141     my $rangetext=$Getopt::Std::opt_r;
142     my $rstrand='+';
143     #if ($rangetext=~s/([\-\+])$//) { $rstrand=$2;}
144     if ($rangetext=~s/(\d+[\.\-\:_]+\d+)([\-\+])$/$1/) { $rstrand=$2; }
145     my @ar=split(/[\,\s]+/,$rangetext); #allow multiple ranges
146     my $seqtarget;
147     unless ($ar[0]=~m/^\d+\:\d+$/) {
148     if ($ar[0]=~s/^([\w\|\-\+\.]+)\://) {
149     # target chromosome given
150     $seqtarget=$1;
151     if ($seqtarget=~s/([\-\+])$//) { $rstrand=$1;}
152     $rangetarget=1;
153     }
154     }
155     if ($seqtarget) { # 0 1 2 3
156     push(@ranges, [$seqtarget,$rstrand,'', []]);
157     }
158     else {
159     push(@ranges, ['-',$rstrand,'',[]]);
160     }
161     my $rdata=$ranges[0]->[3]; # [ ]
162     foreach my $rt (@ar) {
163     my ($rStart, $rLen)=&parseRange($rt);
164     push(@$rdata, [$rStart, $rLen]);
165     }
166     }
167     elsif ($Getopt::Std::opt_f) { # file with ranges to pull
168     my $frname=$Getopt::Std::opt_f;
169     my $using_stdin;
170     if ($frname eq '-' || $frname eq 'stdin') {
171     #open(FRNG, <&STDIN);
172     open(FRNG, "<&=STDIN") or die "Couldn't alias STDIN : $!";
173     $using_stdin=1;
174     }
175     else {
176     open (FRNG, $frname) || die ("Error: cannot open seq ranges file '$frname'!\n");
177     }
178     while (<FRNG>) {
179     chomp;
180     my $rstrand='+';
181     my ($seqname, $rangetxt)=split(/\s+/,$_,2);
182     my $defline;
183     if ($seqname=~m/^([\w\|\-\+\.]+)\:(\d+[\.\-_]+\d*[\-\+]?)/) {
184     #chr:start-end format
185     # strand may follow chr or end
186     $seqname=$1;
187     $defline=$rangetxt;
188     $rangetxt=$2;
189     if ($seqname=~s/([\-\+])$//) { $rstrand=$1;}
190     elsif ($rangetxt=~s/(\d+[\.\-_]+\d+)([\-\+])$/$1/) { $rstrand=$2; }
191     }
192     else { #1st column is chr(strand), 2nd column is the interval
193     if ($seqname=~s/([\-\+])$//) { $rstrand=$1; }
194     $rangetxt=~s/(\d+)[\t ]+(\d+)/$1-$2/g;
195     my @rt=split(/\s+/,$rangetxt,2);
196     if ($rt[1]) {
197     $defline=$rt[1];
198     $rangetxt=$rt[0];
199     }
200     }
201     next unless $rangetxt;
202     push(@ranges, [$seqname, $rstrand, $defline, []]);
203     my $rdata=$ranges[-1]->[3];
204     #$rangetxt=~s/(\d+)[\t ]+(\d+)/$1-$2/g; #could be space delimited
205     my @ar=split(/\,/,$rangetxt);
206     foreach my $rt (@ar) {
207     my ($rStart, $rLen)=&parseRange($rt);
208     push(@$rdata, [$rStart, $rLen]);
209     }
210     }
211     $rangetarget=1 if @ranges>0;
212     close(FRNG) unless $using_stdin;
213     }
214     my $seq;
215     my $seq_len;
216     my $count=0;
217     my $defline;
218     my $curseq;
219     my $seqcount;
220     print ">$user_defline\n" if ($Getopt::Std::opt_M && !$Getopt::Std::opt_X);
221     die("Error: target sequence(s) given but no input file!\n")
222     if ($rangetarget && (@ARGV==0 || ! -f $ARGV[0]));
223     my $qrange=(@ranges>0 && @ARGV>0 && -f $ARGV[0]);
224     if ($qrange) { #target
225     if (@ranges==1 && $ranges[0]->[0] eq '-') { #single fasta file
226     processRanges($ARGV[0], $ranges[0]);
227     } # single-fasta file
228     else {
229     #actual sequence names provided, need cdbfasta or samtools index
230     my $fasta=$ARGV[0];
231     #die("Error: no fasta file $fasta!\n") unless -f $fasta;
232     my $faidx;
233     if ($fasta=~s/\.fai$//) {
234     $faidx=$fasta.'.fai';
235     }
236     else {
237     $faidx=$fasta.'.fai' if -f $fasta.'.fai';
238     }
239     unless ($faidx) {
240     if ($fasta=~s/\.cidx$//) {
241     $faidx=$fasta.'.cidx';
242     }
243     else {
244     $faidx=$fasta.'.cidx' if -f $fasta.'.cidx';
245     }
246     }
247     #print STDERR "info: using fasta index $faidx for $fasta.\n";
248     die("Error: no index file for fasta file $fasta") unless $faidx;
249     die("Error: no fasta file $fasta!\n") unless -f $fasta;
250     foreach my $rd (@ranges) {
251     processRanges($fasta, $rd, $faidx);
252     }
253     } # cidx file needed
254     } # fast[er] range queries
255     else { # normal (serial) stream processing, very slow range processing for large sequences
256     while (<>) {
257     if (m/^(>.+)/) {
258     process_seq() if $curseq; #sequence is in $seq variable
259     $defline=$1;
260     ($curseq)=($defline=~m/^>(\S+)/);
261     $seq_len=0;
262     next;
263     }
264     chomp;
265     tr/ \t\r\n//d;
266     $seq.=$_;
267     $seq_len+=length();
268     process_seq() unless $curseq; #no defline = raw input: one sequence per line
269     }
270     process_seq() if $curseq; #sequence is in $seq variable
271     }
272    
273     print "$m_linerest\n" if ($Getopt::Std::opt_M && $m_linerest);
274    
275     #============================
276    
277     sub fai_load {
278     open(FAI, $_[0]) || die ("Error opening index file $_[0]!\n");
279     while (<FAI>) {
280     my @t=split(/\t/);# 0=seqID 1=seqlen 2=fpos 3=linelen 4=linelen+dlen
281     next if m/^#/ && @t<5;
282     $fai_h{$t[0]}=[$t[2], $t[3], $t[4]-$t[3], $t[1]];
283     # seqID => [fpos, linelen, delimlen, seqlen]
284     }
285     close(FAI);
286     $fai_loaded=1;
287     }
288    
289     sub processRanges {
290     #my ($fasta, $chr, $chr_rdata, $faidx, $udefline)=@_;
291     my ($fasta, $r_data, $faidx)=@_;
292     my $chr=$$r_data[0];
293     $chr='' if $chr eq '-';
294     my $chr_strand=$$r_data[1];
295     my $udefline=$$r_data[2];
296     my $chr_rdata=$$r_data[3];
297     my @intvs = sort { $main::a->[0] <=> $main::b->[0] } @$chr_rdata; #range intervals
298     my ($fpos, $seqlen, $dlen, $flinelen);
299     open(FA, $fasta) || die("Error opening fasta file $fasta\n");
300     $fpos=0;
301     my $iscdb=($faidx=~m/\.cidx$/);
302     if ($chr) {
303     die("Error: target sequence given, but no fasta index\n") unless $faidx;
304     if ($iscdb) {
305     my $r=`cdbyank -a '$chr' -P $faidx`;
306     my $syserr=$?;
307     chomp($r);
308     die("Error at cdbyank -a '$chr' -P $faidx (exit code $syserr)\n") if length($r)==0 || $syserr;
309     $fpos=int($r);
310     seek(FA, $fpos, SEEK_SET);
311     }
312     else { #fasta index
313     #only load the first time
314     fai_load($faidx) unless ($fai_loaded);
315     my $cd=$fai_h{$chr};
316     die("Error retrieving $chr data from fasta index!\n") unless $cd;
317     ($fpos, $flinelen, $dlen, $seqlen)=@$cd;
318     $intvs[-1]->[1]=$seqlen-$intvs[-1]->[0]+1 unless $intvs[-1]->[1];
319     if ($intvs[-1]->[1]<=0) {
320     #invalid range
321     $seq='';
322     $defline='';
323     return;
324     }
325     $defline = $udefline ? '>'.$udefline : ">$chr";
326     seek(FA, $fpos, SEEK_SET);
327     } #samtools fai
328     } #indexed access
329     if ($iscdb || !$chr) {
330     #for cdb or plain fasta we have to determine line length by reading the first 1k bytes
331     my $rbuf;
332     read(FA, $rbuf, 1024);
333     my @lines=split(/[\n\r]+/,$rbuf);
334     my @ldelim=($rbuf=~m/([\n\r]+)/gs);
335     if (@ldelim<2) { #we need at least 2 full lines read
336     my $radd;
337     read(FA, $radd, 4096);
338     $rbuf.=$radd;
339     @lines=split(/[\n\r]+/,$rbuf);
340     @ldelim=($rbuf=~m/([\n\r]+)/gs);
341     }
342     $dlen=length($ldelim[0]);
343     die("Error determining line ending type for $fasta!\n") if ($dlen==0);
344     $defline = $udefline ? ">$udefline" : $lines[0];
345     $flinelen=length($lines[1]); #line length in fasta record
346     $fpos+=length($lines[0])+$dlen;
347     seek(FA, $fpos, SEEK_SET); #reposition at the beginning of the sequence
348     }
349     #now we are positioned at the beginning of the fasta record sequence
350     #seek(FA, $fpos+length($lines[0])+$dlen, SEEK_SET);
351     $seq='';
352     my $revC=1 if $chr_strand eq '-';
353     my $rstart=$intvs[0]->[0]-1; #first range start position, 0-based
354     my $rstartpos= ($rstart<$flinelen) ? $rstart : ($rstart+$dlen*(int($rstart/$flinelen)));
355     my $fstartpos=$fpos+$rstartpos;
356     seek(FA, $fstartpos, SEEK_SET);
357     if ($intvs[-1]->[1]) { #ending for last interval is known
358     my $rend=$intvs[-1]->[0]+$intvs[-1]->[1]-2;
359     #$seq_len=$rend-$rstart+1;
360     my $rendpos=($rend<$flinelen) ? $rend : ($rend+$dlen*(int($rend/$flinelen)));
361     #print STDERR "pulling from $fstartpos to $rendpos\n";
362     my $readlen=$rendpos-$rstartpos+1;
363     #my $fstartpos=$fpos+length($lines[0])+$dlen+$rstartpos;
364     read(FA, $seq, $readlen) || die("Error reading $readlen bytes from fasta $fasta at offset $fstartpos\n");
365     close(FA);
366     }
367     else { #last interval goes to the end of sequence, but seqlen is not known
368     my $rend=$intvs[-1]->[0]+$intvs[-1]->[1]-2;
369     #$seq_len=$rend-$rstart+1;
370     local $_;
371     while (<FA> && !m/^>/) {
372     $seq.=$_;
373     }
374     close(FA);
375     }
376     $seq=~tr/\n\r//d;
377     $seq_len=length($seq);
378     #now extract the ranges:
379     my @rdata;
380     my $txt_intvs;
381     foreach my $r (@intvs) {
382     push(@rdata, [$$r[0]-$rstart, $$r[1]]);
383     #$spliceseq.=substr($subseq, $cstart, $clen);
384     $txt_intvs.='|'.$$r[0].'-'.($$r[0]+$$r[1]-1);
385     }
386     $defline.=$txt_intvs.$chr_strand unless $udefline;
387     process_seq(\@rdata, $rstart, $revC);
388     }
389    
390     sub process_seq {
391     #my $range = $rangekey ? $ranges{$rangekey} : $ranges{$_[0]};
392     return unless $seq;
393     my ($range, $basepos, $revC)=@_;
394     if ($maxnspan) {
395     $seq=~s/[NX]{$maxnspan,}/$maxnrepl/ig;
396     $seq_len=length($seq);
397     }
398     if ($Getopt::Std::opt_U) {
399     $seq=~s/[<>\d\s\-\*\.\,\;\:,\x7B-\xFF]+//sg;
400     $seq_len=length($seq);
401     }
402     if ($rna2dna) {
403     $seq =~ tr/Uu/Tt/;
404     }
405     $seq=uc($seq) if $Getopt::Std::opt_P;
406     if ($trimNs) {
407     $seq=~s/^[NX]+//;
408     $seq=~s/[NX]+$//;
409     $seq_len=length($seq);
410     }
411     if ($minseqlen && $seq_len<$minseqlen) {
412     $seq='';
413     $seq_len=0;
414     return;
415     }
416     if ($maxpercn) {
417     my $nN=($seq=~tr/AaCcGgTtUu//c);
418     if ((($nN*100.0)/$seq_len)>$maxpercn) {
419     $seq='';
420     $seq_len=0;
421     return;
422     }
423     }
424     # basepos is a 0 based offset
425     unless ($range || @ranges==0) {
426     $range=$ranges[0]->[3];
427     #print STDERR "using default, first ranges!\n"; #DEBUG
428     }
429     #print STDERR "key=$_[0], range=$$range[0] ($$range[1])\n";
430     #print STDERR "$range\t$seq\n";
431    
432     $seqcount++;
433     if ($Getopt::Std::opt_M) { #merge only;
434     #merge all input
435     if ($mergedist>0 && $seqcount>1) {
436     $seq=$m_linerest.$mergesep.$seq;
437     }
438     else {
439     $seq=$m_linerest.$seq;
440     }
441     $m_linerest='';
442     $seq_len=length($seq);
443     for (my $p=0;$p<$seq_len;$p+=$linelen) {
444     my $sline=substr($seq, $p, $linelen);
445     if (length($sline)==$linelen) {
446     print $sline."\n";
447     }
448     else {
449     $m_linerest=$sline;
450     last;
451     }
452     }
453     $seq='';
454     $seq_len=0;
455     return;
456     }
457     my @seqranges;
458     my $intr_ends=''; # 0 1 2 3
459     my @gfdata; #array of [exon_start, exon_len, cdsphase, diagram]
460     if ($range) { #extract range
461     my @intvs = sort { $main::a->[0] <=> $main::b->[0] } @$range;
462     #print STDERR length($seq)." (seq_len=$seq_len)\n";
463     #print STDERR "beginning: ".substr($seq,0,10)."\n";
464     #print STDERR " ending: ".substr($seq,-10)."\n";
465     my $i=0;
466     foreach my $rng (@intvs) {
467     $i++;
468     if ($$rng[1]>0) {
469     push(@seqranges, substr($seq, $$rng[0]-1, $$rng[1]));
470     my $ss_before=substr($seq, $$rng[0]-3,2);
471     $ss_before='NN' unless length($ss_before)==2;
472     my $ss_after=substr($seq, $$rng[0]+$$rng[1]-1,2);
473     $ss_after='NN' unless length($ss_after)==2;
474     $intr_ends.=$ss_before.$ss_after;
475     push(@gfdata, [$$rng[0], $$rng[1], 0,'']);
476     #print STDERR ">>$seq<<\n";
477     }
478     #elsif ($$rng[1]<0) { # DON"T use this or mix ranges it's not consistent!
479     # my $rseq=substr($seq, $$rng[0]-1, -$$rng[1]);
480     # push(@seqranges, reverseComplement($rseq));
481     # push(@gfdata, [$$rng[0], $$rng[1], 0, '']);
482     # }
483     else { #0 length means to the end of sequence
484     push(@seqranges, substr($seq, $$rng[0]-1));
485     push(@gfdata, [$$rng[0], length($seqranges[-1]), 0,'']);
486     }
487     }#for each range
488     #print STDERR "intr_ends=$intr_ends\n";
489     }
490     else { #no ranges requested, create a dummy one with the whole sequence
491     push(@seqranges, $seq);
492     push(@gfdata, [1, length($seqranges[-1]),0, '']);
493     }
494     # -- first and last codons
495     #there is that rare case when the start/stop codons can be
496     #broken/split by an intron
497     my ($fcodon, $lcodon);
498 gpertea 160 my $num_exons=scalar(@seqranges);
499     if ($num_exons>1) {
500 gpertea 23 my $l=$#seqranges;
501     ($fcodon, $lcodon)=( uc(substr($seqranges[0].$seqranges[1],0,3)),
502     uc(substr($seqranges[$l-1].$seqranges[$l],-3)) );
503     }
504     else { # single-exon
505     ($fcodon, $lcodon)=( uc(substr($seqranges[0],0,3)),
506     uc(substr($seqranges[-1],-3)) );
507     }
508     my $trframe=$Getopt::Std::opt_t || $Getopt::Std::opt_T;
509     my $splseq;
510     $revC=($revC || $complement);
511     foreach my $sq (@seqranges) {
512     # -- complement requested?
513     if ($revC) {
514     $sq=reverseComplement($sq);
515     #complement the range coordinates too!
516     #$rStart=length($seq)-($rStart+$rLen)+2 if $range;
517     $splseq=$sq.$splseq;
518     }
519     else {
520     $splseq.=$sq;
521     }
522     } # for each range
523    
524     if (length($intr_ends)>4) { # have introns
525     $intr_ends=substr($intr_ends,2,-2);
526     if ($revC) {
527     $intr_ends=reverseComplement($intr_ends);
528     }
529     }
530    
531     if ($revC) {
532     @gfdata=reverse(@gfdata);
533     ($fcodon, $lcodon)=(reverseComplement($lcodon),
534     reverseComplement($fcodon));
535     }
536     # determine CDS phases
537     my $initphase=0; #could be provided
538     my $acclen=3-$initphase;
539     foreach my $x (@gfdata) {
540     #$$x[2]=($initframe+$acclen) % 3;
541     $$x[2]=(3-$acclen%3) % 3;
542     $acclen+=$$x[1];
543     }
544     my $printflag=1; #should it be printed or not?
545     # $splseq has the actual sequence to be processed
546     # ========== vvvv - any other seq processing/trimming should be applied here:
547     if ($getOrf) {
548     my ($orf_start, $orf_end);
549     ($splseq, $orf_start, $orf_end)=getLongestOrf($splseq);
550     if ($orf_start<=0) {
551     $defline.=' ORF:none';
552     $orf_start=0;$orf_end=0;
553     $splseq='';
554     $printflag=0;
555     }
556     else {
557     $defline.=" ORF:$orf_start-$orf_end" if $defline;
558     }
559     }
560     my $transl;
561     if ($trframe) {
562     $transl=&dna2prot($splseq, $trframe); #<trframe>th frame translate request
563     $splseq=$transl;
564     }
565     elsif ($Getopt::Std::opt_Z) {
566     $splseq=&prot2dna($splseq); #back-translate request
567     }
568     # ========== ^^^^
569     #------ output processing here:
570     if ($validateCDS) {
571     $transl=&dna2prot($splseq, $trframe) unless $transl;
572     $printflag= ($splseq%3 == 0 && $codons{$fcodon} eq 'M'
573     && index($transl, '.')==length($transl)-1)
574     }
575     unless ($printflag) {
576     $seq=''; # signal that it was processed
577     $seq_len=0;
578     return;
579     }
580     unless ($gffout) {
581     if ($defline) { #format back to fasta
582     print $defline."\n" unless ($Getopt::Std::opt_X);
583     }
584     else { #no defline found in the original file
585     if ($Getopt::Std::opt_D && !$Getopt::Std::opt_X) {
586     #my $defl='>SEQ'.$seqcount.'_'.length($seq);
587     my $defl='>SEQ'.$seqcount.'_'.$seq_len;
588     print "$defl\n";
589     }
590     }
591     } #print defline here
592     unless ($gffout) {
593     if ($Getopt::Std::opt_L) { #one line printing
594     print $splseq."\n";
595     }
596     else { # fasta multi-line printing
597     for (my $p=0;$p<length($splseq);$p+=$linelen) {
598     my $pl=substr($splseq, $p, $linelen)."\n";
599     print( ($trframe && $ansicolors) ? &colorize($pl, '.') : $pl );
600     }
601     }
602     }
603     my @wmsg;
604     if ($exoninfo || $gffout) {
605     $transl=&dna2prot($splseq, $trframe) unless $transl;
606     push(@wmsg,'NoStartCodon') if $codons{$fcodon} ne 'M';
607     #$wmsg.=';NoStopCodon' if $codons{$lcodon} ne '.';
608     push(@wmsg,'NoStopCodon') if substr($transl,-1) ne '.';
609     }
610    
611     if ($exoninfo) { #show exon info:
612 gpertea 160 print STDERR '----> Exon/CDS info ('.$num_exons." exons):\n";
613 gpertea 23 my $afcodon=$fcodon;
614     $afcodon=$clgreen.$fcodon.$clreset if ($fcodon eq 'ATG');
615     my @gfd;;
616     my $pstrand=' ';
617     if ($revC) {
618     @gfd=map { $_->[0]+$_->[1]-1 } (@gfdata);
619     $pstrand='-';
620     }
621     else { @gfd=map { $_->[0] } @gfdata; }
622     printf STDERR '%8d%s [%s~~', $basepos+$gfd[0],$pstrand,$afcodon;
623     #print STDERR ' '.($basepos+int($gfdata[0]->[2])).' ['.$fcodon.'~~';
624     if (@gfdata>1) { # multi-exon
625     my @splsites=unpack('(A4)*',$intr_ends);
626     my $xn=1;
627     foreach my $pair (@splsites) {
628 gpertea 110 my $don=uc(substr($pair,0,2));
629 gpertea 23 $don=$clred.$don.$clreset if ($ansicolors && $don ne 'GT');
630 gpertea 110 my $acc=uc(substr($pair,-2));
631 gpertea 23 $acc=$clred.$acc.$clreset if ($ansicolors && $acc ne 'AG');
632     print STDERR '~~~]'.$don."\n";
633     #print STDERR ' '.($basepos+int($gfdata[$xn]->[2])).' '.
634     # $acc. '[~~~~~';
635     printf STDERR '%8d%s %s[~~~~~', $basepos+$gfd[$xn], $pstrand,$acc;
636     $xn++;
637     }
638     } #multi-exon
639     my $alcodon=$lcodon;
640     $alcodon=$clgreen.$lcodon.$clreset if $codons{uc($lcodon)} eq '.';
641     print STDERR $lcodon."]\n----------------------------";
642     if (@wmsg>0) {
643     print STDERR "Warning: ".$clred.join($clreset.', '.$clred,@wmsg).$clreset;
644     }
645     print STDERR "\n";
646     }
647     if ($gffout) {
648     #show GFF output
649     my $strand='+';
650     if ($revC) {
651     @gfdata=reverse(@gfdata); #restore in fact, because we reversed it earlier
652     $strand='-';
653     }
654     my ($seqid)=($defline=~m/^>(\S+)/);
655     my $warns='';
656     $warns=';'.join(';',@wmsg) if (@wmsg>0);
657     print join("\t", $seqid, 'jigsaw', 'mRNA', $basepos+$gfdata[0]->[0],
658     $basepos+$gfdata[-1]->[0]+$gfdata[-1]->[1]-1,'.',$strand,'.', 'ID=tjsm.XX;Name=tjsm.XX'.$warns)."\n";
659     foreach my $gfl (@gfdata) {
660     print join("\t", $seqid, 'jigsaw', 'CDS', $basepos+$$gfl[0],
661     $basepos+$$gfl[0]+$$gfl[1]-1,'.',$strand,$$gfl[2], 'Parent=tjsm.XX')."\n";
662     }
663     }
664     $seq=''; #processed!
665     $seq_len=0;
666     }
667    
668    
669     sub parseRange {
670     my $r=$_[0];
671     my ($rStart, $sep, $rLen)= ($r =~ m/^(\d+)([ \t\.\-\:_]+)(\d*)/);
672     if ($rLen) {
673     if ($sep ne ':') { # start-to-end format
674     ($rLen, $rStart) = ($rLen<$rStart) ? ($rStart-$rLen+1, $rLen) : ($rLen-$rStart+1, $rStart);
675     }
676     }
677     #$rLen=-$rLen if ($rLen<0 && $r=~m/\d+\s*\-\s*$/);
678     $rLen=-$rLen if ($rLen<0);
679     return ($rStart, $rLen); #$rLen could be empty
680     }
681    
682    
683     sub colorize {
684     #colorize all $p strings in $s
685     my ($s, $p)=@_;
686     #$red="\e[1;33;41m";
687     #$normal="\e[0m";
688     my $r="\e[1;33;41m"; # bright yellow on red
689     my $b="\e[0m"; #reset colors
690     $p=~s/\./\\./g;
691     $s=~s/($p)/$r$1$b/g;
692     return $s;
693     }
694    
695     sub reverseComplement {
696     my $s=reverse($_[0]);
697     $s =~ tr/AaCcTtGgUuMmRrWwSsYyKkVvHhDdBb/TtGgAaCcAaKkYyWwSsRrMmBbDdHhVv/;
698     return $s;
699     }
700    
701     sub dna2prot {
702     my $frame=$_[1] || 1;
703     my $s;
704     if ($frame<0) {
705     $s=reverseComplement($_[0]);
706     $frame=-$frame;
707     }
708     else { $s=$_[0]; }
709     if ($frame==1) { $s= uc($s); }
710     else { $s = uc(substr($s, $frame-1)); }
711     #my @cods = ($s =~ m/([A-Z][A-Z][A-Z])/g);
712     my @cods = unpack('(A3)*', $s);
713     my $r;
714     foreach my $c (@cods) {
715     my $aa=$codons{$c} || 'X';
716     $r.=$aa;
717     }
718     return $r;
719     }
720    
721     sub prot2dna {
722     my $s= uc($_[0]);
723     my @aa = ($s =~ m/([A-Z])/g);
724     my $r;
725     foreach my $a (@aa) {
726     my $codon=$toDNA{$a} || 'N';
727     $r.=$codon;
728     }
729     return $r;
730     }
731    
732     sub getLongestOrf {
733     my ($seq)=@_;
734     my $best=0;
735     my ($bests,$beste)=(-1,-1);
736     #my $bestorf="";
737     my $seqlen=length($seq);
738     my @starts=();
739     my @ends=();
740     if ($getOrfAny) { # include start frames if not a stop codon directly
741     for (my $frame=0;$frame<3;$frame++) {
742     unless ($seq=~m/^.{$frame}(taa|tga|tag)/i) {
743     push @starts,$frame+1;
744     }
745     }
746     }
747     while ($seq=~m/(atg)/gi) {
748     push @starts,pos($seq)-2;
749     }
750    
751     while ($seq=~m/(taa|tga|tag)/gi) {
752     push @ends,pos($seq)-2;
753     }
754     push @ends,($seqlen-2,$seqlen-1,$seqlen);
755     for my $s (@starts) {
756     for my $e (@ends) {
757     if ($e%3==$s%3 and $e>$s) {
758     if ($e-$s>$best) {
759     $best=$e-$s;
760     ($bests,$beste)=($s,$e);
761     #$bestorf=substr($seq,$s-1,$e-$s+1);
762     }
763     last
764     }
765     else {
766     next
767     }
768     } #for each end
769     } #for each start
770     if ($bests<=0) {
771     return ('',0,0);
772     }
773     $beste+=2 if $beste<=length($seq)-2; #make it so it includes the stop codon!
774     return (substr($seq,$bests-1,$beste-$bests+1), $bests, $beste);
775     }

Properties

Name Value
svn:executable *