ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/tran6frames.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 5880 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4
5 my $usage=q{
6 Usage:
7
8 tran6frames.pl [-O] [-t <tag>] [-R] <seqfile>
9
10 -R : the sequence is masked for repeats
11 (with lower case letters) and the translation
12 will ignore (skip) such regions
13 -O : use the offset information in the defline of each fasta
14 record for generating the base name of the translated
15 sequences; the generated ID will have the format:
16 <original_seqid>|<original_offset>|<tran_start>_<tran_end>
17 -t add the provided string <tag> to the seq ID generated for each
18 record; the new ID will have the format:
19 <original_seqid>|<tag>|<tran_start>_<tran_end>
20 };
21
22 getopts('ORt:') || die($usage."\n");
23
24 my ($useoffset, $repeat, $nametag)=
25 ($Getopt::Std::opt_O, $Getopt::Std::opt_R, $Getopt::Std::opt_t);
26
27 if ($useoffset && $nametag) {
28 die("$usage\n Error: options -t and -O are exclusive!\n");
29 }
30
31 my %codons=(
32 'AAA'=>'K', 'AAC'=>'N', 'AAG'=>'K', 'AAR'=>'K', 'AAT'=>'N',
33 'AAY'=>'N', 'ACA'=>'T', 'ACB'=>'T', 'ACC'=>'T', 'ACD'=>'T',
34 'ACG'=>'T', 'ACH'=>'T', 'ACK'=>'T', 'ACM'=>'T', 'ACN'=>'T',
35 'ACR'=>'T', 'ACS'=>'T', 'ACT'=>'T', 'ACV'=>'T', 'ACW'=>'T',
36 'ACY'=>'T', 'AGA'=>'R', 'AGC'=>'S', 'AGG'=>'R', 'AGR'=>'R',
37 'AGT'=>'S', 'AGY'=>'S', 'ATA'=>'I', 'ATC'=>'I', 'ATG'=>'M',
38 'ATH'=>'I', 'ATM'=>'I', 'ATT'=>'I', 'ATW'=>'I', 'ATY'=>'I',
39 'CAA'=>'Q', 'CAC'=>'H', 'CAG'=>'Q', 'CAR'=>'Q', 'CAT'=>'H',
40 'CAY'=>'H', 'CCA'=>'P', 'CCB'=>'P', 'CCC'=>'P', 'CCD'=>'P',
41 'CCG'=>'P', 'CCH'=>'P', 'CCK'=>'P', 'CCM'=>'P', 'CCN'=>'P',
42 'CCR'=>'P', 'CCS'=>'P', 'CCT'=>'P', 'CCV'=>'P', 'CCW'=>'P',
43 'CCY'=>'P', 'CGA'=>'R', 'CGB'=>'R', 'CGC'=>'R', 'CGD'=>'R',
44 'CGG'=>'R', 'CGH'=>'R', 'CGK'=>'R', 'CGM'=>'R', 'CGN'=>'R',
45 'CGR'=>'R', 'CGS'=>'R', 'CGT'=>'R', 'CGV'=>'R', 'CGW'=>'R',
46 'CGY'=>'R', 'CTA'=>'L', 'CTB'=>'L', 'CTC'=>'L', 'CTD'=>'L',
47 'CTG'=>'L', 'CTH'=>'L', 'CTK'=>'L', 'CTM'=>'L', 'CTN'=>'L',
48 'CTR'=>'L', 'CTS'=>'L', 'CTT'=>'L', 'CTV'=>'L', 'CTW'=>'L',
49 'CTY'=>'L', 'GAA'=>'E', 'GAC'=>'D', 'GAG'=>'E', 'GAR'=>'E',
50 'GAT'=>'D', 'GAY'=>'D', 'GCA'=>'A', 'GCB'=>'A', 'GCC'=>'A',
51 'GCD'=>'A', 'GCG'=>'A', 'GCH'=>'A', 'GCK'=>'A', 'GCM'=>'A',
52 'GCN'=>'A', 'GCR'=>'A', 'GCS'=>'A', 'GCT'=>'A', 'GCV'=>'A',
53 'GCW'=>'A', 'GCY'=>'A', 'GGA'=>'G', 'GGB'=>'G', 'GGC'=>'G',
54 'GGD'=>'G', 'GGG'=>'G', 'GGH'=>'G', 'GGK'=>'G', 'GGM'=>'G',
55 'GGN'=>'G', 'GGR'=>'G', 'GGS'=>'G', 'GGT'=>'G', 'GGV'=>'G',
56 'GGW'=>'G', 'GGY'=>'G', 'GTA'=>'V', 'GTB'=>'V', 'GTC'=>'V',
57 'GTD'=>'V', 'GTG'=>'V', 'GTH'=>'V', 'GTK'=>'V', 'GTM'=>'V',
58 'GTN'=>'V', 'GTR'=>'V', 'GTS'=>'V', 'GTT'=>'V', 'GTV'=>'V',
59 'GTW'=>'V', 'GTY'=>'V', 'MGA'=>'R', 'MGG'=>'R', 'MGR'=>'R',
60 'NNN'=>'X', 'RAY'=>'B', 'SAR'=>'Z', 'TAA'=>'.', 'TAC'=>'Y',
61 'TAG'=>'.', 'TAR'=>'.', 'TAT'=>'Y', 'TAY'=>'Y', 'TCA'=>'S',
62 'TCB'=>'S', 'TCC'=>'S', 'TCD'=>'S', 'TCG'=>'S', 'TCH'=>'S',
63 'TCK'=>'S', 'TCM'=>'S', 'TCN'=>'S', 'TCR'=>'S', 'TCS'=>'S',
64 'TCT'=>'S', 'TCV'=>'S', 'TCW'=>'S', 'TCY'=>'S', 'TGA'=>'.',
65 'TGC'=>'C', 'TGG'=>'W', 'TGT'=>'C', 'TGY'=>'C', 'TRA'=>'.',
66 'TTA'=>'L', 'TTC'=>'F', 'TTG'=>'L', 'TTR'=>'L', 'TTT'=>'F',
67 'TTY'=>'F', 'XXX'=>'X', 'YTA'=>'L', 'YTG'=>'L', 'YTR'=>'L'
68 );
69
70 #open(F,$seqfile) || die("Error opening $seqfile!\n");
71
72 $/="\n>";
73 while(<>) {
74 chomp;
75 s/^>//;
76 next unless length($_)>0;
77 my ($name, $descr, $seq)=(m/^(\S+)[ \t\x01]*(.*?)\n(.+)/s);
78 die "ERROR 21: Wrong FASTA format: .$_." unless $name && $seq;
79 $seq =~ tr/ \t\n\r//d;
80 my ($ofs)=($descr=~m/^(\d+)/);
81 if ($nametag) {
82 $name.='|'.$nametag;
83 }
84 elsif ($useoffset && defined($ofs)) {
85 $name.='|'.$ofs;
86 }
87 process($name,$seq);
88 }
89
90 sub process {
91 my ($name,$seq)=@_;
92
93 my $len=length($seq);
94 #my $index=0;
95 #$index=translate($name,$seq,$len,$index,1);
96 translate($name,$seq,$len,1);
97 $seq= reverse $seq;
98 $seq =~ tr/AaCcTtGgUuMmRrWwSsYyKkVvHhDdBb/TtGgAaCcAaKkYyWwSsRrMmBbDdHhVv/;
99 #translate($name,$seq,$len,$index,-1);
100 translate($name,$seq,$len);
101 }
102
103 sub translate {
104 my ($name,$seq,$len,$fwd)=@_;
105 my $i=0;
106 my @start;
107
108 $start[0]=0;
109 $start[1]=1;
110 $start[2]=2;
111
112 my @prot;
113 my @aa;
114 my (@numaa, @numX);
115 while($i+3<$len) {
116 for(my $j=0;$j<3;$j++) {
117 my $cod=substr($seq,$i+$j,3);
118 $aa[$j]=$repeat ? $codons{$cod} : $codons{uc($cod)};
119 $aa[$j]='X' unless $aa[$j];
120 $numX[$j]++ if $aa[$j] eq 'X';
121 if($aa[$j] eq '.') {
122 if($i+$j-$start[$j]>=60 && $numaa[$j]-$numX[$j]>6) {
123 print ">$name|";
124 my ($cstart, $cend)=$fwd ?($start[$j]+1, $i+$j):
125 ($len-$start[$j], $len-$i-$j+1);
126 print join('_',$cstart, $cend)."\n";
127 #$index++;
128 #print $prot[$j],"\n";
129 &printfa($prot[$j]);
130 }
131 $start[$j]=$i+$j+3;
132 $prot[$j]="";
133 $numaa[$j]=0;
134 $numX[$j]=0;
135 } #stop found
136 elsif($aa[$j]) {
137 $prot[$j].=$aa[$j];
138 $numaa[$j]++;
139 }
140 } #for $j
141 $i+=3;
142 } #while $i
143 #process the last aa translation (after the last stop found)
144 for(my $j=0;$j<3;$j++) {
145 my $lenp=length($prot[$j]);
146 if($start[$j]<$len && $lenp>=20 && $numaa[$j]-$numX[$j]>6) {
147 print ">$name|";
148 #$index++;
149 my ($cstart, $cend)= $fwd ? ($start[$j]+1, $start[$j]+3*$lenp) :
150 ($len-$start[$j], $len-$start[$j]-3*$lenp+1);
151 $cend=1 if $cend<1;
152 #if ($cstart==389) {
153 # print STDERR join(", ", "len=$len", "fwd=$fwd","start[j]=$start[$j] (j=$j)", "lenp=$lenp")."\n";
154 # }
155 print join('_',$cstart, $cend)."\n";
156 &printfa($prot[$j]);
157 $prot[$j]="";
158 }
159 }
160 #return $index;
161 }
162
163 sub printfa {
164 my $seqlen=length($_[0]);
165 my $pos=0;
166 while ($pos<$seqlen) {
167 print substr($_[0],$pos,60)."\n";
168 $pos+=60;
169 }
170 }

Properties

Name Value
svn:executable *