ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/orf_check.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 5512 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q{Usage:
7 orf_check.pl [-P] [-a <min_aa_len>] <gffread_mrnas.fasta>
8 Use -N option to not check for ORFs, but just print the output
9 without the last column.
10 };
11 umask 0002;
12 getopts('Pa:o:') || die($usage."\n");
13 my $outfile=$Getopt::Std::opt_o;
14 my $minaa=$Getopt::Std::opt_a || 30;
15 my $justPrint=$Getopt::Std::opt_P;
16 my $minbases=$minaa*3;
17 if ($outfile) {
18 open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
19 select(OUTF);
20 }
21 #-- globals
22 my %codons=(
23 'AAA'=>'K', 'AAC'=>'N', 'AAG'=>'K', 'AAR'=>'K', 'AAT'=>'N',
24 'AAY'=>'N', 'ACA'=>'T', 'ACB'=>'T', 'ACC'=>'T', 'ACD'=>'T',
25 'ACG'=>'T', 'ACH'=>'T', 'ACK'=>'T', 'ACM'=>'T', 'ACN'=>'T',
26 'ACR'=>'T', 'ACS'=>'T', 'ACT'=>'T', 'ACV'=>'T', 'ACW'=>'T',
27 'ACY'=>'T', 'AGA'=>'R', 'AGC'=>'S', 'AGG'=>'R', 'AGR'=>'R',
28 'AGT'=>'S', 'AGY'=>'S', 'ATA'=>'I', 'ATC'=>'I', 'ATG'=>'M',
29 'ATH'=>'I', 'ATM'=>'I', 'ATT'=>'I', 'ATW'=>'I', 'ATY'=>'I',
30 'CAA'=>'Q', 'CAC'=>'H', 'CAG'=>'Q', 'CAR'=>'Q', 'CAT'=>'H',
31 'CAY'=>'H', 'CCA'=>'P', 'CCB'=>'P', 'CCC'=>'P', 'CCD'=>'P',
32 'CCG'=>'P', 'CCH'=>'P', 'CCK'=>'P', 'CCM'=>'P', 'CCN'=>'P',
33 'CCR'=>'P', 'CCS'=>'P', 'CCT'=>'P', 'CCV'=>'P', 'CCW'=>'P',
34 'CCY'=>'P', 'CGA'=>'R', 'CGB'=>'R', 'CGC'=>'R', 'CGD'=>'R',
35 'CGG'=>'R', 'CGH'=>'R', 'CGK'=>'R', 'CGM'=>'R', 'CGN'=>'R',
36 'CGR'=>'R', 'CGS'=>'R', 'CGT'=>'R', 'CGV'=>'R', 'CGW'=>'R',
37 'CGY'=>'R', 'CTA'=>'L', 'CTB'=>'L', 'CTC'=>'L', 'CTD'=>'L',
38 'CTG'=>'L', 'CTH'=>'L', 'CTK'=>'L', 'CTM'=>'L', 'CTN'=>'L',
39 'CTR'=>'L', 'CTS'=>'L', 'CTT'=>'L', 'CTV'=>'L', 'CTW'=>'L',
40 'CTY'=>'L', 'GAA'=>'E', 'GAC'=>'D', 'GAG'=>'E', 'GAR'=>'E',
41 'GAT'=>'D', 'GAY'=>'D', 'GCA'=>'A', 'GCB'=>'A', 'GCC'=>'A',
42 'GCD'=>'A', 'GCG'=>'A', 'GCH'=>'A', 'GCK'=>'A', 'GCM'=>'A',
43 'GCN'=>'A', 'GCR'=>'A', 'GCS'=>'A', 'GCT'=>'A', 'GCV'=>'A',
44 'GCW'=>'A', 'GCY'=>'A', 'GGA'=>'G', 'GGB'=>'G', 'GGC'=>'G',
45 'GGD'=>'G', 'GGG'=>'G', 'GGH'=>'G', 'GGK'=>'G', 'GGM'=>'G',
46 'GGN'=>'G', 'GGR'=>'G', 'GGS'=>'G', 'GGT'=>'G', 'GGV'=>'G',
47 'GGW'=>'G', 'GGY'=>'G', 'GTA'=>'V', 'GTB'=>'V', 'GTC'=>'V',
48 'GTD'=>'V', 'GTG'=>'V', 'GTH'=>'V', 'GTK'=>'V', 'GTM'=>'V',
49 'GTN'=>'V', 'GTR'=>'V', 'GTS'=>'V', 'GTT'=>'V', 'GTV'=>'V',
50 'GTW'=>'V', 'GTY'=>'V', 'MGA'=>'R', 'MGG'=>'R', 'MGR'=>'R',
51 'NNN'=>'X', 'RAY'=>'B', 'SAR'=>'Z', 'TAA'=>'.', 'TAC'=>'Y',
52 'TAG'=>'.', 'TAR'=>'.', 'TAT'=>'Y', 'TAY'=>'Y', 'TCA'=>'S',
53 'TCB'=>'S', 'TCC'=>'S', 'TCD'=>'S', 'TCG'=>'S', 'TCH'=>'S',
54 'TCK'=>'S', 'TCM'=>'S', 'TCN'=>'S', 'TCR'=>'S', 'TCS'=>'S',
55 'TCT'=>'S', 'TCV'=>'S', 'TCW'=>'S', 'TCY'=>'S', 'TGA'=>'.',
56 'TGC'=>'C', 'TGG'=>'W', 'TGT'=>'C', 'TGY'=>'C', 'TRA'=>'.',
57 'TTA'=>'L', 'TTC'=>'F', 'TTG'=>'L', 'TTR'=>'L', 'TTT'=>'F',
58 'TTY'=>'F', 'XXX'=>'X', 'YTA'=>'L', 'YTG'=>'L', 'YTR'=>'L'
59 );
60
61 # --
62 my ($t, $xloc, $chr, $strand, @exons, @segs, $seq);
63 while (<>) {
64 chomp;
65 next unless $_;
66 if (m/^>(\S+)/) { #defline
67 my ($id, $l)=($1,$_);
68 process_seq();
69 $t=$id;
70 ($xloc)=($l=~m/\bgene=(\w+)/);
71 my ($loc)=($l=~m/\bloc:(\S+)/);
72 if ($loc) {
73 ($chr,$strand)=($loc=~m/(\S+)\|\d+\-\d+\|([\-\+])$/);
74 }
75 my ($ex)=($l=~m/\bexons:([\d\-\,]+)/);
76 if ($ex) {
77 @exons= map { [(split(/\-/))] } (split(/\,/,$ex));
78 }
79 my ($sg)=($l=~m/\bsegs:([\d\-\,]+)/);
80 if ($sg) {
81 @segs= map { [(split(/\-/))] } (split(/\,/,$sg));
82 }
83 next;
84 }
85 #sequence
86 $seq.=$_;
87 }
88
89 process_seq();
90
91 # --
92 if ($outfile) {
93 select(STDOUT);
94 close(OUTF);
95 }
96
97 #************ Subroutines **************
98 sub process_seq {
99 return unless $seq;
100 #print $t.' '.$chr.$strand.' '.join(',', @x)."\n";
101 $seq=uc($seq);
102 my @orfs; #list of [start_on_mrna, orflen, start_on_genome, stop_on_genome]
103 my $pstart=0;
104 my $i=-1;
105 my ($lasti, @lastend);
106 goto SKIPORF if ($justPrint);
107 while (($i=index($seq,'ATG',$pstart))>=0) {
108 last if $i+3>length($seq)-$minaa*3;
109 if ($i<$lastend[ $i % 3 ]) {
110 $pstart=$i+3;
111 next;
112 }
113 my $aa=trFrom($seq,$i);
114 my $blen=length($aa)*3;
115 if ($blen>$minbases) {
116 $lasti=$i;
117 $lastend[ $i % 3 ]=$i+$blen;
118 if ($aa=~m/>trunc$/) { #just discard those without stop codons, or with no 3'UTR
119 $pstart=$i+3;
120 next;
121 }
122 my ($gstart, $gstop)=(pos2genome($i+1), pos2genome($i+$blen));
123 push(@orfs, [$i+1,$blen, $gstart, $gstop]);
124 #print " [====> orf aa: (ntlen=$blen, at ".($i+1)."..".($i+$blen).")\n $aa\n";
125 }
126 $pstart=$i+3;
127 }
128 SKIPORF:
129 if (@orfs>0 || $justPrint) {
130 my @o;
131 if (@orfs>0) {
132 @orfs=sort { $main::b->[1]<=>$main::a->[1] } @orfs;
133 @o=map { $_->[0].':'.$_->[1] } @orfs;
134 }
135 my @x=map { $_->[0].'-'.$_->[1] } @exons;
136 print join("\t", $t,$xloc,$chr.$strand, 'exons='.join(',', @x),'orfs='.join(',',@o))."\n";
137 #print " >> max orf has ntlen=$orfs[0]->[1] at $orfs[0]->[0]\n";
138 }
139 # - end of record processing: clear globals
140 ($t, $xloc, $chr, $strand, $seq)=();
141 @exons=();
142 @segs=();
143 }
144
145 sub trFrom { #translate seq from offset, returns translation until stop codon
146 # ($seq, $ofs) given
147 my $ofs=$_[1];
148 my $s=substr($_[0], $ofs);
149 my @cods = unpack('(A3)*',$s);
150 my $r;
151 my $i=1;
152 foreach my $c (@cods) {
153 if (length($c)<3) { # premature ending, this is not a valid ORF
154 return $r.'>trunc'; #doesn't have a stop codon
155 }
156 my $aa=$codons{$c} || 'X';
157 last if $aa eq '.';
158 $r.=$aa;
159 $i++;
160 if ($i==@cods) {
161 return $r.'>trunc';
162 }
163 }
164 return $r;
165 }
166
167 # map position of a base from transcript to genome
168 # requires exon coordinates and strand info
169 sub pos2genome {
170 if($strand eq '-') {
171
172 }
173 # -- positive strand
174
175 return 0;
176 }

Properties

Name Value
svn:executable *