ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/msatfinder/prepimp.pl
Revision: 1.1
Committed: Tue Feb 13 16:23:12 2007 UTC (9 years, 5 months ago) by confused
Branch: MAIN
CVS Tags: HEAD
Log Message:
Deals with non-standard nucleotide letters, necessary for using Markov chain analysis

Line File contents
1 #! /usr/bin/perl -w
2 use strict;
3
4 #removes dodgy nucleotide symbols from a fasta file
5
6 unless (@ARGV)
7 {
8 print "You need to enter the name of the file you wish to examine\n";
9 exit;
10 }
11
12 my $type = $ARGV[0];
13
14 my $in = $ARGV[1];
15
16 my $out;
17 if ($ARGV[2])
18 {
19 $out = $ARGV[2];
20 }
21 else
22 {
23 $in =~ m/([\w-]*)(\.\w*)$/;
24 $out = "$1-treat$2";
25 }
26 my $index=0;
27
28 my @R = ('G','A'); #puRine
29 my @Y = ('T','C'); #pYrimidine
30 my @K = ('G','T'); #Ketone
31 my @M = ('A','C'); #aMino group
32 my @S = ('G','C'); #Strong interaction
33 my @W = ('A','T'); #Weak interaction
34 my @B = ('G','T','C'); #B comes after A
35 my @D = ('G','A','T'); #D comes after C
36 my @H = ('A','C','T'); #H comes after G
37 my @V = ('G','C','A'); #V comes after U
38 my @N = ('A','G','C','T'); #aNy
39
40 #amino acids
41 my @Z = ('E','Q');
42 my @X = ('A','R','N','D','C','E','Q','G','H','I','L','K','M','F','P','S','T','W','Y','Z');
43
44 my %all;
45
46 if ($type eq "-n")
47 {
48 %all = ( 'R' => \@R,
49 'Y' => \@Y,
50 'K' => \@K,
51 'M' => \@M,
52 'S' => \@S,
53 'W' => \@W,
54 'B' => \@B,
55 'D' => \@D,
56 'H' => \@H,
57 'V' => \@V,
58 'N' => \@N
59 );
60 }
61 else
62 {
63 @B = ('N','D');
64 %all = ('B' => \@B,
65 'Z' => \@Z,
66 'X' => \@X
67 );
68 }
69 my %other;
70 open(IN, "$in");
71 open(OUT, ">$out");
72 while (<IN>)
73 {
74 my $line=$_;
75 if ($line !~ />/)
76 {
77 if ($type eq "-n")
78 {
79 $line =~ s/([^atgcATGC\s])/random($1)/ge;
80 }
81 else
82 {
83 $line =~ s/([^arndceqghilkmfpstwyvARNDCEQGHILKMFPSTWYV\s])/random($1)/ge;
84 }
85 }
86 print OUT $line;
87 }
88 close IN;
89 close OUT;
90 print "Total number of non-standard characters : $index\n";
91 if ($index > 0 && $type eq "-n")
92 {
93 print "\nThe following table shows the non-ATGC characters found followed by the number of each such characters found.\nThe final four columns deal with how these characters were randomly replaced with either A,T,G or C.\n\n";
94 print "\t\tA\tT\tG\tC\n\n";
95 while (my ($key, $value) = each %other)
96 {
97 my $string="$key\t";
98 my %value=%{$value};
99 $string.=$value{value}."\t";
100 delete $value{value};
101 my @atgc = ("A","T","G","C");
102 for (my $i=0;$i<scalar @atgc;$i++)
103 {
104 if ($value{$atgc[$i]})
105 {
106 $string .= "$value{$atgc[$i]}\t";
107 }
108 else
109 {
110 $string .= "\t";
111 }
112 }
113 print "$string\n";
114 }
115 }
116 if ($index > 0 && $type eq "-p")
117 {
118 print "\nThe following table shows the non-standard amino acid characters found followed by the number of each such characters found.\nThe final four columns deal with how these characters were randomly replaced with one of the 20 amino acids.\n\n";
119 print "\t\tA\tR\tN\tD\tC\tE\tQ\tG\tH\tI\tL\tK\tM\tF\tP\tS\tT\tW\tY\tZ\n\n";
120 while (my ($key, $value) = each %other)
121 {
122 my $string="$key\t";
123 my %value=%{$value};
124 $string.=$value{value}."\t";
125 delete $value{value};
126 my @atgc = ("A","R","N","D","C","E","Q","G","H","I","L","K","M","F","P","S","T","W","Y","Z");
127 for (my $i=0;$i<scalar @atgc;$i++)
128 {
129 if ($value{$atgc[$i]})
130 {
131 $string .= "$value{$atgc[$i]}\t";
132 }
133 else
134 {
135 $string .= "\t";
136 }
137 }
138 print "$string\n";
139 }
140 }
141 sub random
142 {
143 $index++;
144 my $in = $_[0];
145 $in = uc($in);
146
147 #store counts of non-ATGC characters
148 if ($other{$in})
149 {
150 $other{$in}{value}++;
151 }
152 else
153 {
154 $other{$in}{value}=1;
155 }
156
157 if ($all{$in})
158 {
159 my @array = @{$all{$in}};
160 my $index=rand @array;
161
162 #the dodgy character, and what it got replaced with
163 #print "$in => $array[$index]\n";
164 if ($other{$in}{$array[$index]})
165 {
166 $other{$in}{$array[$index]}++;
167 }
168 else
169 {
170 $other{$in}{$array[$index]}=1;
171 }
172 return $array[$index];
173 }
174 else
175 {
176 print "Character not recognised : $in\n";
177 }
178 }