ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/msatfinder/ird.pl
Revision: 1.4
Committed: Thu Jul 7 13:14:54 2005 UTC (10 years, 10 months ago) by knirirr
Branch: MAIN
CVS Tags: HEAD
Changes since 1.3: +2 -2 lines
Log Message:
Fixed typo in last improvement...

Line File contents
1 #!/usr/bin/perl
2
3 #######################################################################################
4 # a script to make a table of the inter-repeat distances for each genome, #
5 # and calculate which form parts of interrupted msats. This code will #
6 # eventually be incorporated into msatfinder (http://www.bioinf.ceh.ac.uk/msatfinder) #
7 # v. 0.1, M. Thurston (mith@ceh.ac.uk), Thu Jul 7 09:46:37 BST 2005 #
8 #######################################################################################
9 use strict;
10 use Bio::SeqIO;
11 use Getopt::Std;
12
13 #####################
14 # usage information #
15 #####################
16 my $usage="
17 Usage:
18
19 ird.pl [options] -r file
20 e.g.
21
22 ird.pd -g fasta -t fasta -p \$PWD -r repeat_file
23
24 Options:
25
26 -r repeat_file to be read
27 -s <1-3> turn on summary files. 1: Positions of all msats.
28 2: Positions including \"root\" msats.
29 3: Both of the above files
30 -f set flank size (default 300)
31 -g suffix of your genome files (default gbk)
32 -t type of genome file, e.g. fasta (default genbank)
33 -p path to genome files, e.g. /home/user/genomes/ (default = CWD)
34 -i <0,1> generate fasta and tab files (default 1)
35 -a directory to save fasta files to - must already exist (default Fasta)
36 -b directory to save tab files to - must already exist (default Msat_tabs)
37 -h list these options
38
39 The repeat_file used as input is the repeats file output by msatfinder,
40 found in the Repeats/ directory.
41
42 ";
43
44
45 ################
46 # Options here #
47 ################
48 my $summary_file = "msats_pre.csv";
49 my $summary_file2 = "msats_post.csv";
50 my ($summary,$flank,$glob,$format,$path,$generate,$fasta,$tab,$file);
51 my %opts=();
52 getopts('hr:s:f:g:t:p:i:a:b:',\%opts);
53 if (defined $opts{h}) { print $usage; exit; }
54 if ($opts{r}) { $file = $opts{r}; } else { print "No input file specified. Exiting!\n"; exit; }
55 if (defined $opts{s}) { $summary = $opts{s}; } else { $summary = 0; }
56 if (defined $opts{f}) { $flank = $opts{f}; } else { $flank = 300; }
57 if (defined $opts{g}) { $glob = $opts{g}; } else { $glob = "gbk"; }
58 if (defined $opts{t}) { $format = $opts{t}; } else { $format = "genbank"; }
59 if (defined $opts{p}) { $path = $opts{p} . "/"; } else { $path = "./"; }
60 if (defined $opts{i}) { $generate = $opts{i}; } else { $generate = 1; }
61 if (defined $opts{a}) { $fasta = $opts{a}; } else { $fasta = "Fasta"; }
62 if (defined $opts{b}) { $tab = $opts{b}; } else { $tab = "Msat_tabs"; }
63 unless (-e $fasta) { print "Directory $fasta does not exist. Exiting!\n"; exit; }
64 unless (-e $tab) { print "Directory $tab does not exist. Exiting!\n"; exit; }
65
66
67 # create an msat object for each line of info
68 # store objects in a hash, and also relate names
69 # of msats to genomes for printing later.
70 my @genomes = ();
71 my @allmsats = ();
72 my @interrupts = ();
73 my %msats = ();
74 my %linked = ();
75 my %seen = ();
76 my %root = ();
77 my %components = ();
78 open (IN, "<$file") or die "Can't open input file $file: $!";
79 while (my $line = <IN>)
80 {
81 next if ($line =~ /repeat/);
82 my $name = [split(/\|/, $line)]->[0];
83 my @parts = split(/\./, $name);
84 my $genome = @parts[0];
85 push(@genomes, $genome) unless ($seen{$genome}== 1);
86
87 # store names of each msat in each genome
88 # if read from output file, these should go in in order
89 push(@{$msats{$genome}}, $name);
90 $seen{$genome} = 1;
91 }
92 close IN;
93
94 # hashes that store the results lines
95 my %revname = ();
96 my %forname = ();
97 my %revdist = ();
98 my %fordist = ();
99
100 # order msats within the genome,
101 # both forward and reverse
102 foreach my $genome (@genomes)
103 {
104 my @msats = @{$msats{$genome}};
105 my @revmsats = reverse @msats;
106 my $old = "";
107 my $oend = "";
108 foreach my $msat (@msats)
109 {
110 my @parts = split(/\./, $msat);
111 my $loc = $parts[1];
112 my $fp = length($parts[2]) * $parts[3];
113 my $end = $loc + $fp;
114 my $dist = $loc - $oend;
115 $revname{$msat} = $old;
116 $revdist{$msat} = $dist unless ($old eq "");
117 $old = $msat;
118 $oend = $end;
119 }
120 $old = "";
121 my $ostart = "";
122 foreach my $revmsat (@revmsats)
123 {
124 my @parts = split(/\./, $revmsat);
125 my $loc = $parts[1];
126 my $fp = length($parts[2]) * $parts[3];
127 my $end = $loc + $fp;
128 my $dist = $ostart - $end;
129 $forname{$revmsat} = $old;
130 $fordist{$revmsat} = $dist unless ($old eq "");
131 $old = $revmsat;
132 $ostart = $loc;
133 }
134
135 }
136
137 # print the results
138 if ($summary == 1 or $summary == 3)
139 {
140 open (OUT, ">$summary_file") or die "Can't open $summary_file: $!";
141 print OUT "genome,msat,next,dist,last,dist\n";
142 foreach my $genome (@genomes)
143 {
144 foreach my $msat (@{$msats{$genome}})
145 {
146 print OUT "$genome,$msat,$forname{$msat},$fordist{$msat},$revname{$msat},$revdist{$msat}\n";
147 }
148 }
149 close OUT;
150 }
151
152 # determine which are interrupted msats
153 foreach my $genome (@genomes)
154 {
155 my $old = "";
156 foreach my $msat (@{$msats{$genome}})
157 {
158 push (@allmsats, $msat);
159 my @parts = split(/\./, $msat);
160 my $loc = $parts[1];
161 my $ml = length($parts[2]);
162 my $fp = $ml * $parts[3];
163 my $last = $revname{$msat};
164 my $dist = $revdist{$msat};
165 my $ol = length([split(/\./,$old)]->[2]);
166 # fits criteria
167 if ($dist <= $fp and $ml == $ol)
168 {
169 if (defined $root{$old})
170 {
171 $root{$msat} = $root{$old};
172 push(@{$components{$root{$old}}}, $msat);
173 push (@interrupts, $msat);
174 }
175 else
176 {
177 $root{$msat} = $old;
178 $root{$old} = $old;
179 push(@{$components{$old}}, $msat);
180 push(@{$components{$old}}, $old);
181 push (@interrupts, $msat);
182 push (@interrupts, $old);
183 }
184 }
185 $old = $msat;
186 }
187 }
188
189 # create separate array of non-interrupted msats,
190 # and interrupted ones
191 my %intseen = ();
192 my @noninterrupts = ();
193 foreach my $item (@interrupts) { $intseen{$item} = 1; }
194 foreach my $item (@allmsats)
195 {
196 unless ($intseen{$item})
197 {
198 push(@noninterrupts, $item);
199 }
200 }
201
202 # go through array of interrupted msats, and combine each
203 # set into one single msat
204 my %longroot = ();
205 my @joined = ();
206 foreach my $msat (@interrupts)
207 {
208 if (defined $components{$msat})
209 {
210 my @reorder = sort {[split(/\./,$a)]->[1] <=> [split(/\./,$b)]->[1] } @{$components{$msat}};
211 my $start = "";
212 my $stop = "";
213 my @locations=();
214 my @motifs=();
215 my $genome;
216 my $units;
217 my $oldloc = 0;
218 my $oldfp = 0;
219 foreach my $comp (@reorder)
220 {
221 my @parts = split(/\./, $comp);
222 my $loc = $parts[1];
223 my $motif = $parts[2];
224 my $unit = $parts[3];
225 $genome = $parts[0];
226 push (@locations, $loc);
227 push (@motifs, $motif);
228 $units += $unit;
229 $units += int(($loc - ($oldloc + $oldfp))/length($motif)) if ($oldloc > 0);
230 $oldloc = $loc;
231 $oldfp = length($motif) * $unit;
232 }
233 # create the new msat name
234 my $newname = $genome . "." . $locations[0] . "." . join("-", @motifs) . "." . $units;
235 push (@joined, $newname);
236 $longroot{$msat} = $newname;
237 }
238 }
239
240 # place all msats in an array corresponding to the genome they
241 # are in, both normal and interrupted
242 my %finalmsats = ();
243 foreach my $msat (@noninterrupts)
244 {
245 my $genome = [(split/\./, $msat)]->[0];
246 push(@{$finalmsats{$genome}}, $msat);
247 }
248 foreach my $msat (@joined)
249 {
250 my $genome = [(split/\./, $msat)]->[0];
251 push(@{$finalmsats{$genome}}, $msat);
252 }
253
254 # print the names of each msat
255 if ($summary == 2 or $summary == 3)
256 {
257 open (OUT, ">$summary_file2") or die "Can't open $summary_file2: $!";
258 foreach my $msat (@noninterrupts)
259 {
260 print OUT [split(/\./, $msat)]->[0],",$msat\n";
261 }
262 foreach my $msat (@joined)
263 {
264 print OUT [split(/\./, $msat)]->[0],",$msat\n";
265 }
266 close OUT;
267 }
268
269 # generate all the fasta and FT files
270 # I've copied and pasted code due to being lazy; a later
271 # version will clean this up, amongst other things
272 if ($generate == 1)
273 {
274 my ($tabtext,$tabfile);
275 foreach my $genome (@genomes)
276 {
277 print "Scanning genome $genome\n";
278 $tabtext = "";
279 $tabfile = "$tab/$genome.msat_tab";
280 open (TAB, ">$tabfile") or die "Can't open $tabfile: $!";
281 my $infile = Bio::SeqIO->new(-file => "$path$genome.$glob",
282 -format => "$format");
283 while (my $seq = $infile->next_seq())
284 {
285 my $sequence = $seq->seq();
286 foreach my $msat (@{$finalmsats{$genome}})
287 {
288 my @parts = split(/\./, $msat);
289 my $loc = $parts[1];
290 my $motif = $parts[2];
291 my $submotif = [split(/-/,$motif)]->[0];
292 my $unit = $parts[3];
293 my $fp = length($submotif) * $unit;
294 my $end = $loc + $fp;
295 my $fastafile = "$fasta/$msat.fasta";
296 my $subseq = substr($sequence, ($loc - $flank), (2*$flank + $fp));
297 # print fasta file
298 print "Printing fasta file ($fastafile) for msat $msat\n";
299 open (FASTA, ">$fastafile") or die "Can't open $fastafile: $!";
300 print FASTA ">$msat\n$subseq";
301 close FASTA;
302 #generate tab file info
303 print "Printing msat $msat to tab file\n";
304 print TAB <<EOF;
305 FT repeat_region $loc..$end
306 FT /label={$motif}$unit
307 FT /note="($motif)$unit"
308 EOF
309 }
310 }
311 close TAB;
312 }
313 }
314
315 print "Finished!\n";
316
317 __END__