ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/msatfinder/msatfinder
Revision: 1.25
Committed: Tue Feb 13 16:21:44 2007 UTC (9 years, 7 months ago) by confused
Branch: MAIN
CVS Tags: HEAD
Changes since 1.24: +868 -271 lines
Log Message:
New version of msatfinder, new engines

Line File contents
1 #! /usr/bin/perl
2
3 # Copyright 2005 Milo Thurston (mith@ceh.ac.uk) and Dawn Field (dfield@ceh.ac.uk)
4
5 ######################################################
6 # A reasonably simple application to search for all #
7 # mono-hexanucleotide repeats in Genbank or #
8 # fasta formatted files, and provide detailed output #
9 ######################################################
10
11 #############################################################################
12 # This program is free software; you can redistribute it and/or modify #
13 # it under the terms of the GNU General Public License as published by #
14 # the Free Software Foundation; either version 2 of the License, or #
15 # (at your option) any later version. #
16 # #
17 # This program is distributed in the hope that it will be useful, #
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of #
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
20 # GNU General Public License for more details. #
21 # #
22 # You should have received a copy of the GNU General Public License #
23 # along with this program; if not, write to the Free Software #
24 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA #
25 #############################################################################
26
27 use strict;
28 use CGI;
29 use Cwd;
30 use Bio::Location::Split;
31 use Bio::SeqIO;
32 use Config::Simple;
33 use File::Copy;
34 use Getopt::Std;
35 use List::Util qw(min max);
36 use Term::ReadLine;
37
38 my $version = "2.0.9";
39
40
41 #####################
42 # usage information #
43 #####################
44 my $usage="
45 Usage:
46
47 msatfinder [options] files
48 msatfinder -b *.gbk
49 msatfinder -b -l examples.list
50
51
52 Options:
53
54 -b backup your old data directories (adds date & time suffix)
55 -d delete most recent data directories, don't search for microsatellites
56 -e <1-6> engine to use - see the manual (default 1)
57 -f set flank size, overriding config file (default 300)
58 -h list these options
59 -i check for interrupted msats (not checked by default)
60 -l <list_file> read a list of genomes from a text file
61 -r yes/no - whether or not to randomise sequence prior to counting msats
62 -s silence most output to screen (overrides config file)
63 -x delete all data directories, don't search for microsatellites
64
65 By default, msatfinder will take the list of files given on the command
66 line as the files to read. Instead you may use the -l option, and tell
67 it to read a list of the genomes you would like to be processed.
68
69 Use options \"-d\" and \"-x\" with caution.
70
71 ";
72
73
74 ################
75 # idendify cwd #
76 ################
77 my $cwd = getcwd;
78 #######################
79 # date related things #
80 #######################
81 my $date = `date`;
82 chomp($date);
83 my %dtrans = ( "JAN" => "01",
84 "FEB" => "02",
85 "MAR" => "03",
86 "APR" => "04",
87 "MAY" => "05",
88 "JUN" => "06",
89 "JUL" => "07",
90 "AUG" => "08",
91 "SEP" => "09",
92 "OCT" => "10",
93 "NOV" => "11",
94 "DEC" => "12" );
95 my $datestring = `date +%j%H%M%S`;
96 chomp($datestring);
97
98 ####################################
99 # import settings from config file #
100 ####################################
101 my $config = &getconfig($cwd,@ARGV);
102
103 ############################
104 # various global variables #
105 ############################
106 my $override = $config->{'FINDER.override'};
107 my $artemis = $config->{'FINDER.artemis'};
108 my $mine = $config->{'FINDER.mine'};
109 my $fastafile = $config->{'FINDER.fastafile'};
110 my $sumswitch = $config->{'FINDER.sumswitch'};
111 my $screendump = $config->{'FINDER.screendump'};
112 my $run_eprimer = $config->{'DEPENDENCIES.run_eprimer'};
113 my $eprimer_args = $config->{'DEPENDENCIES.eprimer_args'};
114 my $debug = $config->{'COMMON.debug'};
115 my $flank_size = $config->{'COMMON.flank_size'};
116 my $remote_link = $config->{'FINDER.remote_link'};
117 my $primer3core = $config->{'DEPENDENCIES.primer3core'};
118 my $eprimer = $config->{'DEPENDENCIES.eprimer'};
119 my $dbname = $config->{'VIEWER.dbname'};
120 my $dbtype = $config->{'VIEWER.dbtype'};
121 my $dbusername = $config->{'VIEWER.gdbusername'};
122 my $dbusername2 = $config->{'VIEWER.sdbusername'};
123 my $cgiloc = $config->{'VIEWER.cgiloc'};
124 my $msatview = $config->{'VIEWER.msatview'};
125 my $run_mview = $config->{'ALIGNER.run_mview'};
126
127 my $printerror = 0; # note if an error must be printed to the error file
128 my $runird = 0;
129
130 my $file_format = "vertical";
131 #$config->{'CONTACT.format'};
132
133 my $dlm="\|"; #delimiter
134 my $gff;
135 #if ($file_format eq "vertical")
136 #{
137 # $dlm = "\|";
138 #}
139 #elsif ($file_format eq "tab")
140 #{
141 # $dlm = "\t";
142 #}
143 #elsif ($file_format eq "gff")
144 #{
145 # $dlm = "\|";
146 # $gff=1;
147 #}
148 $gff=1;
149 my $gff_index=0;
150
151 my $non_divisible = 0; #in the case of sputnik, the length of the msat is not equal to motif_length x repeat_number. Last number in msat description is not repeat number, but stop position. Also, sputnik doesn't provide a motif.
152 my $msat_dir = "/usr/local/bioinf/msatfinder/msatfinder/";
153
154 ########################################
155 # subdirectories created by msatfinder #
156 ########################################
157 my $mine_dir = $config->{'COMMON.mine_dir'};
158 my $repeat_dir = $config->{'COMMON.repeat_dir'};
159 my $tab_dir = $config->{'COMMON.tab_dir'};
160 my $bigtab_dir = $config->{'COMMON.bigtab_dir'};
161 my $fasta_dir = $config->{'COMMON.fasta_dir'};
162 my $prime_dir = $config->{'COMMON.prime_dir'};
163 my $count_dir = $config->{'COMMON.count_dir'};
164 my $gff_dir = $config->{'COMMON.gff_dir'};
165 my @newdirs = ($mine_dir,$repeat_dir,$tab_dir,$bigtab_dir,$fasta_dir,$prime_dir,$count_dir,$gff_dir);
166
167 ##############################################
168 # determine which options have been selected #
169 ##############################################
170 my %opts=();
171 getopts('abcdxhilp:rt:e:f:m:sg:',\%opts);
172 # print help message
173 if (defined $opts{h})
174 {
175 print $usage;
176 exit;
177 }
178 if (defined $opts{s})
179 {
180 $screendump = 0;
181 }
182 # delete files
183 if (defined $opts{d})
184 {
185 print "Deleting existing data.\n";
186 foreach my $dir (@newdirs)
187 {
188 $dir =~ s/\///;
189 system("rm -rf $dir") == 0 or warn "Having trouble deleting directory $dir, $!";
190 }
191 print "Exiting.\n";
192 exit;
193 }
194 # delete ALL files
195 if (defined $opts{x})
196 {
197 print "Deleting ALL old data.\n";
198 foreach my $dir (@newdirs)
199 {
200 $dir =~ s/\///;
201 system("rm -rf $dir*") == 0 or warn "Having trouble deleting directory $dir, $!";
202 }
203 print "Exiting.\n";
204 exit;
205
206 }
207 # backup files
208 if (defined $opts{b})
209 {
210 print "Backing up date files with suffix $datestring\n";
211 foreach my $move (@newdirs)
212 {
213 $move =~ s/\///;
214 system("cp -r $cwd/$move $cwd/$move.$datestring") == 0 or warn "Can't copy $move, $!";
215 }
216
217 }
218 # check for interrupts
219 if (defined $opts{i})
220 {
221 $runird = 1;
222 }
223 # randomise sequence?
224 my $random;
225 my $random_times;
226 if (defined $opts{r})
227 {
228 $random = 1;
229 $random_times = $config->{'CONTACT.random'};
230 }
231 my $pimp=0;
232 if (defined $opts{p})
233 {
234 $pimp = $opts{p};
235 }
236 my $email;
237 if (defined $opts{g})
238 {
239 $email = $opts{g};
240 }
241 my $threshold_method = "manual";
242 my $polyuse = $config->{'CONTACT.polyuse'};
243 if ($polyuse = 'yes')
244 {
245 $threshold_method = "poly";
246 }
247 if ($pimp == 2)
248 {
249 $threshold_method = "markov";
250 }
251
252 my $fasta_file;
253 # which engine to use
254 my $engine;
255 if (defined $opts{e})
256 {
257 $engine = $opts{e};
258 }
259 else
260 {
261 $engine = 1;
262 }
263 if ($engine == 1) { print "Using regexp engine.\n" if ($screendump == 1); }
264 elsif ($engine == 2) { print "Using multipass engine.\n" if ($screendump == 1); }
265 elsif ($engine == 3) { print "Using iterative engine.\n" if ($screendump == 1); }
266 elsif ($engine == 4) { print "Using sputnik engine.\n" if ($screendump == 1); }
267 elsif ($engine == 5) { print "Using dust engine.\n" if ($screendump == 1); }
268 elsif ($engine == 6) { print "Using seg engine.\n" if ($screendump == 1); }
269 else { die "Somehow, no engine has been defined!\n"; }
270 print "Current directory: $cwd\n" if ($screendump == 1);
271
272 # what are the flanks?
273 if (defined $opts{f})
274 {
275 if ($opts{f} eq "")
276 {
277 $flank_size = 300;
278 }
279 else
280 {
281 $flank_size = $opts{f};
282 }
283 print "Overriding flank size in config file - using value: $flank_size\n" if ($screendump == 1);
284 }
285
286 #####################################
287 # set up motif range and thresholds #
288 #####################################
289 my $threshold = $config->{'FINDER.motif_threshold'};
290 $threshold =~ s/^\|//;
291 $threshold =~ s/\|$//;
292 my @thresholds;
293 my @motif_range;
294
295 # load motif ranges into array
296 my @motif_range_pre = split(/\|/, $threshold);
297
298 # remove any duplicates
299 my %seen = ();
300 my @threshes=();
301 foreach my $blah (@motif_range_pre)
302 {
303 my ($mot,$thresh) = split(/,/,$blah);
304 $thresholds[$mot-1] = $thresh;
305 push(@threshes,$thresh);
306 $seen{$mot}++;
307 }
308 my @uniq = keys %seen;
309 @motif_range = sort { $a <=> $b } @uniq;
310 print "Using motif types: ", join(",",@motif_range), "\n" if ($screendump == 1);
311 die "You must specify motif types to search for.\nExiting!\n" unless (@motif_range);
312
313 # check that there are thresholds appropriate to
314 # each motif type...
315 my $unspec = 0;
316 my $typeerror = 0;
317 foreach my $type (@motif_range)
318 {
319 $unspec++ unless ($thresholds[$type-1] =~ /^\d+$/ and $thresholds[$type-1] > 0);
320 $typeerror++ unless ($type =~ /^\d+$/);
321 }
322 # dodgy thresholds
323 if ($typeerror >= 1)
324 {
325 if ($typeerror == 1)
326 {
327 print "Error - you have entered a motif type incorrectly: \n";
328 }
329 else
330 {
331 print "Error - some motif types entered incorrectly:\n";
332 }
333 foreach my $type (@motif_range)
334 {
335 print "* $type\n" unless ($type =~ /^\d+$/);
336 }
337 print "Please edit the .rc file and try again. Exiting!\n";
338 exit;
339 }
340 # oops, at least one threshold undefined
341 if ($unspec >= 1)
342 {
343 if ($unspec == 1)
344 {
345 print "Error - a threshold is not defined correctly:\n";
346 }
347 else
348 {
349 print "Error - some thresholds are not specified. You have supplied the following:\n";
350 }
351 print "motif\tthreshold\n";
352 foreach my $type (@motif_range)
353 {
354 my $defstring;
355 if (@thresholds[$type-1] > 0)
356 {
357 $defstring = @thresholds[$type-1];
358 }
359 else
360 {
361 $defstring = "MISSING";
362 }
363 print "$type:\t$defstring\n";
364 }
365 print "Please edit the .rc file and try again. Exiting!\n";
366 exit;
367 }
368
369 ###############################################################
370 # open a file containing a list of the genomes to be searched #
371 ###############################################################
372 my @infiles = ();
373 my @nreg = ();
374 my $list_file;
375 if ($opts{l})
376 {
377 if (@ARGV)
378 {
379 chomp($list_file = shift);
380 }
381 unless (defined $list_file)
382 {
383 print "You need to specify the name of a list file to process.\n";
384 my @possibles = glob "*\.list";
385 if (@possibles)
386 {
387 print "Please select one of the following files:\n";
388 foreach (@possibles)
389 {
390 print " - $_\n";
391 }
392 print "Type one of the above filenames (use the up arrow to select),\n or type in another.\n";
393
394 # get the user input
395 my $term = new Term::ReadLine 'choose file';
396 my $prompt = "Select an file: ";
397 foreach (@possibles) { $term->addhistory($_); }
398 $term->ornaments('0');
399 $list_file = $term->readline($prompt);
400 }
401 else
402 {
403 print "I couln't find any list files in $cwd. Please choose a filename, or select X to exit.\n";
404
405 # get the user input
406 my $term = new Term::ReadLine 'choose file';
407 my $prompt = "Select an file: ";
408 $term->ornaments('0');
409 $list_file = $term->readline($prompt);
410 exit if ($list_file =~ /^[Xx]/);
411 }
412 }
413
414
415 # actually open the file at last...
416 open (LISTFILE, "$list_file") or die "Can't open the file \"$list_file\": $!\nPlease check you have a valid list file, and try again.";
417 while (my $infile = <LISTFILE>)
418 {
419 chomp $infile;
420 push (@infiles, $infile);
421 }
422 close LISTFILE;
423 }
424 else
425 {
426 @infiles = @ARGV;
427 }
428
429
430 # exit if no infiles
431 unless (@infiles)
432 {
433 print "You don't seem to have specified any files to process...\n";
434 print $usage;
435 exit;
436 }
437
438
439 # apply the override
440 if ($override == 1)
441 {
442 $artemis = 0;
443 $mine = 0;
444 $fastafile = 0;
445 $sumswitch = 0;
446 $screendump = 0;
447 $run_eprimer = 0;
448 }
449
450 # check the deps
451 my @deps = qw(primer3_core eprimer3);
452 my %find = ();
453 $find{primer3_core} = $primer3core;
454 $find{eprimer3} = $eprimer;
455 if ($run_eprimer == 1)
456 {
457 &depcheck(\%find,$screendump,@deps);
458 }
459
460 # die if you have more things to look for than thresholds
461 if (scalar @motif_range > scalar @thresholds)
462 {
463 die "You seem to have more types of msat to look for than you have thresholds! Please edit the config file and try again.\nExiting!\n";
464 }
465
466 # precalculate some stuff for findmotifs_regex
467 my $minrep = &min(@threshes);
468 my $maxmotiflen = scalar(@thresholds);
469
470
471 #Consistency check - see findmotifs_regex
472 if( $minrep < 3 and $engine == 1) { print "Can't cope with finding less than three repeats of patterns! (\$minrep is $minrep) whilst using the regex engine.\n"; exit; };
473 if( $thresholds[0] =~ /\d+/ and $thresholds[0] < scalar(@thresholds) and $engine == 1) { print "Threshold for monos too small - cannot resolve ambiguities.\n"; exit; };
474
475 # this makes sure that your directories exist
476 # before trying to write anything to them
477 my $dir;
478 foreach $dir (@newdirs)
479 {
480 $dir =~ s/\///;
481 if (-e "$cwd/$dir") { unlink glob "$dir/*"; }
482 unless (-e "$cwd/$dir") { mkdir "$cwd/$dir" or warn "Can't create $dir"; }
483 }
484
485 #################################################################
486 # an explication of main variables used throughout this script, #
487 # except for all the ones I haven't bothered to list here. #
488 #################################################################
489 my $aat; # %A divided by %AT
490 my $abs_path; # path to $repeat_dir (same as $cwd . $repeat_dir)
491 my $allfile; # one of the NC_xxx.xxx numbers in @all_files
492 my $all_files2; # prepares a list of files w/ and w/o repeats
493 my $alphabet; # DNA, RNA or protein?
494 my $analysis_type; # how the .db files were prepared
495 my $ATgenome; # AT% in the entire genome
496 my $blaster; # the entire repeat plus flanking sequence
497 my $gffblaster; # copy for use in gff
498 my $cdsnum; # number of CDS regions found per genome
499 my $cgc; # %C divided by %CG
500 my $CGgen_flankseq; # GC content of flank
501 my $CGgen_repeat; # GC content of repeat
502 my $CGgenome; # % CG in the complete genome
503 my $CGmatch; # see the previous CG variable
504 my $checklength; #
505 my $checkright; # Determine actual size of
506 my $checkright_len; # flanks, and the position
507 my $checkright_len_pc; # of the repeat in the
508 my $checkleft; # genome (dist from L & R).
509 my $checkleft_len; #
510 my $checkleft_len_pc; #
511 my $circular; # linear or circular DNA/RNA?
512 my $count; # counts the number of genomes
513 my $coding; # is this repeat coding or non-coding?
514 my $def; # Genbank DEFINITION line
515 my $dodgy; # file without valid content
516 my $end; # used for determining repeat length
517 my $entry; # CGI object used for MINE.db writing
518 my $feature; # header for the .msat_tab file
519 my $gfffeature; # header for gff equivalent
520 my $feature_plus_flank; # header for the .flank_tab file
521 my $gfffeature_plus_flank; # header for gff equivalent
522 my $file; # the NC_xxx.xxx number currently being examined
523 my $filein; # the Bio::SeqIO object
524 my $flankseq; # flanking sequence only
525 my $fname; # what the fasta file is called
526 my $fout; # the famous lovely fasta header
527 my $gffout; #gff header
528 my $genome_len; # total length of the genome
529 my $gentaxon; # a more general taxonomic category
530 my $howmany; # number of list files in $cwd
531 my $isaaa = 0; # is a amino acid...
532 my $isformat; # what format is the file?
533 my $join; # manipulation of Genbank CDS lines
534 my $left; # sequence upstream of the repeat
535 my $linkfile; # symlink to NC file, found in $cwd/<Msat/Flank>_tab
536 my $loc; # genbank locus
537 my $match_count; # number of matches found
538 my $match_len; # length of the match
539 my $matrix_file; # file with num. reps. vs. size
540 my $mineend; # "end" of filename on MINE.db and fasta repeat files
541 my $minenum; # number of repeats found per genome
542 my $motif_type; # mono, di, tri etc.
543 my $motif_revcom; # motif reverse compliment
544 my $numall; # total number of msats found
545 my $numint; # total number of interrupted msats found
546 my $numfiles; # number of files surveyed
547 my $numreps; # number of repeats ($mineum minus the zeros)
548 my $percentcode; # percentage coding for each genome
549 my $prefix; # the taxonomic group (phages, viruses etc.)
550 my $realfile; # file that $linkfile links to
551 my $replength; # stores the longest repeat length
552 my $repsum; # sum of repeat length in this genome
553 my $repsumpc; # percentage of this genome consisting of repeats
554 my $reverse; # set to 1 if the gene is a "compliment"
555 my $right; # sequence downstream of the repeat
556 my $sequence_string; # does what it says on the tin
557 my $strand; # ss or ds?
558 my $organism; # genbank ORGANISM
559 my $tabfile; # name of the feature table associated with $file
560 my $gfftabfile; # name of the gff feature table associated with $file
561 my $tempcoding; # sum of total number of coding bases
562 my $totalreps = 0; # number of genomes containing repeats
563 my $totalreppc; # percentage of genomes containing repeats
564 my $primercount = 0; # count total number of genomes with primers
565
566 # these variables are responsible for c/gc content &c.
567 my $base_a;
568 my $base_t;
569 my $base_g;
570 my $base_c;
571
572
573 # arrays and hashes
574 my @all_files=(); # stores all the $file seen
575 my @all_files2=(); # stores a little more than @all_files (for output 8)
576 my @genome_len=(); # list of genome lengths
577 my @line=(); # lines of the genbank file
578 my %mineseen=(); # stores the highest $minenum for each $file
579 my %ncseen=(); # print each $file once to the brief summary
580 my @dodgy=(); # files that contain invalid content
581 my $total_ids; # how many unique seqs examined?
582
583 my %repeat_counter=(); # store up the total type and number of repeats;
584 my @short_names=(); # all short names (first key to repeat_counter)
585 my @all_motifs=(); # all motif types ever found (second key to repeat_counter)
586 my %motifs_by_genome=(); # all motif types found in one particular genome
587 my %max_replength=(); # longest number of repeat units for any repeat in a genome
588 my %has_msat=(); # remember the genomes with msats in
589 my %unique_motifs=(); # unique motifs from all genomes
590
591 my %repseen=(); # sums up the total length of repeats
592 # in the whole genome
593 my %genlenhash=(); # matches genome length to $file
594 my %defseen=(); # matches $def to @all_files2
595 my %primercheck=(); # can primers be made for this particular sequence?
596 my %pccodreps=(); # percentage of repeats in coding region
597
598 my $genename; # remember the start of each repeat, to determine
599 my $protname; # which gene it is in, later.
600 my $prodname;
601 my %duplicates; # deals with any duplicated sequence names
602
603 # hashes used for determining which
604 # entries are saved into a CSV text file
605 # later in the script
606 %ncseen=();
607 %mineseen=();
608
609 ###################################
610 # .db files prepared by msatfinder #
611 ###################################
612 $analysis_type="msatfinder, run on $date, thresholds >" . join(",",@thresholds);
613
614 # determine $prefix (taxonomic_group) from
615 # the name of the working directory
616 ($prefix = $cwd) =~ s#.*/##;
617 $prefix =~ /(\w+)\.*/;
618 $prefix = lc $1;
619 $cwd .= "/";
620
621 # set up the taxon information for use in
622 # the final database file
623 $gentaxon = $config->{"FINDER.$prefix"};
624 unless (defined $gentaxon)
625 {
626 $gentaxon = $prefix;
627 }
628
629 # more will be added here to allow more generic
630 # categorisation of the organelles, bacteria &c.
631 # N.B. "specific" and "generic" taxa are notes
632 # for the convenience of the user only (e.g. they
633 # remind me what directories I put things in.
634
635 #########################################
636 # Where should the output files go? #
637 #########################################
638 # The summarised repeats now go in a
639 # subdirectory of the working directory.
640 # I have called this "./Repeats"
641
642 $abs_path = $cwd . $repeat_dir; # path to $repeat_dir
643 my $gff_path = $cwd . $gff_dir;
644 ##########################################
645 # Open the core output files for writing #
646 ##########################################
647
648 # a list of files that seem corrupted (some genbank links are broken resulting in 'empty' files
649 # during automated download from NCBI
650 open (OUTPUT_FILE1, ">$abs_path$prefix.errors") or die "Can't open $abs_path$prefix.errors file: $!";
651 print "$abs_path\n";
652 my $no_errors=1;
653
654 # an index file giving the search parameters and details
655 open (OUTPUT_FILE2, ">$abs_path$prefix.index") or die "Can't open $abs_path$prefix.index file: $!";
656 # summary data of coding regions, etc. for each rep file
657 #my $FILE3 = "$abs_path$prefix.repeats";
658
659
660
661 #if ($gff) {$FILE3.=".gff";}
662 open (OUTPUT_FILE3, ">$abs_path$prefix.repeats") or die "Can't open $abs_path$prefix.summary file: $!";
663 #open (OUTPUT_FILE3, ">$FILE3") or die "Can't open $FILE3 file: $!";
664 #my $FILE5 = "$abs_path$prefix.genomes";
665 #if ($gff) {$FILE5.=".gff";}
666 # a list of all files, whether or not repeats are present
667 #open (OUTPUT_FILE5, ">$FILE5") or die "Can't open $FILE5: $!";
668 open (OUTPUT_FILE5, ">$abs_path$prefix.genomes") or die "Can't open $abs_path$prefix.total_sum: $!";
669 #open (OUTPUT_FILE7, ">$gff_path$prefix.genomes.gff") or die "Can't open $abs_path$prefix.genomes.gff: $!"; #equiv of FILE5
670 open (OUTPUT_FILE8, ">$gff_path$prefix.repeats.gff") or die "Can't open $gff_path$prefix.repeats.gff; $!"; #equiv of FILE3
671
672
673 #########################################
674 # initialize some variables #
675 #########################################
676 $| = 1; # flush the buffer...
677 $count = 0; # number of genomes...
678
679
680 #########################
681 #########################
682 # Start searching files #
683 #########################
684 #########################
685
686 # Read through the list of file names to get all files to search
687 my $headerprint = 0;
688 foreach my $file (@infiles)
689 {
690 # take in the list of files
691 push (@all_files, $file);
692 $numfiles = @all_files;
693
694 # announce the cuurent file being searched
695 print "-----\n" if ($screendump == 1);
696 print "Searching file: $file\n" if ($screendump == 1);
697
698 # MINE file rep number setting
699 $minenum = 0;
700 print "hit reset\n if ($screendump == 1)";
701 ++$count;
702 $sequence_string = undef;
703
704 ##############################
705 # check if it's a fasta file #
706 # if not, get LOCUS #
707 ##############################
708
709 $alphabet = "undefined";
710 $strand = "undefined";
711 $organism = "undefined";
712 $def = "undefined";
713 my $oldstop = 0;
714 my ($oldline,$olderline) = ("ftang","ftang");
715 $isformat = 5;
716
717 open (IN, "<$file") or die "Can't open the genome file $file: $!";
718 my $sprotorembl = 0;
719 while (my $line =<IN>)
720 {
721 chomp($line);
722 # fasta format
723 if ($line =~ "^>")
724 {
725 print "It looks like this is a FASTA file: $file\n" if ($screendump == 1);
726 $isformat = 1;
727 last;
728 }
729 # Swissprot format
730 elsif ($line =~ /^ID/)
731 {
732 # could be EMBL or Swissprot
733 $sprotorembl = 1;
734 last;
735 }
736 # genbank format - the original, needs a lot of
737 # processing to extract info. that bioperl can't get
738 elsif ( $line =~ /^LOCUS/)
739 {
740 print "It looks like this is a Genbank file: $file\n" if ($screendump == 1);
741 $isformat = 0;
742 # get alphabet, and strandedness
743 # I've assumed that DNA is ds unless
744 # specifically stated otherwise.
745 # NCBI claim that this is the case.
746 my @locs = split(/\s/, $line);
747 foreach my $loc (@locs)
748 {
749 if ($loc =~ /NA/)
750 {
751 if ($loc =~ /(\D\D)-(\D+)/)
752 {
753 $strand = lc $1;
754 $alphabet = lc $2;
755 }
756 else
757 {
758 $strand = "ds";
759 $alphabet = lc $loc;
760 }
761 }
762 }
763 }
764 elsif ($isformat == 5) # assume must be ASCII format
765 {
766 print "I don't recognise the format of this this file: $file\nI'll assume that it's ASCII." if ($screendump == 1);
767 $isformat = 4;
768 last;
769 }
770 # definition
771 elsif ($line =~ /^\s*DEFINITION\s*/)
772 {
773 $def = $line;
774 $def =~ s/DEFINITION\s+//;
775 }
776 # species name
777 elsif ($line =~ /^\s*ORGANISM\s*/)
778 {
779 $organism = "$line";
780 $organism =~ s/\s*ORGANISM\s*\b//;
781 }
782 # exit the loop after reading the important stuff
783 elsif ($line =~ "FEATURES") { last; }
784
785 # match the old line, in case a definition
786 # extends over two lines (e.g. ORGANISM)
787 if ($oldline =~ /^DEFINITION/ and $line =~ /^\s+/)
788 {
789 $line =~ s/\s+//;
790 $def = "$def " . $line;
791 print "Case 1: gbline is $line, oldline is $oldline, def is $def\n" if ($debug == 1);
792 }
793 if ($oldline =~ /^\s*ORGANISM/ and $line =~ /^\s+/)
794 {
795 $line =~ s/\s+//;
796 $organism = "$organism " . $line;
797 print "Case 2: gbline is $line, oldline is $oldline, organism is $organism\n" if ($debug == 1);
798 }
799 if ($olderline =~ /^\s*ORGANISM/ and $line =~ /^\s+/)
800 {
801 $line =~ s/\s+//;
802 $organism = "$organism " . $line;
803 print "Case 3: gbline is $line, olderline is $olderline, organism is $organism\n" if ($debug == 1);
804 }
805
806 # saves the previous line
807 $olderline = $oldline;
808 $oldline = $line;
809 }
810 close IN;
811
812 # swissprot or EMBL format?
813 if ($sprotorembl == 1)
814 {
815 # see if file is swissprot
816 # if not, it must be EMBL
817 my $testfile = Bio::SeqIO->new(-file => "$file",
818 -format => 'EMBL');
819 my $testseq;
820 while (1)
821 {
822 my $testseq;
823 eval
824 {
825 $testfile->verbose(2);
826 undef $@;
827 $testseq = $filein->next_seq()
828 }; #Done eval
829 if($@)
830 {
831 print "It looks like this is an EMBL file.\n" if ($screendump == 1);
832 $isformat = 3;
833 last;
834 }
835 else
836 {
837 print "It looks like this is a Swissprot file.\n" if ($screendump == 1);
838 $isformat = 2;
839 last;
840 }
841 }
842 $testfile->close();
843 }
844
845 # new object, depending on format
846 if ($isformat == 1) # FASTA
847 {
848 $filein = Bio::SeqIO->new(-file => "$file",
849 -format => 'Fasta');
850 }
851 elsif ($isformat == 2) # Swissprot
852 {
853 $filein = Bio::SeqIO->new(-file => "$file",
854 -format => 'swiss');
855 }
856 elsif ($isformat == 3) # EMBL
857 {
858 $filein = Bio::SeqIO->new(-file => "$file",
859 -format => 'EMBL');
860 }
861 elsif ($isformat == 0) # genbank
862 {
863 $filein = Bio::SeqIO->new(-file => "$file",
864 -format => 'genbank');
865 }
866 elsif ($isformat == 4) # ASCII
867 {
868 $filein = Bio::SeqIO->new(-file => "$file",
869 -format => 'raw');
870 }
871 else { die "Unknown file type - please report this error.\n"; }
872
873 ###########################################
874 # loop through every sequence in the file #
875 ###########################################
876 @nreg=();
877 my $noofseqs = 1;
878 print "Reading sequences from file $file...\n" if ($screendump == 1);
879 SEQ: while (my $seq = $filein->next_seq()) # this point is very slow...
880 {
881 last unless $seq;
882 # how much of this sequence is coding
883 $tempcoding = 0;
884 $cdsnum = 0;
885
886 # first, check if amino acid not dna
887 if ($seq->alphabet() =~ /protein/)
888 {
889 $isaaa = 1;
890 print "This looks like an amino acid sequence.\n" if ($screendump == 1);
891 }
892 else
893 {
894 print "This looks like a nucleic acid sequence.\n" if ($screendump == 1);
895 }
896 # print headers
897 &fileheaders($isformat,$isaaa) if ($headerprint == 0);
898 $headerprint = 1;
899
900 # get the ID of this sequence
901 # and the sequence string
902 $sequence_string = lc($seq->seq());
903 $genome_len = length $sequence_string;
904 my $id = $seq->display_id();
905 $id =~ s/;//;
906 $id =~ s/\.gbk//;
907
908 # checks for duplicated sequence names
909 if ($duplicates{$id} == 1)
910 {
911 $duplicates{$id}++;
912 next;
913 }
914 else
915 {
916 $duplicates{$id} = 1;
917 }
918
919 # create a short name to refer to
920 # this particular sequence
921 my ($short_name,@ids);
922 if ($isformat == 4)
923 {
924 $short_name = $datestring . "_" . $noofseqs;
925 }
926 else
927 {
928 @ids = split(/\|/, $id);
929 if ($id =~ /^gi/)
930 {
931 if (defined $ids[4]) { $short_name = $ids[4]; }
932 else { $short_name = $ids[1]; }
933 }
934 elsif ($id =~ /^gb/) { $short_name = $ids[2]; }
935 elsif ($id =~ /^ref/) { $short_name = $ids[1]; }
936 else { $short_name = $id; }
937 }
938 # N.B. if the FASTA filename has certain characters in then
939 # it will cause msatfinder to crash. Get rid of them here.
940 $short_name =~ s/\./_/g;
941 $short_name =~ s/\|/_/g;
942 $short_name =~ s/:/_/g;
943 $short_name =~ s/,/_/g;
944 if ($short_name eq "")
945 {
946 print "Invalid name in FASTA header. Please replace with something like \"seq1\", \"seq2\" etc.\nExiting!\n";
947 die;
948 }
949 $noofseqs++;
950
951 # check for a valid sequence
952 if (! $seq->validate_seq($sequence_string))
953 {
954 print "Sequence $short_name is invalid!\n";
955 print OUTPUT_FILE1 "INVALID_SEQ|$short_name\n";
956 $no_errors=0;
957
958 $printerror = 1;
959 next SEQ;
960 }
961
962 #open (OUTSEQ, ">$short_name") || print "can't open outseq file:$!";
963 #my $outseq = Bio::SeqIO->newFh(-format=>'Genbank');
964 #print $outseq $_ while <$seq>;
965 #print OUTSEQ $filein->write_seq($seq);
966
967 #close OUTSEQ;
968 # count msats in CDS regions for this genome
969 $pccodreps{$short_name} = 0;
970 push(@short_names,$short_name);
971 $reverse = 0;
972
973 # stuff needed to get the species details
974 my $sp;
975 my $ps;
976 my ($species, $genus, @classification, $common_name, $sub_species, $binomial, $division, $gdate, @gdates, $specific_host, $lab_host, $db_xref, $note, $feature_count) = "";
977 my $ann;
978 my (@annotations1,@annotations2);
979
980 ##################################################
981 # if a genbank file, extract further information #
982 ##################################################
983 if ($isformat == 0 or $isformat == 2 or $isformat == 3)
984 {
985
986 # species object
987 $sp = $seq->species();
988 # information on the organism
989 $alphabet = $seq->alphabet() unless (defined $alphabet);
990 eval { $genus = $sp->genus(); } or $genus = "NA";
991 eval { $species = $sp->species(); } or $species = "NA";
992 eval { @classification = $sp->classification(); } or @classification = qw(na);
993 eval { $common_name = $sp->common_name(); } or $common_name = "NA";
994 eval { $sub_species = $sp->sub_species(); } or $sub_species = "NA";
995 eval { $binomial = $sp->binomial(); } or $binomial = "NA";
996 eval { $division = $seq->division(); } or $division = "NA";
997 @gdates = $seq->get_dates();
998 my $gdate1 = $gdates[0];
999 my ($d1,$d2,$d3) = split("-",$gdate1);
1000 $gdate = $d3 . "-" . $dtrans{$d2} . "-" . $d1;
1001 $circular = 'linear';
1002 $circular = 'circular' if $seq->is_circular();
1003 $feature_count = $seq->feature_count();
1004
1005
1006 # features
1007 my $whirley = Whirley->new(4);
1008 print "Reading all annotations in sequence...\n" if ($screendump == 1);
1009 foreach my $feat ($seq->all_SeqFeatures()) # this part is very slow...
1010 {
1011 print STDERR "Please wait: ", $whirley->(), "\r" if ($screendump == 1);
1012 my $tag;
1013 eval { $tag = $feat->primary_tag(); } or $tag = "wibble";
1014 # source: get hosts, notes &c. for this genome
1015 if ($tag =~ "source")
1016 {
1017 my @tags = $feat->all_tags();
1018 foreach my $val (@tags)
1019 {
1020 my @tagvalues = $feat->each_tag_value($val);
1021 if ($val =~ /specific_host/) { $specific_host = $tagvalues[0]; }
1022 if ($val =~ /lab_host/) { $lab_host = $tagvalues[0]; }
1023 if ($val =~ /note/) { $note = $tagvalues[0]; }
1024 if ($val =~ /db_xref/)
1025 {
1026 $db_xref = $tagvalues[0];
1027 $db_xref =~ s/taxon://;
1028 }
1029 }
1030 }
1031 # get start, stop and other stuff here.
1032 if ($tag =~ "CDS")
1033 {
1034 $cdsnum++;
1035 my $location = new Bio::Location::Split;
1036 $location = $feat->location();
1037 if ($location->isa('Bio::Location::Split'))
1038 {
1039 my @sublocs = $location->sub_Location();
1040 foreach my $loc (@sublocs)
1041 {
1042 my $featstart = $loc->start();
1043 my $featend = $loc->end();
1044 &addregion($featstart,$featend);
1045 }
1046 }
1047 else
1048 {
1049 my $featstart = $feat->start();
1050 my $featend = $feat->end();
1051 &addregion($featstart,$featend);
1052 }
1053 }
1054 }
1055 }
1056
1057 ########################################
1058 # calculate other things based on data #
1059 # from these info.-rich formats #
1060 # but not swissprot - is amino acid #
1061 ########################################
1062 unless ($isaaa == 1)
1063 {
1064 # how much coding DNA is present?
1065 $tempcoding = &howlong || "0";
1066
1067 if ($tempcoding > $genome_len)
1068 {
1069 print "Something's wrong here!\n";
1070 print "Tempcoding: $tempcoding\n";
1071 print "Length: $genome_len\n";
1072 print OUTPUT_FILE1 "CODING_MISMATCH|$short_name|$tempcoding|$genome_len\n";
1073 $no_errors=0;
1074 $printerror = 1;
1075 }
1076
1077 # determine the base count
1078 $base_a = $sequence_string =~ tr/aA/aA/;
1079 $base_c = $sequence_string =~ tr/cC/cC/;
1080 $base_g = $sequence_string =~ tr/gG/gG/;
1081 $base_t = $sequence_string =~ tr/tT/tT/;
1082
1083 # check it!
1084 my $totlen = $genome_len - ($base_a + $base_t + $base_c + $base_g);
1085 if( $totlen < 0 )
1086 {
1087 warn "Base count larger than seq length in $short_name!\n";
1088 }
1089 }
1090
1091 # increment the unique id count
1092 $total_ids++;
1093
1094 # more debugging code
1095 if ($debug == 1)
1096 {
1097 print "IDS: @ids\n";
1098 print "ID: $short_name\n";
1099 print "FC: $feature_count\n";
1100 }
1101
1102 # annouce the sequence being searched
1103 print "Inspecting sequence: $short_name...\n" if ($screendump == 1);
1104
1105 # after conversion genome is in $sequence_string; get length and G+C
1106 $genlenhash{$short_name} = $genome_len;
1107 push (@genome_len, $genome_len);
1108 unless ($isaaa == 1)
1109 {
1110 $CGgenome = $sequence_string =~ tr/cgCG/cgCG/;
1111 $ATgenome = $sequence_string =~ tr/atAT/atAT/;
1112
1113 # determine if the genome is unusually
1114 # a or c rich
1115 if ($ATgenome == 0)
1116 {
1117 print "ATgenome is $ATgenome in $short_name\n";
1118 print "File $short_name has invalid content!\n";
1119 push (@dodgy, $short_name);
1120 print OUTPUT_FILE1 "LOW_AT_CONTENT|$short_name\n";
1121 $no_errors=0;
1122 $printerror = 1;
1123 next;
1124 }
1125 $aat = $base_a/$ATgenome;
1126 $aat = sprintf("%.3f", $aat);
1127 if ($CGgenome == 0 )
1128 {
1129 print "CGgenome is $CGgenome in $short_name\n";
1130 print "File $short_name has invalid content!\n";
1131 push (@dodgy, $short_name);
1132 next;
1133 }
1134 $cgc = $base_c/$CGgenome;
1135 $cgc = sprintf("%.3f", $cgc);
1136 }
1137 # stop division by zero
1138 if ($genome_len >0)
1139 {
1140 unless ($isaaa == 1)
1141 {
1142 $CGgenome = ($CGgenome/$genome_len)*100;
1143 $CGgenome = sprintf("%.3f", $CGgenome);
1144 }
1145 # send an error message if the sequence is very short - probably not a genome?
1146 if (length $sequence_string <=50)
1147 {
1148 print OUTPUT_FILE1 "DODGY_SEQ|$short_name|$genome_len|$sequence_string\n";
1149 $no_errors=0;
1150 }
1151 $printerror = 1;
1152
1153 # make sure that the $percentcode is correct
1154 # moved up from lower in the script
1155 unless ($isaaa == 1)
1156 {
1157 $percentcode = 100*($tempcoding/$genome_len);
1158 $percentcode = sprintf("%.3f", $percentcode);
1159 }
1160 # hashes used when writing summary files
1161 # at the end of the script
1162 $mineseen{$short_name}=$minenum;
1163 print "$short_name\t$minenum\t(1)\n";
1164 $repseen{$short_name}=0;
1165 $defseen{$short_name} = $def;
1166 # Here we create yet another array to store all files names
1167 # "total_sum"
1168 if (($isformat == 0 or $isformat == 2 or $isformat == 3) and $isaaa == 0)
1169 {
1170 my $all_files2 = join "|", $short_name, $prefix, $gentaxon, $division, $strand, $alphabet, $circular, $gdate, $binomial, $genus, $species, $sub_species, $specific_host, $lab_host, $db_xref, $note, $common_name, $organism, $def, $genome_len, $tempcoding, $percentcode, $CGgenome, $cdsnum, $base_a, $base_t, $base_g, $base_c, $aat, $cgc;
1171 push (@all_files2, $all_files2);
1172 }
1173 if (($isformat == 0 or $isformat == 2 or $isformat == 3) and $isaaa == 1)
1174 {
1175 my $all_files2 = join "|", $short_name, $prefix, $gentaxon, $division, $alphabet, $circular, $gdate, $binomial, $genus, $species, $sub_species, $specific_host, $lab_host, $db_xref, $note, $common_name, $genome_len;
1176 push (@all_files2, $all_files2);
1177 }
1178 if ($isformat == 1 and $isaaa == 0)
1179 {
1180 my $all_files2 = join "|", $short_name, $genome_len, $CGgenome, $base_a, $base_t, $base_g, $base_c, $aat, $cgc;
1181 push (@all_files2, $all_files2);
1182 }
1183 if ($isformat == 1 and $isaaa == 1)
1184 {
1185 my $all_files2 = join "|", $short_name, $genome_len;
1186 push (@all_files2, $all_files2);
1187 }
1188
1189
1190 my $temp = $cwd;
1191 #$temp =~ s/[^\/]*?\/$//;
1192 my $rand_file='fasta.in';
1193 my $temp2 = $temp;
1194 $temp2 =~ s/[^\/]*?\/$//;
1195 $fasta_file = "$temp$rand_file";
1196 #this creates a file containing a fasta version of the ssequence
1197 #if ($random || $pimp > 0)
1198 #{
1199 open (RANDOM, ">$temp$rand_file") || print "can't open file: $!";
1200 my $seq_temp = uc ($sequence_string);
1201 my $rand_in = ">$short_name\n";
1202 while (length($seq_temp) >= 70)
1203 {
1204 my $substring = substr($seq_temp, 0, 70, "");
1205 $rand_in .= "$substring\n";
1206 }
1207 $rand_in .= "$seq_temp";
1208 print RANDOM "$rand_in";
1209 close RANDOM;
1210 #}
1211
1212 if ($random)
1213 {
1214 #this isn't great, because all processes still feed off the single random.fa file ...
1215 #print out sequence, randomise it, then suck it in again
1216 my $randomise = 'randomise';
1217 my $randomisefa = "random.fa";
1218 if ($random_times)
1219 {
1220 for (my $i=1;$i<=$random_times;$i++)
1221 {
1222 system ("cp $temp2$randomise $temp/"); #cp randomise program across
1223 system ("rm $temp2$randomisefa") or warn "trouble, can't remove file : $!<br>"; #have to delete output file, since web user can write to files, but can't over-write
1224 system ("$temp$randomise $temp$rand_file >/dev/null 2>&1") == 0 or warn "trouble : $!"; #run randomise on the rand_file
1225 unless ($i == $random_times)
1226 {
1227 system ("cp $temp2$randomisefa $temp$rand_file");
1228 }
1229 }
1230 }
1231 open (RANDOM, "$temp2$randomisefa") || print "can't open file: $!"; #open the random.fa output file
1232 open (RANDOUT, ">Fasta/$short_name.random.fasta") || die "can't open file: $!"; #print out random.fa to seqID.random.fasta (could just copy it across, really)
1233 while (<RANDOM>)
1234 {
1235 print RANDOUT $_;
1236 }
1237 close RANDOUT;
1238
1239
1240 my $temp = $cwd;
1241 chop $temp;
1242 my @rand_results = <RANDOM>;
1243 pop @rand_results;
1244 close RANDOM;
1245 $sequence_string = join('',@rand_results);
1246 }
1247
1248 #create pimp output file as default
1249 my $pimp_prog="pimp";
1250 my $temp_path=$abs_path;
1251 $temp_path =~ s/Repeats\/$/Counts/;
1252 my $pimp_results;
1253 my $prepimp="prepimp.pl";
1254 my $prepimped=$short_name.".nonstandard.count";
1255
1256 #run poly if necessary
1257 my $poly = $config->{'CONTACT.poly'};
1258 if ($polyuse eq 'yes')
1259 {
1260 $poly = 'yes';
1261 }
1262
1263 #dealing with prepimp: nuc or prot?
1264 my $prepimpflag;
1265 if ($isaaa=0)
1266 {
1267 $prepimpflag="-n";
1268 }
1269 else
1270 {
1271 $prepimpflag="-p";
1272 }
1273
1274 if ($poly eq 'yes')
1275 {
1276 my $all_atgc = "fasta-treat.in";
1277 my @nonATGC = `perl $temp2$prepimp $prepimpflag $temp/$rand_file $temp/$all_atgc`;
1278 open (NONATGC, ">$temp_path/$prepimped") or die "Can't open $temp_path$prepimped: $!";
1279 print NONATGC "@nonATGC";
1280 close NONATGC;
1281
1282 my $poly_prog="poly";
1283 my $poly_results;
1284 $poly_results=`$temp2$poly_prog $temp/$all_atgc`; #$temp/$rand_file
1285 open (OUTPUT_FILE9, ">$temp_path/$short_name.poly.count") or die "Can't open $temp_path/$short_name.poly.count file: $!";
1286 my @split = split (/\s/,$poly_results);
1287 print OUTPUT_FILE9 "Maximum expected length of any given SSR (simple sequence repeat) according to Poly (http://www.biomedcentral.com/1471-2105/4/22):\nmono".$dlm."$split[2]\ndi".$dlm."$split[3]\ntri".$dlm."$split[4]\ntetra".$dlm."$split[5]\npenta".$dlm."$split[6]\nhexa".$dlm."$split[7]";
1288 close OUTPUT_FILE9;
1289 my @polys = split (/\s/,$poly_results);
1290 shift @polys;
1291 shift @polys;
1292 my @temp;
1293 foreach my $i (@polys)
1294 {
1295 $i =~ s/[^\d]//g;
1296 push (@temp,$i);
1297 }
1298 @polys = @temp;
1299 if ($polyuse eq 'yes')
1300 {
1301 for (my $i=0;$i<scalar @thresholds;$i++)
1302 {
1303 if ($thresholds[$i])
1304 {
1305 $thresholds[$i] = $polys[$i];
1306 }
1307 }
1308 }
1309 }
1310
1311 if ($pimp > 0)
1312 {
1313 #print "perl $temp2$prepimp $prepimpflag $temp$rand_file\n";
1314 my $all_atgc = "fasta-treat.in";
1315 my @nonATGC = `perl $temp2$prepimp $prepimpflag $temp/$rand_file $temp/$all_atgc`;
1316 open (NONATGC, ">$temp_path/$prepimped") or die "Can't open $temp_path$prepimped: $!";
1317 print NONATGC "@nonATGC";
1318 close NONATGC;
1319 #print "$temp2$prepimp $temp$rand_file > $temp_path/$prepimped\n";
1320 #get markov chain length
1321 my $markovlength = $config->{'CONTACT.markovlength'};
1322 #put in markov length here (markovlength)
1323
1324 $pimp_results=`$temp2$pimp_prog -m $markovlength $temp/$all_atgc`; #$temp/$rand_file
1325 my @split = split (/\s/,$pimp_results);
1326 #print OUTPUT_FILE9 "Maximum expected length of any given SSR (simple sequence repeat) motif length (monos to hexas) according to Poly (http://www.biomedcentral.com/1471-2105/4/22):\nmono".$dlm."$split[2]\ndi".$dlm."$split[3]\ntri".$dlm."$split[4]\ntetra".$dlm."$split[5]\npenta".$dlm."$split[6]\nhexa".$dlm."$split[7]";
1327
1328
1329 #print "$temp2$pimp_prog $temp$rand_file\n";
1330 #temp_path
1331 open (OUTPUT_FILE6, ">$temp_path/$short_name.markov.count") or die "Can't open $temp_path/markov.count file: $!";
1332 print OUTPUT_FILE6 "Maximum expected length of any given SSR (simple sequence repeat) motif length (monos to hexas) according to emsat:\nmono".$dlm."$split[2]\ndi".$dlm."$split[3]\ntri".$dlm."$split[4]\ntetra".$dlm."$split[5]\npenta".$dlm."$split[6]\nhexa".$dlm."$split[7]";
1333 close OUTPUT_FILE6;
1334 }
1335 #if user wants to use pimp thresholds
1336 if ($pimp == 2)
1337 {
1338 if ($polyuse eq 'yes')
1339 {
1340 print OUTPUT_FILE2 "You have requested to use both Markov and Poly thresholds, which is not possible. Markov thresholds will be used.<br>";
1341 }
1342 my @pimps = split (/\s/,$pimp_results);
1343 shift @pimps;
1344 shift @pimps;
1345 my @temp;
1346 foreach my $i (@pimps)
1347 {
1348 $i =~ s/[^\d]//g;
1349 push (@temp,$i);
1350 }
1351 @pimps = @temp;
1352 for (my $i=0;$i<scalar @thresholds;$i++)
1353 {
1354 if ($thresholds[$i])
1355 {
1356 $thresholds[$i] = $pimps[$i];
1357 }
1358 }
1359 }
1360 ##################################################################
1361 # This is the important part where the repeats are found using a #
1362 # the regexp below. Loop through this section by repeat length #
1363 ##################################################################
1364 $repsum = 0;
1365 my $res; # array ref with all msats within it
1366 my $whirley = Whirley->new($engine);
1367 if ($engine == 6 && $isaaa == 0)
1368 {
1369 $engine=1;
1370 print OUTPUT_FILE2 "Engine switched from seg to regex, since seg is only used for amino acid sequences<br>";
1371 }
1372 if ($engine == 1) { $res = &findmotifs_regex($sequence_string,$whirley,$screendump,$short_name);}
1373 elsif ($engine == 2) { $res = &findmotifs_multipass($sequence_string,$whirley,$screendump,$short_name); }
1374 elsif ($engine == 3) { $res = &findmotifs_iterate($sequence_string,$whirley,$screendump,$short_name); }
1375 elsif ($engine ==4) { $res = &findmotifs_sputnik($sequence_string,$whirley,$screendump,$short_name); }
1376 elsif ($engine ==5) { $res = &findmotifs_dust($sequence_string,$whirley,$screendump,$short_name); }
1377 elsif ($engine ==6) { $res = &findmotifs_seg($sequence_string,$whirley,$screendump,$short_name); }
1378 # now process the msats in the res array
1379 # MATCH (a place marker for gvim)
1380 $replength = 0;
1381 my @motifs_in_this_genome = ();
1382 my %unique_motifs_local=();
1383 next unless (@$res);
1384 my $allmsats;
1385 # determine if any msats are interruped. Doesn't search within a given msat, just checks if two msats are close and the same motif.
1386 if ($runird == 1 && $non_divisible==0)
1387 {
1388 print "Scanning for interrupted msats...\n";
1389 # do a whirley here...
1390 $allmsats = &ird(@$res);
1391 }
1392 else
1393 {
1394 $allmsats = $res;
1395 }
1396 #if ($debug == 2) { foreach my $msat (@allmsats) { print "$msat\n"; } }
1397 foreach my $msat (@{$allmsats})
1398 {
1399 # make alterations based on interrupted msats!
1400 $has_msat{$short_name} = 1;
1401 my @parts = split(/\./, $msat);
1402 my $match = $parts[2];
1403 my $motif_type;
1404 if ($match =~ /-/) { $motif_type = length([split(/-/,$match)]->[0]); }
1405 else { $motif_type = length($match); }
1406 my $match_len = $motif_type * $parts[3]; #also known as msat footprint
1407 if ($non_divisible == 1)
1408 {
1409 $match_len = $parts[3];
1410 }
1411 my $match_count = $match_len/$motif_type; #number of motifs
1412 my $start = $parts[1];
1413 my $end = $start + $match_len - 1;
1414 my $whole_match = substr ($sequence_string, $start-1, $match_len);
1415 $repseen{$short_name} += $match_len;
1416 # ignore poly-N and poly-X of any type
1417 next if ($match =~ /n/ or $match =~ /x/);
1418
1419 # count total number found
1420 $numall++;
1421 $numint++ if ($match =~ /-/);
1422
1423 # determine the motif type, and
1424 # count the number of msats of
1425 # each type and length
1426 $repeat_counter{$short_name}{$match}[$match_count]++;
1427 $repeat_counter{all}{$match}[$match_count]++;
1428 push(@all_motifs,$match) unless ($unique_motifs{$match});
1429 push (@motifs_in_this_genome,$match) unless ($unique_motifs_local{$match});
1430 $unique_motifs{$match} = 1;
1431 $unique_motifs_local{$match} = 1;
1432
1433 # reverse complement
1434 $motif_revcom = $match;
1435 $motif_revcom =~ tr/ACTGactg/TGACtgac/;
1436
1437 # set the various coding region descriptions
1438 # to N/A (assuming non-coding unless we find
1439 # otherwise
1440 $genename = "N/A";
1441 $protname = "N/A";
1442 $prodname = "N/A";
1443 $coding = 0;
1444
1445 # determine whether or not the start or end of this repeat
1446 # falls within a CDS region
1447 my $check1 = &isitin($start);
1448 my $check2 = &isitin($end);
1449 if ($check1 == 1 or $check2 == 1)
1450 {
1451 $coding = 1;
1452 $pccodreps{$short_name}++;
1453
1454 # determine which gene it is in
1455 # and all that
1456 foreach my $feat ($seq->all_SeqFeatures())
1457 {
1458 # this information is not connected to
1459 # the CDS wherein the repeat is found...
1460 my $tag;
1461 eval { $tag = $feat->primary_tag(); } or $tag = "wibble";
1462 if ($tag =~ "source")
1463 {
1464 my @tags = $feat->all_tags();
1465 foreach my $val (@tags)
1466 {
1467 my @tagvalues = $feat->each_tag_value($val);
1468 if ($val =~ /specific_host/) { $specific_host = $tagvalues[0]; }
1469 if ($val =~ /lab_host/) { $lab_host = $tagvalues[0]; }
1470 if ($val =~ /note/) { $note = $tagvalues[0]; }
1471 if ($val =~ /db_xref/)
1472 {
1473 $db_xref = $tagvalues[0];
1474 $db_xref =~ s/taxon://;
1475 }
1476 }
1477 }
1478
1479 # ...but this information is. It seemed like a good
1480 # idea to put them in the same place.
1481 my $featstart = $feat->start();
1482 my $featend = $feat->end();
1483 $reverse = $feat->strand();
1484 if (($start >= $featstart and $start <= $featend and $tag eq "CDS") or ($start >= $featstart and $end <= $featend and $tag eq "CDS"))
1485 {
1486 if ($debug == 1)
1487 {
1488 print "--\n";
1489 print "NAME: $short_name\n";
1490 print "TAG: $tag\n";
1491 print "Start: $featstart\n";
1492 print "Repstart: $start\n";
1493 print "End: $featend\n";
1494 print "--\n";
1495 }
1496 my @tags = $feat->all_tags();
1497 foreach my $val (@tags)
1498 {
1499 my @tagvalues = $feat->each_tag_value($val);
1500 if ($val =~ /gene/) { $genename = $tagvalues[0]; }
1501 if ($val =~ /product/) { $prodname = $tagvalues[0]; }
1502 if ($val =~ /protein_id/) { $protname = $tagvalues[0]; }
1503 }
1504 }
1505 }
1506 }
1507
1508 # print out stuff
1509 print "\nFound ($match)$match_count msat at $start:\n" if ($screendump == 1);
1510
1511 # create Swissprot formatted output for Artemis
1512 if ($artemis > 0)
1513 {
1514 $feature = <<EOF;
1515 FT repeat_region $start..$end
1516 FT /label={$match}$match_count
1517 FT /note="($match)$match_count"
1518 FT /note="Coding: $coding"
1519 FT /note="Gene: $genename"
1520 FT /note="Product: $prodname"
1521 FT /note="Protein: $protname"
1522 EOF
1523 $gfffeature = "$msat\tMsatfinder.v.$version\trepeat_region\t$start\t$end\t.\t.\t.\tName=$msat;note=($match)$match_count ;coding=$coding;label=$msat;gene_name=$genename;product=$prodname;protein=$protname\n";
1524 # $gfffeature_plus_flank= "$gff_name\tMsatfinder.v.$version\trepeat_region\t$fstart\t$fend\t.\t.\t.\tID=$msat;Name=$gff_name;note=($match)$match_count and flanks ;label=$msat;coding=$coding;gene_name=$genename;product=$prodname;protein=$protname\n";
1525 print $feature if $screendump == 1;
1526 print $gfffeature if $screendump == 1;
1527 }
1528
1529 if ($artemis == 2)
1530 {
1531 my ($fstart,$fend);
1532 if ($start < $flank_size)
1533 {
1534 $fstart = 1;
1535 }
1536 else
1537 {
1538 $fstart = $start - $flank_size;
1539 }
1540 if (($end + $flank_size) > $genome_len)
1541 {
1542 $fend = $genome_len;
1543 }
1544 else
1545 {
1546 $fend = $end + $flank_size;
1547 }
1548 $feature_plus_flank = <<EOF;
1549 FT repeat_region $fstart..$fend
1550 FT /label={$match}$match_count
1551 FT /note="($match)$match_count + flanks"
1552 FT /note="Coding: $coding"
1553 FT /note="Gene: $genename"
1554 FT /note="Product: $prodname"
1555 FT /note="Protein: $protname"
1556 FT /note="Includes flanking region"
1557 EOF
1558 my $gff_name=$msat;
1559 $gff_name =~ s/\./_/g;
1560 $gfffeature_plus_flank= "$gff_name\tMsatfinder.v.$version\trepeat_region\t$fstart\t$fend\t.\t.\t.\tID=$msat;Name=$gff_name;note=($match)$match_count and flanks ;label=$msat;coding=$coding;gene_name=$genename;product=$prodname;protein=$protname\n";
1561 }
1562
1563
1564 # this makes sure that the various repeat length arrays are
1565 # incremented only when a repeat larger than the last one
1566 # is found (see the .matrix file for the result)
1567 $replength = $match_count if ($match_count > $replength);
1568
1569 # print the feature to the artemis file if $artemis > 0
1570 if ($artemis > 0 && $whole_match)
1571 {
1572 $tabfile = "$short_name.msat_tab";
1573 $gfftabfile = "$short_name.msat_tab.gff";
1574 my $basename = [split(/\//, $file)]->[-1];
1575 print "Opening output file $tabfile\n" if ($screendump == 1);
1576 unless (-e "$gff_dir$gfftabfile")
1577 {
1578 open (GFFART, ">$gff_dir/$gfftabfile") or die "Can't open $gff_dir/$gfftabfile file for writing: $!";
1579 print GFFART "##gff-version 3\n";
1580 close GFFART;
1581 }
1582 open (ART, ">>$tab_dir/$tabfile") or die "Can't open $tab_dir/$tabfile file for writing: $!";
1583 open (GFFART, ">>$gff_dir/$gfftabfile") or die "Can't open $gff_dir/$gfftabfile file for writing: $!";
1584 print ART $feature;
1585 print GFFART $gfffeature;
1586
1587 # link files in the artemis directory
1588 unless (-e "$tab_dir$file")
1589 {
1590 print "Linking $file to $tab_dir\n" if ($debug == 1);
1591 system("cd $tab_dir; ln -sf ../$basename .") == 0 or warn "Can't link $file to $tab_dir: $!";
1592 close ART;
1593 }
1594 close GFFART;
1595
1596 # print the "flank tabs" out
1597 if ($artemis == 2)
1598 {
1599 $tabfile = "$short_name.flank_tab";
1600 $gfftabfile = "$short_name.flank_tab.gff";
1601 print "Opening output file $tabfile\n" if ($screendump == 1);
1602 unless (-e "$gff_dir$gfftabfile")
1603 {
1604 open (GFFART2, ">$gff_dir/$gfftabfile") or die "Can't open $gff_dir/$gfftabfile file for writing: $!";
1605 print GFFART2 "##gff-version 3\n";
1606 close GFFART2;
1607 }
1608 open (ART2, ">>$bigtab_dir/$tabfile") or die "Can't open your $bigtab_dir/$tabfile for writing: $!";
1609 open (GFFART2, ">>$gff_dir/$gfftabfile") or die "Can't open your $gff_dir/$gfftabfile for writing: $!";
1610 print ART2 $feature_plus_flank;
1611 print GFFART2 $gfffeature_plus_flank;
1612 unless (-e "$bigtab_dir$file")
1613 {
1614 print "Linking $file to $bigtab_dir\n" if ($debug == 1);
1615 system("cd $bigtab_dir; ln -sf ../$basename .") == 0 or warn "Can't link $file to $bigtab_dir: $!";
1616 }
1617 close GFFART2;
1618 close ART2;
1619 }
1620 }
1621
1622 # used for determining the sequence upstream and
1623 # downstream of the match
1624 # modify this based on ird code...
1625 # fixed 3/4/06
1626 if ($start < $flank_size)
1627 {
1628 $left = substr ($sequence_string, 0, $start-1);
1629 }
1630 else
1631 {
1632 $left = substr ($sequence_string, $start-$flank_size, $flank_size-1);
1633 }
1634 if (($end + $flank_size) > length $sequence_string)
1635 {
1636 $right = substr ($sequence_string, $end);
1637 }
1638 else
1639 {
1640 $right = substr ($sequence_string, $end, $flank_size);
1641 }
1642 $blaster= $left . $whole_match . $right;
1643 $flankseq = $left . $right;
1644 my $repeat_plus_flank = length $blaster;
1645 my $flankseq_length = length($flankseq);
1646 my $motif_length = length($whole_match);
1647
1648 #calcuate %GC of flanks
1649 $CGgen_flankseq = $flankseq =~ tr/cgCG/cgCG/;
1650 $CGgen_flankseq = ($CGgen_flankseq/$flankseq_length)*100 unless ($flankseq_length == 0);
1651 $CGgen_flankseq = sprintf("%.3f", $CGgen_flankseq);
1652
1653 #calcuate %GC of repeat
1654 $CGgen_repeat = $whole_match =~ tr/cgCG/cgCG/;
1655 print "CG\% MSAT: $CGgen_repeat / (length $whole_match))\n" if ($debug == 1);
1656 $CGgen_repeat = sprintf("%.3f", $CGgen_repeat);
1657
1658 # this part writes the MINE.db files in
1659 # $PWD/MINE, one file per repeat
1660 #if ($mine == 1 && $whole_match).
1661 # If debugging is turned on, produce output to stderr
1662 # if any flanking regions extend past the end of the sequence
1663 if ($whole_match)
1664 {
1665 $minenum++;
1666 print "$minenum\n";
1667 $mineend = $start . ".$match." . "$match_count";
1668 $checklength = ((2*$flank_size) + $match_len-1);
1669 $checkright = substr ($sequence_string, $end);
1670 $checkright_len = length $checkright;
1671 $checkright_len_pc = sprintf("%.2f", 100*($checkright_len/$genome_len));
1672 $checkleft = substr ($sequence_string, 1, $start-1);
1673 $checkleft_len = length $checkleft;
1674 $checkleft_len_pc = sprintf("%.2f", 100*($checkleft_len/$genome_len));
1675 if ($repeat_plus_flank < $checklength && $debug == 1)
1676 {
1677 warn "Short flanks in $short_name.$mineend\:\t$checkleft_len\t$checkright_len\n";
1678 }
1679
1680 ########################################
1681 # create new CGI for the MINE .db file #
1682 ########################################
1683 if ($mine == 1)
1684 {
1685 print "Writing MINE file $short_name.$mineend.db\n" if ($screendump == 1);
1686 open (ENTRY, ">$mine_dir/$short_name.$mineend.db") or die "Won't open $short_name.$mineend.db";
1687 my $entry = new CGI;
1688 $entry -> delete('keywords');
1689 $entry -> append('filename', "$short_name.$mineend.db");
1690 if ($remote_link)
1691 {
1692 $entry -> append('genome', "<a href=\"$remote_link$short_name\">$short_name</a>");
1693 }
1694 else
1695 {
1696 $entry -> append('genome', $short_name);
1697 }
1698 $entry -> append('analysis_type', $analysis_type);
1699 $entry -> append('specific_taxon', $prefix);
1700 $entry -> append('generic_taxon', $gentaxon);
1701 $entry -> append('organism', $organism);
1702 $entry -> append('strand', "$strand") unless ($isaaa == 1);
1703 $entry -> append('alphabet', $alphabet);
1704 $entry -> append('binomial', $binomial);
1705 $entry -> append('genus', $genus);
1706 $entry -> append('species', $species);
1707 $entry -> append('strain/subspecies', $sub_species);
1708 $entry -> append('common_name', $common_name);
1709 $entry -> append('definition', $def);
1710 $entry -> append('genome_length',$genome_len);
1711 $entry -> append('flank_length', $flank_size);
1712 $entry -> append('repeat_plus_flank', $repeat_plus_flank);
1713 $entry -> append('rep_start', $start);
1714 $entry -> append('rep_stop', $end);
1715 $entry -> append('repeat', "($match)$match_count");
1716 $entry -> append('motif', $match);
1717 $entry -> append('motif_revcom', $motif_revcom) unless ($isaaa == 1);
1718 $entry -> append('footprint', $match_len);
1719 $entry -> append('type', $motif_type);
1720 $entry -> append('dist_from_left', $checkleft_len);
1721 $entry -> append('pc_from_left', $checkleft_len_pc);
1722 $entry -> append('dist_from_right', $checkright_len);
1723 $entry -> append('pc_from_right', $checkright_len_pc);
1724 $entry -> append('total_nt_coding', $tempcoding) unless ($isaaa == 1);
1725 $entry -> append('percent_coding', $percentcode) unless ($isaaa == 1);
1726 # N.B. MINE's search.cgi can't handle values of 0 - this
1727 # feature should be set to "yes" or "no" for it to work.
1728 $entry -> append('coding_msat', $coding) unless ($isaaa == 1);
1729 $entry -> append('gene_name', $genename) unless ($isaaa == 1);
1730 $entry -> append('protein_name', $protname) unless ($isaaa == 1);
1731 $entry -> append('product_name', $prodname) unless ($isaaa == 1);
1732 $entry -> append('no_of_coding_regions', $cdsnum) unless ($isaaa == 1);
1733 $entry -> append('askew', $aat) unless ($isaaa == 1);
1734 $entry -> append('cskew', $cgc) unless ($isaaa == 1);
1735 $entry -> append('gc_content_genome', $CGgenome) unless ($isaaa == 1);
1736 $entry -> append('gc_content_flanks', $CGgen_flankseq) unless ($isaaa == 1);
1737 $entry -> append('gc_content_repeat', $CGgen_repeat) unless ($isaaa == 1);
1738 $entry -> append('Artemis_repeats', "$short_name.msat_tab");
1739 $entry -> append('Artemis_flank_repeats', "$short_name.flank_tab");
1740 $entry -> append('seq', $blaster);
1741 $entry -> save("ENTRY");
1742 close(ENTRY);
1743 }
1744 }
1745
1746 ##############
1747 # fasta file #
1748 ##############
1749
1750 # write stuff
1751 my $fout;
1752 if ($isaaa == 1)
1753 {
1754 my $cname = $common_name || "N/A";
1755 $fout = ">$short_name.$mineend, Name: $cname, Repeat: ($match)$match_count, start: $start, end: $end, thresholds: @thresholds, compliment=$reverse";
1756 #if ($gff)
1757 #{
1758 $gffout="$short_name.$mineend\tMsatfinder v.$version\trepeat_region\t$start\t$end\t.\t?\t.\tName=$cname;repeat=($match)$match_count;thresholds=@thresholds;compliment=$reverse";
1759 #}
1760 $fname = "$fasta_dir$short_name.$mineend";
1761 }
1762 else
1763 {
1764 $fout = ">$short_name.$mineend, Name: $common_name, Repeat: ($match)$match_count, start: $start, end: $end, thresholds: @thresholds, A:$base_a C:$base_c G:$base_g T:$base_t, coding=$coding, genename=$genename, proteinname=$protname, productname=$prodname, compliment=$reverse";
1765 #if ($gff)
1766 #{
1767 $gffout="$short_name.$mineend\tMsatfinder v.$version\trepeat_region\t$start\t$end\t.\t?\t.\tName=$common_name;repeat=($match)$match_count;thresholds=@thresholds;a=$base_a;t=$base_t;c=$base_c;g=$base_g;coding=$coding;genename=$genename;proteinname=$protname;productname=$prodname;compliment=$reverse";
1768 #}
1769 }
1770 my $fname = "$fasta_dir$short_name.$mineend.fasta";
1771 my $gffname = "$gff_dir$short_name.$mineend.fasta.gff";
1772 #if ($gff)
1773 #{
1774 # $fname.=".gff";
1775 $gffblaster="##FASTA\n>$short_name.$mineend\n$blaster";
1776 #}
1777 if ($fastafile == 1)
1778 {
1779 print "Opening fasta FILE $fname\n" if ($screendump == 1);
1780 open (FASTA, ">$fname") or die "Can't open your fasta file for writing: $!";
1781 print FASTA "$fout\n";
1782 print FASTA "$blaster\n";
1783 close FASTA;
1784 }
1785
1786 # send left and right flanks to eprimer
1787 if ($run_eprimer == 1)
1788 {
1789 if ($isaaa == 1)
1790 {
1791 print "Primer generation disabled for AA files!\n" if $screendump == 1;
1792 }
1793 else
1794 {
1795 &primers($blaster,$match_len,$short_name,length($left),$mineend);
1796 }
1797 }
1798
1799 # now print the .db data to a text CSV file
1800 # for general reference - "summary"
1801 if (($isformat == 0 or $isformat == 2 or $isformat == 3) and $isaaa == 0)
1802 {
1803 print OUTPUT_FILE3 "$short_name.$mineend".$dlm."$short_name".$dlm."$prefix".$dlm."$gentaxon".$dlm."$division".$dlm."$strand".$dlm."$alphabet".$dlm."$circular".$dlm."$gdate".$dlm."$binomial".$dlm."$genus".$dlm."$species".$dlm."$sub_species".$dlm."$specific_host".$dlm."$lab_host".$dlm."$db_xref".$dlm."$note".$dlm."$common_name".$dlm."$organism".$dlm."$def".$dlm."$genome_len".$dlm."$flank_size".$dlm."$repeat_plus_flank".$dlm."$start".$dlm."$end".$dlm."($match)$match_count".$dlm."$match".$dlm."$motif_revcom".$dlm."$match_len".$dlm."$match_count".$dlm."$checkleft_len".$dlm."$checkleft_len_pc".$dlm."$checkright_len".$dlm."$checkright_len_pc".$dlm."$motif_type".$dlm."$tempcoding".$dlm."$percentcode".$dlm."$coding".$dlm."$genename".$dlm."$protname".$dlm."$prodname".$dlm."$reverse".$dlm."$primercheck{$short_name}".$dlm."$CGgenome".$dlm."$CGgen_flankseq".$dlm."$CGgen_repeat".$dlm."$cdsnum\n" if ($sumswitch == 1);
1804 print OUTPUT_FILE8 "$short_name.$mineend\tMsatfinder.v.$version\trepeat_region\t$start\t$end\t.\t.\t.\tspecific_taxon=$prefix;generic_taxon=$gentaxon;division=$division;strand=$strand;alphabet=$alphabet;circular=$circular;submit_date=$gdate;binomial=$binomial;genus=$genus;species=$species;strain_or_subspecies=$sub_species;specific_host=$specific_host;lab_host=$lab_host;taxid=$db_xref;notes=$note;common_name=$common_name;organism=$organism;definition=$def;genome_length=$genome_len;flank_length=$flank_size;repeat_plus_flank=$repeat_plus_flank;repeat=($match)$match_count;motif=$match;motif_revcom=$motif_revcom;footprint=$match_len;repeat_units=$match_count;dist_from_left=$checkleft_len;pc_from_left=$checkleft_len_pc;dist_from_right=$checkright_len;pc_from_right=$checkright_len_pc;repeat_type=$motif_type;total_nt_coding=$tempcoding;percent_coding=$percentcode;coding_repeat=$coding;gene=$genename;protein=$protname;product=$prodname;reverse=$reverse;primers=$primercheck{$short_name};gc_content_genome=$CGgenome;gc_content_flank=$CGgen_flankseq;gc_content_repeat=$CGgen_repeat;no_of_coding_regions=$cdsnum\n" if ($sumswitch == 1);
1805 }
1806 if (($isformat == 0 or $isformat == 2 or $isformat == 3) and $isaaa == 1)
1807 {
1808 print OUTPUT_FILE3 "$short_name.$mineend".$dlm."$short_name".$dlm."$prefix".$dlm."$gentaxon".$dlm."$division".$dlm."$alphabet".$dlm."$circular".$dlm."$gdate".$dlm."$binomial".$dlm."$genus".$dlm."$species".$dlm."$sub_species".$dlm."$specific_host".$dlm."$lab_host".$dlm."$db_xref".$dlm."$note".$dlm."$common_name".$dlm."$genome_len".$dlm."$flank_size".$dlm."$repeat_plus_flank".$dlm."$start".$dlm."$end".$dlm."($match)$match_count".$dlm."$match".$dlm."$match_len".$dlm."$match_count".$dlm."$checkleft_len".$dlm."$checkleft_len_pc".$dlm."$checkright_len".$dlm."$checkright_len_pc".$dlm."$motif_type\n" if ($sumswitch == 1);
1809 print OUTPUT_FILE8 "$short_name.$mineend\tMsatfinder.v.$version\trepeat_region\t$start\túnd\t.\t.\t.\tshort_name=$short_name;specific_taxon=$prefix;generic_taxon=$gentaxon;division=$division;alphabet=$alphabet;circular=$circular;submit_date=$gdate;binomial=$binomial;genus=$genus;species=$species;strain_or_subspecies=$sub_species;specific_host=$specific_host;lab_host=$lab_host;taxid=$db_xref;notes=$note;common_name=$common_name;sequence_length=$genome_len;flank_length=$flank_size;repeat_plus_flank=$repeat_plus_flank;repeat=($match)$match_count;motif=$match;footprint=$match_len;repeat_units=$match_count;dist_from_left=$checkleft_len;pc_from_left=$checkleft_len_pc;dist_from_right=$checkright_len;pc_from_right=$checkright_len_pc;repeat_type=$motif_type\n";
1810 }
1811 if ($isformat == 1 and $isaaa == 0)
1812 {
1813 print OUTPUT_FILE3 "$short_name.$mineend".$dlm."$short_name".$dlm."$genome_len".$dlm."$flank_size".$dlm."$repeat_plus_flank".$dlm."$start".$dlm."$end".$dlm."($match)$match_count".$dlm."$match".$dlm."$motif_revcom".$dlm."$match_len".$dlm."$match_count".$dlm."$checkleft_len".$dlm."$checkleft_len_pc".$dlm."$checkright_len".$dlm."$checkright_len_pc".$dlm."$motif_type".$dlm."$primercheck{$short_name}".$dlm."$CGgenome".$dlm."$CGgen_flankseq".$dlm."$CGgen_repeat\n" if ($sumswitch == 1);
1814 print OUTPUT_FILE8 "$short_name.$mineend\tMsatfinder.v.$version\trepeat_region\t$start\t$end\t.\t.\t.\tshort_name=$short_name;genome_length=$genome_len;flank_length=$flank_size;repeat_plus_flank=$repeat_plus_flank;repeat=($match)$match_count;motif=$match;motif_revcom=$motif_revcom;footprint=$match_len;repeat_units=$match_count;dist_from_left=$checkleft_len;pc_from_left=$checkleft_len_pc;dist_from_right=$checkright_len;pc_from_right=$checkright_len_pc;repeat_type=$motif_type;repeat_type=$motif_type;primers=$primercheck{$short_name};gc_content_genome=$CGgenome;gc_content_flank=$CGgen_flankseq;gc_content_repeat=$CGgen_repeat\n";
1815 }
1816 if ($isformat == 1 and $isaaa == 1)
1817 {
1818 #unless ($gff)
1819 #{
1820 print OUTPUT_FILE3 "$short_name.$mineend".$dlm."$short_name".$dlm."$genome_len".$dlm."$flank_size".$dlm."$repeat_plus_flank".$dlm."$start".$dlm."$end".$dlm."($match)$match_count".$dlm."$match".$dlm."$match_len".$dlm."$match_count".$dlm."$checkleft_len".$dlm."$checkleft_len_pc".$dlm."$checkright_len".$dlm."$checkright_len_pc".$dlm."$motif_type\n" if ($sumswitch == 1);
1821 print OUTPUT_FILE8 "$short_name.$mineend\tMsatfinder.v.$version\trepeat_region\t$start\t$end\t.\t.\t.\tshort_name=$short_name;genome_length=$genome_len;flank_length=$flank_size;repeat_plus_flank=$repeat_plus_flank;repeat=($match)$match_count;motif=$match;footprint=$match_len;repeat_units=$match_count;dist_from_left=$checkleft_len;pc_from_left=$checkleft_len_pc;dist_from_right=$checkright_len;pc_from_right=$checkright_len_pc;repeat_type=$motif_type\n";
1822 #}
1823 }
1824
1825 # increment the count of the number of
1826 # unique genomes with repeats
1827 if ($ncseen{$short_name} == undef) { $totalreps++; }
1828 $ncseen{$short_name}=1;
1829 # this hash determines the number of repeats per genome
1830 # each key will be reset each time through this loop
1831 # resulting in the highest number being retained.
1832 $mineseen{$short_name}=$minenum;
1833
1834 # G+C content of the repeat
1835 $CGmatch = $whole_match =~ tr/cgCG/cgCG/;
1836
1837 # stop division by zero
1838 if ($match_len >0) {$CGmatch = ($CGmatch/$match_len)*100;}
1839
1840 # re-set reverse for each new repeat found
1841 $reverse = 0;
1842 } # end of MATCH
1843 # sort the motifs into order based on lenth then alphabet
1844 print "SN(0): @motifs_in_this_genome\n" if ($debug == 2);
1845 print "RL: $replength\n" if ($debug == 2);
1846 $motifs_by_genome{$short_name} = \@motifs_in_this_genome;
1847 $max_replength{$short_name} = $replength;
1848 } # end while $in
1849 } # end of ->next_seq()
1850 } # end of all the files in LISTFILE
1851 # SEARCHING COMPLETE
1852
1853 # note if no errors found
1854 unless ($printerror == 1)
1855 {
1856 print OUTPUT_FILE3 "No errors found\n";
1857 }
1858
1859
1860 # print out a list of all files, whether or
1861 # not they have repeats
1862 print "-----\n";
1863
1864 foreach my $line (@all_files2)
1865 {
1866 # extract number of repeats
1867 $line =~ /^(.+?)\|/;
1868 $file = $1;
1869 $numreps = $mineseen{$file};
1870 $repsum = $repseen{$file};
1871 $def = $defseen{$file};
1872 my $count_coding_reps = $pccodreps{$file};
1873 my $pccr;
1874 if ($numreps > 0 and $isformat != 2)
1875 {
1876 $pccr = 100 * ($count_coding_reps/$numreps);
1877 $pccr = sprintf("%.3f", $pccr);
1878 }
1879 else
1880 {
1881 $pccr = "NaN";
1882 }
1883
1884 # set a switch (0/1) if there are msats
1885 my $repyn;
1886 if ($numreps == 0) { $repyn = 0; }
1887 elsif ($numreps > 0) { $repyn = 1; }
1888
1889 # determine genome length
1890 my $genome_length = $genlenhash{$file};
1891 if ($genome_length > 0 && $repsum > 0)
1892 {
1893 $repsumpc = 100*($repsum/$genome_length);
1894 $repsumpc = sprintf("%.3f", $repsumpc);
1895 }
1896 else
1897 {
1898 $repsumpc = 0;
1899 }
1900 unless (defined $numreps) { $numreps = "0"; }
1901 print "numreps: $numreps, file: $file\n" if ($debug == 1);
1902 if ($isformat == 0 or $isformat == 2 or $isformat == 3)
1903 {
1904 print OUTPUT_FILE5 "$line$dlm$repsum$dlm$repsumpc$dlm$repyn$dlm$numreps$dlm$pccr\n";
1905 #my @split = split(/\|/,$line);
1906 #print OUTPUT_FILE7 "$split[0]\tMsatfinder.v.$version\tregion\t1\t$split[1]\t.\t.\t.\ttotal_rep_length=$repsum;pc_repeats=$repsumpc;msats=$repyn;no_of_msats=$numreps;percent_coding_msats=$pccr\n";
1907 }
1908 if ($isformat == 1)
1909 {
1910 print OUTPUT_FILE5 "$line$dlm$repsum$dlm$repsumpc$dlm$repyn$dlm$numreps\n";#removed $pccr from end, because you can't calculate percent_coding_msats from fasta file
1911 #my @split = split(/\|/,$line);
1912 # print OUTPUT_FILE7 "$split[0]\tMsatfinder.v.$version\tregion\t1\t$split[1]\t.\t.\t.\tgc_content=$split[2];a=$split[3];t=$split[4];g=$split[5];c=$split[6];askew=$split[7];cskew=$split[8];total_rep_length=$repsum;pc_repeats=$repsumpc;msats=$repyn;no_of_msats=$numreps\n";
1913 #print OUTPUT_FILE7 "$split[0]\tMsatfinder v.$version\tregion\t1\t$split[1]\t.\t?\t.\ttotal_rep_length=$repsum;pc_repeats=$repsumpc;msats=$repyn;no_of_msats=$numreps\n";
1914 }
1915 if ($debug == 1)
1916 {
1917 print "FILE: $file\n";
1918 print "RS: $repsum\n";
1919 print "DEF: $def\n";
1920 print "CCR: $count_coding_reps\n";
1921 print "NR: $numreps\n";
1922 print "PCCR: $pccr\n";
1923 print "-----\n";
1924 }
1925 }
1926
1927 # do a bit of summary to STDOUT
1928 if ($screendump ==1 )
1929 {
1930 print "\n\nA total of $count files surveyed\n";
1931 foreach $allfile (@all_files) { print "$allfile\n"; }
1932 if (defined $dodgy[0])
1933 {
1934 print "\n\nFiles with invalid GC content were found.\n";
1935 foreach $dodgy (@dodgy)
1936 {
1937 print OUTPUT_FILE1 "INVALID_GC|$dodgy\n";
1938 $no_errors=0;
1939 $printerror = 1;
1940 }
1941 print "\n\n";
1942 }
1943 }
1944
1945 #############################
1946 # provide handy index files #
1947 #############################
1948 # set up arrays/hashes for summation of all
1949 # values over all genomes - fill these with values
1950 # for single genomes, calculated in the next loop
1951 # and print them later
1952 my %overall_size_matrix=();
1953 my $overall_max_replength = 0;
1954 my %overall_repeat_counter =();
1955 my %rem_revcom = ();
1956 # determine the lowest threshold in use
1957 my @lowest_un;
1958 foreach my $mr (@motif_range)
1959 {
1960 push (@lowest_un, $thresholds[$mr]) if (defined $thresholds[$mr]);
1961 }
1962 my @lowest = sort { $a <=> $b } @lowest_un;
1963
1964 #####################################
1965 # print count files for each genome #
1966 #####################################
1967 foreach my $short_name (@short_names)
1968 {
1969 #next if ($non_divisible==1);
1970 print "SN(1): $short_name\n" if ($debug == 2);
1971 next unless ($has_msat{$short_name} == 1);
1972 print "SN(2): $short_name\n" if ($debug == 2);
1973 # get max replength
1974 $overall_max_replength = $max_replength{$short_name} if ($max_replength{$short_name} > $overall_max_replength);
1975
1976 # display by exact motif
1977 my $count_file = "$count_dir/$short_name.motif.count";
1978 open (COUNTFILE, ">$count_file") or die "Can't open the file \"$count_dir/$count_file\": $!";
1979 print COUNTFILE "units$dlm";
1980 foreach my $unmot ($motifs_by_genome{$short_name})
1981 {
1982 print "UN: @$unmot\n" if ($debug == 2);
1983 my @out = ();
1984 my @sorted = sort { length($a) <=> length($b) || $a cmp $b} @$unmot;
1985 foreach my $un (@sorted)
1986 {
1987 push (@out,$un);
1988 }
1989 my $out = join "$dlm", @out;
1990 $out =~ s/,$//;
1991 print COUNTFILE "$out\n";
1992 print "OUT: $out\n" if ($debug == 2);
1993 }
1994 for (my $k=$lowest[0];$k<=$max_replength{$short_name};$k++)
1995 {
1996 print COUNTFILE "$k$dlm";
1997 foreach my $unmot ($motifs_by_genome{$short_name})
1998 {
1999 my $count = 1;
2000 my @sorted = sort { length($a) <=> length($b) || $a cmp $b} @$unmot;
2001 foreach my $un (@sorted)
2002 {
2003 print COUNTFILE $repeat_counter{$short_name}{$un}[$k] || 0;
2004 print COUNTFILE "$dlm" unless $count == scalar(@$unmot);
2005 $overall_repeat_counter{$un}[$k] += $repeat_counter{$short_name}{$un}[$k];
2006 $count++;
2007 }
2008 }
2009 print COUNTFILE "\n";
2010 }
2011 close COUNTFILE;
2012 # by motif length
2013 # first, print file header
2014 $count_file = "$count_dir/$short_name.type.count";
2015 open (COUNTFILE, ">$count_file") or die "Can't open the file \"$count_dir/$count_file\": $!";
2016 print COUNTFILE "units$dlm";
2017 my $out = join "$dlm", @motif_range;
2018 $out =~ s/,$//;
2019 print COUNTFILE "$out\n";
2020 my %size_matrix=();
2021 # count up the number of each class of msat
2022 for (my $k=$lowest[0];$k<=$max_replength{$short_name};$k++)
2023 {
2024 foreach my $unmot ($motifs_by_genome{$short_name})
2025 {
2026 foreach my $un (@$unmot)
2027 {
2028 my $thing;
2029 if ($un =~ /-/) { $thing = [split(/-/,$un)]->[0]; }
2030 else { $thing = $un; }
2031 $size_matrix{length($thing)}[$k] += $repeat_counter{$short_name}{$un}[$k];
2032 $overall_size_matrix{length($thing)}[$k] += $repeat_counter{$short_name}{$un}[$k];
2033 }
2034 }
2035 }
2036 # print them to file
2037 for (my $k=$lowest[0];$k<=$max_replength{$short_name};$k++)
2038 {
2039 print COUNTFILE "$k$dlm";
2040 foreach my $i (@motif_range)
2041 {
2042 print COUNTFILE $size_matrix{$i}[$k] || 0;
2043 print COUNTFILE "$dlm" unless $i == @motif_range[-1];
2044 }
2045 print COUNTFILE "\n";
2046 }
2047 close COUNTFILE;
2048 }
2049
2050 #########################################################
2051 # print out summary files of numbers and types of msats #
2052 # found in all genomes #
2053 #########################################################
2054 # by motif length
2055 my $count_file = "$abs_path$prefix.type.count";
2056 open (COUNTFILE, ">$count_file") or die "Can't open the file \"$count_dir/$count_file\": $!";
2057 # print headers
2058 print COUNTFILE "units$dlm";
2059 my $out = join "$dlm", @motif_range;
2060 $out =~ s/,$//;
2061 print COUNTFILE "$out\n";
2062
2063 # new "do calculations"
2064 my %size_matrix=();
2065 # count up the number of each class of msat
2066 for (my $k=$lowest[0];$k<=$overall_max_replength;$k++)
2067 {
2068 foreach my $un (@all_motifs)
2069 {
2070 my $thing;
2071 if ($un =~ /-/) { $thing = [split(/-/,$un)]->[0]; }
2072 else { $thing = $un; }
2073 $size_matrix{length($thing)}[$k] += $repeat_counter{all}{$un}[$k];
2074 #$overall_size_matrix{length($thing)}[$k] += $repeat_counter{all}{$un}[$k];
2075 }
2076 }
2077 # print them to file
2078 for (my $k=$lowest[0];$k<=$overall_max_replength;$k++)
2079 {
2080 print COUNTFILE "$k$dlm";
2081 foreach my $i (@motif_range)
2082 {
2083 print COUNTFILE $size_matrix{$i}[$k] || 0;
2084 print COUNTFILE "$dlm" unless $i == @motif_range[-1];
2085 }
2086 print COUNTFILE "\n";
2087 }
2088 close COUNTFILE;
2089
2090 # genome totals, but by exact motif
2091 # use reverse compliment of motifs for this one (only),
2092 # to reduce size of the file
2093 my $count_file = "$abs_path$prefix.motif.count";
2094 open (COUNTFILE, ">$count_file") or die "Can't open the file \"$count_dir/$count_file\": $!";
2095 print COUNTFILE "units$dlm";
2096
2097 # print headers
2098 my @sorted = sort { length($a) <=> length($b) || $a cmp $b} @all_motifs;
2099 my %mseen=();
2100
2101 # prepare another hash with revcom'd motifs therein
2102 my %revcoms = ();
2103 my @word_keys = ();
2104 for (my $k=$lowest[0];$k<=$overall_max_replength;$k++)
2105 {
2106 foreach my $motif (@sorted)
2107 {
2108 my $key;
2109 if ($motif =~ /-/)
2110 {
2111 my @parts = split(/-/,$motif);
2112 my @reas;
2113 foreach my $part (@parts)
2114 {
2115 my $bit = join '', sort split '', $part;
2116 push(@reas,$bit);
2117 }
2118 $key = join '-', @reas;
2119 }
2120 else
2121 {
2122 $key = join '', sort split '', $motif;
2123 }
2124 push (@word_keys, $key) unless ($mseen{$key} == 1);;
2125 $revcoms{$key}[$k] += $overall_repeat_counter{$motif}[$k];
2126 $mseen{$key} = 1;
2127 }
2128 }
2129
2130 my $rcout = join "$dlm", @word_keys;
2131 $rcout =~ s/,$//;
2132 print COUNTFILE "$rcout\n";
2133
2134 # print values
2135 for (my $k=$lowest[0];$k<=$overall_max_replength;$k++)
2136 {
2137 my $count = 1;
2138 print COUNTFILE "$k$dlm";
2139 my @sorted = sort { length($a) <=> length($b) || $a cmp $b} @all_motifs;
2140 foreach my $un (@word_keys)
2141 {
2142 print COUNTFILE $revcoms{$un}[$k] || 0;
2143 print COUNTFILE "$dlm" unless $count == scalar(@word_keys);
2144 $count++;
2145 }
2146 print COUNTFILE "\n";
2147 }
2148 close COUNTFILE;
2149 # get total length of genomes searched
2150 my $totlen = 0;
2151 foreach my $length(@genome_len)
2152 {
2153 $totlen += $length;
2154 }
2155
2156 # do an index file with an overall summary.
2157 # count total msats by type
2158 my %totalcount = ();
2159 foreach my $i (@motif_range)
2160 {
2161 foreach my $num (@{$overall_size_matrix{$i}})
2162 {
2163 $totalcount{$i} += $num;
2164 }
2165 }
2166
2167 # count total msats by motif
2168 # compressing by revcom
2169 my %totalrevcom = ();
2170 my $numrevs = 0;
2171 foreach my $stuff (@sorted)
2172 {
2173 for (my $k=$lowest[0];$k<=$overall_max_replength;$k++)
2174 {
2175 my $key;
2176 if ($stuff =~ /-/)
2177 {
2178 $key = join '', sort split '', [split(/-/,$stuff)]->[0];
2179 }
2180 else
2181 {
2182 $key = join '', sort split '', $stuff;
2183 }
2184 $totalrevcom{$key} += $overall_repeat_counter{$stuff}[$k];
2185 }
2186 }
2187
2188 my $totfound = $numall || 0;
2189 my $totint = $numint || 0;
2190
2191 # the main summary file
2192 my %engine_list = (1 => 'regex',
2193 2 => 'multipass',
2194 3 => 'iterative',
2195 4 => 'sputnik',
2196 5 => 'dust',
2197 6 => 'seg');
2198 print OUTPUT_FILE2 <<EOF;
2199 ** msatfinder v. $version **
2200
2201 This analysis was run on $date
2202
2203 Engine used was = $engine_list{$engine}
2204 Threshold selection method = $threshold_method
2205 Total number of sequences surveyed = $total_ids
2206 Number of sequences containing repeats = $totalreps
2207 Total number of bp searched = $totlen
2208 Total number of microsatellites found = $totfound
2209 Total number of msats with primers = $primercount
2210 EOF
2211 if ($runird == 1)
2212 {
2213 print OUTPUT_FILE2 "Proportion of interrupted msats = $totint/$totfound\n\n";
2214 }
2215 else
2216 {
2217 print OUTPUT_FILE2 "\n";
2218 }
2219
2220 print OUTPUT_FILE2 "repeat_type\tthreshold\tnumber_found\n";
2221 foreach my $i (@motif_range)
2222 {
2223 print OUTPUT_FILE2 $i, "\t\t", $thresholds[$i-1], "\t\t", $totalcount{$i} || 0, "\n";
2224 }
2225 print OUTPUT_FILE2 "\n";
2226 unless ($non_divisible == 1)
2227 {
2228 print OUTPUT_FILE2 "motif\t\tnumber_found\n";
2229 foreach my $rco (@word_keys)
2230 {
2231 print OUTPUT_FILE2 $rco . "\t\t" . $totalrevcom{$rco}, "\n";
2232 }
2233 }
2234
2235 ####################################################################
2236 # print out a handy index file in order to assist users in finding #
2237 # their data, once the program is complete #
2238 ####################################################################
2239 my $data_index = "results.html";
2240 open (DI, ">$data_index") or die "Can't open $data_index: $!";
2241 print DI "<html>\n<head>\n</head><body>\n";
2242 print DI <<EOF;
2243 <p>Below is a listing of the various directories and files created by msatfinder, <i>viz.</i>
2244 <ul>
2245 <li><a href=#$repeat_dir><b>$repeat_dir</b></a> contains most of the summary information, including the overall summary, <a href="$repeat_dir/$prefix.index">$prefix.index</a>.
2246 <li><a href=#$count_dir><b>$count_dir</b></a> summary of msat types and motifs for each sequence run.
2247 <li><a href=#$fasta_dir><b>$fasta_dir</b></a> fasta files for each msat found.
2248 <li><a href=#$prime_dir><b>$prime_dir</b></a> primer files for each msat found.
2249 <li><a href=#$gff_dir><b>$gff_dir</b></a> gff3 format files. repeats.gff provides summary information, while flank_tab.gff and msat_tab.gff can be opened in Artemis.
2250 <li><a href=#$tab_dir><b>$tab_dir</b></a>, <b>$bigtab_dir</b> feature tables for viewing in artemis (with and without flanks, respectively).
2251 <li><a href=#$mine_dir><b>$mine_dir</b></a> MINE files - summaries of information on each msat, for use with <a href=\"http://mic.sgmjournals.org/cgi/reprint/147/2/249.pdf\">MINE</a>.
2252 </ul>
2253 EOF
2254 foreach my $dir ($repeat_dir,$count_dir,$fasta_dir,$prime_dir,$gff_dir,$tab_dir,$bigtab_dir,$mine_dir)
2255 {
2256 print DI "<p><a name=$dir><b>$dir</b></p>\n";
2257 my @files = glob "$dir*";
2258 foreach my $file (@files)
2259 {
2260 unless ($file =~ m/motif\.count/ && $non_divisible == 1 && $dir =~ m/Repeats/)
2261 {
2262 my $fileroot = [split("/",$file)]->[-1];
2263 print DI "<a href=\"$file\">$fileroot</a>\n";
2264 if ($file =~ m/.*\_tab[^\/]*$/ && $total_ids == 1)
2265 {
2266 my $baseURL = $cwd;
2267 $baseURL =~ s/\/var\/www\//http:\/\/www.genomics.ceh.ac.uk\//;
2268 my $baseURL2 = $baseURL;
2269 $baseURL2 =~ s/[^\/\d]*(\d*)\/$//; #for acess to within msatfinder.12321 from internet
2270 my $ID = $1;
2271 chop $baseURL2;
2272 print DI "&nbsp;&nbsp;&nbsp;&nbsp;<a href=\"$baseURL2/jnlp.php?ID=$ID&tab=$file\">Artemis</a>";
2273 }
2274 print DI "<br>\n";
2275
2276 }
2277
2278 }
2279 }
2280 print DI "</body>\n</html>";
2281 close DI;
2282
2283 ######################################
2284 # summarise primers in a single file #
2285 ######################################
2286 if ($run_eprimer == 1)
2287 {
2288 # N.B. Bio::Seq::PrimedSeq, Bio::Tools::Primer3 are not used because
2289 # they are rubbish. The latter is broken and the former returns results
2290 # as a messy jumble, not in order
2291 my $outfile = "$repeat_dir/primers.csv";
2292 open (OUT, ">$outfile") or warn "Can't open $outfile: $!";
2293 my @files = glob "$prime_dir*primers.txt";
2294 print OUT "msat,num,prod_size,f-primer,f-len,f-temp,f-gc,f-seq,r-primer,r-len,r-temp,r-gc,r-seq\n";
2295 foreach my $file (@files)
2296 {
2297 my $msat = $file;
2298 $msat =~ s/Primers\///;
2299 $msat =~ s/\.primers.txt$//;
2300 open (IN, "<$file") or warn "Can't open $file: $!";
2301 my ($num,$prod_size,$f_start,$f_len,$f_temp,$f_gc,$f_seq,$r_start,$r_len,$r_temp,$r_gc,$r_seq);
2302 while (my $line = <IN>)
2303 {
2304 next if ($line =~ /^#/ or $line =~ "");
2305 if ($line =~ /^\s+\d/)
2306 {
2307 my @stuff = split(/\s+/,$line);
2308 $prod_size = $stuff[4];
2309 $num = $stuff[1];
2310 }
2311 elsif ($line =~ /FORWARD/)
2312 {
2313 my @stuff = split(/\s+/,$line);
2314 $f_start = $stuff[3];
2315 $f_len = $stuff[4];
2316 $f_temp = $stuff[5];
2317 $f_gc = $stuff[6];
2318 $f_seq = $stuff[7];
2319 }
2320 elsif ($line =~ /REVERSE/)
2321 {
2322 my @stuff = split(/\s+/,$line);
2323 $r_start = $stuff[3];
2324 $r_len = $stuff[4];
2325 $r_temp = $stuff[5];
2326 $r_gc = $stuff[6];
2327 $r_seq = $stuff[7];
2328 print OUT "$msat,$num,$prod_size,$f_start,$f_len,$f_temp,$f_gc,$f_seq,$r_start,$r_len,$r_temp,$r_gc,$r_seq\n";
2329 }
2330 }
2331 close IN;
2332 }
2333 close OUT
2334 }
2335
2336 #########################################
2337 # Generate extra base files for
2338 # artemis if necessary
2339 #########################################
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355 ###########################################
2356 # Deal with any duplicated sequence names #
2357 ###########################################
2358 if (%duplicates != ())
2359 {
2360 my @duplicates;
2361 foreach my $key (sort(keys(%duplicates)))
2362 {
2363 if ($duplicates{$key} > 1)
2364 {
2365 push (@duplicates, $key);
2366 }
2367 }
2368 if (scalar @duplicates > 0)
2369 {
2370 print "A total of ".scalar @duplicates." sequence names were repeated two or more times. Msatfinder will reject any sequence whose name has been already used. The duplicated names are listed in the errors file.\n\n\n";
2371 foreach my $i (@duplicates)
2372 {
2373 print OUTPUT_FILE1 "REPEATED NAME|$i\n";
2374 $no_errors=0;
2375 }
2376 }
2377 }
2378
2379 if ($no_errors == 1)
2380 {
2381 print OUTPUT_FILE1 "No errors found\n";
2382 }
2383 #############
2384 # finished! #
2385 #############
2386 print "Msatfinder finished!\n";
2387
2388 ###########
2389 # send an email out with link, if necessary
2390 ###########
2391
2392
2393 if ($email)
2394 {
2395 my $ip = $config->{'CONTACT.ip'};
2396 my $dl = $config->{'CONTACT.download'};
2397 my $baseURL = $cwd;
2398 $baseURL =~ s/\/var\/www\//http:\/\/www.genomics.ceh.ac.uk\//;
2399 my $baseURL3 = $baseURL;
2400 my $baseURL2 = $cwd;
2401 $baseURL =~ s/[^\/\d]*(\d*)\/$//; #for acess to within msatfinder.12321 from internet
2402 my $id = $1;
2403 $baseURL2 =~ s/[^\/\d]*(\d*)\/$//; #abs path to data
2404 #my $result = `ls $baseURL2$ip.$id*`;
2405
2406 if ($dl eq 'tar')
2407 {
2408 $dl = 'tar.gz';
2409 }
2410 #system("chmod -R a+r $cwd") == 0 or die "Could not cd to output dir: $!, stopped";
2411 # tar/zip up the output
2412 my $suffix;
2413 print "$dl is dl\n";
2414 if ($dl eq "zip")
2415 {
2416 #$suffix = "zip";
2417 system("cd $baseURL2; zip -r $baseURL2$ip.$id.$dl msatfinder.$id > /dev/null 2>&1") == 0 or die "could not zip file: $!";
2418 #system("cd $tempdir; zip -r $absPATH/$tarip.$datestring.$suffix msatfinder.$datestring > /dev/null 2>&1") == 0 or die "could not zip file: $!";
2419 }
2420
2421 else #if ($dl eq "tar")
2422 {
2423 print "system(\"cd $baseURL2; tar cvf $baseURL2$ip.$id.tar $cwd > /dev/null 2>&1; gzip $ip.$id.tar\") == 0 or die \"Can't tar files: $!";
2424 system("cd $baseURL2; tar cvf $baseURL2$ip.$id.tar $cwd > /dev/null 2>&1; gzip $ip.$id.tar") == 0 or die "Can't tar files: $!\n";
2425 #system("cd $tempdir; tar cvf $absPATH/$tarip.$datestring.tar msatfinder.$datestring > /dev/null 2>&1; gzip $tarip.$datestring.tar") == 0 or die "Can't tar files: $!";
2426 }
2427
2428
2429 open(SENDMAIL, "|/usr/lib/sendmail -oi -t -odq") or die "Can't fork for sendmail: $!\n";
2430 print SENDMAIL "To: $email\n";
2431 #print OUTPUT_FILE2 "To: $email\n";
2432 print SENDMAIL "From: pswi\@ceh.ac.uk\n";
2433 #print OUTPUT_FILE2 "From: pswi\@ceh.ac.uk\n";
2434 print SENDMAIL "Subject: msatfinder results\n";
2435 print SENDMAIL "Your msatfinder results are complete\n\n";
2436 #print OUTPUT_FILE2 "Your msatfinder results are complete\n\n";
2437 #print OUTPUT_FILE2 "CWD : $cwd\n";
2438
2439 print SENDMAIL "Your results can be viewed here : ".$baseURL3."results.html\n\n";
2440 #print OUTPUT_FILE2 "baseUrl is $baseURL<br>";
2441 my $result = `ls $baseURL2$ip.$id*`;
2442 #print "result is $result\n";
2443 $baseURL2 =~ s/\/var\/www\//http:\/\/www.genomics.ceh.ac.uk\//;
2444 print SENDMAIL "or downloaded from here : $baseURL2$ip.$id.$dl";
2445
2446 #print OUTPUT_FILE2 "ls $listing<br>";
2447
2448 #Or viewed at : <a href=\"$baseURL/results.html\">$baseURL/results.html</a>\n\n";
2449 close SENDMAIL;
2450
2451
2452
2453 }
2454
2455
2456
2457 #############
2458 # FUNCTIONS #
2459 #############
2460
2461 # determine whether primers can be made from the
2462 # left and right flanking sequences
2463 #&primers($blaster,$match_len,$short_name,length($left),$mineend);
2464 sub primers
2465 {
2466 my $priseq = shift;
2467 my $mlen = shift;
2468 my $id = shift;
2469 my $left = shift;
2470 my $end = shift;
2471
2472 # make sure that primers include the repeat region
2473 my $start_include = $left -1;
2474 my @endings = split(/\./,$end);
2475 my $end_include = $left + (length($endings[1]) * $endings[2]) + 1;
2476
2477 # get primer data
2478 my $infile = "$prime_dir$id.$end.temp";
2479 open (WRITE, ">$infile") or die "Can't open $infile: $!";
2480 print WRITE $priseq;
2481 close WRITE;
2482 my $outfile = "$prime_dir$id.$end.primers.txt";
2483 if ($screendump == 1)
2484 {
2485 print "Determining PCR primers.\n";
2486 print "Outfile: $outfile\n";
2487 }
2488 system("$find{eprimer3} -sequence $infile -outfile $outfile -target $start_include,$end_include $eprimer_args > /dev/null 2>&1") == 0 or warn "Could not run \"$find{eprimer3} -sequence $infile -outfile $outfile -target $start_include,$end_include $eprimer_args\"; $!";
2489 if ($screendump == 1)
2490 {
2491 print "Running command: $find{eprimer3} -sequence $infile -outfile $outfile -target $start_include,$end_include $eprimer_args\n";
2492 print "Unlinking file: $infile\n"
2493 }
2494 unlink $infile unless ($debug == 1);
2495 $primercount++;
2496 $primercheck{$id} = 1;
2497 # remove output files if no primers were found, unless the user wants to know why
2498 unless ($eprimer_args =~ /pickanyway/ and $eprimer_args =~ /explain/)
2499 {
2500 if (-e $outfile)
2501 {
2502 open (CHECK, "<$outfile") or die "Error checking eprimer output $outfile: $!";
2503 my @lines = <CHECK>;
2504 close CHECK;
2505 unless (grep /FORWARD/, @lines)
2506 {
2507 unlink $outfile;
2508 print "No valid primers found for $id\n" if ($screendump == 1);
2509 $primercount--;
2510 $primercheck{$id} = 0;
2511 }
2512 }
2513 }
2514 }
2515
2516 #########################################
2517 # find microsatellites by various means #
2518 #########################################
2519 # there are three means available:
2520 # 1. Iterate through the sequence once, trying to build the longest msat possible.
2521 # 2. Go through once with a powerful regular expression, and post-filter.
2522 # 3. Go through scalar $motif_range times, finding all msats.
2523 sub findmotifs_iterate
2524 {
2525 no warnings;
2526 my $sequence = shift;
2527 my $whirley = shift;
2528 my $screendump = shift;
2529 my $genome = shift;
2530 my @result = ();
2531 my $iterate = 0;
2532 CREEP: while ($iterate < (length($sequence) -1)) # position within sequence
2533 {
2534 print STDERR "Please wait: ", $whirley->(), "\r" if ($screendump == 1);
2535 print "IT: $iterate\n" if ($debug == 2);
2536 CHECK: foreach my $i (@motif_range) # mono,di,tri &c.
2537 {
2538 print "I: $i, ", $motif_range[-1], "\n" if ($debug == 2);
2539 my $units = 1; # number of repeat units of msat
2540 my $first = substr($sequence,$iterate,$i);
2541 # attempt to extend the match...
2542 while (1)
2543 {
2544 my $offset = $iterate + ($units*$i);
2545 my $subsequent = substr($sequence,$offset,$i);
2546 print "Comp(1): $first, $subsequent, $i\n" if ($debug == 2);
2547 if ($first ne $subsequent)
2548 {
2549 print "Comp(2): $first, $subsequent, $i\n" if ($debug == 2);
2550 if ($units >= $thresholds[$i-1])
2551 {
2552 # attempt to reject false matches here, e.g. a tri picked up
2553 # as hexa or nona, &c.
2554 my $footprint = $units * $i;
2555 if (&splitmotif($first) == 1) # reject this - it's too small
2556 {
2557 $iterate = $iterate + $footprint;
2558 next CREEP;
2559 }
2560 else # keep this one, as it is a good match after all!
2561 {
2562 print "Match($units): ", "$first." x $units, "\n" if ($debug == 2);
2563 my $name = "$genome." . ($iterate+1) . ".$first." . ($footprint/length($first));
2564 #push(@result,{motif=>$first,footprint=>$footprint,mpos=>($iterate+1)});
2565 push (@result,$name);
2566 $iterate = $iterate + $footprint;
2567 next CREEP;
2568 }
2569 }
2570 elsif ($i < $motif_range[-1])
2571 {
2572 next CHECK;
2573 }
2574 else
2575 {
2576 $iterate++;
2577 next CREEP;
2578 }
2579 }
2580 $units++;
2581 }
2582 }
2583 }
2584 return \@result;
2585 }
2586 # thanks to T. Booth for this one
2587 sub findmotifs_regex
2588 {
2589 # V3 with speed of teh true ninja
2590 my $sequence = shift;
2591 my $whirley = shift;
2592 my $screendump = shift;
2593 my $genome = shift;
2594 my @result = ();
2595
2596 #Internal stuff
2597 #savepos must be declared with 'our' to see it within the regex.
2598 our $savepos;
2599 # Pre-generate main regex for even more speed:
2600 our ($mr, $regexes1);
2601 if(!$mr)
2602 {
2603 #Setup environment...
2604 $mr = $minrep - 2;
2605 $regexes1 = qr/((.{1,$maxmotiflen}?)(?>\2)\2{$mr,})/;
2606 }
2607
2608 #Use a regex to spot candidate repeats
2609 #This will favour short motifs over long ones
2610 while ($sequence =~ /$regexes1/g)
2611 {
2612 my $motif = $2;
2613 my $motiflen = length($2);
2614
2615 my $footprint = length($1);
2616 $savepos = pos($sequence);
2617 my $mpos = $savepos - $footprint;
2618 next unless ($thresholds[$motiflen - 1] =~ /\d+/);
2619
2620 #We have a candidate but: consider aaaaataaaaataaaaataaaaataaaaat
2621 # where the footprint of aaaaa (5) is shorter than the longest motif (6).
2622 #
2623 # Design decision : if thresholds permit motif longer than shortest footprint
2624 # then bail out - see above - that sorts out the ambiguity and takes us to the next case.
2625 #
2626 # Also - consider aaaaataataataataatg
2627 # Do we want a(5) and taa(4) or aat(5) ?
2628 # Depends if a(5) is within the threshold.
2629 #
2630 # If threshold check fails re-scan from $pos with new $minmotiflen
2631 # and look for a better candidate starting within the original footprint.
2632
2633 while( $footprint < $thresholds[$motiflen - 1] * $motiflen )
2634 {
2635 print STDERR "Please wait: ", $whirley->(), "\r" if ($screendump == 1);
2636 $motif = undef;
2637 pos($sequence) = $mpos;
2638 my $minmotiflen = $motiflen + 1;
2639 last if $minmotiflen > $maxmotiflen;
2640
2641 # This was really slow for big sequences because it scans the whole of the rest
2642 # of the sequence. In fact we only need to look up to savepos
2643 # $sequence =~ /$regexes[$minmotiflen]/g;
2644 #
2645 #
2646 # last if !pos($sequence); # No match at all
2647 #
2648 # $mpos = pos($sequence) - length($1);
2649 # last if $mpos >= $savepos; # Match later in sequence - we'll come back to it.
2650
2651 #Alternative uses a hook into the regex engine:
2652 use re 'eval';
2653 $sequence =~ /( (.{$minmotiflen,$maxmotiflen}?)
2654 (?(?{$-[2]<$savepos})(?:(?>\2)\2{$mr,})|(.) )
2655 )/gx;
2656
2657 #Give up if we hit the end or go past $savepos and therefore matched one char
2658 last if $3;
2659 last unless pos($sequence);
2660
2661 $mpos = pos($sequence) - length($1);
2662 $motif = $2;
2663 $motiflen = length($2);
2664 $footprint = length($1);
2665 }
2666
2667 # So now we have 2 cases:
2668 # 1) $motif is set - add to array
2669 # 2) $motif is undef : reset pos to $savepos and continue
2670 if($motif)
2671 {
2672 if (&splitmotif($motif) == 1)
2673 {
2674 pos($sequence) = $savepos;
2675 next;
2676 }
2677 my $name = "$genome." . ($mpos+1) . ".$motif." . ($footprint/$motiflen);
2678 # the "if grep" is a crude hack to get around the fact that the pregenerated
2679 # regexps will find things you don't want if you have set, for example,
2680 # motif types of 1,2,4,5,6. This way they don't appear in the output.
2681 push (@result, $name) if (grep /$motiflen/, @motif_range);
2682 }
2683 else
2684 {
2685 pos($sequence) = $savepos;
2686 }
2687 } # end main loop
2688 return \@result;
2689 }
2690
2691 sub findmotifs_sputnik
2692 {
2693
2694 my $sequence = shift;
2695 my $whirley = shift;
2696 my $screendump = shift;
2697 my $genome = shift;
2698 my @result = ();
2699 $non_divisible = 1; #let everything know that the output will be sputnikstyle - no motif and no repeat number, just stop
2700 #files
2701 my $sputfile_in='sputnik.in';
2702 my $sputfile_out='sputnik.out';
2703
2704 #convert to fasta format
2705 my $sputnik_in = ">$genome\n";
2706 my $temp = $sequence;
2707 while (length($temp) >= 70)
2708 {
2709 my $substring = substr($temp, 0, 70, "");
2710 $sputnik_in .= "$substring\n";
2711 }
2712 $sputnik_in .= "$temp";
2713 my $length = length($sputnik_in);
2714 open (SPUTNIK, ">$cwd$sputfile_in") || warn "can't open file : $!<br>";#die "can't open file: $!";
2715 print SPUTNIK "$sputnik_in\n";
2716 close SPUTNIK;
2717 system("$msat_dir/sputnik $cwd$sputfile_in>$cwd$sputfile_out");
2718 open (SPUTNIK_OUT, "$cwd$sputfile_out") || die "can't open file: $!";
2719 my @sputnik_results = <SPUTNIK_OUT>;
2720 close SPUTNIK_OUT;
2721 return &sputnik_reader(\@sputnik_results, $genome);
2722 }
2723
2724 sub sputnik_reader
2725 {
2726 my @input = @{shift @_};
2727 my $genome = shift;
2728 my %values = (mononucleotide => 1, dinucleotide => 2, trinucleotide => 3, tetranucleotide => 4, pentanucleotide => 5, hexanucleotide => 6);
2729 my $i=0;
2730 my @results;
2731 my $within=0;
2732 my ($start,$footprint,$motif_length,$motif);
2733 while ($input[$i])
2734 {
2735 if ($input[$i] =~ m/>/)
2736 {
2737 $i++;
2738 next;
2739 }
2740 if ($within == 1)
2741 {
2742 my $motif = lc substr ($input[$i], 0, $motif_length);
2743 my $repeat_number = $footprint / $motif_length;
2744 $repeat_number =~ s/\..*//;
2745 if (grep /$motif_length/, @motif_range)
2746 {
2747 if ($repeat_number > $thresholds[$motif_length-1])
2748 {
2749 $motif = "z" x $motif_length ;
2750 push (@results, "$genome.$start.$motif.$footprint") ;
2751 }
2752 }
2753 $within=0;
2754 }
2755 if ($input[$i] =~ m/(.*nucleotide) (\d*) : \d* -- length (\d*) score \d*/)
2756 {
2757 $motif_length = $values{$1};
2758 $start = $2;
2759 $footprint = $3;
2760 $within=1;
2761 }
2762 $i++;
2763 }
2764 return \@results;
2765 }
2766
2767 sub findmotifs_dust
2768 {
2769 my $sequence = shift;
2770 my $whirley = shift;
2771 my $screendump = shift;
2772 my $genome = shift;
2773 my @result = ();
2774 $non_divisible = 1; #let everything know that the output will be sputnikstyle - no motif and no repeat number, just stop
2775 my $sputfile_out='dust.out';
2776
2777 my $temp=$cwd;
2778 $temp =~ s/(\/$)/\/Fasta\//;
2779 system("dust $fasta_file>$temp$sputfile_out");
2780 open (DUST, "$temp$sputfile_out") || die "can't open file: $!";
2781 my @sputnik_results = <DUST>;
2782 close DUST;
2783 shift @sputnik_results;
2784 my $full_seq = join ('', @sputnik_results);
2785 $full_seq =~ s/\s//g;
2786 my @dust_results;
2787 while ($full_seq =~ /(N{2,})/g)
2788 {
2789 my $footprint = length($1);
2790 my $savepos = pos($full_seq);
2791 my $motiflen=1;
2792 my $motif="N";
2793 my $mpos = pos($full_seq) - length($1);
2794 my $name = "$genome." . ($mpos+1) . ".$motif." . ($footprint/$motiflen);
2795 push (@dust_results, $name);
2796 }
2797 return \@dust_results;
2798 }
2799
2800 sub findmotifs_seg
2801 {
2802 my $sequence = shift;
2803 my $whirley = shift;
2804 my $screendump = shift;
2805 my $genome = shift;
2806 my @result = ();
2807 $non_divisible = 1; #let everything know that the output will be sputnikstyle - no motif and no repeat number, just stop
2808 my $segfile_out="$genome.seg.out";
2809
2810 my $temp=$cwd;
2811 $temp =~ s/(\/$)/\/Fasta\//;
2812 my $seg_prog = "/usr/local/bioinf/wublast/filter/seg";
2813 my $segwindow = $config->{'CONTACT.segwindow'};
2814 my $seglocut = $config->{'CONTACT.seglocut'};
2815 my $seghighcut = $config->{'CONTACT.seghighcut'};
2816 system("$seg_prog $fasta_file $segwindow $seglocut $seghighcut>$temp$segfile_out");
2817 open (SEG, "$temp$segfile_out") || die "can't open file: $!";
2818 my @file_lines = <SEG>;
2819 my $size1 = scalar @file_lines;
2820
2821 close SEG;
2822 shift @file_lines;
2823 my @seg_results;
2824 foreach my $line (@file_lines)
2825 {
2826 $line =~ m/\s*\w+\s*(\d*)-(\d*)[^\w]*$/;
2827 if ($1 && $2)
2828 {
2829 my $start=$1;
2830 my $end=$2;
2831 my $motiflen=1;
2832 my $footprint=$2-$1+1; #is that correct?
2833 my $motif="X";
2834 my $name="$genome.$start.$motif.$footprint";
2835 # print "$name\n";
2836 push (@seg_results, $name);
2837 }
2838 }
2839 my $size2 = scalar @seg_results;
2840 #print "size of file $size1, size of results $size2\n";
2841 return \@seg_results;
2842 }
2843
2844 sub findmotifs_multipass
2845 {
2846 # useful in that it finds overlapping msats - each pass through
2847 # the sequence pays no heed to the result of previous passes, so
2848 # results may differ from the previous two methods
2849 my $sequence = shift;
2850 my $whirley = shift;
2851 my $screendump = shift;
2852 my $genome = shift;
2853 my @result = ();
2854 foreach my $i (@motif_range)
2855 {
2856 my $pattern = "." x $i;
2857 my $thresh = ($thresholds[$i-1] - 1);
2858 while ($sequence =~ /(($pattern)\2{$thresh,})/g)
2859 {
2860 print STDERR "Please wait: ", $whirley->(), "\r" if ($screendump == 1);
2861 my $whole_match = $1;
2862 my $match = $2 if (defined $2);
2863 if (&splitmotif($match) == 1) { next; } # reject this - it's too small
2864 else
2865 {
2866 my $name = "$genome." . (1 + pos($sequence)-length($whole_match)) . ".$match." . (length($whole_match)/length($match));
2867 push (@result, $name);
2868 #push(@result,{motif=>$match,footprint=>length($whole_match),mpos=>(1 + pos($sequence)-length($whole_match))});
2869 }
2870 }
2871 }
2872 return \@result;
2873 }
2874
2875 # add arrays
2876 sub addarrays
2877 {
2878 my @res = (); my ($ar, $i);
2879 for $ar (@_) { for($i = 0; $i < @$ar; $i++) { $res[$i] += $ar->[$i] if $ar->[$i] } };
2880 return @res;
2881 }
2882
2883 # print output files
2884 sub fileheaders
2885 {
2886 print OUTPUT_FILE8 "##gff-version 3\n";
2887 # print OUTPUT_FILE7 "##gff-version 3\n";
2888 my $isformat = shift;
2889 my $isaaa = shift;
2890 # Genbank &c. file
2891 if (($isformat == 0 or $isformat == 2 or $isformat == 3) and $isaaa == 0)
2892 {
2893 print OUTPUT_FILE3 "repeat".$dlm."sequence_identifier".$dlm."specific_taxon".$dlm."generic_taxon".$dlm."division".$dlm."strand".$dlm."alphabet".$dlm."circular".$dlm."submit_date".$dlm."binomial".$dlm."genus".$dlm."species".$dlm."strain_or_subspecies".$dlm."specific_host".$dlm."lab_host".$dlm."taxid".$dlm."notes".$dlm."common_name".$dlm."organism".$dlm."definition".$dlm."genome_length".$dlm."flank_length".$dlm."repeat_plus_flank".$dlm."rep_start".$dlm."rep_stop".$dlm."repeat".$dlm."motif".$dlm."motif_revcom".$dlm."footprint".$dlm."repeat_units".$dlm."dist_from_left".$dlm."pc_from_left".$dlm."dist_from_right".$dlm."pc_from_right".$dlm."repeat_type".$dlm."total_nt_coding".$dlm."percent_coding".$dlm."coding_repeat".$dlm."gene".$dlm."protein".$dlm."product".$dlm."reverse".$dlm."primers".$dlm."gc_content_genome".$dlm."gc_content_flank".$dlm."gc_content_repeat".$dlm."no_of_coding_regions\n";
2894 print OUTPUT_FILE5 "genome".$dlm."specific_taxon".$dlm."generic_taxon".$dlm."division".$dlm."strand".$dlm."alphabet".$dlm."circular".$dlm."submit_date".$dlm."binomial".$dlm."genus".$dlm."species".$dlm."strain_or_subspecies".$dlm."specific_host".$dlm."lab_host".$dlm."taxid".$dlm."notes".$dlm."common_name".$dlm."organism".$dlm."definition".$dlm."genome_length".$dlm."total_nt_coding".$dlm."percent_coding".$dlm."gc_content".$dlm."no_of_coding_regions".$dlm."a".$dlm."t".$dlm."g".$dlm."c".$dlm."askew".$dlm."cskew".$dlm."total_rep_length".$dlm."pc_repeats".$dlm."msats".$dlm."no_of_msats".$dlm."percent_coding_msats\n";
2895 }
2896 if (($isformat == 0 or $isformat == 2 or $isformat == 3) and $isaaa == 1)
2897 {
2898 print OUTPUT_FILE3 "repeat".$dlm."sequence_identifier".$dlm."specific_taxon".$dlm."generic_taxon".$dlm."division".$dlm."alphabet".$dlm."circular".$dlm."submit_date".$dlm."binomial".$dlm."genus".$dlm."species".$dlm."strain_or_subspecies".$dlm."specific_host".$dlm."lab_host".$dlm."taxid".$dlm."notes".$dlm."common_name".$dlm."sequence_length".$dlm."flank_length".$dlm."repeat_plus_flank".$dlm."rep_start".$dlm."rep_stop".$dlm."repeat".$dlm."motif".$dlm."footprint".$dlm."repeat_units".$dlm."dist_from_left".$dlm."pc_from_left".$dlm."dist_from_right".$dlm."pc_from_right".$dlm."repeat_type\n";
2899 print OUTPUT_FILE5 "genome".$dlm."specific_taxon".$dlm."generic_taxon".$dlm."division".$dlm."alphabet".$dlm."circular".$dlm."submit_date".$dlm."binomial".$dlm."genus".$dlm."species".$dlm."strain_or_subspecies".$dlm."specific_host".$dlm."lab_host".$dlm."taxid".$dlm."notes".$dlm."common_name".$dlm."sequence_length".$dlm."total_rep_length".$dlm."pc_repeats".$dlm."msats".$dlm."no_of_msats".$dlm."percent_coding_msats\n";
2900 }
2901 # fasta file
2902 if ($isformat == 1 and $isaaa == 0)
2903 {
2904 print OUTPUT_FILE3 "repeat".$dlm."sequence_identifier".$dlm."genome_length".$dlm."flank_length".$dlm."repeat_plus_flank".$dlm."rep_start".$dlm."rep_stop".$dlm."repeat".$dlm."motif".$dlm."motif_revcom".$dlm."footprint".$dlm."repeat_units".$dlm."dist_from_left".$dlm."pc_from_left".$dlm."dist_from_right".$dlm."pc_from_right".$dlm."repeat_type".$dlm."primers".$dlm."gc_content_genome".$dlm."gc_content_flank".$dlm."gc_content_repeat\n";
2905 print OUTPUT_FILE5 "genome".$dlm."genome_length".$dlm."gc_content".$dlm."A".$dlm."T".$dlm."G".$dlm."C".$dlm."askew".$dlm."cskew".$dlm."total_rep_length".$dlm."pc_repeats".$dlm."msats".$dlm."no_of_msats\n";
2906 }
2907 # fasta file (amino acid)
2908 if ($isformat == 1 and $isaaa == 1)
2909 {
2910 print OUTPUT_FILE3 "repeat".$dlm."sequence_identifier".$dlm."genome_length".$dlm."flank_length".$dlm."repeat_plus_flank".$dlm."rep_start".$dlm."rep_stop".$dlm."repeat".$dlm."motif".$dlm."footprint".$dlm."repeat_units".$dlm."dist_from_left".$dlm."pc_from_left".$dlm."dist_from_right".$dlm."pc_from_right".$dlm."repeat_type\n";
2911 print OUTPUT_FILE5 "genome".$dlm."genome_length".$dlm."total_rep_length".$dlm."pc_repeats".$dlm."msats".$dlm."no_of_msats\n";
2912 }
2913 }
2914
2915 # determine if msat can be
2916 # split into smaller motifs
2917 sub splitmotif # run only for engine 2?
2918 {
2919 my $motif = shift;
2920 my $length = length $motif;
2921 print "Motif: $motif\n" if ($debug == 1);
2922 for (my $i=1;$i<$length;$i++)
2923 {
2924 # split motif into array of items of
2925 # length $i
2926 my @stuff = ();
2927 my $pos = 0;
2928 while ($pos < $length)
2929 {
2930 push(@stuff,substr($motif,$pos,$i));
2931 $pos += $i;
2932 }
2933 # see if all items in array match...
2934 if (scalar @stuff > 1) # ...unless there's only one item, or none.
2935 {
2936 my @matching = grep { /^$stuff[0]$/ } @stuff;
2937 if (scalar @stuff == scalar @matching)
2938 {
2939 return 1;
2940 }
2941 }
2942 }
2943 return 0;
2944 }
2945
2946 #####################################
2947 # perform various size calculations #
2948 #####################################
2949
2950 # add msat + flanking regions together #
2951 sub addregion
2952 {
2953 my ($start,$end) = @_;
2954 if ($start > $end)
2955 {
2956 ($start,$end) = ($end,$start);
2957 }
2958 my $done = 0;
2959 my $k;
2960 my $tempcoding = 0;
2961
2962 for ($k=0;$k<@nreg;$k++)
2963 {
2964 # new region starts after old region ends
2965 next if ($nreg[$k][1] < $start);
2966
2967 # new region ends before old region starts
2968 if ($end < $nreg[$k][0])
2969 {
2970 splice(@nreg,$k,0,[$start,$end]);
2971 $done = 1;
2972 last;
2973 }
2974
2975 # regions overlap
2976 $nreg[$k][0] = $start < $nreg[$k][0] ? $start : $nreg[$k][0];
2977 $nreg[$k][1] = $end > $nreg[$k][1] ? $end : $nreg[$k][1];
2978 $done = 1;
2979
2980 # Combine two regions if the new section has brough about an overlap
2981 while ($k < $#nreg and $nreg[$k][1] > $nreg[$k+1][0])
2982 {
2983 $nreg[$k][1] = $nreg[$k+1][1];
2984 splice(@nreg,$k+1,1);
2985 }
2986 last;
2987 }
2988 unless ($done)
2989 {
2990 push @nreg,[$start,$end];
2991 }
2992 }
2993
2994 # determine if region is within the defined area
2995 sub isitin
2996 {
2997 my ($what) = @_;
2998 my $within = 0;
2999 my $k;
3000 for ($k=0; $k<@nreg; $k++)
3001 {
3002 if ($what >= $nreg[$k][0] and $what <= $nreg[$k][1])
3003 {
3004 return 1;
3005 }
3006 }
3007 return 0;
3008 }
3009
3010 # return length of coding regions
3011 sub howlong
3012 {
3013 my $length;
3014 for (my $k=0; $k<@nreg; $k++)
3015 {
3016 $length += ($nreg[$k][1] - $nreg[$k][0]);
3017 }
3018 return $length;
3019 }
3020
3021 ###############################
3022 # depchecks, config files &c. #
3023 ###############################
3024 # read config file
3025 sub getconfig
3026 {
3027 my $cwd = shift;
3028 my $arg = shift;
3029 my $config_file="msatfinder.rc";
3030 my %config;
3031 if (-R "$cwd/$config_file")
3032 {
3033 Config::Simple->import_from("$cwd/$config_file", \%config);
3034 print "Using config file in $cwd\n" unless ($0 =~ /viewer/ or $> == 48); # or 81...
3035 return \%config;
3036 }
3037 elsif ($arg =~ /-h/ or $arg =~ /--help/)
3038 {
3039 return \%config;
3040 }
3041 else
3042 {
3043 print "No configuration file was found. Please make sure you have a copy of msatfinder.rc in $cwd, and try again.\n";
3044 print "If msatfinder was installed as a Bio-Linux package, you'll find the file you need (msatfinder.rc) in /usr/local/bioinf/msatfinder/\n";
3045 print "Or, if you installed from an ebuild, it will be in /usr/share/doc/msatfinder-$version\n";
3046 print "Exiting.\n";
3047 exit;
3048 }
3049 }
3050
3051 # check dependencies
3052 sub depcheck
3053 {
3054 my $find = shift;
3055 my $screendump = shift;
3056 my @missing;
3057 foreach my $dep (@_)
3058 {
3059 if (-x $find->{$dep})
3060 {
3061 print "Dependency $dep found.\n" if ($screendump == 1);
3062 }
3063 else # at least try to find it even if the confit is wrong
3064 {
3065 my $search = `which $dep 2> /dev/null`;
3066 chomp($search);
3067 if ($search =~ /aliased to (\.+)/)
3068 {
3069 $find->{$dep} = $1;
3070 print "$dep found, but not where your config file says it should be. Continuing...\n" if ($screendump == 1);
3071 }
3072 elsif (-x $search)
3073 {
3074 $find->{$dep} = $search;
3075 print "$dep found, but not where your config file says it should be. Continuing...\n" if ($screendump == 1);
3076 }
3077 else
3078 {
3079 push(@missing, $dep);
3080 }
3081 }
3082 }
3083 if (@missing)
3084 {
3085 print "The following dependencies are not found: @missing\n";
3086 print "Please add the full path to each dependency to the configuration file and try again.\n";
3087 exit;
3088 }
3089 }
3090
3091
3092
3093 #################################################
3094 # IRD - the Interrupted Repeat Detector routine #
3095 #################################################
3096 sub ird
3097 {
3098 # create an msat object for each line of info
3099 # store objects in a hash, and also relate names
3100 # of msats to genomes for printing later.
3101 my @incoming = @_;
3102 my @allmsats = ();
3103 my @interrupts = ();
3104 my %msats = ();
3105 my %linked = ();
3106 my %seen = ();
3107 my %root = ();
3108 my %components = ();
3109
3110 # hashes that store the results lines
3111 my %revname = ();
3112 my %revdist = ();
3113
3114 # order msats within the genome,
3115 # both forward and reverse
3116 my @msats = sort { [split(/\./,$a)]->[1] <=> [split(/\./,$b)]->[1] } @incoming;
3117 my ($old,$oend);
3118 foreach my $msat (@msats)
3119 {
3120 my @parts = split(/\./, $msat);
3121 my $loc = $parts[1];
3122 my $fp = length($parts[2]) * $parts[3];
3123 my $end = $loc + $fp;
3124 my $dist = $loc - $oend;
3125 $revname{$msat} = $old;
3126 $revdist{$msat} = $dist unless ($old eq "");
3127 $old = $msat;
3128 $oend = $end;
3129 }
3130
3131 # determine which are interrupted msats
3132 my $om;
3133 foreach my $msat (@msats)
3134 {
3135 my @parts = split(/\./, $msat);
3136 my $loc = $parts[1];
3137 my $ml = length($parts[2]);
3138 my $fp = $ml * $parts[3];
3139 my $last = $revname{$msat};
3140 my $dist = $revdist{$msat};
3141 my $ol = length([split(/\./,$om)]->[2]);
3142 # fits criteria
3143 if ($dist <= $fp and $ml == $ol)
3144 {
3145 if (defined $root{$om})
3146 {
3147 $root{$msat} = $root{$om};
3148 push(@{$components{$root{$om}}}, $msat);
3149 push (@interrupts, $msat);
3150 }
3151 else
3152 {
3153 $root{$msat} = $om;
3154 $root{$om} = $om;
3155 push(@{$components{$om}}, $msat);
3156 push(@{$components{$om}}, $om);
3157 push (@interrupts, $msat);
3158 push (@interrupts, $om);
3159 }
3160 }
3161 $om = $msat;
3162 }
3163
3164 # create separate array of non-interrupted msats,
3165 # and interrupted ones
3166 my %intseen = ();
3167 my @noninterrupts = ();
3168 foreach my $item (@interrupts) { $intseen{$item} = 1; }
3169 foreach my $item (@msats)
3170 {
3171 unless ($intseen{$item})
3172 {
3173 push(@noninterrupts, $item);
3174 }
3175 }
3176
3177 # go through array of interrupted msats, and combine each
3178 # set into one single msat
3179 my @joined = ();
3180 foreach my $msat (@interrupts)
3181 {
3182 if (defined $components{$msat})
3183 {
3184 my @reorder = sort {[split(/\./,$a)]->[1] <=> [split(/\./,$b)]->[1] } @{$components{$msat}};
3185 my $start = "";
3186 my $stop = "";
3187 my @locations=();
3188 my @motifs=();
3189 my $genome;
3190 my $units;
3191 my $oldloc = 0;
3192 my $oldfp = 0;
3193 foreach my $comp (@reorder)
3194 {
3195 my @parts = split(/\./, $comp);
3196 my $loc = $parts[1];
3197 my $motif = $parts[2];
3198 my $unit = $parts[3];
3199 $genome = $parts[0];
3200 push (@locations, $loc);
3201 push (@motifs, $motif);
3202 $units += $unit;
3203 $units += int(($loc - ($oldloc + $oldfp))/length($motif)) if ($oldloc > 0);
3204 $oldloc = $loc;
3205 $oldfp = length($motif) * $unit;
3206 }
3207 # create the new msat name
3208 my $newname = $genome . "." . $locations[0] . "." . join("-", @motifs) . "." . $units;
3209 push (@joined, $newname);
3210 }
3211 }
3212 # return array of all msats in genome
3213 my @finalmsats = ();
3214 foreach my $msat (@noninterrupts) { push (@finalmsats, $msat); }
3215 foreach my $msat (@joined) { push (@finalmsats, $msat); }
3216 return \@finalmsats;
3217 }
3218
3219 ###############
3220 # help, help! #
3221 ###############
3222 sub HELP_MESSAGE
3223 {
3224 print $usage;
3225 exit;
3226 }
3227
3228 sub VERSION
3229 {
3230 print "Msatfinder version: $version\n";
3231 exit;
3232 }
3233
3234 ##################################################
3235 # a please wait thingy, thanks to: #
3236 # http://www.perlmonks.org/index.pl?node_id=4943 #
3237 ##################################################
3238 {
3239 package Whirley;
3240 sub new
3241 {
3242 my $type = shift;
3243 my $number = shift;
3244 my $class = ref $type || $type;
3245
3246 my $WHIRLEY_COUNT = -1;
3247 my @whirleyhash = (
3248 [qw/. .o .oO .oO(zzz) .oO(ZZZ) /],
3249 [qw(>--- ->-- -->- ---> ---* --- ---< --<- -<-- <--- *---)],
3250 [qw(- \ | / )],
3251 [qw(_o_ \o/ _O_ \O/)],
3252 [qw(/... ./.. ../. .../ ...# ... ...\ ..\. .\.. \... #...)],
3253 [ '(^_^ )','( ^_^)'],
3254 [ '(^_^ )','( ^_^)','( -_-)', '(-_- )' ],
3255 [map chr, qw/32 176 177 178 219 178 177 176/]
3256 );
3257 my @whirley = @{$whirleyhash[$number]};
3258
3259 my $self = sub
3260 {
3261 $WHIRLEY_COUNT = 0 if ++$WHIRLEY_COUNT == @whirley;
3262 return $whirley[$WHIRLEY_COUNT];
3263 };
3264 bless $self, $class;
3265 }
3266 }
3267 __END__