ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/msatfinder/msataligner
Revision: 1.2
Committed: Mon Jun 27 16:09:17 2005 UTC (10 years, 10 months ago) by knirirr
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +5 -3 lines
Log Message:
Small alterations to fasta file location lines.

Line File contents
1 #!/usr/bin/perl
2
3 #################################################################################
4 # SCRIPT NAME: msataligner #
5 # FUNCTION: Summarize blast reports, make alignments and trees #
6 # AUTHOR: Cared for by Milo Thurston (mith@ceh.ac.uk) #
7 # AUTHOR: Sections previously cared for by Dawn Field (dfield@ceh.ac.uk) #
8 # AUTHOR: Kerr Wall ported code to use SearchIO intead of Blast.pm #
9 #################################################################################
10
11 use strict;
12 use warnings;
13 use Bio::SeqIO;
14 use Bio::SearchIO;
15 use Bio::Tools::BPbl2seq;
16 use Bio::TreeIO;
17 use Config::Simple;
18 use Cwd;
19 use File::Copy;
20 use Getopt::Std;
21 use Msatminer;
22
23 # set up variables
24 my (%rem_name,%rem_genome,%rem_blast,%opts,%rem_msats,@repeats);
25 my ($report, $hit);
26
27 # usage message
28 my $usage = "
29 Usage:
30
31 -d delete existing files
32 -h list these options
33
34 Run without arguments to process files according
35 to settings in msatminer.rc. Or, provide the names
36 of the genome files you'd like to process as an argument.
37
38 ";
39
40 ################
41 # idendify cwd #
42 ################
43 my $cwd = getcwd;
44 my $install = $ENV{MSATMINER_HOME};
45 print "Current directory: $cwd\n";
46
47 ####################################
48 # import settings from config file #
49 ####################################
50 my $config = &Msatminer::getconfig($install,$cwd);
51
52 ############################
53 # various global variables #
54 ############################
55 my $align_dir = $config->{'COMMON.align_dir'};
56 my $debug = $config->{'COMMON.debug'};
57 my $bgcolourhit = $config->{'ALIGNER.bgcolourhit'};
58 my $bgcolournohit = $config->{'ALIGNER.bgcolournohit'};
59 my $bgcolourself = $config->{'ALIGNER.bgcolourself'};
60 my $bgcolourpartial = $config->{'ALIGNER.bgcolourpartial'};
61 my $bl2seq = $config->{'DEPENDENCIES.bl2seq'};
62 my $blastall = $config->{'DEPENDENCIES.blastall'};
63 my $blast_options = $config->{'DEPENDENCIES.blast_options'};
64 my $dnadist = $config->{'DEPENDENCIES.dnadist'};
65 my $e_value = $config->{'DEPENDENCIES.e_value'};
66 my $fasta_dir = $config->{'COMMON.fasta_dir'};
67 my $formatdb = $config->{'DEPENDENCIES.formatdb'};
68 my $flank_size = $config->{'COMMON.flank_size'};
69 my $genomeglob = $config->{'COMMON.genomeglob'};
70 my $headers = $config->{'ALIGNER.headers'};
71 my $hsp_size = $config->{'ALIGNER.hsp_size'};
72 my $key = $config->{'ALIGNER.key'};
73 my $msatglob = $config->{'COMMON.msatglob'};
74 my $mview = $config->{'DEPENDENCIES.mview'};
75 my $run_mview = $config->{'ALIGNER.run_mview'};
76 my $mview_options = $config->{'DEPENDENCIES.mview_options'};
77 my $neighbor = $config->{'DEPENDENCIES.neighbor'};
78 my $url = $config->{'COMMON.url'};
79 my $vsgenomes = $config->{'ALIGNER.vsgenomes'};
80 my $vsmsats = $config->{'ALIGNER.vsmsats'};
81 my $repeat_path = "$cwd/";
82 if (-d "$fasta_dir")
83 {
84 $repeat_path .= $fasta_dir;
85 }
86
87 # show a help message
88 getopts('hd',\%opts);
89 if ($opts{h})
90 {
91 print $usage;
92 exit;
93 }
94 # check for other options
95 if ($opts{d})
96 {
97 if (-d "$cwd/$align_dir")
98 {
99 print "Deleting previous files.";
100
101 unlink glob "$cwd/$align_dir*\.aln"; print ".";
102 unlink glob "$cwd/$align_dir*\.bln"; print ".";
103 unlink glob "$cwd/$align_dir*\.bln.html"; print ".";
104 unlink glob "$cwd/$align_dir*\.bl2s"; print ".";
105 unlink glob "$cwd/$align_dir*blastdb*"; print ".";
106 unlink glob "$cwd/$align_dir*\.clw"; print ".";
107 unlink glob "$cwd/$align_dir*_ct.html"; print ".";
108 unlink glob "$cwd/$align_dir*\.dnd"; print ".";
109 unlink glob "$cwd/$align_dir*\.hash"; print ".";
110 unlink glob "$cwd/$align_dir*\.f"; print ".";
111 unlink glob "$cwd/$align_dir*\.jal.html"; print ".";
112 unlink glob "$cwd/$align_dir*\.log"; print ".";
113 unlink glob "$cwd/$align_dir*\.nwk"; print ".";
114 unlink glob "$cwd/$align_dir*\.tfa"; print ".";
115 unlink glob "$cwd/$align_dir*\.pd"; print ".";
116 unlink glob "$cwd/$align_dir*\.phy"; print ".";
117 unlink "$cwd/$align_dir/outfile"; print ".";
118 unlink "$cwd/$align_dir/infile"; print ".";
119 unlink "$cwd/$align_dir/treefile"; print ".";
120
121 print "\n\nExiting.\n";
122 }
123 else
124 {
125 print "No files to delete.\n";
126 }
127 exit;
128 }
129
130 my %hitcounter = ();
131 my %hitsummary = ();
132
133 # determine whether required dependencies are present
134 my @deps = qw(dnadist neighbor bl2seq blastall formatdb);
135 my %find = ();
136 $find{mview} = $mview;
137 $find{dnadist} = $dnadist;
138 $find{neighbor} = $neighbor;
139 $find{bl2seq} = $bl2seq;
140 $find{blastall} = $blastall;
141 $find{formatdb} = $formatdb;
142
143 print "\nSearching for dependencies:\n";
144 if ($run_mview == 0)
145 {
146 print "N.B. mview is turned off - edit run_mview in the config file to activate it.\n";
147 }
148 else
149 {
150 push (@deps, "mview");
151 }
152 &Msatminer::depcheck(\%find,@deps);
153
154 # create required directory
155 unless (-e "$cwd/$align_dir") { mkdir "$cwd/$align_dir", 0755 or warn "Can't create $align_dir"; }
156
157 # check that the jar files are present
158 my $jarerror = "
159 You will need the following .jar files for this script to function:
160 ATVjapplet.jar jalview.jar swing.jar
161 Please make sure that they are all present in this
162 directory, and executable (chmod +x *jar).
163
164 You should be able to find them in $install/bin/, and
165
166 Exiting!
167 ";
168 my @jars = qw(ATVjapplet.jar jalview.jar swing.jar);
169 if ($vsgenomes == 1 or $vsmsats == 1)
170 {
171 if (-x "$cwd/$align_dir/ATVjapplet.jar" and -x "$cwd/$align_dir/jalview.jar" and -x "$cwd/$align_dir/swing.jar")
172 {
173 print "Required java files present and working.\n";
174 }
175 elsif (-e "$cwd/$align_dir/ATVjapplet.jar" and -e "$cwd/$align_dir/jalview.jar" and -e "$cwd/$align_dir/swing.jar")
176 {
177 print "Required java files present, but not executable. Attempting to fix...\n";
178 !`chmod +x @jars` or die "Can't make java files executable: $!\n $jarerror";
179 print "Success!\n";
180 }
181 elsif (-R "$install/bin/ATVjapplet.jar" and -R "$install/bin/jalview.jar" and -R "$install/bin/swing.jar")
182 {
183 print "Copying required java files to $cwd, and attempting to make executable...\n";
184 unless (copy ("$install/bin/ATVjapplet.jar", "$cwd/$align_dir") &&
185 copy ("$install/bin/jalview.jar", "$cwd/$align_dir") &&
186 copy ("$install/bin/swing.jar", "$cwd/$align_dir"))
187 {
188
189 print "Could not copy java files to $cwd/$align_dir: $!\n";
190 print $jarerror;
191 exit;
192 }
193 foreach (@jars)
194 {
195 !`chmod +x $align_dir$_` or die "Can't make java files executable: $!\n $jarerror";
196 }
197 print "Success!\n";
198
199 }
200 else
201 {
202 print "The required java files were not found anywhere...\n";
203 print $jarerror;
204 exit;
205 }
206 }
207
208 # check that the icons are present
209 unless (-e "icons")
210 {
211 if (-e "$install/etc/icons")
212 {
213 !system("cp -r $install/etc/icons $cwd/") or warn "Can't copy icons directory to $cwd : $!";
214 }
215 else
216 {
217 print "No icons directory was found. This isn't serious, but you may want to investigate.\n";
218 }
219 }
220
221 # check that the globs don't clash
222 if ($msatglob eq $genomeglob)
223 {
224 print <<EOF;
225 Apparently, your genome and microsatellite files have the
226 same suffixes. This will break things. Please change them,
227 and try again.
228
229 Exiting!
230 EOF
231 exit;
232 }
233
234
235 # dates, &c.
236 my $date = `date`;
237 chomp($date);
238
239 # enable njtree to work if being viewed in same directory
240 # and obtain a prefix for the blast database
241 my $treeurl;
242 my $append;
243 my $pwd = getcwd();
244 $append = `basename $pwd`; chomp($append);
245
246 unless ($url =~ /^http:/)
247 {
248 $treeurl = "file://$pwd/";
249 }
250 else
251 {
252 $treeurl = $url;
253 }
254
255 # other important variables
256 my @genomes;
257 if (@ARGV)
258 {
259 @genomes = @ARGV;
260 }
261 else
262 {
263 print "Globbing genomes with suffix $genomeglob from $pwd.\n";
264 @genomes = glob "*\.$genomeglob";
265 }
266 my $gendbname = $append . "_genome.blastdb";
267 my $msatdbname = $append . "_msat.blastdb";
268 my @newfiles = ();
269 my %savename = ();
270 my %genomenum = ();
271 my %format=(); # 1 = fasta, 0 = genbank
272 my $hashfile = lc($append) . ".hash";
273 my %genname=();
274
275 my $outputstring; # the html output
276 my $outputstring2; # the csv bl2seq summary output
277
278
279 # check that there are genomes present in this
280 # directory, just in case!
281 unless (@genomes)
282 {
283 print <<EOF;
284 You appear to have no genome files in this directory.
285 Please create symlinks (or copy) the complete genomes
286 to this directory and try again.
287
288 Exiting!
289
290 EOF
291 exit;
292 }
293
294
295
296 #################################################
297 # create shorter names for each msat #
298 # fasta file so that clustal does not break #
299 #################################################
300 my $genomecount = "00";
301 my $isfasta;
302
303 print "\nCommencing analysis. This may take some time...\n";
304
305 GENOME: foreach my $genome (@genomes)
306 {
307 $genomecount++;
308 # determine if genome is a fasta file,
309 # or genbank
310 my $check;
311 open (FCHECK, "<$genome") or warn "Can't check file type of $genome: $!";
312 $. = 0;
313 do { $check = <FCHECK> } until $. == 1 || eof;
314 if ($check =~ /^>/)
315 {
316 $isfasta = 1;
317 $format{$genome} = $isfasta;
318 $rem_blast{$genome} = $genome;
319 }
320 else
321 {
322 $isfasta = 0;
323 $format{$genome} = $isfasta;
324 my $fastafile = $genome;
325 $fastafile =~ s/\.$genomeglob/.tfa/g;
326 $fastafile = $align_dir . $fastafile;
327 if (-e $fastafile)
328 {
329 print "Genome $genome already converted to fasta format.\n";
330 $rem_blast{$genome} = $fastafile;
331 }
332 else
333 {
334 open (OUT, ">$align_dir$fastafile") or die "Can't open $fastafile: $!";
335 my $in = Bio::SeqIO->new(-file => "$genome",
336 -format => "genbank");
337 my $out = Bio::SeqIO->new(-fh => \*OUT,
338 -format => "Fasta");
339 while (my $seq = $in->next_seq())
340 {
341 $out->write_seq($seq);
342 }
343 $rem_blast{$genome} = $fastafile;
344 close OUT;
345 }
346 }
347 close FCHECK;
348
349 # get the species name from each genome,
350 # if it's a genbank file
351 if ($isfasta == 0)
352 {
353 my $name;
354 foreach my $genome (@genomes)
355 {
356 open (READ, "<$genome") or die "Can't open $genome: $!";
357 while (<READ>)
358 {
359 if ($_ =~ /SOURCE/)
360 {
361 $name = $_;
362 $name =~ s/SOURCE\s+//;
363 chomp($name);
364 $genname{$genome} = $name;
365 last;
366 }
367 }
368 }
369 }
370 else
371 {
372 $genname{$genome} = "NA";
373 }
374
375
376 # open the genome file and extract the unique identifier
377 # new object, depeding on format
378 my $filein;
379 if ($isfasta == 1) # fasta
380 {
381 $filein = Bio::SeqIO->new(-file => "$genome",
382 -format => 'Fasta');
383 }
384 else # genbank
385 {
386 $filein = Bio::SeqIO->new(-file => "$genome",
387 -format => 'genbank');
388 }
389
390 my $short_name;
391 while (my $seq = $filein->next_seq())
392 {
393 my $id = $seq->display_id();
394 $id =~ s/;//;
395 $id =~ s/\.$genomeglob//;
396
397 # create a short name to refer to
398 # this particular sequence
399 my @ids = split(/\|/, $id);
400 if ($id =~ /^gi/) { $short_name = $ids[4]; }
401 elsif ($id =~ /^gb/) { $short_name = $ids[2]; }
402 else { $short_name = $id; }
403 }
404
405 # create a hash linking short_name to genome
406 $rem_name{$short_name} = $genome;
407 $rem_genome{$genome} = $short_name;
408
409 # label the files for each genome as
410 # <genome_num>.<file_num>.f
411 $genomenum{$genome} = $genomecount;
412 my $filecount = "001";
413
414 ####################
415 # glob in msats... #
416 ####################
417 my @msatfiles = glob "$repeat_path$short_name*$msatglob";
418 # check that there are microsatellites present
419 # in this directory, just in case!
420 unless (@msatfiles)
421 {
422 print "No msats found for genome $genome.\n";
423 next GENOME;
424 }
425 foreach my $msatfile (@msatfiles) { push (@repeats, [split(/\//, $msatfile)]->[-1]); }
426
427 # number the msats found in each genome
428 foreach my $file (@msatfiles)
429 {
430 my $shortfile = [split(/\//, $file)]->[-1];
431
432 # create and save new filename
433 my $newfile = $genomecount . "." . $filecount . ".f";
434 $savename{$shortfile} = $newfile;
435 if (-s "$align_dir$newfile")
436 {
437 print "Filename $shortfile already shortened to $newfile\n";
438 }
439 else
440 {
441 print "Shortening $shortfile to $newfile\n";
442
443 # read contents of old file and copy to new
444 # in order to correct the file name
445 open (INFILE, "<$file") or die "Can't open $file: $!";
446 open (OUTFILE, ">>$align_dir$newfile") or die "Can't open $file: $!";
447 my @in = <INFILE>;
448 close INFILE;
449 foreach my $in (@in)
450 {
451 if ($in =~ /^>/)
452 {
453 chomp($in);
454 $in =~ s/$shortfile/$newfile/;
455 $in = $in . ", Orig. file: $file\n";
456 }
457 print OUTFILE $in;
458 }
459 close OUTFILE;
460 }
461
462 push(@newfiles, $newfile);
463 $filecount++;
464 }
465 }
466
467 print "\n-----------------------------------------------------------------\n";
468 # all msat files will be fasta format, so record
469 # this fact for the makedb function
470 foreach (@repeats)
471 {
472 $format{$_} = 1;
473 }
474 #################################################
475 # print out a hash showing which file is which. #
476 # This is saved only for future reference, #
477 #################################################
478 open (HASH, ">$align_dir$hashfile") or die "Can't open $hashfile: $!\n";
479 print HASH "orig_name,new_name\n";
480 foreach my $file (@repeats)
481 {
482 print HASH "$file,$savename{$file}\n";
483 }
484 close HASH;
485 !system("chmod 644 $align_dir$hashfile") or warn "Can't change permissions on $hashfile $!";
486
487 ##########################
488 # create blast databases #
489 ##########################
490 if ($vsmsats == 1)
491 {
492 &makedb($msatdbname,\@repeats);
493 }
494 if ($vsgenomes == 1)
495 {
496 &makedb($gendbname,\@genomes);
497 }
498
499 #############################################################
500 # this section does the following: #
501 # 1. bl2seq on each genome vs. each msat #
502 # 2. makes clustal/mview alignments for blasts, if selected #
503 # 3. combines 1. (and 2. if selected) in the output html #
504 #############################################################
505
506 print "\nGenomes: @genomes\n";
507 print "REPEATS: @repeats\n";
508 my $numreps = @repeats;
509 print "Total number of microsatellites: $numreps\n";
510 print "\nBeginning bl2seq comparison of all genomes and repeats.\n\n";
511
512 # get the first line of one of the repeat files
513 my $threshline;
514 open (THRESHCHECK, "<$repeat_path$repeats[0]") or warn "Can't read threshold details: $!";
515 $. = 0;
516 do { $threshline = <THRESHCHECK> } until $. == 1 || eof;
517 my @threshes = split(/,/, $threshline);
518 my $threshold;
519 foreach my $t (@threshes)
520 {
521 if ($t =~ /thresholds:/)
522 {
523 $threshold = $t;
524 $threshold =~ s/thresholds://;
525 }
526 }
527 close THRESHCHECK;
528
529 # open summary file
530 my $repmatrixfile = "repeat_matrix.html";
531 if (-s "$repmatrixfile")
532 {
533 rename "$repmatrixfile", "$repmatrixfile.bak" or die "Can't backup $repmatrixfile: $!";
534 }
535 open (OUT, ">>$repmatrixfile") or die "Can't open $repmatrixfile: $!";
536
537 # html headers
538 print OUT <<EOF;
539 <!doctype html public "-//w3c//dtd html 4.0 transitional//en">
540 <html>
541 <head>
542 <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
543 <title>msataligner output</title>
544 </head>
545 <body bgcolor="#FFFFFF">
546 EOF
547
548 ##############
549 # page title #
550 ##############
551 print OUT "\n<center><font color=\"#009900\"><font size=\"+3\"><a href=\"http://www.genomics.ceh.ac.uk/msatminer/\">msataligner</a> output</font></font></center>\n";
552
553 print OUT <<EOF;
554 <hr>
555 <p>Directory: $pwd
556 <p>Date: $date
557 <p>Thresholds: $threshold
558 EOF
559 if ($key == 1)
560 {
561 print OUT <<EOF;
562 <hr>
563 <h4>Colour coding for bl2seq reports:</h4>
564 <ul>
565 <li><font color="$bgcolourself"><b>S</b></font>: hit against same genome.
566 <li><font color="$bgcolourpartial"><b>P</b></font>: &ldquo;partial&rdquo; hit - hsp less than half flank size plus footprint.
567 <li><font color="$bgcolourhit"><b>H</b></font>: A hit against a non-self genome.
568 <li><font color="$bgcolournohit"><b>N</b></font>: No hit.
569 </ul>
570 <hr>
571 <h4>Colour coding for blast reports:</h4>
572 <p>Genome blast reports (and alignments) are coloured <font color="$bgcolourhit">thus</font> if the hit is to all genomes, <font color="$bgcolourpartial">thus</font> if to less than the full number but more than one, and <font color="$bgcolourself">thus</font> if only one hit was observed. The number of hits will be shown, with an &ldquo;S&rdquo; for self and the addition of an &ldquo;(ALL)&rdquo; if all genomes are hit.
573 <p>Microsatellite blast reports are coded only as <font color="$bgcolourhit">hits</font> or <font color="$bgcolournohit">non-hits</font>, and the blast column shows total hits, hits against self (never > 1), hits to the flanking regions of msats in the same genome (&ldquo;close&rdquo;), and hits that don't fall into these categories.
574 <hr>
575 EOF
576 }
577
578
579
580 # main body of page
581 print OUT "<center>\n<p><table border=\"1\" cellpadding=\"1\" cellspacing=\"1\">\n<tr valign = center>\n";
582
583 # print header line of table
584 unless ($headers == 1)
585 {
586 print OUT "<td align=\"center\"><img src=\"icons/msat.png\" alt=\"microsatellite locus\"></td>\n";
587 foreach my $genome (@genomes)
588 {
589 print OUT "<td align=\"center\"><center><a href=\"$genome\">$genome</a><br>$genname{$genome}<br>($genomenum{$genome})</a></center></td>\n";
590 }
591 &printheaders(\*OUT);
592 print OUT "</tr>\n";
593
594 }
595
596 # create the second summary file
597 my $repcsvfile = "repeat_matrix.csv";
598 if (-s "$repcsvfile")
599 {
600 rename "$repcsvfile", "$repcsvfile.bak" or die "Can't backup $repcsvfile: $!";
601 }
602 open (OUT2, ">>$repcsvfile") or die "Can't open $repcsvfile: $!";
603 foreach my $genome (@genomes)
604 {
605 print OUT2 ",$genomenum{$genome}";
606 }
607 print OUT2 "\n";
608
609 #####################################################
610 # runs the "repeats against genomes" bl2seq jobs, #
611 # won't run a blast if it detects an an output file #
612 #####################################################
613 print "Writing output file.\n";
614 my $oldrepeat = "x";
615 foreach my $repeat (@repeats)
616 {
617 # if the repeat name does not have a prefix, then
618 my $gname = [split(/\./, $repeat)]->[0];
619
620 # print the headers again for each genome type,
621 # if $headers is set &c.
622 if ($headers == 1 and $gname ne $oldrepeat)
623 {
624 # print header line of table
625 print OUT "<td align=\"center\"><img src=\"icons/msat.png\" alt=\"microsatellite locus\"></td>\n";
626 foreach my $genome (@genomes)
627 {
628 print OUT "<td align=\"center\"><center><a href=\"$genome\">$genome</a><br>$genname{$genome}<br>($genomenum{$genome})</a></center></td>\n";
629 }
630 print OUT "<td align=\"center\">repeat</td>";
631 &printheaders(\*OUT);
632 }
633
634 # create the link back to the
635 # msat fasta file
636 my $click = $repeat;
637 $click =~ s/\.$msatglob//;
638 my $key = $savename{$repeat};
639 $key =~ s/\.f$//g;
640
641 # prepare summary file
642 print OUT "\n<tr valign = center>\n";
643 my $tempname = $rem_name{$gname};
644 $outputstring = "<td align=\"center\"><a href=\"$repeat_path$repeat\">$click,</a> $genname{$tempname} ($key) </td>";
645 $outputstring2 = "$key";
646
647 ##########################
648 # produce bl2seq reports #
649 ##########################
650 foreach my $genome (@genomes)
651 {
652 # output blast data to file
653 # this set of substitutions makes sure
654 # that the blast report is correctly named
655 my $firstname = $genome;
656 $firstname =~ s/\.$genomeglob/_vs_/;
657 my $bl2s_out = $firstname . $repeat;
658 $bl2s_out =~ s/\.$msatglob/.bl2s/;
659
660 # run bl2seq jobs
661 # set option to -T T for html, but will break blast parser
662 if (-s "$align_dir$bl2s_out")
663 {
664 print "Bl2seq output $bl2s_out already exists.\n";
665 }
666 else
667 {
668 open (HITS_FILT, ">$align_dir$bl2s_out") or die "Can't open $bl2s_out: $!";
669 print "Blasting genome $genome against repeat $repeat\n";
670 my @output_bl2seq = `$find{bl2seq} -e $e_value -T F -F F -p blastn -i $rem_blast{$genome} -j $repeat_path$repeat`;
671 print "RBG: $rem_blast{$genome}\n" if ($debug == 3);
672 foreach my $output (@output_bl2seq)
673 {
674 # for some reason, bl2seq generates reports with an
675 # empty query field. So, add the correct entry
676 # whilst printing the lines to file
677 if ($output =~ /^Query=/)
678 {
679 $output = "Query = $genome\n";
680 }
681 print HITS_FILT "$output";
682 }
683 }
684 close HITS_FILT;
685
686
687 ########################################
688 # determine whether there are any hits #
689 ########################################
690 my $conserved = 0;
691 my $report = Bio::Tools::BPbl2seq->new(-file => "$align_dir$bl2s_out",
692 -report_type => 'blastn',
693 -queryname => "$genome");
694
695 # length of the repeat is needed; get this from the full fasta filename
696 my @repnames = split(/\./, $repeat);
697 my $r1 = length($repnames[2]);
698 my $r2 = $repnames[3];
699 my $replen = $r1 * $r2;
700
701 if ($debug == 1)
702 {
703 print "R1: $r1\n";
704 print "R2: $r2\n";
705 print "RL: $replen\n";
706 }
707
708 # NB: this bioperl module falls over if there were no hits at
709 # all. Therefore, test first that there actually is a subject,
710 # and don't attempt to run the next_feature() method if this
711 # is the case.
712 my $bgcolour;
713 my $subject = $report->sbjctName;
714 if ($subject)
715 {
716 # determine name of subject
717 # and query, to check if hit
718 # is to own genome
719 $subject =~ /([^,]+), Name:/;
720 my $s = [split(/\./, $1)]->[0];
721 my $g = $rem_genome{$genome};
722
723 # print an S if the hit is to self,
724 if ($g =~ /$s/)
725 {
726 $conserved = "S";
727 $bgcolour = $bgcolourself;
728 }
729 else
730 {
731 # if not a hit to self, determine length
732 # of match, and e value
733 while (my $hsp = $report->next_feature)
734 {
735 if ($debug == 1)
736 {
737 # notes on what these methods do
738 print "score: " . $hsp->score . "\n"; # Score = 78.1 bits (36) - the 36
739 print "bits: " . $hsp->bits . "\n"; # Score = 78.1 bits (36) - the 78.1
740 print "percent: " . $hsp->percent . "\n"; # Identities = 42/44 (95%) - the percent
741 print "expect: " . $hsp->P . "\n"; # Expect = 4e-16
742 print "identity: " . $hsp->match . "\n"; # Identities = 42/44 - the first value
743 print "length: " . $hsp->length . "\n"; # Identities = 42/44 - the second value
744 }
745
746 # test on these values to determine the likelyhood of
747 # the highest scoring hit being an actual conserved msat
748 my $e = $hsp->P();
749 my $identities = $hsp->length();
750
751 # see if the values selected above are within
752 # appropriate parameters for a "hit"
753 if ($e > 0.0001 or $identities < ($hsp_size + $replen))
754 {
755 $conserved = "P";
756 unless (-s "$align_dir$bl2s_out"){ print "No hits for $repeat on $genome\n"; }
757 $bgcolour = $bgcolourpartial;
758 last;
759 }
760 else
761 {
762 $conserved = "H";
763 $bgcolour = $bgcolourhit;
764 last;
765 }
766 }
767 }
768 $outputstring2 = $outputstring2 . ",$conserved";
769 }
770 else
771 {
772 $conserved = "N";
773 $bgcolour = $bgcolournohit;
774 unless (-s "$align_dir$bl2s_out"){ print "No hits for $repeat on $genome\n"; }
775 $outputstring2 = $outputstring2 . ",";
776 }
777 $outputstring = $outputstring . "<td align=\"center\" bgcolor=\"$bgcolour\"><a href=\"$align_dir$bl2s_out\">$conserved</a></td>";
778 }
779
780 if ($headers == 1)
781 {
782 $outputstring = $outputstring . "<td align=\"center\"><a href=\"$repeat_path$repeat\">$click,</a> $genname{$tempname} ($key)</td>";
783 }
784
785 ###################################
786 # run blasts vs. genomes/msats if #
787 # so required #
788 ###################################
789 my $msatreport = "m.$savename{$repeat}";
790 my $genreport = "g.$savename{$repeat}";
791 $msatreport =~ s/\.f/\.bln/;
792 $genreport =~ s/\.f/\.bln/;
793
794 # blasts vs. genomes
795 if ($vsgenomes == 1)
796 {
797 if (-s "$align_dir$genreport")
798 {
799 print "Genome blast job for $repeat already complete.\n";
800 }
801 else
802 {
803 print "Blasting $repeat against genomes.\n";
804 !system("$find{blastall} -p blastn -d $align_dir$gendbname -i $align_dir$savename{$repeat} -o $align_dir$genreport -e $e_value $blast_options") or die "Can't run blast! $!";
805 }
806
807 # get mview alignment
808 &runmview("$align_dir$genreport") if ($run_mview == 1);
809
810 # get hits
811 my $blasthits = &gethits("$align_dir$genreport");
812
813 # How many hits?
814 my $numhits = @$blasthits;
815 $hitcounter{$genreport} = $numhits;
816 print "Num. hits: $numhits\n";
817
818 # run clustal
819 if ($numhits > 1)
820 {
821 &clustal($blasthits,"$align_dir$genreport","g")
822 }
823 $outputstring .= &linkfiles("g",$genreport);
824
825 }
826
827 # blasts vs msats
828 if ($vsmsats == 1)
829 {
830 if (-s "$align_dir$msatreport")
831 {
832 print "Microsatellite blast job for $repeat already complete.\n";
833 }
834 else
835 {
836 print "Blasting $repeat against microsatellites.\n";
837 system("$find{blastall} -p blastn -d $align_dir$msatdbname -i $align_dir$savename{$repeat} -o $align_dir$msatreport -e $e_value $blast_options");
838 }
839
840 # get mview alignment
841 &runmview("$align_dir$msatreport") if ($run_mview == 1);
842
843 # get hits
844 my $blasthits = &gethits("$align_dir$msatreport");
845
846 # print out hits against msats
847 # from other genomes
848 my @samehits=();
849 my $oneself = 0;
850 my $otherhit = 0;
851
852 foreach my $hit (@$blasthits)
853 {
854 # get the "genome" part
855 # of the name
856 my $nrep = [split (/\./, $repeat)]->[0];
857 my $nhit = [split (/\./, $hit)]->[0];
858
859 # save hits to own genome
860 if ($savename{$hit} eq $savename{$repeat})
861 {
862 # hit onself
863 $oneself++;
864 }
865 elsif ($nhit eq $nrep)
866 {
867 # hit elsewhere within same genome
868 push (@samehits, $hit);
869 }
870 else
871 {
872 # hit another genome
873 $otherhit++;
874 }
875 }
876
877 # set up the postition of the repeats
878 &Msatminer::zap;
879 my ($start,$end) = &getstartend($savename{$repeat});
880 &Msatminer::addregion(($start-$flank_size),($end+$flank_size));
881
882 # find msats from the same genome
883 # but within each others flanks
884 my %closematch=();
885 foreach my $samehit (@samehits)
886 {
887 my ($start,$end) = &getstartend($samehit);
888 $closematch{$samehit}{START} = $start;
889 $closematch{$samehit}{END} = $end;
890 }
891
892 # count total number of hits that are not due to
893 # msats within each other's flanks in the same genome
894 my $closehit = 0;
895 my $same_genome = 0;
896 foreach my $hit (keys %closematch)
897 {
898 my $check1 = &Msatminer::isitin(($closematch{$hit}{START}-$flank_size));
899 my $check2 = &Msatminer::isitin(($closematch{$hit}{END}+$flank_size));
900
901 if ($check1 == 1 or $check2 == 1) { $closehit++; }
902 else { $same_genome++; }
903 }
904
905 # create a final header for other sorts of hit
906 my $numhits = @$blasthits;
907 $hitcounter{$msatreport} = $numhits;
908
909 # here's the line that goes in the final table
910 $hitsummary{$savename{$repeat}} = join " / ", $numhits, $closehit, $same_genome, $otherhit;
911
912 # produce clustal alignments
913 if ($numhits > 1)
914 {
915 &clustal($blasthits,$msatreport,"m");
916 }
917 $outputstring .= &linkfiles("m",$msatreport);
918
919
920 print "\n-----------------------------------------------------------------\n";
921
922 }
923
924 print OUT $outputstring . "</tr>\n";
925 print OUT2 $outputstring2 . "\n";
926
927 $oldrepeat = $gname;
928 }
929
930 # finish up the table
931 print "Msataligner finished!\n";
932 print OUT "</table>\n</body>\n</html>\n";
933 close OUT;
934 close OUT2;
935
936
937 #############
938 #############
939 # FUNCTIONS #
940 #############
941 #############
942
943 ####################################
944 # print headers in the output file #
945 ####################################
946 sub printheaders
947 {
948 my $outfile = shift;
949 if ($vsgenomes == 1)
950 {
951 print $outfile <<EOF;
952 <td align=\"center\">blast (genomes)</td>
953 <td align=\"center\"><img src=\"icons/clustal.png\" alt=\"clustal\"></td>
954 <td align=\"center\"><img src=\"icons/mview.png\" alt=\"mview\"></td>
955 <td align=\"center\"><img src=\"icons/jalview.png\" alt=\"jalview\"></td>
956 <td align=\"center\"><img src=\"icons/njtree.png\" alt=\"njtree file\"></td>
957 <td align=\"center\"><img src=\"icons/njtree.png\" alt=\"njtree in ATV\"></td>
958 EOF
959
960 print $outfile "</tr>\n" if ($vsmsats == 0);
961 }
962 if ($vsmsats == 1)
963 {
964 print $outfile <<EOF;
965 <td align=\"center\">blast (msats):<br> total/close/<br>same_genome/<br>other_genome</td>
966 <td align=\"center\"><img src=\"icons/clustal.png\" alt=\"clustal\"></td>
967 <td align=\"center\"><img src=\"icons/mview.png\" alt=\"mview\"></td>
968 <td align=\"center\"><img src=\"icons/jalview.png\" alt=\"jalview\"></td>
969 <td align=\"center\"><img src=\"icons/njtree.png\" alt=\"njtree file\"></td>
970 <td align=\"center\"><img src=\"icons/njtree.png\" alt=\"njtree in ATV\"></td>
971 EOF
972 }
973 print $outfile "</tr>\n";
974 }
975
976 #######################################
977 # open repeat fasta files and get the #
978 # start and end positions of the msat #
979 #######################################
980 sub getstartend
981 {
982 my $file = shift;
983 my ($start,$end);
984
985 # open the fasta file for each hit, and get
986 # the start and the stop for each hit.
987 if ($file =~ /\.f$/)
988 {
989 open (CHECK, "<$align_dir$file") or die "Can't open $file for checking: $!";
990 }
991 else
992 {
993 #open (CHECK, "<$repeat_path$file") or die "Can't open $file for checking: $!";
994 open (CHECK, "<Fasta/$file.fasta") or die "Can't open $file for checking: $!";
995 }
996 $. = 0; my $check;
997 do { $check = <CHECK> } until $. == 1 || eof;
998 close CHECK;
999 my @splits = split(/,/, $check);
1000 foreach my $split (@splits)
1001 {
1002 if ($split =~ /start/)
1003 {
1004 $start = $split;
1005 $start =~ s/start://;
1006 $start =~ s/\s//g;
1007 }
1008 elsif ($split =~ /end/)
1009 {
1010 $end = $split;
1011 $end =~ s/end://;
1012 $end =~ s/\s//g;
1013 }
1014 }
1015 return ($start,$end);
1016 }
1017
1018 ############################################
1019 # create the blast database of all genomes #
1020 # if genomes are not fasta, make them so! #
1021 ############################################
1022 sub makedb
1023 {
1024 my $dbname= shift;
1025 my $files = shift;
1026 if (-s "$align_dir$dbname")
1027 {
1028 print "$dbname blast database already exists.\n";
1029 }
1030 else
1031 {
1032 system ("touch $align_dir$dbname");
1033 open (OUTDB, ">>$align_dir$dbname") or die "Can't write to $dbname: $!";
1034 foreach my $file (@$files)
1035 {
1036 my $name = $file;
1037 if ($format{$name} == 1)
1038 {
1039 open (INGEN, "<Fasta/$file") or die "Can't read $file: $!";
1040 while (<INGEN>) { print OUTDB $_; }
1041 close INGEN;
1042 }
1043 else
1044 {
1045 my $in = Bio::SeqIO->new(-file => "$file",
1046 -format => "genbank");
1047 my $out = Bio::SeqIO->new(-fh => \*OUTDB,
1048 -format => "Fasta");
1049
1050 while (my $seq = $in->next_seq())
1051 {
1052 $out->write_seq($seq);
1053 }
1054 }
1055 }
1056 close OUTDB;
1057 !system("$find{formatdb} -i $align_dir$dbname -o F -p F") or die "Can't run formatdb on $align_dir$dbname $!";
1058 }
1059 }
1060
1061 ##############################
1062 # run mview on blast results #
1063 ##############################
1064 sub runmview
1065 {
1066 my $report = shift;
1067 if (-s "$report.html")
1068 {
1069 print "Mview formatting of $report complete.\n";
1070 }
1071 else
1072 {
1073 system("$find{mview} $mview_options $report > $report.html");
1074 }
1075 }
1076
1077 ###############################################
1078 # perform clustal alignments on blast reports #
1079 ###############################################
1080 sub clustal
1081 {
1082 my $hits = shift;
1083 my $report = shift;
1084 my $identifier = shift;
1085 my $numhits = $hitcounter{$report};
1086 my $repeat = $report;
1087 $repeat =~ s/\.bln/\.f/;
1088
1089
1090 # fasta file containing all sequences against which
1091 # hits were scored. Used as input for clustalw
1092 # bout = "blast out".
1093 my $boutfile = $report;
1094
1095 # make fasta files from blast reports
1096 #$boutfile = "$identifier.$report";
1097 $boutfile = $report;
1098 $boutfile =~ s/\.bln/\.clw/;
1099 unless (-e "$align_dir$boutfile")
1100 {
1101 open (BOUT, ">$align_dir$boutfile") or die "Can't open $boutfile: $!";
1102 if ($identifier =~ /m/) # print the whole seq of the hits to $boutfile
1103 {
1104 foreach my $hit (@$hits)
1105 {
1106 my $file;
1107 $file = "$savename{$hit}";
1108 open (IN, "<$align_dir$file") or die "Can't read $file: $!";
1109 while (<IN>) { print BOUT $_; }
1110 close IN;
1111 }
1112 }
1113 elsif ($identifier =~ /g/) # print the matching parts to $boutfile
1114 {
1115 my $blast_report = new Bio::SearchIO(-format => 'blast',
1116 -file => "$align_dir$report" );
1117
1118 my $result = $blast_report->next_result;
1119 my $query_name = $result->query_name();
1120 while (my $hit = $result->next_hit())
1121 {
1122 my $hit_name = $hit->name();
1123 $hit_name =~ s/,//;
1124 my $expect = $hit->significance();
1125 my $first_value = substr( $expect, 0, 1 );
1126 if ( $first_value eq "e" )
1127 {
1128 $expect = "1" . $expect;
1129 }
1130
1131 if ($expect < $e_value)
1132 {
1133 # extract hit regions and save to file, in order to run clustal
1134 while( my $hsp = $hit->next_hsp )
1135 {
1136 my $start = $hsp->query->start();
1137 my $end = $hsp->query->end();
1138 my $string = $hsp->query_string();
1139 print BOUT ">" . $hit->name() . ".$start start: $start, end: $end\n";
1140 print BOUT "$string\n";
1141 }
1142 }
1143 }
1144 }
1145 close BOUT;
1146 }
1147 else
1148 {
1149 $boutfile = "supercalifragilisticexpialidocious";
1150 }
1151
1152
1153
1154 # run these fasta files through clustal
1155 if (-e "$align_dir$boutfile")
1156 {
1157 my $clustal_aln_file = $report;
1158 $clustal_aln_file =~ s/\.bln/\.aln/;
1159 my $clustal_phy_file = $report;
1160 $clustal_phy_file =~ s/\.bln/\.phy/;
1161
1162 if ($debug == 1)
1163 {
1164 print "CAF: $clustal_aln_file\n";
1165 print "CPF: $clustal_phy_file\n";
1166 }
1167
1168 #################################
1169 # clustal output, phylip output #
1170 #################################
1171 if (-e "$align_dir$clustal_phy_file" and -e "$align_dir$clustal_aln_file")
1172 {
1173 print "Clustal already run on $report\n";
1174 }
1175 else
1176 {
1177 # will put .aln on end of infile
1178 system ("clustalw -INFILE=$align_dir$boutfile -OUTFILE=$align_dir$clustal_aln_file");
1179 # will put .phy on end of infile
1180 system ("clustalw -INFILE=$align_dir$boutfile -OUTFILE=$align_dir$clustal_phy_file -OUTPUT=phylip -maxdiv=1");
1181
1182 if (-e "$align_dir$clustal_phy_file" and $numhits > 2)
1183 {
1184 # copy phylip formatted alignment to infile for dnadist
1185 copy("$align_dir$clustal_phy_file","infile");
1186
1187 # remove outfile or phylip will ask whether or not to overwrite it
1188 if (-e "outfile") { unlink "outfile"; }
1189
1190 # if an infile already exists
1191 open( DNADIST, "|$find{dnadist}" ) || die "Error opening dnadist";
1192 print DNADIST "y\n";
1193 close DNADIST;
1194
1195 my $dnadist_file = "$report";
1196 $dnadist_file =~ s/\.bln/\.pd/;
1197 copy("outfile","$align_dir$dnadist_file");
1198
1199 # make a new infile for neighbor
1200 move("outfile","infile");
1201
1202 # run "neighbor"
1203 open( NEIGHBOR, "|$find{neighbor}" ) || die "Error opening neighbor";
1204 print NEIGHBOR "y\n";
1205 close NEIGHBOR;
1206
1207 my $tree_file = $report;
1208 my $tree_file_atv = $report;
1209 $tree_file =~ s/\.bln/\.nwk/;
1210 $tree_file_atv=~ s/\.bln/\.dnd/;
1211
1212 my $call_tree = $report;
1213 $call_tree =~ s/\.bln/\_ct.html/;
1214
1215 move("outfile","$align_dir$tree_file") or warn "Can't create $tree_file\n";
1216
1217 # a separate file is needed to call the tree
1218 open (CT, ">$align_dir$call_tree") or die "Can't open $call_tree: $!";
1219 print CT <<EOF;
1220 <html>
1221 <head>
1222 <title>
1223 ATVJapplet test
1224 </title>
1225
1226 <body>
1227
1228 <h1>
1229 ATV view of $report
1230 </h1>
1231
1232 <p>
1233 <APPLET ARCHIVE = "./swingjar, ./ATVjapplet.jar" CODE = "forester.atv.ATVjapplet.class" WIDTH = 200 HEIGHT = 50>
1234 <PARAM NAME = url_of_tree_to_load VALUE = "$treeurl$align_dir$tree_file_atv" >
1235 </APPLET>
1236
1237 </form>
1238
1239 </p>
1240 </body>
1241 </html>
1242 EOF
1243 }
1244 }
1245 }
1246 }
1247
1248 ####################################
1249 # create links to associated files #
1250 ####################################
1251 sub linkfiles
1252 {
1253 my $identifier = shift;
1254 my $newlink = shift;
1255
1256 my $numhits = $hitcounter{$newlink};
1257 my $blast_file = $newlink;
1258
1259 $newlink =~ s/\.bln//;
1260 my $mview_file = "$blast_file" . ".html";
1261 my $clustal_file = "$newlink.aln";
1262 my $njtree_file = "$newlink.nwk";
1263 my $njtree_java = "$newlink" . "_ct.html";
1264 my $jalview_file = "$newlink.jal.html";
1265
1266 # genome blast reports reports
1267 my $blast_link;
1268 my $blasthitcolour;
1269 if ($identifier =~ /g/)
1270 {
1271 my $numgen = @genomes;
1272 if ($hitcounter{$blast_file} == $numgen)
1273 {
1274 $blasthitcolour = "$bgcolourhit";
1275 $blast_link = "<td align=\"center\" bgcolor=\"$blasthitcolour\"><a href=\"$align_dir$blast_file\">$hitcounter{$blast_file} (ALL)</a></td>";
1276 }
1277 elsif ($hitcounter{$blast_file} == 1)
1278 {
1279 $blasthitcolour = $bgcolourself;
1280 $blast_link = "<td align=\"center\" bgcolor=\"$blasthitcolour\"><a href=\"$align_dir$blast_file\">S</a></td>";
1281 }
1282 else
1283 {
1284 $blasthitcolour = $bgcolourpartial;
1285 $blast_link = "<td align=\"center\" bgcolor=\"$blasthitcolour\"><a href=\"$align_dir$blast_file\">$hitcounter{$blast_file}</a></td>";
1286 }
1287 }
1288
1289 # msat blast reports reports
1290 elsif ($identifier =~ /m/)
1291 {
1292 my $name = "$newlink.f";
1293 $name =~ s/^m\.//g;
1294 if (-e "$align_dir$blast_file")
1295 {
1296 $blasthitcolour = $bgcolourhit;
1297 $blast_link = "<td align=\"center\" bgcolor=\"$blasthitcolour\"><a href=\"$align_dir$blast_file\">$hitsummary{$name}</a></td>";
1298 }
1299 else
1300 {
1301 $blasthitcolour = $bgcolournohit;
1302 $blast_link = "<td align=\"center\" bgcolor=\"$blasthitcolour\"><a href=\"$align_dir$blast_file\">0</a></td>";
1303 }
1304 }
1305
1306 # clustal alignments
1307 my $clustal_link;
1308 my $jalview_link;
1309 if (-e "$align_dir$clustal_file")
1310 {
1311 $clustal_link = "<td align=\"center\" bgcolor=\"$bgcolourhit\"><a href=\"$align_dir$clustal_file\">$numhits</a></td>";
1312 $jalview_link = "<td align=\"center\" bgcolor=\"$bgcolourhit\"><a href=\"$align_dir$jalview_file\">$numhits</a></td>";
1313 if (-e "$align_dir$jalview_file" )
1314 {
1315 print "Jalview alignment ($jalview_file) already exists.\n";
1316 }
1317 else
1318 {
1319 open (JAL, ">$align_dir$jalview_file") or die "Can't open $jalview_file: $!";
1320 print JAL <<EOF;
1321 <html>
1322 <head>
1323 <title>Jalview: $jalview_file</title>
1324 </head>
1325 <body>
1326 <h3>Jalview: $jalview_file</title>
1327 <p><a href="../repeat_matrix.html">back</a>.
1328 <br>
1329 <APPLET archive="./jalview.jar" CODE="jalview.ButtonAlignApplet.class" WIDTH=80 HEIGHT=35>
1330 <PARAM NAME="input" VALUE="$treeurl$align_dir$clustal_file">
1331 <PARAM NAME="type" value="URL">
1332 <PARAM NAME="format" value="CLUSTAL">
1333 <PARAM NAME="mailServer" value="localhost">
1334 </APPLET>
1335 </body>
1336 <html>
1337 EOF
1338 close JAL;
1339 }
1340 }
1341 else
1342 {
1343 $clustal_link = "<td align=\"center\" bgcolor=\"$bgcolournohit\">0</td>";
1344 $jalview_link = "<td align=\"center\" bgcolor=\"$bgcolournohit\">0</td>";
1345 }
1346
1347 # mview blast summaries
1348 my $mview_link;
1349 if (-e "$align_dir$mview_file")
1350 {
1351 $mview_link = "<td align=\"center\" bgcolor=\"$blasthitcolour\"><a href=\"$align_dir$mview_file\">$numhits</a></td>";
1352 }
1353 else
1354 {
1355 $mview_link = "<td align=\"center\" bgcolor=\"$bgcolournohit\"><a href=\"$align_dir$mview_file\">0</a></td>";
1356 }
1357
1358 # njtree files
1359 my $njtree_link;
1360 my $njtree_atv_link;
1361 if (-s "$align_dir$njtree_file")
1362 {
1363 $njtree_link = "<td align=\"center\" bgcolor=\"$blasthitcolour\"><a href=\"$align_dir$njtree_file\">$numhits</a></td>";
1364 $njtree_atv_link = "<td align=\"center\" bgcolor=\"$blasthitcolour\"><a href=\"$treeurl$align_dir$njtree_java\">$numhits</a></td>";
1365 print "LINK: $njtree_atv_link\n" if ($debug == 1);
1366 }
1367 else
1368 {
1369 $njtree_link = "<td align=\"center\" bgcolor=\"$bgcolournohit\">0</td>";
1370 $njtree_atv_link = "<td align=\"center\" bgcolor=\"$bgcolournohit\">0</td>";
1371 }
1372
1373 # print the final output file
1374 return "$blast_link $clustal_link $mview_link $jalview_link $njtree_link $njtree_atv_link\n";
1375 }
1376
1377 ###########################################
1378 # count the number of hits to each genome #
1379 ###########################################
1380 sub gethits
1381 {
1382 my $report = shift;
1383 my @hits=();
1384
1385 my $blast_report = new Bio::SearchIO(-format => 'blast',
1386 -file => "$report" );
1387
1388 my $result = $blast_report->next_result;
1389
1390 my $query_name = $result->query_name();
1391
1392 while (my $hit = $result->next_hit())
1393 {
1394 my $hit_name = $hit->name();
1395 $hit_name =~ s/,//;
1396
1397 my $expect = $hit->significance();
1398 my $first_value = substr( $expect, 0, 1 );
1399 if ( $first_value eq "e" )
1400 {
1401 $expect = "1" . $expect;
1402 }
1403
1404 # keep only hits above a certain threshold
1405 if ( $expect < $e_value)
1406 {
1407 push (@hits, $hit_name);
1408 }
1409 }
1410 return \@hits;
1411 }
1412
1413 ###############
1414 # help, help! #
1415 ###############
1416 sub HELP_MESSAGE
1417 {
1418 print $usage;
1419 }
1420
1421
1422 __END__