ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/msatfinder/msatannotator
Revision: 1.1.1.1 (vendor branch)
Committed: Mon Mar 7 15:34:44 2005 UTC (11 years, 4 months ago) by knirirr
Branch: MAIN
CVS Tags: HEAD, HEAD
Changes since 1.1: +0 -0 lines
Log Message:
First import

Line File contents
1 #!/usr/bin/perl
2
3 #########################################################
4 # SCRIPT NAME: msatannotator #
5 # FUNCTION: compare the conservation of microsatellites #
6 # in two genomes only, and produce a summary #
7 #########################################################
8
9 use strict;
10 use warnings;
11 use Bio::Tools::BPbl2seq;
12 use Bio::SeqIO;
13 use Config::Simple;
14 use File::Copy;
15 use Getopt::Std;
16 use Term::ANSIColor;
17 use Term::ReadLine;
18 use Cwd;
19 use Msatminer;
20
21 # usage message
22 my $usage = "
23 Usage:
24
25 msatannotator [options]
26 msatannotator file1 file2
27
28 -d delete existing blast reports
29 -h list these options
30
31 You must choose two files to annotate.
32 ";
33
34 ################
35 # idendify cwd #
36 ################
37 my $cwd = getcwd;
38 my $install = $ENV{MSATMINER_HOME};
39 print "Current directory: $cwd\n";
40
41
42 ####################################
43 # import settings from config file #
44 ####################################
45 my $config = &Msatminer::getconfig($install,$cwd);
46
47 ############################
48 # various global variables #
49 ############################
50 my $genomeglob = $config->{'COMMON.genomeglob'};
51 my $msatglob = $config->{'COMMON.msatglob'};
52 my $bl2seq = $config->{'DEPENDENCIES.bl2seq'};
53 my $bl2seq_options = $config->{'DEPENDENCIES.bl2seq_options'};
54 my $blast_type = $config->{'DEPENDENCIES.blast_type'};
55 my $e_value = $config->{'DEPENDENCIES.e_value'};
56 my $fasta_dir = $config->{'COMMON.fasta_dir'};
57 my $prefix = $config->{'ANNOTATOR.prefix'};
58 my $dbname = $config->{'ANNOTATOR.dbname'};
59 my $np_degree = $config->{'ANNOTATOR.np_degree'};
60 my $p_degree = $config->{'ANNOTATOR.p_degree'};
61 my $nc_degree = $config->{'ANNOTATOR.nc_degree'};
62 my $f1_degree = $config->{'ANNOTATOR.f1_degree'};
63 my $f2_degree = $config->{'ANNOTATOR.f2_degree'};
64 my $expand = $config->{'ANNOTATOR.expand'};
65 my $smatch = $config->{'ANNOTATOR.smatch'};
66 my $multiplier = $config->{'ANNOTATOR.multiplier'};
67 my $save = $config->{'ANNOTATOR.save'};
68 my $repeat_path = "$cwd/";
69 if (-d "$fasta_dir")
70 {
71 $repeat_path .= $fasta_dir;
72 }
73
74 # create annotations directory
75 my $annodir = $config->{'COMMON.anno_dir'};
76 $annodir =~ s/\/$//;
77 unless (-d $annodir)
78 {
79 mkdir "$annodir", 0755 or warn "Can't create $annodir: $!";
80 }
81
82 # set the pager
83 my $pager = `which less`;
84 chomp($pager);
85
86 # check if all previous output files are to be deleted
87 my %opts;
88 getopts('hd',\%opts);
89 if ($opts{h})
90 {
91 print $usage;
92 exit;
93 }
94 if ($opts{d})
95 {
96 print "Deleting old blast reports.\n";
97 unlink glob "$annodir/*\.bl2seq";
98 unlink glob "$annodir/*\.comp";
99 unlink glob "$annodir/*\.tfa";
100 unlink glob "$annodir/nomatch/*\.bl2seq";
101 unlink glob "$annodir/nopoly/*\.bl2seq";
102 unlink glob "$annodir/poly/*\.bl2seq";
103 unlink glob "$annodir/poly/*\.anno";
104 unlink glob "$annodir/oneflank/*\.bl2seq";
105 unlink glob "$annodir/twoflank/*\.bl2seq";
106 my @dirs = qw(nomatch nopoly poly oneflank twoflank);
107 foreach (@dirs) { !system("rmdir $annodir/$_") or warn "Can't delete $_ $!"; }
108 exit;
109 }
110
111 # check that deps are present
112 my @deps = qw(bl2seq);
113 my %find = ();
114 $find{bl2seq} = $bl2seq;
115 &Msatminer::depcheck(\%find,@deps);
116
117 # some hashes - it's necessary to relate lots of file/msat names/numbers
118 # to each other, and this does the job.
119 my %notes=(); # store annotation for each file in @annotated
120 my %rem_name=();
121 my %rem_genome=();
122 my %msatnum=(); # number every msat found
123 my %msatnames=(); # reverse hash of msatnum
124 my %cons=(); # a guess at the matching repeat in the other genome
125 my %rem_len=(); # store genome length
126 my %rem_blast=(); # makes sure that fasta files only are passed to bl2seq
127 my %rem_other=(); # the other genome
128 my %reportmatch=(); # which blast report goes with this msat?
129 my %repeatmatch=(); # which msat goes with this blast report?
130 my %rem_msats=();
131
132 # some arrays
133 my @genomes;
134 if (@ARGV)
135 {
136 @genomes = @ARGV;
137 }
138 else
139 {
140 @genomes = glob "*\.$genomeglob";
141 }
142 my @annotated=(); # store names of annotated files
143
144 # check that a proper pairwise
145 # comparison is being made
146 unless (@genomes)
147 {
148 print "No genome files (suffix .$genomeglob) have been found or specified on the command line.\n";
149 print "Exiting!\n";
150 exit;
151 }
152 if (@genomes != 2)
153 {
154 print "You must choose two genomes to annotate. Please try again.\n";
155 print "Exiting!\n";
156 exit;
157 }
158
159 # make sure we know what the "other genome" is
160 my ($genome1,$genome2) = @genomes;
161 $rem_other{$genome1} = $genome2;
162 $rem_other{$genome2} = $genome1;
163
164 # set up the unique names for each genome,
165 # and also to fasta conversions
166 my @repeats;
167 my $count = 0;
168 foreach my $genome (@genomes)
169 {
170 # msatfinder is obliged to extract a unique
171 # identity for each sequence, and use this
172 # to name the fasta files. Sometimes, this
173 # identity differs from the name of the genome
174 # file! So, here's a workaround for this evil.
175 # get the first line of one of the repeat files
176 my ($isfasta,$check);
177 open (FCHECK, "<$genome") or warn "Can't check file type of $genome -- $!";
178 $. = 0;
179 do { $check = <FCHECK> } until $. == 1 || eof;
180 if ($check =~ /^>/)
181 {
182 $isfasta = 1;
183 }
184 else
185 {
186 $isfasta = 0;
187 }
188 close FCHECK;
189
190 # open the genome file and extract the unique identifier
191 # new object, convert format if required
192 print "Processing files...\n";
193 my $filein;
194 if ($isfasta == 1) # fasta
195 {
196 $filein = Bio::SeqIO->new(-file => "$genome",
197 -format => 'Fasta');
198 $rem_blast{$genome} = $genome;
199 }
200 else # genbank
201 {
202 my $fastafile = $genome;
203 $fastafile =~ s/\.$genomeglob/.tfa/g;
204 if (-e $fastafile)
205 {
206 print "Genbank file $genome already converted to fasta format...\n";
207 }
208 else
209 {
210 print "Converting genbank file $genome to fasta format...\n";
211 open (OUT, ">$fastafile") or die "Can't open $fastafile: $!";
212 my $in = Bio::SeqIO->new(-file => "$genome",
213 -format => "genbank");
214 my $out = Bio::SeqIO->new(-fh => \*OUT,
215 -format => "Fasta");
216 while (my $seq = $in->next_seq())
217 {
218 $out->write_seq($seq);
219 }
220 close OUT;
221 }
222 $rem_blast{$genome} = $fastafile;
223 $filein = Bio::SeqIO->new(-file => "$genome",
224 -format => 'genbank');
225
226 }
227
228 my $short_name;
229 while (my $seq = $filein->next_seq())
230 {
231 my $id = $seq->display_id();
232 $id =~ s/;//;
233 $id =~ s/\.$genomeglob//;
234
235 # create a short name to refer to
236 # this particular sequence
237 my @ids = split(/\|/, $id);
238 if ($id =~ /^gi/) { $short_name = $ids[4]; }
239 elsif ($id =~ /^gb/) { $short_name = $ids[2]; }
240 else { $short_name = $id; }
241 my $genome_length = $seq->length();
242 $rem_len{$short_name} = $genome_length;
243 }
244
245 # create a hash linking short_name to genome
246 $rem_name{$genome} = $short_name;
247 $rem_genome{$short_name} = $genome;
248
249 my @msats = glob "$repeat_path$rem_name{$genome}*fasta";
250 foreach my $msat (@msats)
251 {
252 my $name = [split(/\//, $msat)]->[-1];
253 push (@repeats, $name);
254 push (@{$rem_msats{$genome}}, $name);
255 }
256 }
257
258 ####################
259 # perform blasting #
260 ####################
261 foreach my $genome (@genomes)
262 {
263 # produce standard bl2seq reports
264 foreach my $repeat (@{$rem_msats{$rem_other{$genome}}})
265 {
266 my $outputfile;
267 my $firstname = $genome;
268 $firstname =~ s/\.$genomeglob$/./;
269 my $lastname = $repeat;
270 $outputfile = $firstname . $lastname;
271 $outputfile =~ s/\.$msatglob$/.bl2seq/;
272 $reportmatch{$repeat} = $outputfile;
273 $repeatmatch{$outputfile} = $repeat;
274
275 if (-e "$annodir/$outputfile" or
276 -e "$annodir/nomatch/$outputfile" or
277 -e "$annodir/nopoly/$outputfile" or
278 -e "$annodir/oneflank/$outputfile" or
279 -e "$annodir/twoflank/$outputfile" or
280 -e "$annodir/poly/$outputfile")
281 {
282 print "Genome $genome already blasted against microsatellite $repeat\n";
283 $count++;
284 }
285 else
286 {
287 open (OUT, ">$annodir/$outputfile") or die "Can't open output file $outputfile: $!\n";
288 print "Blasting genome $genome against repeat $repeat\n";
289 my @output = `$find{bl2seq} $bl2seq_options -e $e_value -p $blast_type -i $rem_blast{$genome} -j $repeat_path$repeat`;
290 foreach my $output (@output)
291 {
292 if ($output =~ /^Query=/)
293 {
294 $output = "Query = $genome ($rem_name{$genome})\n";
295 }
296 print OUT $output;
297 }
298 }
299 }
300 }
301
302 # how many files were seen
303 if ($count > 0)
304 {
305 print "$count bl2seq reports already created.\n";
306 }
307
308 ########################################
309 # perform some pre-annotation matching #
310 ########################################
311
312 # glob all the report files
313 my @oldfiles = glob "$annodir/nomatch/*\.bl2seq $annodir/nopoly/*\.bl2seq $annodir/poly/*\.bl2seq $annodir/oneflank/*\.bl2seq $annodir/twoflank/*\.bl2seq";
314 my @currentfiles = glob "$annodir/*\.bl2seq";
315 unless (@oldfiles or @currentfiles)
316 {
317 print <<EOF;
318 You appear to have no bl2seq reports available, sorted
319 or unsorted. Please check your setup and try again.
320
321 Exiting!
322
323 EOF
324 exit;
325 }
326
327
328 ###########################################
329 # number all the msats, by start position #
330 # within each genome, and sort them #
331 ###########################################
332 my $number = 1;
333 my @countmsats;
334 foreach (@repeats)
335 {
336 push (@countmsats, [split(/\./, $_)]);
337 }
338 my @sorted = sort { $a->[0] cmp $b->[0] || $a->[1] <=> $b->[1] } @countmsats;
339 foreach (@sorted)
340 {
341 $msatnum{join(".", @$_)} = $number;
342 $msatnames{$number} = join(".", @$_);
343 $number++;
344 }
345
346
347 # make sure that the directories exist
348 my @dirs=qw(nomatch nopoly poly oneflank twoflank);
349 foreach my $dir (@dirs)
350 {
351 unless (-e "$annodir/$dir") { mkdir "$annodir/$dir", 0755 or warn "Can't create $dir: $!"; }
352 }
353
354 # purge all the genomes with no subject or tiny matches.
355 my $nomatchcount = 0;
356 my $shortmatchcount = 0;
357 my $fullname;
358 FILE: foreach my $outfile (@currentfiles)
359 {
360 # if there is no subject, then the report is immediately rejected.
361 open (READ, "<$outfile") or die "Can't open $outfile: $!";
362 my @linecheck = <READ>;
363 if (-e $outfile)
364 {
365 unless (grep /Sbjct:/, @linecheck)
366 {
367 rename "$annodir/$outfile", "$annodir/nomatch/$outfile" or die "Can't move $outfile: $!";
368 $nomatchcount++;
369 next FILE;
370 }
371 }
372 close READ;
373
374 # get information about this msat, based
375 # on the bl2seq report filename
376 my ($s,$subj,$query,$r1,$r2,$replen);
377 my @repnames = split(/\./, $outfile);
378 $subj = $repnames[1];
379 $s = $repnames[2];
380 $r1 = length($repnames[3]);
381 $r2 = $repnames[4];
382 $replen = $r1 * $r2;
383 if ($outfile =~ /^([a-z]+\/?)/)
384 {
385 $query = [split (/\//, $repnames[0])]->[1];
386 }
387 else
388 {
389 $query = $repnames[0];
390 }
391
392 my $report = Bio::Tools::BPbl2seq->new(-file => "$outfile", -report_type => "$blast_type");
393 my $subject = [split (/,/, $report->sbjctName)]->[0];
394
395 if ($smatch == 1 and -e $outfile)
396 {
397 # if the first match is very small (i.e. not >2* msat footprint) then it is also rejected
398 while (my $hsp = $report->next_feature)
399 {
400 my $identities = $hsp->length();
401 unless ($identities > ($multiplier * $replen))
402 {
403 # reject this one
404 rename "$annodir/$outfile", "$annodir/nomatch/$outfile" or die "can't move $outfile: $!";
405 $shortmatchcount++;
406 }
407 next FILE;
408 }
409 }
410 }
411
412 ###############################################################################
413 # use another bl2seq report to determine which repeats are in conserved areas #
414 ###############################################################################
415 my (@self,@other);
416 foreach (@sorted)
417 {
418 if ($_->[0] eq $rem_name{$genome1})
419 {
420 push (@self, $_);
421 }
422 else
423 {
424 push (@other, $_);
425 }
426 }
427
428 ###########################################################
429 # check if msats are found in maching regions #
430 # create a new, easily parseable report for each msat, #
431 # and on-the-fly determine the location of matching msats #
432 ###########################################################
433 print "Matching msats. Please wait...\n";
434 $count = 0;
435 foreach my $self (@self,@other)
436 {
437 my $msatname1 = join(".", @$self);
438 $cons{$msatname1} = "";
439 my @lines = `$find{bl2seq} -F F -e $e_value -D 1 -p $blast_type -i $repeat_path$msatname1 -j $rem_blast{$rem_other{$rem_genome{$self->[0]}}}`;
440
441 if ($save == 1)
442 {
443 my $savename = $rem_blast{$rem_other{$rem_genome{$self->[0]}}} . ".$msatname1";
444 $savename =~ s/\.$msatglob/\.comp/g;
445 $savename =~ s/\.tfa//g;
446 if (-e "$annodir/$savename")
447 {
448 $count++;
449 }
450 else
451 {
452 open (OUT, ">$annodir$savename") or die "Can't open $savename: $!";
453 foreach my $l (@lines) { print OUT $l; }
454 close OUT;
455 }
456 }
457
458 # parse the report
459 foreach my $line (@lines)
460 {
461 chomp($line);
462 next if ($line =~ /^#/ or $line =~ /^$/);
463 my @values = split(/\s+/, $line);
464 my $match_len = $values[3];
465 my $s2 = $values[8];
466 my $e2 = $values[9];
467 my $replen = length($self->[2]) * $self->[3];
468
469 # look only in the array containing msats from the
470 # other genome
471 my $array;
472 if ($genome1 =~ /$rem_genome{$self->[0]}/)
473 {
474 $array = \@other;
475 }
476 else
477 {
478 $array = \@self;
479 }
480 foreach my $other (@$array)
481 {
482 if ($match_len > ($multiplier * $replen))
483 {
484
485 my $msatname2 = join(".", @$other);
486 if ($e2 < $s2)
487 {
488 if ($e2 <= $other->[1] and $other->[1] <= $s2)
489 {
490 $cons{$msatname1} .= "$msatnum{$msatname2} ";
491 }
492 }
493 else
494 {
495 if ($s2 <= $other->[1] and $other->[1] <= $e2)
496 {
497 $cons{$msatname1} .= "$msatnum{$msatname2} ";
498 }
499 }
500 }
501 }
502 }
503 }
504
505 # how many files were seen
506 if ($count > 0)
507 {
508 print "$count tabular bl2seq files already created.\n";
509 }
510
511 ############################
512 # begin annotation section #
513 ############################
514 my @oldrejs = glob "$annodir/nomatch/*\.bl2seq";
515 print "$nomatchcount non-matching reports found and refiled.\n" unless ($nomatchcount == 0);
516 print "$shortmatchcount reports with short matches found and refiled.\n\n" unless ($shortmatchcount == 0);
517 print "A total of " . ($nomatchcount + $shortmatchcount) . " reports have been rejected.\n";
518
519 # glob the list of outfiles that need annotating
520 my @outfiles = glob "$annodir/*\.bl2seq";
521
522 if (@outfiles)
523 {
524 print color("green"), "Press A <enter> to annotate these genomes, or <enter> only to skip annotation", color("reset"), ".\n";
525 print color("red"), "NOTE: ", color("reset"), "bl2seq reports remaining in this directory will not be summarised.\n";
526 my $choice = <STDIN>;
527 chomp $choice;
528 if ($choice =~ /^[Aa]/)
529 {
530 &annotate(@outfiles);
531 }
532 }
533
534
535 ###########################################
536 # start producing the final summary table #
537 ###########################################
538
539 # set up some more variables,
540 # related this time to blasting
541 my $identities;
542 my $gaps;
543 my $strand;
544 my $expect;
545 my $outfile;
546 my $annotation;
547
548 # for each line of data, link it to the particular genome
549 my @gen0 = ();
550 my @gen1 = ();
551
552
553 ##############################
554 # get information from files #
555 ##############################
556 foreach my $genome (@genomes)
557 {
558 &filereader("$annodir/poly/",$genome,$p_degree);
559 &filereader("$annodir/oneflank/",$genome,$f1_degree);
560 &filereader("$annodir/twoflank/",$genome,$f2_degree);
561 &filereader("$annodir/nopoly/",$genome,$np_degree);
562 &filereader("$annodir/nomatch/",$genome,$nc_degree);
563 }
564
565 #############################
566 # end - print out some data #
567 #############################
568 my $sumfile = "annotated.csv";
569
570 if (-e $sumfile)
571 {
572 move("$sumfile","$sumfile.bak");
573 }
574
575 open (SUM, ">$sumfile") or die "Can't open $sumfile: $!";
576
577 # the titles
578 if ($expand == 1)
579 {
580 print SUM "**($genomes[0])****($genomes[1])****bl2seq output*****manual annotation\n";
581 print SUM "number*blast_hits*start*repeat*units*footprint*start*repeat*units*footprint*expect*score*identity*gaps*strand*conservation\n";
582 }
583 elsif ($expand == 0)
584 {
585 print SUM "**($genomes[0])**($genomes[1])**bl2seq output*****manual annotation\n";
586 print SUM "number*blast_hits*start*repeat*start*repeat*expect*score*identity*gaps*strand*conservation\n";
587 }
588
589 # sort the output based numerically;
590 # works on first field by default
591 my @genA = sort { $a cmp $b } @gen0;
592 my @genB = sort { $a cmp $b } @gen1;
593
594
595 # the lines of data
596 for (@genA)
597 {
598 print SUM "$_\n";
599 }
600 for (@genB)
601 {
602 print SUM "$_\n";
603 }
604
605 close SUM;
606
607 ###########
608 # The End #
609 ###########
610 print "Msatannotator finished.\n";
611
612 ##########################
613 # read each blast report #
614 ##########################
615 sub filereader
616 {
617 my $label = shift;
618 my $genome = shift;
619 $genome =~ s/\.$genomeglob//g;
620 my $annotation = shift;
621
622 my @files = glob "$label$genome\.*\.bl2seq";
623
624 foreach my $file (sort { $a cmp $b } @files)
625 {
626 my ($expect,$gaps,$strand,$identities,$score) = qw(n/a n/a n/a n/a n/a);
627
628 my $annofile = $file;
629 $annofile =~ s/\.bl2seq/.anno/g;
630
631 # if it's manually annotated, get the annotation
632 # from the annotation file
633 if (-e $annofile)
634 {
635 open (READ, "<$annofile") or die "Can't open $annofile: $!";
636 my @in = <READ>;
637 $annotation = $in[0];
638 chomp($annotation);
639 close READ;
640 }
641
642 # get the necessary details in order to complete
643 # $return, below (e.g. species, start, repeat &c.)
644 my @info = split(/\./, $file);
645 my $repeat = $info[1];
646 my $start = $info[2];
647 my $motif = lc $info[3];
648 my $units = $info[4];
649 my $footprint;
650 my @bases_self = split(//, $motif);
651
652 my $msatname = join ".", $repeat, $start, $info[3], $units, "fasta";
653
654 if (defined $motif and defined $units)
655 {
656 $footprint = (length $motif) * $units;
657 }
658
659 ################################################################
660 # You may be wondering why I have resorted to such a crude #
661 # technique as this. The answer is that neither BpBl2seq #
662 # nor SearchIO do what is required, and it's quicker to #
663 # write this simple code than it is to (re-)write a bioperl #
664 # module. I'm not ruling that out for later attention, though. #
665 ################################################################
666
667 # read the blast results from the file
668 open (IN, "<$file") or die "Can't open $file: $!";
669 my @inlines = <IN>;
670 close IN;
671
672 # make sure that we read in the values for the top hit
673 my @lines = reverse @inlines;
674
675 # get the required information for the best hsp
676 if (grep /Sbjct:/, @lines)
677 {
678 foreach my $line (@lines)
679 {
680 chomp($line);
681 if ($line =~ /^\s+Score/)
682 {
683 my @scex = split(/,/, $line);
684 $score = $scex[0];
685 $score =~ s/Score = //;
686 $expect = $scex[1];
687 $expect =~ s/Expect = //;
688 }
689 elsif ($line =~ /^\s+Identities/ and $line =~ /Gaps/)
690 {
691 my @idga = split(/,/, $line);
692 $identities = $idga[0];
693 $identities =~ s/Identities = //;
694 $gaps = $idga[1];
695 $gaps =~ s/Gaps = //;
696 }
697 elsif ($line =~ /^\s+Identities/)
698 {
699 $identities = $line;
700 $identities =~ s/Identities = //;
701 $gaps = "0";
702 }
703 elsif ($line =~ /^\s+Strand/)
704 {
705 $strand = $line;
706 $strand =~ s/Strand = //;
707 $strand =~ s/\s*Plus\s*/+/g;
708 $strand =~ s/\s*Minus\s*/-/g;
709 }
710 }
711 }
712
713 #############################################################
714 # place the data in the correct array for each genome #
715 # the separate $data entries make sure that the columns are #
716 # in the correct order for the final CSV table #
717 #############################################################
718
719 if ($repeat =~ /$rem_name{$genomes[0]}/)
720 {
721 my $data;
722 if ($expand == 0)
723 {
724 $data = "$msatnum{$msatname}*$cons{$msatname}*$start*($motif)$units***$expect*$score*$identities*$gaps*$strand*$annotation*";
725 }
726 elsif ($expand == 1)
727 {
728 $data = "$msatnum{$msatname}*$cons{$msatname}*$start*$motif*$units*$footprint*****$expect*$score*$identities*$gaps*$strand*$annotation*";
729 }
730 push (@gen0, $data);
731 }
732 elsif ($repeat =~ /$rem_name{$genomes[1]}/)
733 {
734 my $data;
735 if ($expand == 0)
736 {
737 $data = "$msatnum{$msatname}*$cons{$msatname}***$start*($motif)$units*$expect*$score*$identities*$gaps*$strand*$annotation*";
738 }
739 elsif ($expand == 1)
740 {
741 $data = "$msatnum{$msatname}*$cons{$msatname}*****$start*$motif*$units*$footprint*$expect*$score*$identities*$gaps*$strand*$annotation*";
742 }
743 push (@gen1, $data);
744 }
745 }
746 }
747
748
749 #################################
750 # view each file with the pager #
751 #################################
752 sub annotate
753 {
754 my $count = 0;
755 my @files = @_;
756 my $total = @_;
757
758 # if no repeats have been annotated this time,
759 # glob in a list of the ones in subdirs
760 unless (@annotated)
761 {
762
763 my @nomatch = glob "$annodir/nomatch/*\.bl2seq";
764 my @nopoly = glob "$annodir/nopoly/*\.bl2seq";
765 my @oneflank = glob "$annodir/oneflank/*\.bl2seq";
766 my @twoflank = glob "$annodir/twoflank/*\.bl2seq";
767 my @poly = glob "$annodir/poly/*\.bl2seq";
768
769 foreach (@nomatch) { my $stuff = [split(/\//, $_)]->[-1]; $notes{$stuff} = $nc_degree; push (@annotated, $stuff);
770 }
771 foreach (@nopoly) { my $stuff = [split(/\//, $_)]->[-1]; $notes{$stuff} = $np_degree; push (@annotated, $stuff);
772 }
773 foreach (@oneflank) { my $stuff = [split(/\//, $_)]->[-1]; $notes{$stuff} = $f1_degree; push (@annotated, $stuff); }
774 foreach (@twoflank) { my $stuff = [split(/\//, $_)]->[-1]; $notes{$stuff} = $f2_degree; push (@annotated, $stuff); }
775
776 foreach (@poly)
777 {
778 print "Yee-haw! $_\n";
779 }
780 #die "Yew varmint!\n";
781 foreach my $file (@poly)
782 {
783 my $text;
784 my $annofile = $file;
785 $annofile =~ s/\.bl2seq$/.anno/;
786 if (-e "$annofile")
787 {
788 open (IN, "<$annofile") or die "Can't open $annofile: $!";
789 my @lines = <IN>;
790 $text = $lines[0];
791 chomp($text);
792 }
793 else
794 {
795 $text = "An annotation file should exist for this report, but is missing.";
796 }
797 my $name = [split(/\//, $file)]->[-1];
798 $notes{$name} = $text;
799 push (@annotated, $name);
800 close IN;
801 }
802 }
803
804 FILE: foreach my $outfile (@files)
805 {
806 # get the filename of the original report, to see which
807 # reports match this one
808 my @orig = split(/\./, $outfile);
809 my $orig_file = join(".", $orig[1], $orig[2], $orig[3], $orig[4], "fasta");
810
811 $count++;
812 system("clear");
813
814 my $left = $total - $count;
815 print "You have viewed $count file(s) from a total of $total.\n";
816 print "There are $left files left\n\n";
817 print "--------------------\n";
818
819 ################################################
820 # FIRST MENU: #
821 # display matching files and their annotations #
822 ################################################
823 my @tempnotes=();
824 $outfile =~ s/$annodir\///;
825 if ($cons{$orig_file} ne "")
826 {
827 my @conserved = split(/\s/, $cons{$orig_file});
828 my $match;
829 my %matchseen = ();
830 foreach my $hit (@conserved)
831 {
832 $match = $msatnames{$hit};
833 print "Msat ", color("cyan"), $orig_file, color("reset"), " hit ", color("red"), $match, color("reset"), ".\n" unless (defined $matchseen{$match});
834 if (defined $notes{$reportmatch{$match}})
835 {
836 print "This is already annotated as: ";
837 print color("green"), "$notes{$reportmatch{$match}}", color("reset"), "\n";
838 push (@tempnotes, $notes{$reportmatch{$match}});
839 }
840 elsif (defined $reportmatch{$match})
841 {
842 print "The corresponding report ($reportmatch{$match}) has not yet been annotated.\n";
843 }
844 $matchseen{$match} = 1;
845 }
846 }
847 else
848 {
849 print "There are no hits matching $repeatmatch{$outfile}. \nENTER to view the report, or X to exit.\n";
850 chomp(my $choices = <STDIN>);
851 exit if ($choices =~ /^[Xx]/);
852 &viewer("$annodir/$outfile");
853 next;
854 }
855
856 print "--------------------\n";
857
858 ###################################################################
859 # if some annotation was found, as user if they want to re-use it #
860 ###################################################################
861 my %option=();
862 if (@tempnotes)
863 {
864 my $old = "";
865 print "Please select an option, and press ENTER:\n\n";
866 my $choices = 1;
867 foreach my $temp (@tempnotes)
868 {
869 next if ($temp eq "$old");
870 print "$choices: Use annotation: ", color("green"), " $temp ", color("reset"), "\n";
871 $option{$choices} = $temp;
872 $choices++;
873 $old = $temp;
874 }
875 print $choices, ": Edit this annotation.\n";
876 $option{$choices} = "edit";
877 print $choices+1, ": View the bl2seq report.\n";
878 $option{$choices+1} = "view";
879
880 print "\n";
881 print "X: exit.\n";
882
883 # choices
884 my $loop = 1;
885 while ($loop == 1)
886 {
887 chomp(my $in = <STDIN>);
888 exit if ($in =~ /^[Xx]/);
889
890 # check
891 my $annotation;
892 if ($in >= 1 and $in <= $choices+1)
893 {
894 $annotation = $option{$in};
895 }
896 else
897 {
898 print "Please select one of the menu options.\n";
899 }
900
901 # refiling
902 if ($annotation =~ /$nc_degree/)
903 {
904 $notes{$outfile} = $nc_degree;
905 rename "$annodir/$outfile", "$annodir/nomatch/$outfile" or die "Can't move $outfile: $!";
906 push (@annotated, $outfile);
907 $loop = 0;
908 }
909 elsif ($annotation =~ /$np_degree/)
910 {
911 $notes{$outfile} = $np_degree;
912 rename "$annodir/$outfile", "$annodir/nopoly/$outfile" or die "Can't move $outfile: $!";
913 push (@annotated, $outfile);
914 $loop = 0;
915 }
916 elsif ($annotation =~ /$f1_degree/)
917 {
918 $notes{$outfile} = $f1_degree;
919 rename "$annodir/$outfile", "$annodir/oneflank/$outfile" or die "Can't move $outfile: $!";
920 push (@annotated, $outfile);
921 $loop = 0;
922 }
923 elsif ($annotation =~ /$f2_degree/)
924 {
925 $notes{$outfile} = $f2_degree;
926 rename "$annodir/$outfile", "$annodir/twoflank/$outfile" or die "Can't move $outfile: $!";
927 push (@annotated, $outfile);
928 $loop = 0;
929 }
930 elsif ($annotation =~ /view/)
931 {
932 &viewer("$annodir/$outfile",(join ",", @tempnotes));
933 $loop = 0;
934 }
935 elsif ($annotation =~ /edit/)
936 {
937 my $term = new Term::ReadLine 'first edit';
938 my $prompt = "Msat annotation: ";
939 foreach my $temp (@tempnotes) { $term->addhistory($temp); }
940 $term->ornaments('0');
941 my $line = $term->readline($prompt);
942 push (@annotated, $outfile);
943 $notes{$outfile} = $line;
944 rename "$annodir/$outfile", "$annodir/poly/$outfile" or die "Can't move $outfile: $!";
945 my $annofile = "$annodir/poly/$outfile";
946 $annofile =~ s/\.bl2seq/.anno/g;
947 open (ANNO, ">$annofile") or die "Can't open $annofile: $!";
948 print ANNO $line;
949 close ANNO;
950
951 $loop = 0;
952 }
953 else
954 {
955 $notes{$outfile} = $annotation;
956 rename "$annodir/$outfile", "$annodir/poly/$outfile" or die "Can't move $outfile: $!";
957 # create the annotation file
958 my $annofile = "$annodir/poly/$outfile";
959 $annofile =~ s/\.bl2seq/.anno/g;
960 print "AF: $annofile\n";
961 open (ANNO, ">$annofile") or die "Can't open $annofile: $!";
962 print ANNO $annotation;
963 close ANNO;
964 push (@annotated, $outfile);
965 $loop = 0;
966 }
967 }
968 }
969 else
970 {
971 print "ENTER to view the report, or X to exit.\n";
972 chomp(my $choices = <STDIN>);
973 exit if ($choices =~ /^[Xx]/);
974 &viewer("$annodir/$outfile");
975 next FILE;
976 }
977 }
978 }
979
980
981 # show the alignment
982 sub viewer
983 {
984 my $outfile = shift;
985 my $text = shift;
986 $text = "NONE" unless (defined $text);
987
988 # $outfile does not include the path, and the file may
989 # have to be sought in a subdirectory
990 unless (-e "$annodir/outfile")
991 {
992 foreach (@dirs)
993 {
994 if (-e "$annodir/$_/$outfile")
995 {
996 $outfile = "$annodir/$_/$outfile";
997 last;
998 }
999 }
1000 }
1001
1002 system("$pager $outfile");
1003 #########################################################
1004 # SECOND MENU: #
1005 # choose a category rather than use existing annotation #
1006 #########################################################
1007 print <<EOF;
1008 Please select one of the following options,
1009 then press ENTER.
1010
1011 Standard annotation:
1012 1: No match.
1013 2: One flank only conserved.
1014 3: Both flanks conserved.
1015 4: No msat polymorphism (everything conserved).
1016
1017 Custom annotation:
1018 5: Edit and use previous annotations.
1019 6: View top HSP again, and annotate.
1020
1021 X: Quit.
1022 EOF
1023
1024 my $loop = 1;
1025 while ($loop == 1)
1026 {
1027 $outfile =~ s/$annodir\///;
1028 chomp(my $choice = <STDIN>);
1029 if ($choice =~ /^1/)
1030 {
1031 rename "$annodir/$outfile", "$annodir/nomatch/$outfile" or die "Can't move $outfile: $!";
1032 push (@annotated, $outfile);
1033 $notes{$outfile} = $np_degree;
1034 $loop = 0;
1035 }
1036 elsif($choice =~ /^2/)
1037 {
1038 rename "$annodir/$outfile", "$annodir/oneflank/$outfile" or die "Can't move $outfile: $!";
1039 push (@annotated, $outfile);
1040 $notes{$outfile} = $f1_degree;
1041 $loop = 0;
1042 }
1043 elsif($choice =~ /^3/)
1044 {
1045 rename "$annodir/$outfile", "$annodir/twoflank/$outfile" or die "Can't move $outfile: $!";
1046 push (@annotated, $outfile);
1047 $notes{$outfile} = $f2_degree;
1048 $loop = 0;
1049 }
1050 elsif($choice =~ /^4/)
1051 {
1052 rename "$annodir/$outfile", "$annodir/nopoly/$outfile" or die "Can't move $outfile: $!";
1053 push (@annotated, $outfile);
1054 $notes{$outfile} = $np_degree;
1055 $loop = 0;
1056 }
1057 elsif($choice =~ /^5/)
1058 {
1059 unless ($text eq "NONE")
1060 {
1061 # allow users to edit the annotation
1062 my $term = new Term::ReadLine 'second edit';
1063 my $prompt = "Msat annotation: ";
1064 $term->addhistory($text);
1065 $term->ornaments('0');
1066 my $line = $term->readline($prompt);
1067 push (@annotated, $outfile);
1068 $notes{$outfile} = $line;
1069 rename "$annodir/$outfile", "$annodir/poly/$outfile" or die "Can't move $outfile: $!";
1070 my $annofile = "$annodir/poly/$outfile";
1071 $annofile =~ s/\.bl2seq/.anno/g;
1072 open (ANNO, ">$annofile") or die "Can't open $annofile: $!";
1073 print ANNO $line;
1074 close ANNO;
1075
1076 $loop = 0;
1077 }
1078 else
1079 {
1080 print "Annotation not available. Please try again.\n";
1081 }
1082 }
1083 elsif($choice =~ /^6/)
1084 {
1085 # add an annotation to each polymorphic repeat
1086 # file so that sum_bl2seq can grep it out
1087 # display the alignment on screen to help when reading
1088 system("clear");
1089 open (READ, "<$annodir/$outfile") or die "Can't read $outfile: $!";
1090 my @repnames = split(/\./, $outfile);
1091 $fullname = "repeat: ($repnames[3])$repnames[4]";
1092 my @lines = <READ>;
1093 close READ;
1094 my $scorecheck = 0;
1095 print "------------------------------\n";
1096 print "$fullname\n";
1097 foreach my $line (@lines)
1098 {
1099 $scorecheck++ if ($line =~ /Score/);
1100 unless ($scorecheck > 1)
1101 {
1102 print $line if ($line =~ /Query:/);
1103 print $line if ($line =~ /\|+/);
1104 print "$line\n" if ($line =~ /Sbjct:/);
1105 }
1106 }
1107
1108 print "------------------------------\n";
1109
1110 # get the user input
1111 print "Please enter a free-form descrption of this feature, and press ENTER, or X to exit.\n";
1112 my $description = <STDIN>;
1113 chomp($description);
1114 exit if ($description =~ /^[Xx]/);
1115
1116 # append it to the end of the original file
1117 my $annofile = "$annodir/poly/$outfile";
1118 $annofile =~ s/\.bl2seq/.anno/g;
1119 open (DESC, ">>$annofile") or die "Can't read $annofile: $!";
1120 print DESC "$description\n";
1121 close DESC;
1122
1123 # finally, move the file
1124 rename "$annodir/$outfile", "$annodir/poly/$outfile" or die "Can't move $outfile: $!";
1125 push (@annotated, $outfile);
1126 $notes{$outfile} = $description;
1127 $loop = 0
1128 }
1129
1130 elsif($choice =~ /^[Xx]/)
1131 {
1132 print "$nomatchcount non-matches detected on this run.\n" if ($nomatchcount > 0);
1133 print "\'bye! \n";
1134 exit 0;
1135 }
1136 else
1137 {
1138 print "Please choose one of the options shown above.\n";
1139 }
1140 }
1141 }
1142
1143 ###############
1144 # help, help! #
1145 ###############
1146 sub HELP_MESSAGE
1147 {
1148 print $usage;
1149 exit;
1150 }
1151
1152
1153 __END__