ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/yamap/get_orphans.pl
Revision: 1.1
Committed: Wed Dec 13 10:42:46 2006 UTC (9 years, 7 months ago) by gawi79
Branch: MAIN
CVS Tags: HEAD
Log Message:
A QuickMine script

Line File contents
1 #!/usr/bin/perl -w
2
3
4
5 # creates .overview .matrix, .rank, .score and .seq files
6
7 ##############################################
8 ##############################################
9 # SCRIPT NAME: get_orphans
10 # FUNCTION: Summarize blast reports
11 # AUTHOR: Cared for by Gareth Wilson (gawi@ceh.ac.uk)
12 # AUTHOR: Kerr Wall ported code to use SearchIO intead of Blast.pm
13 ##############################################
14
15
16
17 # This script takes 5 or 6 input arguments:
18 # scriptname
19 # YES|NO - if yes tag must match input file
20 # TAG to name the output files with
21 # and a file glob to grab input files
22
23 # ./get_orphans_table YES SELF SELF_blastp condor_output ext configfile
24
25
26 #no strict 'refs';
27 #use strict;
28 # broken below for using contruct $$var
29
30 use Bio::SearchIO;
31 use Config::Simple;
32
33
34
35
36 my $species = "";
37 my $hit_tag = "";
38
39 unless (@ARGV ==6) {
40 die "\n\nProper Command Line Usage:
41 get_orphans YES/NO TAG glob condor_output ext configfile\nPlease try again.\n\n\n";}
42
43 # shift the first command line arg passed through @ARGV
44 my $yes_or_no = shift;
45
46 unless ($yes_or_no eq "YES" || $yes_or_no eq "NO") {
47 die "\n\nProper Command Line Usage:
48 get_orphans YES|NO TAG glob\nPlease try again.";}
49
50 my $tag = shift;
51 my $glob = shift;
52 my $condor_output = shift;
53 my $ext = shift;
54 my $config_file = shift;
55
56 print "CONFIG FILE =*$config_file*\n";
57
58 # create a new object containing the variables in the cfg file
59 my $cfg = new Config::Simple($config_file);
60
61 # initialize the variables shared with the config file
62 my $path2output = $cfg->param('PATHS.path2output');
63 my $path2blast = $cfg->param('PATHS.path2blast');
64 my $path2public = $cfg->param('PATHS.path2public');
65
66 print "My $path2output\n";
67
68 # initialize the variables shared with the config file
69 my $sig_thresh = $cfg->param('PARAMS.sig_thresh');
70
71 print "***Significance threshold for detecting orphans: $sig_thresh\n";
72
73 # get the record separator from the config file
74 my $record_separator = $cfg->param('PARAMS.record_separator');
75 # convert since use of \t in config file results in literal \t being printed
76
77 if ($record_separator =~ "tab") {$record_separator = "\t"}
78
79 print "RECORD SEP $record_separator\n";
80 my $quick_tax = 1;
81 my @taxi = ();
82 open TAX, "$path2output/tax.list" or $quick_tax = 1;
83 open (HELP, ">>$path2output/help$tag.txt");
84 print HELP "quick_tax = $quick_tax\n";
85 my $taggle;
86 my @taggles;
87 if ($quick_tax == 0)
88 {
89 @taxi = <TAX>;
90 close TAX;
91 }
92 else
93 {
94 if ($tag =~/,/)
95 {
96 print HELP "Comma found\n";
97 @taggles = split/,/,$tag;
98 foreach $taggle (@taggles)
99 {
100 print "taggle = $taggle\n";
101 push @taxi, $taggle;
102 }
103 }
104 else
105 {
106 push @taxi, $tag;
107 print HELP "Single chromosome\n";
108 }
109 }
110
111 my $tax;
112 my @input_files = ();
113
114 # if YES, make input file match TAG
115 if ($yes_or_no eq "YES")
116 {
117 foreach $tax (@taxi)
118 {
119 chomp $tax;
120 chdir "$path2blast/$tag".".dir";
121 my @tmp = <*$glob>;
122 print HELP "tax = $tax\n";
123 print HELP "size tmp = $#tmp\n";
124 for (@tmp)
125 {
126 print HELP "$_, tax = $tax\n";
127 if ($_ =~ /$tax/ && $condor_output == 1)
128 {
129 push (@input_files, "$path2blast"."/"."$tag".".dir/"."$_");
130 print HELP "condor output = 1 size input = $#input_files\n";
131 }
132 elsif ($_ =~ /$tax/ && $condor_output == 0)
133 {
134 push (@input_files, "$path2blast"."/"."$_");
135
136 }
137 }
138 }
139 }
140 print HELP "size input = $#input_files\n";
141 # if NO, use all files that match glob
142 if ($yes_or_no eq "NO")
143 {
144 @input_files = <*$glob>;
145 }
146
147 my (%hit_name, @ranked);
148
149 # parse the abbreviations from the abbr.list file
150
151 my $line = "";
152 my @species = ();
153 if (-e "$path2output/abbr.list")
154 {
155 print "Found an abbr.list file.";
156 print "Will read \@species from your abbr.list file\n";
157 open (ABBR, "$path2output/abbr.list" )
158 or die "Can't open $path2output/abbr.list\n";
159 while ($line = <ABBR>)
160 {
161 if ($line =~ /#/) {next;}
162 chomp $line;
163 $line =~ s/^\s+//;
164 $line =~ s/\s+$//;
165 push (@species, $line);
166
167 }
168
169 close (ABBR);
170
171 }
172
173
174 # print to screen for debugging
175
176 foreach my $abbr (@species) {
177 print "Abbr: $abbr\n";
178 }
179
180
181 # Now get extra names to parse from abbr_extra.list file
182 $line = "";
183 @species = (); # we're adding to this now
184 if (-e "$path2output/abbr_extra.list")
185 {
186 print "Found an abbr.list file.";
187 print "Will read \@species from your abbr_extra.list file\n";
188 open (ABBR, "$path2output/abbr_extra.list" )
189 or die "Can't open $path2output/abbr_extra.list\n";
190 while ($line = <ABBR>)
191 {
192 if ($line =~ /#/) {next;}
193 chomp $line;
194 $line =~ s/^\s+//;
195 $line =~ s/\s+$//;
196 push (@species, $line);
197 }
198
199 close (ABBR);
200 }
201
202
203 # and print again for debugging purposes
204
205 foreach my $abbr (@species)
206 {
207 print "Full List Abbr: $abbr\n";
208 }
209
210
211 my $hitnumber;
212
213 my ($report, $hit);
214 my $orphan_count = 0;
215 my $error_count = 0;
216
217
218 # old files - not put to html to remind me which are old
219 my $orphans_file = $tag."_".$glob."_orphans";
220 my $summary_file = $tag."_".$glob."_orphans_summary";
221 my $errors_file = $tag."_".$glob."_errors";
222
223 # files that are most important
224 my $overview_file = $tag."_".$glob."_overview.html";
225 my $scores = $tag."_".$glob."_scores.html";
226 my $matrix = $tag."_".$glob."_matrix.html";
227 my $rank = $tag."_".$glob."_rank.html";
228 my $top_hit = $tag."_".$glob."_tophit.html";
229 my $tab_info = $tag."_tabinfo.txt";
230
231 open (OUT, ">$orphans_file") or die "can't open orphans file for writing";
232 open (ERROR, ">$errors_file") or die "can't open blast error file for writing";
233 open (SUMMARY, ">$summary_file") or die "can't open summary file for writing";
234 open (OVERVIEW, ">$path2output/$overview_file") or die "can't open overview file for writing";
235 open (SCORES, ">$path2output/$scores") or die "can't open scores file for writing";
236 open (TOPHIT, ">$path2output/$top_hit") or die "can't open top hit file for writing";
237 open (TABINFO, ">$path2output/$tab_info") or die "can't open tab info file for writing";
238 open (MATRIX, ">$path2output/$matrix") or die "can't open matrix file for writing";
239 open (RANK, ">$path2output/$rank") or die "can't open rank file for writing";
240 open (LOG, ">>logfile") or die "can't open logfile file for appending";
241
242
243 print HELP "starting to print to files\n";
244 print OUT "<PRE>Total orphans:\n";
245 print ERROR "<PRE>Total reports against SELF without hits\n";
246 print SUMMARY "<PRE>Summary of orphans\n";
247 print SCORES "<PRE>Scores\n";
248 print TOPHIT "<PRE>Top Hits\n";
249 print OVERVIEW "<PRE>Summary of hits by species (for loading into Excel)\n";
250 print OVERVIEW "Query$record_separator"."Self$record_separator";
251 print MATRIX "<PRE>Summary of top hit by species (for loading into Excel)\n";
252 print MATRIX " Query";
253 print RANK "<PRE>Top hits from each species in rank order (for loading into Excel)\n";
254
255 # write all the column headers to the output file
256
257 for (@species)
258 {
259 print OVERVIEW "$_$record_separator";
260 print MATRIX "$record_separator$_";
261 }
262
263 print OVERVIEW "Total Libs with Hits\n";
264
265
266 # do all the work here
267 foreach $report (@input_files)
268 {
269
270 print HELP "input = $#input_files , report = $report\n";
271 my $public_report = $report;
272 my $taggle;
273 my $tagorf;
274 if ($tag =~/,/)
275 {
276 print HELP "size taggles = $#taggles\n";
277 foreach $taggle (@taggles)
278 {
279 print HELP "taggle = $taggle\n";
280 my $orf = 'orf';
281 if ($report =~m/$taggle$orf/)
282 {
283 $tagorf = "$taggle"."orf";
284 }
285
286 }
287
288 }
289 else
290 {
291 $tagorf = "$tag"."orf";
292 }
293 $report =~m/($tagorf.{4})/;
294 my $brief_report = $1;
295 print HELP "brief report = $brief_report\ntag = $tag\nreport = $report\n";
296 $public_report =~s/$path2blast/$path2public/;
297 %hit_name = ();
298 @ranked = ();
299
300 # make a local link to file
301 my $reportlink = "<a href=\"$public_report\">$brief_report</a>";
302 print HELP "report link = $reportlink\n";
303
304 print "\nSummarizing file $report\n";
305 my $blast_report = new Bio::SearchIO( -format => 'blast', -file => $report);
306 my $result = $blast_report->next_result;
307 my $query_name = $result->query_name();
308 my $query_length = $result->query_length();
309
310 # since a genome report, we only want the original
311 # abbreviation
312 $query_name =~ /(.+)(orf)/;
313 $query_name = $1;
314
315 my @hits;
316 my $self = 0;
317 my $top_hit_count = 0;
318 while (my $hit = $result->next_hit())
319 {
320
321 my $hit_name = $hit->name();
322
323 my $desc = $hit->description();
324
325 # all the names of the files should be in the @species array
326 # after reading the abbr.list file. Bring the needed variables
327 # into existance by initializing them
328
329 # should take the place of the set array below
330 my $fasta_report;
331 my $string;
332 my $stringy;
333 my $length;
334 my $stringy_count = 0;
335 my $total_length = 0;
336 my $mean_length = 0;
337 my @mean;
338 my $short_number = 0;
339 my $long_number = 0;
340
341 foreach my $abbr (@species)
342 {
343 print "ABBR: $abbr\n";
344 $$abbr = "";
345 }
346
347 $hit_name =~ s/\.fasta,//;
348 my $expect = $hit->significance(),
349 my $score = $hit->raw_score();
350 my $description = $hit->description();
351 my $first_value = substr($expect,0,1);
352 if ($first_value eq "e")
353 {
354 $expect = "1" . $expect;
355 }
356 # keep only hits above a certain threshold
357 my $hsp_string;
358 my @hsps =();
359 while (my $hsp = $hit->next_hsp())
360 {
361 my $identity = $hsp->percent_identity;
362 $identity = sprintf("%.2f",$identity);
363 my $hsp_score = $hsp->score;
364 my $q_start = $hsp->start('query');
365 my $q_stop = $hsp->end('query');
366 my $s_start = $hsp->start('hit');
367 my $s_stop = $hsp->end('hit');
368 my $blast_type = $glob;
369 $blast_type =~s/SELF_//;
370 $hsp_string = "$brief_report\t$blast_type\t$hit_name\t$identity\t$hsp_score\t$q_start\t$q_stop\t$s_start\t$s_stop\n";
371 push @hsps, $hsp_string;
372 }
373 if ($expect < $sig_thresh)
374 {
375 $hit_name =~ /(.+)(orf)(.)/;
376 $hit_tag = $1;
377 print SCORES "$tag$record_separator$reportlink$record_separator$hit_name$record_separator$expect$record_separator$description\n";
378 if ($top_hit_count == 0)
379 {
380 print TOPHIT "$tag$record_separator$reportlink$record_separator$hit_name$record_separator$expect$record_separator$description\n";
381 }
382 if (!$hit_name{$hit_tag})
383 {
384 $hit_name{$hit_tag} = $hit_name;
385 push (@ranked, $hit_name);
386 }
387 print "HIT: $hit_name QUERY $query_name\n";
388 # if we hit self (match file name) don't keep
389 # but set $self equal to "true"
390 if ($hit_name =~ $query_name)
391 {
392 $self = 1;
393 }
394 push (@hits, $hit_name);
395 $top_hit_count++;
396 foreach my $hsp (@hsps)
397 {
398 print TABINFO "$hsp";
399 }
400 }
401
402 }
403 # SELF NOT INCLUDED IN @hits
404 $species = 0;
405 foreach my $hit (@hits) {
406 foreach $species (@species) {
407 # check for match on $species first, then on
408 # $species{$species}
409 #if ($hit =~ /$species/) {$$species++;}
410 if ($species =~/,/)
411 {
412 my @problems = split /,/,$species;
413 my $problem;
414 foreach $problem (@problems)
415 {
416 if ($hit =~ /$problem/ ) ## || $hit =~ /$species{$species}/)
417 {$$species++;}
418 }
419 }
420 else
421 {
422 if ($hit =~ /$species/ ) ## || $hit =~ /$species{$species}/)
423 {$$species++;}
424 }
425 }
426 }
427 my $hitnumber = @hits;
428 if ($hitnumber == 0 && $report =~ "SELF")
429 {
430 print ERROR "ERROR: $reportlink against SELF has no hits\n";
431
432 }
433 elsif ($hitnumber == 1 && $report =~ "SELF")
434 {
435 print OUT "ORPHAN: $reportlink against SELF has only SELF hit\n";
436 $orphan_count++;
437 }
438 elsif ($hitnumber == 0 && $report =~ "swiss")
439 {
440 print OUT "ORPHAN: $reportlink against swiss has no hits\n";
441 $orphan_count++;
442 }
443 elsif ($hitnumber == 1 && $report =~ "swiss")
444 {
445 print OUT "ORPHAN: $reportlink against swiss has 1 hit - could be self\n";
446 $orphan_count++;
447 }
448 else
449 {
450 }
451
452 print OVERVIEW "$reportlink$record_separator";
453 print MATRIX "\n$reportlink$record_separator";
454 print RANK "\n$reportlink$record_separator";
455 my $count_sp = 0;
456 print OVERVIEW "$self$record_separator";
457 foreach $species (@species)
458 {
459 if ($$species)
460 {
461 print OVERVIEW "$$species$record_separator";$count_sp++
462 }
463 else
464 {
465 print OVERVIEW "0$record_separator";
466 }
467 if ($hit_name{$species})
468 {
469 $hit_name{$species} =~ /(orf)(\d{4})/;
470 my $orf_num = $2;
471
472 print MATRIX "$orf_num$record_separator";
473 }
474 else
475 {
476 print MATRIX "0$record_separator";
477 }
478
479 # this is why strict won't work at present
480 $$species = "";
481 #print HELP "input after dollar dollar = $#input_files\n";
482 }
483 print OVERVIEW "$count_sp\n";
484 print RANK "@ranked";
485
486 }
487
488
489
490 # print out summary for each genome
491 print SUMMARY "Total orphans: $orphan_count\n";
492 print SUMMARY "Total errors (low, complexity proteins, possible orphans): $error_count\n";
493
494 print "Files created\n";
495 print LOG "$orphans_file$record_separator$summary_file$record_separator$errors_file$record_separator$overview_file";
496 close HELP;