ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/yamap/yamap_nogui_condor.pl
Revision: 1.1.1.1 (vendor branch)
Committed: Thu Sep 7 15:35:22 2006 UTC (9 years, 7 months ago) by knirirr
Branch: MAIN, cehox
CVS Tags: start, HEAD
Changes since 1.1: +0 -0 lines
Log Message:
Imported sources

Line File contents
1 #!/usr/bin/perl
2
3 ####################################################
4 # YAMAP: Yet Another Microbial Annotation Pipeline #
5 # 1. Run each of the programs below #
6 # 2. Convert the outputs to .tab #
7 ####################################################
8
9 use strict;
10 use threads;
11 use Bio::SeqIO;
12 use Config::Simple;
13 use Cwd;
14 use File::Basename;
15 use File::Copy;
16 use Getopt::Std;
17 use Data::Dumper;
18
19 #####################
20 # usage information #
21 #####################
22 my $usage="
23 Usage:
24
25 ./yamap.pl [-c <config_file>] [-p <path_file>] [-w] <input_files>
26
27 Options:
28
29 -c full path to user config file.
30 -p full path to path config file.
31 -w write out dbblast summary as xls file.
32 -h print this message.
33 ";
34
35 print "Yet Another Microbial Annotation Pipeline - running...\n";
36
37 ###################
38 # setup variables #
39 ###################
40 my $pwd = getcwd;
41 $pwd =~ s/\/raid2//; # for running on the CEH Ox. condor cluster
42 my $runfile;
43 my $installdir = "/usr/local/bioinf/yamap/yamap";
44 my $converter = "yamap_embl.pl";
45 my $excelwriter = "crunchtoxls.pl";
46 my $dbname = "self_blast";
47 my $pathfile;
48 my $nothing = 0;
49 my @blastfiles;
50 my $condor_submit = "/vdt/condor/bin/condor_submit";
51 my $condor_submit_dag = "/vdt/condor/bin/condor_submit_dag";
52 my $exit = 0;
53
54 # Condor assumptions:
55 # 1. Yamap runs in your home directory
56 # 2. Your home directory is NFS mounted over all machines in the pool
57 # 3. The user name is set here. I can't get it from /etc/passwd
58 my $user = "milo";
59
60
61 ###############
62 # get options #
63 ###############
64 my %opts=();
65 getopts('c:hp:w',\%opts);
66 # print help message
67 if (defined $opts{h})
68 {
69 print $usage;
70 exit;
71 }
72 # get user config file info
73 if (defined $opts{c})
74 {
75 $runfile = $opts{c};
76 }
77 else
78 {
79 print "Using default configuration...\n";
80 $runfile = "$installdir/etc/yamap_run.ini";
81 }
82 # get path config file info
83 if (defined $opts{p})
84 {
85 $pathfile = $opts{p};
86 }
87 else
88 {
89 print "Using default paths...\n";
90 $pathfile = "$installdir/etc/yamap_paths.ini";
91 }
92 # write out excel
93 my $write_excel = 0;
94 my $write_tophits = 0;
95 if (defined $opts{w})
96 {
97 $write_excel = 1;
98 }
99 if (defined $opts{t})
100 {
101 $write_tophits = 1;
102 }
103
104
105 #####################
106 # files to annotate #
107 #####################
108 my @infiles = @ARGV;
109 unless (@infiles)
110 {
111 print $usage;
112 exit;
113 }
114
115
116 # check all are fasta files
117 my @dudfiles;
118 foreach my $file (@infiles)
119 {
120 my $file = Bio::SeqIO->new(-file => "$file",
121 -format => "FASTA");
122 while (my $seq = $file->next_seq())
123 {
124 my $sequence_string = $seq->seq();
125 unless ($seq->validate_seq($sequence_string))
126 {
127 push(@dudfiles, $file);
128 }
129 }
130 }
131 if (@dudfiles)
132 {
133 print "Program terminated. The following input files are not valid FASTA files:\n";
134 foreach my $file (@dudfiles)
135 {
136 print "$file\n";
137 }
138 exit;
139 }
140
141
142 # read a config file containing the options for
143 # all the scripts to be run. This to be set up
144 # by GUI.
145 my $datestring = `date +%j%H%M%S`;
146 chomp($datestring);
147 my $config = new Config::Simple("$runfile") or die "Can't open run config file $runfile: $!";
148 my $common = $config->param(-block=>'COMMON');
149 my $outdir = $pwd . "/" . $common->{outdir};
150 $outdir =~ s/\/raid2//;
151 unless (-e $outdir)
152 {
153 mkdir("$outdir") or die "Can't create $outdir: $!";
154 }
155
156 # another config file containing all the hardcoded paths
157 my $pathconf = new Config::Simple($pathfile) or die "Can't open path config file: $!";
158 my $proc = $pathconf->param(-block=>'PROCESSING');
159 my $parse = $pathconf->param(-block=>'PARSING');
160
161
162 ######################################################
163 # parsers, executables, and the link between the two #
164 ######################################################
165 my @execs = qw(dbblast selfblast bl2seq einverted etandem palindrome trnascan msatfinder glimmer);
166 my %execpaths = ("bigblast" => $proc->{bigblast},
167 "selfblast" => $proc->{selfblast},
168 "mspcrunch" => $proc->{mspcrunch},
169 "formatdb" => $proc->{formatdb},
170 "einverted" => $proc->{einverted},
171 "etandem" => $proc->{etandem},
172 "palindrome" => $proc->{palindrome},
173 "trnascan" => $proc->{trnascan},
174 "msatfinder" => $proc->{msatfinder},
175 "glimmer" => $proc->{glimmer},
176 "longorfs" => $proc->{longorfs},
177 "extract" => $proc->{extract},
178 "build_icm" => $proc->{build_icm},
179 "formatdb" => $proc->{formatdb},
180 "rbs" => $proc->{rbs});
181 my %parsers = ("bigblast" => $parse->{bigblast},
182 "selfblast" => $parse->{selfblast},
183 "einverted" => $parse->{einverted},
184 "etandem" => $parse->{etandem},
185 "palindrome" => $parse->{palindrome},
186 "trnascan" => $parse->{trnascan},
187 "msatfinder" => $parse->{msatfinder},
188 "glimmer" => $parse->{glimmer},
189 "repfinder" => $parse->{repfinder},
190 "rbs" => $parse->{rbs});
191 my %realname = ("bigblast" => "big-blast.pl",
192 "selfblast" => "big-blast.pl",
193 "rbs" => "rbs_finder.pl",
194 "glimmer" => "glimmer2",
195 "longorfs" => "long-orfs",
196 "trnascan" => "tRNAscan-SE",
197 "build_icm" => "build-icm");
198
199 #################################################
200 # check that all the execpaths are in the $PATH #
201 #################################################
202 my @missing = ();
203 foreach my $bin (keys %execpaths)
204 {
205 # only run if this program is required
206 next if ($bin eq "selfblast");
207 next if ($common->{$bin} eq "0");
208 my $real;
209 if (defined $realname{$bin})
210 {
211 $real = $realname{$bin};
212 }
213 else
214 {
215 $real = $bin;
216 }
217 my $elsewhere = "$bin found, but not where your config file says it should be. Continuing...\n";
218 # check the various places it could be...
219 if (-e $execpaths{$bin})
220 {
221 next;
222 }
223 else
224 {
225 my $search = `which $real 2> /dev/null`;
226 chomp($search);
227 if ($search =~ /aliased to (\.+)/)
228 {
229 $execpaths{$bin} = $1;
230 print $elsewhere;
231 }
232 elsif (-x $search)
233 {
234 $execpaths{$bin} = $search;
235 print $elsewhere;
236 }
237 else
238 {
239 push(@missing, $real);
240 }
241 }
242 }
243 # warn the user if missing stuff
244 if (@missing)
245 {
246 print "The following programs were not found on this system: @missing\n";
247 print "Please add the full path to each dependency to the configuration file and try again.\n";
248 exit;
249 }
250
251 ##########################################
252 # create a self-blast database if needed #
253 ##########################################
254 my $formatdb = $execpaths{formatdb};
255 my $len = @infiles;
256 my @catfiles = ();
257 if ($common->{selfblast} == 1 and $len > 1)
258 {
259 print "Preparing self blast database...\n";
260 # clean up old d
261 if (-e "$outdir/$dbname")
262 {
263 unlink("$outdir/$dbname") or die "Can't remove old self-blast database: $!";
264 }
265 # cat only the other file to the db if there are two files
266 my @dbfiles;
267 if ($len == 2)
268 {
269 push (@catfiles, $infiles[1]);
270 }
271 else
272 {
273 foreach my $cf (@infiles) { push (@catfiles, $cf); }
274 }
275 # create new
276 if (system("cat @catfiles >> $outdir/$dbname") == 0)
277 {
278 if (system("cd $outdir; $formatdb -i $dbname -o F -p F") == 0)
279 {
280 print "$dbname formatting complete.\n";
281 }
282 else
283 {
284 warn "Could not run formatdb: $!";
285 }
286 }
287 else
288 {
289 print "Could not create $dbname: $!";
290 }
291 }
292 elsif ($common->{selfblast} == 1 and $len == 1)
293 {
294 print "Single sequence only - self-blast will not be run.\n";
295 }
296
297 ######################################
298 # run each of the annotation methods #
299 ######################################
300 my %artline=();
301 my ($out,@seqfiles);
302 foreach my $infile (@infiles)
303 {
304 # setup input file name and output directory name
305 $infile = [split(/\//,$infile)]->[-1];
306 my @tabfiles = ();
307
308 # oh dear. Some people insist upon putting in FASTA
309 # files with many sequences in them rather than the
310 # one-file-per-metagenomic-sequence that I was expecting.
311 # However, this can be dealt with using the power of SeqIO.
312 # First, we must determine if the file has multiple sequences.
313 my $multiseq = 0;
314 my $fastas = 0;
315 my @slurp = ();
316 open (CHECK, "<$infile") or die "Can't check $infile for No. of seqs. $!";
317 while (my $line = <CHECK>)
318 {
319 $fastas++ if ($line =~ /^>/);
320 if ($fastas > 1)
321 {
322 $multiseq = 1;
323 last;
324 }
325 }
326 close CHECK;
327
328 # use seqio to get an id
329 my $basefile = Bio::SeqIO->new(-file => "$infile",
330 -format => 'Fasta');
331
332
333 # get each seq in the file, and write out
334 while (my $seq = $basefile->next_seq())
335 {
336 my $id = $seq->display_id();
337 $id =~ s/[\.|\||:|,|;]/_/g;
338 if ($id eq "")
339 {
340 print "Invalid name in FASTA header. Please replace with something like \"seq1\", \"seq2\" etc.\nExiting!\n";
341 next;
342 }
343 push (@seqfiles, $id);
344
345 # this file should be written out to a temp. file
346 # for the various progs to run on it, and delete afterwards.
347 # use a symlink instead if the file is a single seq, to save
348 # on disk space
349 if ($multiseq == 1)
350 {
351 open (TFOUT, ">$id") or die "Can't open $id: $!";
352 print TFOUT "> $id\n";
353 print TFOUT $seq->seq();
354 close TFOUT;
355 }
356 else
357 {
358 system("ln -s $infile $id") == 0 or die "Can't symlink $id to $infile: $!";
359 }
360
361 $out = $outdir . "/" . $id . ".out";
362
363 unless (-e $out)
364 {
365 mkdir("$out") or die "Can't create $out: $!";
366 }
367 system("cp $pwd/$id $out/$id") == 0 or warn "Can't copy genome file: $!";
368
369 # run the outputs
370 # threading - go through each genome in turn, but detach a
371 # separate thread for each of the execs. Once all threads are
372 # detached, leave it all to condor and watch for when user's jobs
373 # have terminated using the monitor subroutine
374 foreach my $exec (@execs)
375 {
376 if ($common->{$exec} == 1)
377 {
378 $nothing = 1;
379 # create a thread for each of the annotation steps
380 # each thread to run its own condor job
381 my $pass = $id . "|" . $exec . "|" . $out;
382 my $subroutine;
383 # run different routines for condor and non-condor
384 $subroutine = "run_" . $exec . "_condor";
385 my $thr = threads->new("$subroutine","$pass");
386 $thr->detach();
387 }
388 }
389 }
390 }
391
392 # exit early if no options set
393 if ($nothing == 0)
394 {
395 print "No options were set, so no annotation programs were run.\n";
396 print "\nYAMAP finished!\n";
397 exit;
398 }
399
400 # wait until all the condor jobs have finished
401 my $thr = threads->new("monitor");
402 $thr->join();
403
404 # generating excel format output
405 if ($write_excel == 1 and $common->{dbblast} == 1)
406 {
407 # glob all the output files
408 foreach my $file (@seqfiles)
409 {
410 my $base = &basename($file);
411 my $dir = &dirname($file);
412 my @glob = glob "$dir/$outdir/$base.out/*dbblast.crunch";
413 foreach my $gl (@glob) { push (@blastfiles, $gl); }
414 }
415 if (@blastfiles)
416 {
417 print "Converting dbblast output to xls format...\n";
418 if (system("$installdir/bin/$excelwriter -a @blastfiles") == 0)
419 {
420 print "Excel file generated.\n";
421 }
422 else
423 {
424 warn "Could not generate excel file: $!";
425 }
426 }
427 }
428 # generating excel format top hits
429 if ($write_tophits == 1 and $common->{dbblast} == 1)
430 {
431 # glob all the output files
432 foreach my $file (@seqfiles)
433 {
434 my $base = &basename($file);
435 my $dir = &dirname($file);
436 my @glob = glob "$dir/$outdir/$base.out/*dbblast.crunch";
437 foreach my $gl (@glob) { push (@blastfiles, $gl); }
438 }
439 if (@blastfiles)
440 {
441 print "Converting top blast hits to xls format...\n";
442 if (system("$installdir/bin/$excelwriter -t @blastfiles") == 0)
443 {
444 print "Excel file generated.\n";
445 }
446 else
447 {
448 warn "Could not generate excel file: $!";
449 }
450 }
451 }
452
453 # convert output files to EMBL format
454 if (@seqfiles)
455 {
456 print "Converting output to EMBL format...\n";
457 if (system("$installdir/bin/$converter @seqfiles") == 0)
458 {
459 print "File converter finished.\n";
460 }
461 else
462 {
463 warn "Could not run converter.";
464 }
465 }
466 else
467 {
468 print "No output - skipping EMBL conversion...\n";
469 }
470
471 #################################################
472 # print out a handy summary of what was created #
473 #################################################
474 my $printout = 0;
475 foreach my $infile (@seqfiles)
476 {
477 if (defined @{$artline{$infile}})
478 {
479 $printout = 1;
480 last;
481 }
482 }
483
484 # cleanup last file
485 if ($common->{msatfinder} == 1)
486 {
487 unlink("msatfinder.rc");
488 }
489
490 # cleanup all extra seq files
491 foreach (@seqfiles) { unlink $_; }
492
493
494 # Finished
495 print "\nYAMAP finished!\n";
496
497 ################
498 # monitor jobs #
499 ################
500 sub monitor
501 {
502 return if ($exit == 1);
503 # the purpose of this subroutine is to flip out
504 # and see if the user has any jobs queued, and
505 # to do this every 30 seconds. Return if no
506 # jobs left in the queue.
507 while ()
508 {
509 my $line = `condor_q -submitter $user -format "%s" Owner`;
510 return if ($line eq "" or $line =~ /Error/);
511 print "Waiting...\n";
512 sleep 30;
513 }
514 }
515
516 #############################
517 # write condor submit files #
518 #############################
519 sub writecondor
520 {
521 my $exec = shift;
522 my $dir = shift;
523 my $args = shift;
524 my $text = <<EOF;;
525 Executable = $exec
526 Universe = vanilla
527 Initialdir = $dir
528 Transfer_executable = True
529 when_to_transfer_output = ON_EXIT_OR_EVICT
530 Log = condor.log
531 error = condor.err
532 Requirements = (Machine == "darwin.nerc-oxford.ac.uk") || (Machine == "ivgws04.nerc-oxford.ac.uk") || (Machine == "ivgws05.nerc-oxford.ac.uk") || (Machine == "ivgws06.nerc-oxford.ac.uk") || (Machine == "ivgws07.nerc-oxford.ac.uk") || (Machine == "ivgws08.nerc-oxford.ac.uk") || (Machine == "ivgws09.nerc-oxford.ac.uk") || (Machine == "ivgws10.nerc-oxford.ac.uk") || (Machine == "ivgws12.nerc-oxford.ac.uk") || (Machine == "ivgws13.nerc-oxford.ac.uk") || (Machine == "ivgws15.nerc-oxford.ac.uk") || (Machine == "ivgws16.nerc-oxford.ac.uk") || (Machine == "ivgws17.nerc-oxford.ac.uk") || (Machine == "ivgws18.nerc-oxford.ac.uk") || (Machine == "ivgws19.nerc-oxford.ac.uk") || (Machine == "ivgws21.nerc-oxford.ac.uk") || (Machine == "ivgws22.nerc-oxford.ac.uk") || (Machine == "ivgws24.nerc-oxford.ac.uk")
533 Arguments = $args
534 queue
535 EOF
536 return $text;
537 }
538
539 #######################################
540 # subroutines for each of the methods #
541 #######################################
542 # GLIMMER
543 # dies due to running out of memory
544 sub run_glimmer_condor
545 {
546 # glimmer is slightly different - there are more steps and more
547 # executables. This is adapated from Chimdi's glimmer_auto script
548 # get the config details
549 my $incoming = shift;
550 my @parts = split(/\|/, $incoming);
551 my $infile = $parts[0];
552 my $exec = $parts[1];
553 my $out = $parts[2];
554 my $glimmer = $execpaths{$exec};
555 my $parse_glimmer = $parsers{$exec};
556 my $gconf = $config->param(-block=>'GLIMMER');
557 my $args_longorfs = $gconf->{arguments_longorfs};
558 my $args_glimmer2 = $gconf->{arguments_glimmer2};
559 my $longorfs = $execpaths{longorfs};
560 my $extract = $execpaths{extract};
561 my $build_icm = $execpaths{build_icm};
562 my $rbs = $execpaths{rbs};
563 my $parse_rbs = $parse->{rbs};
564 my $stdoutfile = "$out/$infile.glimmer.stdout";
565 my $continue = 1;
566
567 #######################################
568 # run glimmer components and then rbs #
569 #######################################
570 print "Submitting glimmer job to condor pool...\n";
571 # need job files for (long-orfs), processing, (extract, build-icm, glimmer2, rbsfinder). ()=one dag.
572 # run long-orfs
573 print "Submitting long-orfs to condor...\n";
574 my $bits = "/home/milo/bin/glimmer_bits.pl";
575 my $clean_longorfs = "/home/milo/bin/clean_longorfs.pl";
576
577 # long orfs
578 open (OUT, ">$out/job_long_orfs.condor") or warn "Can't write long-orfs submit file: $!";
579 print OUT &writecondor($bits,$out,"longorfs $longorfs $infile $out $infile.glimmer.orfs $stdoutfile $args_longorfs");
580 close OUT;
581
582 # clean_longorfs
583 open (OUT, ">$out/job_clean_longorfs.condor") or warn "Can't write clean_longorfs submit file: $!";
584 print OUT &writecondor($clean_longorfs,$out,"$out $infile");
585 close OUT;
586
587 # extract
588 open (OUT, ">$out/job_extract.condor") or warn "Can't write extract submit file: $!";
589 print OUT &writecondor($bits,$out,"extract $extract $infile.glimmer.orfs $out $infile.glimmer.seq $stdoutfile");
590 close OUT;
591
592 # build-icm
593 open (OUT, ">$out/job_build-icm.condor") or warn "Can't write build-icm submit file: $!";
594 print OUT &writecondor($bits,$out,"icm $build_icm $infile.glimmer.seq $out $infile.glimmer.icm $stdoutfile");
595 close OUT;
596
597 # glimmer2
598 open (OUT, ">$out/job_glimmer2.condor") or warn "Can't write glimmer2 submit file: $!";
599 print OUT &writecondor($bits,$out,"glimmer $glimmer $infile.glimmer.icm $out $infile.glimmer.output $stdoutfile $args_glimmer2");
600 close OUT;
601
602 # rbsfinder
603 open (OUT, ">$out/job_rbsfinder.condor") or warn "Can't write rbsfinder submit file: $!";
604 print OUT &writecondor($bits,$out,"rbs $rbs $infile.glimmer.output $out $infile.rbs.output $stdoutfile");
605 close OUT;
606
607 open (OUT, ">$out/glimmer_submit.dag") or warn "Can't write glimmer DAG file: $!";
608 print OUT <<EOF;
609 # Filename: glimmer_submit.dag
610
611 Job LO $out/job_long_orfs.condor
612 Job CL $out/job_clean_longorfs.condor
613 Job EXT $out/job_extract.condor
614 Job ICM $out/job_build-icm.condor
615 Job GLM $out/job_glimmer2.condor
616 Job RBS $out/job_rbsfinder.condor
617
618 Script POST GLM $parse_glimmer
619 Script POST RBS $parse_rbs
620
621 PARENT LO CHILD CL
622 PARENT CL CHILD EXT
623 PARENT EXT CHILD ICM
624 PARENT ICM CHILD GLM
625 PARENT GLM CHILD RBS
626 EOF
627 close OUT;
628
629 # submit to condor
630 system("cd $out; $condor_submit_dag glimmer_submit.dag") == 0 or warn "Can't submit condor job: $!";
631 }
632 # EINVERTED
633 # fully functional
634 sub run_einverted_condor
635 {
636 # get the config details
637 my $incoming = shift;
638 my @parts = split(/\|/, $incoming);
639 my $infile = $parts[0];
640 my $exec = $parts[1];
641 my $out = $parts[2];
642 my $einverted = $execpaths{$exec};
643 my $parse_einverted = $parsers{$exec};
644 my $econf = $config->param(-block=>'EINVERTED');
645 my $gap = $econf->{gap};
646 my $threshold = $econf->{threshold};
647 my $match = $econf->{match};
648 my $mismatch =$econf->{mismatch};
649 my $maxrepeat = $econf->{maxrepeat};
650
651 # write a submit file for einverted
652 print "Submitting einverted job on $infile to condor...\n";
653 open (OUT, ">$out/job_einverted.condor") or die "Can't write einverted submit file: $!";
654 print OUT &writecondor($einverted,$out,"-gap $gap -threshold $threshold -match $match -mismatch $mismatch -maxrepeat $maxrepeat -auto -outfile $out/$infile.einverted.out -sequence $out/$infile");
655 close OUT;
656 # write a submit file for einverted parse artemis
657 open (OUT, ">$out/job_einverted_parse_artemis.condor") or die "Can't write einverted_parse_artemis submit file: $!";
658 print OUT &writecondor($parse_einverted,$out,"$out/$infile.einverted.out $out/$infile.einverted.tab");
659 close OUT;
660
661 # write a dag to make sure that the processing script is run afterwards
662 open (OUT, ">$out/einverted_submit.dag") or die "Can't write einverted DAG file: $!";
663 print OUT <<EOF;
664 # Filename: einverted_submit.dag
665
666 Job EI $out/job_einverted.condor
667 Job PA $out/job_einverted_parse_artemis.condor
668
669 PARENT EI CHILD PA
670
671 EOF
672 close OUT;
673
674 unless (system("cd $out; $condor_submit_dag einverted_submit.dag") == 0 )
675 {
676 warn "Can't submit condor job: $!";
677 $exit = 1;
678 }
679 }
680 # PALINDROME
681 # dies for no apparent reason
682 sub run_palindrome_condor
683 {
684 # get the config details
685 my $incoming = shift;
686 my @parts = split(/\|/, $incoming);
687 my $infile = $parts[0];
688 my $exec = $parts[1];
689 my $out = $parts[2];
690 my $palindrome = $execpaths{$exec};
691 my $parse_palindrome = $parsers{$exec};
692 my $pconf = $config->param(-block=>'PALINDROME');
693 my $minpallen = $pconf->{minpallen};
694 my $maxpallen = $pconf->{maxpallen};
695 my $gaplimit = $pconf->{gaplimit};
696 my $nummismatches = $pconf->{nummismatches};
697 my $overlap = $pconf->{overlap};
698 my $outfile = "$out/$infile.palindrome.out";
699 my $tabfile = "$out/$infile.palindrome.tab";
700 my $stdoutfile = "$out/$infile.palindrome.stdout";
701
702 # write palindrome submit
703 print "Submitting palindrome job on $infile to condor...\n";
704 open (OUT, ">$out/job_palindrome.condor") or die "Can't write palindrome submit file: $!";
705 print OUT &writecondor($palindrome,$out,"-stdout -minpallen $minpallen -maxpallen $maxpallen -gaplimit $gaplimit -nummismatches $nummismatches -overlap $overlap -outfile $outfile $infile");
706 close OUT;
707
708 # write parse palindrome submit
709 open (OUT, ">$out/job_parse_palindrome.condor") or die "Can't write parse_palindrome submit file: $!";
710 print OUT &writecondor($parse_palindrome,$out,"$outfile $tabfile");
711 close OUT;
712
713 # write dag
714 open (OUT, ">$out/palindrome_submit.dag") or die "Can't write palindrome DAG file: $!";
715 print OUT <<EOF;
716 # Filename: palindrome_submit.dag
717
718 Job PA $out/job_palindrome.condor
719 Job PP $out/job_parse_palindrome.condor
720
721 PARENT PA CHILD PP
722
723 EOF
724 close OUT;
725
726 # submit dag
727 unless (system("cd $out; $condor_submit_dag palindrome_submit.dag") == 0 )
728 {
729 warn "Can't submit condor job: $!";
730 $exit = 1;
731 }
732 }
733 # MSATFINDER
734 # no output files produced
735 sub run_msatfinder_condor
736 {
737 my $incoming = shift;
738 my @parts = split(/\|/, $incoming);
739 my $infile = $parts[0];
740 my $exec = $parts[1];
741 my $out = $parts[2];
742
743 # write out msatfinder config file
744 my $msatfinder = $execpaths{$exec};
745 my $mconf = $config->param(-block=>'MSATFINDER');
746 my $flank_size = $mconf->{flank_size};
747 my $mrange = $mconf->{mrange};
748 my $engine = $mconf->{engine};
749 my $interrupts = $mconf->{interrupts};
750 my $interrupts = $mconf->{interrupts};
751 my $parse_msatfinder = $parsers{$exec};
752
753 # write configuration file
754 open (CONF, ">$out/msatfinder.rc") or warn "Can't open $out/msatfinder.rc: $!";
755 print CONF <<EOF;
756 [COMMON]
757 debug=0
758 flank_size=$flank_size
759 mine_dir="MINE/"
760 repeat_dir="Repeats/"
761 tab_dir="Msat_tabs/"
762 bigtab_dir="Flank_tabs/"
763 fasta_dir="Fasta/"
764 prime_dir="Primers/"
765 align_dir="Aligner/"
766 anno_dir="Annotations/"
767 count_dir="Counts/"
768 [DEPENDENCIES]
769 run_eprimer=0
770 eprimer_args="-primer"
771 eprimer="/usr/bin/eprimer3"
772 primer3core="/usr/bin/primer3_core"
773 [FINDER]
774 override=0
775 motif_threshold="$mrange"
776 artemis=1
777 mine=0
778 fastafile=0
779 sumswitch=0
780 screendump=0
781 EOF
782 close CONF;
783
784 # write msatfinder submit
785 print "Submitting msatfinder job on $infile to condor...\n";
786 open (OUT, ">$out/job_msatfinder.condor") or die "Can't write msatfinder submit file: $!";
787 print OUT &writecondor($msatfinder,$out,"-e $engine $infile");
788 close OUT;
789
790 # write msatfinder cleanup submit
791 open (OUT, ">$out/job_msatfinder_cleanup.condor") or die "Can't write msatfinder_cleanup submit file: $!";
792 print OUT &writecondor($parse_msatfinder,$out,"$infile $out");
793 close OUT;
794
795 # write a dag
796 open (OUT, ">$out/msatfinder_submit.dag") or die "Can't write msatfinder DAG file: $!";
797 print OUT <<EOF;
798 # Filename: einverted_submit.dag
799
800 Job MS $out/job_msatfinder.condor
801 Job CL $out/job_msatfinder_cleanup.condor
802
803 PARENT MS CHILD CL
804
805 EOF
806 close OUT;
807
808
809 # submit a dag
810 system("cd $out; $condor_submit_dag msatfinder_submit.dag") == 0 or warn "Can't submit condor job: $!";
811 }
812
813
814
815 __END__
816 # unconfigured
817 # BIGBLAST
818 sub run_bigblast
819 {
820 # get the config details
821 my $incoming = shift;
822 my @parts = split(/\|/, $incoming);
823 my $infile = $parts[0];
824 my $exec = $parts[1];
825 my $out = $parts[2];
826 my $conf = uc $exec;
827 my $bigblast = $execpaths{bigblast};
828 my $bconf = $config->param(-block=>"$conf");
829 my $program = $bconf->{program};
830 my $jobs = $bconf->{jobs};
831 my $other_opts = $bconf->{other_opts};
832
833 # set the correct database
834 my $database;
835 my $base = &basename($infile);
836 my $dir = &dirname($infile);
837 if ($exec eq "dbblast")
838 {
839 $database = $bconf->{database};
840 }
841 elsif ($exec eq "selfblast")
842 {
843 $database = "../self_blast";
844 }
845 else
846 {
847 warn "ERROR: Something odd has happened with the big-blast routine.\n";
848 }
849
850 # only blast once if using "bl2seq"
851 if ($len == 2 and $exec eq "selfblast")
852 {
853 return if (&basename($infile) eq $infiles[1]);
854 }
855
856 # exit if only one seq
857 if ($len == 1 and $exec eq "selfblast")
858 {
859 return;
860 }
861
862 # get the correct program
863 my %progs = ("blastn" => "-2",
864 "blastx" => "-x2",
865 "tblastx" => "-tx2");
866
867 # run blast
868 print "Running $exec on $infile...\n";
869 if ($dir eq ".") { $dir = $pwd; }
870 if (system("cd $dir/$out; $bigblast -j $jobs $progs{$program} $database $infile $other_opts > $infile.$exec.stdout 2>&1") == 0)
871 {
872 print "Blast complete!\n";
873 my @files = glob "$out/big_blast*";
874 foreach my $file (@files)
875 {
876 if ($file =~ /\.\d{8}/)
877 {
878 unlink("$file");
879 }
880 else
881 {
882 my $newname = &basename($infile) . ".$exec." . [split(/\./, $file)]->[-1];
883 &move("$file", "$out/$newname") or warn "Can't move $file to $newname: $!";
884 push (@outfiles, $newname);
885 }
886 }
887 }
888 else
889 {
890 warn "Could not run $bigblast on $infile: $!\n";
891 }
892 return \@outfiles;
893 }
894 # ETANDEM
895 sub run_etandem
896 {
897 # get the config details
898 my $incoming = shift;
899 my @parts = split(/\|/, $incoming);
900 my $infile = $parts[0];
901 my $exec = $parts[1];
902 my $out = $parts[2];
903 my $etandem = $execpaths{$exec};
904 my $parse_etandem = $parsers{$exec};
905 my @outfiles;
906 my $equicktandem = $etandem;
907 $equicktandem =~ s/etandem/equicktandem/;
908 my $name = [split(/\./,$infile)]->[0];
909 my $qtanfile = "$out/$infile.qtan.out";
910 my $tanfile = "$out/$infile.tan.out";
911
912 # equicktandem configs
913 my $eqconf = $config->param(-block=>'EQUICKTANDEM');
914 my $maxrepeat = $eqconf->{maxrepeat};
915 my $threshold = $eqconf->{threshold};
916
917 # etandem configs
918 my $econf = $config->param(-block=>'ETANDEM');
919 my $mismatch = $econf->{mismatch};
920 my $uniform = $econf->{uniform};
921
922 # run equicktandem, parse to get min. and max sizes and
923 # feed these into etandem
924 my ($max, $min, @sizes);
925 print "Running equicktandem...\n";
926 my $runquick = 0;
927 if (system("$equicktandem -threshold $threshold -maxrepeat $maxrepeat -outfile $qtanfile $infile > $out/$infile.equicktandem.stdout 2>&1") == 0)
928 {
929 # get min and max repeat sizes by parsing qtan
930 open (IN, "<$qtanfile") or warn "Can't open $qtanfile: $!";
931 my @lines = <IN>;
932 close IN;
933 foreach my $line (@lines)
934 {
935 next if ($line =~ /^#/ or $line =~ /Start/ or $line =~ /^\s+$/);
936 push (@sizes, [split(/\s+/,$line)]->[3]);
937 }
938 my @minmax = sort { $a <=> $b } @sizes;
939
940 # don't run if no repeats found
941 if (@minmax)
942 {
943 $max = $minmax[-1];
944 $min = $minmax[0];
945 $runquick = 1;
946 }
947 else
948 {
949 print "No repeats found; etandem will not be run.\n";
950 return;
951 }
952 }
953 else
954 {
955 warn "Could not run equicktandem - etandem will not be run.\n";
956 return;
957 }
958 unlink($qtanfile);
959
960 # run etandem on original file
961 if ($runquick == 1)
962 {
963 if (system("$etandem -minrepeat $min -maxrepeat $max -outfile $tanfile $infile > $out/$infile.etandem.stdout 2>&1") == 0)
964 {
965 #parse etandem
966 system("$parse_etandem $tanfile $out/$infile.etandem.tab");
967 push(@outfiles, "$infile.etandem.tab");
968 print "Etandem complete!\n";
969 }
970 else
971 {
972 warn "Could not run etandem!\n";
973 }
974 }
975 return \@outfiles;
976 }
977
978
979 # TRNASCAN
980 sub run_trnascan
981 {
982 # get the config details
983 my $incoming = shift;
984 my @parts = split(/\|/, $incoming);
985 my $infile = $parts[0];
986 my $exec = $parts[1];
987 my $out = $parts[2];
988 my $trnascan = $execpaths{$exec};
989 my $parse_trna = $parsers{$exec};
990 my $tconf = $config->param(-block=>'TRNASCAN');
991 my $searchmode = $tconf->{searchmode};
992 my $covariance = $tconf->{covariance};
993 my $showall = $tconf->{showall};
994 my $other_opts = $tconf->{options};
995
996 # outfiles
997 my $outfile = "$out/$infile.trnascan.out";
998 my $stdoutfile = "$out/$infile.trnascan.stdout";
999 my $tabfile = "$out/$infile.trnascan.tab";
1000 my @outfiles;
1001
1002 # convert config file details into command line
1003 my %taxon = ("Bacterial" => "-B",
1004 "Archeal" => "-A",
1005 "Organellar" => "-O",
1006 "Eukaryotic" => "-E",
1007 "General" => "-G");
1008 my %covar = ("Y" => "-C", "N" => undef);
1009 my %show = ("Y" => "-H", "N" => undef);
1010
1011 # command line
1012 my @options;
1013
1014 # load command line
1015 push (@options, $taxon{$searchmode}) if (defined $taxon{$searchmode});
1016 push (@options, $covar{$covariance}) if (defined $covar{$covariance});
1017 push (@options, $show{$showall}) if (defined $show{$showall});
1018 push (@options, $other_opts) if (defined $other_opts);
1019
1020 # run the code
1021 my $continue = 1;
1022 print "Running tRNAscan-SE with options \"@options\"...\n";
1023 unless (system("$trnascan @options $infile > $outfile 2> $stdoutfile") == 0)
1024 {
1025 warn "Could not run tRNAscan-SE: $!";
1026 $continue = 0;
1027 }
1028 return unless ($continue == 1);
1029 unless (-s $outfile)
1030 {
1031 print "No tRNAs found!\n";
1032 $continue = 0;
1033 }
1034 return unless ($continue == 1);
1035 if (system("$parse_trna $outfile $tabfile 2>> $stdoutfile") == 0)
1036 {
1037 print "tRNAscan complete!\n";
1038 push (@outfiles,$tabfile);
1039 }
1040 else
1041 {
1042 warn "Could not parse tRNAscan-SE output: $!";
1043 }
1044 unlink("$outfile");
1045 return \@outfiles;
1046 }
1047 # SELFBLAST
1048 sub run_selfblast
1049 {
1050 my $incoming = shift;
1051 return &run_bigblast($incoming);
1052 }
1053 # DBBLAST
1054 sub run_dbblast
1055 {
1056 my $incoming = shift;
1057 return &run_bigblast($incoming);
1058 }
1059 # PFAM
1060 sub run_pfam
1061 {
1062 unless ($common->{glimmer} == 1)
1063 {
1064 print "Pfam cannot be run without glimmer's output. Skipping...\n";
1065 return;
1066 }
1067 my $incoming = shift;
1068 my @parts = split(/\|/, $incoming);
1069 my $infile = $parts[0];
1070 my $exec = $parts[1];
1071 my $out = $parts[2];
1072
1073 my $conf = uc $exec;
1074 my $pconf = $config->param(-block=>"$conf");
1075 my $db = $pconf->{database};
1076 my $fast = $pconf->{fast};
1077 my $overlap = $pconf->{overlap};
1078 my $other_opts = $pconf->{other_opts};
1079
1080 my $pfam = $execpaths{$exec};
1081 my $parse_pfam = $parsers{$exec};
1082 my $orffile = "$infile.glimmer.orfs";
1083 my @outfiles;
1084
1085 # exit if the orffile is not there
1086 unless (-s "$out/$orffile")
1087 {
1088 print "No orfs found - cannot run Pfam. Skipping...\n";
1089 return;
1090 }
1091
1092 # the "overlap" and "fast" options are
1093 # on or off, and should be set by buttons
1094 # on the GUI. I'll leave the rest for
1095 # "other_opts"
1096 my $fastyn = "";
1097 if ($fast == 1)
1098 {
1099 $fastyn = "--fast";
1100 }
1101 my $overlapyn = "";
1102 if ($overlap == 1)
1103 {
1104 $overlapyn = "--overlap";
1105 }
1106
1107 # if the database does not exist, then there's no point carrying on
1108 my @pfams = glob "$db/Pfam-*";
1109 unless (@pfams)
1110 {
1111 print "No Pfam database found, skipping scan of $infile...\n";
1112 return;
1113 }
1114
1115 # get orfs
1116 open (ORFS, "<$out/$orffile") or warn "Can't read orfs from $orffile: $!";
1117 my @olines = <ORFS>;
1118 close ORFS;
1119
1120 # put each orf into an array
1121 my (%orfs,@transfiles);
1122 foreach my $line (@olines)
1123 {
1124 my ($name,$start,$stop) = split(/\s+/, $line);
1125 $orfs{$start} = $stop;
1126 }
1127
1128 # open seq
1129 my $seqobj = Bio::SeqIO->new(-file => "$infile", -format=>"FASTA");
1130 while (my $seq = $seqobj->next_seq())
1131 {
1132 foreach my $start (keys %orfs)
1133 {
1134 # generate an AA translation for each orf
1135 my $stop = $orfs{$start};
1136 my ($transfile,$outfile,$tabfile,$stdoutfile,$subseq,$protseq);
1137 if ($start < $stop)
1138 {
1139 $transfile = "$infile.$start-$stop.trans";
1140 $subseq = $seq->trunc($start,$stop);
1141 $protseq = $subseq->translate();
1142 }
1143 elsif ($start > $stop)
1144 {
1145 $transfile = "$infile.$start.trans";
1146 $subseq = $seq->trunc($stop,$start);
1147 $protseq = $subseq->translate();
1148 }
1149 else
1150 {
1151 print "Something has gone horribly wrong.\n";
1152 next;
1153 }
1154
1155 # print it out to the translation
1156 open (OUT, ">$out/$transfile") or exit "Can't write AA translation: $!";
1157 print OUT "> $transfile\n";
1158 print OUT $protseq->seq();
1159 push (@transfiles, $transfile);
1160 }
1161 }
1162 close OUT;
1163
1164 # run pfam on the local database, using the translated file
1165 foreach my $tf (@transfiles)
1166 {
1167 my $outfile = $tf;
1168 my $tabfile = $tf;
1169 my $stdoutfile = $tf;
1170 $outfile =~ s/trans/pfam/;
1171 $tabfile =~ s/trans/pfam.tab/;
1172 $stdoutfile =~ s/trans/pfam.stdout/;
1173 print "Running Pfam scan on $tf...\n";
1174
1175 if (system("cd $out; $pfam -d $db -o $outfile $fastyn $overlapyn $other_opts $tf > $stdoutfile 2>&1") == 0)
1176 {
1177 # it worked
1178 print "Pfam scan complete.\n";
1179 # empty output file if no hits found
1180 if (-z "$out/$outfile")
1181 {
1182 print "No Pfam hits found.\n";
1183 return;
1184 }
1185 }
1186 else
1187 {
1188 print "Pfam scan failed! $!";
1189 return;
1190 }
1191
1192 # parse the pfam output
1193 eval {
1194 if (system("cd $out; $parse_pfam $outfile $tabfile") == 0)
1195 {
1196 print "Parsing Pfam output...\n";
1197 push(@outfiles, $tabfile);
1198 }
1199 else
1200 {
1201 print "Could not parse Pfam! $!";
1202 return;
1203 } };
1204
1205 # finish
1206 unlink "$out/$tf" if (-e "$out/$tf");
1207 unlink "$out/$outfile" if (-e "$out/$outfile");
1208 return \@outfiles;
1209 }
1210 }