ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gicl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 14170 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use Cwd 'getcwd';
5 use FindBin;
6 use lib $FindBin::Bin;
7 use Mailer;
8
9 my $usage = qq/ Usage:
10 gicl <fasta_db> [-d <refDb>] [-g condor|sge|smp] [-c {<num_CPUs>]
11 [-m <user>] [-l <min_overlap>] [-v <max_overhang>] [-R <refprefix>]
12 [-p <pid>] [-n slicesize] [-M] [-L <local_wrk_dir>] [-S] [-K] [-X]
13 [-I] [-C] [-W <pairwise_script.psx>] [-D]
14 Options:
15 -c : use the specified number of CPUs on local machine
16 (or on a grid if -g option was given); default 1
17 Clustering phase options:
18 -d do not perform all-vs-all search, but search <fasta_db> against
19 <refDb> instead; unless -R options was given, exit after the
20 pairwise hits are generated and sorted
21 -n number of sequences in a clustering search slice (default 1000)
22 -p minimum percent identity for overlaps <PID> (default 94)
23 -l miminum overlap length (default 40)
24 -v maximum length of unmatched overhangs (default 30)
25 -R instead of full assembly, perform "containment clustering"
26 with nrcl followed by assembly-on-Reference using mblaor;
27 <refprefix> is the "full length" sequence name prefix
28 as used in the input files;
29 -L special grid worker mode - assumed local copies exist for
30 the given files in the provided <local_wrk_dir>, which will be
31 used to prefix <fasta_db> and <refDb> unless they are given
32 with absolute paths
33 -I do not rebuild database indices
34 -K skip the pairwise searches, directly run the assembly program
35 on previously obtained overlaps
36 -M ignore lower-case masking in <fasta_db> sequences
37 -S merge the pairwise search results without sorting them (enforces -X
38 so no assembly will be performed)
39 -W use custom script <pairwise_script.psx> for the distributed
40 pairwise searches instead of the default: gicl_mgbl.psx
41 -X do not perform assembly, only run the pairwise searches
42 and sort\/merge the results
43 -Z only run the distributed pairwise searches and exit
44 (without even merging the resulting pairwise overlaps)
45 /;
46
47 my @start_params=@ARGV;
48
49 my $usemail=0; #set to 1 after initialization
50 my $no_error=1;
51 my $exit_msg='';
52 my $cur_step;
53 die $usage."\n" if (@ARGV<1 || $ARGV[0] =~ /^\-/);
54
55 my $wrkdir=$ENV{'PWD'};
56
57 &addPath($FindBin::Bin);
58 &addPath($FindBin::Bin.'/bin') if -d $FindBin::Bin.'/bin';
59
60 my $dbfile=shift(@ARGV);
61
62 umask 0002;
63
64 getopts('DJMISKZXSW:R:L:r:d:c:m:n:p:l:v:g:') || die "$usage\nError at getopts!\n";
65 my $debug=$Getopt::Std::opt_D;
66 my $localdirbase=$Getopt::Std::opt_L;
67 $localdirbase=~s/\/+$//;
68 my $gridx=lc($Getopt::Std::opt_g); #use gridx instead; must be 'smp', 'sge' or 'condor'
69 my $dbfname=&getFName($dbfile);
70 my $dbfullpath=&getFullPath($dbfile);
71 die "Error: cannot find input file '$dbfile'\n" unless -s $dbfile;
72 #unless ($localdirbase) {
73 # die "Error: cannot find input file '$dbfile'\n" unless -s $dbfullpath;
74 # }
75 my $asm_paramfile=$Getopt::Std::opt_P;
76 if ($asm_paramfile) {
77 $asm_paramfile=&getFullPath($asm_paramfile);
78 die "Error: cannot find quality file '$asm_paramfile'\n" unless -s $asm_paramfile;
79 }
80 my $cpus = $Getopt::Std::opt_c || 1;
81 my $psxcmd = $gridx ? "gridx -g $gridx -p $cpus": "psx -p $cpus";
82 $psxcmd.=" -L $localdirbase" if $localdirbase && $gridx;
83 my $mailuser = $Getopt::Std::opt_m;
84 my $nosort = $Getopt::Std::opt_S ? '-M' : '';
85 my $maxovh=$Getopt::Std::opt_v || 30;
86 #$maxovh+=10 if $clustprog eq 'lclust';
87 my $minovl=$Getopt::Std::opt_l || 40;
88 my $pid=$Getopt::Std::opt_p || 94;
89 $pid=94 if $pid<20;
90 $minovl=40 if $minovl==0;
91 my $no_zmerge=$Getopt::Std::opt_Z;
92 my $onlysearch=$Getopt::Std::opt_X;
93 my $startdate=getDate();
94 my $nomasking=$Getopt::Std::opt_M ? 'M':'';
95 my $useDb=$Getopt::Std::opt_d;
96 my $useDbname;
97 if ($useDb) {
98 $useDbname=&getFName($useDb);
99 $useDb=&getFullPath($useDb);
100 unless ($localdirbase) {
101 die "Error: cannot locate the specified db: $useDb" unless -s $useDb;
102 }
103 }
104
105 my $psx_clust=$Getopt::Std::opt_W || 'gicl_mgbl.psx';
106 my $refprefix = $Getopt::Std::opt_R || $Getopt::Std::opt_r;
107 die "$psx_clust not available in PATH!\n"
108 unless ($psx_clust=&checkCmd($psx_clust));
109
110
111 my $r;
112 #=- logging system initialization:
113 my $log_file="gicl_$dbfname.log";
114 my $err_log="err_gicl_$dbfname.log";
115 unlink($log_file, $err_log);
116 open(OLDSTDERR, ">&STDERR");
117
118 open(STDOUT, ">$log_file") || &MErrExit("Failed to redirect STDOUT to $log_file");
119 open(STDERR, ">$err_log") || &MErrExit("Failed to redirect STDERR to $err_log");
120 &set_step('Initialization');
121
122 &flog("$FindBin::Script running options: \n".$FindBin::Script.' '.join(" ", @start_params));
123 &flog(" Standard log file: $log_file");
124 &flog(" Error log file: $err_log");
125 &flog(" Using $cpus CPUs for clustering and assembly");
126 #&flog(" Path is : $ENV{'PATH'} ");
127 $no_error=0;
128 $usemail=1 && $mailuser;
129
130 my $clusterfile=$dbfname.($useDb ? '_'.$useDbname :'_cl').'.ace';
131
132 my ($cmd, $psxparam);
133 $Getopt::Std::opt_I=1 if $Getopt::Std::opt_K || $localdirbase;
134
135
136 my $dbfileidx=$dbfullpath.'.cidx';
137
138 #
139 # rebuild indices (unless specifically told not to do so)
140 unless ($Getopt::Std::opt_I) {
141 my $toindex = $useDb ? $useDb : $dbfile ;
142 &flog("-= Rebuilding $toindex indices =-");
143 $cmd="formatdb -p F -o F -i $toindex";
144 # just in case, rebuild the cdbfasta index
145 system($cmd) &&
146 &MErrExit("Error at '$cmd'");
147 # -- also make sure the cdb index for the main file is created
148 if ($useDb) {
149 system("cdbfasta $useDb") &&
150 &MErrExit("Error at cdbfasta $useDb");
151 }
152 system("cdbfasta $dbfile") &&
153 &MErrExit("Error at cdbfasta $dbfile");
154 }
155
156 #========= compress & merge sort the cluster results
157 my $hitsort = $dbfname.'_'.($useDb ? $useDbname.'_tabhits' : 'cl_tabhits');
158 my $rbasename = $dbfname.'_'.($useDb ? $useDbname : 'self');
159 my $acefile = $rbasename;
160
161 if ($Getopt::Std::opt_J) {
162 goto ZMERGE;
163 }
164
165 PAIRWISE:
166 #=- start clustering the $blastDB file
167 &set_step('pairwise');
168 unlink($clusterfile);
169 if ($Getopt::Std::opt_K) {
170 &flog("-- Skipping pairwise searches.\n");
171 goto ASSEMBLE;
172 }
173 my $numseqs = $Getopt::Std::opt_n || 1000;
174 system('/bin/rm -rf cluster_[1-9]*') unless $Getopt::Std::opt_J;
175 # ---psx user parameter format: <db>:<minpid>:<maxovh>:<minovl>:<flags>
176 # where <flags> are one or more letters of:
177 # D=no self-clustering, M = no masking, G = gap info
178 my $dbflag=$useDb?'D':'';
179 my $paramdb=$useDb? $useDb : $dbfullpath;
180 my $gapinfo='G'; #always save gap info
181 $psxparam=join(':', ($paramdb,$pid,$maxovh,$minovl,$nomasking.$gapinfo.$dbflag));
182 $cmd=$psxcmd." -n $numseqs -i $dbfile -d cluster -C '$psxparam' -c '$psx_clust'";
183 $cmd.=' -D' if $debug && $gridx;
184 &flog(" Launching distributed clustering: \n $cmd");
185 system($cmd)
186 && &MErrExit("Error at '$cmd'\n");
187 &end_step();
188
189 if ($no_zmerge) {
190 &flog("Exit requested after pairwise searches.");
191 # &flog($exit_msg);
192 # generate the clustering singleton list here?
193 goto THEEND;
194 }
195
196 ZMERGE:
197 &set_step('mgmerge');
198 my @dirs = ((<cluster_?>),(<cluster_??>));
199 unlink('masked.lst');
200
201 foreach my $dir (@dirs) {
202 next unless -d $dir;
203 system("cat $dir/masked.lst >> masked.lst");
204 zmergeDirHits($dir);
205 }
206 $cmd="mgmerge -b $nosort -o $hitsort -s 1600 zdir_*.bz2";
207 system($cmd)
208 && &MErrExit("Error at final sort:\n $cmd");
209 system('/bin/rm -rf zdir_*.bz2') unless $debug;
210 goto THEEND if $nosort;
211 if ($onlysearch || ($useDb && !$refprefix)) {
212 &flog("Exit requested after pairwise searches and sorting") if $onlysearch;
213 # &flog($exit_msg);
214 # generate the clustering singleton list here?
215 goto THEEND;
216 }
217
218 ASSEMBLE:
219 &set_step('assemble');
220 if ($refprefix) {
221 $acefile.='.aor.ace';
222 my $lytfile= $rbasename.'.nrcl.lyt';
223 $cmd="bzip2 -cd ${hitsort}_*.bz2 | nrcl OVHANG=$maxovh -o $rbasename.nrcls -y $lytfile";
224 $refprefix=~s/\|$//;
225 $cmd.=" -p '$refprefix'";
226 system($cmd)
227 && &MErrExit("Error at command: $cmd");
228 #now assemble using aor
229 $cmd = "mblaor $lytfile -d $dbfileidx -c $maxovh -p '$refprefix' -o $acefile";
230 system($cmd)
231 && &MErrExit("Error at assembly command: $cmd");
232 }
233 else { # straight-forward, all vs all assembly
234 $acefile.='.ace';
235 $cmd="bzip2 -cd ${hitsort}_*.bz2 | mblasm -d $dbfileidx -c $maxovh -o $acefile";
236 &flog("Running assembly command: $cmd\n");
237 system($cmd)
238 && &MErrExit("Error at assembly command: $cmd");
239 }
240 &end_step();
241
242 SINGLETSONLY:
243 &set_step('singletons');
244
245 unless ($Getopt::Std::opt_a) {
246 #build the singleton list if the full straight clustering/assembly pipeline was run
247 $cmd='grep "^AF " '.$acefile.'| cut -d " " -f2 | sort -u > seqnames_in_asms';
248 system($cmd) && &MErrExit("Error running:\n$cmd\n");
249 $cmd="cdbyank -l $dbfileidx | sort > seqnames_all";
250 system($cmd) && &MErrExit("Error running:\n$cmd\n");
251 my $sglist = $rbasename.'.singletons';
252 $cmd="comm -23 seqnames_all seqnames_in_asms > $sglist";
253 system($cmd) && &MErrExit("Error running:\n$cmd");
254 unlink('seqnames_all', 'seqnames_in_asms');
255 print STDERR "Singletons are listed in file: $sglist\n";
256 }
257
258 &end_step();
259
260 THEEND:
261 $no_error=1;
262 &flog("*** gicl [$rbasename] finished ***");
263
264
265 #=====================================================================
266 #========================= SUBROUTINES ==========================
267 #=====================================================================
268
269 END { #to be executed on exit
270 if ($cur_step && $err_log) {
271 my $step_log=`cat $err_log`;
272 $step_log =~ s/\s+$//;
273 $exit_msg.="\nThis is the content of the error log file (ended at $cur_step):\n$step_log"
274 if $step_log;
275 my $host=$ENV{'HOST'} || $ENV{'HOSTNAME'};
276 my $msg = $no_error ? qq/$FindBin::Script ($dbfile) finished on machine $host
277 in $wrkdir, without a detectable error.
278 / :
279 qq/$FindBin::Script ($dbfile) encountered an error at step $cur_step
280 Working directory was $wrkdir.
281 /;
282 unless ($no_error) { #an abnormal termination
283 &flog("\nProcess terminated with an error, at step '$cur_step'!");
284 &send_mail({to=>$mailuser, subj=>"$FindBin::Script ($dbfile) error at $cur_step!",
285 body=>$msg.$exit_msg}) if $usemail;
286 &flog($msg);
287 }
288 else {
289 #&flog("*** Done ***") if ($cur_step && lc($cur_step) ne 'Initialization');
290 &send_mail({to=>$mailuser, subj=>"$FindBin::Script ($dbfile) finished.",
291 body=>$msg.$exit_msg}) if $usemail;
292 }
293 print OLDSTDERR $msg;
294 }
295 }
296
297 #== checkCmd -- checks for executable, in the PATH if no full path given
298 sub checkCmd {
299 my $cmd=$_[0];
300 if ($cmd =~ m/^\//) {
301 return (-x $cmd) ? $cmd : '';
302 }
303 my @paths=split(/:/, $ENV{'PATH'});
304 foreach my $p (@paths) {
305 return $p.'/'.$cmd if -x $p.'/'.$cmd;
306 }
307 return '';
308 }
309
310 sub flog {
311 print STDOUT join("\n",@_),"\n";
312 print STDERR join("\n",@_),"\n";
313 }
314
315 sub MErrExit {
316 #print STDERR $_[0]."\n";
317 $exit_msg.=$_[0].$_[1];
318 &flog($exit_msg);
319 exit(1) unless defined($_[1]);
320 die $_[1];
321 }
322
323
324 sub set_step {
325 $cur_step=$_[0];
326 &flog(">>> --- $cur_step [$dbfile] started at ".&getDate());
327 }
328
329 sub end_step {
330 &flog("<<< --- $cur_step [$dbfile] finished at ".getDate());
331 }
332
333 #a basic date function :
334 sub getDate {
335 my $date=localtime();
336 #get rid of the day so Sybase will accept it
337 (my $wday,$date)=split(/\s+/,$date,2);
338 return $date;
339 }
340
341 sub getFullPath {
342 if ($localdirbase) {
343 return ($_[0] =~ m/^\//) ? $_[0] : $localdirbase.'/'.$_[0];
344 }
345 else {
346 return ($_[0] =~ m/^\//) ? $_[0] : $ENV{'PWD'}.'/'.$_[0];
347 }
348 }
349
350 sub getFName {
351 if ($_[0] =~ m/.+[\/\\](.*?)$/) {
352 return $1;
353 }
354 else {
355 return $_[0];
356 }
357 }
358
359 # sub addPath {
360 # my $path=$ENV{'PATH'};
361 # foreach my $p (@_) {
362 # next if ($p eq $path || m/\Q:$p$/
363 # || m/:\Q$p:/ || m/^\Q$p:/);
364 # $path=$p.':'.$path;
365 # }
366 # $ENV{'PATH'}=$path;
367 # }
368
369 sub addPath {
370 my $path=$ENV{'PATH'};
371 my @allp=split(/\:/,$path);
372 my %h;
373 @h{@allp}=();
374 foreach my $p (@_) {
375 next if exists($h{$p});
376 $path=$p.':'.$path;
377 }
378 $ENV{'PATH'}=$path;
379 }
380
381 sub zmergeDirHits {
382 my ($dir)=@_;
383 my $maxfopen=16; #max files to open once -- must never fail due to hitting the system limit
384 my $run=0;
385 my $startDir=getcwd(); # from Cwd module
386 chdir($dir) || die "Cannot change to $dir directory!\n";
387 my $numFiles=0;
388 while (1) {
389 opendir(FDIR, '.') || die "Cannot open directory $dir\n";
390 my @allfiles=readdir(FDIR); #possibly large array w/ all the files from that directory
391 close(FDIR);
392 @allfiles=grep(/\.bz2$/, @allfiles);
393 $numFiles=@allfiles;
394 last if ($numFiles<=$maxfopen);
395 my @files;
396 foreach my $f (@allfiles) {
397 next unless ($f=~m/\.tab\.bz2$/ || $f=~m/zMrg_p\S+\.bz2$/);
398 push(@files, $f);
399 if (@files==$maxfopen) {
400 my $sortcmd="mgmerge -b $nosort -o zMrg_p$run -s 1200 ".join(' ',@files);
401 $run++;
402 &runCmd($sortcmd);
403 if ($debug) {
404 foreach my $rf (@files) {
405 my $ren=$rf;
406 $ren=~s/\.bz2$/\.bz/;
407 rename($rf, $ren);
408 }
409 }
410 else {
411 unlink(@files);
412 }
413 @files=();
414 }
415 }
416 if (@files>1) {
417 my $sortcmd="mgmerge -b $nosort -o zMrg_p$run -s 1200 ".join(' ',@files);
418 $run++;
419 &runCmd($sortcmd);
420 if ($debug) {
421 foreach my $rf (@files) {
422 my $ren=$rf;
423 $ren=~s/bz2$/bz/;
424 rename($rf, $ren);
425 }
426 }
427 else {
428 unlink(@files);
429 }
430 }
431 };
432 chdir($startDir);
433 if ($numFiles) { #some directories may be empty!
434 my $sortcmd="mgmerge -b $nosort -o zdir_$dir -s 1700 $dir/*.bz2";
435 runCmd($sortcmd);
436 }
437 system("/bin/rm -rf $dir") unless $debug;
438 }
439
440 sub runCmd {
441 my $cmd=shift(@_);
442 print STDERR "Running:\n$cmd\n" if $debug;
443 my $errout=`( $cmd ) 2>&1`;
444 my $errcode=$?;
445 print STDERR $errout;
446 if ($errcode) {
447 print STDERR "Error status detected (code=$errcode) at command $cmd\n";
448 exit(1); #exit, but do not report error
449 #- so we can continue to assemble the rest of the clusters!
450 }
451 if ($errout=~/aborted|error|fail|segmentation|violation|cannot|no such file/i) {
452 print STDERR "Error message detected after running command:\n$cmd\n";
453 exit(2);
454 }
455 }

Properties

Name Value
svn:executable *