ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/seqclean
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 11824 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;
5 use lib "$FindBin::Bin"; #just add the current script directory to the lib directory
6 use Mailer;
7
8 my $usage = q/
9 seqclean <seqfile> [-v <vecdbs>] [-s <screendbs>] [-r <reportfile>]
10 [-o <outfasta>] [-n slicesize] [-c <num_CPUs>] [-g <gridengine>]
11 [-l <minlen>] [-N] [-A] [-L] [-x <min_pid>] [-y <min_vechitlen>]
12 [-m <e-mail>]
13
14 Parameters
15
16 <seqfile>: sequence file to be analyzed (multi-FASTA)
17
18 -c use the specified number of CPUs
19 -g use grid <gridengine> ('condor', 'sge' or 'smp')
20 (default: smp, i.e. local execution)
21 -n number of sequences taken at once in each
22 search slice (default 2000)
23 -v comma delimited list of sequence files
24 to use for end-trimming of <seqfile> sequences
25 (usually vector sequences)
26 -l during cleaning, consider invalid the sequences sorter
27 than <minlen> (default 100)
28 -s comma delimited list of sequence files to use for
29 screening <seqfile> sequences for contamination
30 (mito\/ribo or different species contamination)
31 -r write the cleaning report into file <reportfile>
32 (default: <seqfile>.cln)
33 -o output the "cleaned" sequences to file <outfasta>
34 (default: <seqfile>.clean)
35 -x minimum percent identity for an alignment with
36 a contaminant (default 96)
37 -y minimum length of a terminal vector hit to be considered
38 (>11, default 11)
39 -N disable trimming of ends rich in Ns (undetermined bases)
40 -M disable trashing of low quality sequences
41 -A disable trimming of polyA\/T tails
42 -L disable low-complexity screening (dust)
43 -I do not rebuild the cdb index file
44 -m send e-mail notifications to <e-mail>
45 /;
46
47
48 my @start_params=@ARGV;
49 my $usemail=0; #set to 1 after initialization
50 my $no_error=1;
51 my $err_log;
52 my $exit_msg='';
53 die $usage."\n" if (@ARGV<1 || $ARGV[0] =~ /^\-/);
54
55 my $wrkdir=$ENV{'PWD'};
56
57 $ENV{'PATH'}=$FindBin::Bin.':'.$FindBin::Bin.'/bin:'.$ENV{'PATH'};
58
59 my $dbfile=shift(@ARGV);
60
61 umask 0002;
62
63 getopts('MNALIl:n:v:s:g:o:r:x:y:c:m:z:')
64 || die "$usage\nError getting options!\n";
65
66 $dbfile=~s{^\./}{};
67
68 die "Error: input file '$dbfile' not found\n" unless -e $dbfile;
69 my $dbfullpath=($dbfile=~m/^\//) ? $dbfile : ($wrkdir.'/'.$dbfile);
70 my ($dbname)=( $dbfullpath =~ m/([^\/]+)$/ );
71 my $cpus = $Getopt::Std::opt_c || 1;
72
73 my $mailuser = $Getopt::Std::opt_m;
74
75 my $outfile=$Getopt::Std::opt_o || $dbname.'.clean';
76 undef $outfile if ($outfile eq '.' || lc($outfile) eq 'none');
77 my $cln_report=$Getopt::Std::opt_r || $dbname.'.cln';
78
79 #=- logging system initialization:
80 my $log_file="seqcl_$dbname.log";
81 $err_log="err_seqcl_$dbname.log";
82 unlink($log_file, $err_log);
83 open(OLDSTDERR, ">&STDERR");
84
85 open(STDOUT, ">$log_file") || &MErrExit("Failed to redirect STDOUT to $log_file");
86 open(STDERR, ">$err_log") || &MErrExit("Failed to redirect STDERR to $err_log");
87
88 my @vecdbs=split(/[\,:\|]/, $Getopt::Std::opt_v);
89 foreach my $vdb (@vecdbs) {
90 my $noflt;
91 {local $/='^'; $noflt=(chomp($vdb)?'^':''); }
92 die "$usage Error: vector file '$vdb' not found.\n" unless -e $vdb;
93 $vdb=~s{^\./}{};
94 $vdb=$wrkdir.'/'.$vdb unless $vdb=~m{^/};
95 $vdb.=$noflt;
96 }
97 &flog("* Using trimming files:\n ",join("\n ",@vecdbs)) if @vecdbs;
98
99 my @screendbs=split(/[\,:\|]/, $Getopt::Std::opt_s);
100 foreach my $sdb (@screendbs) {
101 die "$usage Error: contaminant file '$sdb' not found.\n" unless -e $sdb;
102 $sdb=~s{^\./}{};
103 $sdb=$wrkdir.'/'.$sdb unless $sdb=~m{^/};
104 }
105 &flog("* Using screening files:\n ",join("\n ",@screendbs)) if @screendbs;
106
107 my $startdate=getDate();
108 my $psx_clean=$Getopt::Std::opt_z || 'seqclean.psx';
109 my $r=`which $psx_clean`;chomp($r);
110 die "Error: cannot find the slice cleaning script '$psx_clean' in the shell's path!\n"
111 unless $r;
112
113 $psx_clean=$r;
114
115 &flog("$FindBin::Script running options: \n".$FindBin::Script.' '.join(" ", @start_params));
116 &flog(" Standard log file: $log_file");
117 &flog(" Error log file: $err_log");
118 &flog(" Using $cpus CPUs for cleaning");
119
120 $no_error=0;
121 $usemail=1 if $mailuser;
122
123 my $numseqs = $Getopt::Std::opt_n || 1000;
124 my $minlen = $Getopt::Std::opt_l || 100;
125
126 my ($cmd, $psxparam);
127 #
128 goto SKIPINDEX if ($Getopt::Std::opt_I || !$outfile);
129 &flog("-= Rebuilding $dbname cdb index =-");
130 system("cdbfasta $dbfile -o $dbname.cidx") &&
131 &MErrExit("Error at cdbfasta $dbfile -o $dbname.cidx");
132
133 SKIPINDEX:
134 #================ Actual work starts here ==============
135 system("/bin/rm -rf cleaning_[1-9]*"); #remove previous cleaning directories!
136 my $flags=(($Getopt::Std::opt_A)?'':'A').(($Getopt::Std::opt_N)?'':'N').
137 (($Getopt::Std::opt_L)?'':'L').(($Getopt::Std::opt_M)?'':'M');
138
139 $flags.='S'.$minlen;
140 my $minvechit=$Getopt::Std::opt_y || 11 ;
141 my $minpid= $Getopt::Std::opt_x || '0' ;
142 $psxparam=join(':', ($dbfullpath,$flags,join(',', @vecdbs),
143 join(',',@screendbs), $minvechit, $minpid));
144 $psxparam =~ s/ /\~\+/g;
145
146 my $gridengine= $Getopt::Std::opt_g || 'smp';
147 #my $usepvm = (-s $cpus) ? 1:0;
148
149 my $psxcmd;
150 #if ($gridengine) {
151 $psxcmd="gridx -g $gridengine -p $cpus";
152 # }
153 # else {
154 # if ($usepvm) {
155 # $psxcmd="pvmsx -L -m $cpus ";
156 # }
157 # else {
158 # die $usage."Invalid number of CPUs" unless ($cpus > 0 && $cpus <= 50);
159 # $psxcmd="gridx -g smp -p $cpus ";
160 # }
161 # }
162
163 $cmd=$psxcmd." -n $numseqs -i $dbfile -d cleaning -C '$psxparam' -c '$psx_clean'";
164 &flog(" Launching actual cleaning process:\n $cmd");
165 system($cmd)
166 && &MErrExit("Error at '$cmd'\n");
167
168 #==== merge the cleaning reports for all slices:
169
170
171 &flog("Collecting cleaning reports\n");
172 my $sortfile='outparts_cln.sort';
173 &sortParts('cleaning',$dbname.'.cln', $sortfile);
174 #collect all parts listed in $sortfile
175 # into the file $mskfile
176 &catParts($sortfile, $cln_report);
177
178 open(CLNREPORT, $cln_report) || &MErrExit("Error opening $cln_report");
179
180 if ($outfile) { #the only way to disable the output fasta
181 $cmd="| cdbyank $dbname.cidx -d $dbfile -R -o $outfile";
182 open(TOFETCH, $cmd) || &MErrExit("Error opening pipe to cdbyank ($cmd)");
183 }
184 my ($total, $trashed, $trimmed);
185 my %trash;
186 while(<CLNREPORT>) {
187 next if m/^\s*#/;
188 chomp;
189 my @t=split(/\t/);
190 &MErrExit("Invalid input line in $cln_report:\n$_") unless @t>4;
191 foreach (@t) { s/^\s+//; s/\s+$//; }
192 $total++;
193 if ($t[5]) {
194 $trash{$t[5]}++;
195 $trashed++;
196 }
197 else { #not trashed, fetch it
198 print TOFETCH join(' ',@t[0,2,3])."\n" if $outfile;
199 $trimmed++ if ($t[2]>1 || $t[3]<$t[4]);
200 }
201 }
202 close(CLNREPORT);
203 close(TOFETCH) if ($outfile ne '.');
204 &flog('*'x50);
205 &flog("Sequences analyzed: ".sprintf('%9d',$total));
206 &flog('-' x 35);
207 &flog(" valid: ".
208 sprintf('%9d (%d trimmed)',$total-$trashed,$trimmed));
209 &flog(" trashed: ".sprintf('%9d',$trashed));
210 &flog('*'x50);
211 if ($trashed>0) {
212 my ($k,$v);
213 &flog("----= Trashing summary =------") ;
214 while (($k, $v)=each(%trash)) {
215 &flog(sprintf(" %24s:%9d", 'by '."'$k'",$v));
216 }
217 &flog('-' x 30);
218 }
219 &flog("Output file containing only valid and trimmed sequences: $outfile")
220 if ($outfile);
221 &flog("For trimming and trashing details see cleaning report : $cln_report");
222 &flog('-' x 50);
223 $no_error=1;
224
225 #===================== SUBROUTINES ===================
226
227 END { #to be executed on exit
228 if ($err_log) {
229 my $step_log=`cat $err_log cleaning_*/err_log`;
230 $step_log =~ s/\s+$//;
231 $exit_msg.="\nThis is the content of the error log file:\n$step_log"
232 if $step_log;
233 my $host=$ENV{'HOST'} || $ENV{'HOSTNAME'};
234 my $msg = $no_error ? "$FindBin::Script ($dbfile) finished on machine $host\n".
235 " in $wrkdir, without a detectable error.\n" :
236 "$FindBin::Script ($dbfile) encountered an error.\n".
237 "Working directory was $wrkdir";
238 unless ($no_error) { #an abnormal termination
239 &flog("\nProcess terminated with an error!");
240 &send_mail({to=>$mailuser, subj=>"$FindBin::Script ($dbfile) error!",
241 body=>$msg.$exit_msg}) if $usemail;
242 &flog($msg);
243 }
244 else {
245 &send_mail({to=>$mailuser, subj=>"$FindBin::Script ($dbfile) finished.",
246 body=>$msg.$exit_msg}) if $usemail;
247 }
248 print OLDSTDERR $msg;
249 }
250 }
251
252 sub flog {
253 print STDOUT join("\n",@_),"\n";
254 print STDERR join("\n",@_),"\n";
255 print OLDSTDERR join("\n",@_),"\n" if $err_log;
256 }
257
258 sub MErrExit {
259 #print STDERR $_[0]."\n";
260 $exit_msg.=$_[0].$_[1];
261 &flog($exit_msg);
262 $no_error=0;
263 exit(1) unless defined($_[1]);
264 die $_[1];
265 }
266
267 #a basic date function :
268 sub getDate {
269 my $date=localtime();
270 #get rid of the day so Sybase will accept it
271 (my $wday,$date)=split(/\s+/,$date,2);
272 return $date;
273 }
274
275
276 #--------------------------------------------
277 # sortParts(dirbase, filesuffix, sortfile)
278 #--------------------------------------------
279 # sort all the file names by slice# and write
280 # their names in the file <sortfile>
281 # sorted accordingly
282 # the sliceno is surrounded by tab
283 #--------------------------------------------
284 sub sortParts {
285 my ($dirbase, $suffix, $sortfile)=@_;
286 local *SORTFILE;
287 open(SORTFILE, '>'.$sortfile) ||
288 die "Error creating slice-gathering file '$sortfile'\n";
289 my @dirs = <${dirbase}_[0-9]*>;
290 die "Error: no ${dirbase}_[0-9]* directories found!\n"
291 unless @dirs>0;
292 my $numfiles;
293 my @dbg;
294 foreach my $d (@dirs) {
295 next unless -d $d && ($d=~m/\Q$dirbase\E_\d+$/);
296 opendir(DIRH, $d) || die "Error at opendir $d\n";
297 while (my $fentry=readdir(DIRH)) {
298 push(@dbg, $fentry);
299 next unless $fentry=~s/^(\d+)(_\Q$suffix\E)$/\t$1\t$2/;
300 my $toprint='./'.$d.'/'.$fentry;
301 print(SORTFILE $toprint."\n") ||
302 die ("Error at printing '$toprint' to $sortfile!\n");
303 $numfiles++;
304 }
305 closedir(DIRH);
306 }
307 close(SORTFILE);
308 #die "Error: no files found matching /^(\\d+)(_\\Q$suffix\\E)\$/\n".
309 # "files:\n".join("\n",@dbg)."\n"
310 # unless $numfiles>0;
311 my $cmd="sort -k2,2 -n -o $sortfile $sortfile";
312 system($cmd) && die("Error at: $cmd\n");
313 }
314 #--------------------------------------------
315 #catParts(sortfile, outfile [, fnameproc, flineproc] )
316 #---------------------------
317 # Cautious concatenation of the parts in sortfile
318 # into the outfile
319 # -if fnameproc is given, the result of fnameproc($fname)
320 # is opened as a source file
321 # -if flineproc is given, the result of it flineproc($fline)
322 # if written to outfile
323 #--------------------------------------------
324 sub catParts {
325 my ($sortfile, $outfile, $fnameproc, $flineproc)=@_;
326 local *FHANDLE;
327 local *SORTFILE;
328 local *SFILE;
329
330 #my ($outlock, $fhout) = &createExclusive($outfile)
331 # || die "catParts Error at exclusive creating of file $outfile\n";
332 open(FHANDLE, '>'.$outfile) || die "Error creating $outfile\n";
333 open(SORTFILE, $sortfile) ||
334 die("Error opening $sortfile\n");
335 close(FHANDLE) unless ($flineproc);
336 my @acc; #accumulates file names for the faster cat operation
337 while (my $fname=<SORTFILE>) {
338 $fname=~tr/\t//d;
339 chomp($fname);
340 $fname=&$fnameproc($fname) if $fnameproc;
341 if ($flineproc) {
342 open(SFILE, $fname) || die "Error opening slice file '$fname'\n";
343 local $_;
344 while (<SFILE>) {
345 if (my $wline=&$flineproc($_)) {
346 print(FHANDLE $wline) ||
347 die "Error printing $wline to $outfile\n";
348 }
349 }
350 close(SFILE);
351 } #flineproc line filter provided
352 else { # simple cat
353 die "Error -- file '$fname' cannot be located!\n" unless -f $fname;
354 push(@acc, $fname);
355 if (@acc>20) { #20 files at a time
356 my $cmd="cat '".join("' '",@acc)."' >> $outfile";
357 system($cmd) && die("Error at: $cmd\n");
358 @acc=();
359 }
360 }
361 } #while filename
362 close(SORTFILE);
363 if ($flineproc) {
364 close(FHANDLE)
365 }
366 elsif (@acc>0) {
367 my $cmd="cat '".join("' '",@acc)."' >> $outfile";
368 system($cmd) && die("Error at: $cmd\n");
369 @acc=();
370 }
371 }

Properties

Name Value
svn:executable *