ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/seqclean.psx
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 11856 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use FindBin;
4 umask 0002;
5
6 chdir($ENV{PVMSX_WD}) if ($ENV{PVMSX_WD});
7
8 my ($file, $numpass, $sliceno, $last, $skipped, $total, $userparams)=@ARGV;
9
10 #$ENV{'PATH'}=$FindBin::Bin.':'.$ENV{'PATH'};
11 $userparams =~ s/\~\+/ /g;
12 my ($dbfile, $flags, $vec_db, $screen_db, $minvlen, $minpid)=split(/:/, $userparams);
13 my ($dbname)=($dbfile =~ m/([^\/]+)$/);
14 $dbname=$dbfile unless $dbname;
15
16 #minimum theoretical length of a terminal vector/linker residue
17 $minvlen=11 unless $minvlen;
18
19 my $cmd;
20 my $wrkfile = $file.'.wrk';
21 my $repfile=sprintf('%06d_%s.cln', $sliceno, $dbname);
22
23 #my $outfile= sprintf('%06d_%s.outfasta', $sliceno, $dbname) if ($flags =~ m/O/);
24 my $log_file='log_std';
25 my $err_file='err_log';
26 my %seqs; # this is the hash with all sequence data
27 my @seqlist; #this is the list with all the sequences in this file
28 my ($tooshort) = ($flags =~ m/S(\d+)/);
29 my $lowqualcheck = ($flags =~ m/M/);
30 my $minseqlen = $tooshort || 50;
31 open(STDERR, '>>'.$err_file);
32 open(STDOUT, '>>'.$log_file);
33 &writeCLRfile($wrkfile, 1); #create sequence hash and work file (length filtered)
34 my $polyA = ($flags =~ m/A/);
35 my $endN = ($flags =~ m/N/);
36 my $dust = ($flags =~ m/L/);
37 #run trimpoly unless disabled
38 if ($polyA || $endN) {
39 my $trimcmd = ($polyA)? 'trimpoly -R -s10 -r8 -ex -l20 ':'trimpoly -R ';
40 if ($endN) {
41 $trimcmd.=' -n1 -m15 ';
42 $trimcmd.=' -N ' unless $polyA;
43 }
44 open (TRIMPOLY, $trimcmd." < $wrkfile |") || die "Error opening '$trimcmd pipe\n";
45 while (<TRIMPOLY>) {
46 chomp;
47 my ($seqname, $percN, $sh5, $sh3, $ilen)=split(/\t/);
48 my $d=$seqs{$seqname} || die "Error: seq '$seqname' not found in hash!\n";
49 $$d[1]+=$sh5;
50 $$d[2]-=$sh3;
51 $$d[4]=$percN;
52 $$d[5]='low_qual' if ($lowqualcheck && $percN>3.00 && $endN); #trash
53 $$d[5]='shortq' if ($tooshort && $$d[2]-$$d[1]+1<$tooshort);
54 $$d[6].="trimpoly[+$sh5, -$sh3]; " if $sh5 || $sh3;
55 }
56 close(TRIMPOLY);
57 writeCLRfile($wrkfile);
58 } #trimpoly part
59
60 #== trash low complexity (unless disabled)
61 if ($dust) {
62 $cmd="mdust $wrkfile -c |";
63 open (INCOMING, $cmd) || &MErrExit("Error running $cmd !\n");
64 my $lastseq;
65 my $lastlen;
66 my $trashcount=0;
67 while (<INCOMING>) {
68 my ($seq_name, $len, $c5,$c3)=split(/\s+/);
69 my $d=$seqs{$seq_name} || die "Error at dust: $seq_name not in hash!\n";
70 my $rlen = ($seq_name eq $lastseq) ? $lastlen-($c3-$c5) : $len-($c3-$c5); #remaining non-lc
71 if ($rlen<40) {
72 $$d[5]='dust' unless ($$d[5]);
73 $$d[6].='low complexity; ';
74 $trashcount++;
75 }
76 $lastseq=$seq_name;
77 $lastlen=$rlen;
78 }
79 close(INCOMING);
80 writeCLRfile($wrkfile) if $trashcount;
81 }
82
83 #=== the vector search; assumes the vector database is already
84 # formatted with formatdb
85
86 my $minperc = ($minpid>0) ? $minpid : 93;
87
88 foreach my $vdb (split(/,/,$vec_db)) {
89 my $changed;
90 my $noflt;
91 { local $/='^'; $noflt=chomp($vdb); }
92 die "Error locating contaminant database $vdb\n" unless -e $vdb;
93 my ($fvdb)=($vdb =~ m/([^\/]+)$/);
94 $fvdb=$vdb unless $fvdb;
95 my $tabout=$wrkfile.'.vtab';
96 $cmd="blastall -p blastn -d $vdb -i $wrkfile ";
97 $cmd.=$noflt?' -F F ':' -F "m D" ';
98 my $blout=$wrkfile.'.blout';
99 $cmd.=' -q -4 -G 3 -E 3 -v 10000 -b 10000 -e 700 -Y 1.75e10 -m 8 > '.$blout;
100 sysrun($cmd);
101 $cmd= 'grep -v "^#" '.$blout.' | sort -k1,1 -k12,12 -nr '." > $tabout";
102 sysrun($cmd);
103
104 if (-s $tabout) {
105 local $_;
106 open (TABOUT, $tabout) || die "Error opening '$tabout' at $vdb search\n";
107 while (<TABOUT>) {
108 next if m/^\s*#/;
109 my @t=split(/\t/);
110 die "Error: invalid line format in $tabout:\n$_\n" unless @t>=12;
111 my $d=$seqs{$t[0]} ||
112 die "Error: query '$t[0]' from $tabout not found in sequence hash!\n";
113 next if $$d[5];
114 #distance of hit from either end:
115 my ($d5, $d3)=($t[6]-1, ($$d[8]-$$d[7]+1)-$t[7]);
116 #hit should start within 30% distance of either (original) end
117 my $maxdist=0.30*$$d[3];
118 my $hitlen=$t[7]-$t[6];
119 my $mindist=$d5>$d3?$d3:$d5;
120 my $minhitlen=$minvlen+$mindist/2>35 ? 35 : $minvlen+$mindist/2;
121 #validate the hit: the further it is, the longer it should be
122
123 if ($t[2]>=$minperc && $mindist<$maxdist && $hitlen>=$minhitlen) {
124 my $adjusted;
125 if ($mindist == $d5) {
126 #trim 5' end
127 if ($$d[7]+$t[7]>$$d[1]) {
128 $$d[1]=$$d[7]+$t[7]; #set the new end5 after the 3' end of the vector hit
129 $adjusted=1;
130 }
131 }
132 else {
133 #trim 3' end
134 if ($$d[7]+$t[6]-2<$$d[2]) {
135 $$d[2]=$$d[7]+$t[6]-2; #set the new end3 before the 5' end of the vector hit
136 $adjusted=1;
137 }
138 }
139 if ($adjusted) {
140 $changed=1;
141 if ($$d[2]-$$d[1]<$minseqlen) {
142 $$d[5]=$fvdb; #trash
143 $$d[1]=$$d[2] if $$d[1]>$$d[2];
144 }
145 $$d[6].=" $t[1]".':<'.($$d[7]+$t[6]-1).'-'.($$d[7]+$t[7]-1).'>;' if $adjusted;
146 }
147 }
148 }
149 close(TABOUT);
150 }
151 unlink($tabout);unlink($blout);
152 writeCLRfile($wrkfile) if $changed;
153 } # vector search loop
154
155 # -- screening searches: large contaminant hit expected; that is,
156 # at least 60% of the query should get a hit of at least 96% identity
157 $minperc = ($minpid>10) ? $minpid : 96;
158 foreach my $sdb (split(/,/,$screen_db)) {
159 die "Error locating contaminant database $sdb\n" unless -e $sdb;
160 my ($fsdb)=($sdb =~ m/([^\/]+)$/);
161 $fsdb=$sdb unless $fsdb;
162 my $tabout=$wrkfile.'.stab';
163 my $changed;
164 # -- 2.2.11-13 megablast bug: it IGNORES -JF flag!
165 # at least blastall still works as it should..
166 # so let's just use mgblast instead here:
167 $cmd="mgblast -d $sdb -i $wrkfile -p$minperc".
168 ' -W18 -JF -F "m D" -X30 -D3 | '.
169 ' grep -v "^#" | sort -k1,1 -k12,12 -nr '.
170 " > $tabout";
171 # $minscov=60 unless $minscov;
172 sysrun($cmd);
173 if (-s $tabout) {
174 local $_;
175 open (TABOUT, $tabout) || die "Error opening '$tabout' at $sdb search\n";
176 while (<TABOUT>) {
177 next if m/^\s*#/;
178 my @t=split(/\t/);
179 die "Error: invalid line format in $tabout:\n$_\n" unless @t>=12;
180 my $d=$seqs{$t[0]} ||
181 die "Error: query '$t[0]' from $tabout not found in sequence hash!\n";
182 next if $$d[5];
183 #distance of hit from either [original] end:
184 my ($d5, $d3)=($t[6]-1, ($$d[8]-$$d[7]+1)-$t[7]);
185 my $hitlen=$t[7]-$t[6];
186 #my $minhitlen= (0.75 * ($$d[8]-$$d[7]+1) < $minseqlen) ? $minseqlen: 0.75 * ($$d[8]-$$d[7]+1);
187 my $minhitlen = 60; #at least 60 base pairs overlap!
188
189 my $mindist=$d5>$d3?$d3:$d5;
190 if ($hitlen>=$minhitlen) {
191 my $adjusted;
192 if ($mindist == $d5) {
193 #trim 5' end
194 if ($$d[7]+$t[7]>$$d[1]) {
195 $$d[1]=$$d[7]+$t[7]; #set the new end5 after the 3' end of the vector hit
196 $adjusted=1;
197 }
198 }
199 else {
200 #trim 3' end
201 if ($$d[7]+$t[6]-2<$$d[2]) {
202 $$d[2]=$$d[7]+$t[6]-2; #set the new end3 before the 5' end of the vector hit
203 $adjusted=1;
204 }
205 }
206 if ($adjusted) {
207 $changed=1;
208 if ($$d[2]-$$d[1]<$minseqlen) {
209 $$d[5]=$fsdb ; #trash
210 $$d[1]=$$d[2] if $$d[1]>$$d[2];
211 }
212 $$d[6].=" $t[1]".':<'.($$d[7]+$t[6]-1).'-'.($$d[7]+$t[7]-1).'>;' if $adjusted;
213 }
214 }
215 } #while
216 close(TABOUT);
217 }
218 unlink($tabout);
219 writeCLRfile($wrkfile) if $changed;
220 } # larger contaminant screening
221
222 # run trimpoly mostly for computing the percentage of undetermined bases
223 my $trimcmd = ($polyA)? 'trimpoly -R -s10 -r8 -ex -l20 ':'trimpoly -R ';
224 $trimcmd.= ($endN ? ' -n1 -m15 ' : ' -n90 -m1 ');
225 $trimcmd.=' -N ' unless $polyA;
226 open (TRIMPOLY, $trimcmd." < $wrkfile |") || die "Error opening final '$trimcmd' pipe\n";
227 while (<TRIMPOLY>) {
228 chomp;
229 my ($seqname, $percN, $sh5, $sh3, $ilen)=split(/\t/);
230 my $d=$seqs{$seqname} || die "Error: seq '$seqname' not found in hash!\n";
231 $$d[1]+=$sh5;
232 $$d[2]-=$sh3;
233 $$d[4]= $percN;
234 $$d[5]='low_qual' if ($lowqualcheck && $percN>3.00 && $endN); #trash
235 $$d[5]='shortq' if ($tooshort && $$d[2]-$$d[1]+1<$tooshort);
236 $$d[6].="trimpoly[+$sh5, -$sh3]; " if $sh5 || $sh3;
237 }
238 close(TRIMPOLY);
239
240 #------- write cleaning report file for this slice
241 open(CLNFILE, '>'.$repfile) || die "Error creating file '$repfile'\n";
242 foreach my $seqname (@seqlist) {
243 my $d=$seqs{$seqname} || die "Error writing cleaning report: sequence $seqname not found!\n";
244 printf(CLNFILE "%-16s\t%5.2f\t%5s\t%5s\t%5s\t%12s\t%s\n",$seqname,
245 $$d[4],$$d[1],$$d[2],$$d[3],$$d[5], $$d[6]);
246 }
247 close(CLNFILE);
248
249 #------ cleaning up
250 $file.='*';
251 unlink(<${file}>);
252 print "<< finished $sliceno\n";
253
254 #==================
255 sub sysrun {
256 local $_ = `($_[0]) 2>&1`;
257 my $code=$?;
258 print STDERR $_;
259 my @cores = <core.*>;
260 if ($code && @cores>0) {
261 print STDERR "ERROR: command \n$cmd\n exited with code $code\n";
262 exit(1);
263 }
264 if ($code || m/error|failure|fatal|segmentation|core/i) {
265 if (($_[0] =~ m/^\s*blastall/i)) {
266 my @olines=split(/\n/);
267 @olines=grep(m/error/i && !m/couldn't uncache/i && !m/No valid letters to be indexed/i,
268 @olines);
269 die "blastall unexpected error at\n$_[0]\n" if (@olines>0);
270 #my @e = (m/(error)/igs);
271 #my @r = (m/(couldn't uncache)/igs);
272 #if (scalar(@e) == scalar(@r)) {
273 # print STDERR "Please ignore (code $code) 'uncache' errors above.\n";
274 # return;
275 # }
276 #print STDERR "Warning: error code $code with not just uncaching errors at\n$_[0]\n";
277 }
278 else {
279 die "Error code: $code, message:\n$_) at command:\n$_[0]\n";
280 }
281 }
282
283 }
284 #----------------------------------------------------------------------------
285 #printing fasta sequence (60 chars per line), uppercase
286 # $seq is just a reference to an actual nucleotide string
287 # if e5,e3 are specified and e3>e5 only CLEAR range will be returned
288 #----------------------------------------------------------------------------
289 sub print_fasta {
290 my ($seq_name, $seq, $e5, $e3) = @_;
291 my $pos=$e5-1;
292 print '>'.$seq_name."\n" if $seq_name;
293 if ($e5 && $e3 && $e3>$e5) { #assumes e3 and e5 make sense
294 while ($e3-$pos>=60) {
295 print uc(substr($$seq,$pos,60))."\n";
296 $pos+=60;
297 }
298 print uc(substr($$seq,$pos,$e3-$pos))."\n"
299 if ($e3-$pos>0);
300 }
301 else { #no valid clear range provided, print whole sequence
302 my $len=length($$seq);
303 my $pos=0;
304 while ($pos<$len) {
305 print uc(substr($$seq,$pos,60))."\n";
306 $pos+=60;
307 }
308 }
309 }
310
311 sub writeCLRfile {
312 my ($outfile, $create)=@_;
313 local *SLICEFILE;
314 local *OUTFILE;
315 open(SLICEFILE, $file) || die "Error opening slice file '$file'\n";
316 local $/="\n>";
317 open (OUTFILE, ">$outfile") || die "Error creating work file '$outfile'\n";
318 my $oldsel=select(OUTFILE); # outfile, only containing clear range, non-trashed sequences
319 while (<SLICEFILE>) {
320 s/^>//;
321 chomp;
322 my ($defline, $seq)=(m/^(.+?)\n(.+)/s);
323 my ($seqname, $seqdescr) = ($defline =~ m/^(\S+)\s*(.*)/);
324 $seq =~ s/\s+//g;
325 my $trash;
326 my ($e5, $e3);
327 if ($create) { #descr, end5, end3, oldlength, percent N, trash code, cleaning comments, f5, f3
328 #already trash sequences shorter than 100 with code 'i'
329 my $seqlen=length($seq);
330 $trash='short' if ($tooshort && $seqlen<$tooshort);
331 # 0 1 2 3 4 5 6 7 8
332 $seqs{$seqname}=[$seqdescr, 1, $seqlen, $seqlen, 0 , $trash, '', 1, $seqlen];
333 push(@seqlist, $seqname);
334 }
335 else { #extract clear range
336 my $d=$seqs{$seqname} || die "Error: seq '$seqname' not found in hash!\n";
337 ($e5,$e3, $trash)=($$d[1], $$d[2], $$d[5]);
338 ($$d[7], $$d[8])=($e5, $e3); #coordinates of the sequence written in this working file
339 }
340 &print_fasta($seqname,\$seq, $e5, $e3) unless $trash;
341 }
342 select($oldsel);
343 close OUTFILE;
344 close SLICEFILE;
345 }

Properties

Name Value
svn:executable *