ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gicl_ncbitab.psx
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 5619 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use FindBin;
4
5 umask 0002;
6 #the line below is needed if pvmsx is used
7 # also, the error condition is set only by the presence of $file
8 chdir($ENV{PVMSX_WD}) if ($ENV{PVMSX_WD});
9
10
11 $ENV{'PATH'}=$FindBin::Bin.':'.$ENV{'PATH'};
12
13 #so for pvmsx to consider the task was successful, $file must be deleted!
14 #==============
15 # 1 is the name of the fasta sequence input file
16 # 2 is the # of sequences in ${1} should = 1 for this script
17 # 3 is the slice no. being processed by sx
18 # 4 is 0 if not the last file, 1 if the last file
19 # 5 is the # of sequences skipped initially
20 # 6 is the # of sequences to be processed (-1 = ALL)
21 # 7 user parameter
22 # 1 2 3 4 5 6
23 my ($file, $numpass, $slice_num, $last, $skipped, $total, $userparams)=@ARGV;
24 #user parameters:
25 # 1) database (full path)
26 # 2) PID cutoff (using 94 if not specified)
27 # 3) maximum overhang
28 # 4) minimum overlap
29 # 5) extra flags (optional)
30 # flags: M = no masking, D= separate database (not all-vs-all)
31 # G = show gaps
32 my ($searchdb,$minpid,$maxovh,$minovl, $xflags)=split(/:/,$userparams);
33 my $mgblast_res=$file.'.tab';
34 my $nomasking = $xflags=~/M/;
35 my $gapinfo = $xflags=~/G/;
36 #my $otherDb = $xflags=~/D/;
37 my $otherDb = 1; #enforce this here
38
39 $minpid=94 unless $minpid>50;
40 $minovl=40 unless $minovl;
41 my $log_file='log_std';
42 my $err_file='err_log';
43 open(STDERR, '>>'.$err_file);
44 open(STDOUT, '>>'.$log_file);
45
46 my $toskip=($file =~ m/_\@(\d+)_v\d+\.\d+/) ? $1 : $skipped+$numpass*($slice_num-1);
47
48 #= mgblast weakness - when some query seqs are entirely masked
49 #= or what's left is just low complexity, it will abort the search
50 #= so let's filter the input in advance
51 my %seqlens;
52 unless ($nomasking) {
53 my $renfile=$file.'_cpy';
54 rename($file, $renfile) || die "Cannot rename '$file' to '$renfile'!\n";
55 local $/="\n>";
56 open (INFILE, "mdust -m L < $renfile |") || die "Cannot open renamed file '$renfile'\n";
57 open (OUTFILE, '>'.$file) || die "Cannot create flt file '$file'\n";
58 open (MASKED, '>>masked.lst') || die "Cannot open masked.lst for append\n";
59 open (TOOSHORT, '>>tooshort.lst') || die "Cannot open tooshort.lst for append\n";
60
61 my $counter;
62 while(<INFILE>) {
63 s/^>//;
64 chomp;
65 $counter++;
66 my ($defline, $seq)=(m/^(.+?)\n(.+)/s);
67 my $s=$seq;
68 $s =~ tr/\n \t//d;
69 my ($seqname)=($defline=~m/^(\S+)/);
70 my $seqlen=length($s);
71 $seqlens{$seqname}=$seqlen;
72 if ($seqlen<$minovl) {
73 print TOOSHORT $defline."\n";
74 next;
75 }
76 my @uc=($s =~ m/([A-M,O-W,Y,Z]+)/g);
77 my $valid;
78 foreach my $l (@uc) {
79 if (length($l)>12) { # a valid stretch of at least 12nt would suffice
80 $valid=1;
81 last;
82 }
83 }
84 if ($valid) {
85 print OUTFILE '>'.$defline."\n".$seq."\n";
86 }
87 else {
88 print MASKED $defline."\n";
89 }
90 } #while
91 close(INFILE);close(OUTFILE);close(MASKED);close(TOOSHORT);
92 die "Error: no sequence in this slice!\n" unless $counter;
93 unlink($renfile);
94 }
95 else { # nomasking was requested, but we still need to get the sequence lengths
96 open (INFILE, $file) || die "Cannot open input file '$file'\n";
97 local $/="\n>";
98 while(<INFILE>) {
99 s/^>//;
100 chomp;
101 my ($defline, $seq)=(m/^(.+?)\n(.+)/s);
102 $seq =~ tr/\n \t//d;
103 my ($seqname)=($defline=~m/^(\S+)/);
104 $seqlens{$seqname}=length($seq);
105 }
106 close(INFILE);
107 }
108 # -- for now, we use discontiguous megablast in dual mode
109 my $cmd="megablast -i $file -d $searchdb -p $minpid -m8 -N2 -W11 -t18 -X16 ".
110 "-JF -v9000000 -b9000000 ";
111 #$cmd.=$gapinfo ? ' -D5 ':' -D4 ';
112 $cmd.= ($nomasking) ? ' -FF -UF ' : ' -UT -F "m D" ';
113 #unless ($otherDb) {
114 # $cmd.=' -KT ';
115 # $cmd.= " -k $toskip " if $toskip>0;
116 # }
117 my $slno=sprintf("slice:%09d",$slice_num);
118 my $cmdpipe="($cmd | sed 's/^/STDOUT#:/')".' 2>&1 |';
119 print STDERR ">>$slno: $cmdpipe\n";
120 my $errmsg;
121
122 my %tlens; #store lengths of target sequences
123
124 open(OUTFILE, '>'.$mgblast_res) || die ("Error creating result file $mgblast_res\n");
125 open(SPIPE, $cmdpipe) || die ("Error opening pipe $cmd\n");
126 while (<SPIPE>) {
127 if (s/^STDOUT#://) { # stdout
128 next if m/^# /;
129 chomp;
130 my ($qname, $tname, $pid, $alen, $mism, $gaps, $qstart, $qend,
131 $tstart, $tend, $eval, $bitscore)=split(/\t/);
132 next unless $alen>=$minovl;
133 my $qlen=$seqlens{$qname};
134 die("Error: didn't get length for qry $qname\n") unless $qlen>10;
135 my $tlen=$tlens{$tname};
136 unless ($tlen) {
137 $tlen=getDbSeqLen($tname);
138 $tlens{$tname}=$tlen;
139 }
140 my $strand='+';
141 ($strand, $tstart, $tend, $qstart, $qend) =
142 ('-', $tend, $tstart, $qend, $qstart) if ($tstart>$tend);
143 #my ($minq, $maxq)=($qstart>$qend)?($qend,$qstart):($qstart,$qend);
144 #next unless $ovh>$maxovh ? # let's not worry about overhangs for now, use later filtering for that if needed..
145 print OUTFILE join("\t", $qname, $qlen, $qstart, $qend,
146 $tname, $tlen, $tstart, $tend, $pid, int($bitscore), $eval, $strand)."\n";
147 }
148 else {
149 $errmsg.=$_;
150 }
151
152 }
153
154 close(SPIPE);
155 close(OUTFILE);
156
157 print STDERR "<<$slno: done.\n";
158
159 if ($? || ($errmsg=~/ERROR/i) || ($errmsg=~/Segmentation/i)) {
160 print STDERR "!Error at:\n$cmd\n";
161 print STDERR "$errmsg\n";
162 exit(1);
163 }
164 $cmd="sort -k11,11g -k10,10nr -k9,9nr $mgblast_res | bzip2 -cf > $mgblast_res.bz2";
165 my $r=system($cmd);
166 if ($r && ($r>>8 != 2) ) {
167 die "Error sorting/compressing the results file '$mgblast_res'\n$cmd\n";
168 }
169
170 unlink($file, $mgblast_res);
171 exit 0;
172
173 sub getDbSeqLen {
174 my ($tname)=@_;
175 my $r=`cdbyank -a '$tname' $searchdb.cidx`;
176 chomp($r);
177 die ("Error: cannot cdbyank $tname from $searchdb.cidx!\n") unless $r;
178 $r=~s/^[^\n\r]+//s;
179 $r=~tr/\n\r //d;
180 return length($r);
181 }

Properties

Name Value
svn:executable *