ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/gicl_ncbitab.psx
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (9 years, 1 month ago) by gpertea
File size: 5619 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     use strict;
3     use FindBin;
4    
5     umask 0002;
6     #the line below is needed if pvmsx is used
7     # also, the error condition is set only by the presence of $file
8     chdir($ENV{PVMSX_WD}) if ($ENV{PVMSX_WD});
9    
10    
11     $ENV{'PATH'}=$FindBin::Bin.':'.$ENV{'PATH'};
12    
13     #so for pvmsx to consider the task was successful, $file must be deleted!
14     #==============
15     # 1 is the name of the fasta sequence input file
16     # 2 is the # of sequences in ${1} should = 1 for this script
17     # 3 is the slice no. being processed by sx
18     # 4 is 0 if not the last file, 1 if the last file
19     # 5 is the # of sequences skipped initially
20     # 6 is the # of sequences to be processed (-1 = ALL)
21     # 7 user parameter
22     # 1 2 3 4 5 6
23     my ($file, $numpass, $slice_num, $last, $skipped, $total, $userparams)=@ARGV;
24     #user parameters:
25     # 1) database (full path)
26     # 2) PID cutoff (using 94 if not specified)
27     # 3) maximum overhang
28     # 4) minimum overlap
29     # 5) extra flags (optional)
30     # flags: M = no masking, D= separate database (not all-vs-all)
31     # G = show gaps
32     my ($searchdb,$minpid,$maxovh,$minovl, $xflags)=split(/:/,$userparams);
33     my $mgblast_res=$file.'.tab';
34     my $nomasking = $xflags=~/M/;
35     my $gapinfo = $xflags=~/G/;
36     #my $otherDb = $xflags=~/D/;
37     my $otherDb = 1; #enforce this here
38    
39     $minpid=94 unless $minpid>50;
40     $minovl=40 unless $minovl;
41     my $log_file='log_std';
42     my $err_file='err_log';
43     open(STDERR, '>>'.$err_file);
44     open(STDOUT, '>>'.$log_file);
45    
46     my $toskip=($file =~ m/_\@(\d+)_v\d+\.\d+/) ? $1 : $skipped+$numpass*($slice_num-1);
47    
48     #= mgblast weakness - when some query seqs are entirely masked
49     #= or what's left is just low complexity, it will abort the search
50     #= so let's filter the input in advance
51     my %seqlens;
52     unless ($nomasking) {
53     my $renfile=$file.'_cpy';
54     rename($file, $renfile) || die "Cannot rename '$file' to '$renfile'!\n";
55     local $/="\n>";
56     open (INFILE, "mdust -m L < $renfile |") || die "Cannot open renamed file '$renfile'\n";
57     open (OUTFILE, '>'.$file) || die "Cannot create flt file '$file'\n";
58     open (MASKED, '>>masked.lst') || die "Cannot open masked.lst for append\n";
59     open (TOOSHORT, '>>tooshort.lst') || die "Cannot open tooshort.lst for append\n";
60    
61     my $counter;
62     while(<INFILE>) {
63     s/^>//;
64     chomp;
65     $counter++;
66     my ($defline, $seq)=(m/^(.+?)\n(.+)/s);
67     my $s=$seq;
68     $s =~ tr/\n \t//d;
69     my ($seqname)=($defline=~m/^(\S+)/);
70     my $seqlen=length($s);
71     $seqlens{$seqname}=$seqlen;
72     if ($seqlen<$minovl) {
73     print TOOSHORT $defline."\n";
74     next;
75     }
76     my @uc=($s =~ m/([A-M,O-W,Y,Z]+)/g);
77     my $valid;
78     foreach my $l (@uc) {
79     if (length($l)>12) { # a valid stretch of at least 12nt would suffice
80     $valid=1;
81     last;
82     }
83     }
84     if ($valid) {
85     print OUTFILE '>'.$defline."\n".$seq."\n";
86     }
87     else {
88     print MASKED $defline."\n";
89     }
90     } #while
91     close(INFILE);close(OUTFILE);close(MASKED);close(TOOSHORT);
92     die "Error: no sequence in this slice!\n" unless $counter;
93     unlink($renfile);
94     }
95     else { # nomasking was requested, but we still need to get the sequence lengths
96     open (INFILE, $file) || die "Cannot open input file '$file'\n";
97     local $/="\n>";
98     while(<INFILE>) {
99     s/^>//;
100     chomp;
101     my ($defline, $seq)=(m/^(.+?)\n(.+)/s);
102     $seq =~ tr/\n \t//d;
103     my ($seqname)=($defline=~m/^(\S+)/);
104     $seqlens{$seqname}=length($seq);
105     }
106     close(INFILE);
107     }
108     # -- for now, we use discontiguous megablast in dual mode
109     my $cmd="megablast -i $file -d $searchdb -p $minpid -m8 -N2 -W11 -t18 -X16 ".
110     "-JF -v9000000 -b9000000 ";
111     #$cmd.=$gapinfo ? ' -D5 ':' -D4 ';
112     $cmd.= ($nomasking) ? ' -FF -UF ' : ' -UT -F "m D" ';
113     #unless ($otherDb) {
114     # $cmd.=' -KT ';
115     # $cmd.= " -k $toskip " if $toskip>0;
116     # }
117     my $slno=sprintf("slice:%09d",$slice_num);
118     my $cmdpipe="($cmd | sed 's/^/STDOUT#:/')".' 2>&1 |';
119     print STDERR ">>$slno: $cmdpipe\n";
120     my $errmsg;
121    
122     my %tlens; #store lengths of target sequences
123    
124     open(OUTFILE, '>'.$mgblast_res) || die ("Error creating result file $mgblast_res\n");
125     open(SPIPE, $cmdpipe) || die ("Error opening pipe $cmd\n");
126     while (<SPIPE>) {
127     if (s/^STDOUT#://) { # stdout
128     next if m/^# /;
129     chomp;
130     my ($qname, $tname, $pid, $alen, $mism, $gaps, $qstart, $qend,
131     $tstart, $tend, $eval, $bitscore)=split(/\t/);
132     next unless $alen>=$minovl;
133     my $qlen=$seqlens{$qname};
134     die("Error: didn't get length for qry $qname\n") unless $qlen>10;
135     my $tlen=$tlens{$tname};
136     unless ($tlen) {
137     $tlen=getDbSeqLen($tname);
138     $tlens{$tname}=$tlen;
139     }
140     my $strand='+';
141     ($strand, $tstart, $tend, $qstart, $qend) =
142     ('-', $tend, $tstart, $qend, $qstart) if ($tstart>$tend);
143     #my ($minq, $maxq)=($qstart>$qend)?($qend,$qstart):($qstart,$qend);
144     #next unless $ovh>$maxovh ? # let's not worry about overhangs for now, use later filtering for that if needed..
145     print OUTFILE join("\t", $qname, $qlen, $qstart, $qend,
146     $tname, $tlen, $tstart, $tend, $pid, int($bitscore), $eval, $strand)."\n";
147     }
148     else {
149     $errmsg.=$_;
150     }
151    
152     }
153    
154     close(SPIPE);
155     close(OUTFILE);
156    
157     print STDERR "<<$slno: done.\n";
158    
159     if ($? || ($errmsg=~/ERROR/i) || ($errmsg=~/Segmentation/i)) {
160     print STDERR "!Error at:\n$cmd\n";
161     print STDERR "$errmsg\n";
162     exit(1);
163     }
164     $cmd="sort -k11,11g -k10,10nr -k9,9nr $mgblast_res | bzip2 -cf > $mgblast_res.bz2";
165     my $r=system($cmd);
166     if ($r && ($r>>8 != 2) ) {
167     die "Error sorting/compressing the results file '$mgblast_res'\n$cmd\n";
168     }
169    
170     unlink($file, $mgblast_res);
171     exit 0;
172    
173     sub getDbSeqLen {
174     my ($tname)=@_;
175     my $r=`cdbyank -a '$tname' $searchdb.cidx`;
176     chomp($r);
177     die ("Error: cannot cdbyank $tname from $searchdb.cidx!\n") unless $r;
178     $r=~s/^[^\n\r]+//s;
179     $r=~tr/\n\r //d;
180     return length($r);
181     }

Properties

Name Value
svn:executable *