ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/btab2cov.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 2 months ago) by gpertea
File size: 5922 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     use strict;
3     use Getopt::Std;
4     #use FindBin;use lib $FindBin::Bin;
5    
6     my $usage = q/Usage:
7     btab2cov.pl [-T] [-p <min_ps>] [-c <min_cov>] [-n <max_hits>] <input_btab>
8    
9     WARNING: it assumes that the input btatb is sorted by <qry_name>!
10    
11     It will consider all overlaps of at least <min_ps> percent similarity
12     (default: 70) to assess the coverage of the query sequences
13    
14     Outputs coverage information like this, for any query sequence with
15     an overall percentual coverage of at least <min_cov>% (default 50):
16    
17     <qry_seq> <strand> <qry_cov%> <top5hits>
18    
19     ..where:
20     <qry_cov%> is the percentual coverage of <qry_seq> length;
21     <top5hits> is a comma delimited list of max. 5 subj. sequences with the best
22     overall alignment scores
23    
24     /;
25     #umask 0002;
26     getopts('Tn:p:c:') || die($usage."\n");
27     my $mincov=$Getopt::Std::opt_c || 50;
28     my $minpsim=$Getopt::Std::opt_p || 70;
29     my $tophits=$Getopt::Std::opt_n;
30     my $mgtab=$Getopt::Std::opt_T;
31     my @qhp; #HSPs on positive strand: list of [ql, qr, sname]
32     my @qhm; #HSPs on negative strand:list of [ql, qr, sname]
33     my @hitsp; # list of [score, sname] for positive strand;
34     my @hitsm; # list of [score, sname] for negative strand;
35     my %topm; # names of top n hits on plus strand
36     my %topp; # names of top n hits on minus strand
37     my ($curquery, $curlen);
38     if ($mgtab) { #mgblast TAB format
39     while (<>) {
40     next if m/^\s*#/;
41     chomp;
42     # 0 1 2 3 4 5 6 7 8 9
43     my ($qname, $qlen, $q_5, $q_3, $sname, $slen, $s_5, $s_3, $pid, $score,
44     # 10 11 12 13
45     $e_val, $strand, $gapdataq, $gapdatas)=split(/\t/);
46     if ($curquery ne $qname) {
47     flushQ() if $curquery;
48     $curquery=$qname;
49     $curlen=$qlen;
50     @qhp=();
51     @qhm=();
52     @hitsp=();
53     @hitsm=();
54     }
55     next unless $pid>=$minpsim;
56     if ($strand eq '-') {
57     #minus strand:
58     push(@hitsm, [$score, $sname]);
59     ($q_5, $q_3)=($q_3,$q_5) if $q_5>$q_3;
60     push(@qhm, [$q_5, $q_3, $sname]);
61     }
62     else {
63     #plus strand:
64     push(@hitsp, [$score, $sname]);
65     ($q_5, $q_3)=($q_3,$q_5) if $q_5>$q_3;
66     push(@qhp, [$q_5, $q_3, $sname]);
67     }
68     } #while readline
69     } #mgblast TAB format
70    
71     else { # BTAB format
72    
73     while (<>) {
74     next if m/^\s*#/;
75     chomp;
76     # 0 1 2 3 4 5 6 7 8 9
77     my ($qname, $date, $qlen, $method, $db, $sname, $q_5, $q_3, $s_5, $s_3,
78     # 10 11 12 13 14 15 16 17 18
79     $pid, $psim, $score, $fofsb, $fofse, $sdescr, $frame, $strand, $slen,
80     # 19 20 21
81     $e_val, $scov, $hsps)=split(/\t/);
82     if ($curquery ne $qname) {
83     flushQ() if $curquery;
84     $curquery=$qname;
85     $curlen=$qlen;
86     @qhp=();
87     @qhm=();
88     @hitsp=();
89     @hitsm=();
90     }
91     my @hsp=split(/\~/,$hsps);
92     if ($strand eq '-' || lc($strand) eq 'minus') {
93     #minus strand:
94     push(@hitsm, [$score, $sname]);
95     foreach my $h (@hsp) {
96     my ($q5,$q3, $h5, $h3, $p)=($h=~/(\d+)\-(\d+)\:(\d+)\-(\d+)\|([\d\.]+)/);
97     next unless $p>=$minpsim;
98     die("Error parsing segment data $h for btab line:\n$_\n") unless $p>1;
99     ($q5, $q3)=($q3,$q5) if $q5>$q3;
100     push(@qhm, [$q5, $q3, $sname]);
101     }
102     }
103     else {
104     #plus strand:
105     push(@hitsp, [$score, $sname]);
106     foreach my $h (@hsp) {
107     my ($q5,$q3, $h5, $h3, $p)=($h=~/(\d+)\-(\d+)\:(\d+)\-(\d+)\|([\d\.]+)/);
108     next unless $p>=$minpsim;
109     die("Error parsing segment data $h for btab line:\n$_\n") unless $p>1;
110     #($q5, $q3)=($q3,$q5) if $q5>$q3;
111     push(@qhp, [$q5, $q3, $sname]);
112     }
113     }
114     }
115     } #BTAB case
116    
117     flushQ() if $curquery;
118    
119     sub flushQ {
120     #compute coverage per strand
121     my ($cp, $cm)=(0,0);
122     my $rhits; #points to either @hitsp or @hitsm
123     undef %topp;
124     undef %topm;
125     if (@hitsp) {
126     @hitsp = sort { $main::b->[0]<=>$main::a->[0] } @hitsp; #sort hits by score
127     if ($tophits) {
128     foreach my $hd (@hitsp) {
129     $topp{$hd->[1]}=1;
130     last if keys(%topp)>=$tophits;
131     }
132     }
133     }
134     if (@hitsm) {
135     @hitsm = sort { $main::b->[0]<=>$main::a->[0] } @hitsm;
136     if ($tophits) {
137     foreach my $hd (@hitsm) {
138     $topm{$hd->[1]}=1;
139     last if keys(%topm)>=$tophits;
140     }
141     }
142     }
143     if (@qhp) {
144     if ($tophits) { @qhp = grep { exists($topp{$_->[2]}) } @qhp };
145     #compute coverage on positive strand
146     $cp=getCov($curlen, \@qhp);
147     }
148     if (@qhm) {
149     if ($tophits) { @qhm = grep { exists($topm{$_->[2]}) } @qhm };
150     #compute coverage on negative strand
151     $cm=getCov($curlen, \@qhm);
152     }
153     if ($cm>=$mincov) {
154     my @h;
155     foreach my $p (@hitsm) { if ($tophits) {
156     push(@h, $$p[1]) if exists($topm{$$p[1]});
157     } else { push(@h, $$p[1]); }
158     last if @h>=5;
159     }
160     print join("\t",$curquery, '-', $cm, join(',',@h))."\n";
161     }
162     if ($cp>=$mincov) {
163     my @h;
164     foreach my $p (@hitsp) {
165     if ($tophits) {
166     push(@h, $$p[1]) if exists($topp{$$p[1]});
167     }
168     else {
169     push(@h, $$p[1]);
170     }
171     last if @h>=5;
172     }
173     print join("\t",$curquery, '+', $cp, join(',',@h))."\n";
174     }
175     }
176    
177    
178     sub getCov {
179     my ($len, $r)=@_;
180     my @a=sort { $main::a->[0]<=>$main::b->[0] } @$r;
181     my $wasovl=1;
182     my @m = map { [$_->[0], $_->[1]] } @a;
183     WASOVL: while ($wasovl) {
184     $wasovl=0;
185     for (my $i=0;$i<@m-1;$i++) {
186     for (my $j=$i+1;$j<@m;$j++) {
187     next if $i==$j;
188     my ($l1, $r1)=($m[$i]->[0], $m[$i]->[1]);
189     my ($l2, $r2)=($m[$j]->[0], $m[$j]->[1]);
190     #we know that l2>=l1 for sure
191     if ($l2<=$r1) {
192     #intersection detected;
193     $wasovl=1;
194     $m[$i]->[1] = ($r1>$r2)?$r1:$r2; #max right coord
195     splice(@m, $j);
196     next WASOVL;
197     } #intersection
198     } #j
199     } #i
200     }#while wasovl
201     my $cov=0;
202     map { $cov+=$_->[1]-$_->[0]+1 } @m;
203     return sprintf('%d',($cov*100.00)/$len);
204     }

Properties

Name Value
svn:executable *