ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/fltexonerate
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 2 months ago) by gpertea
File size: 4815 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     use strict;
3     use Getopt::Std;
4    
5     my $usage = q/Usage:
6     exonerate ... --showalignment no --showvulgar no -n 11 --showtargetgff yes\
7     --ryo '%ti\t%qi\tmap_end\t%r\t%ql|%qab-%qae\t%ps\t%g\t%qd\n' \
8     | fltexonerate [-p <minpid>] [-c <mincov%>]
9    
10     This is meant to convert exonerate's custom duplex output (gff + ryo)
11     to a specific gff3 output which includes query coverage (Cov=..) and
12     percent identity (PID=) attributes
13     /;
14     getopts('p:c:') || die($usage."\n");
15    
16     my ($mincov, $minpid)=($Getopt::Std::opt_c, $Getopt::Std::opt_p);
17     $mincov=~tr/%//d;
18     $mincov=1 unless $mincov;
19     $minpid=20 unless $minpid;
20    
21     my ($tgt, $qry, $gstart, $gend, $gscore, $gstrand, $qreg, $gpid, $grank, $gdef, $qProt, $qldiv, $qcov, $qlen);
22     my @exons;
23     my @simdata;
24     my @cds;
25     my @qxgaps;
26     my @aln; #align segs on genomic
27     while (<>) {
28     next if m/^#/ || m/^\*\*/;
29     chomp;
30     my @t=split(/\t/);
31     my $fea=lc($t[2]);
32     if ($fea eq 'gene') {
33     flushgff() if $qry;
34     ($qProt, $qldiv)=($t[1]=~m/protein2/)? (1,3):(0,1);
35     ($qry)=($t[8]=~m/sequence\s+(\S+)/);
36     $tgt=$t[0];
37     ($gstart, $gend, $gscore, $gstrand)=@t[3..6];
38     next;
39     }
40     if ($fea eq 'exon') {
41     push(@exons, [$t[3], $t[4]]);
42     }
43     #elsif ($fea eq 'splice3') {
44     # }
45     #elsif ($fea eq 'splice5') {
46     # }
47     elsif ($fea eq 'cds') {
48     push(@cds, [$t[3], $t[4]]);
49     }
50     elsif ($fea eq 'similarity') {
51     #print STDERR $t[8]."\n";
52     #intervals on query can be parsed here -- get the coverage
53     #-- all ungapped alignment blocks are given here
54     my @ad=($t[8]=~m/; Align (\d+ \d+ \d+)/g); # get all blocks
55     @simdata=map { [(split(/ /))] } @ad ;
56     $qcov=0;
57     my $cs=1;
58     if ($gstrand eq '-') {
59     $cs=-1;
60     map { $_=[$_->[0]-1 ,$_->[1], $_->[2]-2, $_->[2]] } @simdata;
61     }
62     else {
63     map { $_=[$_->[0] ,$_->[1], $_->[2], $_->[2]] } @simdata;
64     }
65     #print STDERR "===> qry intervals:";
66     my $i=0;
67     foreach my $a (@simdata) {
68     my ($s0, $q0, $sal, $qal)=@$a;
69     $a=[$s0, $s0+$cs*$sal-1, $q0*$qldiv, $q0*$qldiv+$qal-1];
70     $qcov+=$a->[3]-$a->[2]+1;
71     $a->[2]=int($a->[2]/$qldiv);
72     $a->[3]=int($a->[3]/$qldiv);
73     #print STDERR " $$a[0]-$$a[1]|$$a[2]-$$a[3] ";
74     #push(@aln, [$s0,$s0+$sal+1,$a->[2], $a->[3]]);
75     if ($i>0) {
76     if ($a->[2]-$simdata[$i-1]->[3]>4) {
77     my @gx=($simdata[$i-1]->[1], $a->[0]);
78     ($gx[0],$gx[1]) = ($gx[1],$gx[0]) if $gx[0]>$gx[1];
79     push(@qxgaps, [@gx]) if $gx[1]-$gx[0]>20;
80     #print STDERR "\nqxgap: ".$gx[0]." - ". $gx[1]."\n";
81     }
82     }
83     $i++;
84     }
85     $qcov/=$qldiv;
86     #print STDERR " ===================== \n";
87     #foreach my $a (@ad) {
88     # my ($s0, $q0, $al)=split(/ /,$a);
89     # $qcov+=$al/$qldiv;
90     # }
91     }
92     elsif ($fea eq 'map_end') {
93     $grank=$t[3];
94     $gpid=$t[5];
95     $gdef='';
96     if ($t[7]) {
97     ($gdef, undef)=split(/[\x01\;]/, $t[7],2);
98     }
99     my $qstrand=$t[6]; #query strand.. always '+' for protein query
100     ($qlen, my $qstart, my $qend)=($t[4]=~m/^(\d+)\|(\d+)\-(\d+)/);
101     if ($qlen) {
102     $qstart++;
103     ($qstart, $qend)=($qend, $qstart) if $qstart>$qend;
104     #$gcov=sprintf('%.1f',(100.00*($qend-$qstart+1))/$qlen );
105     $qreg="$qstart-$qend|$qlen";
106     }
107     }
108     }
109    
110     flushgff() if $qry;
111    
112     sub flushgff {
113     #-- flush buffer
114     my $parent=$qry.'.m'.$grank;
115    
116     my $gffattr='ID='.$parent;
117     $gffattr.=';Score='.$gscore if $gscore;
118     $qcov = ($qlen && $qcov) ? sprintf('%.1f',(100.00*$qcov)/$qlen) : 0;
119     if ($qcov) {
120     goto CLEANUP if $qcov<$mincov;
121     $gffattr.=';Cov='.$qcov if $qcov;
122     }
123     $gffattr.=';qreg='.$qreg if $qreg;
124     if ($gpid) {
125     goto CLEANUP if $gpid<$minpid;
126     $gffattr.=';PID='.$gpid;
127     }
128     else { $gpid='.'; }
129     @exons= sort { $main::a->[0] <=> $main::b->[0] } @exons;
130     @cds= sort { $main::a->[0] <=> $main::b->[0] } @cds;
131     #@simdata = sort { $main::a->[0] <=> $main::b->[0] } @simdata;
132     if (@qxgaps>0) {
133     @qxgaps = sort { $main::a->[0] <=> $main::b->[0] } @qxgaps;
134     my @qxg= map { $_->[0].'-'.$_->[1] } @qxgaps;
135     $gffattr.=';QXGap='.join(',',@qxg);
136     }
137     $gffattr.=';descr="'.$gdef.'"' if $gdef;
138    
139     print join("\t", $tgt, 'exonerate', 'mRNA', $gstart, $gend, $gpid,
140     $gstrand, '.', $gffattr)."\n";
141     my $i=0;
142     foreach my $ed (@exons) {
143     #unless ($simdata[$i]) {
144     # die("Error: undefined simdata($i) for parent=$parent, tgt=$tgt\n");
145     # }
146     #my ($g1, $g2, $q1, $q2)=@{$simdata[$i]};
147     my $attrs='Parent='.$parent;
148     #$attrs.=";galn=$g1-$g2;qreg=$q1-$q2";
149     print join("\t", $tgt, 'exonerate', 'exon', $$ed[0], $$ed[1], '.',
150     $gstrand, '.', $attrs)."\n";
151     $i++;
152     }
153    
154     foreach my $cd (@cds) {
155     print join("\t", $tgt, 'exonerate', 'CDS', $$cd[0], $$cd[1], '.',
156     $gstrand, '.', 'Parent='.$parent)."\n";
157     }
158     CLEANUP:
159     #-- reset buffer
160     ($tgt, $qry, $gstart, $gend, $gscore, $qcov, $gpid, $qlen, $qreg)=('', '', 0,0,0,0,0,0,0);
161     @exons=();
162     @simdata=();
163     @qxgaps=();
164     @cds=();
165     }

Properties

Name Value
svn:executable *