ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/fltexonerate
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 4 months ago) by gpertea
File size: 4815 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4
5 my $usage = q/Usage:
6 exonerate ... --showalignment no --showvulgar no -n 11 --showtargetgff yes\
7 --ryo '%ti\t%qi\tmap_end\t%r\t%ql|%qab-%qae\t%ps\t%g\t%qd\n' \
8 | fltexonerate [-p <minpid>] [-c <mincov%>]
9
10 This is meant to convert exonerate's custom duplex output (gff + ryo)
11 to a specific gff3 output which includes query coverage (Cov=..) and
12 percent identity (PID=) attributes
13 /;
14 getopts('p:c:') || die($usage."\n");
15
16 my ($mincov, $minpid)=($Getopt::Std::opt_c, $Getopt::Std::opt_p);
17 $mincov=~tr/%//d;
18 $mincov=1 unless $mincov;
19 $minpid=20 unless $minpid;
20
21 my ($tgt, $qry, $gstart, $gend, $gscore, $gstrand, $qreg, $gpid, $grank, $gdef, $qProt, $qldiv, $qcov, $qlen);
22 my @exons;
23 my @simdata;
24 my @cds;
25 my @qxgaps;
26 my @aln; #align segs on genomic
27 while (<>) {
28 next if m/^#/ || m/^\*\*/;
29 chomp;
30 my @t=split(/\t/);
31 my $fea=lc($t[2]);
32 if ($fea eq 'gene') {
33 flushgff() if $qry;
34 ($qProt, $qldiv)=($t[1]=~m/protein2/)? (1,3):(0,1);
35 ($qry)=($t[8]=~m/sequence\s+(\S+)/);
36 $tgt=$t[0];
37 ($gstart, $gend, $gscore, $gstrand)=@t[3..6];
38 next;
39 }
40 if ($fea eq 'exon') {
41 push(@exons, [$t[3], $t[4]]);
42 }
43 #elsif ($fea eq 'splice3') {
44 # }
45 #elsif ($fea eq 'splice5') {
46 # }
47 elsif ($fea eq 'cds') {
48 push(@cds, [$t[3], $t[4]]);
49 }
50 elsif ($fea eq 'similarity') {
51 #print STDERR $t[8]."\n";
52 #intervals on query can be parsed here -- get the coverage
53 #-- all ungapped alignment blocks are given here
54 my @ad=($t[8]=~m/; Align (\d+ \d+ \d+)/g); # get all blocks
55 @simdata=map { [(split(/ /))] } @ad ;
56 $qcov=0;
57 my $cs=1;
58 if ($gstrand eq '-') {
59 $cs=-1;
60 map { $_=[$_->[0]-1 ,$_->[1], $_->[2]-2, $_->[2]] } @simdata;
61 }
62 else {
63 map { $_=[$_->[0] ,$_->[1], $_->[2], $_->[2]] } @simdata;
64 }
65 #print STDERR "===> qry intervals:";
66 my $i=0;
67 foreach my $a (@simdata) {
68 my ($s0, $q0, $sal, $qal)=@$a;
69 $a=[$s0, $s0+$cs*$sal-1, $q0*$qldiv, $q0*$qldiv+$qal-1];
70 $qcov+=$a->[3]-$a->[2]+1;
71 $a->[2]=int($a->[2]/$qldiv);
72 $a->[3]=int($a->[3]/$qldiv);
73 #print STDERR " $$a[0]-$$a[1]|$$a[2]-$$a[3] ";
74 #push(@aln, [$s0,$s0+$sal+1,$a->[2], $a->[3]]);
75 if ($i>0) {
76 if ($a->[2]-$simdata[$i-1]->[3]>4) {
77 my @gx=($simdata[$i-1]->[1], $a->[0]);
78 ($gx[0],$gx[1]) = ($gx[1],$gx[0]) if $gx[0]>$gx[1];
79 push(@qxgaps, [@gx]) if $gx[1]-$gx[0]>20;
80 #print STDERR "\nqxgap: ".$gx[0]." - ". $gx[1]."\n";
81 }
82 }
83 $i++;
84 }
85 $qcov/=$qldiv;
86 #print STDERR " ===================== \n";
87 #foreach my $a (@ad) {
88 # my ($s0, $q0, $al)=split(/ /,$a);
89 # $qcov+=$al/$qldiv;
90 # }
91 }
92 elsif ($fea eq 'map_end') {
93 $grank=$t[3];
94 $gpid=$t[5];
95 $gdef='';
96 if ($t[7]) {
97 ($gdef, undef)=split(/[\x01\;]/, $t[7],2);
98 }
99 my $qstrand=$t[6]; #query strand.. always '+' for protein query
100 ($qlen, my $qstart, my $qend)=($t[4]=~m/^(\d+)\|(\d+)\-(\d+)/);
101 if ($qlen) {
102 $qstart++;
103 ($qstart, $qend)=($qend, $qstart) if $qstart>$qend;
104 #$gcov=sprintf('%.1f',(100.00*($qend-$qstart+1))/$qlen );
105 $qreg="$qstart-$qend|$qlen";
106 }
107 }
108 }
109
110 flushgff() if $qry;
111
112 sub flushgff {
113 #-- flush buffer
114 my $parent=$qry.'.m'.$grank;
115
116 my $gffattr='ID='.$parent;
117 $gffattr.=';Score='.$gscore if $gscore;
118 $qcov = ($qlen && $qcov) ? sprintf('%.1f',(100.00*$qcov)/$qlen) : 0;
119 if ($qcov) {
120 goto CLEANUP if $qcov<$mincov;
121 $gffattr.=';Cov='.$qcov if $qcov;
122 }
123 $gffattr.=';qreg='.$qreg if $qreg;
124 if ($gpid) {
125 goto CLEANUP if $gpid<$minpid;
126 $gffattr.=';PID='.$gpid;
127 }
128 else { $gpid='.'; }
129 @exons= sort { $main::a->[0] <=> $main::b->[0] } @exons;
130 @cds= sort { $main::a->[0] <=> $main::b->[0] } @cds;
131 #@simdata = sort { $main::a->[0] <=> $main::b->[0] } @simdata;
132 if (@qxgaps>0) {
133 @qxgaps = sort { $main::a->[0] <=> $main::b->[0] } @qxgaps;
134 my @qxg= map { $_->[0].'-'.$_->[1] } @qxgaps;
135 $gffattr.=';QXGap='.join(',',@qxg);
136 }
137 $gffattr.=';descr="'.$gdef.'"' if $gdef;
138
139 print join("\t", $tgt, 'exonerate', 'mRNA', $gstart, $gend, $gpid,
140 $gstrand, '.', $gffattr)."\n";
141 my $i=0;
142 foreach my $ed (@exons) {
143 #unless ($simdata[$i]) {
144 # die("Error: undefined simdata($i) for parent=$parent, tgt=$tgt\n");
145 # }
146 #my ($g1, $g2, $q1, $q2)=@{$simdata[$i]};
147 my $attrs='Parent='.$parent;
148 #$attrs.=";galn=$g1-$g2;qreg=$q1-$q2";
149 print join("\t", $tgt, 'exonerate', 'exon', $$ed[0], $$ed[1], '.',
150 $gstrand, '.', $attrs)."\n";
151 $i++;
152 }
153
154 foreach my $cd (@cds) {
155 print join("\t", $tgt, 'exonerate', 'CDS', $$cd[0], $$cd[1], '.',
156 $gstrand, '.', 'Parent='.$parent)."\n";
157 }
158 CLEANUP:
159 #-- reset buffer
160 ($tgt, $qry, $gstart, $gend, $gscore, $qcov, $gpid, $qlen, $qreg)=('', '', 0,0,0,0,0,0,0);
161 @exons=();
162 @simdata=();
163 @qxgaps=();
164 @cds=();
165 }

Properties

Name Value
svn:executable *