1 |
#!/usr/bin/perl |
2 |
use strict; |
3 |
use Getopt::Std; |
4 |
|
5 |
my $usage = q/Usage: |
6 |
exonerate ... --showalignment no --showvulgar no -n 11 --showtargetgff yes\ |
7 |
--ryo '%ti\t%qi\tmap_end\t%r\t%ql|%qab-%qae\t%ps\t%g\t%qd\n' \ |
8 |
| fltexonerate [-p <minpid>] [-c <mincov%>] |
9 |
|
10 |
This is meant to convert exonerate's custom duplex output (gff + ryo) |
11 |
to a specific gff3 output which includes query coverage (Cov=..) and |
12 |
percent identity (PID=) attributes |
13 |
/; |
14 |
getopts('p:c:') || die($usage."\n"); |
15 |
|
16 |
my ($mincov, $minpid)=($Getopt::Std::opt_c, $Getopt::Std::opt_p); |
17 |
$mincov=~tr/%//d; |
18 |
$mincov=1 unless $mincov; |
19 |
$minpid=20 unless $minpid; |
20 |
|
21 |
my ($tgt, $qry, $gstart, $gend, $gscore, $gstrand, $qreg, $gpid, $grank, $gdef, $qProt, $qldiv, $qcov, $qlen); |
22 |
my @exons; |
23 |
my @simdata; |
24 |
my @cds; |
25 |
my @qxgaps; |
26 |
my @aln; #align segs on genomic |
27 |
while (<>) { |
28 |
next if m/^#/ || m/^\*\*/; |
29 |
chomp; |
30 |
my @t=split(/\t/); |
31 |
my $fea=lc($t[2]); |
32 |
if ($fea eq 'gene') { |
33 |
flushgff() if $qry; |
34 |
($qProt, $qldiv)=($t[1]=~m/protein2/)? (1,3):(0,1); |
35 |
($qry)=($t[8]=~m/sequence\s+(\S+)/); |
36 |
$tgt=$t[0]; |
37 |
($gstart, $gend, $gscore, $gstrand)=@t[3..6]; |
38 |
next; |
39 |
} |
40 |
if ($fea eq 'exon') { |
41 |
push(@exons, [$t[3], $t[4]]); |
42 |
} |
43 |
#elsif ($fea eq 'splice3') { |
44 |
# } |
45 |
#elsif ($fea eq 'splice5') { |
46 |
# } |
47 |
elsif ($fea eq 'cds') { |
48 |
push(@cds, [$t[3], $t[4]]); |
49 |
} |
50 |
elsif ($fea eq 'similarity') { |
51 |
#print STDERR $t[8]."\n"; |
52 |
#intervals on query can be parsed here -- get the coverage |
53 |
#-- all ungapped alignment blocks are given here |
54 |
my @ad=($t[8]=~m/; Align (\d+ \d+ \d+)/g); # get all blocks |
55 |
@simdata=map { [(split(/ /))] } @ad ; |
56 |
$qcov=0; |
57 |
my $cs=1; |
58 |
if ($gstrand eq '-') { |
59 |
$cs=-1; |
60 |
map { $_=[$_->[0]-1 ,$_->[1], $_->[2]-2, $_->[2]] } @simdata; |
61 |
} |
62 |
else { |
63 |
map { $_=[$_->[0] ,$_->[1], $_->[2], $_->[2]] } @simdata; |
64 |
} |
65 |
#print STDERR "===> qry intervals:"; |
66 |
my $i=0; |
67 |
foreach my $a (@simdata) { |
68 |
my ($s0, $q0, $sal, $qal)=@$a; |
69 |
$a=[$s0, $s0+$cs*$sal-1, $q0*$qldiv, $q0*$qldiv+$qal-1]; |
70 |
$qcov+=$a->[3]-$a->[2]+1; |
71 |
$a->[2]=int($a->[2]/$qldiv); |
72 |
$a->[3]=int($a->[3]/$qldiv); |
73 |
#print STDERR " $$a[0]-$$a[1]|$$a[2]-$$a[3] "; |
74 |
#push(@aln, [$s0,$s0+$sal+1,$a->[2], $a->[3]]); |
75 |
if ($i>0) { |
76 |
if ($a->[2]-$simdata[$i-1]->[3]>4) { |
77 |
my @gx=($simdata[$i-1]->[1], $a->[0]); |
78 |
($gx[0],$gx[1]) = ($gx[1],$gx[0]) if $gx[0]>$gx[1]; |
79 |
push(@qxgaps, [@gx]) if $gx[1]-$gx[0]>20; |
80 |
#print STDERR "\nqxgap: ".$gx[0]." - ". $gx[1]."\n"; |
81 |
} |
82 |
} |
83 |
$i++; |
84 |
} |
85 |
$qcov/=$qldiv; |
86 |
#print STDERR " ===================== \n"; |
87 |
#foreach my $a (@ad) { |
88 |
# my ($s0, $q0, $al)=split(/ /,$a); |
89 |
# $qcov+=$al/$qldiv; |
90 |
# } |
91 |
} |
92 |
elsif ($fea eq 'map_end') { |
93 |
$grank=$t[3]; |
94 |
$gpid=$t[5]; |
95 |
$gdef=''; |
96 |
if ($t[7]) { |
97 |
($gdef, undef)=split(/[\x01\;]/, $t[7],2); |
98 |
} |
99 |
my $qstrand=$t[6]; #query strand.. always '+' for protein query |
100 |
($qlen, my $qstart, my $qend)=($t[4]=~m/^(\d+)\|(\d+)\-(\d+)/); |
101 |
if ($qlen) { |
102 |
$qstart++; |
103 |
($qstart, $qend)=($qend, $qstart) if $qstart>$qend; |
104 |
#$gcov=sprintf('%.1f',(100.00*($qend-$qstart+1))/$qlen ); |
105 |
$qreg="$qstart-$qend|$qlen"; |
106 |
} |
107 |
} |
108 |
} |
109 |
|
110 |
flushgff() if $qry; |
111 |
|
112 |
sub flushgff { |
113 |
#-- flush buffer |
114 |
my $parent=$qry.'.m'.$grank; |
115 |
|
116 |
my $gffattr='ID='.$parent; |
117 |
$gffattr.=';Score='.$gscore if $gscore; |
118 |
$qcov = ($qlen && $qcov) ? sprintf('%.1f',(100.00*$qcov)/$qlen) : 0; |
119 |
if ($qcov) { |
120 |
goto CLEANUP if $qcov<$mincov; |
121 |
$gffattr.=';Cov='.$qcov if $qcov; |
122 |
} |
123 |
$gffattr.=';qreg='.$qreg if $qreg; |
124 |
if ($gpid) { |
125 |
goto CLEANUP if $gpid<$minpid; |
126 |
$gffattr.=';PID='.$gpid; |
127 |
} |
128 |
else { $gpid='.'; } |
129 |
@exons= sort { $main::a->[0] <=> $main::b->[0] } @exons; |
130 |
@cds= sort { $main::a->[0] <=> $main::b->[0] } @cds; |
131 |
#@simdata = sort { $main::a->[0] <=> $main::b->[0] } @simdata; |
132 |
if (@qxgaps>0) { |
133 |
@qxgaps = sort { $main::a->[0] <=> $main::b->[0] } @qxgaps; |
134 |
my @qxg= map { $_->[0].'-'.$_->[1] } @qxgaps; |
135 |
$gffattr.=';QXGap='.join(',',@qxg); |
136 |
} |
137 |
$gffattr.=';descr="'.$gdef.'"' if $gdef; |
138 |
|
139 |
print join("\t", $tgt, 'exonerate', 'mRNA', $gstart, $gend, $gpid, |
140 |
$gstrand, '.', $gffattr)."\n"; |
141 |
my $i=0; |
142 |
foreach my $ed (@exons) { |
143 |
#unless ($simdata[$i]) { |
144 |
# die("Error: undefined simdata($i) for parent=$parent, tgt=$tgt\n"); |
145 |
# } |
146 |
#my ($g1, $g2, $q1, $q2)=@{$simdata[$i]}; |
147 |
my $attrs='Parent='.$parent; |
148 |
#$attrs.=";galn=$g1-$g2;qreg=$q1-$q2"; |
149 |
print join("\t", $tgt, 'exonerate', 'exon', $$ed[0], $$ed[1], '.', |
150 |
$gstrand, '.', $attrs)."\n"; |
151 |
$i++; |
152 |
} |
153 |
|
154 |
foreach my $cd (@cds) { |
155 |
print join("\t", $tgt, 'exonerate', 'CDS', $$cd[0], $$cd[1], '.', |
156 |
$gstrand, '.', 'Parent='.$parent)."\n"; |
157 |
} |
158 |
CLEANUP: |
159 |
#-- reset buffer |
160 |
($tgt, $qry, $gstart, $gend, $gscore, $qcov, $gpid, $qlen, $qreg)=('', '', 0,0,0,0,0,0,0); |
161 |
@exons=(); |
162 |
@simdata=(); |
163 |
@qxgaps=(); |
164 |
@cds=(); |
165 |
} |