ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/sim4gff
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 3753 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4
5 my $usage = q/Usage:
6 sim4cc qfile dbfile | sim4gff [-p <minpid>] [-C] [-c <mincov>] [-f {exon|CDS}]
7
8 This is meant to work only on the default (A=0) sim4cc output,
9 preferrably on the fly (pipe sim4cc into this script).
10 Expects the query fasta file to be present so the
11 definition line can be extracted.
12
13 Option -C enforces the assumption that the query file is a CDS\/mRNA which is already
14 properly oriented (forward strand, and this is used to determine the feature strand
15 on the genomic sequence (necessary for one-exon features).
16 /;
17 umask 0002;
18 getopts('Cf:c:p:') || die($usage."\n");
19 my ($mincov, $minpid, $outfeat, $coding)=($Getopt::Std::opt_c, $Getopt::Std::opt_p,
20 $Getopt::Std::opt_f, $Getopt::Std::opt_C);
21
22 $mincov=1 unless $mincov;
23 $minpid=20 unless $minpid;
24 my $numaln=1; # suffix for each alignment
25
26 my ($qfile, $qname, $qlen, $sfile, $sname, $slen, $strand);
27 my @ex; # [start, end, pid, tstart, tend]
28 my $complement; # target aligned as rev complement
29 $outfeat='exon' unless $outfeat;
30 while (<>) {
31 if (m/^seq1 = (\S+)\,\s+(\d+)\s+bp/) {
32 my ($file, $ql)=($1,$2);
33 writeprev() if @ex>0;
34 ($qfile, $qlen)=($file, $ql);
35 $_=<>;
36 ($sname, $slen) = (m/seq2 = \S+ \(([^\)]+)\)\, (\d+) bp/);
37 die ("Error parsing seq2 info!\n") unless $sname && $slen>5;
38 next;
39 }
40 chomp;
41 next if length($_)<3;
42 if (m/^\(complement\)/) {
43 $complement=1;
44 $strand='-' if $coding;
45 next;
46 }
47 if (m/^(\d+)\-(\d+)\s+\((\d+)\-(\d+)\)\s+(\d+)\%\s*([\<\-\>]*)/) {
48 my ($t1,$t2, $s1, $s2, $pid, $arrow)=($1,$2,$3,$4,$5,$6);
49 if ($arrow && !$strand) {
50 $strand=($arrow eq '<-') ? '-' : '+';
51 }
52 push(@ex, [$s1, $s2, $pid, $t1, $t2]);
53 }
54 }
55
56 writeprev() if @ex>0;
57
58 #----------------------------
59 sub writeprev {
60 #try to open the qfile to get the defline
61 my ($qdesc, $gname);
62 unless($strand) {
63 #deciding the "coding" strand for single-exon mappings is tricky,
64 #for RefSeq the default should be:
65 $strand = $complement ? '-' : '+';
66 }
67 my ($cds_start, $cds_end);
68 if (open(QFILE, $qfile)) {
69 my $ql=<QFILE>;
70 ($qname, $qdesc)=($ql=~m/^>(\S+)\s*(.*)/);
71 die ("Cannot parse qry name from first line of $qfile :\n$ql\n") unless $qname;
72 #see if we can pull the CDS to run the alignment again
73 #if ($qdesc=~m/CDS[:=](\d+)\-(\d+)/) {
74 # ($cds_start, $cds_end)=($1,$2);
75 # }
76 if ($qdesc=~/gene[:=](\S+)/) {
77 $gname=$1;
78 }
79 close(QFILE);
80 }
81 # compute overall pid & coverage
82 my ($qcov, $qmatch)=0;
83 my ($smin, $smax)=($slen,1);
84 foreach my $x (@ex) {
85 my ($s1, $s2, $pid, $t1, $t2)=@$x;
86 $qcov += $t2-$t1+1;
87 $qmatch += (($t2-$t1+1)*$pid)/100;
88 $smin=$s1 if $s1<$smin;
89 $smax=$s2 if $s2>$smax;
90 }
91 my $qscore=int($qmatch);
92 my $qpid=sprintf('%d',($qmatch*100)/$qcov); # % matches
93 $qcov=sprintf('%d', ($qcov*100)/$qlen); # % query coverage
94 my $qid=$qname.'.sim'.$numaln;
95 $numaln++;
96 if ($qpid>=$minpid && $qcov>=$mincov) {
97 #--- write gff data:
98 #my $tsense=($complement && ($strand ne '-')) ? '-' : '+';
99 my $tsense=$complement ? '-' : '+';
100 $gname=$qname unless $gname;
101 my $gffattr="ID=$qid;Name=$gname;Cov=$qcov;PID=$qpid";
102 $gffattr.=";Score=$qscore" if $qscore;
103 $gffattr.=";Info=\"$qdesc\"" if $qdesc;
104 print join("\t", $sname, 'sim4cc', 'mRNA', $smin, $smax, $qpid, $strand, '.',
105 $gffattr)."\n";
106 foreach my $x (@ex) {
107 my ($s1, $s2, $pid, $t1, $t2)=@$x;
108 print join("\t", $sname, 'sim4cc', $outfeat, $s1, $s2, $pid, $strand, '.',
109 "Parent=$qid;Target=$qname $t1 $t2 $tsense")."\n";
110
111 }
112 }
113 # -- reset vars
114 $qlen=0;
115 ($qname, $qfile, $sname, $slen, $strand, $complement)=
116 (undef, undef, undef, undef, undef, 0);
117 @ex=();
118 $complement=0;
119 }

Properties

Name Value
svn:executable *