ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/psl2gff
Revision: 110
Committed: Wed Oct 12 17:17:00 2011 UTC (7 years, 11 months ago) by gpertea
File size: 7904 byte(s)
Log Message:
minor script updates

Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 #use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 psl2gff [-p <minpid>] [-c <minqrycoverage>] [-P] <psl_file>
8 Outputs a gff3-formatted version of the Blat's mapping data
9 (generating 'mRNA' and 'exon' tags only)
10 Use -P if the psl comes from protein-to-genome mappings.
11 /;
12 umask 0002;
13 die "$usage\n" if $ARGV[0]=~m/^\-+h/;
14 #----------------------
15 # 0 matches int unsigned , # Number of bases that match that aren't repeats
16 # 1 misMatches int unsigned , # Number of bases that don't match
17 # 2 repMatches int unsigned , # Number of bases that match but are part of repeats
18 # 3 nCount int unsigned , # Number of 'N' bases
19 # 4 qNumInsert int unsigned , # Number of inserts in query
20 # 5 qBaseInsert int unsigned , # Number of bases inserted in query
21 # 6 tNumInsert int unsigned , # Number of inserts in target
22 # 7 tBaseInsert int unsigned , # Number of bases inserted in target
23 # 8 strand char(2) , # + or - for query strand, optionally followed by + or - for target strand
24
25 # 9 qName varchar(255) , # Query sequence name
26
27 # 10 qSize int unsigned , # Query sequence size
28 # 11 qStart int unsigned , # Alignment start position in query
29 # 12 qEnd int unsigned , # Alignment end position in query
30 # 13 tName varchar(255) , # Target sequence name
31 # 14 tSize int unsigned , # Target sequence size
32 # 15 tStart int unsigned , # Alignment start position in target
33 # 16 tEnd int unsigned , # Alignment end position in target
34 # 17 blockCount int unsigned , # Number of blocks in alignment
35 # 18 blockSizes longblob , # Size of each block in a comma separated list
36 # 19 qStarts longblob , # Start of each block in query in a comma separated list
37 # 20 tStarts longblob , # Start of each block in target in a comma separated list
38
39 #Currently the program does not distinguish between matches and repMatches. repMatches is always zero.
40 # how coordinates are handled on the negative strand: In the qStart/qEnd fields the coordinates are
41 # where it matches from the point of view of the forward strand (even when the match is on the reverse strand).
42 # However on the qStarts[] list, the coordinates are reversed.
43
44 getopts('Pp:c:') || die($usage."\n");
45 my $protalign=$Getopt::Std::opt_P;
46 my $minqcov=$Getopt::Std::opt_c;
47 my $minpid=$Getopt::Std::opt_p;
48
49 #==============================================
50
51 my %ctgs; # target_seqname => [$numreads, $ctglen, $min, $max, @readlst]
52 # each elem of rdlst is like this:
53 # [rdname, orientation, len, start_coord, clipL, clipR, seg_gaps,... ]
54 #open (FPSL, $fpsl) || die("Error opening $fpsl file\n");
55 my ($qseq, $tseq, $prevqname, $prevtname);
56 my %qnums; # qryname => num_alignments
57 #$tseq="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
58 my $lines=0;
59 while (<>) {
60 next unless m/^\d+/;
61 $lines++;
62 #next if $lines>2;
63 chomp;
64 my @t=split(/\t/);
65 my $qalnlen=$t[12]-$t[11]+1;
66 my $pid=sprintf('%d', ($t[0]*100.00)/$qalnlen);
67 next if $minpid && $pid<$minpid;
68 #--
69 my ($tstart, $tend)=@t[15,16]; #should be complemented based on $t[8] and $t[14] ?
70 #--
71 my ($strand, $qryname, $qrylen, $ctgname, $ctglen)=@t[8, 9,10, 13,14];
72 my $blcount=$t[17];
73 my @blsizes=split(/\,/,$t[18]);
74 die "Error: block count doesn't match block size list!\n" unless $blcount==@blsizes;
75 my @qrystarts=split(/\,/,$t[19]);
76 die "Error: block count doesn't match block qstart list!\n" unless $blcount==@qrystarts;
77 my @tgtstarts=split(/\,/,$t[20]);
78 die "Error: block count doesn't match block tstart list!\n" unless $blcount==@tgtstarts;
79 $ctgs{$ctgname}->[0]++; #add this read to the "alignment" for this ctg
80 $ctgs{$ctgname}->[1]=$t[14];
81 my ($pmin, $pmax)=($ctgs{$ctgname}->[2], $ctgs{$ctgname}->[3]);
82 $pmin=$t[14] unless $pmin;
83 $pmax=1 unless $pmax;
84 $pmin=$tstart if ($tstart<$pmin);
85 $pmax=$tend if ($tend>$pmax);
86 $ctgs{$ctgname}->[2]=$pmin;
87 $ctgs{$ctgname}->[3]=$pmax;
88 #walk on tgt blocks and merge any blocks closer than 2bases apart
89 my ($mblcount, @mcoords ); # coords: list of [tstart, tend, qstart, qend]
90 my $multiplier = $protalign ? 3 : 1;
91
92 my ($mtstart, $mtend, $mqstart, $mqend) =
93 ($tgtstarts[0]+1, $tgtstarts[0]+$blsizes[0]*$multiplier, $qrystarts[0]+1, $qrystarts[0]+$blsizes[0]);
94 my $qryblsum=0;
95 for (my $i=0;$i<$blcount-1;$i++) {
96 $qryblsum+=$blsizes[$i+1];
97 if ($tgtstarts[$i+1]-$tgtstarts[$i]-$blsizes[$i]*$multiplier <= 2*$multiplier &&
98 $qrystarts[$i+1]-$qrystarts[$i]-$blsizes[$i] <= 2) {
99 #merge with next block
100 #print STDERR "merge occured ($qryname, blocks $i + next!\n";
101 $mtend=$tgtstarts[$i+1]+$blsizes[$i+1]*$multiplier;
102 $mqend=$qrystarts[$i+1]+$blsizes[$i+1]; #print STDERR ".... mqend=$mqend ($qrystarts[$i+1])\n";
103 }
104 else { #no merge, keep the last found block and start a new block
105 push(@mcoords, [$mtstart, $mtend, $mqstart, $mqend]);
106 ($mtstart, $mtend, $mqstart, $mqend) =
107 ($tgtstarts[$i+1]+1, $tgtstarts[$i+1]+$blsizes[$i+1]*$multiplier, $qrystarts[$i+1]+1, $qrystarts[$i+1]+$blsizes[$i+1]);
108 }
109 }
110 next if $minqcov && ($qryblsum*100.00)/$qrylen <= $minqcov;
111 push(@mcoords, [$mtstart, $mtend, $mqstart, $mqend]);
112
113 if ($strand=~/\-/) {
114 $strand=$protalign?'-':'+';
115 #my @revqst;
116 #for (my $i=0;$i<$blcount;$i++) {
117 # push(@revqst, $qrylen-($qrystarts[$i]+$blsizes[$i]));
118 # }
119 #@qrystarts = @revqst;
120 if ($protalign) { #recalculate tgtstart coordinates
121 for (my $i=0;$i<@mcoords; $i++) {
122 ($mcoords[$i]->[0], $mcoords[$i]->[1])=
123 ($ctglen-($mcoords[$i]->[0]-1), $ctglen-($mcoords[$i]->[1]-1));
124 }
125 }
126 else { #recalculate qrystart coordinates
127 for (my $i=0;$i<@mcoords; $i++) {
128 $mcoords[$i]->[2]=$qrylen-($mcoords[$i]->[2]-1);
129 $mcoords[$i]->[3]=$qrylen-($mcoords[$i]->[3]-1);
130 }
131 }
132 }
133 else {
134 $strand='+';
135 }
136
137
138 my $qnum= ++$qnums{$qryname};
139 my $alnid="$qryname.aln$qnum";
140 if ($protalign && $strand eq '-') {
141 #swap the order of segments
142 @mcoords=reverse(@mcoords);
143 foreach my $d (@mcoords) {
144 ($$d[0], $$d[1])=($$d[1], $$d[0]);
145 ($$d[2], $$d[3])=($$d[3], $$d[2]);
146 }
147 }
148 print join("\t",$ctgname, 'psl', 'mRNA', $mcoords[0]->[0], $mcoords[-1]->[1], $pid, $strand,
149 '.', "ID=$alnid;Name=$alnid;qlen=$qrylen;coords=$mcoords[0]->[2]-$mcoords[-1]->[3]")."\n";
150 my $en;
151 foreach my $d (@mcoords) {
152 my ($t1, $t2, $q1, $q2)=@$d;
153 $en++;
154 print join("\t",$ctgname, 'psl', 'exon', $t1, $t2, $pid, $strand,
155 '.', "ID=$alnid.exon$en;Parent=$alnid;coords=$q1-$q2")."\n";
156 }
157 print "###\n";
158
159 # if ($qryname ne $prevqname) {
160 # $qseq=&getSeq($fqry, $qryname);
161 # $prevqname=$qryname;
162 # }
163 # if ($ctgname ne $prevtname) {
164 # $tseq=&getSeq($ftgt, $ctgname);
165 # $prevtname=$ctgname;
166 # }
167 #------ parse and align the blocks
168 #print ">$qryname vs $ctgname [$strand] $blcount aln blocks ($pid\% identity):\n";
169 #for (my $i=0;$i<$blcount;$i++) {
170 # #-- modify values for reverse $strand cases
171 # &showBlockAlign(\$qseq, \$tseq, $strand, $qrystarts[$i], $tgtstarts[$i], $blsizes[$i]);
172 # }
173
174 } #while input lines
175
176
177 #foreach my $ctg (keys(%ctgs)) {
178 # my $d=$ctgs{$ctg};
179 # print STDERR "Contig $ctg has $$d[0] reads, $$d[1] len (covered from $$d[2] to $$d[3])\n";
180 # }
181
182 #=============================================
183
184 sub revcompl { #gets a DNA sequence, returns the reverse complement
185 my $rseq=reverse($_[0]);
186 $rseq =~ tr/AaCcTtGgUuMmRrWwSsYyKkVvHhDdBb/TtGgAaCcAaKkYyWwSsRrMmBbDdHhVv/;
187 return $rseq;
188 }
189
190 sub showBlockAlign {
191 my ($pseq1, $pseq2, $strand, $bstart1, $bstart2, $blen) = @_;
192 my $qseq = ($strand=~/\-/) ? revcompl(substr($$pseq1, $bstart1, $blen)) : substr($$pseq1, $bstart1, $blen);
193 printf("qry %12s: %s\ntgt %12s: %s\n-=-\n",$bstart1+1,$qseq,
194 $bstart2+1,substr($$pseq2, $bstart2, $blen));
195 }

Properties

Name Value
svn:executable *