ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/fltgff4gv
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 5026 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4
5 my $usage = q/Usage:
6 fltgff4gv [-t <outtrackname>] [-o <outgff>] [-i <max_intron>] [-b <out_bookmarks>]
7 [-I] <gmap_or_pmap_gff3..>
8
9 Use the -I option to preserve the full original IDs found in
10 the input gff stream.
11
12 /;
13 my %gdup; #prevents the duplicate mappings in the input..
14 umask 0002;
15 getopts('i:It:o:b:') || die($usage."\n");
16 my $track=$Getopt::Std::opt_t;
17 my $fullid=$Getopt::Std::opt_I;
18 my $bfile=$Getopt::Std::opt_b;
19 my $maxintron=$Getopt::Std::opt_i || 320000;
20 if ($ARGV[0] && (-s $ARGV[0] < 60)) {
21 exit 0;
22 }
23 if ($bfile) {
24 open(BFILE, '>'.$bfile) || die ("Error creating bookmark file $bfile!\n");
25 }
26
27 my $outfile=$Getopt::Std::opt_o;
28 if ($outfile) {
29 open(OUTFILE, '>'.$outfile) || die ("Error creating file $outfile!\n");
30 select(OUTFILE);
31 }
32
33 my $skiprec=0;
34 my $parentID='';
35 my @mbuf; #buffer mRNA lines so we can filter out some
36 my @mbex; #buffered mRNA exons
37 while (<>) {
38 next if m/^\s*#/;
39 chomp;
40 my ($chr, $tr, $ftype, $fstart, $fend, $fscore,
41 $fstrand, $frame, $ndata)=split(/\t/);
42 $tr=$track if $track;
43 if ($ftype eq 'mRNA') {
44 flushmbuf();
45 my ($info)=($ndata=~m/Info="([^"]+)/i);
46 unless ($info) {
47 ($info)=($ndata=~m/TopHit="([^"]+)/i);
48 unless ($info) {
49 ($info)=($ndata=~m/Descr="([^"]+)/i);
50 }
51 if ($info) {
52 my @nri=split(/\x01/, $info);
53 $info=$nri[0] if (@nri>1);
54 }
55 }
56 my ($cov)=($ndata=~m/Cov=([\d\.]+)/i);
57 my ($pi)=($ndata=~m/Identity=([\d\.]+)/i);
58 my ($id)=($ndata=~m/ID=([^;]+)/);
59 my ($gffname)=($ndata=~m/Name=([^;]+)/);
60 my ($gid, $xid, $name);
61 if (m/GeneId=([\w\.\-]+)/) {
62 $gid=$1;
63 }
64 elsif ($tr=~/jigsaw/ && $id ne $gffname) {
65 $gid=$gffname;
66 }
67 if ($fullid) {
68 $xid=$id;
69 $name=$id;
70 #$info=~s/^\w+\|(\w+)/$1/;
71 $info=~s/gid\://;
72 $info=~s/CDS:\d+\-\d+\s*//;
73 }
74 else {
75 if ($id=~m/^UP/) {
76 if ($id=~m/\|gid\|([\w\-]+)/) {
77 $gid=$1;
78 }
79 else { ($gid)=($id=~m/^UP\w*\|(\w+)/) }
80 $info=~s/^UniRef\w+\s*//;
81 }
82 else {
83 if ($info=~s/gid:(\S+)\s*//) {
84 $gid=$1;
85 }
86 $info=~s/CDS:\d+\-\d+\s*//;
87 }
88 my ($suffixnum)=($id=~m/\.([pbmrnag]*\d+)$/);
89 my ($num)=($suffixnum=~m/(\d+)$/);
90 my ($suffix)=($suffixnum=~m/^([^\d]+)/);
91 #$suffix='m' unless $suffix;
92 $suffix='m';
93 $num=1 unless $num;
94 ($xid)=($id=~m/^\w+\|([\w]+)/);
95 if ($xid) {
96 $xid=$tr.'|'.$xid.'.'.$suffix.$num;
97 }
98 else { #jigsaw, or anything simple
99 #$xid=$tr.'|'.$id;
100 $xid=$id;
101 }
102 $name=$gid || $xid;
103 }
104
105 #skip duplicates: same ID, track and same start-end coords
106 $skiprec = ($gdup{"$xid.$tr.$fstart.$fend"}++);
107 # $gdup{"$xid.$fstart.$fend"}++;
108 if ($skiprec) {
109 #print "skipping record $xid.$tr.$fstart.$fend\n";
110 $parentID='';
111 next;
112 }
113 my $xinfo;
114 $xinfo='cov:'.$cov.'%' if $cov;
115 $xinfo.=', ident:'.$pi.'%' if $pi;
116 $xinfo=~s/^\, //;
117 $info.=' ('.$xinfo.')' if $xinfo;
118 if ($tr =~ m/jigsaw/i) {
119 $xid.='|'.$gid if $gid;
120 $ndata='ID='.$xid.';Name='.$name.';info="'.$info.'"';
121 }
122 else {
123 my $s=(index(uc($info), uc($name))>=0)? $info : $name.' '.$info;
124 $ndata='ID='.$xid.';Name='.$name.';info="'.$s.'"';
125 }
126 $parentID=$xid;
127 if ($bfile) {
128 my $bid=$xid;
129 $bid.="|$gid" if $gid && $xid!~m/\Q|$gid/;
130 print BFILE join("\t",$bid,$fstart)."\n";
131 }
132 } #mRNA line processing until here
133 elsif ($ftype eq 'exon' || $ftype eq 'CDS') {
134 next if $skiprec;
135 next unless $parentID;
136 # my ($pid)=($ndata=~m/Parent=([^;]+)/);
137 # my $pxid;
138 #if ($fullid) {
139 # $pxid=$pid;
140 # }
141 # else {
142 # ($pxid)=($pid=~m/^\w+\|([\w]+)/);
143 # if ($pxid) {
144 # my ($pnum)=($pid=~m/\.mrna(\d+)$/);
145 # $pxid.='.m'.$pnum;
146 # }
147 # else { $pxid=$pid; }
148 # $ndata='Parent='.$pxid;
149 # }
150 $ndata='Parent='.$parentID;
151 push(@mbex, ($fend<$fstart) ? [ $fend, $fstart ] : [$fstart, $fend] );
152 }
153 else { #next; keep any unrecognized features/markers (e.g. seqgap)
154 flushmbuf();
155 print join("\t",$chr, $tr, $ftype, $fstart, $fend, $fscore,
156 $fstrand, $frame, $ndata)."\n";
157 next;
158 }
159 # mRNA, exon/CDS go through here:
160 push(@mbuf, join("\t",$chr, $tr, $ftype, $fstart, $fend, $fscore,
161 $fstrand, $frame, $ndata));
162 }
163
164 flushmbuf();
165
166 close(BFILE) if $bfile;
167 if ($outfile) {
168 select STDOUT;
169 close(OUTFILE);
170 }
171
172 sub flushmbuf {
173 return unless @mbuf>0 && @mbex>0;
174 if ($mbuf[0]=~m/\tjigsaw/) {
175 goto NO_INTRONCHECK;
176 }
177 @mbex=sort { $main::a->[0]<=>$main::b->[0] } @mbex;
178 my $maxi=0;
179 if (scalar(@mbex)!=scalar(@mbuf)-1) {
180 die("Error: fltgff4gv: invalid mbuf vs mbex counts for $mbuf[0]!\n");
181 }
182 for (my $i=1;$i<@mbex;$i++) {
183 my $isize=$mbex[$i]->[0]-$mbex[$i-1]->[1]-1;
184 $maxi=$isize if $maxi<$isize;
185 last if ($maxi>$maxintron);
186 }
187 NO_INTRONCHECK:
188 print join("\n",@mbuf)."\n" unless ($maxi>$maxintron);
189 @mbuf=();
190 @mbex=();
191 }

Properties

Name Value
svn:executable *