ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/utrpaste.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 3 months ago) by gpertea
File size: 6291 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 utrpaste.pl [-a 'newID'] [-o <output.gff3>] <jigsaw_out.jgff> <exon_mappings.gff3>
8
9 Augment the gene models found in <jigsaw_out.jgff> with
10 exon mappings found in <mappings.gff3> (only initial\/terminal exons,
11 if they overlap the CDS)
12
13 /;
14 umask 0002;
15 getopts('a:o:') || die($usage."\n");
16 my $outfile=$Getopt::Std::opt_o;
17 my $newacc=$Getopt::Std::opt_a;
18 my ($jgff,$mapgff)=@ARGV;
19 foreach (@ARGV) {
20 die ("Error locating file $_\n") unless -f $_;
21 }
22 open(MGFF, $mapgff) || die ("Error opening mappings file $mapgff\n");
23 # 0 1 2 3 4
24 my @mapdata; #list of [ ID, strand, start, end, [ @exons] ]
25 # where each exon element is [exon_start, exon_end]
26 my $mi=-1; #current index in @mapdata
27 my $curid;
28 while (<MGFF>) {
29 my ($chr, $v, $f, $fstart, $fend, $fscore, $strand, $frame, $info)=split(/\t/);
30 next unless ($f eq 'exon'); #! ignore anything else but exons
31 ($fstart, $fend)=($fend, $fstart) if ($fstart>$fend);
32 my ($id)=($info=~m/Parent=([^;]+)/);
33 die("Error getting mapping ID at:\n$_\n") unless $id;
34 if ($id ne $curid) {
35 #start a new record;
36 push(@mapdata, [ $id, $strand, 0,0, []]);
37 $mi=$#mapdata;
38 $curid=$id;
39 }
40 push( @{$mapdata[$mi]->[4]} , [$fstart, $fend] );
41 }
42 close(MGFF);
43
44 foreach my $md (@mapdata) {
45 my @exons=sort { $main::a->[0] <=> $main::b->[0] } @{$md->[4]};
46 $md->[2]=$exons[0]->[0];
47 $md->[3]=$exons[-1]->[1];
48 $md->[4]=[@exons];
49 }
50
51 # sort mappings by start coordinate
52 @mapdata=sort { $main::a->[2] <=> $main::b->[2] } @mapdata;
53
54 if ($outfile) {
55 open(FOUT, '>'.$outfile) || die("Error creating file $outfile\n");
56 select(FOUT);
57 }
58 #now parse and augment the jgff
59 open(JGFF, $jgff) || die ("Error opening mappings file $jgff\n");
60 $curid='';
61 my @cds=0; #current CDS chain: list of [$start, $end, $frame]
62 my @curdata;
63 while (<JGFF>) {
64 next if length($_)<4 || m/^# /;
65 my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $phase, $jdata)=split(/\t/);
66 next unless (($f eq 'CDS') || ($track=~m/jigsaw/ && $f=~m/\-exon$/));
67 my ($id)=($jdata=~m/^([^;]+); gene_score=/);
68 unless ($id) {
69 ($id)=($jdata=~m/Parent=([^;]+)/);
70 unless ($id) { #try GTF too
71 ($id)=($jdata=~m/transcript_id "([^"]+)/);
72 }
73 die("Error getting model ID at:\n$_\n") unless $id;
74 }
75 if ($id ne $curid) {
76 extendCDS(@curdata, \@cds) if $curid; #checks @mapdata
77 @cds=();
78 $curid=$id;
79 my $wid = $newacc || $curid;
80 my ($jnum)=($curid=~m/\.(\d+)$/);
81 $jnum=1 unless $jnum;
82 @curdata=($wid, $jnum, $chr, $track, $strand);
83 }
84 push(@cds, [$fstart, $fend, $phase]);
85 }
86 # last record:
87 extendCDS(@curdata, \@cds) if @cds>0; #checks @mapdata
88 close(JGFF);
89
90 if ($outfile) { close(FOUT); }
91
92 #================ SUBROUTINES ============
93
94 sub extendCDS {
95 my ($id, $jnum, $chr, $track, $strand, $cdsl)=@_;
96 #find mapdata overlapping first and last exons of this model
97 my @cds = sort { $main::a->[0] <=> $main::b->[0] } @$cdsl;
98 my $cstart=$cds[0]->[0];
99 my $cend=$cds[-1]->[1];
100 my $mrnaid=$id.'.j'.$jnum;
101
102 my $found=0;
103 my @exons;
104
105 foreach my $m (@mapdata) {
106 my ($mid, $mstrand, $mstart, $mend, $mexons)=@$m;
107 #mapping should overlap on the same strand
108 #print STDERR "===> comparing $mstart-$mend ($mstrand) w $cstart-$cend ($strand)\n";
109 next if ($mstrand ne $strand || $mstart>$cend || $cstart>$mend);
110 #mapping ends should cover a larger interval than CDS
111 next if (($mstart>$cstart) || ($mend<$cend));
112 # at least one of the ends should stretch farther
113 next unless (($mstart<$cstart) || ($mend>$cend));
114 # -- found it:
115 $found=1;
116 # augment here, and print the mRNA line too, then exons
117 # left end :
118 my @lexons;
119 my $lovl;
120 for (my $i=0;$i<@$mexons;$i++) {
121 my ($estart, $eend)=@{$$mexons[$i]};
122 #print STDERR "left end cmp: $estart-$eend vs CDS $cstart-$cds[0]->[1]\n";
123 if ($estart<=$cstart && $eend>$cstart) {
124 # if they really overlapped properly, $estart is <= cds start
125 #print STDERR " YES, adding exon $estart-$cds[0]->[1]^^!\n";
126 push(@lexons, [$estart, $cds[0]->[1]]);
127 $lovl=1;
128 last;
129 }
130 push(@lexons,[$estart, $eend]);
131 }
132 #right end:
133 my @rexons;
134 my $rovl;
135 for (my $i=@$mexons-1;$i>=0;$i--) {
136 my ($estart, $eend)=@{$$mexons[$i]};
137 #print STDERR "right end cmp: $estart-$eend vs CDS $cds[-1]->[0]-$cend\n";
138 if ($eend>=$cend && $estart<$cend) {
139 # if they really overlapped properly, $eend is >= cds end
140 #print STDERR " YES, adding exon $cds[-1]->[0]-$eend!\n";
141 unshift(@rexons, [$cds[-1]->[0], $eend]);
142 $rovl=1;
143 last;
144 }
145 unshift(@rexons,[$estart, $eend]);
146 }
147
148 if ($rovl && $lovl) {
149 #special single exon case:
150 if ( @cds==1 ) {
151 #we have to merge rexon and lexons
152 $lexons[-1]->[1]=$rexons[0]->[1];
153 shift(@rexons); #remove the first rexon
154 push(@exons, @lexons, @rexons);
155 }
156 else { #normal multi-exon case
157 push(@exons, @lexons);
158 foreach my $cd (@cds) {
159 push(@exons, [$$cd[0], $$cd[1]])
160 if ($cd->[0]>$lexons[-1]->[1] && $cd->[1]<$rexons[0]->[0])
161 }
162 #for (my $i=1;$i<@cds;$i++) {
163 # push(@exons, [$cds[$i]->[0], $cds[$i]->[1]]);
164 # }
165 push(@exons, @rexons);
166 }
167 # print mRNA and exons
168 print join("\t", $chr, $track, 'mRNA', $exons[0]->[0], $exons[-1]->[1], '.', $strand, '.',
169 'ID='.$mrnaid)."\n";
170 foreach my $e (@exons) {
171 print join("\t", $chr, $track, 'exon', $$e[0], $$e[1], '.', $strand, '.',
172 'Parent='.$mrnaid)."\n";
173 }
174 last;
175 }
176 else { $found = 0; }
177 }
178 if (!$found) {
179 #print mRNA line and the old CDS as is
180 print join("\t", $chr, $track, 'mRNA', $cstart, $cend, '.', $strand, '.',
181 'ID='.$mrnaid)."\n";
182 }
183 #print CDS segments:
184 foreach my $c (@cds) {
185 print join("\t", $chr, $track, 'CDS', $$c[0], $$c[1], '.', $strand, $$c[2],
186 'Parent='.$mrnaid)."\n";
187
188 }
189 }

Properties

Name Value
svn:executable *