ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/bam2gff_rawcount.pl
Revision: 160
Committed: Fri Feb 3 15:31:57 2012 UTC (7 years, 7 months ago) by gpertea
File size: 7419 byte(s)
Log Message:
added fq_extract script

Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 bam2gff_rawcount.pl [-o <outfile] <bamfile> <transcripts.gff>
8
9 Reports the raw number of read mappings overlapping each transcript
10 found in the given <gff> file (exon-wise)
11
12 BAM file MUST have been indexed prior to running this script.
13 (using 'samtools index <bamfile>')
14
15 /;
16 umask 0002;
17 getopts('o:') || die($usage."\n");
18 my $outfile=$Getopt::Std::opt_o;
19 my $bamfile=shift(@ARGV) || die "$usage\n";
20 my $f_gtf=shift(@ARGV) || die "$usage\n";
21 if ($outfile) {
22 open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
23 select(OUTF);
24 }
25
26
27 my %gffrecs;
28 # --------------- 0 1 2 3 4 5 6 7
29 # recID => [ chr, strand, fstart, fend, [@exons], [@cds], geneID, \@attrs ]
30
31 my $gffh;
32 open($gffh, $f_gtf) || die("Error: failed to open $f_gtf. $!\n");
33 print STDERR "Loading GFF data from $f_gtf..\n";
34 my @tlist; #list of transcript_IDs, in order they are encountered
35
36 loadGff($gffh, \%gffrecs, \@tlist);
37 # we don't need to sort by location, we'll go through every transcript anyway
38 #my @tsorted = sort sortByLoc @tlist;
39
40
41 foreach my $tid (@tlist) {
42 #for every transcript, print the number of read mappings which overlap at least one exon
43 #samtools view will be called for every transcript, so this can be slow
44 my $td=$gffrecs{$tid};
45 my $ovlcount=bamOvlCount($bamfile, $tid, $td);
46 print join("\t",$tid, $$td[1], $$td[0].':'.$$td[2].'-'.$$td[3], $$td[6], $ovlcount)."\n";
47 }
48
49
50 #-------------
51 sub loadGff {
52 my ($fh, $recs, $reclist)=@_;
53 # --------------------- 0 1 2 3 4 5 6 7
54 # recID => [ chr, strand, fstart, fend, [@exons], [@cds], geneID, \@attrs ]
55 # reclist = just a list of IDs in order they are discovered
56 while (<$fh>) {
57 next if m/^\s*#/;
58 chomp;
59 my ($chr, $track, $f, $fstart, $fend, $fscore, $strand, $frame, $attrs)=split(/\t/);
60 next unless $fstart>1 && $attrs;
61 next if $f eq 'gene' || $f =~ m/locus/; # Warning: skipping any 'gene' or 'locus' features, unconditionally
62 my ($tid, $geneid, @lattr);
63 my %hattr;
64 #if ($attrs=~m/nearest_ref[= ]+"?([^;"]+)/) {
65 # $refid=$1;
66 # }
67 #if ($attrs=~m/class_code[\.= ]+"?([\w=])/) {
68 # $classcode=$1;
69 # }
70 @lattr=(split(/;\s*/, $attrs));
71 my $gff3_ID;
72 my $gff3_Parent;
73 ($fstart, $fend)=($fend, $fstart) if $fend<$fstart;
74
75 ($gff3_ID)=($attrs=~m/\bID=([^;]+)/);
76 ($gff3_Parent)=($attrs=~m/\bParent=([^;]+)/);
77
78 if ($gff3_ID || $gff3_Parent) {
79 # GFF3 format
80 @lattr = map { [split(/\s*=\s*/, $_, 2)] } @lattr;
81 %hattr = map { lc($_->[0]) => $_->[1] } @lattr;
82
83 $gff3_ID =~ tr/"//d;
84 $gff3_Parent =~ tr/"//d;
85 $gff3_Parent='' if ($f eq 'mRNA');
86 $geneid=$hattr{'gene_id'} || $hattr{'geneid'} || $hattr{'gene_name'};
87 $geneid='' if $geneid eq $gff3_ID;
88 if ($gff3_ID && !$gff3_Parent) { #top level feature
89 $tid=$gff3_ID;
90 die("Error: duplicate feature $gff3_ID on $chr\n") # if (exists($$recs{"$chr|$gff3_ID"}));
91 if (exists($$recs{$gff3_ID}));
92 #my $recID=$gff3_ID;
93 #$geneid=$refid if $refid;
94 $$recs{$tid} = [$chr, $strand, $fstart, $fend, [], [], $geneid, [@lattr]];
95 push(@$reclist, $tid);
96 next;
97 } # parent/top-level feature
98 $tid=$gff3_Parent;
99 } #GFF3
100 else { #GTF format
101 if ($track=~/^jigsaw/ && $attrs=~m/^\d+$/) {
102 $tid=$chr.'.jsm.'.$attrs;
103 }
104 else {
105 @lattr = map { [split(/\s+/,$_,2)] } @lattr;
106 %hattr = map { lc($_->[0]) => $_->[1] } @lattr;
107 $tid = $hattr{'transcript_id'} || die("Error: invalid GTF (transcript_id missing: $_)\n");
108 $tid =~ tr/"//d; #"
109 }
110 if ( $geneid = $hattr{'gene_id'} ) {
111 $geneid=~tr/"//d; #"
112 }
113 # gtf with parent 'transcript' feature
114 if ($f eq 'transcript') {
115 die("Error: duplicate feature $tid on $chr\n") # if (exists($$recs{"$chr|$gff3_ID"}));
116 if (exists($$recs{$tid}));
117 $$recs{$tid} = [$chr, $strand, $fstart, $fend, [], [], $geneid, [@lattr]];
118 push(@$reclist, $tid);
119 next;
120 }
121 } #GTF
122 #only 'exon' and 'CDS' sub-features are recognized
123 next unless $f eq 'exon' || $f eq 'CDS';
124 # -----------exon/CDS line here:
125 my $ld = $$recs{$tid};
126 my $sidx=($f eq 'CDS') ? 5 : 4;
127 if ($ld) { #update existing entry
128 my ($lstart, $lend)=($$ld[2], $$ld[3]);
129 $$ld[2] = $fstart if $fstart < $lstart;
130 $$ld[3] = $fend if $fend > $lend;
131 }
132 else { # first time seeing this transcript
133 $$recs{$tid} = [$chr, $strand, $fstart, $fend, [], [], $geneid, [@lattr] ];
134 push(@$reclist, $tid);
135 }
136 push(@{$$ld[$sidx]}, [ $fstart, $fend, $fscore, $frame ]);
137 } #while <$fh>
138 } # loadGff
139
140 sub sortByLoc {
141 my $da=$gffrecs{$a};
142 my $db=$gffrecs{$b};
143 if ($$da[0] eq $$db[0]) {
144 return ($$da[2]==$$db[2]) ? $$da[3] <=> $$db[3] : $$da[2] <=> $$db[2] ;
145 }
146 else { return $$da[0] cmp $$db[0] ; }
147 }
148
149 sub bamOvlCount { # get the number of reads overlapping a transcript
150 my ($fbam, $tid, $tdata)=@_;
151 my ($chr, $cstrand, $cstart, $cend, $exons, $cds, $geneid, $attrs)=@$tdata;
152 my $range=$chr.':'.$cstart.'-'.$cend;
153 my $segdata=$exons;
154 $segdata=$cds unless @$segdata>0;
155 my @exons;
156 if (@$segdata>0) {
157 @exons= sort { $a->[0]<=>$b->[0] } @$segdata;
158 }
159 else {
160 @exons=([$cstart, $cend]);
161 }
162
163 open(SAMPIPE, "samtools view $fbam $range |") || die ("Error opening samtools pipe ($!)\n");
164 my $numovl=0; #number of reads overlapping at least one exon of the current transcript
165 while(<SAMPIPE>) {
166 my $samline=$_;
167 chomp;
168 my ($qname, $flags, $gseq, $pos, $mapq, $cigar, $rnext, $pnext, $tlen, $seq, $qual, @extra)=
169 split(/\t/);
170 # if ($strand) {
171 # my $mstrand= (($flags & 0x10)==0) ? '+' : '-';
172 # next if $mstrand ne $strand;
173 # }
174 #now extract the CIGAR segments
175 my @cigdata=($cigar=~m/(\d+[A-Z,=])/g);
176 my ($mstart, $mend);
177 my $curpos=$pos;
178 $mstart=$pos;
179 foreach my $cd (@cigdata) {
180 my $code=chop($cd);
181 #now $cd has the length of the CIGAR operation
182 if ($code eq 'N') { #gap found (possible intron)
183 #process previous interval
184 if ($mend && checkOverlap($mstart, $mend, \@exons)) {
185 $numovl++;
186 $mend=0;
187 last; #exon overlap found, no need to check the other exons
188 }
189
190 $curpos+=$cd;
191 $mstart=$curpos; #start of the next exon
192 $mend=0;
193 next;
194 }
195 if ($code eq 'M' || $code eq 'D') {
196 #only advance genomic position for match and delete operations
197 $curpos+=$cd;
198 $mend=$curpos-1; #advance the end of the exon
199 }
200 }
201 #check the last interval
202 if ($mend && checkOverlap($mstart, $mend, \@exons)) {
203 #$hasOvl=1;
204 $numovl++;
205 }
206 } # while <SAMPIPE>
207 close(SAMPIPE);
208 return $numovl;
209 }
210
211 sub checkOverlap { #check if segment $a-$b overlaps any of the exons in $rx
212 my ($a, $b, $rx)=@_;
213 return 0 if ($a>$$rx[-1]->[1] || $b<$$rx[0]->[0]); # not overlapping the transcript region at all
214 foreach my $x (@$rx) {
215 #check for exon overlap
216 return 1 if ($a<=$$x[1] && $b>=$$x[0]);
217 return 0 if $b<$$x[0]; #@$rx is sorted, so we can give up
218 }
219 }

Properties

Name Value
svn:executable *