ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/mgbl_miRNA_cov.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 5113 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 mgbl_miRNA_cov.pl [-P] [-v <max_mms>] [-g <maxgaps>] [-f <valid_overlaps>] \
8 [-o <output_cov_data>] <mgblast_output.tab>
9
10 Filters the overlaps resulted from a mgblast search against a database of
11 known miRNAs and estimates expression level (for mature miRNAs).
12
13 Options:
14 -P precursor miRNAs were used for the mgblast search
15 -g discard overlaps with more than <maxgaps> gaps (default 2)
16 -v discard overlaps with more than <max_mms> mismatched bases
17 at the 3' end of the read (default 4 at the 3' end of the read);
18 -U unique mappings of reads to miRNAs: assuming the hits are sorted
19 by score, only use the first hit for a read and discard the rest
20 -R append column with all mapped reads for each miRNA
21 -B consider aligments on both strands (normally only Plus strand
22 alignments are kept)
23 Other implied restrictions:
24 * only 1 base mismatch is allowed at the 5' end of the read;
25 * in case of mature miRNAs, an overlap should involve at least 90% of the
26 read length and at least 70% of the mature miRNA length
27 * in case of precursor miRNAs, an overlap should involve at least 90% of
28 the read length
29 /;
30 umask 0002;
31 getopts('BRUPf:g:v:o:') || die($usage."\n");
32 my $outfile=$Getopt::Std::opt_o;
33 my $uniqmap=$Getopt::Std::opt_U;
34 my $rshow=$Getopt::Std::opt_R;
35 my $bothstrands=$Getopt::Std::opt_B;
36
37 my $fflt=$Getopt::Std::opt_f;
38 my $maxgaps=$Getopt::Std::opt_g || 2;
39 my $maxovh=$Getopt::Std::opt_v || 4;
40 my $premiRNA=$Getopt::Std::opt_P;
41 my $minmcov=70;
42 my $minrcov=90;
43 #my $totalreads=$Getopt::Std::opt_n;
44 #die($usage."Error: total number of reads in the sample is required! (-n parameter)\n")
45 # unless $totalreads>1;
46
47 if ($outfile) {
48 open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
49 select(OUTF);
50 }
51 if ($fflt) {
52 open(FLT, '>'.$fflt) || die("Error creating output file $fflt\n");
53 }
54
55 my %mdata; # mirna => [$cov, $read1, $read2,...]
56 my %rd; # reads assigned uniquely to miRNAs - first come first served,
57 # assuming first hit is the best one!
58 while (<>) {
59 my $mgline=$_;
60 chomp;
61 my ($read, $rlen, $rstart, $rend, $mirna, $mlen, $mstart, $mend,
62 $pid, $bitscore, $pvalue, $strand, $rgaps, $mgaps)=split(/\t/);
63
64 next unless $read && $mirna;
65 next if $uniqmap && exists($rd{$read}); #read already assigned to a mature mRNA
66 next if ($strand eq '-' && !$bothstrands);
67 my ($read_mult)=($read=~m/_x(\d+)$/); # read multiplicity
68 print STDERR ">checking $mgline";
69 print STDERR "read_multiplier=$read_mult, pid=$pid\n";
70 next if ($read_mult>=3 && $pid<100);
71 ($rstart, $rend)=($rend, $rstart) if $rend<$rstart;
72 next if ($rend-$rstart+1)*100<$minrcov*$rlen;
73 unless ($premiRNA) {
74 next if ($mend-$mstart+1)*100<$minmcov*$mlen;
75 }
76 my ($rovh3, $rovh5)=($strand eq '-') ? ($rstart-1, $rlen-$rend) : ($rlen-$rend, $rstart-1);
77 my ($movh3, $movh5)=($mlen-$mend, $mstart-1);
78 my $mm3=($rovh3<$movh3) ? $rovh3 : $movh3;
79 my $mm5=($rovh5<$movh5) ? $rovh5 : $movh5;
80 my $maxmm=$maxovh;
81 $maxmm-- if $pid<100;
82 next if $mm3>$maxmm || $mm5>1;
83 my $numgaps=0;
84 if ($rgaps) {
85 my @rg=split(/\,/, $rgaps);
86 foreach my $g (@rg) {
87 $numgaps+= ($g=~m/\+(\d+)$/) ? $1 : 1;
88 }
89 next if $numgaps>2;
90 }
91 if ($mgaps) {
92 my @rg=split(/\,/, $mgaps);
93 foreach my $g (@rg) {
94 $numgaps+= ($g=~m/\+(\d+)$/) ? $1 : 1;
95 }
96 next if $numgaps>2;
97 }
98 next if ($read_mult>=3 && $numgaps>0);
99 # -- here we are, valid overlap, keep it:
100 print FLT $mgline if $fflt;
101 $rd{$read}=$mirna;
102 my ($rmul)=($read=~m/_x(\d+)$/); #read multiplicity
103 my $md=$mdata{$mirna};
104 unless ($md) {
105 my @mc=(0) x $mlen;
106 @mc[($mstart-1)..($mend-1)]=($rmul) x ($mend-$mstart+1);
107 # ------------- 0 1 2 3 4 5
108 $mdata{$mirna}=[$rmul, $mlen, $mstart, $mend, [@mc], $read];
109 next;
110 }
111 # add to existing miRNA
112 $md->[0]+=$rmul;
113 $$md[2]=$mstart if ($$md[2]>$mstart);
114 $$md[3]=$mend if ($$md[3]<$mend);
115 my $mc=$$md[4];
116 for (my $i=$mstart-1;$i<$mend;$i++) {
117 $$mc[$i]+=$rmul;
118 }
119 push(@$md, $read);
120 } #while input lines
121 # report coverage per known miRNA
122 my $k=keys(%mdata);
123 my ($maxmirna, $maxcov);
124 while (my ($mirna, $md)=each(%mdata)) {
125 my @r=@$md;
126 my $mcov=shift(@r);
127 my $mlen=shift(@r);
128 my $mcovstart=shift(@r);
129 my $mcovend=shift(@r);
130 my $covdata=shift(@r); #coverage per base
131 # find maximum coverage for this miRNA
132 my $maxc=$$covdata[0];
133 foreach my $c (@$covdata) { $maxc=$c if $c>$maxc; }
134 my $mlencov = ($mcovstart==1 && $mcovend==$mlen)? 'full' : sprintf('%.2f',($mcovend-$mcovstart)/$mlen);
135 ($maxcov, $maxmirna)=($mcov, $mirna) if $maxcov<$mcov;
136 #print join("\t",$mirna, sprintf('%.4f',$mcov/$totalreads), $mcov.'x',
137 my $rlist=$rshow ? "\t".join(',',@r) : '';
138 print join("\t",$mirna, $mcov, $maxc, $mlencov)."$rlist\n";
139 }
140 print STDERR "Done. $k miRNAs covered.\n".
141 "Maximum coverage was found for $maxmirna at ${maxcov}x.\n";
142 #- script ending here:
143 close(FLT) if $fflt;
144 if ($outfile) {
145 select(STDOUT);
146 close(OUTF);
147 }
148
149 #======== Subroutines ============#

Properties

Name Value
svn:executable *