ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/mgbl_miRNA_cov.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 2 months ago) by gpertea
File size: 5113 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     use strict;
3     use Getopt::Std;
4     use FindBin;use lib $FindBin::Bin;
5    
6     my $usage = q/Usage:
7     mgbl_miRNA_cov.pl [-P] [-v <max_mms>] [-g <maxgaps>] [-f <valid_overlaps>] \
8     [-o <output_cov_data>] <mgblast_output.tab>
9    
10     Filters the overlaps resulted from a mgblast search against a database of
11     known miRNAs and estimates expression level (for mature miRNAs).
12    
13     Options:
14     -P precursor miRNAs were used for the mgblast search
15     -g discard overlaps with more than <maxgaps> gaps (default 2)
16     -v discard overlaps with more than <max_mms> mismatched bases
17     at the 3' end of the read (default 4 at the 3' end of the read);
18     -U unique mappings of reads to miRNAs: assuming the hits are sorted
19     by score, only use the first hit for a read and discard the rest
20     -R append column with all mapped reads for each miRNA
21     -B consider aligments on both strands (normally only Plus strand
22     alignments are kept)
23     Other implied restrictions:
24     * only 1 base mismatch is allowed at the 5' end of the read;
25     * in case of mature miRNAs, an overlap should involve at least 90% of the
26     read length and at least 70% of the mature miRNA length
27     * in case of precursor miRNAs, an overlap should involve at least 90% of
28     the read length
29     /;
30     umask 0002;
31     getopts('BRUPf:g:v:o:') || die($usage."\n");
32     my $outfile=$Getopt::Std::opt_o;
33     my $uniqmap=$Getopt::Std::opt_U;
34     my $rshow=$Getopt::Std::opt_R;
35     my $bothstrands=$Getopt::Std::opt_B;
36    
37     my $fflt=$Getopt::Std::opt_f;
38     my $maxgaps=$Getopt::Std::opt_g || 2;
39     my $maxovh=$Getopt::Std::opt_v || 4;
40     my $premiRNA=$Getopt::Std::opt_P;
41     my $minmcov=70;
42     my $minrcov=90;
43     #my $totalreads=$Getopt::Std::opt_n;
44     #die($usage."Error: total number of reads in the sample is required! (-n parameter)\n")
45     # unless $totalreads>1;
46    
47     if ($outfile) {
48     open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
49     select(OUTF);
50     }
51     if ($fflt) {
52     open(FLT, '>'.$fflt) || die("Error creating output file $fflt\n");
53     }
54    
55     my %mdata; # mirna => [$cov, $read1, $read2,...]
56     my %rd; # reads assigned uniquely to miRNAs - first come first served,
57     # assuming first hit is the best one!
58     while (<>) {
59     my $mgline=$_;
60     chomp;
61     my ($read, $rlen, $rstart, $rend, $mirna, $mlen, $mstart, $mend,
62     $pid, $bitscore, $pvalue, $strand, $rgaps, $mgaps)=split(/\t/);
63    
64     next unless $read && $mirna;
65     next if $uniqmap && exists($rd{$read}); #read already assigned to a mature mRNA
66     next if ($strand eq '-' && !$bothstrands);
67     my ($read_mult)=($read=~m/_x(\d+)$/); # read multiplicity
68     print STDERR ">checking $mgline";
69     print STDERR "read_multiplier=$read_mult, pid=$pid\n";
70     next if ($read_mult>=3 && $pid<100);
71     ($rstart, $rend)=($rend, $rstart) if $rend<$rstart;
72     next if ($rend-$rstart+1)*100<$minrcov*$rlen;
73     unless ($premiRNA) {
74     next if ($mend-$mstart+1)*100<$minmcov*$mlen;
75     }
76     my ($rovh3, $rovh5)=($strand eq '-') ? ($rstart-1, $rlen-$rend) : ($rlen-$rend, $rstart-1);
77     my ($movh3, $movh5)=($mlen-$mend, $mstart-1);
78     my $mm3=($rovh3<$movh3) ? $rovh3 : $movh3;
79     my $mm5=($rovh5<$movh5) ? $rovh5 : $movh5;
80     my $maxmm=$maxovh;
81     $maxmm-- if $pid<100;
82     next if $mm3>$maxmm || $mm5>1;
83     my $numgaps=0;
84     if ($rgaps) {
85     my @rg=split(/\,/, $rgaps);
86     foreach my $g (@rg) {
87     $numgaps+= ($g=~m/\+(\d+)$/) ? $1 : 1;
88     }
89     next if $numgaps>2;
90     }
91     if ($mgaps) {
92     my @rg=split(/\,/, $mgaps);
93     foreach my $g (@rg) {
94     $numgaps+= ($g=~m/\+(\d+)$/) ? $1 : 1;
95     }
96     next if $numgaps>2;
97     }
98     next if ($read_mult>=3 && $numgaps>0);
99     # -- here we are, valid overlap, keep it:
100     print FLT $mgline if $fflt;
101     $rd{$read}=$mirna;
102     my ($rmul)=($read=~m/_x(\d+)$/); #read multiplicity
103     my $md=$mdata{$mirna};
104     unless ($md) {
105     my @mc=(0) x $mlen;
106     @mc[($mstart-1)..($mend-1)]=($rmul) x ($mend-$mstart+1);
107     # ------------- 0 1 2 3 4 5
108     $mdata{$mirna}=[$rmul, $mlen, $mstart, $mend, [@mc], $read];
109     next;
110     }
111     # add to existing miRNA
112     $md->[0]+=$rmul;
113     $$md[2]=$mstart if ($$md[2]>$mstart);
114     $$md[3]=$mend if ($$md[3]<$mend);
115     my $mc=$$md[4];
116     for (my $i=$mstart-1;$i<$mend;$i++) {
117     $$mc[$i]+=$rmul;
118     }
119     push(@$md, $read);
120     } #while input lines
121     # report coverage per known miRNA
122     my $k=keys(%mdata);
123     my ($maxmirna, $maxcov);
124     while (my ($mirna, $md)=each(%mdata)) {
125     my @r=@$md;
126     my $mcov=shift(@r);
127     my $mlen=shift(@r);
128     my $mcovstart=shift(@r);
129     my $mcovend=shift(@r);
130     my $covdata=shift(@r); #coverage per base
131     # find maximum coverage for this miRNA
132     my $maxc=$$covdata[0];
133     foreach my $c (@$covdata) { $maxc=$c if $c>$maxc; }
134     my $mlencov = ($mcovstart==1 && $mcovend==$mlen)? 'full' : sprintf('%.2f',($mcovend-$mcovstart)/$mlen);
135     ($maxcov, $maxmirna)=($mcov, $mirna) if $maxcov<$mcov;
136     #print join("\t",$mirna, sprintf('%.4f',$mcov/$totalreads), $mcov.'x',
137     my $rlist=$rshow ? "\t".join(',',@r) : '';
138     print join("\t",$mirna, $mcov, $maxc, $mlencov)."$rlist\n";
139     }
140     print STDERR "Done. $k miRNAs covered.\n".
141     "Maximum coverage was found for $maxmirna at ${maxcov}x.\n";
142     #- script ending here:
143     close(FLT) if $fflt;
144     if ($outfile) {
145     select(STDOUT);
146     close(OUTF);
147     }
148    
149     #======== Subroutines ============#

Properties

Name Value
svn:executable *