ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/mirfind_score.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 24985 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5 use IO::Handle qw( );
6 use RNA;
7
8 my $usage = q/Usage:
9 mirfind_score.pl [-c <cutoff>] [-D] <regions.fa> <regions.xmap>
10 Options
11 -c set the score cut-off (default 1)
12 -A print all entries that qualify structurally, even with bad score
13 -R use Randfold
14 -D consider Drosha processing (number of base pairings in the lower stems)
15 /;
16 umask 0002;
17 getopts('VRDAc:o:') || die($usage."\n");
18 my $outfile=$Getopt::Std::opt_o;
19 my $showall=$Getopt::Std::opt_A;
20 my $verbose=$Getopt::Std::opt_V;
21 my ($use_randfold, $use_drosha)=($Getopt::Std::opt_R, $Getopt::Std::opt_D);
22 my $score_min=$Getopt::Std::opt_c || 1;
23 die("$usage\n") unless @ARGV==2;
24 my ($frna,$fxmaps)=@ARGV;
25 die("$usage\n") unless -f $frna && -f $fxmaps;
26 if ($outfile) {
27 open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
28 select(OUTF);
29 }
30 # -- setup RNA folding parameters:
31 $RNA::dangles=0;
32 #parameters
33 my $nucleus_lng=7;
34 my $score_star=3.9;
35 my $score_star_not=-1.3;
36 my $score_nucleus=3;
37 my $score_nucleus_not=-0.6;
38 my $score_randfold=1.6;
39 my $score_randfold_not=-2.2;
40 my $score_intercept=0.3;
41 my @scores_stem=(-3.1,-2.3,-2.2,-1.6,-1.5,0.1,0.6,0.8,0.9,0.9,0);
42 my $e=2.718281828;
43
44
45
46 # -- 0 1 2 3 4 5
47 my @xmaps; # list of [readID_xN, read_freq, read_alnstart, read_alnend, mismatches, intronInfo]
48 my %qmaps; # $read_name=>[readID_xN, read_freq, read_alnstart, read_alnend, mismatches, intronInfo]
49 # 0 1 2 3 4 5
50 #TODO: while parsing this read data, collapse reads that are included in other reads (summing up the frequencies)
51 my $reg_id; #current region id
52 # 0 1 2 3
53 my @rnafold; # (region_descr, region_seq, region_struct, region_mfe)
54 my %hash_bp; # base pairing data for current region
55 my $qmaxfreq; # read with maximum coverage in current region
56 my @qmaxfreqdata; # list of (readID_xN, read_freq, read_alnstart, read_alnend, mismatches, intronInfo) for the read with maximum coverage in current region
57 my @prevmir; # -- to avoid reporting duplicates
58 # [ 0=mature_loc, 1=hairpin_loc, 2=final_score, 3=mfe_score, 4= report line ]
59 #------
60 open(INRNA, $frna) || die ("Error opening file $frna\n");
61 open(INMAPS, $fxmaps) || die ("Error opening file $fxmaps\n");
62 my $prevchr;
63 while (($reg_id=RNAfold(*INRNA, \@rnafold))) {
64 #print STDERR "DBG: processing region: $reg_id\n" if $verbose;
65 my $x_id=readXmaps(*INMAPS, \@xmaps, \%qmaps);
66 die("Error: couldn't match RNAfold id ($reg_id) with mappings id ($x_id)\n") unless $x_id eq $reg_id;
67 my $chr=$x_id;
68 $chr=~s/_\d+$//;
69 if ($chr ne $prevchr) {
70 print STDERR "processing regions on $chr..\n";
71 $prevchr=$chr;
72 }
73 processRegion(\@rnafold, \@xmaps);
74 }
75 print $prevmir[4] if $prevmir[4];
76 close(INMAPS);
77 close(INRNA);
78 print STDERR "Done.\n";
79 # --
80 if ($outfile) {
81 select(STDOUT);
82 close(OUTF);
83 }
84 #************ Subroutines **************
85
86 sub processRegion {
87 my ($rfold, $rmaps)=@_;
88 my ($g_id, $g_descr, $g_seq, $g_struct, $g_mfe)=@$rfold;
89 my $g_seqlen=length($g_seq);
90 #print STDERR ">greg: $g_id mfe=$g_mfe\nseq: $g_seq\nstr: $g_struct\n";
91 # --> fill_structure()
92 #reads the dot bracket structure into the 'bp' hash where each key and value are basepaired
93 %hash_bp=();
94 my $lng=length($g_struct);
95 #local stack for keeping track of basepairings
96 my @bps;
97 for(my $i=1;$i<=$lng;$i++){
98 #my $pstruct=excise_str($g_struct,$i,$i);
99 my $ps=substr($g_struct,$i-1,1);
100 if ($ps eq "(") {
101 push(@bps,$i);
102 next;
103 }
104 if ($ps eq ")") {
105 my $i_prev=pop(@bps);
106 $hash_bp{$i_prev}=$i;
107 $hash_bp{$i}=$i_prev;
108 }
109 }
110 # --> fill_pri()
111 #fills basic specifics on the precursor into the 'comp' hash
112 # we already have the values in $g_id, $g_descr, $g_seq, $g_struct, $g_mfe and $g_seqlen
113
114 # --> fill_mature()
115 my $mature_query=$qmaxfreq;
116 #my($mature_beg,$mature_end)=get_qloc($mature_query);
117 my ($mature_beg, $mature_end)=($qmaxfreqdata[2], $qmaxfreqdata[3]);
118 my $mature_seq=excise_str($g_seq, $mature_beg, $mature_end);
119 my $mature_struct=excise_str($g_struct,$mature_beg,$mature_end);
120 my $mature_arm=arm_mature($mature_struct); # 1, 2 or 0
121 # $hash_comp{"mature_query"}=$mature_query;
122 # $hash_comp{"mature_beg"}=$mature_beg;
123 # $hash_comp{"mature_end"}=$mature_end;
124 # $hash_comp{"mature_strand"}=$mature_strand;
125 # $hash_comp{"mature_struct"}=$mature_struct;
126 # $hash_comp{"mature_seq"}=$mature_seq;
127 # $hash_comp{"mature_arm"}=$mature_arm;
128
129 # --> fill_star{
130 #fills specifics on the expected star strand into 'comp' hash ('component' hash)
131 #if the mature sequence is not plausible, don't look for the star arm
132 my ($star_arm, $star_beg, $star_end, $star_seq, $star_struct);
133
134 return unless ($mature_arm>0); # EXIT point if no mature arm determined
135 ($star_beg,$star_end)=find_star($mature_beg, $mature_end);
136 ($star_arm, $star_struct)=arm_star($star_beg, $star_end, $g_seqlen, $mature_arm, $mature_beg, $mature_end, $g_struct);
137 return unless $star_struct; #EXIT point if no star sequence
138
139 #excise expected star sequence and structure
140 $star_seq=excise_str($g_seq,$star_beg,$star_end);
141 # $hash_comp{"star_beg"}=$star_beg;
142 # $hash_comp{"star_end"}=$star_end;
143 # $hash_comp{"star_seq"}=$star_seq;
144 # $hash_comp{"star_struct"}=$star_struct;
145 # $hash_comp{"star_arm"}=$star_arm;
146
147
148 return if ($mature_arm+$star_arm!=3); #bail out if no valid mature and star combo
149
150 # --> fill_loop{
151 #fills specifics on the loop sequence into the 'comp' hash
152 #unless both mature and star sequences are plausible, do not look for the loop
153 my ($loop_beg, $loop_end, $loop_seq, $loop_struct);
154 #defining the begin and end positions of the loop from the mature and star positions
155 #excision depends on whether the mature or star sequence is 5' of the loop ('first')
156 ($loop_beg, $loop_end)=($mature_arm==1) ? ($mature_end+1, $star_beg-1) : ($star_end+1, $mature_beg-1);
157 #unless the positions are plausible, do not fill hash
158 return unless ($loop_beg<$loop_end && $loop_end<=$g_seqlen && $loop_beg>0);
159
160 $loop_seq=excise_str($g_seq,$loop_beg,$loop_end);
161 $loop_struct=excise_str($g_struct,$loop_beg,$loop_end);
162 # --> fill_lower_flanks()
163 #fills specifics on the lower flanks and unpaired strands into the 'comp' hash
164 my ($flank_first_end,$flank_second_beg)=($mature_arm==1) ? ($mature_beg-1, $star_end+1) :
165 ($star_beg-1, $mature_end+1);
166 return unless ($flank_first_end<=$flank_second_beg && $flank_first_end>0 && $flank_second_beg<=$g_seqlen);
167 my $flank_first_seq=excise_str($g_seq, 1, $flank_first_end);
168 my $flank_second_seq=excise_str($g_seq, $flank_second_beg, $g_seqlen);
169 my $flank_first_struct=excise_str($g_struct, 1, $flank_first_end);
170 my $flank_second_struct=excise_str($g_struct, $flank_second_beg, $g_seqlen);
171 # -- fill stems Drosha
172 my ($stem_first, $stem_second, $stem_bp_first, $stem_bp_second, $stem_bp);
173 if ($use_drosha) {
174 $stem_first=substr($flank_first_struct, -10);
175 $stem_second=substr($flank_second_struct,0,10);
176 $stem_bp_first=($stem_first=~tr/(/(/);
177 $stem_bp_second=($stem_second=~tr/)/)/);
178 $stem_bp=($stem_bp_first<$stem_bp_second)?$stem_bp_first : $stem_bp_second;
179 }
180 # -- final checks:
181 my ($pre_struct, $pre_seq, $mstar_struct) = ($mature_arm==1) ?
182 ($mature_struct.$loop_struct.$star_struct, $mature_seq.$loop_seq.$star_seq, $mature_struct.$star_struct) :
183 ($star_struct.$loop_struct.$mature_struct, $star_seq.$loop_seq.$mature_seq, $star_struct.$mature_struct);
184 #simple pattern matching checks for bifurcations
185 my $unloop=($loop_struct=~tr/././); #how many unpaired bases in loop
186 #print STDERR ">>>>> $g_id : loop_struct_len=".length($loop_struct)." unpaired bases: $unloop | $loop_struct\n";
187 unless (($unloop<<1)>length($loop_struct)) { #too many
188 print STDERR "[rejected]\t$g_id\t$g_descr($g_seqlen)\ttoo many pairings found in loop structure\n" if $verbose;
189 }
190 unless ($mstar_struct=~m/^[\.\(]+[\.\)]+$/) {
191 print STDERR "[rejected]\t$g_id\t$g_descr($g_seqlen)\tbifurcation detected in precursor\n" if $verbose;
192 return;
193 }
194 #minimum 14 base pairings required in duplex
195 my $duplex_bps=($mature_struct=~tr/()/()/);
196 unless ($duplex_bps>=14) {
197 print STDERR "[rejected]\t$g_id\t$g_descr($g_seqlen)\ttoo few base pairings in duplex\n" if $verbose;
198 return;
199 }
200 #no more than 6 nt difference between mature and star length
201 if (abs(length($mature_seq)-length($star_seq))>6) {
202 print STDERR "[rejected]\t$g_id\t$g_descr($g_seqlen)\ttoo big difference between mature and star lengths\n" if $verbose;
203 return;
204 }
205 # ---> pass_filtering_signature()
206 #number of reads that map in consistence with Dicer processing
207 my $consistent=0;
208 #number of reads that map inconsistent with Dicer processing
209 my $inconsistent=0;
210 # number of potential star reads map in good consistence with Drosha/Dicer processing
211 # (3' overhangs relative to mature product)
212 my $star_perfect=0;
213 # number of potential star reads that do not map in good consistence with 3' overhang
214 my $star_fuzzy=0;
215 #sort queries (deep sequences) by their position on the hairpin
216 my @xmapos = sort { $a->[2]<=>$b->[2] } @xmaps;
217 foreach my $qd (@xmapos) {
218 my ($qid, $qfreq, $qbeg, $qend,$qmism, $qinfo)=@$qd;
219 #test which Dicer product (if any) the read corresponds to
220 #the deep sequence is allowed to stretch 2 nt beyond the expected 5' end
221 my $fuzz_beg=2;
222 #the deep sequence is allowed to stretch 5 nt beyond the expected 3' end
223 my $fuzz_end=5;
224 my $product=0; # 1= mature, 2=star, 3=loop
225 if (contained($qbeg, $qend, $mature_beg-$fuzz_beg, $mature_end+$fuzz_end)) {
226 $product=1;
227 }
228 elsif (contained($qbeg, $qend, $star_beg-$fuzz_beg, $star_end+$fuzz_end)) {
229 $product=2;
230 if (abs($qbeg-$star_beg)<2) { $star_perfect+=$qfreq; }
231 else { $star_fuzzy+=$qfreq; }
232 }
233 elsif (contained($qbeg, $qend, $loop_beg-$fuzz_beg, $loop_end+$fuzz_end)) {
234 $product=3;
235 }
236 #if the deep sequence corresponds to a Dicer product, add to the 'consistent' variable
237 if ($product) { $consistent+=$qfreq; }
238 #if the deep sequence do not correspond to a Dicer product, add to the 'inconsistent' variable
239 else { $inconsistent+=$qfreq; }
240 }
241 # if the majority of potential star sequences map in good accordance with 3' overhang
242 # score for the presence of star evidence
243 my $star_read=1 if ($star_perfect>$star_fuzzy);
244 #total number of reads mapping to the hairpin
245 my $freq=$consistent+$inconsistent;
246 #$hash_comp{"freq"}=$freq;
247 unless($freq>1) {
248 print STDERR "[rejected]\t$g_id\t$g_descr($g_seqlen)\tread frequency too low\n" if $verbose;
249 return;
250 }
251 #unless >90% of the reads map in consistence with Dicer processing, the hairpin is discarded
252 my $cfraction=$inconsistent/($inconsistent+$consistent);
253 unless($cfraction<=0.1) {
254 print STDERR "[rejected]\t$g_id\t$g_descr($g_seqlen)\tpoor mapping frequency: inconsistent:$inconsistent, consistent:$consistent\n"
255 if $verbose;
256 return;
257 }
258 # --> pass_threshold_score()
259 #minimum free energy of the potential precursor
260 my $score_mfe=score_mfe($g_mfe);
261 #count of reads that map in accordance with Dicer processing
262 my $score_freq=score_freq($freq);
263 #basic score
264 my $score=$score_mfe+$score_freq;
265 $score+= $star_read ? $score_star : $score_star_not; #penalty if there are no "perfect" star reads
266 #score lower stems for potential for Drosha recognition
267 if($use_drosha){
268 $score+=$scores_stem[$stem_bp];
269 }
270 $score+=$score_intercept; # ?? why
271 if ($use_randfold) {
272 # only calculate randfold value if it can make the difference between the potential precursor
273 # being accepted or discarded
274 if($score+$score_randfold>=$score_min and $score+$score_randfold_not<=$score_min) {
275 #randfold value<0.05
276 if(test_randfold($g_seq)) { $score+=$score_randfold; }
277 #randfold value>0.05
278 else{ $score+=$score_randfold_not; }
279 }
280 } #use randfold test
281 #round off values to one decimal
282 my $round_mfe=sprintf('%.1f', $score_mfe);
283 my $round_freq=sprintf('%.1f',$score_freq);
284 my $round=sprintf('%.1f', $score);
285 #print scores
286 my $passed=($score>=$score_min) ? '[passed]':'';
287 # print STDERR "score_mfe\t$round_mfe\nscore_freq\t$round_freq\nscore\t$round\t$passed\n";
288 #return 1 if the potential precursor is accepted, return 0 if discarded
289 #--- print all that made it up to here
290 my $g_chr=$g_id;
291 $g_chr=~s/_\d+$//;
292 my ($g_strand, $g_range)=split(/\|/,$g_descr);
293 my ($g_beg, $g_end)=split(/[\-\._]+/,$g_range);
294 my ($mature_gbeg, $mature_gend)=coords2genome($g_strand, $g_beg, $g_end, $mature_beg, $mature_end);
295 my ($star_gbeg, $star_gend)=coords2genome($g_strand, $g_beg, $g_end, $star_beg, $star_end);
296 #TODO: perhaps we can extend $flank_first_end and $flank_second_beg based on actual read coverage and RNA structure?
297 my ($hairpin_gbeg, $hairpin_gend)=coords2genome($g_strand, $g_beg, $g_end, $flank_first_end, $flank_second_beg);
298 my $m_loc_id=$g_chr.$g_strand.$mature_gbeg.'-'.$mature_gend;
299 my $h_loc_id=$g_chr.$g_strand.$hairpin_gbeg.'-'.$hairpin_gend;
300
301 my $pline=join("\t", $g_id.'|'.$g_descr, $g_chr, $g_strand, $hairpin_gbeg, $hairpin_gend, $mature_query, 'x'.$qmaxfreqdata[1],
302 $mature_seq, 'm|'.$mature_gbeg.'-'.$mature_gend, '*|'.$star_gbeg.'-'.$star_gend, $round_mfe,
303 $round_freq, $round, $passed, $qmaxfreqdata[5])."\n";
304 #print $pline;
305 if ($passed) {
306 if ($prevmir[0] eq $m_loc_id) { # && $prevmir[1] eq $h_loc_id) {
307 # same miRNA as before
308 if ($prevmir[2]>$round || ($prevmir[2]==$round && $prevmir[3]>$round_mfe)) {
309 #prev miRNA was the same but had a better score, print that
310 print $prevmir[4];
311 }
312 else { #prev didn't have a better score, print this
313 print $pline;
314 }
315 @prevmir=();
316 }
317 else { #different miRNA than before
318 print $prevmir[4] if $prevmir[4];
319 @prevmir=($m_loc_id, $h_loc_id, $round, $round_mfe, $pline);
320 #print $pline;
321 }
322 } # good prediction
323 else { # optionally, report also the potentially valid structures that failed the scoring test
324 print $prevmir[4] if $prevmir[4];
325 print $pline if $showall;
326 @prevmir=();
327 }
328 STDOUT->flush();
329 }
330
331 sub coords2genome { #convert local coordinates to genome coordinates
332 my ($strand, $g1, $g2, $l1, $l2)=@_;
333 return ($strand eq '-') ? ($g2-$l2+1, $g2-$l1+1) : ($g1+$l1-1, $g1+$l2-1);
334 }
335
336 sub contained{
337 #is c1-c2 interval contained within a1-a2 ?
338 my($c1,$c2,$a1,$a2)=@_;
339 #testbeginend($beg1,$end1,$beg2,$end2);
340 return ($c1>=$a1 and $c2<=$a2);
341 }
342
343 sub arm_mature {
344 #tests whether the mature sequence is in the 5' ('first') or 3' ('second') arm of the potential precursor
345 my ($struct)=@_;
346 #-- there should be no bifurcations and minimum one base pairing at that respective end
347 #my $struct=excise_seq($hash_comp{"pri_struct"},$beg,$end,$strand);
348 my $r1=index($struct,'(');
349 my $r2=index($struct,')');
350 if ($r2<0 && $r1>=0) { #struct=~m/^[\(\.]+$/ && $struct=~m/\(/) {
351 return 1;
352 }
353 if ($r1<0 && $r2>=0) { # ($struct=~m/^[\)\.]+$/ && $struct=~m/\)/){
354 return 2;
355 }
356 return 0;
357 }
358
359 sub find_star {
360 #uses the 'bp' hash to find the expected star begin and end positions from the mature positions
361 my ($mature_beg, $mature_end)=@_;
362 $mature_end-=2; #the -2 is for the overhang
363 my $mature_lng=$mature_end-$mature_beg+1;
364 #in some cases, the last nucleotide of the mature sequence does not form a base pair,
365 #and therefore does not basepair with the first nucleotide of the star sequence.
366 #In this case, the algorithm searches for the last nucleotide of the mature sequence
367 #to form a base pair. The offset is the number of nucleotides searched through.
368 my $offset_star_beg=0;
369 my $offset_beg=0;
370 #the offset should not be longer than the length of the mature sequence, then it
371 #means that the mature sequence does not form any base pairs
372 while(!$offset_star_beg and $offset_beg<$mature_lng){
373 if($hash_bp{$mature_end-$offset_beg}){
374 $offset_star_beg=$hash_bp{$mature_end-$offset_beg};
375 }else{
376 $offset_beg++;
377 }
378 }
379 #when defining the beginning of the star sequence, compensate for the offset
380 my $star_beg=$offset_star_beg-$offset_beg;
381 #same as above
382 my $offset_star_end=0;
383 my $offset_end=0;
384 while (!$offset_star_end and $offset_end<$mature_lng){
385 if($hash_bp{$mature_beg+$offset_end}){
386 $offset_star_end=$hash_bp{$mature_beg+$offset_end};
387 }else{
388 $offset_end++;
389 }
390 }
391 #the +2 is for the overhang
392 my $star_end=$offset_star_end+$offset_end+2;
393 return ($star_beg, $star_end);
394 }
395
396 sub arm_star{
397 #tests whether the star sequence is in the 5' ('first') or 3' ('second') arm of the potential precursor
398 my ($beg, $end, $pri_end, $mature_arm, $mature_beg, $mature_end, $g_struct)=@_;
399 #unless the begin and end positions are plausible, test negative
400 return (0,'') unless ($beg>0 and $end>=$beg and $end<=$pri_end);
401 #must be no overlap between the mature and the star sequence
402 return (0,'') if ($mature_arm==1 && $mature_end>=$beg); #check ?
403 return (0,'') if ($mature_arm==2 && $mature_beg<=$end);
404 #there should be no bifurcations and minimum one base pairing
405
406 my $struct=excise_str($g_struct,$beg,$end);
407 my $r1=index($struct,'(');
408 my $r2=index($struct,')');
409 if ($r2<0 && $r1>=0) { #($struct=~/^(\(|\.)+$/ and $struct=~/\(/)
410 return (1,$struct);
411 }
412 if ($r1<0 && $r2>=0) { # ($struct=~/^(\)|\.)+$/ and $struct=~/\)/)
413 return (2, $struct);
414 }
415 return (0, $struct);
416 }
417
418
419
420 sub get_qloc {
421 my $qd=$qmaps{$_[0]} || die ("Error: couldn't retrieve read $_[0] info (on $reg_id)\n");
422 return ($$qd[2], $$qd[3]);
423 }
424
425
426 sub excise_str {
427 #excise sub structure
428 my($struct,$beg,$end)=@_;
429 my $lng=length($struct);
430 #begin can be equal to end if only one nucleotide is excised
431 die("Error: invalid begin..end coordinates given to excise_str ($reg_id\:$beg..$end) lng=$lng\n(struct: $struct)\n")
432 if ($beg>$end || $beg>$lng || $end>$lng);
433 #rarely, permuted combinations of signature and structure cause out of bound excision errors.
434 #this happens once appr. every two thousand combinations
435 #return 0 unless ($beg<=length($struct));
436
437 #if excising relative to minus strand, positions are reversed
438 #if($strand eq "-"){($beg,$end)=rev_pos($beg,$end,$lng);}
439 return substr($struct,$beg-1,$end-$beg+1);
440 }
441
442
443 sub RNAfold {
444 my ($fh, $rfd)=@_;
445 # 0 1 2 3 4
446 # @$rfd=(region_id, region_descr, region_seq, region_struct, region_mfe)
447
448 @$rfd=();
449 my $rgid;
450 $_=<$fh>; #header line
451 return '' unless $_;
452 my $rgdescr;
453 ($rgid, $rgdescr)=(m/^>(\S+)\s+(\S+)/);
454 push(@$rfd, $rgid, $rgdescr);
455 my $seq=<$fh>; # sequence line
456 return '' unless $seq; #terminated prematurely?
457 $seq=~tr/\n\r \t//d;
458 my ($structure, $mfe)=RNA::fold($seq);
459 if (length($structure)!=length($seq)) {
460 die("Error: RNA::fold returned a structure with different length!\n".
461 "seq: $seq\nstr: $structure\n");
462 }
463 push(@$rfd, $seq, $structure, $mfe);
464 return $rgid;
465 }
466
467
468 sub readXmaps {
469 my ($fh, $rxmaps, $hxmaps)=@_;
470 @$rxmaps=(); #clear the array
471 %$hxmaps=();
472 $qmaxfreq='';
473 @qmaxfreqdata=();
474 #my $freqmax=0;
475 my $rgid;
476 my @rmaps;
477 while (<$fh>) {
478 if (m/^>(\S+)/) {
479 if ($rgid) {
480 seek($fh, -length($_), 1);
481 last; # --> jump to processing
482 }
483 $rgid=$1;
484 next;
485 }
486 chomp;
487 my @t=split(/\t/);
488 die("Error: invalid xmaps line format ($_)!\n")
489 unless ($t[1]>0 && $t[2]>$t[1]);
490 my $qname=shift(@t);
491 my ($qfreq)=($qname=~m/_x(\d+)$/);
492 #insert read freq and store the whole thing
493 my $rdata=[$qname, $qfreq, @t];
494 # 0 1 2 3 4 5
495 # [readID_xN, read_freq, read_alnstart, read_alnend, mismatches, intronInfo]
496 #push(@$rxmaps, $rdata);
497 push(@rmaps, $rdata);
498 #$hxmaps->{$qname}=$rdata;
499 }
500 die("Error: no mappings found for region $rgid !\n") unless @rmaps>0;
501 #collapse near-duplicate mappings (and sum up the frequencies)
502 # -- sort by start position
503 @rmaps = sort { ($a->[2]==$b->[2])? $a->[3]<=>$b->[3] : $a->[2] <=> $b->[2] } @rmaps;
504
505 my $i=0;
506 while ($i+1 < @rmaps) {
507 my $pdel=0;
508 for (my $j=$i+1;$j<@rmaps;$j++) {
509 my $pd=$rmaps[$i];
510 my $cd=$rmaps[$j];
511 my ($fr, $idel, $ikeep) = ($$cd[1]>$$pd[1])? ($$cd[1]/$$pd[1], $i, $j) : ($$pd[1]/$$cd[1], $j, $i); #freq ratio
512 my $plen=$$pd[3]-$$pd[2]+1;
513 my $clen=$$cd[3]-$$cd[2]+1;
514 my $pmm=$rmaps[$i]->[4];
515 my $cmm=$rmaps[$j]->[4];
516 my $dbg=0;
517 #if ($rgid eq 'chrX_5' && !$cmm && !$pmm) {
518 # print STDERR "checking merge between $$pd[0] ($$pd[2]-$$pd[3]) and $$cd[0] ($$cd[2]-$$cd[3])\n";
519 # $dbg=1;
520 # }
521 $pmm=~s/\d+\://g;
522 $cmm=~s/\d+\://g;
523 last if ($$cd[2]-$$pd[2]>16);
524 $fr=200 if ($$cd[1]<20 && $$pd[1]<20);
525 if ( ($$cd[2]-$$pd[2]<3 && abs($$cd[3]-$$pd[3])<3) && abs($plen-$clen)<5 && $pmm eq $cmm && $fr<3.6) { #collapse very close, similar mappings
526 #if ($dbg) {
527 # print STDERR "..merging $rmaps[$idel]->[0] ($rmaps[$idel]->[1]\:$rmaps[$idel]->[2]-$rmaps[$idel]->[3]) into ".
528 # "$rmaps[$ikeep]->[0] ($rmaps[$ikeep]->[1]\:$rmaps[$ikeep]->[2]-$rmaps[$ikeep]->[3])\n";
529 # if $rmaps[$ikeep]->[0] eq 'E2s5T00116060_x5438';
530 # }
531 $rmaps[$ikeep]->[1] += $rmaps[$idel]->[1];
532 $rmaps[$ikeep]->[2] = $pd->[2];
533 $rmaps[$ikeep]->[3] = ($$cd[3]>$$pd[3])? $$cd[3] : $$pd[3];
534 $pdel=1 if $idel==$i;
535 splice(@rmaps, $idel, 1);
536 last if $pdel;
537 }
538 } #for each $j (read mapping after $i)
539 $i++ unless $pdel;
540 } #for each read mapping $i
541 #also populate the hash
542 my $freqmax=0;
543 foreach my $rd (@rmaps) {
544 push (@$rxmaps, $rd);
545 $hxmaps->{$$rd[0]}=$rd;
546 if ($$rd[1]>$freqmax) {
547 $qmaxfreq=$$rd[0];
548 $freqmax=$$rd[1];
549 }
550 }
551 #print STDERR " DBG: $rgid : qmaxfreq=$qmaxfreq\n" if $verbose;
552 @qmaxfreqdata=@{$hxmaps->{$qmaxfreq}};
553 return $rgid;
554 }
555
556
557 sub test_randfold {
558 my ($rseq)=@_;
559 #print sequence to temporary file, test randfold value, return 1 or 0
560 #print_file("pri_seq.fa",">pri_seq\n".$hash_comp{"pri_seq"});
561 my $p_value=`echo ">\\n$rseq" | randfold -s - 200 | cut -f3`;
562 chomp($p_value);
563 if($p_value<=0.05){return 1;}
564 return 0;
565 }
566
567 sub score_mfe{
568 # scores the minimum free energy in kCal/mol of the potential precursor
569 # Assumes Gumbel distribution as described in methods section of miRDeep manuscript
570 my $mfe=shift;
571 #numerical value, minimum 1
572 my $mfe_adj = (-$mfe>1) ? -$mfe : 1 ;
573 #parameters of known precursors and background hairpins, scale and location
574 my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32);
575 my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23);
576 my $odds=$prob_test/$prob_background;
577 my $log_odds=log($odds);
578 return $log_odds;
579 }
580
581 sub prob_gumbel_discretized{
582 # discretized Gumbel distribution, probabilities within windows of 1 kCal/mol
583 # uses the subroutine that calculates the cdf to find the probabilities
584 my ($var,$scale,$location)=@_;
585 my $bound_lower=$var-0.5;
586 my $bound_upper=$var+0.5;
587 my $cdf_lower=cdf_gumbel($bound_lower,$scale,$location);
588 my $cdf_upper=cdf_gumbel($bound_upper,$scale,$location);
589 my $prob=$cdf_upper-$cdf_lower;
590 return $prob;
591 }
592
593 sub cdf_gumbel{
594 # calculates the cumulative distribution function of the Gumbel distribution
595 my ($var,$scale,$location)=@_;
596 my $cdf=$e**(-($e**(-($var-$location)/$scale)));
597 return $cdf;
598 }
599
600 sub score_freq{
601 # scores the count of reads that map to the potential precursor
602 # Assumes geometric distribution as described in methods section of manuscript
603 my $freq=shift;
604 #parameters of known precursors and background hairpins
605 my $parameter_test=0.999;
606 my $parameter_control=0.6;
607 #log_odds calculated directly to avoid underflow
608 my $intercept=log((1-$parameter_test)/(1-$parameter_control));
609 my $slope=log($parameter_test/$parameter_control);
610 my $log_odds=$slope*$freq+$intercept;
611 #if no strong evidence for 3' overhangs, limit the score contribution to 0
612 #unless($options{x} or $hash_comp{"star_read"}){$log_odds=min2($log_odds,0);}
613 return $log_odds;
614 }

Properties

Name Value
svn:executable *