ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/rnaseq_flt2fq.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 2 months ago) by gpertea
File size: 2853 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 23 #!/usr/bin/perl
2     use strict;
3     use Getopt::Std;
4    
5     my $usage = q/Usage:
6     rnaseq_flt2fq.pl [-Q] [-D] [-o <out.fq>] [-p <name_prefix>] <flt_file>
7    
8     Use -D to apply a minimal dust filter and not output any reads
9     that are more than 50% low-complexity
10    
11     /;
12     umask 0002;
13     getopts('QDo:p:') || die($usage."\n");
14     my $prefix=$Getopt::Std::opt_p || 'MIR0';
15     my $dust=$Getopt::Std::opt_D;
16     my $isfq=$Getopt::Std::opt_Q;
17     my $outfile=$Getopt::Std::opt_o;
18     if ($outfile) {
19     open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
20     select(OUTF);
21     }
22     # --
23    
24     my $minlen=16;
25     my $rc=0; #counter
26     my $totalreads=0;
27     my $discarded=0;
28     my %h; # $h{read}=[count, 'quals'];
29     # also computes qualities by averaging the values
30     while (<>) {
31     chomp;
32     next unless length($_)>0;
33     my ($seq, $qv);
34     if ($isfq) {
35     die("formatting Error! (\@ missing)\n") unless m/^\@/;
36     $seq=<>;
37     chomp($seq);
38     $_=<>;
39     die("formatting Error! (+ missing)\n") unless m/^\+/;
40     $qv=<>;
41     chomp($qv);
42     }
43     else { #simple 2-column format: sequence, quality values
44     ($seq, $qv)=split;
45     }
46     $totalreads++;
47     die("Error: quality value length doesn't match sequence length!\n")
48     unless (length($seq)==length($qv));
49     my $plen=length($seq);
50     if ($seq=~s/^n+//i) {
51     $qv=substr($qv,$plen-length($seq));
52     }
53     if (length($seq)<16) {
54     $discarded++;
55     next;
56     }
57     $rc++;
58     storeRead(uc($seq), $qv);
59     #printfq($seq, $qv);
60     }
61    
62    
63     # sprintf('MIRD1%08d',$rc),
64     #print join("\n",'@'.$id, uc($seq), '+',$quals)."\n";
65    
66     print STDERR "$discarded reads discarded out of initial $totalreads reads\n";
67     my $newcount=keys(%h);
68     print STDERR "After collapsing duplicates, the new total is: $newcount reads\n";
69     my $counter=0;
70     my $dustremoved=0;
71     while (my ($seq, $sd)=each(%h)) {
72     my $id=sprintf($prefix.'%08d',$counter+1);
73     if ($dust) {
74     my $mseq=$seq;
75     $mseq =~ s/((\w+?)\2{5,})/'N' x length $1/oeg;
76     my $masked=($mseq=~tr/N//); #bases masked
77     if ($masked>(length($seq)>>1)) { #>50% masked
78     $dustremoved++;
79     next;
80     }
81     }
82     print join("\n", '@'.$id.'_x'.$sd->[0], $seq, '+',$sd->[1])."\n";
83     $counter++;
84     }
85     my $dustmsg='';
86     $dustmsg=" (dust filter discarded $dustremoved reads)" if ($dustremoved>0);
87     print STDERR "$counter fastq records written.$dustmsg\n";
88    
89     # --
90     if ($outfile) {
91     select(STDOUT);
92     close(OUTF);
93     }
94    
95     #============ Subroutines:
96    
97     sub storeRead {
98     my ($seq, $qv)=@_;
99     my @q=unpack('C*', $qv); #get ascii values
100     my $slen=length($qv);
101     my $sd=$h{$seq};
102     if ($sd) { # same read found before
103     $sd->[0]++;
104     my @oldq=unpack('C*',$sd->[1]); #old qualities
105     my $i=0;
106     my $quals='';
107     foreach my $q0 (@oldq) {
108     $quals.= chr( int(($q[$i]-31+$q0)/2) ); #average qualities
109     $i++;
110     }
111     $sd->[1]=$quals; #update qualities
112     }
113     else {
114     my $quals='';
115     foreach my $c (@q) {
116     $quals .= chr($c-31);
117     }
118     $h{$seq}=[1, $quals];
119     }
120     }

Properties

Name Value
svn:executable *