ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/rnaseq_flt2fq.pl
Revision: 24
Committed: Tue Jul 26 21:46:39 2011 UTC (8 years, 1 month ago) by gpertea
File size: 2853 byte(s)
Log Message:
Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4
5 my $usage = q/Usage:
6 rnaseq_flt2fq.pl [-Q] [-D] [-o <out.fq>] [-p <name_prefix>] <flt_file>
7
8 Use -D to apply a minimal dust filter and not output any reads
9 that are more than 50% low-complexity
10
11 /;
12 umask 0002;
13 getopts('QDo:p:') || die($usage."\n");
14 my $prefix=$Getopt::Std::opt_p || 'MIR0';
15 my $dust=$Getopt::Std::opt_D;
16 my $isfq=$Getopt::Std::opt_Q;
17 my $outfile=$Getopt::Std::opt_o;
18 if ($outfile) {
19 open(OUTF, '>'.$outfile) || die("Error creating output file $outfile\n");
20 select(OUTF);
21 }
22 # --
23
24 my $minlen=16;
25 my $rc=0; #counter
26 my $totalreads=0;
27 my $discarded=0;
28 my %h; # $h{read}=[count, 'quals'];
29 # also computes qualities by averaging the values
30 while (<>) {
31 chomp;
32 next unless length($_)>0;
33 my ($seq, $qv);
34 if ($isfq) {
35 die("formatting Error! (\@ missing)\n") unless m/^\@/;
36 $seq=<>;
37 chomp($seq);
38 $_=<>;
39 die("formatting Error! (+ missing)\n") unless m/^\+/;
40 $qv=<>;
41 chomp($qv);
42 }
43 else { #simple 2-column format: sequence, quality values
44 ($seq, $qv)=split;
45 }
46 $totalreads++;
47 die("Error: quality value length doesn't match sequence length!\n")
48 unless (length($seq)==length($qv));
49 my $plen=length($seq);
50 if ($seq=~s/^n+//i) {
51 $qv=substr($qv,$plen-length($seq));
52 }
53 if (length($seq)<16) {
54 $discarded++;
55 next;
56 }
57 $rc++;
58 storeRead(uc($seq), $qv);
59 #printfq($seq, $qv);
60 }
61
62
63 # sprintf('MIRD1%08d',$rc),
64 #print join("\n",'@'.$id, uc($seq), '+',$quals)."\n";
65
66 print STDERR "$discarded reads discarded out of initial $totalreads reads\n";
67 my $newcount=keys(%h);
68 print STDERR "After collapsing duplicates, the new total is: $newcount reads\n";
69 my $counter=0;
70 my $dustremoved=0;
71 while (my ($seq, $sd)=each(%h)) {
72 my $id=sprintf($prefix.'%08d',$counter+1);
73 if ($dust) {
74 my $mseq=$seq;
75 $mseq =~ s/((\w+?)\2{5,})/'N' x length $1/oeg;
76 my $masked=($mseq=~tr/N//); #bases masked
77 if ($masked>(length($seq)>>1)) { #>50% masked
78 $dustremoved++;
79 next;
80 }
81 }
82 print join("\n", '@'.$id.'_x'.$sd->[0], $seq, '+',$sd->[1])."\n";
83 $counter++;
84 }
85 my $dustmsg='';
86 $dustmsg=" (dust filter discarded $dustremoved reads)" if ($dustremoved>0);
87 print STDERR "$counter fastq records written.$dustmsg\n";
88
89 # --
90 if ($outfile) {
91 select(STDOUT);
92 close(OUTF);
93 }
94
95 #============ Subroutines:
96
97 sub storeRead {
98 my ($seq, $qv)=@_;
99 my @q=unpack('C*', $qv); #get ascii values
100 my $slen=length($qv);
101 my $sd=$h{$seq};
102 if ($sd) { # same read found before
103 $sd->[0]++;
104 my @oldq=unpack('C*',$sd->[1]); #old qualities
105 my $i=0;
106 my $quals='';
107 foreach my $q0 (@oldq) {
108 $quals.= chr( int(($q[$i]-31+$q0)/2) ); #average qualities
109 $i++;
110 }
111 $sd->[1]=$quals; #update qualities
112 }
113 else {
114 my $quals='';
115 foreach my $c (@q) {
116 $quals .= chr($c-31);
117 }
118 $h{$seq}=[1, $quals];
119 }
120 }

Properties

Name Value
svn:executable *