1 |
gpertea |
23 |
#!/usr/bin/perl |
2 |
|
|
#simple dust filter for short reads: |
3 |
|
|
# if more than 50% of the sequence is masked, don't print the read |
4 |
|
|
use strict; |
5 |
|
|
while (<>) { |
6 |
|
|
if (m/^@(\S+)/) { |
7 |
|
|
my $seqname=$1; |
8 |
|
|
my $seq=<>; |
9 |
|
|
$seq=~tr/\n\r//d; |
10 |
|
|
my $qh=<>; |
11 |
|
|
#chomp($qh); |
12 |
|
|
$qh=~tr/\n\r//d; |
13 |
|
|
die("Error: 3rd line of $seqname must start with '+' !\n") |
14 |
|
|
unless ($qh eq '+') || ($qh eq '+'.$seqname); |
15 |
|
|
my $quals=<>; |
16 |
|
|
my $mseq=$seq; |
17 |
|
|
$mseq =~ s/((\w+?)\2{5,})/'N' x length $1/oeg; |
18 |
|
|
my $masked=($mseq=~tr/N//); |
19 |
|
|
if ($masked<(length($seq)>>1)) { |
20 |
|
|
print "\@$seqname\n$seq\n+\n$quals"; |
21 |
|
|
} |
22 |
|
|
#else { |
23 |
|
|
# print STDERR "Discarding: $seq\n"; |
24 |
|
|
# } |
25 |
|
|
} |
26 |
|
|
} |