ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/scripts/fq_extract.pl
Revision: 160
Committed: Fri Feb 3 15:31:57 2012 UTC (7 years, 7 months ago) by gpertea
File size: 2800 byte(s)
Log Message:
added fq_extract script

Line File contents
1 #!/usr/bin/perl
2 use strict;
3 use Getopt::Std;
4 use FindBin;use lib $FindBin::Bin;
5
6 my $usage = q/Usage:
7 fq_extract.pl [-o <outprefix> ] insert_names.lst reads_1.fq[.bz2] [ reads_2.fq[.bz2] ]
8 Given a list of read names (actually insert names), extracts the
9 [paired] reads from input fastq files.
10 /;
11 umask 0002;
12 getopts('o:') || die($usage."\n");
13 my $outprefix=$Getopt::Std::opt_o || 'extracted';
14 my $flst=shift(@ARGV);
15 die($usage."\n") unless $flst eq '-' || -f $flst;
16 die($usage."\n") unless @ARGV>0 && @ARGV<3;
17
18 my %h; #hash holding the insert names
19
20 open(LST, $flst) || die("Error opening file $flst\n");
21 while (<LST>) {
22 chomp;
23 my @l=split;
24 next unless $l[0];
25 $l[0]=~s/\/[12]$//;
26 $h{$l[0]}=1;
27 }
28 close(LST) unless $flst eq '-';
29 my $n=keys(%h);
30 my @f=@ARGV;
31 die($usage."\nError: list file ($flst) has no entries !\n") if $n==0;
32 my $paired=(@f==2);
33 if ($paired) {
34 print STDERR "$n read pairs to be extracted.\n";
35 }
36 else{
37 print STDERR "$n reads to be extracted.\n";
38 }
39
40
41 my @fh=(undef, undef);
42 #determine compression, if any
43 my @fz=('','');
44 my @fno=($outprefix.'_1.fq', $outprefix.'_2.fq');
45 $fno[0]=$outprefix.'.fq' unless $paired;
46 my @foh=(undef, undef);
47 for (my $i=0;$i<@f;$i++) {
48 last unless $f[$i];
49 if ($f[$i]=~m/\.bzi?p?2$/) {
50 $fz[$i]='bzip2';
51 }
52 elsif ($f[$i]=~m/.g?zi?p?$/) {
53 $fz[$i]='gzip';
54 }
55 if ($fz[$i]) {
56 open($fh[$i], $fz[$i]." -cd '".$f[$i]."'|") ||
57 die("Error creating decompression pipe: $fz[0] -cd '$f[$i]' !\n");
58 }
59 else {
60 open($fh[$i], $f[$i]) || die("Error opening file $f[$i] ! \n");
61 }
62 open($foh[$i], '>'.$fno[$i]) || die("Error opening file $fno[$i] !\n");
63 }
64
65
66 while (1) {
67 my ($rname, $rseq, $rquals)=getFastq($fh[0]);
68 last unless $rname;
69 my $iname=$rname;
70 $iname=~s/\/[12]$//;
71 my ($mname, $mseq, $mquals);
72 if ($f[1]) {
73 ($mname, $mseq, $mquals)=getFastq($fh[1]);
74 my $check=$mname;
75 $check=~s/\/[12]$//;
76 die("Error: cannot find mate for $rname (found $mname instead)!\n")
77 unless $check eq $iname;
78 }
79 next unless exists($h{$iname});
80 print { $foh[0] }'@'.$rname."\n$rseq\n+\n$rquals\n";
81 if ($mname) {
82 print { $foh[1] } '@'.$mname."\n$mseq\n+\n$mquals\n";
83 }
84 }
85
86
87 for (my $i=0;$i<@f;$i++) {
88 last unless $f[$i];
89 close($foh[$i]);
90 close($fh[$i]);
91 }
92
93 #************ Subroutines **************
94
95 sub getFastq {
96 my $fh=$_[0]; # file handle
97 #parses next FASTQ record
98 #returns ($readname, $readseq, $readquals)
99 my ($rname, $rseq, $rquals);
100 while (<$fh>) {
101 ($rname)=(m/^@(\S+)/);
102 last if $rname;
103 }
104 if ($_) {
105 while (<$fh>) {
106 last if m/^\+/;
107 chomp;
108 $rseq.=$_;
109 }
110 if ($_) {
111 while (<$fh>) {
112 chomp;
113 $rquals.=$_;
114 last if (length($rseq)<=length($rquals));
115 }
116 }
117 }
118 return ($rname, $rseq, $rquals);
119 }

Properties

Name Value
svn:executable *