ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/prep_reads.cpp
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (7 years, 8 months ago) by gpertea
File size: 10957 byte(s)
Log Message:
massive update with Daehwan's work

Line File contents
1 /*
2 * polyA_reads.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 9/2/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 * Derived from maq "catfilter", by Heng Li at Sanger
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <stdio.h>
15 #include <cassert>
16 #include <vector>
17 #include <cstring>
18 #include <cstdlib>
19
20 #include "common.h"
21 #include "reads.h"
22 #include "tokenize.h"
23 #include "qual.h"
24
25 //bool fastq_db = true;
26
27 using namespace std;
28
29 void format_qual_string(string& qual_str)
30 {
31 for (size_t i = 0; i < qual_str.size(); ++i)
32 {
33 qual_str[i] = charToPhred33(qual_str[i],
34 solexa_quals,
35 phred64_quals);
36 }
37 }
38
39 bool readmap_loaded=false;
40 vector<bool> readmap; //for filter_multihits
41
42 void load_readmap() {
43 readmap_loaded=false;
44 if (flt_reads.empty()) return;
45 //FZPipe(filter_multihits, false);
46 FILE* rfile=fopen(flt_reads.c_str(),"r");
47 if (rfile==NULL)
48 err_die("Error: cannot read file %s\n", flt_reads.c_str());
49 FLineReader fr(rfile);
50 skip_lines(fr); //should not be needed, bowtie doesn't add extra header lines for this
51 Read read;
52 while (next_fastx_read(fr, read, reads_format)) {
53 uint32_t rnum=(uint32_t)atol(read.name.c_str());
54 if (rnum>=(uint32_t) readmap.size())
55 readmap.resize(rnum+1, false);
56 readmap[rnum] = true;
57 }
58 readmap_loaded=true;
59 fclose(rfile);
60 }
61
62 bool check_readmap(uint32_t read_id) {
63 if (read_id>=readmap.size()) return false;
64 else return readmap[read_id];
65 }
66 void flt_reads_and_hits(vector<FZPipe>& reads_files) {
67 if (!readmap_loaded)
68 err_die("Error: filtering reads not enabled, aborting.");
69 if (aux_outfile.empty())
70 err_die("Error: auxiliary output file not provided.");
71 if (index_outfile.empty())
72 err_die("Error: index output file not provided.");
73
74 // -- filter mappings:
75 string fext=getFext(flt_mappings);
76 if (fext=="bam") {
77 //TODO
78 //WRITEME: use samtools C API to write a quick bam filter
79 err_die("Error: cannot handle BAM files yet![WRITEME]");
80 return;
81 }
82 bool is_sam=false;
83 string s(flt_mappings);
84 for(size_t i=0; i!=s.length(); ++i)
85 s[i] = std::tolower(s[i]);
86 if (fext=="sam" || s.rfind(".sam.")+fext.length()+5==s.length()) is_sam=true;
87 string unzcmd=getUnpackCmd(flt_mappings);
88 FZPipe hitsfile(flt_mappings, unzcmd);
89 FLineReader fh(hitsfile);
90 FZPipe outfile;
91 outfile.openWrite(aux_outfile.c_str(), zpacker);
92 if (outfile.file==NULL) err_die("Error: cannot create file %s", aux_outfile.c_str());
93 const char* line;
94 while ((line=fh.nextLine())) {
95 if (is_sam && line[0]=='@') {
96 fprintf(outfile.file, "%s\n", line);
97 continue;
98 }
99 char* tab=strchr((char*)line, '\t');
100 if (tab==NULL)
101 err_die("Error: cannot find tab character in %s mappings line:\n%s",
102 flt_mappings.c_str(),line);
103 *tab=0;
104 uint32_t rid = (uint32_t) atol(line);
105 if (rid==0)
106 err_die("Error: invalid read ID (%s) parsed from mapping file %s",
107 line, flt_mappings.c_str());
108 *tab='\t';
109 if (check_readmap(rid)) {
110 fprintf(outfile.file, "%s\n", line);
111 }
112 }
113 outfile.close();
114 hitsfile.close();
115 // -- now filter reads
116 for (size_t fi = 0; fi < reads_files.size(); ++fi) {
117 //only one file expected here, this is not the initial prep_reads
118 Read read;
119 FLineReader fr(reads_files[fi]);
120 //skip_lines(fr);
121 while (next_fastx_read(fr, read)) {
122 uint32_t rnum=(uint32_t)atol(read.name.c_str());
123 if (check_readmap(rnum))
124 printf("@%s\n%s\n+%s\n%s\n",
125 read.name.c_str(),
126 read.seq.c_str(),
127 read.alt_name.c_str(),
128 read.qual.c_str());
129 }
130 }
131 }
132
133
134 void process_reads(vector<FZPipe>& reads_files, vector<FZPipe>& quals_files)
135 {
136 //TODO: add the option to write the garbage reads into separate file(s)
137 int num_reads_chucked = 0;
138 int multimap_chucked = 0;
139 int min_read_len = 20000000;
140 int max_read_len = 0;
141 uint32_t next_id = 0;
142 uint64_t file_offset = 0;
143 FILE* fw=NULL;
144 if (!aux_outfile.empty()) {
145 fw=fopen(aux_outfile.c_str(), "w");
146 if (fw==NULL)
147 err_die("Error: cannot create file %s\n",aux_outfile.c_str());
148 }
149
150 FILE* findex = NULL;
151 if (!index_outfile.empty()) {
152 findex = fopen(index_outfile.c_str(), "w");
153 if (findex == NULL)
154 err_die("Error: cannot create file %s\n", index_outfile.c_str());
155 }
156
157 for (size_t fi = 0; fi < reads_files.size(); ++fi)
158 {
159 Read read;
160 FLineReader fr(reads_files[fi]);
161 skip_lines(fr);
162 FZPipe fq;
163 if (quals)
164 fq = quals_files[fi];
165
166 FLineReader frq(fq);
167 skip_lines(frq);
168
169 while (!fr.isEof()) {
170 //read.clear();
171 // Get the next read from the file
172 if (!next_fastx_read(fr, read, reads_format, ((quals) ? &frq : NULL)))
173 break;
174
175 ++next_id;
176 //IMPORTANT: to keep paired reads in sync, this must be
177 //incremented BEFORE any reads are chucked !
178
179 if (read.seq.length()<12) {
180 ++num_reads_chucked;
181 continue;
182 }
183 if ((int)read.seq.length()<min_read_len)
184 min_read_len=read.seq.length();
185 if ((int)read.seq.length()>max_read_len)
186 max_read_len=read.seq.length();
187
188 // daehwan - check this later, it's due to bowtie
189 if (color && read.seq[1] == '4') {
190 ++num_reads_chucked;
191 continue;
192 }
193
194 if (readmap_loaded && check_readmap(next_id)) {
195 ++num_reads_chucked;
196 ++multimap_chucked;
197 continue;
198 }
199 format_qual_string(read.qual);
200 std::transform(read.seq.begin(), read.seq.end(), read.seq.begin(), ::toupper);
201 char counts[256];
202 memset(counts, 0, sizeof(counts));
203 // Count up the bad characters
204 for (unsigned int i = 0; i != read.seq.length(); ++i)
205 {
206 char c = (char)toupper(read.seq[i]);
207 counts[(size_t)c]++;
208 }
209
210 double percent_A = (double)(counts[(size_t)'A']) / read.seq.length();
211 double percent_C = (double)(counts[(size_t)'C']) / read.seq.length();
212 double percent_G = (double)(counts[(size_t)'G']) / read.seq.length();
213 double percent_T = (double)(counts[(size_t)'T']) / read.seq.length();
214 double percent_N = (double)(counts[(size_t)'N']) / read.seq.length();
215 double percent_4 = (double)(counts[(size_t)'4']) / read.seq.length();
216
217 // Chuck the read if there are at least 5 'N's or if it's mostly
218 // (>90%) 'N's and 'A's
219
220 if (percent_A > 0.9 ||
221 percent_C > 0.9 ||
222 percent_G > 0.9 ||
223 percent_T > 0.9 ||
224 percent_N >= 0.1 ||
225 percent_4 >= 0.1)
226 {
227 ++num_reads_chucked;
228 }
229 else
230 {
231 // daehwan - we should not use buf in printf function
232 // because it may contain some control characters such as "\" from quality values.
233 // Here, buf is only used for calculating the file offset
234 char buf[2048] = {0};
235 if (reads_format == FASTQ or (reads_format == FASTA && quals))
236 {
237 sprintf(buf, "@%u\n%s\n+%s\n%s\n",
238 next_id,
239 read.seq.c_str(),
240 read.name.c_str(),
241 read.qual.c_str());
242
243 printf("@%u\n%s\n+%s\n%s\n",
244 next_id,
245 read.seq.c_str(),
246 read.name.c_str(),
247 read.qual.c_str());
248 }
249 else if (reads_format == FASTA)
250 {
251 string qual;
252 if (color)
253 qual = string(read.seq.length()-1, 'I').c_str();
254 else
255 qual = string(read.seq.length(), 'I').c_str();
256
257 sprintf(buf, "@%u\n%s\n+%s\n%s\n",
258 next_id,
259 read.seq.c_str(),
260 read.name.c_str(),
261 qual.c_str());
262
263 printf("@%u\n%s\n+%s\n%s\n",
264 next_id,
265 read.seq.c_str(),
266 read.name.c_str(),
267 qual.c_str());
268 }
269 else
270 {
271 assert(0);
272 }
273
274 if (findex != NULL)
275 {
276 if ((next_id - num_reads_chucked) % 1000 == 0)
277 fprintf(findex, "%d\t%lu\n", next_id, file_offset);
278 }
279
280 file_offset += strlen(buf);
281 }
282 } //while !fr.isEof()
283 fr.close();
284 frq.close();
285 } //for each input file
286 fprintf(stderr, "%u out of %u reads have been filtered out\n",
287 num_reads_chucked, next_id);
288 if (readmap_loaded)
289 fprintf(stderr, "\t(%u filtered out due to %s)\n",
290 multimap_chucked, flt_reads.c_str());
291 if (fw!=NULL) {
292 fprintf(fw, "min_read_len=%d\n",min_read_len - (color ? 1 : 0));
293 fprintf(fw, "max_read_len=%d\n",max_read_len - (color ? 1 : 0));
294 fprintf(fw, "reads_in =%d\n",next_id);
295 fprintf(fw, "reads_out=%d\n",next_id-num_reads_chucked);
296 fclose(fw);
297 }
298
299 if (findex != NULL)
300 fclose(findex);
301 }
302
303 void print_usage()
304 {
305 fprintf(stderr, "Usage:\n prep_reads [--filter-multi <multi.fq>] <reads1.fa/fq,...,readsN.fa/fq>\n");
306 //alternate usage (--ctv_to_num) : doesn't filter out any reads,
307 //but simply convert read names to numeric ids
308 }
309
310
311 int main(int argc, char *argv[])
312 {
313 fprintf(stderr, "prep_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
314 fprintf(stderr, "---------------------------\n");
315
316 int parse_ret = parse_options(argc, argv, print_usage);
317 if (parse_ret)
318 return parse_ret;
319
320 if(optind >= argc)
321 {
322 print_usage();
323 return 1;
324 }
325
326 string reads_file_list = argv[optind++];
327
328 vector<string> reads_file_names;
329 vector<FZPipe> reads_files;
330 tokenize(reads_file_list, ",",reads_file_names);
331 for (size_t i = 0; i < reads_file_names.size(); ++i)
332 {
333 string fname=reads_file_names[i];
334 string pipecmd=guess_packer(fname, true);
335 if (!pipecmd.empty()) pipecmd.append(" -cd");
336 FZPipe seg_file(fname,pipecmd);
337 if (seg_file.file == NULL) {
338 fprintf(stderr, "Error: cannot open reads file %s for reading\n",
339 reads_file_names[i].c_str());
340 exit(1);
341 }
342 reads_files.push_back(seg_file);
343 }
344
345 vector<string> quals_file_names;
346 vector<FZPipe> quals_files;
347 if (quals)
348 {
349 string quals_file_list = argv[optind++];
350 tokenize(quals_file_list, ",", quals_file_names);
351 for (size_t i = 0; i < quals_file_names.size(); ++i)
352 {
353 string fname(quals_file_names[i]);
354 string pipecmd=guess_packer(fname, true);
355 FZPipe seg_file(fname, pipecmd);
356 if (seg_file.file == NULL)
357 {
358 fprintf(stderr, "Error: cannot open reads file %s for reading\n",
359 quals_file_names[i].c_str());
360 exit(1);
361 }
362 quals_files.push_back(seg_file);
363 }
364 }
365 load_readmap();
366 // Only print to standard out the good reads
367 //TODO: a better, more generic read filtering protocol
368 if (flt_mappings.empty())
369 process_reads(reads_files, quals_files);
370 else //special use case: filter previous bowtie results
371 flt_reads_and_hits(reads_files);
372
373 return 0;
374 }