ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/reads.cpp
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (7 years, 8 months ago) by gpertea
File size: 17804 byte(s)
Log Message:
massive update with Daehwan's work

Line File contents
1 /*
2 * reads.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 9/2/48.
6 * Copyright 2448 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14 #include <iostream>
15 #include <cassert>
16 #include <string>
17 #include <algorithm>
18 #include <cstring>
19 #include <cstdlib>
20
21 #include <seqan/find.h>
22 #include <seqan/file.h>
23 #include <seqan/modifier.h>
24
25 #include "reads.h"
26 #include "bwt_map.h"
27 #include "tokenize.h"
28
29 using namespace std;
30
31 char* FLineReader::nextLine() {
32 if(!file) return NULL;
33 if (pushed) { pushed=false; return buf; }
34 //reads a char at a time until \n and/or \r are encountered
35 len=0;
36 int c=0;
37 while ((c=getc(file))!=EOF) {
38 if (len>=allocated-1) {
39 allocated+=512;
40 buf=(char*)realloc(buf,allocated);
41 }
42 if (c=='\n' || c=='\r') {
43 buf[len]='\0';
44 if (c=='\r') { //DOS file: double-char line terminator, skip the second one
45 if ((c=getc(file))!='\n')
46 ungetc(c,file);
47 }
48 lcount++;
49 return buf;
50 }
51 buf[len]=(char)c;
52 len++;
53 }
54 if (c==EOF) {
55 isEOF=true;
56 if (len==0) return NULL;
57 }
58 buf[len]='\0';
59 lcount++;
60 return buf;
61 }
62
63 void skip_lines(FLineReader& fr)
64 {
65 if (fr.fhandle() == NULL) return;
66 char* buf = NULL;
67 while ((buf = fr.nextLine()) != NULL) {
68 if (buf[0] == '\0') continue;
69 if (buf[0] == '>' || buf[0] == '@')
70 {
71 fr.pushBack();
72 break;
73 }
74 }
75 }
76
77 bool next_fasta_record(FLineReader& fr,
78 string& defline,
79 string& seq,
80 ReadFormat reads_format)
81
82 {
83 seq.clear();
84 defline.clear();
85 char* buf=NULL;
86 while ((buf=fr.nextLine())!=NULL) {
87 if (buf[0]==0) continue; //skip empty lines
88 if ((reads_format == FASTA && buf[0] == '>') || (reads_format == FASTQ && (buf[0] == '+' || buf[0] == '@'))) { //next record
89 if (seq.length()>0) { //current record ending
90 fr.pushBack();
91 return true;
92 }
93 defline=buf+1;
94 string::size_type space_pos = defline.find_first_of(" \t");
95 if (space_pos != string::npos) {
96 defline.resize(space_pos);
97 }
98 continue;
99 } //defline
100 // sequence line
101 seq.append(buf);
102 } //line reading loop
103
104 replace(seq.begin(), seq.end(), '.', color ? '4' : 'N'); //shouldn't really be needed for FASTA files
105 return !(seq.empty());
106 }
107
108 bool next_fastq_record(FLineReader& fr,
109 const string& seq,
110 string& alt_name,
111 string& qual,
112 ReadFormat reads_format)
113 {
114 alt_name.clear();
115 qual.clear();
116 char* fline=fr.nextLine();
117 if (fline==NULL) return false;
118 while (fline[0]==0) { //skip empty lines
119 fline=fr.nextLine();
120 if (fline==NULL) return false;
121 }
122 //must be on '+' line here
123 if (fline==NULL || (reads_format == FASTQ && fline[0] != '+') ||
124 (reads_format == FASTA && quals && fline[0] != '>')) {
125 err_exit("Error: '+' not found for fastq record %s\n",fline);
126 return false;
127 }
128 alt_name=fline+1;
129 string::size_type space_pos = alt_name.find_first_of(" \t");
130 if (space_pos != string::npos) alt_name.resize(space_pos);
131 //read qv line(s) now:
132 while ((fline=fr.nextLine())!=NULL) {
133 if (integer_quals)
134 {
135 vector<string> integer_qual_values;
136 tokenize(string(fline), " ", integer_qual_values);
137
138 string temp_qual;
139 for (size_t i = 0; i < integer_qual_values.size(); ++i)
140 {
141 int qual_value = atoi(integer_qual_values[i].c_str());
142 if (qual_value < 0) qual_value = 0;
143 temp_qual.push_back((char)(qual_value + 33));
144 }
145
146 qual.append(temp_qual);
147 }
148 else
149 qual.append(fline);
150 if (qual.length()>=seq.length()-1) break;
151 }
152 // final check
153 if (color) {
154 if (seq.length()==qual.length()) {
155 //discard first qv
156 qual=qual.substr(1);
157 }
158 if (seq.length()!=qual.length()+1) {
159 err_exit("Error: length of quality string does not match seq length (%d) for color read %s!\n",
160 seq.length(), alt_name.c_str());
161 }
162 }
163 else {
164 if (seq.length()!=qual.length()) {
165 err_exit("Error: qual string length (%d) differs from seq length (%d) for read %s!\n",
166 qual.length(), seq.length(), alt_name.c_str());
167 //return false;
168 }
169 }
170 //
171 return !(qual.empty());
172 }
173
174 bool next_fastx_read(FLineReader& fr, Read& read, ReadFormat reads_format,
175 FLineReader* frq) {
176 if (fr.pushed_read)
177 {
178 read = fr.last_read;
179 fr.pushed_read = false;
180 return true;
181 }
182
183 read.clear();
184 char* buf=NULL;
185 while ((buf=fr.nextLine())!=NULL) {
186 if (buf[0]==0) continue; //skip empty lines
187 if ((reads_format == FASTA && buf[0] == '>') ||
188 (reads_format == FASTQ && (buf[0] == '+' || buf[0] == '@'))) { //next record
189 if (read.seq.length()>0) { //current record ending
190 fr.pushBack();
191 break;
192 }
193 read.name=buf+1;
194 string::size_type space_pos = read.name.find_first_of(" \t");
195 if (space_pos != string::npos) {
196 read.name.resize(space_pos);
197 }
198 continue;
199 } //defline
200 // sequence line
201 read.seq.append(buf);
202 } //line reading loop
203
204 replace(read.seq.begin(), read.seq.end(), '.', color ? '4' : 'N'); //shouldn't really be needed for FASTA files
205 if (reads_format != FASTQ && frq==NULL)
206 return (!read.seq.empty());
207 if (frq==NULL) frq=&fr; //FASTQ
208 //FASTQ or quals in a separate file -- now read quality values
209 buf=frq->nextLine();
210 if (buf==NULL) return false;
211 while (buf[0]==0) { //skip empty lines
212 buf=frq->nextLine();
213 if (buf==NULL) return false;
214 }
215 //must be on '+' line here
216 if (buf==NULL || (reads_format == FASTQ && buf[0] != '+') ||
217 (reads_format == FASTA && buf[0] != '>')) {
218 err_exit("Error: beginning of quality values record not found! (%s)\n",buf);
219 return false;
220 }
221 read.alt_name=buf+1;
222 string::size_type space_pos = read.alt_name.find_first_of(" \t");
223 if (space_pos != string::npos) read.alt_name.resize(space_pos);
224 //read qv line(s) now:
225 while ((buf=frq->nextLine())!=NULL) {
226 if (integer_quals)
227 {
228 vector<string> integer_qual_values;
229 tokenize(string(buf), " ", integer_qual_values);
230 string temp_qual;
231 for (size_t i = 0; i < integer_qual_values.size(); ++i)
232 {
233 int qual_value = atoi(integer_qual_values[i].c_str());
234 if (qual_value < 0) qual_value = 0;
235 temp_qual.push_back((char)(qual_value + 33));
236 }
237 read.qual.append(temp_qual);
238 }
239 else {
240 read.qual.append(buf);
241 }
242 if (read.qual.length()>=read.seq.length()-1)
243 break;
244 } //while qv lines
245
246 // final check
247 if (color) {
248 if (read.seq.length()==read.qual.length()) {
249 //discard first qv
250 read.qual=read.qual.substr(1);
251 }
252 if (read.seq.length()!=read.qual.length()+1) {
253 err_exit("Error: length of quality string does not match sequence length (%d) for color read %s!\n",
254 read.seq.length(), read.alt_name.c_str());
255 }
256 }
257 else {
258 if (read.seq.length()!=read.qual.length()) {
259 err_exit("Error: qual length (%d) differs from seq length (%d) for fastq record %s!\n",
260 read.qual.length(), read.seq.length(), read.alt_name.c_str());
261 return false;
262 }
263 }
264
265 fr.last_read = read;
266 return !(read.seq.empty());
267 }
268
269 // This could be faster.
270 void reverse_complement(string& seq)
271 {
272 //fprintf(stderr,"fwd: %s\n", seq.c_str());
273 for (string::size_type i = 0; i < seq.length(); ++i)
274 {
275 switch(seq[i])
276 {
277 case 'A' : seq[i] = 'T'; break;
278 case 'T' : seq[i] = 'A'; break;
279 case 'C' : seq[i] = 'G'; break;
280 case 'G' : seq[i] = 'C'; break;
281 default: seq[i] = 'N'; break;
282 }
283 }
284 reverse(seq.begin(), seq.end());
285 //fprintf(stderr, "rev: %s\n", seq.c_str());
286 }
287
288 string convert_color_to_bp(const string& color)
289 {
290 if (color.length() <= 0)
291 return "";
292
293 char base = color[0];
294 string bp;
295 for (string::size_type i = 1; i < color.length(); ++i)
296 {
297 char next = color[i];
298 switch(base)
299 {
300 // 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N',
301 case 'A':
302 {
303 switch(next)
304 {
305 case '0': next = 'A'; break;
306 case '1': next = 'C'; break;
307 case '2': next = 'G'; break;
308 case '3': next = 'T'; break;
309 default: next = 'N'; break;
310 }
311 }
312 break;
313 case 'C':
314 {
315 // 'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N',
316 switch(next)
317 {
318 case '0': next = 'C'; break;
319 case '1': next = 'A'; break;
320 case '2': next = 'T'; break;
321 case '3': next = 'G'; break;
322 default: next = 'N'; break;
323 }
324 }
325 break;
326 case 'G':
327 {
328 // 'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N',
329 switch(next)
330 {
331 case '0': next = 'G'; break;
332 case '1': next = 'T'; break;
333 case '2': next = 'A'; break;
334 case '3': next = 'C'; break;
335 default: next = 'N'; break;
336 }
337 }
338 break;
339 case 'T':
340 {
341 // 'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N',
342 switch(next)
343 {
344 case '0': next = 'T'; break;
345 case '1': next = 'G'; break;
346 case '2': next = 'C'; break;
347 case '3': next = 'A'; break;
348 default: next = 'N'; break;
349 }
350 }
351 break;
352 default: next = 'N'; break;
353 }
354
355 bp.push_back(next);
356 base = next;
357 }
358
359 return bp;
360 }
361
362 // daehwan - reduce code redundancy!
363 seqan::String<char> convert_color_to_bp(char base, const seqan::String<char>& color)
364 {
365 if (seqan::length(color) <= 0)
366 return "";
367
368 string bp;
369 for (string::size_type i = 0; i < seqan::length(color); ++i)
370 {
371 char next = color[i];
372 switch(base)
373 {
374 // 'A0':'A', 'A1':'C', 'A2':'G', 'A3':'T', 'A4':'N', 'A.':'N',
375 case 'A':
376 {
377 switch(next)
378 {
379 case '0': next = 'A'; break;
380 case '1': next = 'C'; break;
381 case '2': next = 'G'; break;
382 case '3': next = 'T'; break;
383 default: next = 'N'; break;
384 }
385 }
386 break;
387 case 'C':
388 {
389 // 'C0':'C', 'C1':'A', 'C2':'T', 'C3':'G', 'C4':'N', 'C.':'N',
390 switch(next)
391 {
392 case '0': next = 'C'; break;
393 case '1': next = 'A'; break;
394 case '2': next = 'T'; break;
395 case '3': next = 'G'; break;
396 default: next = 'N'; break;
397 }
398 }
399 break;
400 case 'G':
401 {
402 // 'G0':'G', 'G1':'T', 'G2':'A', 'G3':'C', 'G4':'N', 'G.':'N',
403 switch(next)
404 {
405 case '0': next = 'G'; break;
406 case '1': next = 'T'; break;
407 case '2': next = 'A'; break;
408 case '3': next = 'C'; break;
409 default: next = 'N'; break;
410 }
411 }
412 break;
413 case 'T':
414 {
415 // 'T0':'T', 'T1':'G', 'T2':'C', 'T3':'A', 'T4':'N', 'T.':'N',
416 switch(next)
417 {
418 case '0': next = 'T'; break;
419 case '1': next = 'G'; break;
420 case '2': next = 'C'; break;
421 case '3': next = 'A'; break;
422 default: next = 'N'; break;
423 }
424 }
425 break;
426 default: next = 'N'; break;
427 }
428
429 bp.push_back(next);
430 base = next;
431 }
432
433 return bp;
434 }
435
436
437 #define check_color(b1, b2, c1, c2) ((b1 == c1 && b2 == c2) || (b1 == c2 && b2 == c1))
438
439 #define two_bps_to_color(b1, b2, c) \
440 if (((b1) == 'A' || (b1) == 'G' || (b1) == 'C' || (b1) == 'T') && (b1) == (b2)) \
441 c = '0'; \
442 else if (check_color((b1), (b2), 'A', 'C') || check_color((b1), (b2), 'G', 'T')) \
443 c = '1'; \
444 else if (check_color((b1), (b2), 'A', 'G') || check_color((b1), (b2), 'C', 'T')) \
445 c = '2'; \
446 else if (check_color((b1), (b2), 'A', 'T') || check_color((b1), (b2), 'C', 'G')) \
447 c = '3'; \
448 else \
449 c = '4';
450
451
452 string convert_bp_to_color(const string& bp, bool remove_primer)
453 {
454 if (bp.length() <= 1)
455 return "";
456
457 char base = toupper(bp[0]);
458 string color;
459 if (!remove_primer)
460 color.push_back(base);
461
462 for (string::size_type i = 1; i < bp.length(); ++i)
463 {
464 char next = toupper(bp[i]);
465
466 char c = '0';
467 two_bps_to_color(base, next, c);
468 color.push_back(c);
469
470 base = next;
471 }
472
473 return color;
474 }
475
476 // daehwan - check this -
477 seqan::String<char> convert_bp_to_color(const seqan::String<char>& bp, bool remove_primer)
478 {
479 if (seqan::length(bp) <= 1)
480 return "";
481
482 char base = toupper(bp[0]);
483 string color;
484 if (!remove_primer)
485 color.push_back(base);
486
487 for (string::size_type i = 1; i < seqan::length(bp); ++i)
488 {
489 char next = toupper(bp[i]);
490
491 char c = '0';
492 two_bps_to_color(base, next, c);
493 color.push_back(c);
494
495 base = next;
496 }
497
498 return color;
499 }
500
501 /*
502 */
503 void BWA_decode(const string& color, const string& qual, const string& ref, string& decode)
504 {
505 assert(color.length() == ref.length() - 1);
506
507 const size_t max_length = 256;
508 const unsigned int max_value = max_length * 0xff;
509 size_t length = color.length();
510 if (length < 1 || length + 1 > max_length)
511 {
512 return;
513 }
514
515 unsigned int f[max_length * 4];
516 char ptr[max_length * 4];
517
518 unsigned int q_prev = 0;
519 for (unsigned int i = 0; i < length + 1; ++i)
520 {
521 unsigned int q = (unsigned int) (qual.length() <= i ? 'I' : qual[i]) - 33;
522 for (unsigned int j = 0; j < 4; ++j)
523 {
524 size_t i_j = i * 4 + j;
525 if (i == 0)
526 {
527 f[i_j] = "ACGT"[j] == ref[i] ? 0 : q;
528 ptr[i_j] = 4;
529 continue;
530 }
531
532 f[i_j] = max_value;
533 char base = "ACGT"[j];
534 for (unsigned int k = 0; k < 4; ++k)
535 {
536 char base_prev = "ACGT"[k];
537 char ref_color;
538 two_bps_to_color(base_prev, base, ref_color);
539
540 char base_prev_prev = "ACGTN"[ptr[(i-1)*4 + k]];
541 char ref_color_prev;
542 two_bps_to_color(base_prev_prev, base_prev, ref_color_prev);
543
544 char color_curr = color[i-1];
545 char color_prev = i >= 2 ? color[i-2] : '4';
546
547 int q_hat = 0;
548 if (color_prev == ref_color_prev && color_prev != '4')
549 {
550 if (color_curr == ref_color)
551 q_hat = q + q_prev;
552 else
553 q_hat = q_prev - q;
554 }
555 else if (color_curr == ref_color)
556 {
557 q_hat = q - q_prev;
558 }
559
560 unsigned int f_k = f[(i-1) * 4 + k] +
561 (base == ref[i] ? 0 : q_hat) +
562 (color_curr == ref_color ? 0 : q);
563
564 if (f_k < f[i_j])
565 {
566 f[i_j] = f_k;
567 ptr[i_j] = k;
568 }
569 }
570 }
571
572 q_prev = q;
573 }
574
575 unsigned int min_index = 0;
576 unsigned int min_f = f[length * 4];
577 for (unsigned int i = 1; i < 4; ++i)
578 {
579 unsigned int temp_f = f[length * 4 + i];
580 if (temp_f < min_f)
581 {
582 min_f = temp_f;
583 min_index = i;
584 }
585 }
586
587 decode.resize(length + 1);
588 decode[length] = "ACGT"[min_index];
589 for (unsigned int i = length; i > 0; --i)
590 {
591 min_index = ptr[i * 4 + min_index];
592 decode[i-1] = "ACGT"[min_index];
593 }
594 }
595
596
597 bool ReadStream::next_read(Read& r, ReadFormat read_format) {
598 FLineReader fr(fstream.file);
599 while (read_pq.size()<100000 && !r_eof) {
600 //keep the queue topped off
601 Read rf;
602 if (!next_fastx_read(fr, rf, read_format)) {
603 r_eof=true;
604 break;
605 }
606 //Read read=Read(rf);
607 uint64_t id = (uint64_t)atol(rf.name.c_str());
608 read_pq.push(make_pair(id, rf));
609 }
610 if (read_pq.size()==0)
611 return false;
612 const pair<uint64_t, Read>& t = read_pq.top();
613 r=t.second; //copy strings
614 //free(t.second);
615 read_pq.pop();
616 return true;
617 }
618
619 // reads must ALWAYS requested in increasing order of their ID
620 bool ReadStream::getRead(uint64_t r_id,
621 Read& read,
622 ReadFormat read_format,
623 bool strip_slash,
624 FILE* um_out, //unmapped reads output
625 bool um_write_found,
626 uint64_t begin_id,
627 uint64_t end_id) {
628 if (!fstream.file)
629 err_die("Error: calling ReadStream::getRead() with no file handle!");
630 if (r_id<last_id)
631 err_die("Error: ReadStream::getRead() called with out-of-order id#!");
632 last_id=r_id;
633 bool found=false;
634 while (!found) {
635 read.clear();
636 // Get the next read from the file
637 if (!next_read(read, read_format))
638 break;
639
640 if (strip_slash) {
641 string::size_type slash = read.name.rfind("/");
642 if (slash != string::npos)
643 read.name.resize(slash);
644 }
645 uint64_t id = (uint64_t)atol(read.name.c_str());
646 if (id >= end_id)
647 return false;
648
649 if (id < begin_id)
650 continue;
651
652 if (id == r_id)
653 {
654 found=true;
655 }
656 else if (id > r_id)
657 {
658 read_pq.push(make_pair(id, read));
659 break;
660 }
661
662 if (um_out && (um_write_found || !found)) {
663 //write unmapped reads
664 fprintf(um_out, "@%s\n%s\n+\n%s\n", read.alt_name.c_str(),
665 read.seq.c_str(), read.qual.c_str());
666 }
667 } //while reads
668 return found;
669 }
670
671
672 bool get_read_from_stream(uint64_t insert_id,
673 FLineReader& fr,
674 ReadFormat reads_format,
675 bool strip_slash,
676 Read& read,
677 FILE* um_out,
678 bool um_write_found)
679 {
680 bool found=false;
681 while(!found && !fr.isEof())
682 {
683 read.clear();
684
685 // Get the next read from the file
686 if (!next_fastx_read(fr, read, reads_format))
687 break;
688 if (strip_slash)
689 {
690 string::size_type slash = read.name.rfind("/");
691 if (slash != string::npos)
692 read.name.resize(slash);
693 }
694 uint64_t read_id = (uint64_t)atoi(read.name.c_str());
695 if (read_id == insert_id)
696 {
697 found=true;
698 }
699 else if (read_id > insert_id)
700 {
701 fr.pushBack_read();
702 break;
703 }
704
705 if (um_out && (um_write_found || !found)) {
706 //write unmapped reads
707 fprintf(um_out, "@%s\n%s\n+\n%s\n", read.alt_name.c_str(),
708 read.seq.c_str(), read.qual.c_str());
709 }
710 //rt.get_id(read.name, ref_str);
711 } //while reads
712 return found;
713 }