1 |
gpertea |
29 |
#ifndef READS_H |
2 |
|
|
#define READS_H |
3 |
|
|
/* |
4 |
|
|
* reads.h |
5 |
|
|
* TopHat |
6 |
|
|
* |
7 |
|
|
* Created by Cole Trapnell on 9/2/08. |
8 |
|
|
* Copyright 2008 Cole Trapnell. All rights reserved. |
9 |
|
|
* |
10 |
|
|
*/ |
11 |
|
|
|
12 |
|
|
#include <string> |
13 |
|
|
#include <sstream> |
14 |
gpertea |
154 |
#include <queue> |
15 |
|
|
#include <limits> |
16 |
gpertea |
29 |
#include <seqan/sequence.h> |
17 |
|
|
#include "common.h" |
18 |
|
|
|
19 |
gpertea |
154 |
|
20 |
gpertea |
29 |
using std::string; |
21 |
|
|
|
22 |
gpertea |
135 |
static const int max_read_bp = 256; |
23 |
gpertea |
29 |
|
24 |
|
|
// Note: qualities are not currently used by TopHat |
25 |
|
|
struct Read |
26 |
|
|
{ |
27 |
|
|
Read() |
28 |
|
|
{ |
29 |
|
|
seq.reserve(max_read_bp); |
30 |
|
|
qual.reserve(max_read_bp); |
31 |
|
|
} |
32 |
|
|
|
33 |
|
|
string name; |
34 |
|
|
string seq; |
35 |
|
|
string alt_name; |
36 |
|
|
string qual; |
37 |
|
|
|
38 |
|
|
bool lengths_equal() { return seq.length() == qual.length(); } |
39 |
|
|
void clear() |
40 |
|
|
{ |
41 |
|
|
name.clear(); |
42 |
|
|
seq.clear(); |
43 |
|
|
qual.clear(); |
44 |
|
|
alt_name.clear(); |
45 |
|
|
} |
46 |
|
|
}; |
47 |
|
|
|
48 |
|
|
void reverse_complement(string& seq); |
49 |
|
|
string convert_color_to_bp(const string& color); |
50 |
|
|
seqan::String<char> convert_color_to_bp(char base, const seqan::String<char>& color); |
51 |
|
|
|
52 |
|
|
string convert_bp_to_color(const string& bp, bool remove_primer = false); |
53 |
|
|
seqan::String<char> convert_bp_to_color(const seqan::String<char>& bp, bool remove_primer = false); |
54 |
|
|
|
55 |
|
|
/* |
56 |
|
|
This is a dynamic programming to decode a colorspace read, which is from BWA paper. |
57 |
|
|
|
58 |
|
|
Heng Li and Richard Durbin |
59 |
|
|
Fast and accurate short read alignment with Burrows-Wheeler transform |
60 |
|
|
*/ |
61 |
|
|
void BWA_decode(const string& color, const string& qual, const string& ref, string& decode); |
62 |
|
|
|
63 |
|
|
|
64 |
|
|
template <class Type> |
65 |
|
|
string DnaString_to_string(const Type& dnaString) |
66 |
|
|
{ |
67 |
|
|
std::string result; |
68 |
|
|
std::stringstream ss(std::stringstream::in | std::stringstream::out); |
69 |
|
|
ss << dnaString >> result; |
70 |
|
|
return result; |
71 |
|
|
} |
72 |
|
|
|
73 |
|
|
class ReadTable; |
74 |
|
|
|
75 |
|
|
class FLineReader { //simple text line reader class, buffering last line read |
76 |
|
|
int len; |
77 |
|
|
int allocated; |
78 |
|
|
char* buf; |
79 |
|
|
bool isEOF; |
80 |
|
|
FILE* file; |
81 |
|
|
bool is_pipe; |
82 |
|
|
bool pushed; //pushed back |
83 |
|
|
int lcount; //counting all lines read by the object |
84 |
gpertea |
154 |
|
85 |
|
|
public: |
86 |
|
|
// daehwan - this is not a good place to store the last read ... |
87 |
|
|
Read last_read; |
88 |
|
|
bool pushed_read; |
89 |
|
|
|
90 |
gpertea |
29 |
public: |
91 |
|
|
char* chars() { return buf; } |
92 |
|
|
char* line() { return buf; } |
93 |
|
|
int readcount() { return lcount; } //number of lines read |
94 |
|
|
int length() { return len; } //length of the last line read |
95 |
|
|
bool isEof() {return isEOF; } |
96 |
|
|
char* nextLine(); |
97 |
|
|
FILE* fhandle() { return file; } |
98 |
|
|
void pushBack() { if (lcount>0) pushed=true; } // "undo" the last getLine request |
99 |
|
|
// so the next call will in fact return the same line |
100 |
gpertea |
154 |
void pushBack_read() { if(!last_read.name.empty()) pushed_read=true;} |
101 |
gpertea |
29 |
FLineReader(FILE* stream=NULL) { |
102 |
|
|
len=0; |
103 |
|
|
isEOF=false; |
104 |
|
|
is_pipe=false; |
105 |
|
|
allocated=512; |
106 |
|
|
buf=(char*)malloc(allocated); |
107 |
|
|
lcount=0; |
108 |
|
|
buf[0]=0; |
109 |
|
|
file=stream; |
110 |
|
|
pushed=false; |
111 |
gpertea |
154 |
pushed_read=false; |
112 |
gpertea |
29 |
} |
113 |
|
|
FLineReader(FZPipe& fzpipe) { |
114 |
|
|
len=0; |
115 |
|
|
isEOF=false; |
116 |
|
|
allocated=512; |
117 |
|
|
buf=(char*)malloc(allocated); |
118 |
|
|
lcount=0; |
119 |
|
|
buf[0]=0; |
120 |
|
|
file=fzpipe.file; |
121 |
|
|
is_pipe=!fzpipe.pipecmd.empty(); |
122 |
|
|
pushed=false; |
123 |
gpertea |
154 |
pushed_read=false; |
124 |
gpertea |
29 |
} |
125 |
|
|
void close() { |
126 |
|
|
if (file==NULL) return; |
127 |
|
|
if (is_pipe) pclose(file); |
128 |
|
|
else fclose(file); |
129 |
|
|
} |
130 |
gpertea |
154 |
|
131 |
gpertea |
29 |
~FLineReader() { |
132 |
|
|
free(buf); //does not call close() -- we might reuse the file handle |
133 |
|
|
} |
134 |
|
|
}; |
135 |
|
|
|
136 |
gpertea |
154 |
bool get_read_from_stream(uint64_t insert_id, |
137 |
|
|
FLineReader& fr, |
138 |
|
|
ReadFormat reads_format, |
139 |
|
|
bool strip_slash, |
140 |
|
|
Read& read, |
141 |
|
|
FILE* um_out=NULL, //unmapped reads output |
142 |
|
|
bool um_write_found=false); |
143 |
gpertea |
135 |
|
144 |
gpertea |
29 |
void skip_lines(FLineReader& fr); |
145 |
|
|
bool next_fasta_record(FLineReader& fr, string& defline, string& seq, ReadFormat reads_format); |
146 |
|
|
bool next_fastq_record(FLineReader& fr, const string& seq, string& alt_name, string& qual, ReadFormat reads_format); |
147 |
gpertea |
135 |
bool next_fastx_read(FLineReader& fr, Read& read, ReadFormat reads_format=FASTQ, |
148 |
gpertea |
29 |
FLineReader* frq=NULL); |
149 |
|
|
|
150 |
gpertea |
135 |
class ReadStream { |
151 |
|
|
protected: |
152 |
|
|
struct ReadOrdering |
153 |
|
|
{ |
154 |
|
|
bool operator()(std::pair<uint64_t, Read>& lhs, std::pair<uint64_t, Read>& rhs) |
155 |
|
|
{ |
156 |
|
|
return (lhs.first > rhs.first); |
157 |
|
|
} |
158 |
|
|
}; |
159 |
|
|
FZPipe fstream; |
160 |
|
|
std::priority_queue< std::pair<uint64_t, Read>, |
161 |
|
|
std::vector<std::pair<uint64_t, Read> >, |
162 |
|
|
ReadOrdering > read_pq; |
163 |
|
|
uint64_t last_id; //keep track of last requested ID, for consistency check |
164 |
|
|
bool r_eof; |
165 |
|
|
bool next_read(Read& read, ReadFormat read_format); //get top read from the queue |
166 |
|
|
|
167 |
|
|
public: |
168 |
|
|
ReadStream():fstream(), read_pq(), last_id(0), r_eof(false) { } |
169 |
|
|
|
170 |
gpertea |
154 |
ReadStream(const string& fname):fstream(fname, false), |
171 |
gpertea |
135 |
read_pq(), last_id(0), r_eof(false) { } |
172 |
|
|
|
173 |
|
|
void init(string& fname) { |
174 |
|
|
fstream.openRead(fname, false); |
175 |
|
|
} |
176 |
|
|
const char* filename() { |
177 |
|
|
return fstream.filename.c_str(); |
178 |
|
|
} |
179 |
|
|
//read_ids must ALWAYS be requested in increasing order |
180 |
|
|
bool getRead(uint64_t read_id, Read& read, |
181 |
gpertea |
154 |
ReadFormat read_format=FASTQ, |
182 |
|
|
bool strip_slash=false, |
183 |
gpertea |
159 |
uint64_t begin_id = 0, |
184 |
|
|
uint64_t end_id=std::numeric_limits<uint64_t>::max(), |
185 |
gpertea |
154 |
FILE* um_out=NULL, //unmapped reads output |
186 |
gpertea |
159 |
bool um_write_found=false |
187 |
|
|
); |
188 |
gpertea |
135 |
|
189 |
|
|
void rewind() { |
190 |
|
|
fstream.rewind(); |
191 |
|
|
clear(); |
192 |
|
|
} |
193 |
gpertea |
154 |
void seek(int64_t offset) { |
194 |
|
|
clear(); |
195 |
|
|
fstream.seek(offset); |
196 |
|
|
} |
197 |
gpertea |
135 |
FILE* file() { |
198 |
|
|
return fstream.file; |
199 |
|
|
} |
200 |
|
|
void clear() { |
201 |
|
|
/* while (read_pq.size()) { |
202 |
|
|
const std::pair<uint64_t, Read>& t = read_pq.top(); |
203 |
|
|
//free(t.second); |
204 |
|
|
read_pq.pop(); |
205 |
|
|
} */ |
206 |
|
|
read_pq=std::priority_queue< std::pair<uint64_t, Read>, |
207 |
|
|
std::vector<std::pair<uint64_t, Read> >, |
208 |
|
|
ReadOrdering > (); |
209 |
|
|
} |
210 |
|
|
void close() { |
211 |
|
|
clear(); |
212 |
|
|
fstream.close(); |
213 |
|
|
} |
214 |
|
|
~ReadStream() { |
215 |
|
|
close(); |
216 |
|
|
} |
217 |
|
|
}; |
218 |
gpertea |
29 |
#endif |