ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/segment_juncs.cpp
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (7 years, 8 months ago) by gpertea
File size: 157870 byte(s)
Log Message:
massive update with Daehwan's work

Line File contents
1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #endif
4
5 /*
6 * segment_juncs.cpp
7 * TopHat
8 *
9 * Created by Cole Trapnell on 2/5/09.
10 * Copyright 2009 Cole Trapnell. All rights reserved.
11 *
12 */
13
14 #include <cassert>
15 #include <cstdio>
16 #include <vector>
17 #include <string>
18 #include <map>
19 #include <algorithm>
20 #include <set>
21 #include <iostream>
22 #include <fstream>
23 #include <sstream>
24 #include <cstring>
25 #include <bitset>
26 #include <seqan/sequence.h>
27 #include <seqan/find.h>
28 #include <seqan/file.h>
29 #include <seqan/modifier.h>
30 #include <getopt.h>
31
32 #include <boost/thread.hpp>
33
34 #include "common.h"
35 #include "utils.h"
36 #include "bwt_map.h"
37 #include "tokenize.h"
38 #include "segments.h"
39 #include "reads.h"
40 #include "junctions.h"
41 #include "insertions.h"
42 #include "deletions.h"
43 #include "fusions.h"
44
45 using namespace seqan;
46 using namespace std;
47 using namespace __gnu_cxx;
48
49 // daehwan //geo
50 //#define B_DEBUG 1
51
52 void print_usage()
53 {
54 fprintf(stderr, "Usage: segment_juncs <ref.fa> <segment.juncs> <segment.insertions> <segment.deletions> <left_reads.fq> <left_reads.bwtout> <left_seg1.bwtout,...,segN.bwtout> [right_reads.fq right_reads.bwtout right_seg1.bwtout,...,right_segN.bwtout]\n");
55 }
56
57 // This is the maximum number of bowtie mismatches allower per segment hit
58 static const int num_bowtie_mismatches = 2;
59 static const int max_cov_juncs = 5000000;
60 // static const int max_cov_juncs = std::numeric_limits<int>::max();
61 static const int max_seg_juncs = 10000000;
62
63 int max_microexon_stretch = 2000;
64 int butterfly_overhang = 6;
65 int min_cov_length = 20;
66
67 void get_seqs(istream& ref_stream,
68 RefSequenceTable& rt,
69 bool keep_seqs = true)
70 {
71 while(ref_stream.good() &&
72 !ref_stream.eof())
73 {
74 RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
75 string name;
76 readMeta(ref_stream, name, Fasta());
77 string::size_type space_pos = name.find_first_of(" \t\r");
78 if (space_pos != string::npos)
79 {
80 name.resize(space_pos);
81 }
82 fprintf(stderr, "\tLoading %s...", name.c_str());
83 seqan::read(ref_stream, *ref_str, Fasta());
84 fprintf(stderr, "done\n");
85 rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
86 if (!keep_seqs)
87 delete ref_str;
88 }
89 }
90
91 RefSeg seg_from_bowtie_hit(const BowtieHit& T)
92 {
93 RefSeg r_seg(T.ref_id(), POINT_DIR_DONTCARE, T.antisense_align(), READ_DONTCARE, 0, 0);
94
95 if (T.antisense_align())
96 {
97 r_seg.left = max(0, T.right() - 2);
98 r_seg.right = T.right() + (T.right() - T.left() + 1); // num allowed bowtie mismatches
99 r_seg.points_where = POINT_DIR_RIGHT;
100 }
101 else
102 {
103 r_seg.left = max(0, T.left() - (T.right() - T.left() + 1));
104 r_seg.right = T.left() + 2; // num allowed bowtie mismatches
105 r_seg.points_where = POINT_DIR_LEFT;
106 }
107
108 return r_seg;
109 }
110
111 pair<RefSeg, RefSeg> segs_from_bowtie_hits(const BowtieHit& T,
112 const BowtieHit& H)
113 {
114 pair<RefSeg, RefSeg> seg_pair;
115 if (H.antisense_align() == false &&
116 abs((H.right() + 1) - T.left()) < (int)max_segment_intron_length)
117 {
118 RefSeg left_seg(H.ref_id(), POINT_DIR_RIGHT, H.antisense_align(), READ_DONTCARE, 0, 0);
119 left_seg.left = max(0, H.right() - 2);
120 left_seg.right = H.right() + (H.right() - H.left() + 1); // num allowed bowtie mismatches
121
122 RefSeg right_seg(T.ref_id(), POINT_DIR_LEFT, T.antisense_align(), READ_DONTCARE, 0, 0);
123 right_seg.left = max(0, T.left() - (T.right() - T.left() + 1));
124 right_seg.right = T.left() + 2; // num allowed bowtie mismatches
125
126 seg_pair = make_pair(left_seg, right_seg);
127 }
128 else if (H.antisense_align() == true &&
129 abs((T.right() + 1) - H.left()) < (int)max_segment_intron_length)
130 {
131 RefSeg left_seg(T.ref_id(), POINT_DIR_RIGHT, T.antisense_align(), READ_DONTCARE, 0, 0);
132 left_seg.left = max(0, T.right() - 2);
133 left_seg.right = T.right() + (T.right() - T.left() + 1); // num allowed bowtie mismatches
134
135 RefSeg right_seg(H.ref_id(), POINT_DIR_LEFT, H.antisense_align(), READ_DONTCARE, 0, 0);
136 right_seg.left = max(0, H.left() - (H.right() - H.left() + 1));
137 right_seg.right = H.left() + 2; // num allowed bowtie mismatches
138
139 seg_pair = make_pair(left_seg, right_seg);
140 }
141 return seg_pair;
142 }
143
144 //static const size_t half_splice_mer_len = 6;
145 //static const size_t splice_mer_len = 2 * half_splice_mer_len;
146
147 struct MerExtension
148 {
149 static const int MAX_EXTENSION_BP = 14;
150 uint32_t left_dna_str : 28; // up to 14bp encoded in 2-bits-per-base
151 uint8_t left_ext_len : 4; // how many bases in this extension
152
153 uint32_t right_dna_str : 28; // up to 14bp encoded in 2-bits-per-base
154 uint8_t right_ext_len : 4; // how many bases in this extension
155
156 MerExtension() : left_dna_str(0), left_ext_len(0), right_dna_str(0), right_ext_len(0) {}
157
158 bool operator<(const MerExtension& rhs) const
159 {
160 if (left_dna_str != rhs.left_dna_str)
161 return left_dna_str < rhs.left_dna_str;
162 if (left_ext_len != rhs.left_ext_len)
163 return left_ext_len < rhs.left_ext_len;
164 if (right_dna_str != rhs.right_dna_str)
165 return right_dna_str < rhs.right_dna_str;
166 if (right_ext_len != rhs.right_ext_len)
167 return right_ext_len < rhs.right_ext_len;
168 return false;
169 }
170
171 bool operator==(const MerExtension& rhs) const
172 {
173 bool eq = left_dna_str == rhs.left_dna_str &&
174 right_dna_str == rhs.right_dna_str &&
175 left_ext_len == rhs.left_ext_len &&
176 right_ext_len == rhs.right_ext_len;
177
178 return eq;
179 }
180 };
181
182
183 /// For converting from ASCII to the Dna5 code where A=0, C=1, G=2,
184 /// T=3, N=4
185 uint8_t charToDna5[] = {
186 /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
187 /* 16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
188 /* 32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189 /* 48 */ 0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190 /* 64 */ 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0,
191 /* A C G N */
192 /* 80 */ 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
193 /* T */
194 /* 96 */ 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0,
195 /* a c g n */
196 /* 112 */ 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
197 /* t */
198 /* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199 /* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
200 /* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 /* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
202 /* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
203 /* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
204 /* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
205 /* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
206 };
207
208
209 typedef vector<vector<MerExtension> > MerExtensionTable;
210 typedef vector<uint32_t> MerExtensionCounts;
211
212 MerExtensionTable extensions;
213 MerExtensionCounts extension_counts;
214
215 uint64_t dna5str_to_idx(const string& str)
216 {
217 uint64_t idx = 0;
218
219 for (size_t i = 0; i < str.length(); ++i)
220 {
221 idx <<=2;
222 char c = toupper(str[i]);
223 idx |= (0x3 & charToDna5[(size_t)c]);
224 }
225 return idx;
226 }
227
228 uint64_t colorstr_to_idx(const string& str)
229 {
230 uint64_t idx = 0;
231
232 for (size_t i = 0; i < str.length(); ++i)
233 {
234 idx <<=2;
235 char c = str[i];
236 idx |= (0x3 & charToDna5[(size_t)c]);
237 }
238 return idx;
239 }
240
241 void store_read_extensions(MerExtensionTable& ext_table,
242 int seq_key_len,
243 int min_ext_len,
244 const string& seq,
245 bool use_precount_table)
246 {
247 // h is will hold the 2-bit-per-base representation of the k-mer seeds for
248 // this read.
249
250 uint64_t seed = 0;
251 bitset<256> left = 0;
252 bitset<256> right = 0;
253 const char* p = seq.c_str();
254
255 unsigned int seq_len = (int)seq.length();
256 const char* seq_end = p + seq_len;
257
258 // Build the first seed
259 while (p < seq.c_str() + (2 * seq_key_len))
260 {
261 seed <<= 2;
262 seed |= (0x3 & charToDna5[(size_t)*p]);
263 ++p;
264 }
265
266 // Build the rest of them with a sliding window, adding successive bases
267 // to the "right-remainder" word.
268 while (p < seq_end)
269 {
270 right <<= 2;
271 right |= (0x3 & charToDna5[(size_t)*p]);
272 ++p;
273 }
274
275 // This loop will construct successive seed, along with 32-bit words
276 // containing the left and right remainders for each seed
277 uint32_t i = 0;
278 size_t new_hits = 0;
279 do
280 {
281 // How many base pairs exist in the right remainder beyond what we have
282 // space for ?
283 int extra_right_bp = ((int)seq.length() -
284 (i + 2 * seq_key_len)) - MerExtension::MAX_EXTENSION_BP;
285
286 uint32_t hit_right = 0;
287 if (extra_right_bp > 0)
288 {
289 //bitset<32> tmp_hit_right = (right >> (extra_right_bp << 1));
290 hit_right = (uint32_t)(right >> (extra_right_bp << 1)).to_ulong();
291 }
292 else
293 {
294 hit_right = (uint32_t)right.to_ulong();
295 }
296
297 uint32_t hit_left = (uint32_t)((left << (256 - 32)) >> (256 - 32)).to_ulong();
298
299 //size_t prev_cap = (*mer_table)[seed].capacity();
300 //(*mer_table)[seed].push_back(ReadHit(hit_left, hit_right,i, read_num, reverse_complement));
301 //cap_increase += ((*mer_table)[seed].capacity() - prev_cap) * sizeof (ReadHit);
302
303 MerExtension ext;
304
305 ext.right_dna_str = hit_right;
306 ext.right_ext_len = min(seq_len - (2 * seq_key_len) - i,
307 (unsigned int)MerExtension::MAX_EXTENSION_BP);
308
309 ext.left_dna_str = hit_left;
310 ext.left_ext_len = min(i, (unsigned int)MerExtension::MAX_EXTENSION_BP);
311 if (use_precount_table)
312 {
313 int curr_seed = --extension_counts[seed];
314 if (curr_seed < 0 || curr_seed > (int)ext_table[seed].size())
315 {
316 fprintf(stderr, "Error: curr_seed is %d, max is %lu\n", curr_seed, (long unsigned int)ext_table[seed].size());
317 }
318
319 ext_table[seed][curr_seed] = ext;
320 }
321 else
322 {
323 ext_table[seed].push_back(ext);
324 }
325 new_hits++;
326
327 // Take the leftmost base of the seed and stick it into bp
328 uint64_t bp = seed & (0x3uLL << ((seq_key_len << 2) - 2));
329
330 // Move that base down to the least significant bits of bp
331 bp >>= ((seq_key_len << 2) - 2);
332
333 // And tack it onto the left remainder of the read
334 left <<= 2;
335 left |= bp;
336
337 // Now take the leftmost base of the right remainder and stick it into
338 // the rightmost position of the seed
339 uint32_t right_len = seq_len - (i + seq_key_len * 2);
340 //bp = right & (0x3uLL << ((right_len - 1) << 1));
341
342 seed <<= 2;
343 //cout << right << endl;
344 bitset<256> tmp_right = (right >> ((right_len - 1) << 1));
345 //cout <<tmp_right << endl;
346 seed |= tmp_right.to_ulong();
347 seed &= ~(0xFFFFFFFFFFFFFFFFuLL << (seq_key_len << 2));
348
349
350 //Now remove that leftmost base of the right remainder
351 //right &= ~(0xFFFFFFFFFFFFFFFFuLL << (right_len - 1 << 1));
352 if (right_len)
353 {
354 right.set(((right_len - 1) << 1), 0);
355 right.set(((right_len - 1) << 1) + 1, 0);
356 }
357 ++i;
358
359 }while(i <= (size_t)(seq_end - seq.c_str()) - (2 * seq_key_len));
360 return;
361 }
362
363 void count_read_extensions(MerExtensionCounts& ext_counts,
364 int seq_key_len,
365 int min_ext_len,
366 const string& seq)
367 {
368 // h is will hold the 2-bit-per-base representation of the k-mer seeds for
369 // this read.
370
371 uint64_t seed = 0;
372 bitset<256> left = 0;
373 bitset<256> right = 0;
374 const char* p = seq.c_str();
375
376 unsigned int seq_len = (int)seq.length();
377 const char* seq_end = p + seq_len;
378
379 // Build the first seed
380 while (p < seq.c_str() + (2 * seq_key_len))
381 {
382 seed <<= 2;
383 seed |= (0x3 & charToDna5[(size_t)*p]);
384 ++p;
385 }
386
387 // Build the rest of them with a sliding window, adding successive bases
388 // to the "right-remainder" word.
389 while (p < seq_end)
390 {
391 right <<= 2;
392 right |= (0x3 & charToDna5[(size_t)*p]);
393 ++p;
394 }
395
396 // This loop will construct successive seed, along with 32-bit words
397 // containing the left and right remainders for each seed
398 uint32_t i = 0;
399 size_t new_hits = 0;
400 do
401 {
402 ext_counts[seed]++;
403
404 new_hits++;
405
406 // Take the leftmost base of the seed and stick it into bp
407 uint64_t bp = seed & (0x3uLL << ((seq_key_len << 2) - 2));
408
409 // Move that base down to the least significant bits of bp
410 bp >>= ((seq_key_len << 2) - 2);
411
412 // And tack it onto the left remainder of the read
413 left <<= 2;
414 left |= bp;
415
416 // Now take the leftmost base of the right remainder and stick it into
417 // the rightmost position of the seed
418 uint32_t right_len = seq_len - (i + seq_key_len * 2);
419 //bp = right & (0x3uLL << ((right_len - 1) << 1));
420
421 seed <<= 2;
422 //cout << right << endl;
423 bitset<256> tmp_right = (right >> ((right_len - 1) << 1));
424 //cout <<tmp_right << endl;
425 seed |= tmp_right.to_ulong();
426 seed &= ~(0xFFFFFFFFFFFFFFFFuLL << (seq_key_len << 2));
427
428
429 //Now remove that leftmost base of the right remainder
430 //right &= ~(0xFFFFFFFFFFFFFFFFuLL << (right_len - 1 << 1));
431 if (right_len)
432 {
433 right.set(((right_len - 1) << 1), 0);
434 right.set(((right_len - 1) << 1) + 1, 0);
435 }
436 ++i;
437
438 }while(i <= (size_t)(seq_end - seq.c_str()) - (2 * seq_key_len));
439 return;
440 }
441
442 //void count_read_mers(FILE* reads_file, size_t half_splice_mer_len)
443 void count_read_mers(FZPipe& reads_file, size_t half_splice_mer_len)
444 {
445 Read read;
446 size_t splice_mer_len = 2 * half_splice_mer_len;
447 size_t mer_table_size = 1 << ((splice_mer_len)<<1);
448 extension_counts.resize(mer_table_size);
449 FLineReader fr(reads_file);
450 //while(!feof(reads_file))
451 while (!fr.isEof())
452 {
453 read.clear();
454
455 // Get the next read from the file
456 if (!next_fasta_record(fr, read.name, read.seq, reads_format))
457 break;
458 if (reads_format == FASTQ)
459 {
460 if (!next_fastq_record(fr, read.seq, read.alt_name, read.qual, reads_format))
461 break;
462 }
463
464 if (color)
465 // erase the primer and the adjacent color
466 read.seq.erase(0, 2);
467
468 if (read.seq.size() > 32)
469 read.seq.resize(32);
470
471 count_read_extensions(extension_counts,
472 half_splice_mer_len,
473 half_splice_mer_len,
474 read.seq);
475 }
476
477 //rewind(reads_file);
478 reads_file.rewind();
479 }
480
481 void compact_extension_table()
482 {
483 for (size_t i = 0; i < extensions.size(); ++i)
484 {
485 vector<MerExtension>& exts = extensions[i];
486 sort(exts.begin(), exts.end());
487 vector<MerExtension>::iterator new_end = unique(exts.begin(), exts.end());
488 exts.erase(new_end, exts.end());
489 vector<MerExtension>(exts).swap(exts);
490 }
491 }
492
493 void prune_extension_table(uint8_t max_extension_bp)
494 {
495 uint32_t mask = ~(0xFFFFFFFFuLL << (max_extension_bp << 1));
496
497 for (size_t i = 0; i < extensions.size(); ++i)
498 {
499 vector<MerExtension>& exts = extensions[i];
500 for (size_t j = 0; j < exts.size(); ++j)
501 {
502 MerExtension& ex = exts[j];
503 if (ex.left_ext_len > max_extension_bp)
504 {
505 ex.left_ext_len = max_extension_bp;
506 ex.left_dna_str &= mask;
507 }
508
509 if (ex.right_ext_len > max_extension_bp)
510 {
511 ex.right_dna_str >>= ((ex.right_ext_len - max_extension_bp) << 1);
512 ex.right_ext_len = max_extension_bp;
513 }
514 }
515 }
516 }
517
518 //void store_read_mers(FILE* reads_file, size_t half_splice_mer_len)
519 void store_read_mers(FZPipe& reads_file, size_t half_splice_mer_len)
520 {
521 Read read;
522 size_t splice_mer_len = 2 * half_splice_mer_len;
523
524 size_t mer_table_size = 1 << ((splice_mer_len)<<1);
525 extensions.resize(mer_table_size);
526
527 size_t num_indexed_reads = 0;
528 FLineReader fr(reads_file);
529 //while(!feof(reads_file))
530 while(!fr.isEof())
531 {
532 read.clear();
533
534 // Get the next read from the file
535 if (!next_fasta_record(fr, read.name, read.seq, reads_format))
536 break;
537 if (reads_format == FASTQ)
538 {
539 if (!next_fastq_record(fr, read.seq, read.alt_name, read.qual, reads_format))
540 break;
541 }
542
543 if (color)
544 // erase the primer and the adjacent color
545 read.seq.erase(0, 2);
546
547 if (read.seq.size() > 32)
548 read.seq.resize(32);
549
550 store_read_extensions(extensions,
551 half_splice_mer_len,
552 half_splice_mer_len,
553 read.seq,
554 true);
555
556 // Do NOT index the reverse of the reads
557
558 num_indexed_reads++;
559 if (num_indexed_reads % 1000000 == 0)
560 {
561 //fprintf(stderr, "Indexed %lu reads, compacting extension table\n", num_indexed_reads);
562 //compact_extension_table();
563 }
564 }
565
566 //fprintf(stderr, "Indexed %lu reads, compacting extension table\n", num_indexed_reads)
567
568 uint64_t num_extensions = 0;
569 for (size_t i = 0; i < extensions.size(); ++i)
570 {
571 num_extensions += extensions[i].size();
572 }
573 //fprintf (stderr, "Total extensions: %lu\n", (long unsigned int)num_extensions);
574 reads_file.rewind();
575 }
576
577 //void index_read_mers(vector<FILE*> reads_files,
578 void index_read_mers(vector<FZPipe>& reads_files,
579 size_t half_splice_mer_len)
580 {
581 extensions.clear();
582 for (size_t i = 0; i < reads_files.size(); ++i)
583 {
584 count_read_mers(reads_files[i], half_splice_mer_len);
585 }
586
587 extensions.resize(extension_counts.size());
588
589 for (size_t i = 0; i < extension_counts.size(); ++i)
590 {
591 extensions[i].resize(extension_counts[i]);
592 }
593
594 for (size_t i = 0; i < reads_files.size(); ++i)
595 {
596 store_read_mers(reads_files[i], half_splice_mer_len);
597 }
598
599 compact_extension_table();
600 }
601
602 /** Returns the number of characters in strings w1 and w2 that match,
603 * starting from right to left
604 */
605 int get_matching_chars(uint32_t w1, uint32_t w2)
606 {
607 //find the least significant mismatching bit between w1 and w2
608 int mismatch_bit = ffs(w1 ^ w2);
609
610 // If there is no mismatching bit, the words are equal
611 if (!mismatch_bit)
612 return -1;
613
614 // Given the mismatching bit, determine where the mismatching base is
615 mismatch_bit -= 1;
616 mismatch_bit -= ((mismatch_bit) & 1);
617 mismatch_bit >>= 1;
618
619 // Return the number of matching characters.
620 return mismatch_bit;
621 }
622
623 /**
624 * Computes the Hamming distance between two 16bp words, up to a specified
625 * maximum number of mismatches.
626 */
627 uint32_t mismatching_bases(uint32_t w1_word,
628 uint32_t w2_word,
629 int len,
630 uint32_t max_mis)
631 {
632 uint32_t diffs = 0;
633
634 int shift = 0;
635 int L = len;
636 uint32_t misses = 0;
637
638 // While we haven't yet exceeded the maximum allowable mismatches,
639 // and there are still unaligned bases, keep shift-anding
640 while (shift < len && misses <= max_mis)
641 {
642 int match_chars = 0;
643
644 // Get the number of characters matching on the right sides of
645 // both words
646 match_chars = get_matching_chars(w1_word, w2_word);
647
648 // If they are equal for this shift, we are done,
649 // the loop will stop at the next iteration
650 if (match_chars == -1 || match_chars >= len)
651 {
652 match_chars = len;
653 shift = len;
654 }
655 else
656 {
657 // If there is a mismatch in the remaining words
658 // decide how much to shift by and try again
659 match_chars = min(len, match_chars);
660 int shift_chars = (match_chars + 1);
661
662 L -= shift_chars;
663
664 int shift_bits = shift_chars << 1;
665
666 // Shift right past the matching part and the first mismatching base
667 w1_word >>= (shift_bits);
668 w2_word >>= (shift_bits);
669
670 shift += shift_chars;
671 diffs++;
672 misses++;
673 }
674 }
675
676 return diffs;
677 }
678
679 uint64_t rc_dna_str(uint64_t dna_str)
680 {
681 dna_str = ~dna_str;
682 uint64_t rc = 0;
683 for (int i = 0; i < 32; i++)
684 {
685 rc <<= 2;
686 rc |= (dna_str & 0x3);
687 dna_str >>= 2;
688 }
689 return rc;
690 }
691
692 uint64_t rc_color_str(uint64_t color_str)
693 {
694 uint64_t rc = 0;
695 for (int i = 0; i < 32; ++i)
696 {
697 rc <<= 2;
698 rc |= (color_str & 0x3);
699 color_str >>= 2;
700 }
701 return rc;
702 }
703
704 struct DnaSpliceStrings
705 {
706 DnaSpliceStrings(uint64_t f, uint64_t r) : fwd_string(f), rev_string(r), first_in_string('N'), last_in_string('N') {}
707 uint64_t fwd_string;
708 uint64_t rev_string;
709
710 // for color-space purposes
711 char first_in_string;
712 char last_in_string;
713
714 bool operator<(const DnaSpliceStrings& rhs) const
715 {
716 if (fwd_string != rhs.fwd_string)
717 return fwd_string < rhs.fwd_string;
718 if (rev_string != rhs.rev_string)
719 return rev_string < rhs.rev_string;
720 return false;
721 }
722
723 bool operator==(const DnaSpliceStrings& rhs) const
724 {
725 return fwd_string == rhs.fwd_string && rev_string == rhs.rev_string;
726 }
727 };
728
729 struct IntronMotifs
730 {
731 IntronMotifs(uint32_t rid) : ref_id(rid) {}
732 uint32_t ref_id;
733
734 vector<pair<size_t, DnaSpliceStrings> > fwd_donors;
735 vector<pair<size_t, DnaSpliceStrings> > fwd_acceptors;
736 vector<pair<size_t, DnaSpliceStrings> > rev_donors;
737 vector<pair<size_t, DnaSpliceStrings> > rev_acceptors;
738
739 void unique(vector<pair<size_t, DnaSpliceStrings> >& f)
740 {
741 sort(f.begin(), f.end());
742 vector<pair<size_t, DnaSpliceStrings> >::iterator i = std::unique(f.begin(), f.end());
743 f.erase(i, f.end());
744 }
745
746 void unique()
747 {
748 unique(fwd_donors);
749 unique(fwd_acceptors);
750 unique(rev_donors);
751 unique(rev_acceptors);
752 }
753
754 void attach_mers(RefSequenceTable::Sequence& ref_str)
755 {
756 attach_upstream_mers(ref_str, fwd_donors);
757 attach_upstream_mers(ref_str, rev_acceptors);
758
759 attach_downstream_mers(ref_str, rev_donors);
760 attach_downstream_mers(ref_str, fwd_acceptors);
761 }
762
763 void attach_upstream_mers(RefSequenceTable::Sequence& ref_str,
764 vector<pair<size_t, DnaSpliceStrings> >& dinucs)
765 {
766 for (size_t i = 0; i < dinucs.size(); ++i)
767 {
768 size_t pos = dinucs[i].first;
769 int half_splice_mer_len = 32;
770
771 if (color)
772 {
773 if (pos <= (size_t)half_splice_mer_len+1 || pos >= length(ref_str))
774 continue;
775
776 Dna5String seg_str = seqan::infixWithLength(ref_str,
777 pos - half_splice_mer_len - 1,
778 half_splice_mer_len + 1);
779 stringstream ss(stringstream::in | stringstream::out);
780 string s;
781 ss << seg_str;
782 ss >> s;
783
784 string col_seg_str = convert_bp_to_color(s, true);
785 uint64_t idx = colorstr_to_idx(col_seg_str);
786
787 dinucs[i].second.fwd_string = idx;
788 dinucs[i].second.rev_string = rc_color_str(idx);
789 dinucs[i].second.first_in_string = s[1];
790 dinucs[i].second.last_in_string = s[half_splice_mer_len];
791 }
792 else
793 {
794 if (pos <= (size_t)half_splice_mer_len || pos >= length(ref_str))
795 continue;
796
797 Dna5String seg_str = seqan::infixWithLength(ref_str,
798 pos - half_splice_mer_len,
799 half_splice_mer_len);
800
801 stringstream ss(stringstream::in | stringstream::out);
802 string s;
803 ss << seg_str;
804 ss >> s;
805 uint64_t idx = dna5str_to_idx(s);
806 dinucs[i].second.fwd_string = idx;
807 dinucs[i].second.rev_string = rc_dna_str(idx);
808 }
809 }
810 }
811
812
813 void attach_downstream_mers(RefSequenceTable::Sequence& ref_str,
814 vector<pair<size_t, DnaSpliceStrings> >& dinucs)
815 {
816 for (size_t i = 0; i < dinucs.size(); ++i)
817 {
818 size_t pos = dinucs[i].first;
819
820 int half_splice_mer_len = 32;
821
822 if (pos + 2 + half_splice_mer_len >= length(ref_str))
823 continue;
824
825 if (color)
826 {
827 Dna5String seg_str = seqan::infixWithLength(ref_str,
828 pos + 2 - 1,
829 half_splice_mer_len + 1);
830 stringstream ss(stringstream::in | stringstream::out);
831 string s;
832 ss << seg_str;
833 ss >> s;
834
835 string col_seg_str = convert_bp_to_color(s, true);
836 uint64_t idx = colorstr_to_idx(col_seg_str);
837
838 dinucs[i].second.fwd_string = idx;
839 dinucs[i].second.rev_string = rc_color_str(idx);
840 dinucs[i].second.first_in_string = s[1];
841 dinucs[i].second.last_in_string = s[half_splice_mer_len];
842 }
843 else
844 {
845 Dna5String seg_str = seqan::infixWithLength(ref_str,
846 pos + 2,
847 half_splice_mer_len);
848
849 stringstream ss(stringstream::in | stringstream::out);
850 string s;
851 ss << seg_str;
852 ss >> s;
853 uint64_t idx = dna5str_to_idx(s);
854
855 dinucs[i].second.fwd_string = idx;
856 dinucs[i].second.rev_string = rc_dna_str(idx);
857 }
858 }
859 }
860 };
861
862 struct PackedSplice
863 {
864 PackedSplice() : left(0u), seed(0u), right(0u) {}
865 PackedSplice(uint32_t l, uint64_t s, uint32_t r) : left(l), seed(s), right(r) {}
866 uint32_t left;
867 uint64_t seed;
868 uint32_t right;
869 };
870
871 // The second element of these pairs is the left (or right) side of a splice
872 // seed from a possible junction. The first element is the sequence flanking
873 // that seed
874 typedef pair<uint32_t, uint32_t> PackedSpliceHalf;
875
876 static inline std::string u32ToDna(uint32_t a, int len) {
877 char buf[17];
878 assert(len <= 16);
879 for(int i = 0; i < len; i++) {
880 buf[len-i-1] = "ACGT"[a & 3];
881 a >>= 2;
882 }
883 buf[len] = '\0';
884 return std::string(buf);
885 }
886
887
888 PackedSpliceHalf pack_left_splice_half(const string& seq,
889 uint32_t pos_in_l,
890 unsigned int seq_key_len)
891 {
892 const char* l = seq.c_str();
893 l += pos_in_l;
894
895 const char* left_end = l;
896 l -= 16;
897
898 assert (l + seq_key_len < seq.c_str() + seq.length());
899
900 PackedSpliceHalf packed_half = make_pair(0u,0u);
901
902 // Pack up to 32 bits (16 bases) of sequence into left
903 if (l < seq.c_str())
904 l = seq.c_str();
905 while (l < left_end)
906 {
907 packed_half.first <<= 2;
908 packed_half.first |= (0x3 & charToDna5[(size_t)*l]);
909 ++l;
910 }
911
912 // Pack up the seed bits
913 for (unsigned int i = 0; i < seq_key_len; ++i)
914 {
915 packed_half.second <<= 2;
916 packed_half.second |= (0x3 & charToDna5[(size_t)*(l + i)]);
917 }
918 return packed_half;
919 }
920
921
922 PackedSpliceHalf pack_right_splice_half(const string& seq,
923 uint32_t pos,
924 unsigned int seq_key_len)
925 {
926 const char* r = seq.c_str();
927 r += pos - seq_key_len;
928
929 PackedSpliceHalf packed_half = make_pair(0u,0u);
930
931 // Pack the seed bits
932 for (unsigned int i = 0; i < seq_key_len; ++i)
933 {
934 packed_half.second <<= 2;
935 packed_half.second |= (0x3 & charToDna5[(size_t)*(r + i)]);
936 }
937
938 r += seq_key_len;
939 // Now pack 32 bits (16 bases) of sequence into left
940 const char* right_end = r + 16;
941
942 if ((size_t)(right_end - seq.c_str()) > seq.length())
943 right_end = seq.c_str() + seq.length();
944 while (r < right_end)
945 {
946 packed_half.first <<= 2;
947 packed_half.first |= (0x3 & charToDna5[(size_t)*r]);
948 ++r;
949 }
950
951 return packed_half;
952 }
953
954 PackedSplice combine_splice_halves(const PackedSpliceHalf& left_half,
955 const PackedSpliceHalf& right_half,
956 int seq_key_len)
957 {
958 uint64_t seed = left_half.second << (seq_key_len << 1) | right_half.second;
959 return PackedSplice(left_half.first,seed, right_half.first);
960 }
961
962 PackedSplice pack_splice(const string& seq,
963 int l_pos_in_seq,
964 int r_pos_in_seq,
965 unsigned int seq_key_len)
966 {
967 const char* l = seq.c_str(); // l points to beginning of left exon sequence
968 l += l_pos_in_seq;
969
970 assert (l + seq_key_len < seq.c_str() + seq.length());
971
972 const char* r = seq.c_str(); // r points to beginning of right exon sequence
973 r += r_pos_in_seq - seq_key_len;
974 //r += 2; // r points to right side of junction;
975
976 uint64_t seed = 0;
977 uint64_t left = 0;
978 uint64_t right = 0;
979
980 // Pack up the seed bits
981 for (unsigned int i = 0; i < seq_key_len; ++i)
982 {
983 seed <<= 2;
984 seed |= (0x3 & charToDna5[(size_t)*(l + i)]);
985 }
986
987 for (unsigned int i = 0; i < seq_key_len; ++i)
988 {
989 seed <<= 2;
990 seed |= (0x3 & charToDna5[(size_t)*(r + i)]);
991 }
992
993 // Now pack 32 bits (16 bases) of sequence into left
994 const char* left_end = l;
995 l -= 16;
996 if (l < seq.c_str())
997 l = seq.c_str();
998 while (l < left_end)
999 {
1000 left <<= 2;
1001 left |= (0x3 & charToDna5[(size_t)*l]);
1002 ++l;
1003 }
1004
1005 r += seq_key_len;
1006 // Now pack 32 bits (16 bases) of sequence into left
1007 const char* right_end = r + 16;
1008
1009 if ((size_t)(right_end - seq.c_str()) > seq.length())
1010 right_end = seq.c_str() + seq.length();
1011 while (r < right_end)
1012 {
1013 right <<= 2;
1014 right |= (0x3 & charToDna5[(size_t)*r]);
1015 ++r;
1016 }
1017
1018 return PackedSplice((uint32_t)left,seed,(uint32_t)right);
1019 }
1020
1021
1022
1023 /* Represents a hit between a splice seed and a read. */
1024 // TODO: consider packing pos and meta into a single 32-bit int.
1025 struct ReadHit
1026 {
1027 ReadHit(uint32_t l, uint32_t r, uint32_t p, uint32_t m, bool rc)
1028 : left(l), right(r), pos(p), meta(m), reverse_complement(rc) {}
1029 uint32_t left; // 2-bits per base rep of the left remainder
1030 uint32_t right; //2-bits per base rep of the right remainder
1031 uint32_t pos; // position of the seed within the read
1032 uint32_t meta : 31;
1033 bool reverse_complement : 1;
1034 } __attribute__((packed));
1035
1036 // A MerTable maps k-mers to hits in indexed reads. See the comment for
1037 // mer_table
1038 typedef vector<ReadHit> ReadHitList;
1039 typedef vector<ReadHitList> MerTable;
1040
1041 size_t index_read(MerTable* mer_table,
1042 int seq_key_len,
1043 const string& seq,
1044 unsigned int read_num,
1045 bool reverse_complement,
1046 vector<uint64_t>& seeds)
1047 {
1048 // h is will hold the 2-bit-per-base representation of the k-mer seeds for
1049 // this read.
1050
1051 uint64_t seed = 0;
1052 bitset<256> left = 0;
1053 bitset<256> right = 0;
1054 const char* p = seq.c_str();
1055
1056 unsigned int seq_len = (int)seq.length();
1057 const char* seq_end = p + seq_len;
1058
1059 // Build the first seed
1060 while (p < seq.c_str() + (2 * seq_key_len))
1061 {
1062 seed <<= 2;
1063 seed |= (0x3 & charToDna5[(size_t)*p]);
1064 ++p;
1065 }
1066
1067 seeds.push_back(seed);
1068
1069 // Build the rest of them with a sliding window, adding successive bases
1070 // to the "right-remainder" word.
1071 while (p < seq_end)
1072 {
1073
1074 right <<= 2;
1075 right |= (0x3 & charToDna5[(size_t)*p]);
1076 ++p;
1077 }
1078
1079 size_t cap_increase = 0;
1080
1081 // At this point, seed contains the 5'-most 2*min_anchor_len bases of the
1082 // read, and right contains everthing else on the 3' end.
1083
1084 // This loop will construct successive seed, along with 32-bit words
1085 // containing the left and right remainders for each seed
1086 uint32_t i = 0;
1087 size_t new_hits = 0;
1088 do
1089 {
1090 // Let's not make an out-of-bounds write, if this fails the global
1091 // mer_table is too small
1092 assert (!mer_table || seed < mer_table->size());
1093
1094 // How many base pairs exist in the right remainder beyond what we have
1095 // space for ?
1096 int extra_right_bp = ((int)seq.length() - (i + 2 * seq_key_len)) - 16;
1097
1098 uint32_t hit_right;
1099 if (extra_right_bp > 0)
1100 {
1101 //bitset<32> tmp_hit_right = (right >> (extra_right_bp << 1));
1102 hit_right = (uint32_t)(right >> (extra_right_bp << 1)).to_ulong();
1103 }
1104 else
1105 {
1106 hit_right = (uint32_t)right.to_ulong();
1107 }
1108
1109 uint32_t hit_left = (uint32_t)((left << (256 - 32)) >> (256 - 32)).to_ulong();
1110
1111 if (mer_table)
1112 {
1113 size_t prev_cap = (*mer_table)[seed].capacity();
1114 (*mer_table)[seed].push_back(ReadHit(hit_left, hit_right,i, read_num, reverse_complement));
1115 cap_increase += ((*mer_table)[seed].capacity() - prev_cap) * sizeof (ReadHit);
1116 }
1117 new_hits++;
1118
1119 // Take the leftmost base of the seed and stick it into bp
1120 uint64_t bp = seed & (0x3uLL << ((seq_key_len << 2) - 2));
1121
1122 // Move that base down to the least significant bits of bp
1123 bp >>= ((seq_key_len << 2) - 2);
1124
1125 // And tack it onto the left remainder of the read
1126 left <<= 2;
1127 left |= bp;
1128
1129 // Now take the leftmost base of the right remainder and stick it into
1130 // the rightmost position of the seed
1131 uint32_t right_len = seq_len - (i + seq_key_len * 2);
1132 //bp = right & (0x3uLL << ((right_len - 1) << 1));
1133
1134 seed <<= 2;
1135 //cout << right << endl;
1136 bitset<256> tmp_right = (right >> ((right_len - 1) << 1));
1137 //cout <<tmp_right << endl;
1138 seed |= tmp_right.to_ulong();
1139 seed &= ~(0xFFFFFFFFFFFFFFFFuLL << (seq_key_len << 2));
1140
1141 seeds.push_back(seed);
1142
1143 //Now remove that leftmost base of the right remainder
1144 //right &= ~(0xFFFFFFFFFFFFFFFFuLL << (right_len - 1 << 1));
1145 if (right_len)
1146 {
1147 right.set(((right_len - 1) << 1), 0);
1148 right.set(((right_len - 1) << 1) + 1, 0);
1149 }
1150 ++i;
1151
1152 }while(i <= (size_t)(seq_end - seq.c_str()) - (2 * seq_key_len));
1153 return cap_increase;
1154 }
1155
1156
1157
1158 struct SeedExtension
1159 {
1160 SeedExtension(int lp,
1161 int rp,
1162 int rd_p,
1163 int le,
1164 int re,
1165 int mm) :
1166 l_pos_in_ref(lp),
1167 r_pos_in_ref(rp),
1168 read_pos(rd_p),
1169 left_extent(le),
1170 right_extent(re),
1171 mismatches(mm){}
1172
1173 int l_pos_in_ref;
1174 int r_pos_in_ref;
1175 int read_pos;
1176 int left_extent;
1177 int right_extent;
1178 int mismatches;
1179 };
1180
1181 pair<string::size_type, int> left_extend(const string& ref,
1182 const string& read,
1183 int ref_pos,
1184 int read_pos,
1185 int num_mismatches)
1186 {
1187 string::size_type ext = 0;
1188 int mm_encountered = 0;
1189 while(ref_pos >= 0 &&
1190 read_pos >= 0)
1191 {
1192 //char ref_char = ref[ref_pos];
1193 //char read_char = read[read_pos];
1194 if (ref[ref_pos] != read[read_pos])
1195 {
1196 if (mm_encountered + 1 > num_mismatches)
1197 return make_pair(ext, mm_encountered);
1198 mm_encountered++;
1199 }
1200 ext++;
1201
1202 --ref_pos;
1203 --read_pos;
1204 }
1205
1206 return make_pair(ext, mm_encountered);
1207 }
1208
1209 pair<string::size_type, int> right_extend(const string& ref,
1210 const string& read,
1211 int ref_pos,
1212 int read_pos,
1213 int num_mismatches)
1214 {
1215 string::size_type ext = 0;
1216 int mm_encountered = 0;
1217 while(ref_pos < (int)ref.size() &&
1218 read_pos < (int)read.size())
1219 {
1220 if (ref[ref_pos] != read[read_pos])
1221 {
1222 if (mm_encountered + 1 > num_mismatches)
1223 return make_pair(ext, mm_encountered);
1224 mm_encountered++;
1225 }
1226
1227 ext++;
1228
1229 ++ref_pos;
1230 ++read_pos;
1231 }
1232
1233 return make_pair(ext, mm_encountered);
1234 }
1235
1236
1237 void extend_from_seeds(vector<SeedExtension>& extensions,
1238 const PackedSplice& p,
1239 const MerTable& mer_table,
1240 const string& ref,
1241 const string& read,
1242 size_t l_pos_in_ref,
1243 size_t r_pos_in_ref,
1244 int seq_key_len)
1245 {
1246 assert(p.seed < mer_table.size());
1247 const ReadHitList& hl = mer_table[p.seed];
1248
1249 for (size_t hit = 0; hit < hl.size(); ++hit)
1250 {
1251 const ReadHit& rh = hl[hit];
1252 uint32_t pos = rh.pos;
1253
1254 pair<string::size_type, int> left_extension;
1255 pair<string::size_type, int> right_extension;
1256 left_extension = left_extend(ref, read, l_pos_in_ref - seq_key_len + 1, pos, 2);
1257 right_extension = right_extend(ref, read, r_pos_in_ref + seq_key_len, pos + 2 * seq_key_len, 2);
1258 extensions.push_back(SeedExtension(l_pos_in_ref,
1259 r_pos_in_ref,
1260 pos + seq_key_len,
1261 left_extension.first,
1262 right_extension.first,
1263 left_extension.second + right_extension.second));
1264
1265 }
1266 }
1267
1268 typedef pair<size_t, PackedSpliceHalf> SpliceHalf;
1269
1270 void get_seed_extensions(const string& ref,
1271 const string& read,
1272 int seq_key_len,
1273 MerTable& mer_table,
1274 vector<SpliceHalf>& donors,
1275 vector<SpliceHalf>& acceptors,
1276 vector<SeedExtension>& extensions)
1277 {
1278 for (size_t d = 0; d < donors.size(); ++d)
1279 {
1280 bool broke_out = false;
1281
1282 // start pos is a lower bound on downstream acceptor positions
1283 // to consider
1284 size_t start_pos = donors[d].first + min_report_intron_length;
1285
1286 SpliceHalf dummy = make_pair(start_pos,PackedSpliceHalf());
1287 vector<SpliceHalf>::iterator lb = upper_bound(acceptors.begin(),
1288 acceptors.end(),
1289 dummy);
1290
1291 if (lb == acceptors.end())
1292 break;
1293 for (size_t a = lb - acceptors.begin();
1294 a < acceptors.size();
1295 ++a)
1296 {
1297 if (acceptors[a].first - donors[d].first > (size_t)max_microexon_stretch)
1298 {
1299 broke_out = true;
1300 break;
1301 }
1302
1303 size_t l_pos_in_ref = donors[d].first - 1;
1304 size_t r_pos_in_ref = acceptors[a].first + 2;
1305 PackedSplice p = combine_splice_halves(donors[d].second,
1306 acceptors[a].second,
1307 seq_key_len);
1308 extend_from_seeds(extensions,
1309 p,
1310 mer_table,
1311 ref,
1312 read,
1313 l_pos_in_ref,
1314 r_pos_in_ref,
1315 seq_key_len);
1316 }
1317 if (broke_out)
1318 continue;
1319 }
1320
1321 }
1322
1323 void hits_from_seed_extension(uint32_t ref_id,
1324 int ref_offset,
1325 uint64_t insert_id,
1326 bool antisense,
1327 vector<SeedExtension>& extensions,
1328 vector<BowtieHit>& hits_out,
1329 int left_read_edge,
1330 int right_read_edge,
1331 int seq_key_len)
1332 {
1333 for (size_t i = 0; i < extensions.size(); ++i)
1334 {
1335 SeedExtension& s = extensions[i];
1336 if (s.read_pos >= right_read_edge ||
1337 s.read_pos < left_read_edge)
1338 continue;
1339 if (s.read_pos - seq_key_len - s.left_extent <= left_read_edge &&
1340 s.read_pos + seq_key_len + s.right_extent >= right_read_edge && s.mismatches <= 2 )
1341 {
1342 vector<CigarOp> cigar;
1343 int off_adjust;
1344 if (antisense)
1345 {
1346 CigarOp m1 = CigarOp(MATCH, s.read_pos - left_read_edge);
1347 CigarOp skip = CigarOp(REF_SKIP, s.r_pos_in_ref - s.l_pos_in_ref);
1348 CigarOp m2 = CigarOp(MATCH, right_read_edge - s.read_pos);
1349 cigar.push_back(m1);
1350 cigar.push_back(skip);
1351 cigar.push_back(m2);
1352 off_adjust = m1.length;
1353 }
1354 else
1355 {
1356 CigarOp m1 = CigarOp(MATCH, s.read_pos - left_read_edge + 1);
1357 CigarOp skip = CigarOp(REF_SKIP, s.r_pos_in_ref - s.l_pos_in_ref);
1358 CigarOp m2 = CigarOp(MATCH, right_read_edge - s.read_pos - 1);
1359 cigar.push_back(m1);
1360 cigar.push_back(skip);
1361 cigar.push_back(m2);
1362 off_adjust = m1.length;
1363 }
1364 // daehwan - check this
1365 bool end = false;
1366 BowtieHit bh(ref_id,
1367 ref_id,
1368 insert_id,
1369 ref_offset + s.l_pos_in_ref - off_adjust + 1,
1370 cigar,
1371 antisense,
1372 false,
1373 s.mismatches,
1374 0,
1375 end);
1376 hits_out.push_back(bh);
1377 }
1378 }
1379 }
1380
1381 void align(uint32_t ref_id,
1382 uint64_t insert_id,
1383 bool antisense,
1384 const string& ref,
1385 const string& read,
1386 int ref_offset,
1387 int left_read_edge,
1388 int right_read_edge,
1389 MerTable& mer_table,
1390 int seq_key_len,
1391 vector<BowtieHit>& hits_out)
1392 {
1393 // Reserve an entry for each k-mer we might see
1394 size_t mer_table_size = 1 << ((seq_key_len << 1)<<1);
1395
1396 mer_table.resize(mer_table_size);
1397
1398 vector<uint64_t> seeds;
1399 index_read(&mer_table, seq_key_len, read, 0, antisense, seeds);
1400
1401 vector<SpliceHalf> forward_donors;
1402 vector<SpliceHalf> forward_acceptors;
1403 vector<SpliceHalf> reverse_donors;
1404 vector<SpliceHalf> reverse_acceptors;
1405
1406 const string& seq = ref;
1407
1408 unsigned int pos = 0;
1409
1410
1411 for (size_t z = seq_key_len + 1; z < seq.length() - seq_key_len - 2; ++z)
1412 {
1413 char l = seq[z - 1];
1414 char r = seq[z];
1415 if (l == 'G' && r == 'T')
1416 {
1417 size_t donor_pos = pos + z - 1;
1418 size_t s = donor_pos - seq_key_len;
1419 PackedSpliceHalf p = pack_left_splice_half(seq, s, seq_key_len);
1420 forward_donors.push_back(make_pair(donor_pos,p));
1421 }
1422 if (l == 'A' && r == 'G')
1423 {
1424 size_t acceptor_pos = pos + z - 1;
1425 size_t s = acceptor_pos + 2 + seq_key_len;
1426 PackedSpliceHalf p = pack_right_splice_half(seq, s, seq_key_len);
1427 forward_acceptors.push_back(make_pair(acceptor_pos,p));
1428 }
1429 if (l == 'C' && r == 'T')
1430 {
1431 size_t acceptor_pos = pos + z - 1;
1432 size_t s = acceptor_pos - seq_key_len;
1433 PackedSpliceHalf p = pack_left_splice_half(seq, s, seq_key_len);
1434 reverse_acceptors.push_back(make_pair(pos + z - 1,p));
1435 }
1436 if (l == 'A' && r == 'C')
1437 {
1438 size_t donor_pos = pos + z - 1;
1439 size_t s = donor_pos + 2 + seq_key_len;
1440 PackedSpliceHalf p = pack_right_splice_half(seq, s, seq_key_len);
1441 reverse_donors.push_back(make_pair(donor_pos,p));
1442 }
1443 }
1444
1445 vector<SeedExtension> fwd_extensions;
1446 get_seed_extensions(seq,
1447 read,
1448 seq_key_len,
1449 mer_table,
1450 forward_donors,
1451 forward_acceptors,
1452 fwd_extensions);
1453
1454 hits_from_seed_extension(ref_id,
1455 ref_offset,
1456 insert_id,
1457 antisense,
1458 fwd_extensions,
1459 hits_out,
1460 left_read_edge,
1461 right_read_edge,
1462 seq_key_len);
1463
1464 //fprintf(stderr, "Found %d seed hits\n", fwd_extensions.size());
1465
1466 vector<SeedExtension> rev_extensions;
1467 get_seed_extensions(seq,
1468 read,
1469 seq_key_len,
1470 mer_table,
1471 reverse_donors,
1472 reverse_acceptors,
1473 rev_extensions);
1474
1475 hits_from_seed_extension(ref_id,
1476 ref_offset,
1477 insert_id,
1478 antisense,
1479 rev_extensions,
1480 hits_out,
1481 left_read_edge,
1482 right_read_edge,
1483 seq_key_len);
1484
1485 for (size_t i = 0; i < seeds.size(); ++i)
1486 mer_table[seeds[i]].clear();
1487 }
1488
1489 int extension_mismatches = 0;
1490
1491
1492 bool left_extendable_junction(uint64_t upstream_dna_str,
1493 size_t key,
1494 size_t splice_mer_len,
1495 size_t min_ext_len)
1496 {
1497 vector<MerExtension>& exts = extensions[key];
1498 for (size_t i = 0; i < exts.size(); ++i)
1499 {
1500 const MerExtension& ext = exts[i];
1501 if (ext.left_ext_len < min_ext_len)
1502 continue;
1503 uint64_t upstream = upstream_dna_str & ~(0xFFFFFFFFFFFFFFFFull << (ext.left_ext_len << 1));
1504 int mism = mismatching_bases(ext.left_dna_str, upstream, ext.left_ext_len, extension_mismatches);
1505 if (mism <= extension_mismatches)
1506 return true;
1507 }
1508 return false;
1509 }
1510
1511 bool right_extendable_junction(uint64_t downstream_dna_str,
1512 size_t key,
1513 size_t splice_mer_len,
1514 size_t min_ext_len)
1515 {
1516 vector<MerExtension>& exts = extensions[key];
1517 for (size_t i = 0; i < exts.size(); ++i)
1518 {
1519 const MerExtension& ext = exts[i];
1520 if (ext.right_ext_len < min_ext_len)
1521 continue;
1522 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull >> (ext.right_ext_len << 1));
1523 uint64_t downstream = downstream_dna_str & mask;
1524 downstream >>= ((32 - ext.right_ext_len) << 1);
1525 int mism = mismatching_bases(ext.right_dna_str, downstream, ext.right_ext_len, extension_mismatches);
1526
1527 if (mism <= extension_mismatches)
1528 return true;
1529 }
1530 return false;
1531 }
1532
1533 uint32_t junction_key(uint64_t upstream_dna_str,
1534 uint64_t downstream_dna_str,
1535 size_t splice_mer_len)
1536 {
1537 uint64_t upstream_mask = ~(0xFFFFFFFFFFFFFFFFull << (splice_mer_len));
1538 uint64_t upstream_key_half = upstream_dna_str & upstream_mask;
1539 uint64_t downstream_mask = ~(0xFFFFFFFFFFFFFFFFull >> (splice_mer_len));
1540 uint64_t downstream_key_half = (downstream_dna_str & downstream_mask) >> (64 - splice_mer_len);
1541 uint32_t key = ((uint32_t)upstream_key_half << splice_mer_len) | (uint32_t)downstream_key_half;
1542 return key;
1543 }
1544
1545 bool extendable_junction(uint64_t upstream_dna_str,
1546 uint64_t downstream_dna_str,
1547 size_t splice_mer_len,
1548 size_t min_ext_len,
1549 bool reverse,
1550 char last_in_upstream = 'N',
1551 char first_in_downstream = 'N')
1552 {
1553 if (color)
1554 {
1555 string two_bp; two_bp.push_back(last_in_upstream); two_bp.push_back(first_in_downstream);
1556 string color = convert_bp_to_color(two_bp, true);
1557 char num = (color[0] - '0') & 0x3;
1558
1559 if (reverse)
1560 {
1561 upstream_dna_str = (upstream_dna_str >> 2) << 2;
1562 upstream_dna_str |= (uint64_t)num;
1563 }
1564 else
1565 {
1566 downstream_dna_str = (downstream_dna_str << 2) >> 2;
1567 downstream_dna_str |= ((uint64_t)num << 62);
1568 }
1569 }
1570
1571 uint32_t key = junction_key(upstream_dna_str,
1572 downstream_dna_str,
1573 splice_mer_len);
1574
1575 upstream_dna_str >>= splice_mer_len;
1576 downstream_dna_str <<= splice_mer_len;
1577
1578 bool extendable = (left_extendable_junction(upstream_dna_str,
1579 key, splice_mer_len, min_ext_len) ||
1580 right_extendable_junction(downstream_dna_str,
1581 key, splice_mer_len, min_ext_len));
1582 return extendable;
1583 }
1584
1585 typedef std::set<Junction, skip_count_lt> PotentialJuncs;
1586
1587 struct RecordExtendableJuncs
1588 {
1589 void record(uint32_t ref_id,
1590 const vector<pair<size_t, DnaSpliceStrings> >& left_sites,
1591 const vector<pair<size_t, DnaSpliceStrings> >& right_sites,
1592 bool antisense,
1593 PotentialJuncs& juncs,
1594 int min_intron,
1595 int max_intron,
1596 size_t max_juncs,
1597 size_t half_splice_mer_len)
1598 {
1599
1600 size_t splice_mer_len = 2 * half_splice_mer_len;
1601
1602 size_t curr_R = 0;
1603 for (size_t L = 0; L < left_sites.size(); ++L)
1604 {
1605 while (curr_R < right_sites.size() &&
1606 right_sites[curr_R].first < left_sites[L].first + min_intron)
1607 {
1608 curr_R++;
1609 }
1610
1611 size_t left_pos = left_sites[L].first;
1612 size_t max_right_pos = left_pos + max_intron;
1613 for (size_t R = curr_R;
1614 R < right_sites.size() && right_sites[R].first < max_right_pos; ++R)
1615 {
1616 uint64_t upstream_dna_str = left_sites[L].second.fwd_string;
1617 char last_in_upstream = left_sites[L].second.last_in_string;
1618 uint64_t downstream_dna_str = right_sites[R].second.fwd_string;
1619 char first_in_downstream = right_sites[R].second.first_in_string;
1620 uint64_t rc_upstream_dna_str = left_sites[L].second.rev_string;
1621 uint64_t rc_downstream_dna_str = right_sites[R].second.rev_string;
1622
1623 if (extendable_junction(upstream_dna_str,
1624 downstream_dna_str, splice_mer_len, 7, false,
1625 last_in_upstream, first_in_downstream) ||
1626 extendable_junction(rc_downstream_dna_str,
1627 rc_upstream_dna_str, splice_mer_len, 7, true,
1628 last_in_upstream, first_in_downstream))
1629 {
1630 juncs.insert(Junction(ref_id,
1631 left_sites[L].first - 1,
1632 right_sites[R].first + 2,
1633 antisense,
1634 R - curr_R));
1635 }
1636 if (juncs.size() > max_juncs)
1637 juncs.erase(*(juncs.rbegin()));
1638 }
1639 }
1640 }
1641 };
1642
1643 struct RecordAllJuncs
1644 {
1645 void record(uint32_t ref_id,
1646 const vector<pair<size_t, DnaSpliceStrings> >& left_sites,
1647 const vector<pair<size_t, DnaSpliceStrings> >& right_sites,
1648 bool antisense,
1649 PotentialJuncs& juncs,
1650 int min_intron,
1651 int max_intron,
1652 size_t max_juncs,
1653 size_t half_splice_mer_len)
1654 {
1655 size_t curr_R = 0;
1656 for (size_t L = 0; L < left_sites.size(); ++L)
1657 {
1658 while (curr_R < right_sites.size() &&
1659 right_sites[curr_R].first < left_sites[L].first + min_intron)
1660 {
1661 curr_R++;
1662 }
1663
1664 size_t left_pos = left_sites[L].first;
1665 size_t max_right_pos = left_pos + max_intron;
1666 for (size_t R = curr_R;
1667 R < right_sites.size() && right_sites[R].first < max_right_pos; ++R)
1668 {
1669 Junction j(ref_id,
1670 left_sites[L].first - 1,
1671 right_sites[R].first + 2,
1672 antisense,
1673 R - curr_R);
1674
1675 juncs.insert(j);
1676
1677 if (juncs.size() > max_juncs)
1678 juncs.erase(*(juncs.rbegin()));
1679 }
1680 }
1681 }
1682 };
1683
1684 struct RecordSegmentJuncs
1685 {
1686 void record(uint32_t ref_id,
1687 const vector<pair<size_t, DnaSpliceStrings> >& left_sites,
1688 const vector<pair<size_t, DnaSpliceStrings> >& right_sites,
1689 bool antisense,
1690 PotentialJuncs& juncs,
1691 int min_intron,
1692 int max_intron,
1693 size_t max_juncs,
1694 size_t half_splice_mer_len)
1695 {
1696 if (left_sites.size() != right_sites.size())
1697 return;
1698
1699 for (size_t i = 0; i < left_sites.size(); ++i)
1700 {
1701 Junction j(ref_id,
1702 left_sites[i].first - 1,
1703 right_sites[i].first + 2,
1704 antisense);
1705
1706 juncs.insert(j);
1707 if (juncs.size() > max_juncs)
1708 juncs.erase(*(juncs.rbegin()));
1709 }
1710 }
1711 };
1712
1713 struct ButterflyKey
1714 {
1715 uint32_t pos;
1716 uint32_t key;
1717
1718 ButterflyKey(uint32_t p, uint32_t k) : pos(p), key(k) {}
1719
1720 bool operator<(const ButterflyKey& rhs) const
1721 {
1722 if (key != rhs.key)
1723 return key < rhs.key;
1724 if (pos != rhs.pos)
1725 return pos < rhs.pos;
1726 return false;
1727 }
1728
1729 bool operator==(const ButterflyKey& rhs) const
1730 {
1731 return pos == rhs.pos && key == rhs.key;
1732 }
1733 };
1734
1735 uint32_t get_left_butterfly_key(uint64_t upstream_key,
1736 const MerExtension& ext,
1737 size_t half_splice_mer_len)
1738 {
1739 uint64_t key = ext.right_dna_str >> ((ext.right_ext_len - half_splice_mer_len) << 1);
1740 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (half_splice_mer_len << 1));
1741 uint64_t top_half = upstream_key & mask;
1742 key |= (top_half << (half_splice_mer_len << 1));
1743 return (uint32_t)key;
1744 }
1745
1746 uint32_t get_right_butterfly_key(uint64_t downstream_key,
1747 const MerExtension& ext,
1748 size_t half_splice_mer_len)
1749 {
1750 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (half_splice_mer_len << 1));
1751 uint64_t key = (ext.left_dna_str & mask) << (half_splice_mer_len << 1);
1752 uint64_t bottom_half = (downstream_key >> (half_splice_mer_len << 1));
1753 key |= bottom_half;
1754 return (uint32_t)key;
1755 }
1756
1757 struct RecordButterflyJuncs
1758 {
1759 void record(uint32_t ref_id,
1760 const vector<pair<size_t, DnaSpliceStrings> >& all_left_sites,
1761 const vector<pair<size_t, DnaSpliceStrings> >& all_right_sites,
1762 bool antisense,
1763 PotentialJuncs& juncs,
1764 int min_intron,
1765 int max_intron,
1766 size_t max_juncs,
1767 size_t half_splice_mer_len)
1768 {
1769 size_t key_length = 2 * half_splice_mer_len;
1770 size_t extension_length = butterfly_overhang;
1771 uint64_t bottom_bit_mask = ~(0xFFFFFFFFFFFFFFFFull << (key_length<<1));
1772 uint64_t top_bit_mask = ~(0xFFFFFFFFFFFFFFFFull >> (key_length<<1));
1773
1774 if (all_left_sites.empty() || all_right_sites.empty())
1775 return;
1776
1777 size_t last_site = max(all_left_sites.back().first,
1778 all_right_sites.back().first);
1779
1780 size_t curr_left_site = 0;
1781 size_t curr_right_site = 0;
1782 for (size_t window_left_edge = 0;
1783 window_left_edge < last_site;
1784 window_left_edge += max_intron)
1785 {
1786 //fprintf(stderr, "\twindow %lu - %lu\n", window_left_edge, window_left_edge + 2 * max_intron);
1787 vector<pair<size_t, DnaSpliceStrings> > left_sites;
1788 vector<pair<size_t, DnaSpliceStrings> > right_sites;
1789
1790 while(curr_left_site < all_left_sites.size() &&
1791 all_left_sites[curr_left_site].first < window_left_edge)
1792 {
1793 curr_left_site++;
1794 }
1795
1796 while(curr_right_site < all_right_sites.size() &&
1797 all_right_sites[curr_right_site].first < window_left_edge)
1798 {
1799 curr_right_site++;
1800 }
1801
1802 for (size_t ls = curr_left_site; ls < all_left_sites.size(); ++ls)
1803 {
1804 if (all_left_sites[ls].first < window_left_edge + 2 * max_intron)
1805 {
1806 left_sites.push_back(all_left_sites[ls]);
1807 }
1808 }
1809
1810 for (size_t rs = curr_right_site; rs < all_right_sites.size(); ++rs)
1811 {
1812 if (all_right_sites[rs].first < window_left_edge + 2 * max_intron)
1813 {
1814 right_sites.push_back(all_right_sites[rs]);
1815 }
1816 }
1817
1818 vector<ButterflyKey> left_keys;
1819 for (size_t L = 0; L < left_sites.size(); ++L)
1820 {
1821 uint64_t fwd_upstream_dna_str = left_sites[L].second.fwd_string;
1822 uint64_t fwd_upstream_key = fwd_upstream_dna_str & bottom_bit_mask;
1823
1824 assert (fwd_upstream_key < extensions.size());
1825
1826 vector<MerExtension>& fwd_exts = extensions[fwd_upstream_key];
1827 for (size_t i = 0; i < fwd_exts.size(); ++i)
1828 {
1829 const MerExtension& ext = fwd_exts[i];
1830 if (ext.right_ext_len < extension_length)
1831 continue;
1832
1833 /*
1834 < f_u_key ><ext.right>
1835 NNNNNNNNNN GT
1836 */
1837
1838 // take the top bits of the right extension
1839 uint64_t key = ext.right_dna_str >> ((ext.right_ext_len - extension_length) << 1);
1840
1841 // and the bottom bits of the site key
1842 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (extension_length << 1));
1843 uint64_t top_half = fwd_upstream_key & mask;
1844
1845 // and cat them together
1846 key |= (top_half << (extension_length << 1));
1847 left_keys.push_back(ButterflyKey((uint32_t)left_sites[L].first, key));
1848 }
1849
1850 uint64_t rev_upstream_dna_str = left_sites[L].second.rev_string;
1851 uint64_t rev_upstream_key = (rev_upstream_dna_str & top_bit_mask) >> (64 - (key_length<<1));
1852
1853 assert (rev_upstream_key < extensions.size());
1854
1855 vector<MerExtension>& rev_exts = extensions[rev_upstream_key];
1856 for (size_t i = 0; i < rev_exts.size(); ++i)
1857 {
1858 const MerExtension& ext = rev_exts[i];
1859 if (ext.left_ext_len < extension_length)
1860 continue;
1861
1862 /*
1863 < r_u_key ><ext.left>
1864 NNNNNNNNNN GT
1865 */
1866
1867 // reverse complement the left extension, and we will need
1868 // what were the bottom bits. these become the top bits in the
1869 // rc.
1870 uint64_t ext_str = color ? rc_color_str(ext.left_dna_str) : rc_dna_str(ext.left_dna_str);
1871 ext_str >>= 64 - (ext.left_ext_len << 1);
1872
1873 // now take the top bits of the rc, make them the bottom of
1874 // the key
1875 uint64_t key = ext_str >> ((ext.left_ext_len - extension_length) << 1);
1876
1877 // now add in the seed key bottom bits, making them the top of
1878 // the key
1879 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (extension_length << 1));
1880 uint64_t top_half = fwd_upstream_key & mask;
1881 key |= (top_half << (extension_length << 1));
1882 left_keys.push_back(ButterflyKey((uint32_t)left_sites[L].first, key));
1883 }
1884 }
1885 sort (left_keys.begin(), left_keys.end());
1886 vector<ButterflyKey>::iterator new_end = unique(left_keys.begin(), left_keys.end());
1887 left_keys.erase(new_end, left_keys.end());
1888
1889 vector<ButterflyKey> right_keys;
1890 for (size_t R = 0; R < right_sites.size(); ++R)
1891 {
1892 uint64_t fwd_downstream_dna_str = right_sites[R].second.fwd_string;
1893 uint64_t fwd_downstream_key = (fwd_downstream_dna_str & top_bit_mask) >> (64 - (key_length<<1));
1894
1895 assert (fwd_downstream_key < extensions.size());
1896
1897 vector<uint64_t> fwd_downstream_keys;
1898 if (color)
1899 {
1900 for(size_t color_value = 0; color_value < 4; ++color_value)
1901 {
1902 uint64_t tmp_key = (fwd_downstream_key << 2) >> 2 | (color_value << ((key_length - 1) << 1));
1903 fwd_downstream_keys.push_back(tmp_key);
1904 }
1905 }
1906 else
1907 {
1908 fwd_downstream_keys.push_back(fwd_downstream_key);
1909 }
1910
1911 for(size_t key = 0; key < fwd_downstream_keys.size(); ++key)
1912 {
1913 uint64_t tmp_fwd_downstream_key = fwd_downstream_keys[key];
1914 vector<MerExtension>& fwd_exts = extensions[tmp_fwd_downstream_key];
1915 for (size_t i = 0; i < fwd_exts.size(); ++i)
1916 {
1917 const MerExtension& ext = fwd_exts[i];
1918 if (ext.left_ext_len < extension_length)
1919 continue;
1920
1921 /*
1922 <ext.left>< f_d_key >
1923 AG NNNNNNNNNN
1924 */
1925
1926 // take the bottom bits of the left extension, making them the
1927 // top of the key.
1928 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (extension_length << 1));
1929 uint64_t key = (ext.left_dna_str & mask) << (extension_length << 1);
1930
1931 // add in the top bits of the seed key, making them the bottom bits
1932 // of the key.
1933 uint64_t bottom_half = (tmp_fwd_downstream_key >> ((key_length - extension_length) << 1));
1934 key |= bottom_half;
1935 right_keys.push_back(ButterflyKey((uint32_t)right_sites[R].first, key));
1936 }
1937 }
1938
1939 uint64_t rev_downstream_dna_str = right_sites[R].second.rev_string;
1940 uint64_t rev_downstream_key = rev_downstream_dna_str & bottom_bit_mask;
1941
1942 assert (rev_downstream_key < extensions.size());
1943
1944 vector<uint64_t> rev_downstream_keys;
1945 if (color)
1946 {
1947 for(size_t color_value = 0; color_value < 4; ++color_value)
1948 {
1949 uint64_t tmp_key = (rev_downstream_key >> 2) << 2 | color_value;
1950 rev_downstream_keys.push_back(tmp_key);
1951 }
1952 }
1953 else
1954 {
1955 rev_downstream_keys.push_back(rev_downstream_key);
1956 }
1957
1958 for(size_t key = 0; key < rev_downstream_keys.size(); ++key)
1959 {
1960 uint64_t tmp_rev_downstream_key = rev_downstream_keys[key];
1961 uint64_t tmp_fwd_downstream_key = fwd_downstream_key;
1962 if (color)
1963 {
1964 tmp_fwd_downstream_key = rc_color_str(tmp_rev_downstream_key) >> (64 - (key_length << 1));
1965 }
1966
1967 vector<MerExtension>& rev_exts = extensions[tmp_rev_downstream_key];
1968 for (size_t i = 0; i < rev_exts.size(); ++i)
1969 {
1970 const MerExtension& ext = rev_exts[i];
1971 if (ext.right_ext_len < extension_length)
1972 continue;
1973
1974 /*
1975 <ext.right>< r_d_key >
1976 AG NNNNNNNNNN
1977 */
1978
1979 // reverse complement the right_extension. we want the
1980 // top bits of the extension, but these become the bottom bits
1981 // under the rc.
1982 uint64_t ext_str = color ? rc_color_str(ext.right_dna_str) : rc_dna_str(ext.right_dna_str);
1983 ext_str >>= 64 - (ext.right_ext_len << 1);
1984
1985 // take the bottom bits of the rc and make it the top of the key
1986 uint64_t key = ext_str << (extension_length << 1);
1987
1988 // take the top bits of the seed key and make them the bottom
1989 // of the key.
1990 uint64_t bottom_half = (tmp_fwd_downstream_key >> ((key_length - extension_length) << 1));
1991 key |= bottom_half;
1992
1993 right_keys.push_back(ButterflyKey((uint32_t)right_sites[R].first, key));
1994 }
1995 }
1996 }
1997
1998 sort (right_keys.begin(), right_keys.end());
1999 new_end = unique(right_keys.begin(), right_keys.end());
2000 right_keys.erase(new_end, right_keys.end());
2001
2002 size_t lk = 0;
2003 size_t rk = 0;
2004
2005 while (lk < left_keys.size() && rk < right_keys.size())
2006 {
2007 while (lk < left_keys.size() &&
2008 left_keys[lk].key < right_keys[rk].key) { ++lk; }
2009
2010 if (lk == left_keys.size())
2011 break;
2012
2013 while (rk < right_keys.size() &&
2014 right_keys[rk].key < left_keys[lk].key) { ++rk; }
2015
2016 if (rk == right_keys.size())
2017 break;
2018
2019 if (lk < left_keys.size() && rk < right_keys.size() &&
2020 right_keys[rk].key == left_keys[lk].key)
2021 {
2022
2023 size_t k = right_keys[rk].key;
2024 size_t lk_end = lk;
2025 size_t rk_end = rk;
2026 while (rk_end < right_keys.size() && right_keys[rk_end].key == k) {++rk_end;}
2027 while (lk_end < left_keys.size() && left_keys[lk_end].key == k) {++lk_end;}
2028
2029 size_t tmp_lk = lk;
2030
2031 while (tmp_lk < lk_end)
2032 {
2033 size_t tmp_rk = rk;
2034 while (tmp_rk < rk_end)
2035 {
2036 int donor = (int)left_keys[tmp_lk].pos - 1;
2037 int acceptor = (int)right_keys[tmp_rk].pos + 2;
2038
2039 if (acceptor - donor > min_intron && acceptor - donor < max_intron)
2040 {
2041 Junction j(ref_id,
2042 donor,
2043 acceptor,
2044 antisense,
2045 acceptor - donor); // just prefer shorter introns
2046 juncs.insert(j);
2047 if (juncs.size() > max_juncs)
2048 {
2049 juncs.erase(*(juncs.rbegin()));
2050 }
2051 }
2052 ++tmp_rk;
2053 }
2054
2055 ++tmp_lk;
2056 }
2057
2058 lk = lk_end;
2059 rk = rk_end;
2060 }
2061 }
2062 }
2063 }
2064 };
2065
2066 template <class JunctionRecorder>
2067 void juncs_from_ref_segs(RefSequenceTable& rt,
2068 vector<RefSeg>& expected_don_acc_windows,
2069 PotentialJuncs& juncs,
2070 const DnaString& donor_dinuc,
2071 const DnaString& acceptor_dinuc,
2072 int max_intron,
2073 int min_intron,
2074 size_t max_juncs,
2075 bool talkative,
2076 size_t half_splice_mer_len)
2077 {
2078
2079 typedef map<uint32_t, IntronMotifs> MotifMap;
2080
2081 MotifMap ims;
2082
2083 seqan::DnaStringReverseComplement rev_donor_dinuc(donor_dinuc);
2084 seqan::DnaStringReverseComplement rev_acceptor_dinuc(acceptor_dinuc);
2085
2086 if (talkative)
2087 fprintf(stderr, "Collecting potential splice sites in islands\n");
2088
2089 //
2090 bool all_both = true;
2091 for (size_t r = 0; r < expected_don_acc_windows.size(); ++r)
2092 {
2093 const RefSeg& seg = expected_don_acc_windows[r];
2094
2095 if (seg.points_where != POINT_DIR_BOTH)
2096 all_both = false;
2097
2098 RefSequenceTable::Sequence* ref_str = rt.get_seq(seg.ref_id);
2099
2100 if (!ref_str)
2101 continue;
2102
2103 bool skip_fwd = false;
2104 bool skip_rev = false;
2105
2106 if (library_type == FR_FIRSTSTRAND)
2107 {
2108 if (seg.read == READ_LEFT)
2109 {
2110 if (seg.antisense) skip_rev = true;
2111 else skip_fwd = true;
2112 }
2113 else if(seg.read == READ_RIGHT)
2114 {
2115 if (seg.antisense) skip_fwd = true;
2116 else skip_rev = true;
2117 }
2118 }
2119
2120 if (library_type == FR_SECONDSTRAND)
2121 {
2122 if (seg.read == READ_LEFT)
2123 {
2124 if (seg.antisense) skip_fwd = true;
2125 else skip_rev = true;
2126 }
2127 else if(seg.read == READ_RIGHT)
2128 {
2129 if (seg.antisense) skip_rev = true;
2130 else skip_fwd = true;
2131 }
2132 }
2133
2134 pair<map<uint32_t, IntronMotifs>::iterator, bool> ret =
2135 ims.insert(make_pair(seg.ref_id, IntronMotifs(seg.ref_id)));
2136
2137 IntronMotifs& motifs = ret.first->second;
2138
2139 int left_color_offset = 0, right_color_offset = 0;
2140 if (color)
2141 {
2142 if (seg.antisense)
2143 right_color_offset = 1;
2144 else
2145 left_color_offset = -1;
2146 }
2147
2148 if (seg.left + left_color_offset < 0 || seg.right + right_color_offset >= (int)length(*ref_str) - 1)
2149 continue;
2150
2151 DnaString org_seg_str = seqan::infix(*ref_str, seg.left + left_color_offset, seg.right + right_color_offset);
2152 String<char> seg_str;
2153 assign(seg_str, org_seg_str);
2154
2155 #ifdef B_DEBUG2
2156 cout << "coord: " << seg.left << " " << seg.right << endl;
2157 //<< "seg_str: " << seg_str << endl;
2158 #endif
2159 if (color)
2160 {
2161 bool remove_primer = true;
2162 seg_str = convert_bp_to_color(org_seg_str, remove_primer);
2163 }
2164
2165 size_t to = 0;
2166 size_t seg_len = length(seg_str);
2167 size_t read_len = seg.support_read.size();
2168 if (read_len <= 0)
2169 to = seg_len - 2;
2170 else
2171 to = read_len - 2;
2172
2173 const size_t max_segment_len = 128;
2174 uint8_t left_mismatches[max_segment_len] = {0,};
2175 uint8_t right_mismatches[max_segment_len] = {0,};
2176
2177 if (max_segment_len < read_len)
2178 {
2179 fprintf(stderr, "Error: read len(%d) is greater than %d\n", (int)read_len, (int)max_segment_len);
2180 exit(-1);
2181 }
2182
2183 if (read_len == seg_len || seg.points_where == POINT_DIR_BOTH)
2184 {
2185 if (seg.points_where == POINT_DIR_RIGHT || seg.points_where == POINT_DIR_BOTH)
2186 {
2187 size_t num_mismatches = 0;
2188 for (size_t i = 0; i < read_len - 1; ++i)
2189 {
2190 if (seg_str[i] != seg.support_read[i])
2191 ++num_mismatches;
2192
2193 left_mismatches[i] = num_mismatches;
2194 if (num_mismatches > 2)
2195 {
2196 to = i;
2197 break;
2198 }
2199 }
2200 }
2201
2202 if (seg.points_where == POINT_DIR_LEFT || seg.points_where == POINT_DIR_BOTH)
2203 {
2204 size_t num_mismatches = 0;
2205 for (int i = read_len - 1; i >= 0; --i)
2206 {
2207 if (seg_str[i + (seg_len - read_len)] != seg.support_read[i])
2208 ++num_mismatches;
2209
2210 right_mismatches[i] = num_mismatches;
2211 if (num_mismatches > 2)
2212 break;
2213 }
2214 }
2215
2216 // daehwan
2217 #ifdef B_DEBUG2
2218 cout << "antisense: " << (seg.antisense ? "-" : "+") << endl
2219 << seqan::infix(seg_str, 0, segment_length) << " "
2220 << seqan::infix(seg_str, length(seg_str) - segment_length, length(seg_str)) << endl
2221 << seg.support_read << endl
2222 << 0 << " - " << to << endl;
2223
2224 for (unsigned int i = 0; i < read_len; ++i)
2225 cout << (int)left_mismatches[i];
2226 cout << "\t";
2227
2228 for (unsigned int i = 0; i < read_len; ++i)
2229 cout << (int)right_mismatches[i];
2230 cout << endl;
2231 #endif
2232 }
2233
2234 if (seg.points_where == POINT_DIR_BOTH)
2235 {
2236 for (size_t i = 0; i <= to; ++i)
2237 {
2238 // Look at a slice of the reference without creating a copy.
2239 DnaString curr = seqan::infix(org_seg_str, i - left_color_offset, i + 2 - left_color_offset);
2240
2241 if ((!skip_fwd && curr == donor_dinuc) || (!skip_rev && curr == rev_acceptor_dinuc))
2242 {
2243 DnaString partner;
2244 if (curr == donor_dinuc)
2245 partner = acceptor_dinuc;
2246 else
2247 partner = rev_donor_dinuc;
2248
2249 uint8_t left_mismatch = 0;
2250 if (i > 0)
2251 left_mismatch = left_mismatches[i-1];
2252
2253 // daehwan
2254 #ifdef B_DEBUG2
2255 cout << "i: " << i << endl
2256 << "mismatches: " << (int)left_mismatch
2257 << " - " << (int)right_mismatches[i] << endl;
2258 #endif
2259 if (left_mismatch + right_mismatches[i] <= 2)
2260 {
2261 size_t pos = length(seg_str) - (read_len - i) - 2;
2262 if (partner == seqan::infix(org_seg_str, pos - left_color_offset, pos + 2 - left_color_offset))
2263 {
2264 if (curr == donor_dinuc)
2265 {
2266 motifs.fwd_donors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2267 motifs.fwd_acceptors.push_back(make_pair(seg.left + pos, DnaSpliceStrings(0,0)));
2268 }
2269 else
2270 {
2271 motifs.rev_acceptors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2272 motifs.rev_donors.push_back(make_pair(seg.left + pos, DnaSpliceStrings(0,0)));
2273 }
2274
2275 // daehwan
2276 #ifdef B_DEBUG2
2277 cout << curr << ":" << partner << " added" << endl;
2278 #endif
2279 }
2280 }
2281 }
2282 }
2283 }
2284
2285 else if (seg.points_where == POINT_DIR_LEFT)
2286 {
2287 // A ref segment that "points left" is one that was flanked
2288 // on the right by a partial bowtie hit, indicating that we
2289 // should be looking for an intron to the left of the hit
2290 // In this seg, that means either an "AG" or an "AC"
2291 for (size_t i = 0; i <= to; ++i)
2292 {
2293 // Look at a slice of the reference without creating a copy.
2294 DnaString curr = seqan::infix(org_seg_str, i - left_color_offset, i + 2 - left_color_offset);
2295 if (curr == acceptor_dinuc && !skip_fwd)
2296 motifs.fwd_acceptors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2297 else if (curr == rev_donor_dinuc && !skip_rev)
2298 motifs.rev_donors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2299 }
2300 }
2301 else
2302 {
2303 // A right pointing ref seg wants either a "GT" or a "CT"
2304 for (size_t i = 0; i <= to; ++i)
2305 {
2306 // Look at a slice of the reference without creating a copy.
2307 DnaString curr = seqan::infix(org_seg_str, i - left_color_offset, i + 2 - left_color_offset);
2308 if (curr == donor_dinuc && !skip_fwd)
2309 motifs.fwd_donors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2310 else if (curr == rev_acceptor_dinuc && !skip_rev)
2311 motifs.rev_acceptors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2312 }
2313 }
2314 }
2315
2316 if (talkative)
2317 {
2318 fprintf(stderr, "reporting synthetic splice junctions...\n");
2319 }
2320 for (MotifMap::iterator motif_itr = ims.begin(); motif_itr != ims.end(); ++motif_itr)
2321 {
2322 uint32_t ref_id = motif_itr->first;
2323
2324 RefSequenceTable::Sequence* ref_str = rt.get_seq(ref_id);
2325
2326 if (!ref_str)
2327 err_die("Error: couldn't get ref string for %u\n", ref_id);
2328
2329 if (talkative)
2330 fprintf(stderr, "Examining donor-acceptor pairings in %s\n", rt.get_name(ref_id));
2331 IntronMotifs& motifs = motif_itr->second;
2332
2333 if (!all_both)
2334 motifs.unique();
2335
2336 //motifs.attach_mer_counts(*ref_str);
2337 motifs.attach_mers(*ref_str);
2338
2339 vector<pair<size_t, DnaSpliceStrings> >& fwd_donors = motifs.fwd_donors;
2340 vector<pair<size_t, DnaSpliceStrings> >& fwd_acceptors = motifs.fwd_acceptors;
2341 vector<pair<size_t, DnaSpliceStrings> >& rev_acceptors = motifs.rev_acceptors;
2342 vector<pair<size_t, DnaSpliceStrings> >& rev_donors = motifs.rev_donors;
2343
2344 //const char* ref_name = rt.get_name(motif_itr->second.ref_id);
2345
2346 JunctionRecorder recorder;
2347 recorder.record(ref_id,
2348 fwd_donors,
2349 fwd_acceptors,
2350 false,
2351 juncs,
2352 min_intron,
2353 max_intron,
2354 max_juncs,
2355 half_splice_mer_len);
2356
2357 recorder.record(ref_id,
2358 rev_acceptors,
2359 rev_donors,
2360 true,
2361 juncs,
2362 min_intron,
2363 max_intron,
2364 max_juncs,
2365 half_splice_mer_len);
2366 }
2367 //fprintf(stderr, "Found %d total splices\n", num_juncs);
2368 }
2369
2370
2371 /**
2372 * Performs a simple global alignment.
2373 * This function will perform a restricted global alignment. The restriction is that only one insertion/deletion
2374 * is allowed in the final alignment.
2375 * @param shortSequence The short sequence to be aligned.
2376 * @param leftReference The left end of the reference to be aligned, must be exactly as long as the short sequence
2377 * @param leftReference The right end of the reference to be aligned, must be exactly as long as the short sequenc
2378 * @param insertPosition This will contain the 0-based index of the first position in the shorter sequence after the insertion/deletion. A value of -1 indicates that the alignment could not be performed.
2379 * @param mismatchCount This will contain the number of mismatches in the optimal restricted global alignment. The number and length of insertions/deletions is fixed. A value of -1 indicates that the alignment could not be performed.
2380 */
2381 void simpleSplitAlignment(seqan::String<char>& shorterSequence,
2382 seqan::String<char>& leftReference,
2383 seqan::String<char>& rightReference,
2384 vector<int>& insertPositions,
2385 int& mismatchCount)
2386 {
2387 /*
2388 * In this restricted alignment, we already know the length and number (1) of insertions/deletions.
2389 * We simply need to know where to put it. Do a linear scan through sequence counting the number of induced
2390 * errors before and after putting the insertion at each sequence.
2391 */
2392
2393 /*
2394 * Note that we could have a case, where both the alignment and the read have the unknonw
2395 * nucleotide ('N') and we don't want to reward cases where these characters match
2396 */
2397 vector<unsigned short> beforeErrors(seqan::length(shorterSequence));
2398 for(int idx = seqan::length(shorterSequence) - 1; idx >= 0; idx -= 1){
2399 unsigned short prevCount = 0;
2400 /*
2401 * We guarentee idx >= 0, so cast to hide the compiler
2402 * warning here
2403 */
2404 if(((size_t)idx) < seqan::length(shorterSequence) - 1){
2405 prevCount = beforeErrors.at(idx + 1);
2406 }
2407 unsigned short currentMismatch = 0;
2408 if(rightReference[idx] == 'N' || shorterSequence[idx] == 'N' || rightReference[idx] != shorterSequence[idx]){
2409 currentMismatch = 1;
2410 }
2411 beforeErrors.at(idx) = prevCount + currentMismatch;
2412 }
2413
2414 vector<unsigned short> afterErrors(seqan::length(shorterSequence));
2415 for(size_t idx = 0; idx < seqan::length(shorterSequence) ; idx += 1){
2416 unsigned short prevCount = 0;
2417 if(idx > 0){
2418 prevCount = afterErrors.at(idx - 1);
2419 }
2420 unsigned short currentMismatch = 0;
2421 if(leftReference[idx] == 'N' || shorterSequence[idx] == 'N' || leftReference[idx] != shorterSequence[idx]){
2422 currentMismatch = 1;
2423 }
2424 afterErrors.at(idx) = prevCount + currentMismatch;
2425 }
2426
2427 mismatchCount = seqan::length(shorterSequence) + 1;
2428 insertPositions.clear();
2429
2430 /*
2431 * Technically, we could allow the insert position to be at the end or beginning of the sequence,
2432 * but we are disallowing it here
2433 */
2434 for(size_t currentInsertPosition = 1; currentInsertPosition < seqan::length(shorterSequence); currentInsertPosition += 1){
2435 size_t errorCount = beforeErrors.at(currentInsertPosition) + afterErrors.at(currentInsertPosition - 1);
2436
2437 if(((int)errorCount) < mismatchCount){
2438 mismatchCount = (int)errorCount;
2439 insertPositions.clear();
2440 insertPositions.push_back(currentInsertPosition);
2441 }
2442 else if ((int)errorCount == mismatchCount) {
2443 insertPositions.push_back(currentInsertPosition);
2444 }
2445 }
2446 return;
2447 }
2448
2449
2450 /**
2451 * Try to detect a small insertion.
2452 * This code will try to identify a small insertion based on the ungapped alignment of two neighboring
2453 * segments. The general idea is to try to realign the local region, and see if we can reduce the
2454 * number of errors. Note that the function makes use of the global parameter "max_insertion_length" to limit the maximum
2455 * size of a detected insertion.
2456 * @param rt Sequence table used to lookup sequence information
2457 * @param leftHit The alignment of the left segment. Note that the leftHit must have a left position less than that of the right hit.
2458 * @param rightHit The alignment of the right segment. Note that the rightHit must have a left position greater than that of the left hit.
2459 * @param insertions If an insertion is sucessfully detected, it will be added to this set
2460 */
2461 void detect_small_insertion(RefSequenceTable& rt,
2462 seqan::String<char>& read_sequence,
2463 BowtieHit& leftHit,
2464 BowtieHit& rightHit,
2465 std::set<Insertion>& insertions)
2466 {
2467
2468 RefSequenceTable::Sequence* ref_str = rt.get_seq(leftHit.ref_id());
2469 if(!ref_str){
2470 fprintf(stderr, "Error accessing sequence record\n");
2471 }else{
2472 size_t read_length = seqan::length(read_sequence);
2473 int begin_offset = 0;
2474 int end_offset = 0;
2475
2476 if(color){
2477 if(leftHit.antisense_align())
2478 end_offset = 1;
2479 else
2480 begin_offset = -1;
2481 }
2482
2483 if(leftHit.left() + begin_offset < 0)
2484 return;
2485
2486 /*
2487 * If there is in fact a deletion, we are expecting the genomic sequence to be shorter than
2488 * the actual read sequence
2489 */
2490 int discrepancy = read_length - (rightHit.right() - leftHit.left());
2491 DnaString genomic_sequence_temp = seqan::infix(*ref_str, leftHit.left() + begin_offset, rightHit.right() + end_offset);
2492 String<char> genomic_sequence;
2493 assign(genomic_sequence, genomic_sequence_temp);
2494
2495 if(color)
2496 genomic_sequence = convert_bp_to_color(genomic_sequence, true);
2497
2498 String<char> left_read_sequence = seqan::infix(read_sequence, 0, 0 + seqan::length(genomic_sequence));
2499 String<char> right_read_sequence = seqan::infix(read_sequence, read_length - seqan::length(genomic_sequence), read_length);
2500
2501 vector<int> bestInsertPositions;
2502 int minErrors = -1;
2503 simpleSplitAlignment(genomic_sequence, left_read_sequence, right_read_sequence, bestInsertPositions, minErrors);
2504
2505 assert (bestInsertPositions.size() > 0);
2506 int bestInsertPosition = bestInsertPositions[0];
2507
2508
2509 /*
2510 * Need to decide if the insertion is suitably improves the alignment
2511 */
2512 /*
2513 * If these two segments anchors constitue the entire read, then we require
2514 * that this alignment actually improve the number of errors observed in the alignment
2515 * Otherwise, it is OK as long as the number of errors doesn't increase.
2516 */
2517 int adjustment = 0;
2518 if(leftHit.read_len() + rightHit.read_len() >= (int)read_length){
2519 adjustment = -1;
2520 }
2521 if(minErrors <= (leftHit.edit_dist()+rightHit.edit_dist()+adjustment)){
2522 String<char> insertedSequence = seqan::infix(left_read_sequence, bestInsertPosition, bestInsertPosition + discrepancy);
2523 if(color)
2524 insertedSequence = convert_color_to_bp(genomic_sequence_temp[bestInsertPosition - begin_offset + end_offset - 1], insertedSequence);
2525
2526 insertions.insert(Insertion(leftHit.ref_id(),
2527 leftHit.left() + bestInsertPosition - 1 + end_offset,
2528 seqan::toCString(insertedSequence)));
2529 }
2530 }
2531 return;
2532 }
2533
2534 /**
2535 * Try to detect a small deletion.
2536 * This code will try to identify a small deletion based on the ungapped alignment of two neighboring
2537 * segments. The general idea is to try to realign the local region, and see if we can reduce the
2538 * number of errors. Note that the function makes use of the global parameter "max_deletion_length" to limit the maximum
2539 * size of a detected deletion.
2540 * @param rt Sequence table used to lookup sequence information
2541 * @param leftHit The alignment of the left segment. Note that the leftHit must have a left position less than that of the right hit.
2542 * @param rightHit The alignment of the right segment. Note that the rightHit must have a left position greater than that of the left hit.
2543 * @param deletion_juncs If a deletion is sucessfully detected, it will be added to this set
2544 */
2545 void detect_small_deletion(RefSequenceTable& rt,
2546 seqan::String<char>& read_sequence,
2547 BowtieHit& leftHit,
2548 BowtieHit& rightHit,
2549 std::set<Deletion>& deletions)
2550 {
2551
2552 RefSequenceTable::Sequence* ref_str = rt.get_seq(leftHit.ref_id());
2553 if(!ref_str){
2554 fprintf(stderr, "Error accessing sequence record\n");
2555 }else{
2556 int begin_offset = 0;
2557 int end_offset = 0;
2558
2559 if(color){
2560 if(leftHit.antisense_align())
2561 end_offset = 1;
2562 else
2563 begin_offset = -1;
2564 }
2565
2566 if(leftHit.left() + begin_offset < 0)
2567 return;
2568
2569 size_t read_length = seqan::length(read_sequence);
2570 if(rightHit.right() + read_length + begin_offset < 0 )
2571 return;
2572
2573 int discrepancy = (rightHit.right() - leftHit.left()) - read_length;
2574 Dna5String leftGenomicSequence_temp = seqan::infix(*ref_str, leftHit.left() + begin_offset, leftHit.left() + read_length + end_offset);
2575 Dna5String rightGenomicSequence_temp = seqan::infix(*ref_str, rightHit.right() - read_length + begin_offset, rightHit.right() + end_offset);
2576
2577 if (length(leftGenomicSequence_temp) < read_length || length(rightGenomicSequence_temp) < read_length)
2578 return;
2579
2580 String<char> leftGenomicSequence;
2581 assign(leftGenomicSequence, leftGenomicSequence_temp);
2582
2583 String<char> rightGenomicSequence;
2584 assign(rightGenomicSequence, rightGenomicSequence_temp);
2585
2586 if(color){
2587 leftGenomicSequence = convert_bp_to_color(leftGenomicSequence, true);
2588 rightGenomicSequence = convert_bp_to_color(rightGenomicSequence, true);
2589 }
2590
2591 vector<int> bestInsertPositions;
2592 int minErrors = -1;
2593 simpleSplitAlignment(read_sequence, leftGenomicSequence, rightGenomicSequence, bestInsertPositions, minErrors);
2594
2595 assert (bestInsertPositions.size() > 0);
2596 int bestInsertPosition = bestInsertPositions[0];
2597
2598 /*
2599 * Need to decide if the deletion is suitably improves the alignment
2600 */
2601 int adjustment = 0;
2602
2603 /*
2604 * If these two segments anchors constitue the entire read, then we require
2605 * that this alignment actually improve the number of errors observed in the alignment
2606 * Otherwise, it is OK as long as the number of errors doesn't increase.
2607 */
2608 if(leftHit.read_len() + rightHit.read_len() >= (int)read_length){
2609 adjustment = -1;
2610 }
2611 if(minErrors <= (leftHit.edit_dist()+rightHit.edit_dist()+adjustment)){
2612 deletions.insert(Deletion(leftHit.ref_id(),
2613 leftHit.left() + bestInsertPosition - 1 + end_offset,
2614 leftHit.left() + bestInsertPosition + discrepancy + end_offset,
2615 false));
2616 }
2617 }
2618 return;
2619 }
2620
2621 int difference(const String<char>& first, const String<char>& second)
2622 {
2623 int len = length(first);
2624 if (len != length(second))
2625 return 0;
2626
2627 int min_value = 10000;
2628 short value1[1024] = {0,};
2629 short value2[1024] = {0,};
2630
2631 short* curr = value1;
2632 short* prev = value2;
2633
2634 for (int j = 0; j < len; ++j)
2635 {
2636 for (int i = 0; i < len; ++i)
2637 {
2638 int value = 10000;
2639 int match = first[i] == second[j] ? 0 : 1;
2640
2641 // right
2642 if (i == 0)
2643 value = j * 2 + match;
2644 else if (j > 0)
2645 value = prev[i] + 2;
2646
2647 int temp_value = 10000;
2648
2649 // down
2650 if (j == 0)
2651 temp_value = i * 2 + match;
2652 else if (i > 0)
2653 temp_value = curr[i-1] + 2;
2654
2655 if (temp_value < value)
2656 value = temp_value;
2657
2658 // match
2659 if (i > 0 && j > 0)
2660 temp_value = prev[i-1] + match;
2661
2662 if (temp_value < value)
2663 value = temp_value;
2664
2665 curr[i] = value;
2666
2667 if ((i == len - 1 || j == len - 1) && value < min_value)
2668 min_value = value;
2669 }
2670
2671 short* temp = prev;
2672 prev = curr;
2673 curr = temp;
2674 }
2675
2676 return min_value;
2677 }
2678
2679 void detect_fusion(RefSequenceTable& rt,
2680 seqan::String<char>& read_sequence,
2681 BowtieHit& leftHit,
2682 BowtieHit& rightHit,
2683 FusionSimpleSet& fusions,
2684 uint32_t dir)
2685 {
2686 RefSequenceTable::Sequence* left_ref_str = rt.get_seq(leftHit.ref_id());
2687 RefSequenceTable::Sequence* right_ref_str = rt.get_seq(rightHit.ref_id());
2688
2689 size_t read_length = seqan::length(read_sequence);
2690
2691 Dna5String leftGenomicSequence_temp;
2692 Dna5String rightGenomicSequence_temp;
2693
2694 if (dir == FUSION_FF || dir == FUSION_FR)
2695 {
2696 if (leftHit.left() + read_length > seqan::length(*left_ref_str))
2697 return;
2698
2699 leftGenomicSequence_temp = seqan::infix(*left_ref_str, leftHit.left(), leftHit.left() + read_length);
2700 }
2701 else
2702 {
2703 if (leftHit.right() < read_length)
2704 return;
2705
2706 leftGenomicSequence_temp = seqan::infix(*left_ref_str, leftHit.right() - read_length, leftHit.right());
2707 seqan::convertInPlace(leftGenomicSequence_temp, seqan::FunctorComplement<Dna>());
2708 seqan::reverseInPlace(leftGenomicSequence_temp);
2709 }
2710
2711 if (dir == FUSION_FF || dir == FUSION_RF)
2712 {
2713 if (rightHit.right() < read_length)
2714 return;
2715
2716 rightGenomicSequence_temp = seqan::infix(*right_ref_str, rightHit.right() - read_length, rightHit.right());
2717 }
2718 else
2719 {
2720 if (rightHit.left() + read_length > seqan::length(*right_ref_str))
2721 return;
2722
2723 rightGenomicSequence_temp = seqan::infix(*right_ref_str, rightHit.left(), rightHit.left() + read_length);
2724 seqan::convertInPlace(rightGenomicSequence_temp, seqan::FunctorComplement<Dna>());
2725 seqan::reverseInPlace(rightGenomicSequence_temp);
2726 }
2727
2728 String<char> leftGenomicSequence;
2729 assign(leftGenomicSequence, leftGenomicSequence_temp);
2730
2731 String<char> rightGenomicSequence;
2732 assign(rightGenomicSequence, rightGenomicSequence_temp);
2733
2734 vector<int> bestInsertPositions;
2735 int minErrors = -1;
2736
2737 simpleSplitAlignment(read_sequence, leftGenomicSequence, rightGenomicSequence, bestInsertPositions, minErrors);
2738
2739 uint32_t total_edit_dist = leftHit.edit_dist() + rightHit.edit_dist();
2740 if (minErrors > total_edit_dist)
2741 return;
2742
2743 // daehwan
2744 if (read_length <= 60)
2745 {
2746 /*
2747 * check if the two contig from two different chromosome are different enough
2748 */
2749 int different_count = min(difference(leftGenomicSequence, read_sequence), difference(rightGenomicSequence, read_sequence));
2750 // different_count = min(different_count, difference(leftGenomicSequence, rightGenomicSequence));
2751 if (different_count < read_length / 6)
2752 return;
2753 }
2754
2755 for (size_t i = 0; i < bestInsertPositions.size(); ++i)
2756 {
2757 int bestInsertPosition = bestInsertPositions[i];
2758
2759 uint32_t left, right;
2760 if (dir == FUSION_FF || dir == FUSION_FR)
2761 left = leftHit.left() + bestInsertPosition - 1;
2762 else
2763 left = leftHit.right() - bestInsertPosition;
2764
2765 if (dir == FUSION_FF || dir == FUSION_RF)
2766 right = rightHit.right() - (read_length - bestInsertPosition);
2767 else
2768 right = rightHit.left() + (read_length - bestInsertPosition) - 1;
2769
2770 uint32_t ref_id1 = leftHit.ref_id();
2771 uint32_t ref_id2 = rightHit.ref_id();
2772
2773 uint32_t temp_dir = dir;
2774 if ((ref_id2 < ref_id1) ||
2775 (ref_id1 == ref_id2 && left > right))
2776 {
2777 uint32_t temp = ref_id1;
2778 ref_id1 = ref_id2;
2779 ref_id2 = temp;
2780
2781 temp = left;
2782 left = right;
2783 right = temp;
2784
2785 if (dir == FUSION_FF)
2786 temp_dir = FUSION_RR;
2787 }
2788
2789 // daehwan - check
2790 #if B_DEBUG
2791 cout << endl << endl;
2792 cout << "read id: " << leftHit.insert_id() << endl;
2793 cout << "sense: " << (leftHit.antisense_align() ? "-" : "+") << endl;
2794 // cout << "difference: " << different_count << endl;
2795 cout << "left ref_id: " << rt.get_name(leftHit.ref_id()) << "\tright ref_id: " << rt.get_name(rightHit.ref_id()) << endl;
2796 cout << read_sequence << endl;
2797 cout << leftGenomicSequence << "\t" << rightGenomicSequence << endl;
2798 cout << "insertion pos: " << bestInsertPosition << endl;
2799 cout << left << ":" << right << endl;
2800 cout << "errors: " << minErrors << endl;
2801 #endif
2802
2803 Fusion fusion(ref_id1, ref_id2, left, right, temp_dir);
2804 FusionSimpleSet::iterator itr = fusions.find(fusion);
2805 if (itr == fusions.end())
2806 {
2807 FusionSimpleStat simpleStat;
2808 simpleStat.count = 1;
2809 simpleStat.edit_dist = total_edit_dist;
2810 simpleStat.skip = false;
2811 simpleStat.left_coincide_with_splice_junction = false;
2812 simpleStat.right_coincide_with_splice_junction = false;
2813 fusions[fusion] = simpleStat;
2814 }
2815 else
2816 {
2817 itr->second.count += 1;
2818 itr->second.edit_dist = min(itr->second.edit_dist, total_edit_dist);
2819 }
2820 }
2821 }
2822
2823 void find_insertions_and_deletions(RefSequenceTable& rt,
2824 ReadStream& reads_file,
2825 vector<HitsForRead>& hits_for_read,
2826 std::set<Deletion>& deletions,
2827 std::set<Insertion>& insertions){
2828
2829 if(hits_for_read.empty()){
2830 return;
2831 }
2832
2833 size_t last_segment = hits_for_read.size()-1;
2834 size_t first_segment = 0;
2835 if(last_segment == first_segment){
2836 return;
2837 }
2838
2839 /*
2840 * We can check up front whether the first or last element is empty
2841 * and avoid doing any more work. Note that the following code requires
2842 * that there be at least one elment in each
2843 */
2844 if(hits_for_read[first_segment].hits.empty() || hits_for_read[last_segment].hits.empty()){
2845 return;
2846 }
2847
2848 /*
2849 * Need to identify the appropriate insert id for this group of reads
2850 */
2851 Read read;
2852 /*bool got_read = get_read_from_stream(hits_for_read.back().insert_id,
2853 reads_file,
2854 FASTQ,
2855 false,
2856 read);*/
2857 bool got_read = reads_file.getRead(hits_for_read.back().insert_id, read);
2858 if(!got_read){
2859 err_die("Error: could not get read# %d from stream!",
2860 (int)hits_for_read.back().insert_id);
2861 //return;
2862 }
2863
2864 /*
2865 * Work through all combinations of mappings for the first and last segment to see if any are indicative
2866 * of a small insertions or deletion
2867 */
2868 HitsForRead& left_segment_hits = hits_for_read[first_segment];
2869 HitsForRead& right_segment_hits = hits_for_read[last_segment];
2870
2871 /*
2872 * If either of the segment match lists is empty, we could try
2873 * to be smarter and work our way in until we find good a segment
2874 * match; however, won't do that for noe.
2875 */
2876 if(left_segment_hits.hits.empty() || right_segment_hits.hits.empty()){
2877 return;
2878 }
2879
2880 seqan::String<char> fullRead, rcRead;
2881 if(color){
2882 fullRead = read.seq.c_str() + 1;
2883 rcRead = fullRead;
2884 seqan::reverseInPlace(rcRead);
2885 }else{
2886 fullRead = read.seq;
2887 rcRead = read.seq;
2888 seqan::convertInPlace(rcRead, seqan::FunctorComplement<Dna>());
2889 seqan::reverseInPlace(rcRead);
2890 }
2891
2892 size_t read_length = seqan::length(fullRead);
2893 for(size_t left_segment_index = 0; left_segment_index < left_segment_hits.hits.size(); left_segment_index++){
2894 for(size_t right_segment_index = 0; right_segment_index < right_segment_hits.hits.size(); right_segment_index++){
2895 BowtieHit* leftHit = &left_segment_hits.hits[left_segment_index];
2896 BowtieHit* rightHit = &right_segment_hits.hits[right_segment_index];
2897 /*
2898 * Now we have found a pair of segment hits to investigate. Need to ensure
2899 * that
2900 * 1. the alignment orientation is consistent
2901 * 2. the distance separation is in the appropriate range
2902 * 3. Both hits are aligned to the same contig
2903 */
2904 if(leftHit->ref_id() != rightHit->ref_id()){
2905 continue;
2906 }
2907
2908 if(leftHit->antisense_align() != rightHit->antisense_align()){
2909 continue;
2910 }
2911
2912 seqan::String<char>* modifiedRead = &fullRead;
2913 /*
2914 * If we are dealing with an antisense alignment, then the left
2915 * read will actually be on the right, fix this now, to simplify
2916 * the rest of the logic, in addition, we will need to use the reverse
2917 * complement of the read sequence
2918 */
2919 if(leftHit->antisense_align()){
2920 BowtieHit * tmp = leftHit;
2921 leftHit = rightHit;
2922 rightHit = tmp;
2923 modifiedRead = &rcRead;
2924 }
2925
2926 size_t apparent_length = rightHit->right() - leftHit->left();
2927 int length_discrepancy = apparent_length - read_length;
2928 if(length_discrepancy > 0 && length_discrepancy <= (int)max_deletion_length){
2929 /*
2930 * Search for a deletion
2931 */
2932 detect_small_deletion(rt, *modifiedRead, *leftHit, *rightHit, deletions);
2933 }
2934 if(length_discrepancy < 0 && length_discrepancy >= -(int)max_insertion_length){
2935 /*
2936 * Search for an insertion
2937 */
2938 detect_small_insertion(rt, *modifiedRead, *leftHit, *rightHit, insertions);
2939 }
2940 }
2941 }
2942 }
2943
2944 /*
2945 */
2946 int map_read_to_contig(const String<char>& contig, const String<char>& read)
2947 {
2948 int contig_len = length(contig);
2949 int read_len = length(read);
2950 int pos = -1;
2951 int mismatch = 3;
2952
2953 for (int i = 0; i < contig_len - read_len; ++i)
2954 {
2955 int temp_mismatch = 0;
2956 for (int j = 0; j < read_len; ++j)
2957 {
2958 if (contig[i+j] != read[j])
2959 ++temp_mismatch;
2960
2961 if (temp_mismatch >= mismatch)
2962 break;
2963 }
2964
2965 if (temp_mismatch < mismatch)
2966 {
2967 pos = i;
2968 mismatch = temp_mismatch;
2969 }
2970 }
2971
2972 return pos;
2973 }
2974
2975
2976 void find_fusions(RefSequenceTable& rt,
2977 ReadStream& reads_file,
2978 vector<HitsForRead>& hits_for_read,
2979 HitStream& partner_hit_stream,
2980 HitStream& seg_partner_hit_stream,
2981 FusionSimpleSet& fusions,
2982 eREAD read_side)
2983 {
2984 if (hits_for_read.empty())
2985 return;
2986
2987 size_t last_segment = hits_for_read.size() - 1;
2988 while (last_segment > 0)
2989 {
2990 if (!hits_for_read[last_segment].hits.empty())
2991 break;
2992
2993 --last_segment;
2994 }
2995
2996 // daehwan
2997 #if 0
2998 if (last_segment == 0 || hits_for_read[0].hits.empty())
2999 {
3000 vector<BowtieHit>& hits = last_segment == 0 ? hits_for_read[0].hits : hits_for_read[1].hits;
3001
3002 for (size_t i = 0; i < hits.size(); ++i)
3003 {
3004 BowtieHit* hit = &hits[i];
3005
3006 static const uint32_t chr_id1 = rt.get_id("chr2");
3007 static const uint32_t chr_id2 = rt.get_id("chr3");
3008
3009 // KPL-4 PPP1R12A 12:80167343-80329235:-1 SEPT10 2:110300380-110371783:-1
3010 // const uint32_t left1 = 80167343, right1 = 80329235, left2 = 110300380, right2 = 110371783;
3011
3012 // VCaP TIA1 2:70436576-70475792:-1 DIRC2 3:122513642-122599986:1
3013 const uint32_t left1 = 70436576, right1 = 70475792, left2 = 122513642, right2 = 122599986;
3014
3015 if ((hit->ref_id() == chr_id1 && hit->left() >= left1 && hit->left() <= right1) ||
3016 (hit->ref_id() == chr_id2 && hit->left() >= left2 && hit->left() <= right2))
3017 {
3018 cout << hit->insert_id() << endl;
3019 break;
3020 #if 0
3021 cout << "insert id: " << hit->insert_id() << "\t num hits: " << hits.size() << endl
3022 << hit->ref_id() << ":" << (hit->antisense_align() ? "-" : "+") << " " << (int)hit->edit_dist() << endl
3023 << hit->left() << "-" << hit->right() << endl
3024 << hit->seq() << endl << endl;
3025 #endif
3026 }
3027 }
3028 }
3029 #endif
3030
3031
3032 size_t first_segment = 0;
3033
3034 if (last_segment == first_segment &&
3035 (hits_for_read[first_segment].hits.empty() || hits_for_read[first_segment].hits[0].end()))
3036 return;
3037
3038 uint32_t insert_id = hits_for_read[last_segment].insert_id;
3039
3040 HitsForRead partner_hit_group;
3041 uint32_t next_order = partner_hit_stream.next_group_id();
3042
3043 bool has_partner = false;
3044 while (insert_id >= next_order && next_order != 0)
3045 {
3046 partner_hit_stream.next_read_hits(partner_hit_group);
3047 next_order = partner_hit_stream.next_group_id();
3048 }
3049
3050 has_partner = insert_id == partner_hit_group.insert_id;
3051
3052 if (!has_partner)
3053 {
3054 next_order = seg_partner_hit_stream.next_group_id();
3055 while (insert_id >= next_order && next_order != 0)
3056 {
3057 seg_partner_hit_stream.next_read_hits(partner_hit_group);
3058 next_order = seg_partner_hit_stream.next_group_id();
3059 }
3060
3061 has_partner = insert_id == partner_hit_group.insert_id;
3062 }
3063
3064 /*
3065 * Need to identify the appropriate insert id for this group of reads
3066 */
3067
3068
3069 Read read;
3070 bool got_read = reads_file.getRead(hits_for_read[last_segment].insert_id, read);
3071 if (!got_read)
3072 return;
3073
3074 HitsForRead partner_hits_for_read;
3075 if (first_segment != last_segment)
3076 partner_hits_for_read = hits_for_read[last_segment];
3077
3078 HitsForRead& left_segment_hits = hits_for_read[first_segment];
3079 HitsForRead& right_segment_hits = partner_hits_for_read;
3080
3081 seqan::String<char> fullRead, rcRead;
3082 fullRead = read.seq;
3083 rcRead = read.seq;
3084 seqan::convertInPlace(rcRead, seqan::FunctorComplement<Dna>());
3085 seqan::reverseInPlace(rcRead);
3086
3087 size_t read_length = seqan::length(fullRead);
3088
3089 // daehwan
3090 bool check_partner = true;
3091 if (first_segment != last_segment)
3092 {
3093 for (size_t i = 0; i < left_segment_hits.hits.size(); ++i)
3094 {
3095 BowtieHit& leftHit = left_segment_hits.hits[i];
3096 for (size_t j = 0; j < right_segment_hits.hits.size(); ++j)
3097 {
3098 BowtieHit& rightHit = right_segment_hits.hits[j];
3099
3100 if (leftHit.ref_id() == rightHit.ref_id() && leftHit.antisense_align() == rightHit.antisense_align())
3101 {
3102 int dist = 0;
3103 if (leftHit.antisense_align())
3104 dist = leftHit.left() - rightHit.right();
3105 else
3106 dist = rightHit.left() - leftHit.right();
3107
3108 if (dist > -max_insertion_length && dist <= (int)fusion_min_dist)
3109 {
3110 check_partner = false;
3111 break;
3112 }
3113 }
3114 }
3115
3116 if (!check_partner)
3117 break;
3118 }
3119 }
3120
3121 if (check_partner && has_partner)
3122 {
3123 for (size_t l = 0; l < left_segment_hits.hits.size(); ++l)
3124 {
3125 BowtieHit& leftHit = left_segment_hits.hits[l];
3126 for (size_t r = 0; r < partner_hit_group.hits.size(); ++r)
3127 {
3128 BowtieHit& rightHit = partner_hit_group.hits[r];
3129
3130 if (leftHit.ref_id() == rightHit.ref_id())
3131 {
3132 if (leftHit.antisense_align() != rightHit.antisense_align())
3133 {
3134 int dist = 0;
3135 if (leftHit.antisense_align())
3136 dist = leftHit.left() - rightHit.right();
3137 else
3138 dist = rightHit.left() - leftHit.right();
3139
3140 if (dist > (int)inner_dist_mean - (int)inner_dist_std_dev && dist <= (int)fusion_min_dist)
3141 continue;
3142 }
3143 }
3144
3145 RefSequenceTable::Sequence* ref_str = rt.get_seq(rightHit.ref_id());
3146
3147 const size_t part_seq_len = inner_dist_std_dev > inner_dist_mean ? inner_dist_std_dev - inner_dist_mean : 0;
3148 const size_t flanking_seq_len = inner_dist_mean + inner_dist_std_dev;
3149
3150 Dna5String right_flanking_seq;
3151 size_t left = 0;
3152 if (rightHit.antisense_align())
3153 {
3154 if (flanking_seq_len <= rightHit.left())
3155 {
3156 left = rightHit.left() - flanking_seq_len;
3157 right_flanking_seq = seqan::infix(*ref_str, left, left + flanking_seq_len + part_seq_len);
3158 }
3159 else
3160 break;
3161 }
3162 else
3163 {
3164 if (part_seq_len <= rightHit.right())
3165 {
3166 left = rightHit.right() - part_seq_len;
3167 right_flanking_seq = seqan::infix(*ref_str, left, left + flanking_seq_len + part_seq_len);
3168 }
3169 else
3170 break;
3171 }
3172
3173 const size_t check_read_len = min(15, segment_length - segment_mismatches - 3);
3174 seqan::String<char> fwd_read = infix(fullRead, read_length - check_read_len, read_length);
3175 seqan::String<char> rev_read = infix(rcRead, 0, check_read_len);
3176
3177 int fwd_pos = map_read_to_contig(right_flanking_seq, fwd_read);
3178
3179 if (fwd_pos >= 0)
3180 {
3181 BowtieHit hit(rightHit.ref_id(), rightHit.ref_id(), rightHit.insert_id(),
3182 left + fwd_pos, check_read_len, false, 0, true);
3183
3184 right_segment_hits.hits.push_back(hit);
3185 }
3186
3187 int rev_pos = map_read_to_contig(right_flanking_seq, rev_read);
3188
3189 if (rev_pos >= 0)
3190 {
3191 BowtieHit hit(rightHit.ref_id(), rightHit.ref_id(), rightHit.insert_id(),
3192 left + rev_pos, check_read_len, true, 0, true);
3193
3194 right_segment_hits.hits.push_back(hit);
3195 }
3196
3197 // daehwan
3198 #if 0
3199 if (fwd_pos >= 0 || rev_pos >= 0)
3200 {
3201 // if (leftHit.insert_id() == 409048 || leftHit.insert_id() == 4341516)
3202 {
3203 cout << "insert id: " << leftHit.insert_id() << endl
3204 << "fwd: " << fwd_read << " " << fwd_pos << endl
3205 << "rev: " << rev_read << " " << rev_pos << endl
3206 << "ref: " << right_flanking_seq << endl;
3207 }
3208 }
3209 #endif
3210 }
3211 }
3212 }
3213
3214 // daehwan - should check this out
3215 static const string chrM = "chrM";
3216 static const uint32_t chrM_id = rt.get_id(chrM);
3217 for (size_t left_segment_index = 0; left_segment_index < left_segment_hits.hits.size(); ++left_segment_index)
3218 {
3219 for (size_t right_segment_index = 0; right_segment_index < right_segment_hits.hits.size(); ++right_segment_index)
3220 {
3221 BowtieHit* leftHit = &left_segment_hits.hits[left_segment_index];
3222 BowtieHit* rightHit = &right_segment_hits.hits[right_segment_index];
3223
3224 // daehwan
3225 if (leftHit->ref_id() == chrM_id || rightHit->ref_id() == chrM_id)
3226 continue;
3227
3228 // daehwan
3229 #if 0
3230 if (leftHit->ref_id() == rightHit->ref_id() && leftHit->ref_id() == 1119738090)
3231 {
3232 const uint32_t left1 = 113951556, right1 = 113977987, left2 = 113548692, right2 = 113754053;
3233 if ((leftHit->left() >= left1 && leftHit->left() <= right1 && rightHit->left() >= left2 && rightHit->left() <= right2) ||
3234 (leftHit->left() >= left2 && leftHit->left() <= right2 && rightHit->left() >= left1 && rightHit->left() <= right1))
3235 {
3236 cout << "insert id: " << leftHit->insert_id() << "\t num hits: " << left_segment_hits.hits.size() << ":" << right_segment_hits.hits.size() << endl
3237 << leftHit->ref_id() << ":" << (leftHit->antisense_align() ? "-" : "+") << " " << (int)leftHit->edit_dist() <<endl
3238 << leftHit->left() << "-" << leftHit->right() << endl
3239 << rightHit->ref_id() << ":" << (rightHit->antisense_align() ? "-" : "+") << " " << (int)rightHit->edit_dist() << endl
3240 << rightHit->left() << "-" << rightHit->right() << endl << endl;
3241 }
3242 }
3243 #endif
3244
3245 if (leftHit->ref_id() == rightHit->ref_id())
3246 {
3247 if (leftHit->antisense_align() == rightHit->antisense_align())
3248 {
3249 int dist = 0;
3250 if (leftHit->antisense_align())
3251 dist = leftHit->left() - rightHit->right();
3252 else
3253 dist = rightHit->left() - leftHit->right();
3254
3255 if (dist >= 0 && dist <= (int)fusion_min_dist)
3256 continue;
3257 }
3258 }
3259
3260 uint32_t dir = FUSION_FF;
3261 seqan::String<char>* modifiedRead = &fullRead;
3262
3263 if (leftHit->antisense_align() == rightHit->antisense_align())
3264 {
3265 if (leftHit->antisense_align())
3266 {
3267 BowtieHit * tmp = leftHit;
3268 leftHit = rightHit;
3269 rightHit = tmp;
3270 modifiedRead = &rcRead;
3271 }
3272 }
3273 else if (leftHit->antisense_align() == false && rightHit->antisense_align() == true)
3274 dir = FUSION_FR;
3275 else
3276 dir = FUSION_RF;
3277
3278 detect_fusion(rt, *modifiedRead, *leftHit, *rightHit, fusions, dir);
3279 }
3280 }
3281 }
3282
3283 void find_gaps(RefSequenceTable& rt,
3284 ReadStream& reads_file,
3285 vector<HitsForRead>& hits_for_read,
3286 HitStream& partner_hit_stream,
3287 HitStream& seg_partner_hit_stream,
3288 std::set<Junction, skip_count_lt>& seg_juncs,
3289 eREAD read_side)
3290 {
3291 if (hits_for_read.empty())
3292 return;
3293
3294 size_t last_segment = hits_for_read.size() - 1;
3295 while (last_segment > 0)
3296 {
3297 if (!hits_for_read[last_segment].hits.empty())
3298 break;
3299
3300 --last_segment;
3301 }
3302
3303 hits_for_read.resize(last_segment + 1);
3304
3305 size_t first_segment = 0;
3306 if (last_segment == first_segment &&
3307 (hits_for_read[first_segment].hits.empty() || hits_for_read[first_segment].hits[0].end()))
3308 return;
3309
3310 uint32_t insert_id = hits_for_read[last_segment].insert_id;
3311
3312 HitsForRead partner_hit_group;
3313 uint32_t next_order = partner_hit_stream.next_group_id();
3314
3315 bool has_partner = false;
3316 while (insert_id >= next_order && next_order != 0)
3317 {
3318 partner_hit_stream.next_read_hits(partner_hit_group);
3319 next_order = partner_hit_stream.next_group_id();
3320 }
3321
3322 has_partner = insert_id == partner_hit_group.insert_id;
3323
3324 if (!has_partner)
3325 {
3326 next_order = seg_partner_hit_stream.next_group_id();
3327 while (insert_id >= next_order && next_order != 0)
3328 {
3329 seg_partner_hit_stream.next_read_hits(partner_hit_group);
3330 next_order = seg_partner_hit_stream.next_group_id();
3331 }
3332
3333 has_partner = insert_id == partner_hit_group.insert_id;
3334 }
3335
3336 Read read;
3337 /*
3338 bool got_read = get_read_from_stream(hits_for_read[last_segment].insert_id,
3339 reads_file,
3340 FASTQ,
3341 false,
3342 read); */
3343 bool got_read = reads_file.getRead(hits_for_read[last_segment].insert_id, read);
3344 if (!got_read) {
3345 err_die("Error: could not get read# %d from stream!",
3346 (int)hits_for_read[last_segment].insert_id);
3347 return;
3348 }
3349
3350 HitsForRead partner_hits_for_read;
3351 if (first_segment != last_segment)
3352 partner_hits_for_read = hits_for_read[last_segment];
3353
3354 HitsForRead& left_segment_hits = hits_for_read[first_segment];
3355 HitsForRead& right_segment_hits = partner_hits_for_read;
3356
3357 bool check_partner = true;
3358 if (first_segment != last_segment)
3359 {
3360 for (size_t i = 0; i < left_segment_hits.hits.size(); ++i)
3361 {
3362 BowtieHit& leftHit = left_segment_hits.hits[i];
3363 for (size_t j = 0; j < right_segment_hits.hits.size(); ++j)
3364 {
3365 BowtieHit& rightHit = right_segment_hits.hits[j];
3366
3367 if (leftHit.ref_id() == rightHit.ref_id() && leftHit.antisense_align() == rightHit.antisense_align())
3368 {
3369 int dist = 0;
3370 if (leftHit.antisense_align())
3371 dist = leftHit.left() - rightHit.right();
3372 else
3373 dist = rightHit.left() - leftHit.right();
3374
3375 if (dist >= min_segment_intron_length && dist < (int)max_segment_intron_length)
3376 {
3377 check_partner = false;
3378 break;
3379 }
3380 }
3381 }
3382
3383 if (!check_partner)
3384 break;
3385 }
3386 }
3387
3388 if (check_partner && has_partner)
3389 {
3390 // empty hits from 1 to num_segments - 1
3391 for (size_t i = first_segment + 1; i < hits_for_read.size(); ++i)
3392 {
3393 hits_for_read[i].hits.clear();
3394 }
3395
3396 seqan::String<char> fullRead, rcRead;
3397 fullRead = read.seq;
3398 rcRead = read.seq;
3399 seqan::convertInPlace(rcRead, seqan::FunctorComplement<Dna>());
3400 seqan::reverseInPlace(rcRead);
3401 size_t read_length = read.seq.length();
3402
3403 for (size_t l = 0; l < left_segment_hits.hits.size(); ++l)
3404 {
3405 BowtieHit& leftHit = left_segment_hits.hits[l];
3406 for (size_t r = 0; r < partner_hit_group.hits.size(); ++r)
3407 {
3408 BowtieHit& rightHit = partner_hit_group.hits[r];
3409 if (leftHit.ref_id() != rightHit.ref_id() || leftHit.antisense_align() == rightHit.antisense_align())
3410 continue;
3411
3412 int dist = 0;
3413 if (leftHit.antisense_align())
3414 dist = leftHit.left() - rightHit.right();
3415 else
3416 dist = rightHit.left() - leftHit.right();
3417
3418 if (dist < min_segment_intron_length && dist >= (int)max_segment_intron_length)
3419 continue;
3420
3421 RefSequenceTable::Sequence* ref_str = rt.get_seq(rightHit.ref_id());
3422 const size_t part_seq_len = inner_dist_std_dev > inner_dist_mean ? inner_dist_std_dev - inner_dist_mean : 0;
3423 const size_t flanking_seq_len = inner_dist_mean + inner_dist_std_dev;
3424
3425 Dna5String right_flanking_seq;
3426 size_t left = 0;
3427 if (rightHit.antisense_align())
3428 {
3429 if (flanking_seq_len <= rightHit.left())
3430 {
3431 left = rightHit.left() - flanking_seq_len;
3432 right_flanking_seq = seqan::infix(*ref_str, left, left + flanking_seq_len + part_seq_len);
3433 }
3434 else
3435 break;
3436 }
3437 else
3438 {
3439 if (part_seq_len <= rightHit.right())
3440 {
3441 left = rightHit.right() - part_seq_len;
3442 right_flanking_seq = seqan::infix(*ref_str, left, left + flanking_seq_len + part_seq_len);
3443 }
3444 else
3445 break;
3446 }
3447
3448 const size_t check_read_len = min(15, segment_length - segment_mismatches - 3);
3449 seqan::String<char> fwd_read = infix(fullRead, read_length - check_read_len, read_length);
3450 seqan::String<char> rev_read = infix(rcRead, 0, check_read_len);
3451
3452 int fwd_pos = map_read_to_contig(right_flanking_seq, fwd_read);
3453 if (fwd_pos >= 0)
3454 {
3455 BowtieHit hit(rightHit.ref_id(), rightHit.ref_id2(), rightHit.insert_id(),
3456 left + fwd_pos, check_read_len, false, 0, true);
3457
3458 hits_for_read[last_segment].hits.push_back(hit);
3459 }
3460
3461 int rev_pos = map_read_to_contig(right_flanking_seq, rev_read);
3462
3463 if (rev_pos >= 0)
3464 {
3465 BowtieHit hit(rightHit.ref_id(), rightHit.ref_id2(), rightHit.insert_id(),
3466 left + rev_pos, check_read_len, true, 0, true);
3467
3468 hits_for_read[last_segment].hits.push_back(hit);
3469 }
3470
3471 // daehwan - for debug purposes
3472 #if B_DEBUG
3473 cerr << "daehwan!!!" << endl
3474 << "insert id: " << rightHit.insert_id() << endl
3475 << "first segment: " << first_segment << ", last_segment: " << last_segment << endl
3476 << right_flanking_seq << " : " << seqan::length(right_flanking_seq) << endl
3477 << fwd_read << " : " << fwd_pos << endl
3478 << rev_read << " : " << rev_pos << endl
3479 << "left: " << leftHit.left() << "-" << leftHit.right() << (leftHit.antisense_align() ? " -" : " +") << endl
3480 << "right: " << rightHit.left() << "-" << rightHit.right() << (rightHit.antisense_align() ? " -" : " +") << endl;
3481 if (fwd_pos >= 0 || rev_pos >= 0)
3482 {
3483 const BowtieHit& hit = hits_for_read[last_segment].hits.back();
3484 cerr << "back: " << hit.left() << "-" << hit.right() << (hit.antisense_align() ? " -" : " +") << endl;
3485 }
3486 #endif
3487 }
3488 }
3489 }
3490
3491 vector<RefSeg> expected_don_acc_windows;
3492 string seq(read.seq);
3493
3494 // ignore segments that map to more than this many places that would otherwise produce
3495 // many false splice junctions.
3496 if (bowtie2)
3497 {
3498 const size_t max_seg_hits = max_multihits * 2;
3499 for (size_t s = 0; s < hits_for_read.size(); ++s)
3500 {
3501 if (hits_for_read[s].hits.size() > max_seg_hits)
3502 return;
3503 }
3504 }
3505
3506 for (size_t s = 0; s < hits_for_read.size(); ++s)
3507 {
3508 HitsForRead& curr = hits_for_read[s];
3509 for (size_t h = 0; h < curr.hits.size(); ++h)
3510 {
3511 bool found_right_seg_partner = s == hits_for_read.size() - 1;
3512 BowtieHit& bh = curr.hits[h];
3513
3514 // "drs" is distant seg right partner
3515 // "rrs" is right of right seg partner
3516 vector<BowtieHit*> drs_bhs;
3517 vector<BowtieHit*> rrs_bhs;
3518
3519 if (s < hits_for_read.size() - 1)
3520 {
3521 // Look for a right partner for the current hit
3522 HitsForRead& right = hits_for_read[s + 1];
3523
3524 for (size_t r = 0; r < right.hits.size(); ++r)
3525 {
3526 BowtieHit& rh = right.hits[r];
3527 if (bh.antisense_align() != rh.antisense_align() || bh.ref_id() != rh.ref_id())
3528 continue;
3529
3530 if ((bh.antisense_align() && rh.right() == bh.left()) ||
3531 (!bh.antisense_align() && bh.right() == rh.left() ))
3532 {
3533 found_right_seg_partner = true;
3534 break;
3535 }
3536
3537 int dist = 0;
3538 if (bh.antisense_align())
3539 dist = bh.left() - rh.right();
3540 else
3541 dist = rh.left() - bh.right();
3542
3543 if (dist >= min_segment_intron_length && dist < (int)max_segment_intron_length)
3544 drs_bhs.push_back(&rh);
3545 }
3546 }
3547
3548 if (!found_right_seg_partner && s < hits_for_read.size() - 2)
3549 {
3550 // Look for a right of right partner for the current hit
3551 HitsForRead& right_right = hits_for_read[s + 2];
3552
3553 for (size_t r = 0; r < right_right.hits.size(); ++r)
3554 {
3555 BowtieHit& rrh = right_right.hits[r];
3556 if (bh.antisense_align() != rrh.antisense_align() || bh.ref_id() != rrh.ref_id())
3557 continue;
3558
3559 int dist = 0;
3560 if (bh.antisense_align())
3561 dist = bh.left() - rrh.right();
3562 else
3563 dist = rrh.left() - bh.right();
3564
3565 if (dist >= min_segment_intron_length + segment_length && dist < (int)max_segment_intron_length + segment_length)
3566 rrs_bhs.push_back(&rrh);
3567 }
3568 }
3569
3570 if (!found_right_seg_partner && (drs_bhs.size() > 0 || rrs_bhs.size() > 0))
3571 {
3572 const int look_bp = 8;
3573 const size_t color_offset = color ? 1 : 0;
3574
3575 vector<BowtieHit*> d_bhs = rrs_bhs.size() > 0 ? rrs_bhs : drs_bhs;
3576 for (size_t r = 0; r < d_bhs.size(); ++r)
3577 {
3578 string support_read;
3579 if (rrs_bhs.size() <= 0)
3580 support_read = seq.substr(color_offset + (s+1) * segment_length - look_bp, look_bp * 2);
3581 else
3582 support_read = seq.substr(color_offset + (s+1) * segment_length - look_bp, segment_length + look_bp * 2);
3583
3584 BowtieHit& d_bh = *(d_bhs[r]);
3585 if (!bh.antisense_align())
3586 {
3587 RefSeg right_seg(bh.ref_id(), POINT_DIR_BOTH, bh.antisense_align(), read_side, 0, 0, support_read);
3588 right_seg.left = max(0, bh.right() - look_bp);
3589 right_seg.right = d_bh.left() + look_bp;
3590 expected_don_acc_windows.push_back(right_seg);
3591 }
3592 else
3593 {
3594 if (color)
3595 reverse(support_read.begin(), support_read.end());
3596 else
3597 reverse_complement(support_read);
3598
3599 RefSeg left_seg(bh.ref_id(), POINT_DIR_BOTH, bh.antisense_align(), read_side, 0, 0, support_read);
3600 left_seg.left = d_bh.right() - look_bp;
3601 left_seg.right = bh.left() + look_bp;
3602 expected_don_acc_windows.push_back(left_seg);
3603 }
3604
3605 // daehwan
3606 #ifdef B_DEBUG2
3607 cout << "insert id: " << bh.insert_id() << endl
3608 << (bh.antisense_align() ? "-" : "+") << endl
3609 << seq << endl
3610 << "(" << s << ") - " << support_read << endl;
3611 #endif
3612 }
3613 }
3614 }
3615 } //for each hits_for_read
3616
3617 juncs_from_ref_segs<RecordSegmentJuncs>(rt,
3618 expected_don_acc_windows,
3619 seg_juncs,
3620 "GT",
3621 "AG",
3622 max_segment_intron_length,
3623 min_segment_intron_length,
3624 max_seg_juncs,
3625 false,
3626 0);
3627
3628 juncs_from_ref_segs<RecordSegmentJuncs>(rt,
3629 expected_don_acc_windows,
3630 seg_juncs,
3631 "GC",
3632 "AG",
3633 max_segment_intron_length,
3634 min_segment_intron_length,
3635 max_seg_juncs,
3636 false,
3637 0);
3638
3639 juncs_from_ref_segs<RecordSegmentJuncs>(rt,
3640 expected_don_acc_windows,
3641 seg_juncs,
3642 "AT",
3643 "AC",
3644 max_segment_intron_length,
3645 min_segment_intron_length,
3646 max_seg_juncs,
3647 false,
3648 0);
3649 }
3650
3651 MerTable mer_table;
3652
3653 int seed_alignments = 0;
3654 int microaligned_segs = 0;
3655
3656 map<RefSeg, vector<string>* > microexon_windows;
3657
3658 bool overlap_in_genome(int ll, int lr, int rl, int rr)
3659 {
3660 if (ll >= rl && ll < rr)
3661 return true;
3662 if (lr > rl && lr < rr)
3663 return true;
3664 if (rl >= ll && rl < lr)
3665 return true;
3666 if (rr > ll && rr < lr)
3667 return true;
3668 return false;
3669 }
3670
3671 void add_to_microexon_windows(uint32_t ref_id,
3672 int left_boundary,
3673 int right_boundary,
3674 const string& dna_str,
3675 eREAD read)
3676 {
3677 RefSeg left_dummy(ref_id, POINT_DIR_DONTCARE, false, read, left_boundary, right_boundary);
3678 RefSeg right_dummy(ref_id, POINT_DIR_DONTCARE, false, read, right_boundary, right_boundary + 1);
3679
3680 map<RefSeg, vector<string>* >::iterator lb = microexon_windows.lower_bound(left_dummy);
3681 map<RefSeg, vector<string>* >::iterator ub = microexon_windows.lower_bound(right_dummy);
3682 vector<string>* new_vec = NULL;
3683 if (lb == microexon_windows.end())
3684 {
3685 microexon_windows.insert(make_pair(left_dummy, new vector<string>(1, dna_str)));
3686 return;
3687 }
3688
3689 map<RefSeg, vector<string>* >::iterator first_to_be_erased = microexon_windows.end();
3690 map<RefSeg, vector<string>* >::iterator last_to_be_erased = ub;
3691
3692 while (lb != ub)
3693 {
3694 // everyone in this range that overlaps with the new interval needs to
3695 // be merged together.
3696 if (overlap_in_genome(lb->first.left, lb->first.right, left_boundary, right_boundary))
3697 {
3698 if (!new_vec)
3699 new_vec = new vector<string>();
3700 if (first_to_be_erased == microexon_windows.end())
3701 first_to_be_erased = lb;
3702 left_dummy.left = min(lb->first.left, left_boundary);
3703 left_dummy.right = max(lb->first.right, right_boundary);
3704
3705 new_vec->insert(new_vec->end(), lb->second->begin(), lb->second->end());
3706 delete lb->second;
3707 }
3708 else if (first_to_be_erased != microexon_windows.end())
3709 {
3710 last_to_be_erased = lb;
3711 }
3712
3713 ++lb;
3714 }
3715
3716 if (first_to_be_erased != microexon_windows.end())
3717 {
3718 microexon_windows.erase(first_to_be_erased, last_to_be_erased);
3719 }
3720
3721 if (!new_vec)
3722 {
3723 // never found an overlapping window, so just add this one and bail
3724 microexon_windows.insert(make_pair(left_dummy, new vector<string>(1, dna_str)));
3725 return;
3726 }
3727 else
3728 {
3729 new_vec->push_back(dna_str);
3730 microexon_windows.insert(make_pair(left_dummy, new_vec));
3731 return;
3732 }
3733
3734 }
3735
3736 void align_microexon_segs(RefSequenceTable& rt,
3737 std::set<Junction, skip_count_lt>& juncs,
3738 int max_juncs,
3739 int half_splice_mer_len)
3740 {
3741 int num_segments = 0;
3742 for (map<RefSeg, vector<string>* >::iterator itr = microexon_windows.begin();
3743 itr != microexon_windows.end(); ++itr)
3744 {
3745 vector<string>& unaligned_segments = *itr->second;
3746 num_segments += unaligned_segments.size();
3747 }
3748
3749 fprintf(stderr, "Aligning %d microexon segments in %lu windows\n",
3750 num_segments, (long unsigned int)microexon_windows.size());
3751
3752 extensions.clear();
3753
3754 size_t splice_mer_len = 2 * half_splice_mer_len;
3755 size_t mer_table_size = 1 << ((splice_mer_len)<<1);
3756
3757 extensions.resize(mer_table_size);
3758
3759 int window_num = 0;
3760 for (map<RefSeg, vector<string>* >::iterator itr = microexon_windows.begin();
3761 itr != microexon_windows.end(); ++itr)
3762 {
3763 window_num++;
3764 if ((window_num % 100) == 0)
3765 fprintf(stderr, "\twindow %d\n",window_num);
3766
3767
3768 stringstream ss(stringstream::in | stringstream::out);
3769
3770 for (size_t j = 0; j < extensions.size(); ++j)
3771 {
3772 extensions[j].clear();
3773 }
3774
3775 vector<string>& unaligned_segments = *itr->second;
3776
3777 for (size_t j = 0; j < unaligned_segments.size(); ++j)
3778 {
3779 stringstream ss(stringstream::in | stringstream::out);
3780 string s;
3781 //cerr << w.unaligned_segments[j];
3782 ss << unaligned_segments[j];
3783 ss >> s;
3784
3785 store_read_extensions(extensions,
3786 half_splice_mer_len,
3787 half_splice_mer_len,
3788 s,
3789 false);
3790 }
3791
3792 vector<RefSeg> segs;
3793 segs.push_back(itr->first);
3794 RefSeg r = itr->first;
3795 r.points_where = POINT_DIR_LEFT;
3796 segs.push_back(r);
3797
3798 juncs_from_ref_segs<RecordExtendableJuncs>(rt,
3799 segs,
3800 juncs,
3801 "GT",
3802 "AG",
3803 max_microexon_stretch,
3804 min_coverage_intron_length,
3805 max_juncs,
3806 false,
3807 half_splice_mer_len);
3808 num_segments += unaligned_segments.size();
3809 delete itr->second;
3810 }
3811 fprintf(stderr, "Checked %d segments against %lu windows for microexon junctions\n",
3812 num_segments, (long unsigned int)microexon_windows.size());
3813 fprintf(stderr, "Found %ld potential microexon junctions\n", (long int)juncs.size());
3814 }
3815
3816 /*
3817 * Easy guys ... this function puts its pants on just like the rest of you -- one leg
3818 * at a time. Except, once its pants are on, it makes gold records. Alright, here we
3819 * go.
3820 */
3821
3822 void look_for_hit_group(RefSequenceTable& rt,
3823 ReadStream& readstream,
3824 ReadStream& readstream_for_segment_search,
3825 ReadStream& readstream_for_indel_discovery,
3826 ReadStream& readstream_for_fusion_discovery,
3827 ReadTable& unmapped_reads,
3828 vector<HitStream>& seg_files,
3829 int curr_file,
3830 uint64_t insert_id,
3831 vector<HitsForRead>& hits_for_read, //<-- collecting segment hits for this read
3832 HitStream& partner_hit_stream_for_segment_search,
3833 HitStream& seg_partner_hit_stream_for_segment_search,
3834 HitStream& partner_hit_stream_for_fusion_discovery,
3835 HitStream& seg_partner_hit_stream_for_fusion_discovery,
3836 std::set<Junction, skip_count_lt>& juncs,
3837 std::set<Deletion>& deletions,
3838 std::set<Insertion>& insertions,
3839 FusionSimpleSet& fusions,
3840 eREAD read_side,
3841 uint32_t begin_id,
3842 uint32_t end_id)
3843 {
3844 HitStream& hit_stream = seg_files[curr_file];
3845 HitsForRead hit_group;
3846 uint32_t order = unmapped_reads.observation_order(insert_id);
3847 int seq_key_len = min((int)min_anchor_len, 6);
3848 while(true) {
3849 uint64_t next_group_id = hit_stream.next_group_id();
3850 uint32_t next_order = unmapped_reads.observation_order(next_group_id);
3851 // If we would have seen the hits by now, stop looking in this stream,
3852 // but forward the search to the next (lower) segment if possible.
3853 if (order < next_order || next_group_id == 0) {
3854 if (curr_file > 0) {
3855 //look for next (lower) segment mappings for this read
3856 look_for_hit_group(rt,
3857 readstream,
3858 readstream_for_segment_search,
3859 readstream_for_indel_discovery,
3860 readstream_for_fusion_discovery,
3861 unmapped_reads,
3862 seg_files,
3863 curr_file - 1,
3864 insert_id,
3865 hits_for_read,
3866 partner_hit_stream_for_segment_search,
3867 seg_partner_hit_stream_for_segment_search,
3868 partner_hit_stream_for_fusion_discovery,
3869 seg_partner_hit_stream_for_fusion_discovery,
3870 juncs,
3871 deletions,
3872 insertions,
3873 fusions,
3874 read_side,
3875 begin_id,
3876 end_id);
3877 }
3878 else
3879 if (insert_id && !no_microexon_search) {
3880 //microexon search
3881 Read read;
3882 // The hits are missing for the leftmost segment, which means
3883 // we should try looking for junctions via seed and extend
3884 // using it (helps find junctions to microexons).
3885 /* bool got_read = get_read_from_stream(insert_id,
3886 readstream.file,
3887 FASTQ,
3888 false,
3889 read); */
3890 bool got_read = readstream.getRead(insert_id, read);
3891 if (!got_read) {
3892 //fprintf(stderr, "Warning: could not get read with insert_id=%d\n", (int)insert_id);
3893 //break; //exit loop
3894 err_die("Error: could not get read with insert_id=%d from file %s\n",
3895 (int)insert_id, readstream.filename());
3896 }
3897 string fwd_read = read.seq;
3898 if (color) // remove the primer and the adjacent color
3899 fwd_read.erase(0, 2);
3900 // make sure there are hits for all the other segs, all the
3901 // way to the root (right-most) one.
3902 int empty_seg = 0;
3903 for (size_t h = 1; h < hits_for_read.size(); ++h) {
3904 if (hits_for_read[h].hits.empty())
3905 empty_seg = h;
3906 }
3907 // if not, no microexon alignment for this segment
3908 if (empty_seg != 0)
3909 break;
3910 fwd_read = fwd_read.substr(0, segment_length);
3911 string rev_read = fwd_read;
3912 //check the reverse
3913 if (color) reverse(rev_read.begin(), rev_read.end());
3914 else reverse_complement(rev_read);
3915 for (size_t h = 0; h < hits_for_read[empty_seg + 1].hits.size(); ++h) {
3916 const BowtieHit& bh = hits_for_read[empty_seg + 1].hits[h];
3917 RefSequenceTable::Sequence* ref_str = rt.get_seq(bh.ref_id());
3918 if (ref_str == NULL)
3919 continue;
3920 int ref_len = length(*ref_str);
3921 int left_boundary;
3922 int right_boundary;
3923 bool antisense = bh.antisense_align();
3924 vector<BowtieHit> empty_seg_hits;
3925 seed_alignments++;
3926 if (antisense) {
3927 left_boundary = max(0, bh.right() - (int)min_anchor_len);
3928 right_boundary = min(ref_len - 2,
3929 left_boundary + max_microexon_stretch);
3930 if (right_boundary - left_boundary < 2 * seq_key_len)
3931 continue;
3932 microaligned_segs++;
3933 add_to_microexon_windows(bh.ref_id(), left_boundary, right_boundary, rev_read, read_side);
3934 }
3935 else {
3936 right_boundary = min(ref_len - 2,
3937 bh.left() + (int)min_anchor_len);
3938 left_boundary = max(0, right_boundary - max_microexon_stretch);
3939 if (right_boundary - left_boundary < 2 * seq_key_len)
3940 continue;
3941 microaligned_segs++;
3942 add_to_microexon_windows(bh.ref_id(), left_boundary, right_boundary, fwd_read, read_side);
3943 }
3944 } //for h
3945 } // !no_microexon_search
3946 break;
3947 }
3948 else if (hit_stream.next_read_hits(hit_group)) {
3949 // if we found hits for the target group in the left stream,
3950 // add them to the accumulating vector and continue the search
3951 if (hit_group.insert_id == insert_id) {
3952 hits_for_read[curr_file] = hit_group;
3953 if (curr_file > 0)
3954 // we need to look left (recursively) for the group we
3955 // just read for this stream.
3956 look_for_hit_group(rt,
3957 readstream,
3958 readstream_for_segment_search,
3959 readstream_for_indel_discovery,
3960 readstream_for_fusion_discovery,
3961 unmapped_reads,
3962 seg_files,
3963 curr_file - 1,
3964 insert_id,
3965 hits_for_read,
3966 partner_hit_stream_for_segment_search,
3967 seg_partner_hit_stream_for_segment_search,
3968 partner_hit_stream_for_fusion_discovery,
3969 seg_partner_hit_stream_for_fusion_discovery,
3970 juncs,
3971 deletions,
3972 insertions,
3973 fusions,
3974 read_side,
3975 begin_id,
3976 end_id);
3977 break;
3978 } //same insert_id (group)
3979 else if (curr_file >= 0) {
3980 // different group, we need to start a whole new
3981 // search for it, with a whole new vector of hits.
3982 vector<HitsForRead> hits_for_new_read(seg_files.size());
3983 hits_for_new_read[curr_file] = hit_group;
3984
3985 if (curr_file > 0)
3986 {
3987 look_for_hit_group(rt,
3988 readstream,
3989 readstream_for_segment_search,
3990 readstream_for_indel_discovery,
3991 readstream_for_fusion_discovery,
3992 unmapped_reads,
3993 seg_files,
3994 curr_file - 1,
3995 hit_group.insert_id,
3996 hits_for_new_read,
3997 partner_hit_stream_for_segment_search,
3998 seg_partner_hit_stream_for_segment_search,
3999 partner_hit_stream_for_fusion_discovery,
4000 seg_partner_hit_stream_for_fusion_discovery,
4001 juncs,
4002 deletions,
4003 insertions,
4004 fusions,
4005 read_side,
4006 begin_id,
4007 end_id);
4008
4009 if (hit_group.insert_id >= begin_id && hit_group.insert_id < end_id)
4010 {
4011 find_insertions_and_deletions(rt,
4012 readstream_for_indel_discovery,
4013 hits_for_new_read,
4014 deletions,
4015 insertions);
4016 find_gaps(rt,
4017 readstream_for_segment_search,
4018 hits_for_new_read,
4019 partner_hit_stream_for_segment_search,
4020 seg_partner_hit_stream_for_segment_search,
4021 juncs,
4022 read_side);
4023 }
4024 }
4025
4026 if (hit_group.insert_id >= begin_id && hit_group.insert_id < end_id)
4027 {
4028 if (fusion_search)
4029 {
4030 find_fusions(rt,
4031 readstream_for_fusion_discovery,
4032 hits_for_new_read,
4033 partner_hit_stream_for_fusion_discovery,
4034 seg_partner_hit_stream_for_fusion_discovery,
4035 fusions,
4036 read_side);
4037 }
4038 }
4039 } //different group
4040 }//got next group
4041 } //while loop
4042 }
4043
4044
4045 uint64_t process_next_hit_group(RefSequenceTable& rt,
4046 ReadStream& readstream,
4047 ReadStream& readstream_for_segment_search,
4048 ReadStream& readstream_for_indel_discovery,
4049 ReadStream& readstream_for_fusion_discovery,
4050 ReadTable& unmapped_reads,
4051 vector<HitStream>& seg_files,
4052 size_t last_file_idx,
4053 HitStream& partner_hit_stream_for_segment_search,
4054 HitStream& seg_partner_hit_stream_for_segment_search,
4055 HitStream& partner_hit_stream_for_fusion_discovery,
4056 HitStream& seg_partner_hit_stream_for_fusion_discovery,
4057 std::set<Junction, skip_count_lt>& juncs,
4058 std::set<Deletion>& deletions,
4059 std::set<Insertion>& insertions,
4060 FusionSimpleSet& fusions,
4061 eREAD read,
4062 uint32_t begin_id = 0,
4063 uint32_t end_id = VMAXINT32)
4064 {
4065 HitStream& last_segmap_hitstream = seg_files[last_file_idx];
4066 HitsForRead hit_group;
4067 bool result = last_segmap_hitstream.next_read_hits(hit_group);
4068 vector<HitsForRead> hits_for_read(seg_files.size());
4069 hits_for_read.back() = hit_group;
4070
4071 if (result && hit_group.insert_id >= end_id)
4072 return 0;
4073
4074 look_for_hit_group(rt,
4075 readstream,
4076 readstream_for_segment_search,
4077 readstream_for_indel_discovery,
4078 readstream_for_fusion_discovery,
4079 unmapped_reads,
4080 seg_files,
4081 (int)last_file_idx - 1,
4082 hit_group.insert_id,
4083 hits_for_read,
4084 partner_hit_stream_for_segment_search,
4085 seg_partner_hit_stream_for_segment_search,
4086 partner_hit_stream_for_fusion_discovery,
4087 seg_partner_hit_stream_for_fusion_discovery,
4088 juncs,
4089 deletions,
4090 insertions,
4091 fusions,
4092 read,
4093 begin_id,
4094 end_id);
4095
4096 if (result)
4097 {
4098 find_insertions_and_deletions(rt,
4099 readstream_for_indel_discovery,
4100 hits_for_read,
4101 deletions,
4102 insertions);
4103
4104 if (fusion_search)
4105 {
4106 find_fusions(rt,
4107 readstream_for_fusion_discovery,
4108 hits_for_read,
4109 partner_hit_stream_for_fusion_discovery,
4110 seg_partner_hit_stream_for_fusion_discovery,
4111 fusions,
4112 read);
4113 }
4114
4115 find_gaps(rt,
4116 readstream_for_segment_search,
4117 hits_for_read,
4118 partner_hit_stream_for_segment_search,
4119 seg_partner_hit_stream_for_segment_search,
4120 juncs,
4121 read);
4122
4123 return hit_group.insert_id;
4124 }
4125
4126 return 0;
4127 }
4128
4129 static const int UNCOVERED = 0;
4130 static const int LOOK_LEFT = 1;
4131 static const int LOOK_RIGHT = 2;
4132
4133 uint8_t get_cov(const vector<uint8_t>& cov, uint32_t c)
4134 {
4135 uint32_t b = c >> 2;
4136 uint32_t r = c & 0x3;
4137 uint8_t s = (r << 1);
4138 uint8_t v = cov[b];
4139 v &= (0x3 << s);
4140 v >>= s;
4141 return v;
4142 }
4143
4144 void build_coverage_map(ReadTable& it, RefSequenceTable& rt,
4145 vector<string>& seg_files, map<uint32_t, vector<bool> >& coverage_map) {
4146 if (!coverage_map.empty()) return;
4147 BAMHitFactory hit_factory(it,rt);
4148
4149 for (size_t f = 0; f < seg_files.size(); ++f)
4150 {
4151 //fprintf(stderr, "Adding hits from segment file %d to coverage map\n", (int)f);
4152 //seg_files[f]->rewind();
4153 HitStream hs(seg_files[f], &hit_factory, false, false, false);
4154 HitsForRead hit_group;
4155 while (hs.next_read_hits(hit_group))
4156 {
4157 for (size_t h = 0; h < hit_group.hits.size(); ++h)
4158 {
4159 BowtieHit& bh = hit_group.hits[h];
4160
4161 pair<map<uint32_t, vector<bool> >::iterator, bool> ret =
4162 coverage_map.insert(make_pair(bh.ref_id(), vector<bool>()));
4163 vector<bool>& ref_cov = ret.first->second;
4164
4165 size_t right_extent = bh.right();
4166
4167 if (right_extent >= ref_cov.size())
4168 {
4169 ref_cov.resize(right_extent + 1, 0);
4170 }
4171
4172 for (uint32_t c = (uint32_t)bh.left(); c < (uint32_t)bh.right(); ++c)
4173 {
4174 ref_cov[c] = true;
4175 //if (ref_cov[c]<255) ref_cov[c]++;
4176 }
4177 }
4178 } //while next_read_hits
4179 }
4180 }
4181
4182 void pair_covered_sites(ReadTable& it,
4183 RefSequenceTable& rt,
4184 vector<string>& segmap_fnames,
4185 std::set<Junction, skip_count_lt>& cov_juncs,
4186 map<uint32_t, vector<bool> >& coverage_map,
4187 size_t half_splice_mer_len)
4188 {
4189 vector<RefSeg> expected_look_left_windows;
4190 vector<RefSeg> expected_look_right_windows;
4191 build_coverage_map(it,rt, segmap_fnames, coverage_map);
4192
4193 static const int extend = 45;
4194 int num_islands = 0;
4195
4196 vector<RefSeg> expected_don_acc_windows;
4197
4198 fprintf(stderr, "Recording coverage islands\n");
4199 size_t cov_bases = 0;
4200 for (map<uint32_t, vector<bool> >::iterator itr = coverage_map.begin();
4201 itr != coverage_map.end();
4202 ++itr)
4203 {
4204 vector<bool>& cov = itr->second;
4205
4206 size_t island_left_edge = 0;
4207 for (size_t c = 1; c < cov.size(); ++c)
4208 {
4209 if (cov[c])
4210 {
4211 cov_bases++;
4212 if (!cov[c - 1])
4213 {
4214 num_islands += 1;
4215 int edge = (int)c - extend;
4216 edge = max(edge, 0);
4217 island_left_edge = edge;
4218 }
4219 }
4220 else
4221 {
4222 if (cov[c - 1])
4223 {
4224 expected_don_acc_windows.push_back(RefSeg(itr->first,
4225 POINT_DIR_LEFT,
4226 false, /* not important */
4227 READ_DONTCARE,
4228 island_left_edge,
4229 c + extend));
4230 expected_don_acc_windows.push_back(RefSeg(itr->first,
4231 POINT_DIR_RIGHT,
4232 false, /* not important */
4233 READ_DONTCARE,
4234 island_left_edge,
4235 c + extend));
4236 }
4237 }
4238 }
4239 }
4240 fprintf(stderr, "Found %d islands covering %ld bases\n", num_islands, (long int)cov_bases);
4241
4242 juncs_from_ref_segs<RecordButterflyJuncs>(rt,
4243 expected_don_acc_windows,
4244 cov_juncs,
4245 "GT",
4246 "AG",
4247 max_coverage_intron_length,
4248 min_coverage_intron_length,
4249 max_cov_juncs,
4250 true,
4251 half_splice_mer_len);
4252 fprintf(stderr, "Found %ld potential intra-island junctions\n", (long int)cov_juncs.size());
4253 }
4254
4255 // daehwan
4256 struct ReadInfo
4257 {
4258 ReadID read_id;
4259 uint32_t left;
4260 uint32_t right;
4261
4262 bool operator<(const ReadInfo& rhs) const
4263 {
4264 if (left != rhs.left)
4265 return left < rhs.left;
4266 if (right != rhs.right)
4267 return right < rhs.right;
4268
4269 return false;
4270 }
4271 };
4272
4273 void capture_island_ends(ReadTable& it,
4274 RefSequenceTable& rt,
4275 vector<string>& segmap_fnames,
4276 std::set<Junction, skip_count_lt>& cov_juncs,
4277 map<uint32_t, vector<bool> >& coverage_map,
4278 size_t half_splice_mer_len)
4279 {
4280 //static int island_repeat_tolerance = 10;
4281 vector<RefSeg> expected_look_left_windows;
4282 vector<RefSeg> expected_look_right_windows;
4283
4284 // daehwan
4285 //#define DEBUG_CHECK_EXONS 1
4286 //#define DEBUG_RANGE_ONLY 1
4287
4288 #ifndef DEBUG_CHECK_EXONS
4289 build_coverage_map(it, rt, segmap_fnames, coverage_map);
4290 #else
4291 //build coverage map here, so we can debug it
4292 #ifdef DEBUG_RANGE_ONLY
4293 static const uint32_t chr14_id = rt.get_id("chr14");
4294 #endif
4295 vector<ReadInfo> hits;
4296 BAMHitFactory hit_factory(it,rt);
4297
4298 for (size_t f = 0; f < seg_files.size(); ++f)
4299 {
4300 fprintf(stderr, "Adding hits from segment file %d to coverage map\n", (int)f);
4301 seg_files[f]->rewind();
4302 FILE* fp = seg_files[f]->file;
4303 //rewind(fp);
4304 HitStream hs(fp, &hit_factory, false, false, false);
4305
4306 HitsForRead hit_group;
4307 while (hs.next_read_hits(hit_group))
4308 {
4309 for (size_t h = 0; h < hit_group.hits.size(); ++h)
4310 {
4311 BowtieHit& bh = hit_group.hits[h];
4312 // daehwan
4313 //if (check_exons) <-- DEBUG_CHECK_EXONS
4314 #ifdef DEBUG_RANGE_ONLY
4315 if (bh.ref_id() != chr14_id)
4316 continue;
4317
4318 // if (bh.left() < 66567028 && bh.right() > 66604392)
4319 if (bh.left() < 66400000 || bh.right() > 66700000)
4320 continue;
4321
4322 ReadInfo read_info;
4323 read_info.read_id = bh.insert_id();
4324 read_info.left = bh.left();
4325 read_info.right = bh.right();
4326 hits.push_back(read_info);
4327 #endif
4328
4329 pair<map<uint32_t, vector<bool> >::iterator, bool> ret =
4330 coverage_map.insert(make_pair(bh.ref_id(), vector<bool>()));
4331 vector<bool>& ref_cov = ret.first->second;
4332
4333 size_t right_extent = bh.right();
4334
4335 if (right_extent >= ref_cov.size())
4336 {
4337 ref_cov.resize(right_extent + 1, 0);
4338 }
4339 for (uint32_t c = (uint32_t)bh.left(); c < (uint32_t)bh.right(); ++c)
4340 {
4341 ref_cov[c] = true;
4342 }
4343 }
4344 }
4345 }
4346
4347 sort(hits.begin(), hits.end());
4348 #endif
4349 // static const int min_cov_length = segment_length + 2;
4350 long covered_bases = 0;
4351 int long_enough_bases = 0;
4352 int left_looking = 0;
4353 int right_looking = 0;
4354 static const int extend = 45;
4355 static const int repeat_tol = 5;
4356
4357 int num_islands = 0;
4358
4359 for (map<uint32_t, vector<bool> >::iterator itr = coverage_map.begin();
4360 itr != coverage_map.end();
4361 ++itr)
4362 {
4363 #ifdef B_DEBUG
4364 fprintf (stderr, "Finding pairings in ref seq %s\n", rt.get_name(itr->first));
4365 #endif
4366 vector<bool>& cov = itr->second;
4367 vector<bool> long_enough(cov.size());
4368
4369 size_t last_uncovered = 0;
4370
4371 static const uint8_t min_cov = 1;
4372
4373 for (size_t c = 1; c < cov.size(); ++c)
4374 {
4375 uint8_t c_cov = cov[c]; //get_cov(cov, c);
4376 if (c_cov < min_cov || c == (cov.size()) - 1)
4377 {
4378 int putative_exon_length = (int)c - (int)last_uncovered;
4379 uint32_t last_pos_cov = cov[c - 1]; //get_cov(cov,c - 1);
4380 if (last_pos_cov >= min_cov && putative_exon_length >= min_cov_length)
4381 {
4382 #ifdef B_DEBUG
4383 fprintf(stderr, "cov. island: %d-%d\n", (int)(last_uncovered + 1), (int)c);
4384 fprintf(stderr, "\t(putative exon length = %d, min_cov_length=%d)\n",putative_exon_length, min_cov_length);
4385 #endif
4386 covered_bases += (c + 1 - last_uncovered);
4387 for (int l = (int)c; l > (int)last_uncovered; --l)
4388 {
4389 long_enough[l] = true;
4390 }
4391 }
4392 last_uncovered = c;
4393 }
4394 }
4395 vector<bool>& ref_cov = long_enough;
4396 vector<uint8_t> cov_state(ref_cov.size(), UNCOVERED);
4397
4398 // daehwan - print islands (exons)
4399 //if (check_exons)
4400 #ifdef DEBUG_CHECK_EXONS
4401 //{
4402 uint32_t left = 0, right = 0;
4403 for (size_t c = 1; c < ref_cov.size(); ++c)
4404 {
4405 if (ref_cov[c] && !ref_cov[c-1])
4406 {
4407 left = c;
4408 cout << "Exon: " << left << "-";
4409 }
4410 else if (!ref_cov[c] && ref_cov[c-1])
4411 {
4412 right = c - 1;
4413 cout << right << endl;
4414 for (size_t k = 0; k < hits.size(); ++k)
4415 {
4416 const ReadInfo& hit = hits[k];
4417 if (hit.right < left)
4418 continue;
4419
4420 if (hit.left > right)
4421 break;
4422
4423 cout << "\t" << hit.read_id << " " << hit.left << "-" << hit.right << endl;
4424 }
4425 }
4426 }
4427 //}
4428 #endif
4429 for (size_t c = 1; c < ref_cov.size(); ++c)
4430 {
4431 if (ref_cov[c])
4432 {
4433 long_enough_bases++;
4434 if (!ref_cov[c - 1])
4435 {
4436 num_islands += 1;
4437
4438 for (int r = (int)c - extend;
4439 r >= 0 && r < (int)c + repeat_tol && r < (int)cov_state.size();
4440 ++r)
4441 {
4442 cov_state[r] |= LOOK_LEFT;
4443 }
4444 }
4445 }
4446 else
4447 {
4448 if (ref_cov[c - 1])
4449 {
4450 for (int l = (int)c - repeat_tol;
4451 l >= 0 && l < (int)c + extend && l < (int)cov_state.size();
4452 ++l)
4453 {
4454 cov_state[l] |= LOOK_RIGHT;
4455 }
4456 }
4457 }
4458 }
4459
4460 RefSeg* curr_look_left = NULL;
4461 RefSeg* curr_look_right = NULL;
4462
4463 for (size_t c = 1; c < cov_state.size(); ++c)
4464 {
4465 if (cov_state[c] & LOOK_LEFT)
4466 {
4467 left_looking++;
4468 if (!(cov_state[c-1] & LOOK_LEFT))
4469 {
4470 expected_look_left_windows.push_back(RefSeg(itr->first,
4471 POINT_DIR_LEFT,
4472 false, /* not important */
4473 READ_DONTCARE,
4474 c,
4475 c + 1));
4476 curr_look_left = &(expected_look_left_windows.back());
4477 }
4478 else if (curr_look_left)
4479 {
4480 curr_look_left->right++;
4481 }
4482 }
4483 else
4484 {
4485 if ((cov_state[c-1] & LOOK_LEFT))
4486 {
4487 curr_look_left = NULL;
4488 }
4489 }
4490
4491 if (cov_state[c] & LOOK_RIGHT)
4492 {
4493 right_looking++;
4494 if (!(cov_state[c-1] & LOOK_RIGHT))
4495 {
4496 expected_look_right_windows.push_back(RefSeg(itr->first,
4497 POINT_DIR_RIGHT,
4498 false, /* not important */
4499 READ_DONTCARE,
4500 c,
4501 c + 1));
4502
4503 curr_look_right = &(expected_look_right_windows.back());
4504 }
4505 else if (curr_look_right)
4506 {
4507 curr_look_right->right++;
4508 }
4509 }
4510 else
4511 {
4512 if ((cov_state[c-1] & LOOK_RIGHT))
4513 {
4514 curr_look_right = NULL;
4515 }
4516 }
4517 }
4518 }
4519
4520 fprintf(stderr, " Map covers %ld bases\n", covered_bases);
4521 fprintf(stderr, " Map covers %d bases in sufficiently long segments\n", long_enough_bases);
4522 fprintf(stderr, " Map contains %d good islands\n", num_islands + 1);
4523 fprintf(stderr, " %d are left looking bases\n", left_looking);
4524 fprintf(stderr, " %d are right looking bases\n", right_looking);
4525
4526 vector<RefSeg> expected_don_acc_windows;
4527 expected_don_acc_windows.insert(expected_don_acc_windows.end(),
4528 expected_look_right_windows.begin(),
4529 expected_look_right_windows.end());
4530
4531 expected_don_acc_windows.insert(expected_don_acc_windows.end(),
4532 expected_look_left_windows.begin(),
4533 expected_look_left_windows.end());
4534
4535 if (!butterfly_search) coverage_map.clear(); //free some memory
4536 juncs_from_ref_segs<RecordExtendableJuncs>(rt,
4537 expected_don_acc_windows,
4538 cov_juncs,
4539 "GT",
4540 "AG",
4541 max_coverage_intron_length,
4542 min_coverage_intron_length,
4543 max_cov_juncs,
4544 true,
4545 half_splice_mer_len);
4546 //fprintf(stderr, "Found %ld potential island-end pairing junctions\n", (long int)cov_juncs.size());
4547 }
4548
4549 void print_juncs(RefSequenceTable& rt, std::set<Junction, skip_count_lt>& juncs, const char* str)
4550 {
4551 fprintf (stderr, "-- %s --\n", str);
4552 for(std::set<Junction>::iterator itr = juncs.begin();
4553 itr != juncs.end();
4554 ++itr)
4555 {
4556 const char* ref_name = rt.get_name(itr->refid);
4557
4558 fprintf(stderr, "%s\t%d\t%d\t%c\n",
4559 ref_name,
4560 itr->left,
4561 itr->right,
4562 itr->antisense ? '-' : '+');
4563 }
4564 fprintf (stderr, "-- done --\n");
4565 }
4566
4567 struct SegmentSearchWorker
4568 {
4569 void operator()()
4570 {
4571 ReadTable it;
4572
4573 ReadStream readstream(reads_fname);
4574 ReadStream readstream_for_segment_search(reads_fname);
4575 ReadStream readstream_for_indel_discovery(reads_fname);
4576 ReadStream readstream_for_fusion_discovery(reads_fname);
4577
4578 if (readstream.file() == NULL ||
4579 readstream_for_segment_search.file() == NULL ||
4580 readstream_for_indel_discovery.file() == NULL ||
4581 readstream_for_fusion_discovery.file() == NULL)
4582 {
4583 fprintf(stderr, "Error: cannot open %s for reading\n",
4584 reads_fname.c_str());
4585 exit(1);
4586 }
4587
4588 if (read_offset > 0)
4589 {
4590 readstream.seek(read_offset);
4591 readstream_for_segment_search.seek(read_offset);
4592 readstream_for_indel_discovery.seek(read_offset);
4593 readstream_for_fusion_discovery.seek(read_offset);
4594 }
4595
4596 vector<BAMHitFactory*> hit_factories;
4597 hit_factories.push_back(new BAMHitFactory(it, *rt));
4598 HitStream partner_hit_stream_for_segment_search(partner_reads_map_fname, hit_factories.back(), false, false, false);
4599 hit_factories.push_back(new BAMHitFactory(it, *rt));
4600 HitStream partner_hit_stream_for_fusion_discovery(partner_reads_map_fname, hit_factories.back(), false, false, false);
4601 if (partner_hit_offset > 0)
4602 {
4603 partner_hit_stream_for_segment_search.seek(partner_hit_offset);
4604 partner_hit_stream_for_fusion_discovery.seek(partner_hit_offset);
4605 }
4606
4607 hit_factories.push_back(new BAMHitFactory(it, *rt));
4608 HitStream seg_partner_hit_stream_for_segment_search(seg_partner_reads_map_fname, hit_factories.back(), false, false, false);
4609 hit_factories.push_back(new BAMHitFactory(it, *rt));
4610 HitStream seg_partner_hit_stream_for_fusion_discovery(seg_partner_reads_map_fname, hit_factories.back(), false, false, false);
4611 if (seg_partner_hit_offset > 0)
4612 {
4613 seg_partner_hit_stream_for_segment_search.seek(seg_partner_hit_offset);
4614 seg_partner_hit_stream_for_fusion_discovery.seek(seg_partner_hit_offset);
4615 }
4616
4617 vector<HitStream> hit_streams;
4618 for (size_t i = 0; i < segmap_fnames->size(); ++i)
4619 {
4620 hit_factories.push_back(new BAMHitFactory(it, *rt));
4621 HitStream hs((*segmap_fnames)[i], hit_factories.back(), false, false, false);
4622 if (seg_offsets[i] > 0)
4623 hs.seek(seg_offsets[i]);
4624
4625 hit_streams.push_back(hs);
4626 }
4627
4628 int num_group = 0;
4629 uint64_t read_id = 0, last_read_id = 0;
4630 while ((read_id = process_next_hit_group(*rt,
4631 readstream,
4632 readstream_for_segment_search,
4633 readstream_for_indel_discovery,
4634 readstream_for_fusion_discovery,
4635 it,
4636 hit_streams,
4637 hit_streams.size() - 1,
4638 partner_hit_stream_for_segment_search,
4639 seg_partner_hit_stream_for_segment_search,
4640 partner_hit_stream_for_fusion_discovery,
4641 seg_partner_hit_stream_for_fusion_discovery,
4642 *juncs, *deletions, *insertions, *fusions,
4643 read,
4644 begin_id, end_id)) != 0)
4645 {
4646 num_group++;
4647 //if (num_group % 500000 == 0)
4648 // fprintf(stderr, "\tProcessed %d root segment groups\n", num_group);
4649
4650 last_read_id = read_id;
4651 }
4652
4653 // "microaligned_segs" is not protected against multi-threading
4654 // fprintf(stderr, "Microaligned %d segments\n", microaligned_segs);
4655
4656 for (size_t i = 0; i < hit_factories.size(); ++i)
4657 delete hit_factories[i];
4658
4659 hit_factories.clear();
4660 }
4661
4662 RefSequenceTable* rt;
4663 string reads_fname;
4664 vector<string>* segmap_fnames;
4665 string partner_reads_map_fname;
4666 string seg_partner_reads_map_fname;
4667 std::set<Junction, skip_count_lt>* juncs;
4668 std::set<Deletion>* deletions;
4669 std::set<Insertion>* insertions;
4670 FusionSimpleSet* fusions;
4671 eREAD read;
4672
4673 uint64_t begin_id;
4674 uint64_t end_id;
4675 int64_t read_offset;
4676 vector<int64_t> seg_offsets;
4677
4678 int64_t partner_hit_offset;
4679 int64_t seg_partner_hit_offset;
4680 };
4681
4682 struct SpliceJunctionCoord
4683 {
4684 uint32_t refid;
4685 int coord;
4686
4687 SpliceJunctionCoord(uint32_t r, int c) :
4688 refid(r),
4689 coord(c)
4690 {}
4691
4692 bool operator< (const SpliceJunctionCoord& r) const
4693 {
4694 if (refid < r.refid)
4695 return true;
4696 else if (refid == r.refid && coord < r.coord)
4697 return true;
4698 else
4699 return false;
4700 }
4701 };
4702
4703 void driver(istream& ref_stream,
4704 FILE* juncs_out,
4705 FILE* insertions_out,
4706 FILE* deletions_out,
4707 FILE* fusions_out,
4708 string& left_reads_fname,
4709 string& left_reads_map_fname,
4710 vector<string>& left_segmap_fnames,
4711 string& right_reads_fname,
4712 string& right_reads_map_fname,
4713 vector<string>& right_segmap_fnames)
4714 {
4715 if (!parallel)
4716 num_threads = 1;
4717
4718 // turn off parallelization in case of the following search methods
4719 if (!no_coverage_search || !no_microexon_search || butterfly_search)
4720 num_threads = 1;
4721
4722 assert (num_threads > 0);
4723 if (left_segmap_fnames.size() == 0)
4724 {
4725 fprintf(stderr, "No hits to process, exiting\n");
4726 exit(0);
4727 }
4728
4729 std::set<Junction, skip_count_lt> vseg_juncs[num_threads];
4730 std::set<Junction, skip_count_lt> cov_juncs;
4731 std::set<Junction, skip_count_lt> butterfly_juncs;
4732
4733 std::set<Junction> juncs;
4734
4735 std::set<Deletion> vdeletions[num_threads];
4736 std::set<Insertion> vinsertions[num_threads];
4737 FusionSimpleSet vfusions[num_threads];
4738
4739 RefSequenceTable rt(sam_header, true);
4740
4741 fprintf (stderr, "Loading reference sequences...\n");
4742 get_seqs(ref_stream, rt, true);
4743
4744 string left_seg_fname_for_segment_search = left_segmap_fnames.back();
4745 string right_seg_fname_for_segment_search;
4746 if (right_segmap_fnames.size() > 0)
4747 right_seg_fname_for_segment_search = right_segmap_fnames.back();
4748
4749 fprintf(stderr, ">> Performing segment-search:\n");
4750
4751 if (left_segmap_fnames.size() > 1)
4752 {
4753 fprintf( stderr, "Loading left segment hits...\n");
4754
4755 vector<uint64_t> read_ids;
4756 vector<vector<int64_t> > offsets;
4757 vector<int64_t> partner_offsets;
4758 vector<int64_t> seg_partner_offsets;
4759 if (num_threads > 1)
4760 {
4761 vector<string> fnames;
4762 fnames.push_back(left_reads_fname);
4763 fnames.insert(fnames.end(), left_segmap_fnames.begin(), left_segmap_fnames.end());
4764 bool enough_data = calculate_offsets(fnames, read_ids, offsets);
4765 if (!enough_data)
4766 num_threads = 1;
4767
4768 if (enough_data && right_reads_map_fname != "")
4769 calculate_offsets_from_ids(right_reads_map_fname, read_ids, partner_offsets);
4770
4771 if (enough_data && right_seg_fname_for_segment_search != "")
4772 calculate_offsets_from_ids(right_seg_fname_for_segment_search, read_ids, seg_partner_offsets);
4773 }
4774
4775 vector<boost::thread*> threads;
4776 for (int i = 0; i < num_threads; ++i)
4777 {
4778 SegmentSearchWorker worker;
4779 worker.rt = &rt;
4780 worker.reads_fname = left_reads_fname;
4781 worker.segmap_fnames = &left_segmap_fnames;
4782 worker.partner_reads_map_fname = right_reads_map_fname;
4783 worker.seg_partner_reads_map_fname = right_seg_fname_for_segment_search;
4784 worker.juncs = &vseg_juncs[i];
4785 worker.deletions = &vdeletions[i];
4786 worker.insertions = &vinsertions[i];
4787 worker.fusions = &vfusions[i];
4788 worker.read = READ_LEFT;
4789 worker.partner_hit_offset = 0;
4790 worker.seg_partner_hit_offset = 0;
4791
4792 if (i == 0)
4793 {
4794 worker.begin_id = 0;
4795 worker.seg_offsets = vector<int64_t>(left_segmap_fnames.size(), 0);
4796 worker.read_offset = 0;
4797 }
4798 else
4799 {
4800 worker.begin_id = read_ids[i-1];
4801 worker.seg_offsets.insert(worker.seg_offsets.end(), offsets[i-1].begin()+1, offsets[i-1].end());
4802 worker.read_offset = offsets[i-1][0];
4803 if (partner_offsets.size() > 0)
4804 worker.partner_hit_offset = partner_offsets[i-1];
4805 if (seg_partner_offsets.size() > 0)
4806 worker.seg_partner_hit_offset = seg_partner_offsets[i-1];
4807 }
4808
4809 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
4810
4811 if (num_threads > 1)
4812 threads.push_back(new boost::thread(worker));
4813 else
4814 worker();
4815 }
4816
4817 for (size_t i = 0; i < threads.size(); ++i)
4818 {
4819 threads[i]->join();
4820 delete threads[i];
4821 threads[i] = NULL;
4822 }
4823 threads.clear();
4824
4825 fprintf( stderr, "done.\n");
4826 }
4827
4828 if (right_segmap_fnames.size() > 1)
4829 {
4830 fprintf( stderr, "Loading right segment hits...\n");
4831
4832 vector<uint64_t> read_ids;
4833 vector<vector<int64_t> > offsets;
4834 vector<int64_t> partner_offsets;
4835 vector<int64_t> seg_partner_offsets;
4836 if (num_threads > 1)
4837 {
4838 vector<string> fnames;
4839 fnames.push_back(right_reads_fname);
4840 fnames.insert(fnames.end(), right_segmap_fnames.begin(), right_segmap_fnames.end());
4841 bool enough_data = calculate_offsets(fnames, read_ids, offsets);
4842 if (!enough_data)
4843 num_threads = 1;
4844
4845 if (enough_data)
4846 calculate_offsets_from_ids(left_reads_map_fname, read_ids, partner_offsets);
4847
4848 if (enough_data)
4849 calculate_offsets_from_ids(left_seg_fname_for_segment_search, read_ids, seg_partner_offsets);
4850 }
4851
4852 vector<boost::thread*> threads;
4853 for (int i = 0; i < num_threads; ++i)
4854 {
4855 SegmentSearchWorker worker;
4856 worker.rt = &rt;
4857 worker.reads_fname = right_reads_fname;
4858 worker.segmap_fnames = &right_segmap_fnames;
4859 worker.partner_reads_map_fname = left_reads_map_fname;
4860 worker.seg_partner_reads_map_fname = left_seg_fname_for_segment_search;
4861 worker.juncs = &vseg_juncs[i];
4862 worker.deletions = &vdeletions[i];
4863 worker.insertions = &vinsertions[i];
4864 worker.fusions = &vfusions[i];
4865 worker.read = READ_RIGHT;
4866 worker.partner_hit_offset = 0;
4867 worker.seg_partner_hit_offset = 0;
4868
4869 if (i == 0)
4870 {
4871 worker.begin_id = 0;
4872 worker.seg_offsets = vector<int64_t>(left_segmap_fnames.size(), 0);
4873 worker.read_offset = 0;
4874 }
4875 else
4876 {
4877 worker.begin_id = read_ids[i-1];
4878 worker.seg_offsets.insert(worker.seg_offsets.end(), offsets[i-1].begin() + 1, offsets[i-1].end());
4879 worker.read_offset = offsets[i-1][0];
4880 if (partner_offsets.size() > 0)
4881 worker.partner_hit_offset = partner_offsets[i-1];
4882 if (seg_partner_offsets.size() > 0)
4883 worker.seg_partner_hit_offset = seg_partner_offsets[i-1];
4884 }
4885
4886 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
4887
4888 if (num_threads > 1)
4889 threads.push_back(new boost::thread(worker));
4890 else
4891 worker();
4892 }
4893
4894 for (size_t i = 0; i < threads.size(); ++i)
4895 {
4896 threads[i]->join();
4897 delete threads[i];
4898 threads[i] = NULL;
4899 }
4900 threads.clear();
4901 fprintf( stderr, "done.\n");
4902 }
4903
4904 std::set<Junction, skip_count_lt> seg_juncs;
4905 std::set<Deletion> deletions;
4906 std::set<Insertion> insertions;
4907
4908 for (int i = 0; i < num_threads; ++i)
4909 {
4910 seg_juncs.insert(vseg_juncs[i].begin(), vseg_juncs[i].end());
4911 deletions.insert(vdeletions[i].begin(), vdeletions[i].end());
4912 insertions.insert(vinsertions[i].begin(), vinsertions[i].end());
4913 }
4914
4915 FusionSimpleSet fusions = vfusions[0];
4916 for (int i = 1; i < num_threads; ++i)
4917 {
4918 merge_with(fusions, vfusions[i]);
4919 }
4920
4921
4922 fprintf(stderr, "\tfound %ld potential split-segment junctions\n", (long int)seg_juncs.size());
4923 fprintf(stderr, "\tfound %ld potential small deletions\n", (long int)deletions.size());
4924 fprintf(stderr, "\tfound %ld potential small insertions\n", (long int)insertions.size());
4925
4926 vector<string> all_segmap_fnames;
4927 for (vector<string>::size_type i = 0; i != left_segmap_fnames.size();i++) {
4928 all_segmap_fnames.push_back( left_segmap_fnames[i] );
4929 }
4930 for (vector<string>::size_type i = 0; i != right_segmap_fnames.size();i++) {
4931 all_segmap_fnames.push_back( right_segmap_fnames[i] );
4932 }
4933
4934 #if 0
4935 // daehwan - check this out as Cole insists on using segments gives better results.
4936 vector<FZPipe*> all_map_files;
4937 if (left_seg_files.size() > 1)
4938 {
4939 all_map_files.push_back(&left_reads_map_file);
4940 }
4941
4942 if (right_seg_files.size() > 1)
4943 {
4944 all_map_files.push_back(&right_reads_map_file);
4945 }
4946
4947 copy(all_seg_files.begin(), all_seg_files.end(), back_inserter(all_map_files));
4948 #endif
4949
4950 ReadTable it;
4951 map<uint32_t, vector<bool> > coverage_map;
4952 if (!no_coverage_search || butterfly_search)
4953 {
4954 if (ium_reads != "")
4955 {
4956 vector<string> ium_read_files;
4957 tokenize(ium_reads,",", ium_read_files);
4958 vector<FZPipe> iums;
4959 string unzcmd=getUnpackCmd(ium_read_files[0],false); //could be BAM file
4960 for (size_t ium = 0; ium < ium_read_files.size(); ++ium)
4961 {
4962 //fprintf (stderr, "Indexing extensions in %s\n", ium_read_files[ium].c_str());
4963 FZPipe ium_file(ium_read_files[ium],unzcmd);
4964 if (ium_file.file==NULL)
4965 {
4966 fprintf (stderr, "Can't open file %s for reading, skipping...\n",ium_read_files[ium].c_str());
4967 continue;
4968 }
4969 iums.push_back(ium_file);
4970 }
4971
4972 index_read_mers(iums, 5);
4973 }
4974 else
4975 { //no unmapped reads
4976 no_coverage_search = true;
4977 butterfly_search = false;
4978 }
4979
4980 if (!no_coverage_search)
4981 { //coverage search
4982 // looking for junctions by island end pairings
4983 fprintf(stderr, ">> Performing coverage-search:\n");
4984 capture_island_ends(it,
4985 rt,
4986 all_segmap_fnames,
4987 cov_juncs,
4988 coverage_map,
4989 5);
4990 fprintf(stderr, "\tfound %d potential junctions\n",(int)cov_juncs.size());
4991 }
4992 } //coverage search or butterfly search
4993
4994 if (butterfly_search)
4995 {
4996 //looking for junctions between and within islands
4997 fprintf(stderr, ">> Performing butterfly-search: \n");
4998 prune_extension_table(butterfly_overhang);
4999 compact_extension_table();
5000 pair_covered_sites(it,
5001 rt,
5002 all_segmap_fnames,
5003 butterfly_juncs,
5004 coverage_map,
5005 5);
5006
5007 fprintf(stderr, "\tfound %d potential junctions\n",(int)butterfly_juncs.size());
5008 }
5009
5010 coverage_map.clear();
5011
5012 std::set<Junction, skip_count_lt> microexon_juncs;
5013 if (!no_microexon_search)
5014 {
5015 fprintf(stderr, ">> Performing microexon-search: \n");
5016 std::set<Junction, skip_count_lt> microexon_juncs;
5017 align_microexon_segs(rt,
5018 microexon_juncs,
5019 max_cov_juncs,
5020 5);
5021 fprintf(stderr, "\tfound %d potential junctions\n",(int)microexon_juncs.size());
5022 juncs.insert(microexon_juncs.begin(), microexon_juncs.end());
5023 }
5024
5025 juncs.insert(cov_juncs.begin(), cov_juncs.end());
5026 juncs.insert(seg_juncs.begin(), seg_juncs.end());
5027 juncs.insert(butterfly_juncs.begin(), butterfly_juncs.end());
5028
5029 //fprintf(stderr, "Reporting potential splice junctions...");
5030 vector<SpliceJunctionCoord> splice_junction_coords;
5031 for(std::set<Junction>::iterator itr = juncs.begin();
5032 itr != juncs.end();
5033 ++itr)
5034 {
5035 const char* ref_name = rt.get_name(itr->refid);
5036
5037 fprintf(juncs_out,
5038 "%s\t%d\t%d\t%c\n",
5039 ref_name,
5040 itr->left,
5041 itr->right,
5042 itr->antisense ? '-' : '+');
5043
5044 if (fusion_search)
5045 {
5046 splice_junction_coords.push_back(SpliceJunctionCoord(itr->refid, itr->left));
5047 splice_junction_coords.push_back(SpliceJunctionCoord(itr->refid, itr->right));
5048 }
5049 }
5050 //close all reading pipes, just to exit cleanly
5051 /*
5052 for (vector<FZPipe*>::size_type i = 0; i != all_segmap_fnames.size();i++) {
5053 all_segmap_fnames[i]->close();
5054 }
5055 */
5056 fprintf(stderr, "Reported %d total potential splices\n", (int)juncs.size());
5057 sort(splice_junction_coords.begin(), splice_junction_coords.end());
5058
5059 fprintf(stderr, "Reporting %u potential deletions...\n", deletions.size());
5060 if(deletions_out){
5061 for(std::set<Deletion>::iterator itr = deletions.begin(); itr != deletions.end(); ++itr){
5062 const char* ref_name = rt.get_name(itr->refid);
5063 /*
5064 * We fix up the left co-ordinate to reference the first deleted base
5065 */
5066 fprintf(deletions_out,
5067 "%s\t%d\t%d\n",
5068 ref_name,
5069 itr->left + 1,
5070 itr->right);
5071 }
5072 fclose(deletions_out);
5073 }else{
5074 fprintf(stderr, "Failed to open deletions file for writing, no deletions reported\n");
5075 }
5076
5077 fprintf(stderr, "Reporting %u potential insertions...\n", insertions.size());
5078 if(insertions_out){
5079 for(std::set<Insertion>::iterator itr = insertions.begin(); itr != insertions.end(); ++itr){
5080 const char* ref_name = rt.get_name(itr->refid);
5081 fprintf(insertions_out,
5082 "%s\t%d\t%d\t%s\n",
5083 ref_name,
5084 itr->left,
5085 itr->left,
5086 itr->sequence.c_str());
5087 }
5088 fclose(insertions_out);
5089 }else{
5090 fprintf(stderr, "Failed to open insertions file for writing, no insertions reported\n");
5091 }
5092 if (fusions_out)
5093 {
5094 // check if a fusion point coincides with splice junctions.
5095 for(FusionSimpleSet::iterator itr = fusions.begin(); itr != fusions.end(); ++itr)
5096 {
5097 const Fusion& fusion = itr->first;
5098 FusionSimpleStat& fusion_stat = itr->second;
5099
5100 bool found = binary_search(splice_junction_coords.begin(),
5101 splice_junction_coords.end(),
5102 SpliceJunctionCoord(fusion.refid1, fusion.left));
5103 if (found)
5104 fusion_stat.left_coincide_with_splice_junction = true;
5105
5106 found = binary_search(splice_junction_coords.begin(),
5107 splice_junction_coords.end(),
5108 SpliceJunctionCoord(fusion.refid2, fusion.right));
5109
5110 if (found)
5111 fusion_stat.right_coincide_with_splice_junction = true;
5112 }
5113
5114 for(FusionSimpleSet::iterator itr = fusions.begin(); itr != fusions.end(); ++itr)
5115 {
5116 const Fusion& fusion = itr->first;
5117 const FusionSimpleStat& fusion_stat = itr->second;
5118
5119 // compare the current fusion with the next fusion, pick up the better one.
5120 FusionSimpleSet::iterator next_itr = itr; ++next_itr;
5121 while (next_itr != fusions.end())
5122 {
5123 const Fusion& next_fusion = next_itr->first;
5124 const FusionSimpleStat& next_fusion_stat = next_itr->second;
5125
5126 uint32_t left_diff = abs((int)fusion.left - (int)next_fusion.left);
5127 if (fusion.refid1 == next_fusion.refid1 && fusion.refid2 == next_fusion.refid2 && left_diff < 10)
5128 {
5129 if (fusion.dir == next_fusion.dir && left_diff == abs((int)fusion.right - (int)next_fusion.right))
5130 {
5131 if (next_fusion_stat.count > fusion_stat.count)
5132 itr->second.skip = true;
5133 else if (next_fusion_stat.count == fusion_stat.count)
5134 {
5135 int curr_count = (int)fusion_stat.left_coincide_with_splice_junction + (int)fusion_stat.right_coincide_with_splice_junction;
5136 int next_count = (int)next_fusion_stat.left_coincide_with_splice_junction + (int)next_fusion_stat.right_coincide_with_splice_junction;
5137
5138 if (curr_count < next_count)
5139 itr->second.skip = true;
5140 else
5141 next_itr->second.skip = true;
5142 }
5143 else
5144 next_itr->second.skip = true;
5145 }
5146
5147 ++next_itr;
5148 }
5149 else
5150 break;
5151 }
5152
5153 if (itr->second.skip)
5154 continue;
5155
5156 const char* ref_name1 = rt.get_name(fusion.refid1);
5157 const char* ref_name2 = rt.get_name(fusion.refid2);
5158
5159 const char* dir = "";
5160 if (fusion.dir == FUSION_FR)
5161 dir = "fr";
5162 else if(fusion.dir == FUSION_RF)
5163 dir = "rf";
5164 else if(fusion.dir == FUSION_RR)
5165 dir = "rr";
5166 else
5167 dir = "ff";
5168
5169 fprintf(fusions_out,
5170 "%s\t%d\t%s\t%d\t%s\n",
5171 ref_name1,
5172 fusion.left,
5173 ref_name2,
5174 fusion.right,
5175 dir);
5176 }
5177 fclose(fusions_out);
5178 }
5179 fprintf(stderr, "Reporting potential fusions...\n");
5180 }
5181
5182 int main(int argc, char** argv)
5183 {
5184 fprintf(stderr, "segment_juncs v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
5185 fprintf(stderr, "---------------------------\n");
5186
5187 int parse_ret = parse_options(argc, argv, print_usage);
5188 if (parse_ret)
5189 return parse_ret;
5190
5191 if(optind >= argc)
5192 {
5193 print_usage();
5194 return 1;
5195 }
5196
5197 string ref_file_name = argv[optind++];
5198
5199 if(optind >= argc)
5200 {
5201 print_usage();
5202 return 1;
5203 }
5204
5205 string juncs_file_name = argv[optind++];
5206
5207 if(optind >= argc)
5208 {
5209 print_usage();
5210 return 1;
5211 }
5212
5213 string insertions_file_name = argv[optind++];
5214 if(optind >= argc)
5215 {
5216 print_usage();
5217 return 1;
5218 }
5219
5220 string deletions_file_name = argv[optind++];
5221 if(optind >= argc)
5222 {
5223 print_usage();
5224 return 1;
5225 }
5226
5227 string fusions_file_name = argv[optind++];
5228 if(optind >= argc)
5229 {
5230 print_usage();
5231 return 1;
5232 }
5233
5234 string left_reads_file_name = argv[optind++];
5235
5236 if(optind >= argc)
5237 {
5238 print_usage();
5239 return 1;
5240 }
5241
5242 string left_reads_map_file_name = argv[optind++];
5243
5244 if(optind >= argc)
5245 {
5246 print_usage();
5247 return 1;
5248 }
5249
5250 string left_segment_map_file_list = argv[optind++];
5251
5252 string right_segment_map_file_list;
5253 string right_reads_file_name;
5254 string right_reads_map_file_name;
5255 if (optind < argc)
5256 {
5257 right_reads_file_name = argv[optind++];
5258
5259 if(optind >= argc)
5260 {
5261 print_usage();
5262 return 1;
5263 }
5264
5265 right_reads_map_file_name = argv[optind++];
5266
5267 if(optind >= argc)
5268 {
5269 print_usage();
5270 return 1;
5271 }
5272 right_segment_map_file_list = argv[optind++];
5273 }
5274
5275 // Open the approppriate files
5276
5277 ifstream ref_stream(ref_file_name.c_str(), ifstream::in);
5278 if (!ref_stream.good())
5279 {
5280 fprintf(stderr, "Error: cannot open %s for reading\n",
5281 ref_file_name.c_str());
5282 exit(1);
5283 }
5284
5285
5286 FILE* juncs_file = fopen(juncs_file_name.c_str(), "w");
5287 if (!juncs_file)
5288 {
5289 fprintf(stderr, "Error: cannot open %s for writing\n",
5290 juncs_file_name.c_str());
5291 exit(1);
5292 }
5293
5294 FILE* insertions_file = fopen(insertions_file_name.c_str(), "w");
5295 if (!insertions_file)
5296 {
5297 fprintf(stderr, "Error: cannot open %s for writing\n",
5298 insertions_file_name.c_str());
5299 exit(1);
5300 }
5301
5302
5303 FILE* deletions_file = fopen(deletions_file_name.c_str(), "w");
5304 if (!deletions_file)
5305 {
5306 fprintf(stderr, "Error: cannot open %s for writing\n",
5307 deletions_file_name.c_str());
5308 exit(1);
5309 }
5310
5311 vector<string> left_segment_map_fnames;
5312 string left_segment_file_for_segment_search;
5313 tokenize(left_segment_map_file_list, ",",left_segment_map_fnames);
5314
5315 FILE* fusions_file = fopen(fusions_file_name.c_str(), "w");
5316 if (!fusions_file)
5317 {
5318 fprintf(stderr, "Error: cannot open %s for writing\n",
5319 fusions_file_name.c_str());
5320 exit(1);
5321 }
5322
5323 //FILE* left_reads_file = fopen(left_reads_file_name.c_str(), "r");
5324 //FILE* left_reads_file_for_indel_discovery = fopen(left_reads_file_name.c_str(),"r");
5325 string unzcmd=getUnpackCmd(left_reads_file_name, false);
5326 FZPipe left_reads_file(left_reads_file_name, unzcmd);
5327 FZPipe left_reads_file_for_segment_search(left_reads_file_name, unzcmd);
5328 FZPipe left_reads_file_for_indel_discovery(left_reads_file_name, unzcmd);
5329 FILE* left_reads_file_for_fusion_discovery = fopen(left_reads_file_name.c_str(),"r");
5330 if (left_reads_file.file==NULL || left_reads_file_for_segment_search.file==NULL ||
5331 left_reads_file_for_indel_discovery.file==NULL || left_reads_file_for_fusion_discovery==NULL)
5332 {
5333 fprintf(stderr, "Error: cannot open %s for reading\n",
5334 left_reads_file_name.c_str());
5335 exit(1);
5336 }
5337
5338 vector<string> right_segment_map_fnames;
5339 string right_segment_file_for_segment_search;
5340 if (right_segment_map_file_list != "")
5341 {
5342 tokenize(right_segment_map_file_list, ",", right_segment_map_fnames);
5343 }
5344
5345 // min_cov_length=20;
5346 if (min_cov_length>segment_length-2) min_cov_length=segment_length-2;
5347 driver(ref_stream,
5348 juncs_file,
5349 insertions_file,
5350 deletions_file,
5351 fusions_file,
5352 left_reads_file_name,
5353 left_reads_map_file_name,
5354 left_segment_map_fnames,
5355 right_reads_file_name,
5356 right_reads_map_file_name,
5357 right_segment_map_fnames);
5358
5359 return 0;
5360 }