ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/segment_juncs.cpp
Revision: 159
Committed: Mon Jan 30 22:38:08 2012 UTC (7 years, 8 months ago) by gpertea
File size: 157822 byte(s)
Log Message:
wip

Line File contents
1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #endif
4
5 /*
6 * segment_juncs.cpp
7 * TopHat
8 *
9 * Created by Cole Trapnell on 2/5/09.
10 * Copyright 2009 Cole Trapnell. All rights reserved.
11 *
12 */
13
14 #include <cassert>
15 #include <cstdio>
16 #include <vector>
17 #include <string>
18 #include <map>
19 #include <algorithm>
20 #include <set>
21 #include <iostream>
22 #include <fstream>
23 #include <sstream>
24 #include <cstring>
25 #include <bitset>
26 #include <seqan/sequence.h>
27 #include <seqan/find.h>
28 #include <seqan/file.h>
29 #include <seqan/modifier.h>
30 #include <getopt.h>
31
32 #include <boost/thread.hpp>
33
34 #include "common.h"
35 #include "utils.h"
36 #include "bwt_map.h"
37 #include "tokenize.h"
38 #include "segments.h"
39 #include "reads.h"
40 #include "junctions.h"
41 #include "insertions.h"
42 #include "deletions.h"
43 #include "fusions.h"
44
45 using namespace seqan;
46 using namespace std;
47 using namespace __gnu_cxx;
48
49 // daehwan //geo
50 //#define B_DEBUG 1
51
52 void print_usage()
53 {
54 fprintf(stderr, "Usage: segment_juncs <ref.fa> <segment.juncs> <segment.insertions> <segment.deletions> <left_reads.fq> <left_reads.bwtout> <left_seg1.bwtout,...,segN.bwtout> [right_reads.fq right_reads.bwtout right_seg1.bwtout,...,right_segN.bwtout]\n");
55 }
56
57 // This is the maximum number of bowtie mismatches allower per segment hit
58 static const int num_bowtie_mismatches = 2;
59 static const int max_cov_juncs = 5000000;
60 // static const int max_cov_juncs = std::numeric_limits<int>::max();
61 static const int max_seg_juncs = 10000000;
62
63 int max_microexon_stretch = 2000;
64 int butterfly_overhang = 6;
65 int min_cov_length = 20;
66
67 void get_seqs(istream& ref_stream,
68 RefSequenceTable& rt,
69 bool keep_seqs = true)
70 {
71 while(ref_stream.good() &&
72 !ref_stream.eof())
73 {
74 RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
75 string name;
76 readMeta(ref_stream, name, Fasta());
77 string::size_type space_pos = name.find_first_of(" \t\r");
78 if (space_pos != string::npos)
79 {
80 name.resize(space_pos);
81 }
82 fprintf(stderr, "\tLoading %s...", name.c_str());
83 seqan::read(ref_stream, *ref_str, Fasta());
84 fprintf(stderr, "done\n");
85 rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
86 if (!keep_seqs)
87 delete ref_str;
88 }
89 }
90
91 RefSeg seg_from_bowtie_hit(const BowtieHit& T)
92 {
93 RefSeg r_seg(T.ref_id(), POINT_DIR_DONTCARE, T.antisense_align(), READ_DONTCARE, 0, 0);
94
95 if (T.antisense_align())
96 {
97 r_seg.left = max(0, T.right() - 2);
98 r_seg.right = T.right() + (T.right() - T.left() + 1); // num allowed bowtie mismatches
99 r_seg.points_where = POINT_DIR_RIGHT;
100 }
101 else
102 {
103 r_seg.left = max(0, T.left() - (T.right() - T.left() + 1));
104 r_seg.right = T.left() + 2; // num allowed bowtie mismatches
105 r_seg.points_where = POINT_DIR_LEFT;
106 }
107
108 return r_seg;
109 }
110
111 pair<RefSeg, RefSeg> segs_from_bowtie_hits(const BowtieHit& T,
112 const BowtieHit& H)
113 {
114 pair<RefSeg, RefSeg> seg_pair;
115 if (H.antisense_align() == false &&
116 abs((H.right() + 1) - T.left()) < (int)max_segment_intron_length)
117 {
118 RefSeg left_seg(H.ref_id(), POINT_DIR_RIGHT, H.antisense_align(), READ_DONTCARE, 0, 0);
119 left_seg.left = max(0, H.right() - 2);
120 left_seg.right = H.right() + (H.right() - H.left() + 1); // num allowed bowtie mismatches
121
122 RefSeg right_seg(T.ref_id(), POINT_DIR_LEFT, T.antisense_align(), READ_DONTCARE, 0, 0);
123 right_seg.left = max(0, T.left() - (T.right() - T.left() + 1));
124 right_seg.right = T.left() + 2; // num allowed bowtie mismatches
125
126 seg_pair = make_pair(left_seg, right_seg);
127 }
128 else if (H.antisense_align() == true &&
129 abs((T.right() + 1) - H.left()) < (int)max_segment_intron_length)
130 {
131 RefSeg left_seg(T.ref_id(), POINT_DIR_RIGHT, T.antisense_align(), READ_DONTCARE, 0, 0);
132 left_seg.left = max(0, T.right() - 2);
133 left_seg.right = T.right() + (T.right() - T.left() + 1); // num allowed bowtie mismatches
134
135 RefSeg right_seg(H.ref_id(), POINT_DIR_LEFT, H.antisense_align(), READ_DONTCARE, 0, 0);
136 right_seg.left = max(0, H.left() - (H.right() - H.left() + 1));
137 right_seg.right = H.left() + 2; // num allowed bowtie mismatches
138
139 seg_pair = make_pair(left_seg, right_seg);
140 }
141 return seg_pair;
142 }
143
144 //static const size_t half_splice_mer_len = 6;
145 //static const size_t splice_mer_len = 2 * half_splice_mer_len;
146
147 struct MerExtension
148 {
149 static const int MAX_EXTENSION_BP = 14;
150 uint32_t left_dna_str : 28; // up to 14bp encoded in 2-bits-per-base
151 uint8_t left_ext_len : 4; // how many bases in this extension
152
153 uint32_t right_dna_str : 28; // up to 14bp encoded in 2-bits-per-base
154 uint8_t right_ext_len : 4; // how many bases in this extension
155
156 MerExtension() : left_dna_str(0), left_ext_len(0), right_dna_str(0), right_ext_len(0) {}
157
158 bool operator<(const MerExtension& rhs) const
159 {
160 if (left_dna_str != rhs.left_dna_str)
161 return left_dna_str < rhs.left_dna_str;
162 if (left_ext_len != rhs.left_ext_len)
163 return left_ext_len < rhs.left_ext_len;
164 if (right_dna_str != rhs.right_dna_str)
165 return right_dna_str < rhs.right_dna_str;
166 if (right_ext_len != rhs.right_ext_len)
167 return right_ext_len < rhs.right_ext_len;
168 return false;
169 }
170
171 bool operator==(const MerExtension& rhs) const
172 {
173 bool eq = left_dna_str == rhs.left_dna_str &&
174 right_dna_str == rhs.right_dna_str &&
175 left_ext_len == rhs.left_ext_len &&
176 right_ext_len == rhs.right_ext_len;
177
178 return eq;
179 }
180 };
181
182
183 /// For converting from ASCII to the Dna5 code where A=0, C=1, G=2,
184 /// T=3, N=4
185 uint8_t charToDna5[] = {
186 /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
187 /* 16 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
188 /* 32 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
189 /* 48 */ 0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
190 /* 64 */ 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0,
191 /* A C G N */
192 /* 80 */ 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
193 /* T */
194 /* 96 */ 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0,
195 /* a c g n */
196 /* 112 */ 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
197 /* t */
198 /* 128 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
199 /* 144 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
200 /* 160 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
201 /* 176 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
202 /* 192 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
203 /* 208 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
204 /* 224 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
205 /* 240 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
206 };
207
208
209 typedef vector<vector<MerExtension> > MerExtensionTable;
210 typedef vector<uint32_t> MerExtensionCounts;
211
212 MerExtensionTable extensions;
213 MerExtensionCounts extension_counts;
214
215 uint64_t dna5str_to_idx(const string& str)
216 {
217 uint64_t idx = 0;
218
219 for (size_t i = 0; i < str.length(); ++i)
220 {
221 idx <<=2;
222 char c = toupper(str[i]);
223 idx |= (0x3 & charToDna5[(size_t)c]);
224 }
225 return idx;
226 }
227
228 uint64_t colorstr_to_idx(const string& str)
229 {
230 uint64_t idx = 0;
231
232 for (size_t i = 0; i < str.length(); ++i)
233 {
234 idx <<=2;
235 char c = str[i];
236 idx |= (0x3 & charToDna5[(size_t)c]);
237 }
238 return idx;
239 }
240
241 void store_read_extensions(MerExtensionTable& ext_table,
242 int seq_key_len,
243 int min_ext_len,
244 const string& seq,
245 bool use_precount_table)
246 {
247 // h is will hold the 2-bit-per-base representation of the k-mer seeds for
248 // this read.
249
250 uint64_t seed = 0;
251 bitset<256> left = 0;
252 bitset<256> right = 0;
253 const char* p = seq.c_str();
254
255 unsigned int seq_len = (int)seq.length();
256 const char* seq_end = p + seq_len;
257
258 // Build the first seed
259 while (p < seq.c_str() + (2 * seq_key_len))
260 {
261 seed <<= 2;
262 seed |= (0x3 & charToDna5[(size_t)*p]);
263 ++p;
264 }
265
266 // Build the rest of them with a sliding window, adding successive bases
267 // to the "right-remainder" word.
268 while (p < seq_end)
269 {
270 right <<= 2;
271 right |= (0x3 & charToDna5[(size_t)*p]);
272 ++p;
273 }
274
275 // This loop will construct successive seed, along with 32-bit words
276 // containing the left and right remainders for each seed
277 uint32_t i = 0;
278 size_t new_hits = 0;
279 do
280 {
281 // How many base pairs exist in the right remainder beyond what we have
282 // space for ?
283 int extra_right_bp = ((int)seq.length() -
284 (i + 2 * seq_key_len)) - MerExtension::MAX_EXTENSION_BP;
285
286 uint32_t hit_right = 0;
287 if (extra_right_bp > 0)
288 {
289 //bitset<32> tmp_hit_right = (right >> (extra_right_bp << 1));
290 hit_right = (uint32_t)(right >> (extra_right_bp << 1)).to_ulong();
291 }
292 else
293 {
294 hit_right = (uint32_t)right.to_ulong();
295 }
296
297 uint32_t hit_left = (uint32_t)((left << (256 - 32)) >> (256 - 32)).to_ulong();
298
299 //size_t prev_cap = (*mer_table)[seed].capacity();
300 //(*mer_table)[seed].push_back(ReadHit(hit_left, hit_right,i, read_num, reverse_complement));
301 //cap_increase += ((*mer_table)[seed].capacity() - prev_cap) * sizeof (ReadHit);
302
303 MerExtension ext;
304
305 ext.right_dna_str = hit_right;
306 ext.right_ext_len = min(seq_len - (2 * seq_key_len) - i,
307 (unsigned int)MerExtension::MAX_EXTENSION_BP);
308
309 ext.left_dna_str = hit_left;
310 ext.left_ext_len = min(i, (unsigned int)MerExtension::MAX_EXTENSION_BP);
311 if (use_precount_table)
312 {
313 int curr_seed = --extension_counts[seed];
314 if (curr_seed < 0 || curr_seed > (int)ext_table[seed].size())
315 {
316 fprintf(stderr, "Error: curr_seed is %d, max is %lu\n", curr_seed, (long unsigned int)ext_table[seed].size());
317 }
318
319 ext_table[seed][curr_seed] = ext;
320 }
321 else
322 {
323 ext_table[seed].push_back(ext);
324 }
325 new_hits++;
326
327 // Take the leftmost base of the seed and stick it into bp
328 uint64_t bp = seed & (0x3uLL << ((seq_key_len << 2) - 2));
329
330 // Move that base down to the least significant bits of bp
331 bp >>= ((seq_key_len << 2) - 2);
332
333 // And tack it onto the left remainder of the read
334 left <<= 2;
335 left |= bp;
336
337 // Now take the leftmost base of the right remainder and stick it into
338 // the rightmost position of the seed
339 uint32_t right_len = seq_len - (i + seq_key_len * 2);
340 //bp = right & (0x3uLL << ((right_len - 1) << 1));
341
342 seed <<= 2;
343 //cout << right << endl;
344 bitset<256> tmp_right = (right >> ((right_len - 1) << 1));
345 //cout <<tmp_right << endl;
346 seed |= tmp_right.to_ulong();
347 seed &= ~(0xFFFFFFFFFFFFFFFFuLL << (seq_key_len << 2));
348
349
350 //Now remove that leftmost base of the right remainder
351 //right &= ~(0xFFFFFFFFFFFFFFFFuLL << (right_len - 1 << 1));
352 if (right_len)
353 {
354 right.set(((right_len - 1) << 1), 0);
355 right.set(((right_len - 1) << 1) + 1, 0);
356 }
357 ++i;
358
359 }while(i <= (size_t)(seq_end - seq.c_str()) - (2 * seq_key_len));
360 return;
361 }
362
363 void count_read_extensions(MerExtensionCounts& ext_counts,
364 int seq_key_len,
365 int min_ext_len,
366 const string& seq)
367 {
368 // h is will hold the 2-bit-per-base representation of the k-mer seeds for
369 // this read.
370
371 uint64_t seed = 0;
372 bitset<256> left = 0;
373 bitset<256> right = 0;
374 const char* p = seq.c_str();
375
376 unsigned int seq_len = (int)seq.length();
377 const char* seq_end = p + seq_len;
378
379 // Build the first seed
380 while (p < seq.c_str() + (2 * seq_key_len))
381 {
382 seed <<= 2;
383 seed |= (0x3 & charToDna5[(size_t)*p]);
384 ++p;
385 }
386
387 // Build the rest of them with a sliding window, adding successive bases
388 // to the "right-remainder" word.
389 while (p < seq_end)
390 {
391 right <<= 2;
392 right |= (0x3 & charToDna5[(size_t)*p]);
393 ++p;
394 }
395
396 // This loop will construct successive seed, along with 32-bit words
397 // containing the left and right remainders for each seed
398 uint32_t i = 0;
399 size_t new_hits = 0;
400 do
401 {
402 ext_counts[seed]++;
403
404 new_hits++;
405
406 // Take the leftmost base of the seed and stick it into bp
407 uint64_t bp = seed & (0x3uLL << ((seq_key_len << 2) - 2));
408
409 // Move that base down to the least significant bits of bp
410 bp >>= ((seq_key_len << 2) - 2);
411
412 // And tack it onto the left remainder of the read
413 left <<= 2;
414 left |= bp;
415
416 // Now take the leftmost base of the right remainder and stick it into
417 // the rightmost position of the seed
418 uint32_t right_len = seq_len - (i + seq_key_len * 2);
419 //bp = right & (0x3uLL << ((right_len - 1) << 1));
420
421 seed <<= 2;
422 //cout << right << endl;
423 bitset<256> tmp_right = (right >> ((right_len - 1) << 1));
424 //cout <<tmp_right << endl;
425 seed |= tmp_right.to_ulong();
426 seed &= ~(0xFFFFFFFFFFFFFFFFuLL << (seq_key_len << 2));
427
428
429 //Now remove that leftmost base of the right remainder
430 //right &= ~(0xFFFFFFFFFFFFFFFFuLL << (right_len - 1 << 1));
431 if (right_len)
432 {
433 right.set(((right_len - 1) << 1), 0);
434 right.set(((right_len - 1) << 1) + 1, 0);
435 }
436 ++i;
437
438 }while(i <= (size_t)(seq_end - seq.c_str()) - (2 * seq_key_len));
439 return;
440 }
441
442 //void count_read_mers(FILE* reads_file, size_t half_splice_mer_len)
443 void count_read_mers(FZPipe& reads_file, size_t half_splice_mer_len)
444 {
445 Read read;
446 size_t splice_mer_len = 2 * half_splice_mer_len;
447 size_t mer_table_size = 1 << ((splice_mer_len)<<1);
448 extension_counts.resize(mer_table_size);
449 FLineReader fr(reads_file);
450 //while(!feof(reads_file))
451 while (!fr.isEof())
452 {
453 read.clear();
454
455 // Get the next read from the file
456 if (!next_fasta_record(fr, read.name, read.seq, reads_format))
457 break;
458 if (reads_format == FASTQ)
459 {
460 if (!next_fastq_record(fr, read.seq, read.alt_name, read.qual, reads_format))
461 break;
462 }
463
464 if (color)
465 // erase the primer and the adjacent color
466 read.seq.erase(0, 2);
467
468 if (read.seq.size() > 32)
469 read.seq.resize(32);
470
471 count_read_extensions(extension_counts,
472 half_splice_mer_len,
473 half_splice_mer_len,
474 read.seq);
475 }
476
477 //rewind(reads_file);
478 reads_file.rewind();
479 }
480
481 void compact_extension_table()
482 {
483 for (size_t i = 0; i < extensions.size(); ++i)
484 {
485 vector<MerExtension>& exts = extensions[i];
486 sort(exts.begin(), exts.end());
487 vector<MerExtension>::iterator new_end = unique(exts.begin(), exts.end());
488 exts.erase(new_end, exts.end());
489 vector<MerExtension>(exts).swap(exts);
490 }
491 }
492
493 void prune_extension_table(uint8_t max_extension_bp)
494 {
495 uint32_t mask = ~(0xFFFFFFFFuLL << (max_extension_bp << 1));
496
497 for (size_t i = 0; i < extensions.size(); ++i)
498 {
499 vector<MerExtension>& exts = extensions[i];
500 for (size_t j = 0; j < exts.size(); ++j)
501 {
502 MerExtension& ex = exts[j];
503 if (ex.left_ext_len > max_extension_bp)
504 {
505 ex.left_ext_len = max_extension_bp;
506 ex.left_dna_str &= mask;
507 }
508
509 if (ex.right_ext_len > max_extension_bp)
510 {
511 ex.right_dna_str >>= ((ex.right_ext_len - max_extension_bp) << 1);
512 ex.right_ext_len = max_extension_bp;
513 }
514 }
515 }
516 }
517
518 //void store_read_mers(FILE* reads_file, size_t half_splice_mer_len)
519 void store_read_mers(FZPipe& reads_file, size_t half_splice_mer_len)
520 {
521 Read read;
522 size_t splice_mer_len = 2 * half_splice_mer_len;
523
524 size_t mer_table_size = 1 << ((splice_mer_len)<<1);
525 extensions.resize(mer_table_size);
526
527 size_t num_indexed_reads = 0;
528 FLineReader fr(reads_file);
529 //while(!feof(reads_file))
530 while(!fr.isEof())
531 {
532 read.clear();
533
534 // Get the next read from the file
535 if (!next_fasta_record(fr, read.name, read.seq, reads_format))
536 break;
537 if (reads_format == FASTQ)
538 {
539 if (!next_fastq_record(fr, read.seq, read.alt_name, read.qual, reads_format))
540 break;
541 }
542
543 if (color)
544 // erase the primer and the adjacent color
545 read.seq.erase(0, 2);
546
547 if (read.seq.size() > 32)
548 read.seq.resize(32);
549
550 store_read_extensions(extensions,
551 half_splice_mer_len,
552 half_splice_mer_len,
553 read.seq,
554 true);
555
556 // Do NOT index the reverse of the reads
557
558 num_indexed_reads++;
559 if (num_indexed_reads % 1000000 == 0)
560 {
561 //fprintf(stderr, "Indexed %lu reads, compacting extension table\n", num_indexed_reads);
562 //compact_extension_table();
563 }
564 }
565
566 //fprintf(stderr, "Indexed %lu reads, compacting extension table\n", num_indexed_reads)
567
568 uint64_t num_extensions = 0;
569 for (size_t i = 0; i < extensions.size(); ++i)
570 {
571 num_extensions += extensions[i].size();
572 }
573 //fprintf (stderr, "Total extensions: %lu\n", (long unsigned int)num_extensions);
574 reads_file.rewind();
575 }
576
577 //void index_read_mers(vector<FILE*> reads_files,
578 void index_read_mers(vector<FZPipe>& reads_files,
579 size_t half_splice_mer_len)
580 {
581 extensions.clear();
582 for (size_t i = 0; i < reads_files.size(); ++i)
583 {
584 count_read_mers(reads_files[i], half_splice_mer_len);
585 }
586
587 extensions.resize(extension_counts.size());
588
589 for (size_t i = 0; i < extension_counts.size(); ++i)
590 {
591 extensions[i].resize(extension_counts[i]);
592 }
593
594 for (size_t i = 0; i < reads_files.size(); ++i)
595 {
596 store_read_mers(reads_files[i], half_splice_mer_len);
597 }
598
599 compact_extension_table();
600 }
601
602 /** Returns the number of characters in strings w1 and w2 that match,
603 * starting from right to left
604 */
605 int get_matching_chars(uint32_t w1, uint32_t w2)
606 {
607 //find the least significant mismatching bit between w1 and w2
608 int mismatch_bit = ffs(w1 ^ w2);
609
610 // If there is no mismatching bit, the words are equal
611 if (!mismatch_bit)
612 return -1;
613
614 // Given the mismatching bit, determine where the mismatching base is
615 mismatch_bit -= 1;
616 mismatch_bit -= ((mismatch_bit) & 1);
617 mismatch_bit >>= 1;
618
619 // Return the number of matching characters.
620 return mismatch_bit;
621 }
622
623 /**
624 * Computes the Hamming distance between two 16bp words, up to a specified
625 * maximum number of mismatches.
626 */
627 uint32_t mismatching_bases(uint32_t w1_word,
628 uint32_t w2_word,
629 int len,
630 uint32_t max_mis)
631 {
632 uint32_t diffs = 0;
633
634 int shift = 0;
635 int L = len;
636 uint32_t misses = 0;
637
638 // While we haven't yet exceeded the maximum allowable mismatches,
639 // and there are still unaligned bases, keep shift-anding
640 while (shift < len && misses <= max_mis)
641 {
642 int match_chars = 0;
643
644 // Get the number of characters matching on the right sides of
645 // both words
646 match_chars = get_matching_chars(w1_word, w2_word);
647
648 // If they are equal for this shift, we are done,
649 // the loop will stop at the next iteration
650 if (match_chars == -1 || match_chars >= len)
651 {
652 match_chars = len;
653 shift = len;
654 }
655 else
656 {
657 // If there is a mismatch in the remaining words
658 // decide how much to shift by and try again
659 match_chars = min(len, match_chars);
660 int shift_chars = (match_chars + 1);
661
662 L -= shift_chars;
663
664 int shift_bits = shift_chars << 1;
665
666 // Shift right past the matching part and the first mismatching base
667 w1_word >>= (shift_bits);
668 w2_word >>= (shift_bits);
669
670 shift += shift_chars;
671 diffs++;
672 misses++;
673 }
674 }
675
676 return diffs;
677 }
678
679 uint64_t rc_dna_str(uint64_t dna_str)
680 {
681 dna_str = ~dna_str;
682 uint64_t rc = 0;
683 for (int i = 0; i < 32; i++)
684 {
685 rc <<= 2;
686 rc |= (dna_str & 0x3);
687 dna_str >>= 2;
688 }
689 return rc;
690 }
691
692 uint64_t rc_color_str(uint64_t color_str)
693 {
694 uint64_t rc = 0;
695 for (int i = 0; i < 32; ++i)
696 {
697 rc <<= 2;
698 rc |= (color_str & 0x3);
699 color_str >>= 2;
700 }
701 return rc;
702 }
703
704 struct DnaSpliceStrings
705 {
706 DnaSpliceStrings(uint64_t f, uint64_t r) : fwd_string(f), rev_string(r), first_in_string('N'), last_in_string('N') {}
707 uint64_t fwd_string;
708 uint64_t rev_string;
709
710 // for color-space purposes
711 char first_in_string;
712 char last_in_string;
713
714 bool operator<(const DnaSpliceStrings& rhs) const
715 {
716 if (fwd_string != rhs.fwd_string)
717 return fwd_string < rhs.fwd_string;
718 if (rev_string != rhs.rev_string)
719 return rev_string < rhs.rev_string;
720 return false;
721 }
722
723 bool operator==(const DnaSpliceStrings& rhs) const
724 {
725 return fwd_string == rhs.fwd_string && rev_string == rhs.rev_string;
726 }
727 };
728
729 struct IntronMotifs
730 {
731 IntronMotifs(uint32_t rid) : ref_id(rid) {}
732 uint32_t ref_id;
733
734 vector<pair<size_t, DnaSpliceStrings> > fwd_donors;
735 vector<pair<size_t, DnaSpliceStrings> > fwd_acceptors;
736 vector<pair<size_t, DnaSpliceStrings> > rev_donors;
737 vector<pair<size_t, DnaSpliceStrings> > rev_acceptors;
738
739 void unique(vector<pair<size_t, DnaSpliceStrings> >& f)
740 {
741 sort(f.begin(), f.end());
742 vector<pair<size_t, DnaSpliceStrings> >::iterator i = std::unique(f.begin(), f.end());
743 f.erase(i, f.end());
744 }
745
746 void unique()
747 {
748 unique(fwd_donors);
749 unique(fwd_acceptors);
750 unique(rev_donors);
751 unique(rev_acceptors);
752 }
753
754 void attach_mers(RefSequenceTable::Sequence& ref_str)
755 {
756 attach_upstream_mers(ref_str, fwd_donors);
757 attach_upstream_mers(ref_str, rev_acceptors);
758
759 attach_downstream_mers(ref_str, rev_donors);
760 attach_downstream_mers(ref_str, fwd_acceptors);
761 }
762
763 void attach_upstream_mers(RefSequenceTable::Sequence& ref_str,
764 vector<pair<size_t, DnaSpliceStrings> >& dinucs)
765 {
766 for (size_t i = 0; i < dinucs.size(); ++i)
767 {
768 size_t pos = dinucs[i].first;
769 int half_splice_mer_len = 32;
770
771 if (color)
772 {
773 if (pos <= (size_t)half_splice_mer_len+1 || pos >= length(ref_str))
774 continue;
775
776 Dna5String seg_str = seqan::infixWithLength(ref_str,
777 pos - half_splice_mer_len - 1,
778 half_splice_mer_len + 1);
779 stringstream ss(stringstream::in | stringstream::out);
780 string s;
781 ss << seg_str;
782 ss >> s;
783
784 string col_seg_str = convert_bp_to_color(s, true);
785 uint64_t idx = colorstr_to_idx(col_seg_str);
786
787 dinucs[i].second.fwd_string = idx;
788 dinucs[i].second.rev_string = rc_color_str(idx);
789 dinucs[i].second.first_in_string = s[1];
790 dinucs[i].second.last_in_string = s[half_splice_mer_len];
791 }
792 else
793 {
794 if (pos <= (size_t)half_splice_mer_len || pos >= length(ref_str))
795 continue;
796
797 Dna5String seg_str = seqan::infixWithLength(ref_str,
798 pos - half_splice_mer_len,
799 half_splice_mer_len);
800
801 stringstream ss(stringstream::in | stringstream::out);
802 string s;
803 ss << seg_str;
804 ss >> s;
805 uint64_t idx = dna5str_to_idx(s);
806 dinucs[i].second.fwd_string = idx;
807 dinucs[i].second.rev_string = rc_dna_str(idx);
808 }
809 }
810 }
811
812
813 void attach_downstream_mers(RefSequenceTable::Sequence& ref_str,
814 vector<pair<size_t, DnaSpliceStrings> >& dinucs)
815 {
816 for (size_t i = 0; i < dinucs.size(); ++i)
817 {
818 size_t pos = dinucs[i].first;
819
820 int half_splice_mer_len = 32;
821
822 if (pos + 2 + half_splice_mer_len >= length(ref_str))
823 continue;
824
825 if (color)
826 {
827 Dna5String seg_str = seqan::infixWithLength(ref_str,
828 pos + 2 - 1,
829 half_splice_mer_len + 1);
830 stringstream ss(stringstream::in | stringstream::out);
831 string s;
832 ss << seg_str;
833 ss >> s;
834
835 string col_seg_str = convert_bp_to_color(s, true);
836 uint64_t idx = colorstr_to_idx(col_seg_str);
837
838 dinucs[i].second.fwd_string = idx;
839 dinucs[i].second.rev_string = rc_color_str(idx);
840 dinucs[i].second.first_in_string = s[1];
841 dinucs[i].second.last_in_string = s[half_splice_mer_len];
842 }
843 else
844 {
845 Dna5String seg_str = seqan::infixWithLength(ref_str,
846 pos + 2,
847 half_splice_mer_len);
848
849 stringstream ss(stringstream::in | stringstream::out);
850 string s;
851 ss << seg_str;
852 ss >> s;
853 uint64_t idx = dna5str_to_idx(s);
854
855 dinucs[i].second.fwd_string = idx;
856 dinucs[i].second.rev_string = rc_dna_str(idx);
857 }
858 }
859 }
860 };
861
862 struct PackedSplice
863 {
864 PackedSplice() : left(0u), seed(0u), right(0u) {}
865 PackedSplice(uint32_t l, uint64_t s, uint32_t r) : left(l), seed(s), right(r) {}
866 uint32_t left;
867 uint64_t seed;
868 uint32_t right;
869 };
870
871 // The second element of these pairs is the left (or right) side of a splice
872 // seed from a possible junction. The first element is the sequence flanking
873 // that seed
874 typedef pair<uint32_t, uint32_t> PackedSpliceHalf;
875
876 static inline std::string u32ToDna(uint32_t a, int len) {
877 char buf[17];
878 assert(len <= 16);
879 for(int i = 0; i < len; i++) {
880 buf[len-i-1] = "ACGT"[a & 3];
881 a >>= 2;
882 }
883 buf[len] = '\0';
884 return std::string(buf);
885 }
886
887
888 PackedSpliceHalf pack_left_splice_half(const string& seq,
889 uint32_t pos_in_l,
890 unsigned int seq_key_len)
891 {
892 const char* l = seq.c_str();
893 l += pos_in_l;
894
895 const char* left_end = l;
896 l -= 16;
897
898 assert (l + seq_key_len < seq.c_str() + seq.length());
899
900 PackedSpliceHalf packed_half = make_pair(0u,0u);
901
902 // Pack up to 32 bits (16 bases) of sequence into left
903 if (l < seq.c_str())
904 l = seq.c_str();
905 while (l < left_end)
906 {
907 packed_half.first <<= 2;
908 packed_half.first |= (0x3 & charToDna5[(size_t)*l]);
909 ++l;
910 }
911
912 // Pack up the seed bits
913 for (unsigned int i = 0; i < seq_key_len; ++i)
914 {
915 packed_half.second <<= 2;
916 packed_half.second |= (0x3 & charToDna5[(size_t)*(l + i)]);
917 }
918 return packed_half;
919 }
920
921
922 PackedSpliceHalf pack_right_splice_half(const string& seq,
923 uint32_t pos,
924 unsigned int seq_key_len)
925 {
926 const char* r = seq.c_str();
927 r += pos - seq_key_len;
928
929 PackedSpliceHalf packed_half = make_pair(0u,0u);
930
931 // Pack the seed bits
932 for (unsigned int i = 0; i < seq_key_len; ++i)
933 {
934 packed_half.second <<= 2;
935 packed_half.second |= (0x3 & charToDna5[(size_t)*(r + i)]);
936 }
937
938 r += seq_key_len;
939 // Now pack 32 bits (16 bases) of sequence into left
940 const char* right_end = r + 16;
941
942 if ((size_t)(right_end - seq.c_str()) > seq.length())
943 right_end = seq.c_str() + seq.length();
944 while (r < right_end)
945 {
946 packed_half.first <<= 2;
947 packed_half.first |= (0x3 & charToDna5[(size_t)*r]);
948 ++r;
949 }
950
951 return packed_half;
952 }
953
954 PackedSplice combine_splice_halves(const PackedSpliceHalf& left_half,
955 const PackedSpliceHalf& right_half,
956 int seq_key_len)
957 {
958 uint64_t seed = left_half.second << (seq_key_len << 1) | right_half.second;
959 return PackedSplice(left_half.first,seed, right_half.first);
960 }
961
962 PackedSplice pack_splice(const string& seq,
963 int l_pos_in_seq,
964 int r_pos_in_seq,
965 unsigned int seq_key_len)
966 {
967 const char* l = seq.c_str(); // l points to beginning of left exon sequence
968 l += l_pos_in_seq;
969
970 assert (l + seq_key_len < seq.c_str() + seq.length());
971
972 const char* r = seq.c_str(); // r points to beginning of right exon sequence
973 r += r_pos_in_seq - seq_key_len;
974 //r += 2; // r points to right side of junction;
975
976 uint64_t seed = 0;
977 uint64_t left = 0;
978 uint64_t right = 0;
979
980 // Pack up the seed bits
981 for (unsigned int i = 0; i < seq_key_len; ++i)
982 {
983 seed <<= 2;
984 seed |= (0x3 & charToDna5[(size_t)*(l + i)]);
985 }
986
987 for (unsigned int i = 0; i < seq_key_len; ++i)
988 {
989 seed <<= 2;
990 seed |= (0x3 & charToDna5[(size_t)*(r + i)]);
991 }
992
993 // Now pack 32 bits (16 bases) of sequence into left
994 const char* left_end = l;
995 l -= 16;
996 if (l < seq.c_str())
997 l = seq.c_str();
998 while (l < left_end)
999 {
1000 left <<= 2;
1001 left |= (0x3 & charToDna5[(size_t)*l]);
1002 ++l;
1003 }
1004
1005 r += seq_key_len;
1006 // Now pack 32 bits (16 bases) of sequence into left
1007 const char* right_end = r + 16;
1008
1009 if ((size_t)(right_end - seq.c_str()) > seq.length())
1010 right_end = seq.c_str() + seq.length();
1011 while (r < right_end)
1012 {
1013 right <<= 2;
1014 right |= (0x3 & charToDna5[(size_t)*r]);
1015 ++r;
1016 }
1017
1018 return PackedSplice((uint32_t)left,seed,(uint32_t)right);
1019 }
1020
1021
1022
1023 /* Represents a hit between a splice seed and a read. */
1024 // TODO: consider packing pos and meta into a single 32-bit int.
1025 struct ReadHit
1026 {
1027 ReadHit(uint32_t l, uint32_t r, uint32_t p, uint32_t m, bool rc)
1028 : left(l), right(r), pos(p), meta(m), reverse_complement(rc) {}
1029 uint32_t left; // 2-bits per base rep of the left remainder
1030 uint32_t right; //2-bits per base rep of the right remainder
1031 uint32_t pos; // position of the seed within the read
1032 uint32_t meta : 31;
1033 bool reverse_complement : 1;
1034 } __attribute__((packed));
1035
1036 // A MerTable maps k-mers to hits in indexed reads. See the comment for
1037 // mer_table
1038 typedef vector<ReadHit> ReadHitList;
1039 typedef vector<ReadHitList> MerTable;
1040
1041 size_t index_read(MerTable* mer_table,
1042 int seq_key_len,
1043 const string& seq,
1044 unsigned int read_num,
1045 bool reverse_complement,
1046 vector<uint64_t>& seeds)
1047 {
1048 // h is will hold the 2-bit-per-base representation of the k-mer seeds for
1049 // this read.
1050
1051 uint64_t seed = 0;
1052 bitset<256> left = 0;
1053 bitset<256> right = 0;
1054 const char* p = seq.c_str();
1055
1056 unsigned int seq_len = (int)seq.length();
1057 const char* seq_end = p + seq_len;
1058
1059 // Build the first seed
1060 while (p < seq.c_str() + (2 * seq_key_len))
1061 {
1062 seed <<= 2;
1063 seed |= (0x3 & charToDna5[(size_t)*p]);
1064 ++p;
1065 }
1066
1067 seeds.push_back(seed);
1068
1069 // Build the rest of them with a sliding window, adding successive bases
1070 // to the "right-remainder" word.
1071 while (p < seq_end)
1072 {
1073
1074 right <<= 2;
1075 right |= (0x3 & charToDna5[(size_t)*p]);
1076 ++p;
1077 }
1078
1079 size_t cap_increase = 0;
1080
1081 // At this point, seed contains the 5'-most 2*min_anchor_len bases of the
1082 // read, and right contains everthing else on the 3' end.
1083
1084 // This loop will construct successive seed, along with 32-bit words
1085 // containing the left and right remainders for each seed
1086 uint32_t i = 0;
1087 size_t new_hits = 0;
1088 do
1089 {
1090 // Let's not make an out-of-bounds write, if this fails the global
1091 // mer_table is too small
1092 assert (!mer_table || seed < mer_table->size());
1093
1094 // How many base pairs exist in the right remainder beyond what we have
1095 // space for ?
1096 int extra_right_bp = ((int)seq.length() - (i + 2 * seq_key_len)) - 16;
1097
1098 uint32_t hit_right;
1099 if (extra_right_bp > 0)
1100 {
1101 //bitset<32> tmp_hit_right = (right >> (extra_right_bp << 1));
1102 hit_right = (uint32_t)(right >> (extra_right_bp << 1)).to_ulong();
1103 }
1104 else
1105 {
1106 hit_right = (uint32_t)right.to_ulong();
1107 }
1108
1109 uint32_t hit_left = (uint32_t)((left << (256 - 32)) >> (256 - 32)).to_ulong();
1110
1111 if (mer_table)
1112 {
1113 size_t prev_cap = (*mer_table)[seed].capacity();
1114 (*mer_table)[seed].push_back(ReadHit(hit_left, hit_right,i, read_num, reverse_complement));
1115 cap_increase += ((*mer_table)[seed].capacity() - prev_cap) * sizeof (ReadHit);
1116 }
1117 new_hits++;
1118
1119 // Take the leftmost base of the seed and stick it into bp
1120 uint64_t bp = seed & (0x3uLL << ((seq_key_len << 2) - 2));
1121
1122 // Move that base down to the least significant bits of bp
1123 bp >>= ((seq_key_len << 2) - 2);
1124
1125 // And tack it onto the left remainder of the read
1126 left <<= 2;
1127 left |= bp;
1128
1129 // Now take the leftmost base of the right remainder and stick it into
1130 // the rightmost position of the seed
1131 uint32_t right_len = seq_len - (i + seq_key_len * 2);
1132 //bp = right & (0x3uLL << ((right_len - 1) << 1));
1133
1134 seed <<= 2;
1135 //cout << right << endl;
1136 bitset<256> tmp_right = (right >> ((right_len - 1) << 1));
1137 //cout <<tmp_right << endl;
1138 seed |= tmp_right.to_ulong();
1139 seed &= ~(0xFFFFFFFFFFFFFFFFuLL << (seq_key_len << 2));
1140
1141 seeds.push_back(seed);
1142
1143 //Now remove that leftmost base of the right remainder
1144 //right &= ~(0xFFFFFFFFFFFFFFFFuLL << (right_len - 1 << 1));
1145 if (right_len)
1146 {
1147 right.set(((right_len - 1) << 1), 0);
1148 right.set(((right_len - 1) << 1) + 1, 0);
1149 }
1150 ++i;
1151
1152 }while(i <= (size_t)(seq_end - seq.c_str()) - (2 * seq_key_len));
1153 return cap_increase;
1154 }
1155
1156
1157
1158 struct SeedExtension
1159 {
1160 SeedExtension(int lp,
1161 int rp,
1162 int rd_p,
1163 int le,
1164 int re,
1165 int mm) :
1166 l_pos_in_ref(lp),
1167 r_pos_in_ref(rp),
1168 read_pos(rd_p),
1169 left_extent(le),
1170 right_extent(re),
1171 mismatches(mm){}
1172
1173 int l_pos_in_ref;
1174 int r_pos_in_ref;
1175 int read_pos;
1176 int left_extent;
1177 int right_extent;
1178 int mismatches;
1179 };
1180
1181 pair<string::size_type, int> left_extend(const string& ref,
1182 const string& read,
1183 int ref_pos,
1184 int read_pos,
1185 int num_mismatches)
1186 {
1187 string::size_type ext = 0;
1188 int mm_encountered = 0;
1189 while(ref_pos >= 0 &&
1190 read_pos >= 0)
1191 {
1192 //char ref_char = ref[ref_pos];
1193 //char read_char = read[read_pos];
1194 if (ref[ref_pos] != read[read_pos])
1195 {
1196 if (mm_encountered + 1 > num_mismatches)
1197 return make_pair(ext, mm_encountered);
1198 mm_encountered++;
1199 }
1200 ext++;
1201
1202 --ref_pos;
1203 --read_pos;
1204 }
1205
1206 return make_pair(ext, mm_encountered);
1207 }
1208
1209 pair<string::size_type, int> right_extend(const string& ref,
1210 const string& read,
1211 int ref_pos,
1212 int read_pos,
1213 int num_mismatches)
1214 {
1215 string::size_type ext = 0;
1216 int mm_encountered = 0;
1217 while(ref_pos < (int)ref.size() &&
1218 read_pos < (int)read.size())
1219 {
1220 if (ref[ref_pos] != read[read_pos])
1221 {
1222 if (mm_encountered + 1 > num_mismatches)
1223 return make_pair(ext, mm_encountered);
1224 mm_encountered++;
1225 }
1226
1227 ext++;
1228
1229 ++ref_pos;
1230 ++read_pos;
1231 }
1232
1233 return make_pair(ext, mm_encountered);
1234 }
1235
1236
1237 void extend_from_seeds(vector<SeedExtension>& extensions,
1238 const PackedSplice& p,
1239 const MerTable& mer_table,
1240 const string& ref,
1241 const string& read,
1242 size_t l_pos_in_ref,
1243 size_t r_pos_in_ref,
1244 int seq_key_len)
1245 {
1246 assert(p.seed < mer_table.size());
1247 const ReadHitList& hl = mer_table[p.seed];
1248
1249 for (size_t hit = 0; hit < hl.size(); ++hit)
1250 {
1251 const ReadHit& rh = hl[hit];
1252 uint32_t pos = rh.pos;
1253
1254 pair<string::size_type, int> left_extension;
1255 pair<string::size_type, int> right_extension;
1256 left_extension = left_extend(ref, read, l_pos_in_ref - seq_key_len + 1, pos, 2);
1257 right_extension = right_extend(ref, read, r_pos_in_ref + seq_key_len, pos + 2 * seq_key_len, 2);
1258 extensions.push_back(SeedExtension(l_pos_in_ref,
1259 r_pos_in_ref,
1260 pos + seq_key_len,
1261 left_extension.first,
1262 right_extension.first,
1263 left_extension.second + right_extension.second));
1264
1265 }
1266 }
1267
1268 typedef pair<size_t, PackedSpliceHalf> SpliceHalf;
1269
1270 void get_seed_extensions(const string& ref,
1271 const string& read,
1272 int seq_key_len,
1273 MerTable& mer_table,
1274 vector<SpliceHalf>& donors,
1275 vector<SpliceHalf>& acceptors,
1276 vector<SeedExtension>& extensions)
1277 {
1278 for (size_t d = 0; d < donors.size(); ++d)
1279 {
1280 bool broke_out = false;
1281
1282 // start pos is a lower bound on downstream acceptor positions
1283 // to consider
1284 size_t start_pos = donors[d].first + min_report_intron_length;
1285
1286 SpliceHalf dummy = make_pair(start_pos,PackedSpliceHalf());
1287 vector<SpliceHalf>::iterator lb = upper_bound(acceptors.begin(),
1288 acceptors.end(),
1289 dummy);
1290
1291 if (lb == acceptors.end())
1292 break;
1293 for (size_t a = lb - acceptors.begin();
1294 a < acceptors.size();
1295 ++a)
1296 {
1297 if (acceptors[a].first - donors[d].first > (size_t)max_microexon_stretch)
1298 {
1299 broke_out = true;
1300 break;
1301 }
1302
1303 size_t l_pos_in_ref = donors[d].first - 1;
1304 size_t r_pos_in_ref = acceptors[a].first + 2;
1305 PackedSplice p = combine_splice_halves(donors[d].second,
1306 acceptors[a].second,
1307 seq_key_len);
1308 extend_from_seeds(extensions,
1309 p,
1310 mer_table,
1311 ref,
1312 read,
1313 l_pos_in_ref,
1314 r_pos_in_ref,
1315 seq_key_len);
1316 }
1317 if (broke_out)
1318 continue;
1319 }
1320
1321 }
1322
1323 void hits_from_seed_extension(uint32_t ref_id,
1324 int ref_offset,
1325 uint64_t insert_id,
1326 bool antisense,
1327 vector<SeedExtension>& extensions,
1328 vector<BowtieHit>& hits_out,
1329 int left_read_edge,
1330 int right_read_edge,
1331 int seq_key_len)
1332 {
1333 for (size_t i = 0; i < extensions.size(); ++i)
1334 {
1335 SeedExtension& s = extensions[i];
1336 if (s.read_pos >= right_read_edge ||
1337 s.read_pos < left_read_edge)
1338 continue;
1339 if (s.read_pos - seq_key_len - s.left_extent <= left_read_edge &&
1340 s.read_pos + seq_key_len + s.right_extent >= right_read_edge && s.mismatches <= 2 )
1341 {
1342 vector<CigarOp> cigar;
1343 int off_adjust;
1344 if (antisense)
1345 {
1346 CigarOp m1 = CigarOp(MATCH, s.read_pos - left_read_edge);
1347 CigarOp skip = CigarOp(REF_SKIP, s.r_pos_in_ref - s.l_pos_in_ref);
1348 CigarOp m2 = CigarOp(MATCH, right_read_edge - s.read_pos);
1349 cigar.push_back(m1);
1350 cigar.push_back(skip);
1351 cigar.push_back(m2);
1352 off_adjust = m1.length;
1353 }
1354 else
1355 {
1356 CigarOp m1 = CigarOp(MATCH, s.read_pos - left_read_edge + 1);
1357 CigarOp skip = CigarOp(REF_SKIP, s.r_pos_in_ref - s.l_pos_in_ref);
1358 CigarOp m2 = CigarOp(MATCH, right_read_edge - s.read_pos - 1);
1359 cigar.push_back(m1);
1360 cigar.push_back(skip);
1361 cigar.push_back(m2);
1362 off_adjust = m1.length;
1363 }
1364 // daehwan - check this
1365 bool end = false;
1366 BowtieHit bh(ref_id,
1367 ref_id,
1368 insert_id,
1369 ref_offset + s.l_pos_in_ref - off_adjust + 1,
1370 cigar,
1371 antisense,
1372 false,
1373 s.mismatches,
1374 0,
1375 end);
1376 hits_out.push_back(bh);
1377 }
1378 }
1379 }
1380
1381 void align(uint32_t ref_id,
1382 uint64_t insert_id,
1383 bool antisense,
1384 const string& ref,
1385 const string& read,
1386 int ref_offset,
1387 int left_read_edge,
1388 int right_read_edge,
1389 MerTable& mer_table,
1390 int seq_key_len,
1391 vector<BowtieHit>& hits_out)
1392 {
1393 // Reserve an entry for each k-mer we might see
1394 size_t mer_table_size = 1 << ((seq_key_len << 1)<<1);
1395
1396 mer_table.resize(mer_table_size);
1397
1398 vector<uint64_t> seeds;
1399 index_read(&mer_table, seq_key_len, read, 0, antisense, seeds);
1400
1401 vector<SpliceHalf> forward_donors;
1402 vector<SpliceHalf> forward_acceptors;
1403 vector<SpliceHalf> reverse_donors;
1404 vector<SpliceHalf> reverse_acceptors;
1405
1406 const string& seq = ref;
1407
1408 unsigned int pos = 0;
1409
1410
1411 for (size_t z = seq_key_len + 1; z < seq.length() - seq_key_len - 2; ++z)
1412 {
1413 char l = seq[z - 1];
1414 char r = seq[z];
1415 if (l == 'G' && r == 'T')
1416 {
1417 size_t donor_pos = pos + z - 1;
1418 size_t s = donor_pos - seq_key_len;
1419 PackedSpliceHalf p = pack_left_splice_half(seq, s, seq_key_len);
1420 forward_donors.push_back(make_pair(donor_pos,p));
1421 }
1422 if (l == 'A' && r == 'G')
1423 {
1424 size_t acceptor_pos = pos + z - 1;
1425 size_t s = acceptor_pos + 2 + seq_key_len;
1426 PackedSpliceHalf p = pack_right_splice_half(seq, s, seq_key_len);
1427 forward_acceptors.push_back(make_pair(acceptor_pos,p));
1428 }
1429 if (l == 'C' && r == 'T')
1430 {
1431 size_t acceptor_pos = pos + z - 1;
1432 size_t s = acceptor_pos - seq_key_len;
1433 PackedSpliceHalf p = pack_left_splice_half(seq, s, seq_key_len);
1434 reverse_acceptors.push_back(make_pair(pos + z - 1,p));
1435 }
1436 if (l == 'A' && r == 'C')
1437 {
1438 size_t donor_pos = pos + z - 1;
1439 size_t s = donor_pos + 2 + seq_key_len;
1440 PackedSpliceHalf p = pack_right_splice_half(seq, s, seq_key_len);
1441 reverse_donors.push_back(make_pair(donor_pos,p));
1442 }
1443 }
1444
1445 vector<SeedExtension> fwd_extensions;
1446 get_seed_extensions(seq,
1447 read,
1448 seq_key_len,
1449 mer_table,
1450 forward_donors,
1451 forward_acceptors,
1452 fwd_extensions);
1453
1454 hits_from_seed_extension(ref_id,
1455 ref_offset,
1456 insert_id,
1457 antisense,
1458 fwd_extensions,
1459 hits_out,
1460 left_read_edge,
1461 right_read_edge,
1462 seq_key_len);
1463
1464 //fprintf(stderr, "Found %d seed hits\n", fwd_extensions.size());
1465
1466 vector<SeedExtension> rev_extensions;
1467 get_seed_extensions(seq,
1468 read,
1469 seq_key_len,
1470 mer_table,
1471 reverse_donors,
1472 reverse_acceptors,
1473 rev_extensions);
1474
1475 hits_from_seed_extension(ref_id,
1476 ref_offset,
1477 insert_id,
1478 antisense,
1479 rev_extensions,
1480 hits_out,
1481 left_read_edge,
1482 right_read_edge,
1483 seq_key_len);
1484
1485 for (size_t i = 0; i < seeds.size(); ++i)
1486 mer_table[seeds[i]].clear();
1487 }
1488
1489 int extension_mismatches = 0;
1490
1491
1492 bool left_extendable_junction(uint64_t upstream_dna_str,
1493 size_t key,
1494 size_t splice_mer_len,
1495 size_t min_ext_len)
1496 {
1497 vector<MerExtension>& exts = extensions[key];
1498 for (size_t i = 0; i < exts.size(); ++i)
1499 {
1500 const MerExtension& ext = exts[i];
1501 if (ext.left_ext_len < min_ext_len)
1502 continue;
1503 uint64_t upstream = upstream_dna_str & ~(0xFFFFFFFFFFFFFFFFull << (ext.left_ext_len << 1));
1504 int mism = mismatching_bases(ext.left_dna_str, upstream, ext.left_ext_len, extension_mismatches);
1505 if (mism <= extension_mismatches)
1506 return true;
1507 }
1508 return false;
1509 }
1510
1511 bool right_extendable_junction(uint64_t downstream_dna_str,
1512 size_t key,
1513 size_t splice_mer_len,
1514 size_t min_ext_len)
1515 {
1516 vector<MerExtension>& exts = extensions[key];
1517 for (size_t i = 0; i < exts.size(); ++i)
1518 {
1519 const MerExtension& ext = exts[i];
1520 if (ext.right_ext_len < min_ext_len)
1521 continue;
1522 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull >> (ext.right_ext_len << 1));
1523 uint64_t downstream = downstream_dna_str & mask;
1524 downstream >>= ((32 - ext.right_ext_len) << 1);
1525 int mism = mismatching_bases(ext.right_dna_str, downstream, ext.right_ext_len, extension_mismatches);
1526
1527 if (mism <= extension_mismatches)
1528 return true;
1529 }
1530 return false;
1531 }
1532
1533 uint32_t junction_key(uint64_t upstream_dna_str,
1534 uint64_t downstream_dna_str,
1535 size_t splice_mer_len)
1536 {
1537 uint64_t upstream_mask = ~(0xFFFFFFFFFFFFFFFFull << (splice_mer_len));
1538 uint64_t upstream_key_half = upstream_dna_str & upstream_mask;
1539 uint64_t downstream_mask = ~(0xFFFFFFFFFFFFFFFFull >> (splice_mer_len));
1540 uint64_t downstream_key_half = (downstream_dna_str & downstream_mask) >> (64 - splice_mer_len);
1541 uint32_t key = ((uint32_t)upstream_key_half << splice_mer_len) | (uint32_t)downstream_key_half;
1542 return key;
1543 }
1544
1545 bool extendable_junction(uint64_t upstream_dna_str,
1546 uint64_t downstream_dna_str,
1547 size_t splice_mer_len,
1548 size_t min_ext_len,
1549 bool reverse,
1550 char last_in_upstream = 'N',
1551 char first_in_downstream = 'N')
1552 {
1553 if (color)
1554 {
1555 string two_bp; two_bp.push_back(last_in_upstream); two_bp.push_back(first_in_downstream);
1556 string color = convert_bp_to_color(two_bp, true);
1557 char num = (color[0] - '0') & 0x3;
1558
1559 if (reverse)
1560 {
1561 upstream_dna_str = (upstream_dna_str >> 2) << 2;
1562 upstream_dna_str |= (uint64_t)num;
1563 }
1564 else
1565 {
1566 downstream_dna_str = (downstream_dna_str << 2) >> 2;
1567 downstream_dna_str |= ((uint64_t)num << 62);
1568 }
1569 }
1570
1571 uint32_t key = junction_key(upstream_dna_str,
1572 downstream_dna_str,
1573 splice_mer_len);
1574
1575 upstream_dna_str >>= splice_mer_len;
1576 downstream_dna_str <<= splice_mer_len;
1577
1578 bool extendable = (left_extendable_junction(upstream_dna_str,
1579 key, splice_mer_len, min_ext_len) ||
1580 right_extendable_junction(downstream_dna_str,
1581 key, splice_mer_len, min_ext_len));
1582 return extendable;
1583 }
1584
1585 typedef std::set<Junction, skip_count_lt> PotentialJuncs;
1586
1587 struct RecordExtendableJuncs
1588 {
1589 void record(uint32_t ref_id,
1590 const vector<pair<size_t, DnaSpliceStrings> >& left_sites,
1591 const vector<pair<size_t, DnaSpliceStrings> >& right_sites,
1592 bool antisense,
1593 PotentialJuncs& juncs,
1594 int min_intron,
1595 int max_intron,
1596 size_t max_juncs,
1597 size_t half_splice_mer_len)
1598 {
1599
1600 size_t splice_mer_len = 2 * half_splice_mer_len;
1601
1602 size_t curr_R = 0;
1603 for (size_t L = 0; L < left_sites.size(); ++L)
1604 {
1605 while (curr_R < right_sites.size() &&
1606 right_sites[curr_R].first < left_sites[L].first + min_intron)
1607 {
1608 curr_R++;
1609 }
1610
1611 size_t left_pos = left_sites[L].first;
1612 size_t max_right_pos = left_pos + max_intron;
1613 for (size_t R = curr_R;
1614 R < right_sites.size() && right_sites[R].first < max_right_pos; ++R)
1615 {
1616 uint64_t upstream_dna_str = left_sites[L].second.fwd_string;
1617 char last_in_upstream = left_sites[L].second.last_in_string;
1618 uint64_t downstream_dna_str = right_sites[R].second.fwd_string;
1619 char first_in_downstream = right_sites[R].second.first_in_string;
1620 uint64_t rc_upstream_dna_str = left_sites[L].second.rev_string;
1621 uint64_t rc_downstream_dna_str = right_sites[R].second.rev_string;
1622
1623 if (extendable_junction(upstream_dna_str,
1624 downstream_dna_str, splice_mer_len, 7, false,
1625 last_in_upstream, first_in_downstream) ||
1626 extendable_junction(rc_downstream_dna_str,
1627 rc_upstream_dna_str, splice_mer_len, 7, true,
1628 last_in_upstream, first_in_downstream))
1629 {
1630 juncs.insert(Junction(ref_id,
1631 left_sites[L].first - 1,
1632 right_sites[R].first + 2,
1633 antisense,
1634 R - curr_R));
1635 }
1636 if (juncs.size() > max_juncs)
1637 juncs.erase(*(juncs.rbegin()));
1638 }
1639 }
1640 }
1641 };
1642
1643 struct RecordAllJuncs
1644 {
1645 void record(uint32_t ref_id,
1646 const vector<pair<size_t, DnaSpliceStrings> >& left_sites,
1647 const vector<pair<size_t, DnaSpliceStrings> >& right_sites,
1648 bool antisense,
1649 PotentialJuncs& juncs,
1650 int min_intron,
1651 int max_intron,
1652 size_t max_juncs,
1653 size_t half_splice_mer_len)
1654 {
1655 size_t curr_R = 0;
1656 for (size_t L = 0; L < left_sites.size(); ++L)
1657 {
1658 while (curr_R < right_sites.size() &&
1659 right_sites[curr_R].first < left_sites[L].first + min_intron)
1660 {
1661 curr_R++;
1662 }
1663
1664 size_t left_pos = left_sites[L].first;
1665 size_t max_right_pos = left_pos + max_intron;
1666 for (size_t R = curr_R;
1667 R < right_sites.size() && right_sites[R].first < max_right_pos; ++R)
1668 {
1669 Junction j(ref_id,
1670 left_sites[L].first - 1,
1671 right_sites[R].first + 2,
1672 antisense,
1673 R - curr_R);
1674
1675 juncs.insert(j);
1676
1677 if (juncs.size() > max_juncs)
1678 juncs.erase(*(juncs.rbegin()));
1679 }
1680 }
1681 }
1682 };
1683
1684 struct RecordSegmentJuncs
1685 {
1686 void record(uint32_t ref_id,
1687 const vector<pair<size_t, DnaSpliceStrings> >& left_sites,
1688 const vector<pair<size_t, DnaSpliceStrings> >& right_sites,
1689 bool antisense,
1690 PotentialJuncs& juncs,
1691 int min_intron,
1692 int max_intron,
1693 size_t max_juncs,
1694 size_t half_splice_mer_len)
1695 {
1696 if (left_sites.size() != right_sites.size())
1697 return;
1698
1699 for (size_t i = 0; i < left_sites.size(); ++i)
1700 {
1701 Junction j(ref_id,
1702 left_sites[i].first - 1,
1703 right_sites[i].first + 2,
1704 antisense);
1705
1706 juncs.insert(j);
1707 if (juncs.size() > max_juncs)
1708 juncs.erase(*(juncs.rbegin()));
1709 }
1710 }
1711 };
1712
1713 struct ButterflyKey
1714 {
1715 uint32_t pos;
1716 uint32_t key;
1717
1718 ButterflyKey(uint32_t p, uint32_t k) : pos(p), key(k) {}
1719
1720 bool operator<(const ButterflyKey& rhs) const
1721 {
1722 if (key != rhs.key)
1723 return key < rhs.key;
1724 if (pos != rhs.pos)
1725 return pos < rhs.pos;
1726 return false;
1727 }
1728
1729 bool operator==(const ButterflyKey& rhs) const
1730 {
1731 return pos == rhs.pos && key == rhs.key;
1732 }
1733 };
1734
1735 uint32_t get_left_butterfly_key(uint64_t upstream_key,
1736 const MerExtension& ext,
1737 size_t half_splice_mer_len)
1738 {
1739 uint64_t key = ext.right_dna_str >> ((ext.right_ext_len - half_splice_mer_len) << 1);
1740 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (half_splice_mer_len << 1));
1741 uint64_t top_half = upstream_key & mask;
1742 key |= (top_half << (half_splice_mer_len << 1));
1743 return (uint32_t)key;
1744 }
1745
1746 uint32_t get_right_butterfly_key(uint64_t downstream_key,
1747 const MerExtension& ext,
1748 size_t half_splice_mer_len)
1749 {
1750 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (half_splice_mer_len << 1));
1751 uint64_t key = (ext.left_dna_str & mask) << (half_splice_mer_len << 1);
1752 uint64_t bottom_half = (downstream_key >> (half_splice_mer_len << 1));
1753 key |= bottom_half;
1754 return (uint32_t)key;
1755 }
1756
1757 struct RecordButterflyJuncs
1758 {
1759 void record(uint32_t ref_id,
1760 const vector<pair<size_t, DnaSpliceStrings> >& all_left_sites,
1761 const vector<pair<size_t, DnaSpliceStrings> >& all_right_sites,
1762 bool antisense,
1763 PotentialJuncs& juncs,
1764 int min_intron,
1765 int max_intron,
1766 size_t max_juncs,
1767 size_t half_splice_mer_len)
1768 {
1769 size_t key_length = 2 * half_splice_mer_len;
1770 size_t extension_length = butterfly_overhang;
1771 uint64_t bottom_bit_mask = ~(0xFFFFFFFFFFFFFFFFull << (key_length<<1));
1772 uint64_t top_bit_mask = ~(0xFFFFFFFFFFFFFFFFull >> (key_length<<1));
1773
1774 if (all_left_sites.empty() || all_right_sites.empty())
1775 return;
1776
1777 size_t last_site = max(all_left_sites.back().first,
1778 all_right_sites.back().first);
1779
1780 size_t curr_left_site = 0;
1781 size_t curr_right_site = 0;
1782 for (size_t window_left_edge = 0;
1783 window_left_edge < last_site;
1784 window_left_edge += max_intron)
1785 {
1786 //fprintf(stderr, "\twindow %lu - %lu\n", window_left_edge, window_left_edge + 2 * max_intron);
1787 vector<pair<size_t, DnaSpliceStrings> > left_sites;
1788 vector<pair<size_t, DnaSpliceStrings> > right_sites;
1789
1790 while(curr_left_site < all_left_sites.size() &&
1791 all_left_sites[curr_left_site].first < window_left_edge)
1792 {
1793 curr_left_site++;
1794 }
1795
1796 while(curr_right_site < all_right_sites.size() &&
1797 all_right_sites[curr_right_site].first < window_left_edge)
1798 {
1799 curr_right_site++;
1800 }
1801
1802 for (size_t ls = curr_left_site; ls < all_left_sites.size(); ++ls)
1803 {
1804 if (all_left_sites[ls].first < window_left_edge + 2 * max_intron)
1805 {
1806 left_sites.push_back(all_left_sites[ls]);
1807 }
1808 }
1809
1810 for (size_t rs = curr_right_site; rs < all_right_sites.size(); ++rs)
1811 {
1812 if (all_right_sites[rs].first < window_left_edge + 2 * max_intron)
1813 {
1814 right_sites.push_back(all_right_sites[rs]);
1815 }
1816 }
1817
1818 vector<ButterflyKey> left_keys;
1819 for (size_t L = 0; L < left_sites.size(); ++L)
1820 {
1821 uint64_t fwd_upstream_dna_str = left_sites[L].second.fwd_string;
1822 uint64_t fwd_upstream_key = fwd_upstream_dna_str & bottom_bit_mask;
1823
1824 assert (fwd_upstream_key < extensions.size());
1825
1826 vector<MerExtension>& fwd_exts = extensions[fwd_upstream_key];
1827 for (size_t i = 0; i < fwd_exts.size(); ++i)
1828 {
1829 const MerExtension& ext = fwd_exts[i];
1830 if (ext.right_ext_len < extension_length)
1831 continue;
1832
1833 /*
1834 < f_u_key ><ext.right>
1835 NNNNNNNNNN GT
1836 */
1837
1838 // take the top bits of the right extension
1839 uint64_t key = ext.right_dna_str >> ((ext.right_ext_len - extension_length) << 1);
1840
1841 // and the bottom bits of the site key
1842 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (extension_length << 1));
1843 uint64_t top_half = fwd_upstream_key & mask;
1844
1845 // and cat them together
1846 key |= (top_half << (extension_length << 1));
1847 left_keys.push_back(ButterflyKey((uint32_t)left_sites[L].first, key));
1848 }
1849
1850 uint64_t rev_upstream_dna_str = left_sites[L].second.rev_string;
1851 uint64_t rev_upstream_key = (rev_upstream_dna_str & top_bit_mask) >> (64 - (key_length<<1));
1852
1853 assert (rev_upstream_key < extensions.size());
1854
1855 vector<MerExtension>& rev_exts = extensions[rev_upstream_key];
1856 for (size_t i = 0; i < rev_exts.size(); ++i)
1857 {
1858 const MerExtension& ext = rev_exts[i];
1859 if (ext.left_ext_len < extension_length)
1860 continue;
1861
1862 /*
1863 < r_u_key ><ext.left>
1864 NNNNNNNNNN GT
1865 */
1866
1867 // reverse complement the left extension, and we will need
1868 // what were the bottom bits. these become the top bits in the
1869 // rc.
1870 uint64_t ext_str = color ? rc_color_str(ext.left_dna_str) : rc_dna_str(ext.left_dna_str);
1871 ext_str >>= 64 - (ext.left_ext_len << 1);
1872
1873 // now take the top bits of the rc, make them the bottom of
1874 // the key
1875 uint64_t key = ext_str >> ((ext.left_ext_len - extension_length) << 1);
1876
1877 // now add in the seed key bottom bits, making them the top of
1878 // the key
1879 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (extension_length << 1));
1880 uint64_t top_half = fwd_upstream_key & mask;
1881 key |= (top_half << (extension_length << 1));
1882 left_keys.push_back(ButterflyKey((uint32_t)left_sites[L].first, key));
1883 }
1884 }
1885 sort (left_keys.begin(), left_keys.end());
1886 vector<ButterflyKey>::iterator new_end = unique(left_keys.begin(), left_keys.end());
1887 left_keys.erase(new_end, left_keys.end());
1888
1889 vector<ButterflyKey> right_keys;
1890 for (size_t R = 0; R < right_sites.size(); ++R)
1891 {
1892 uint64_t fwd_downstream_dna_str = right_sites[R].second.fwd_string;
1893 uint64_t fwd_downstream_key = (fwd_downstream_dna_str & top_bit_mask) >> (64 - (key_length<<1));
1894
1895 assert (fwd_downstream_key < extensions.size());
1896
1897 vector<uint64_t> fwd_downstream_keys;
1898 if (color)
1899 {
1900 for(size_t color_value = 0; color_value < 4; ++color_value)
1901 {
1902 uint64_t tmp_key = (fwd_downstream_key << 2) >> 2 | (color_value << ((key_length - 1) << 1));
1903 fwd_downstream_keys.push_back(tmp_key);
1904 }
1905 }
1906 else
1907 {
1908 fwd_downstream_keys.push_back(fwd_downstream_key);
1909 }
1910
1911 for(size_t key = 0; key < fwd_downstream_keys.size(); ++key)
1912 {
1913 uint64_t tmp_fwd_downstream_key = fwd_downstream_keys[key];
1914 vector<MerExtension>& fwd_exts = extensions[tmp_fwd_downstream_key];
1915 for (size_t i = 0; i < fwd_exts.size(); ++i)
1916 {
1917 const MerExtension& ext = fwd_exts[i];
1918 if (ext.left_ext_len < extension_length)
1919 continue;
1920
1921 /*
1922 <ext.left>< f_d_key >
1923 AG NNNNNNNNNN
1924 */
1925
1926 // take the bottom bits of the left extension, making them the
1927 // top of the key.
1928 uint64_t mask = ~(0xFFFFFFFFFFFFFFFFull << (extension_length << 1));
1929 uint64_t key = (ext.left_dna_str & mask) << (extension_length << 1);
1930
1931 // add in the top bits of the seed key, making them the bottom bits
1932 // of the key.
1933 uint64_t bottom_half = (tmp_fwd_downstream_key >> ((key_length - extension_length) << 1));
1934 key |= bottom_half;
1935 right_keys.push_back(ButterflyKey((uint32_t)right_sites[R].first, key));
1936 }
1937 }
1938
1939 uint64_t rev_downstream_dna_str = right_sites[R].second.rev_string;
1940 uint64_t rev_downstream_key = rev_downstream_dna_str & bottom_bit_mask;
1941
1942 assert (rev_downstream_key < extensions.size());
1943
1944 vector<uint64_t> rev_downstream_keys;
1945 if (color)
1946 {
1947 for(size_t color_value = 0; color_value < 4; ++color_value)
1948 {
1949 uint64_t tmp_key = (rev_downstream_key >> 2) << 2 | color_value;
1950 rev_downstream_keys.push_back(tmp_key);
1951 }
1952 }
1953 else
1954 {
1955 rev_downstream_keys.push_back(rev_downstream_key);
1956 }
1957
1958 for(size_t key = 0; key < rev_downstream_keys.size(); ++key)
1959 {
1960 uint64_t tmp_rev_downstream_key = rev_downstream_keys[key];
1961 uint64_t tmp_fwd_downstream_key = fwd_downstream_key;
1962 if (color)
1963 {
1964 tmp_fwd_downstream_key = rc_color_str(tmp_rev_downstream_key) >> (64 - (key_length << 1));
1965 }
1966
1967 vector<MerExtension>& rev_exts = extensions[tmp_rev_downstream_key];
1968 for (size_t i = 0; i < rev_exts.size(); ++i)
1969 {
1970 const MerExtension& ext = rev_exts[i];
1971 if (ext.right_ext_len < extension_length)
1972 continue;
1973
1974 /*
1975 <ext.right>< r_d_key >
1976 AG NNNNNNNNNN
1977 */
1978
1979 // reverse complement the right_extension. we want the
1980 // top bits of the extension, but these become the bottom bits
1981 // under the rc.
1982 uint64_t ext_str = color ? rc_color_str(ext.right_dna_str) : rc_dna_str(ext.right_dna_str);
1983 ext_str >>= 64 - (ext.right_ext_len << 1);
1984
1985 // take the bottom bits of the rc and make it the top of the key
1986 uint64_t key = ext_str << (extension_length << 1);
1987
1988 // take the top bits of the seed key and make them the bottom
1989 // of the key.
1990 uint64_t bottom_half = (tmp_fwd_downstream_key >> ((key_length - extension_length) << 1));
1991 key |= bottom_half;
1992
1993 right_keys.push_back(ButterflyKey((uint32_t)right_sites[R].first, key));
1994 }
1995 }
1996 }
1997
1998 sort (right_keys.begin(), right_keys.end());
1999 new_end = unique(right_keys.begin(), right_keys.end());
2000 right_keys.erase(new_end, right_keys.end());
2001
2002 size_t lk = 0;
2003 size_t rk = 0;
2004
2005 while (lk < left_keys.size() && rk < right_keys.size())
2006 {
2007 while (lk < left_keys.size() &&
2008 left_keys[lk].key < right_keys[rk].key) { ++lk; }
2009
2010 if (lk == left_keys.size())
2011 break;
2012
2013 while (rk < right_keys.size() &&
2014 right_keys[rk].key < left_keys[lk].key) { ++rk; }
2015
2016 if (rk == right_keys.size())
2017 break;
2018
2019 if (lk < left_keys.size() && rk < right_keys.size() &&
2020 right_keys[rk].key == left_keys[lk].key)
2021 {
2022
2023 size_t k = right_keys[rk].key;
2024 size_t lk_end = lk;
2025 size_t rk_end = rk;
2026 while (rk_end < right_keys.size() && right_keys[rk_end].key == k) {++rk_end;}
2027 while (lk_end < left_keys.size() && left_keys[lk_end].key == k) {++lk_end;}
2028
2029 size_t tmp_lk = lk;
2030
2031 while (tmp_lk < lk_end)
2032 {
2033 size_t tmp_rk = rk;
2034 while (tmp_rk < rk_end)
2035 {
2036 int donor = (int)left_keys[tmp_lk].pos - 1;
2037 int acceptor = (int)right_keys[tmp_rk].pos + 2;
2038
2039 if (acceptor - donor > min_intron && acceptor - donor < max_intron)
2040 {
2041 Junction j(ref_id,
2042 donor,
2043 acceptor,
2044 antisense,
2045 acceptor - donor); // just prefer shorter introns
2046 juncs.insert(j);
2047 if (juncs.size() > max_juncs)
2048 {
2049 juncs.erase(*(juncs.rbegin()));
2050 }
2051 }
2052 ++tmp_rk;
2053 }
2054
2055 ++tmp_lk;
2056 }
2057
2058 lk = lk_end;
2059 rk = rk_end;
2060 }
2061 }
2062 }
2063 }
2064 };
2065
2066 template <class JunctionRecorder>
2067 void juncs_from_ref_segs(RefSequenceTable& rt,
2068 vector<RefSeg>& expected_don_acc_windows,
2069 PotentialJuncs& juncs,
2070 const DnaString& donor_dinuc,
2071 const DnaString& acceptor_dinuc,
2072 int max_intron,
2073 int min_intron,
2074 size_t max_juncs,
2075 bool talkative,
2076 size_t half_splice_mer_len)
2077 {
2078
2079 typedef map<uint32_t, IntronMotifs> MotifMap;
2080
2081 MotifMap ims;
2082
2083 seqan::DnaStringReverseComplement rev_donor_dinuc(donor_dinuc);
2084 seqan::DnaStringReverseComplement rev_acceptor_dinuc(acceptor_dinuc);
2085
2086 if (talkative)
2087 fprintf(stderr, "Collecting potential splice sites in islands\n");
2088
2089 //
2090 bool all_both = true;
2091 for (size_t r = 0; r < expected_don_acc_windows.size(); ++r)
2092 {
2093 const RefSeg& seg = expected_don_acc_windows[r];
2094
2095 if (seg.points_where != POINT_DIR_BOTH)
2096 all_both = false;
2097
2098 RefSequenceTable::Sequence* ref_str = rt.get_seq(seg.ref_id);
2099
2100 if (!ref_str)
2101 continue;
2102
2103 bool skip_fwd = false;
2104 bool skip_rev = false;
2105
2106 if (library_type == FR_FIRSTSTRAND)
2107 {
2108 if (seg.read == READ_LEFT)
2109 {
2110 if (seg.antisense) skip_rev = true;
2111 else skip_fwd = true;
2112 }
2113 else if(seg.read == READ_RIGHT)
2114 {
2115 if (seg.antisense) skip_fwd = true;
2116 else skip_rev = true;
2117 }
2118 }
2119
2120 if (library_type == FR_SECONDSTRAND)
2121 {
2122 if (seg.read == READ_LEFT)
2123 {
2124 if (seg.antisense) skip_fwd = true;
2125 else skip_rev = true;
2126 }
2127 else if(seg.read == READ_RIGHT)
2128 {
2129 if (seg.antisense) skip_rev = true;
2130 else skip_fwd = true;
2131 }
2132 }
2133
2134 pair<map<uint32_t, IntronMotifs>::iterator, bool> ret =
2135 ims.insert(make_pair(seg.ref_id, IntronMotifs(seg.ref_id)));
2136
2137 IntronMotifs& motifs = ret.first->second;
2138
2139 int left_color_offset = 0, right_color_offset = 0;
2140 if (color)
2141 {
2142 if (seg.antisense)
2143 right_color_offset = 1;
2144 else
2145 left_color_offset = -1;
2146 }
2147
2148 if (seg.left + left_color_offset < 0 || seg.right + right_color_offset >= (int)length(*ref_str) - 1)
2149 continue;
2150
2151 DnaString org_seg_str = seqan::infix(*ref_str, seg.left + left_color_offset, seg.right + right_color_offset);
2152 String<char> seg_str;
2153 assign(seg_str, org_seg_str);
2154
2155 #ifdef B_DEBUG2
2156 cout << "coord: " << seg.left << " " << seg.right << endl;
2157 //<< "seg_str: " << seg_str << endl;
2158 #endif
2159 if (color)
2160 {
2161 bool remove_primer = true;
2162 seg_str = convert_bp_to_color(org_seg_str, remove_primer);
2163 }
2164
2165 size_t to = 0;
2166 size_t seg_len = length(seg_str);
2167 size_t read_len = seg.support_read.size();
2168 if (read_len <= 0)
2169 to = seg_len - 2;
2170 else
2171 to = read_len - 2;
2172
2173 const size_t max_segment_len = 128;
2174 uint8_t left_mismatches[max_segment_len] = {0,};
2175 uint8_t right_mismatches[max_segment_len] = {0,};
2176
2177 if (max_segment_len < read_len)
2178 {
2179 fprintf(stderr, "Error: read len(%d) is greater than %d\n", (int)read_len, (int)max_segment_len);
2180 exit(-1);
2181 }
2182
2183 if (read_len == seg_len || seg.points_where == POINT_DIR_BOTH)
2184 {
2185 if (seg.points_where == POINT_DIR_RIGHT || seg.points_where == POINT_DIR_BOTH)
2186 {
2187 size_t num_mismatches = 0;
2188 for (size_t i = 0; i < read_len - 1; ++i)
2189 {
2190 if (seg_str[i] != seg.support_read[i])
2191 ++num_mismatches;
2192
2193 left_mismatches[i] = num_mismatches;
2194 if (num_mismatches > 2)
2195 {
2196 to = i;
2197 break;
2198 }
2199 }
2200 }
2201
2202 if (seg.points_where == POINT_DIR_LEFT || seg.points_where == POINT_DIR_BOTH)
2203 {
2204 size_t num_mismatches = 0;
2205 for (int i = read_len - 1; i >= 0; --i)
2206 {
2207 if (seg_str[i + (seg_len - read_len)] != seg.support_read[i])
2208 ++num_mismatches;
2209
2210 right_mismatches[i] = num_mismatches;
2211 if (num_mismatches > 2)
2212 break;
2213 }
2214 }
2215
2216 // daehwan
2217 #ifdef B_DEBUG2
2218 cout << "antisense: " << (seg.antisense ? "-" : "+") << endl
2219 << seqan::infix(seg_str, 0, segment_length) << " "
2220 << seqan::infix(seg_str, length(seg_str) - segment_length, length(seg_str)) << endl
2221 << seg.support_read << endl
2222 << 0 << " - " << to << endl;
2223
2224 for (unsigned int i = 0; i < read_len; ++i)
2225 cout << (int)left_mismatches[i];
2226 cout << "\t";
2227
2228 for (unsigned int i = 0; i < read_len; ++i)
2229 cout << (int)right_mismatches[i];
2230 cout << endl;
2231 #endif
2232 }
2233
2234 if (seg.points_where == POINT_DIR_BOTH)
2235 {
2236 for (size_t i = 0; i <= to; ++i)
2237 {
2238 // Look at a slice of the reference without creating a copy.
2239 DnaString curr = seqan::infix(org_seg_str, i - left_color_offset, i + 2 - left_color_offset);
2240
2241 if ((!skip_fwd && curr == donor_dinuc) || (!skip_rev && curr == rev_acceptor_dinuc))
2242 {
2243 DnaString partner;
2244 if (curr == donor_dinuc)
2245 partner = acceptor_dinuc;
2246 else
2247 partner = rev_donor_dinuc;
2248
2249 uint8_t left_mismatch = 0;
2250 if (i > 0)
2251 left_mismatch = left_mismatches[i-1];
2252
2253 // daehwan
2254 #ifdef B_DEBUG2
2255 cout << "i: " << i << endl
2256 << "mismatches: " << (int)left_mismatch
2257 << " - " << (int)right_mismatches[i] << endl;
2258 #endif
2259 if (left_mismatch + right_mismatches[i] <= 2)
2260 {
2261 size_t pos = length(seg_str) - (read_len - i) - 2;
2262 if (partner == seqan::infix(org_seg_str, pos - left_color_offset, pos + 2 - left_color_offset))
2263 {
2264 if (curr == donor_dinuc)
2265 {
2266 motifs.fwd_donors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2267 motifs.fwd_acceptors.push_back(make_pair(seg.left + pos, DnaSpliceStrings(0,0)));
2268 }
2269 else
2270 {
2271 motifs.rev_acceptors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2272 motifs.rev_donors.push_back(make_pair(seg.left + pos, DnaSpliceStrings(0,0)));
2273 }
2274
2275 // daehwan
2276 #ifdef B_DEBUG2
2277 cout << curr << ":" << partner << " added" << endl;
2278 #endif
2279 }
2280 }
2281 }
2282 }
2283 }
2284
2285 else if (seg.points_where == POINT_DIR_LEFT)
2286 {
2287 // A ref segment that "points left" is one that was flanked
2288 // on the right by a partial bowtie hit, indicating that we
2289 // should be looking for an intron to the left of the hit
2290 // In this seg, that means either an "AG" or an "AC"
2291 for (size_t i = 0; i <= to; ++i)
2292 {
2293 // Look at a slice of the reference without creating a copy.
2294 DnaString curr = seqan::infix(org_seg_str, i - left_color_offset, i + 2 - left_color_offset);
2295 if (curr == acceptor_dinuc && !skip_fwd)
2296 motifs.fwd_acceptors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2297 else if (curr == rev_donor_dinuc && !skip_rev)
2298 motifs.rev_donors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2299 }
2300 }
2301 else
2302 {
2303 // A right pointing ref seg wants either a "GT" or a "CT"
2304 for (size_t i = 0; i <= to; ++i)
2305 {
2306 // Look at a slice of the reference without creating a copy.
2307 DnaString curr = seqan::infix(org_seg_str, i - left_color_offset, i + 2 - left_color_offset);
2308 if (curr == donor_dinuc && !skip_fwd)
2309 motifs.fwd_donors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2310 else if (curr == rev_acceptor_dinuc && !skip_rev)
2311 motifs.rev_acceptors.push_back(make_pair(seg.left + i, DnaSpliceStrings(0,0)));
2312 }
2313 }
2314 }
2315
2316 if (talkative)
2317 {
2318 fprintf(stderr, "reporting synthetic splice junctions...\n");
2319 }
2320 for (MotifMap::iterator motif_itr = ims.begin(); motif_itr != ims.end(); ++motif_itr)
2321 {
2322 uint32_t ref_id = motif_itr->first;
2323
2324 RefSequenceTable::Sequence* ref_str = rt.get_seq(ref_id);
2325
2326 if (!ref_str)
2327 err_die("Error: couldn't get ref string for %u\n", ref_id);
2328
2329 if (talkative)
2330 fprintf(stderr, "Examining donor-acceptor pairings in %s\n", rt.get_name(ref_id));
2331 IntronMotifs& motifs = motif_itr->second;
2332
2333 if (!all_both)
2334 motifs.unique();
2335
2336 //motifs.attach_mer_counts(*ref_str);
2337 motifs.attach_mers(*ref_str);
2338
2339 vector<pair<size_t, DnaSpliceStrings> >& fwd_donors = motifs.fwd_donors;
2340 vector<pair<size_t, DnaSpliceStrings> >& fwd_acceptors = motifs.fwd_acceptors;
2341 vector<pair<size_t, DnaSpliceStrings> >& rev_acceptors = motifs.rev_acceptors;
2342 vector<pair<size_t, DnaSpliceStrings> >& rev_donors = motifs.rev_donors;
2343
2344 //const char* ref_name = rt.get_name(motif_itr->second.ref_id);
2345
2346 JunctionRecorder recorder;
2347 recorder.record(ref_id,
2348 fwd_donors,
2349 fwd_acceptors,
2350 false,
2351 juncs,
2352 min_intron,
2353 max_intron,
2354 max_juncs,
2355 half_splice_mer_len);
2356
2357 recorder.record(ref_id,
2358 rev_acceptors,
2359 rev_donors,
2360 true,
2361 juncs,
2362 min_intron,
2363 max_intron,
2364 max_juncs,
2365 half_splice_mer_len);
2366 }
2367 //fprintf(stderr, "Found %d total splices\n", num_juncs);
2368 }
2369
2370
2371 /**
2372 * Performs a simple global alignment.
2373 * This function will perform a restricted global alignment. The restriction is that only one insertion/deletion
2374 * is allowed in the final alignment.
2375 * @param shortSequence The short sequence to be aligned.
2376 * @param leftReference The left end of the reference to be aligned, must be exactly as long as the short sequence
2377 * @param leftReference The right end of the reference to be aligned, must be exactly as long as the short sequenc
2378 * @param insertPosition This will contain the 0-based index of the first position in the shorter sequence after the insertion/deletion. A value of -1 indicates that the alignment could not be performed.
2379 * @param mismatchCount This will contain the number of mismatches in the optimal restricted global alignment. The number and length of insertions/deletions is fixed. A value of -1 indicates that the alignment could not be performed.
2380 */
2381 void simpleSplitAlignment(seqan::String<char>& shorterSequence,
2382 seqan::String<char>& leftReference,
2383 seqan::String<char>& rightReference,
2384 vector<int>& insertPositions,
2385 int& mismatchCount)
2386 {
2387 /*
2388 * In this restricted alignment, we already know the length and number (1) of insertions/deletions.
2389 * We simply need to know where to put it. Do a linear scan through sequence counting the number of induced
2390 * errors before and after putting the insertion at each sequence.
2391 */
2392
2393 /*
2394 * Note that we could have a case, where both the alignment and the read have the unknonw
2395 * nucleotide ('N') and we don't want to reward cases where these characters match
2396 */
2397 vector<unsigned short> beforeErrors(seqan::length(shorterSequence));
2398 for(int idx = seqan::length(shorterSequence) - 1; idx >= 0; idx -= 1){
2399 unsigned short prevCount = 0;
2400 /*
2401 * We guarentee idx >= 0, so cast to hide the compiler
2402 * warning here
2403 */
2404 if(((size_t)idx) < seqan::length(shorterSequence) - 1){
2405 prevCount = beforeErrors.at(idx + 1);
2406 }
2407 unsigned short currentMismatch = 0;
2408 if(rightReference[idx] == 'N' || shorterSequence[idx] == 'N' || rightReference[idx] != shorterSequence[idx]){
2409 currentMismatch = 1;
2410 }
2411 beforeErrors.at(idx) = prevCount + currentMismatch;
2412 }
2413
2414 vector<unsigned short> afterErrors(seqan::length(shorterSequence));
2415 for(size_t idx = 0; idx < seqan::length(shorterSequence) ; idx += 1){
2416 unsigned short prevCount = 0;
2417 if(idx > 0){
2418 prevCount = afterErrors.at(idx - 1);
2419 }
2420 unsigned short currentMismatch = 0;
2421 if(leftReference[idx] == 'N' || shorterSequence[idx] == 'N' || leftReference[idx] != shorterSequence[idx]){
2422 currentMismatch = 1;
2423 }
2424 afterErrors.at(idx) = prevCount + currentMismatch;
2425 }
2426
2427 mismatchCount = seqan::length(shorterSequence) + 1;
2428 insertPositions.clear();
2429
2430 /*
2431 * Technically, we could allow the insert position to be at the end or beginning of the sequence,
2432 * but we are disallowing it here
2433 */
2434 for(size_t currentInsertPosition = 1; currentInsertPosition < seqan::length(shorterSequence); currentInsertPosition += 1){
2435 size_t errorCount = beforeErrors.at(currentInsertPosition) + afterErrors.at(currentInsertPosition - 1);
2436
2437 if(((int)errorCount) < mismatchCount){
2438 mismatchCount = (int)errorCount;
2439 insertPositions.clear();
2440 insertPositions.push_back(currentInsertPosition);
2441 }
2442 else if ((int)errorCount == mismatchCount) {
2443 insertPositions.push_back(currentInsertPosition);
2444 }
2445 }
2446 return;
2447 }
2448
2449
2450 /**
2451 * Try to detect a small insertion.
2452 * This code will try to identify a small insertion based on the ungapped alignment of two neighboring
2453 * segments. The general idea is to try to realign the local region, and see if we can reduce the
2454 * number of errors. Note that the function makes use of the global parameter "max_insertion_length" to limit the maximum
2455 * size of a detected insertion.
2456 * @param rt Sequence table used to lookup sequence information
2457 * @param leftHit The alignment of the left segment. Note that the leftHit must have a left position less than that of the right hit.
2458 * @param rightHit The alignment of the right segment. Note that the rightHit must have a left position greater than that of the left hit.
2459 * @param insertions If an insertion is sucessfully detected, it will be added to this set
2460 */
2461 void detect_small_insertion(RefSequenceTable& rt,
2462 seqan::String<char>& read_sequence,
2463 BowtieHit& leftHit,
2464 BowtieHit& rightHit,
2465 std::set<Insertion>& insertions)
2466 {
2467
2468 RefSequenceTable::Sequence* ref_str = rt.get_seq(leftHit.ref_id());
2469 if(!ref_str){
2470 fprintf(stderr, "Error accessing sequence record\n");
2471 }else{
2472 size_t read_length = seqan::length(read_sequence);
2473 int begin_offset = 0;
2474 int end_offset = 0;
2475
2476 if(color){
2477 if(leftHit.antisense_align())
2478 end_offset = 1;
2479 else
2480 begin_offset = -1;
2481 }
2482
2483 if(leftHit.left() + begin_offset < 0)
2484 return;
2485
2486 /*
2487 * If there is in fact a deletion, we are expecting the genomic sequence to be shorter than
2488 * the actual read sequence
2489 */
2490 int discrepancy = read_length - (rightHit.right() - leftHit.left());
2491 DnaString genomic_sequence_temp = seqan::infix(*ref_str, leftHit.left() + begin_offset, rightHit.right() + end_offset);
2492 String<char> genomic_sequence;
2493 assign(genomic_sequence, genomic_sequence_temp);
2494
2495 if(color)
2496 genomic_sequence = convert_bp_to_color(genomic_sequence, true);
2497
2498 String<char> left_read_sequence = seqan::infix(read_sequence, 0, 0 + seqan::length(genomic_sequence));
2499 String<char> right_read_sequence = seqan::infix(read_sequence, read_length - seqan::length(genomic_sequence), read_length);
2500
2501 vector<int> bestInsertPositions;
2502 int minErrors = -1;
2503 simpleSplitAlignment(genomic_sequence, left_read_sequence, right_read_sequence, bestInsertPositions, minErrors);
2504
2505 assert (bestInsertPositions.size() > 0);
2506 int bestInsertPosition = bestInsertPositions[0];
2507
2508
2509 /*
2510 * Need to decide if the insertion is suitably improves the alignment
2511 */
2512 /*
2513 * If these two segments anchors constitue the entire read, then we require
2514 * that this alignment actually improve the number of errors observed in the alignment
2515 * Otherwise, it is OK as long as the number of errors doesn't increase.
2516 */
2517 int adjustment = 0;
2518 if(leftHit.read_len() + rightHit.read_len() >= (int)read_length){
2519 adjustment = -1;
2520 }
2521 if(minErrors <= (leftHit.edit_dist()+rightHit.edit_dist()+adjustment)){
2522 String<char> insertedSequence = seqan::infix(left_read_sequence, bestInsertPosition, bestInsertPosition + discrepancy);
2523 if(color)
2524 insertedSequence = convert_color_to_bp(genomic_sequence_temp[bestInsertPosition - begin_offset + end_offset - 1], insertedSequence);
2525
2526 insertions.insert(Insertion(leftHit.ref_id(),
2527 leftHit.left() + bestInsertPosition - 1 + end_offset,
2528 seqan::toCString(insertedSequence)));
2529 }
2530 }
2531 return;
2532 }
2533
2534 /**
2535 * Try to detect a small deletion.
2536 * This code will try to identify a small deletion based on the ungapped alignment of two neighboring
2537 * segments. The general idea is to try to realign the local region, and see if we can reduce the
2538 * number of errors. Note that the function makes use of the global parameter "max_deletion_length" to limit the maximum
2539 * size of a detected deletion.
2540 * @param rt Sequence table used to lookup sequence information
2541 * @param leftHit The alignment of the left segment. Note that the leftHit must have a left position less than that of the right hit.
2542 * @param rightHit The alignment of the right segment. Note that the rightHit must have a left position greater than that of the left hit.
2543 * @param deletion_juncs If a deletion is sucessfully detected, it will be added to this set
2544 */
2545 void detect_small_deletion(RefSequenceTable& rt,
2546 seqan::String<char>& read_sequence,
2547 BowtieHit& leftHit,
2548 BowtieHit& rightHit,
2549 std::set<Deletion>& deletions)
2550 {
2551
2552 RefSequenceTable::Sequence* ref_str = rt.get_seq(leftHit.ref_id());
2553 if(!ref_str){
2554 fprintf(stderr, "Error accessing sequence record\n");
2555 }else{
2556 int begin_offset = 0;
2557 int end_offset = 0;
2558
2559 if(color){
2560 if(leftHit.antisense_align())
2561 end_offset = 1;
2562 else
2563 begin_offset = -1;
2564 }
2565
2566 if(leftHit.left() + begin_offset < 0)
2567 return;
2568
2569 size_t read_length = seqan::length(read_sequence);
2570 if(rightHit.right() + read_length + begin_offset < 0 )
2571 return;
2572
2573 int discrepancy = (rightHit.right() - leftHit.left()) - read_length;
2574 Dna5String leftGenomicSequence_temp = seqan::infix(*ref_str, leftHit.left() + begin_offset, leftHit.left() + read_length + end_offset);
2575 Dna5String rightGenomicSequence_temp = seqan::infix(*ref_str, rightHit.right() - read_length + begin_offset, rightHit.right() + end_offset);
2576
2577 if (length(leftGenomicSequence_temp) < read_length || length(rightGenomicSequence_temp) < read_length)
2578 return;
2579
2580 String<char> leftGenomicSequence;
2581 assign(leftGenomicSequence, leftGenomicSequence_temp);
2582
2583 String<char> rightGenomicSequence;
2584 assign(rightGenomicSequence, rightGenomicSequence_temp);
2585
2586 if(color){
2587 leftGenomicSequence = convert_bp_to_color(leftGenomicSequence, true);
2588 rightGenomicSequence = convert_bp_to_color(rightGenomicSequence, true);
2589 }
2590
2591 vector<int> bestInsertPositions;
2592 int minErrors = -1;
2593 simpleSplitAlignment(read_sequence, leftGenomicSequence, rightGenomicSequence, bestInsertPositions, minErrors);
2594
2595 assert (bestInsertPositions.size() > 0);
2596 int bestInsertPosition = bestInsertPositions[0];
2597
2598 /*
2599 * Need to decide if the deletion is suitably improves the alignment
2600 */
2601 int adjustment = 0;
2602
2603 /*
2604 * If these two segments anchors constitue the entire read, then we require
2605 * that this alignment actually improve the number of errors observed in the alignment
2606 * Otherwise, it is OK as long as the number of errors doesn't increase.
2607 */
2608 if(leftHit.read_len() + rightHit.read_len() >= (int)read_length){
2609 adjustment = -1;
2610 }
2611 if(minErrors <= (leftHit.edit_dist()+rightHit.edit_dist()+adjustment)){
2612 deletions.insert(Deletion(leftHit.ref_id(),
2613 leftHit.left() + bestInsertPosition - 1 + end_offset,
2614 leftHit.left() + bestInsertPosition + discrepancy + end_offset,
2615 false));
2616 }
2617 }
2618 return;
2619 }
2620
2621 int difference(const String<char>& first, const String<char>& second)
2622 {
2623 int len = length(first);
2624 if (len != length(second))
2625 return 0;
2626
2627 int min_value = 10000;
2628 short value1[1024] = {0,};
2629 short value2[1024] = {0,};
2630
2631 short* curr = value1;
2632 short* prev = value2;
2633
2634 for (int j = 0; j < len; ++j)
2635 {
2636 for (int i = 0; i < len; ++i)
2637 {
2638 int value = 10000;
2639 int match = first[i] == second[j] ? 0 : 1;
2640
2641 // right
2642 if (i == 0)
2643 value = j * 2 + match;
2644 else if (j > 0)
2645 value = prev[i] + 2;
2646
2647 int temp_value = 10000;
2648
2649 // down
2650 if (j == 0)
2651 temp_value = i * 2 + match;
2652 else if (i > 0)
2653 temp_value = curr[i-1] + 2;
2654
2655 if (temp_value < value)
2656 value = temp_value;
2657
2658 // match
2659 if (i > 0 && j > 0)
2660 temp_value = prev[i-1] + match;
2661
2662 if (temp_value < value)
2663 value = temp_value;
2664
2665 curr[i] = value;
2666
2667 if ((i == len - 1 || j == len - 1) && value < min_value)
2668 min_value = value;
2669 }
2670
2671 short* temp = prev;
2672 prev = curr;
2673 curr = temp;
2674 }
2675
2676 return min_value;
2677 }
2678
2679 void detect_fusion(RefSequenceTable& rt,
2680 seqan::String<char>& read_sequence,
2681 BowtieHit& leftHit,
2682 BowtieHit& rightHit,
2683 FusionSimpleSet& fusions,
2684 uint32_t dir)
2685 {
2686 RefSequenceTable::Sequence* left_ref_str = rt.get_seq(leftHit.ref_id());
2687 RefSequenceTable::Sequence* right_ref_str = rt.get_seq(rightHit.ref_id());
2688
2689 size_t read_length = seqan::length(read_sequence);
2690
2691 Dna5String leftGenomicSequence_temp;
2692 Dna5String rightGenomicSequence_temp;
2693
2694 if (dir == FUSION_FF || dir == FUSION_FR)
2695 {
2696 if (leftHit.left() + read_length > seqan::length(*left_ref_str))
2697 return;
2698
2699 leftGenomicSequence_temp = seqan::infix(*left_ref_str, leftHit.left(), leftHit.left() + read_length);
2700 }
2701 else
2702 {
2703 if (leftHit.right() < read_length)
2704 return;
2705
2706 leftGenomicSequence_temp = seqan::infix(*left_ref_str, leftHit.right() - read_length, leftHit.right());
2707 seqan::convertInPlace(leftGenomicSequence_temp, seqan::FunctorComplement<Dna>());
2708 seqan::reverseInPlace(leftGenomicSequence_temp);
2709 }
2710
2711 if (dir == FUSION_FF || dir == FUSION_RF)
2712 {
2713 if (rightHit.right() < read_length)
2714 return;
2715
2716 rightGenomicSequence_temp = seqan::infix(*right_ref_str, rightHit.right() - read_length, rightHit.right());
2717 }
2718 else
2719 {
2720 if (rightHit.left() + read_length > seqan::length(*right_ref_str))
2721 return;
2722
2723 rightGenomicSequence_temp = seqan::infix(*right_ref_str, rightHit.left(), rightHit.left() + read_length);
2724 seqan::convertInPlace(rightGenomicSequence_temp, seqan::FunctorComplement<Dna>());
2725 seqan::reverseInPlace(rightGenomicSequence_temp);
2726 }
2727
2728 String<char> leftGenomicSequence;
2729 assign(leftGenomicSequence, leftGenomicSequence_temp);
2730
2731 String<char> rightGenomicSequence;
2732 assign(rightGenomicSequence, rightGenomicSequence_temp);
2733
2734 vector<int> bestInsertPositions;
2735 int minErrors = -1;
2736
2737 simpleSplitAlignment(read_sequence, leftGenomicSequence, rightGenomicSequence, bestInsertPositions, minErrors);
2738
2739 uint32_t total_edit_dist = leftHit.edit_dist() + rightHit.edit_dist();
2740 if (minErrors > total_edit_dist)
2741 return;
2742
2743 // daehwan
2744 if (read_length <= 60)
2745 {
2746 /*
2747 * check if the two contig from two different chromosome are different enough
2748 */
2749 int different_count = min(difference(leftGenomicSequence, read_sequence), difference(rightGenomicSequence, read_sequence));
2750 // different_count = min(different_count, difference(leftGenomicSequence, rightGenomicSequence));
2751 if (different_count < read_length / 6)
2752 return;
2753 }
2754
2755 for (size_t i = 0; i < bestInsertPositions.size(); ++i)
2756 {
2757 int bestInsertPosition = bestInsertPositions[i];
2758
2759 uint32_t left, right;
2760 if (dir == FUSION_FF || dir == FUSION_FR)
2761 left = leftHit.left() + bestInsertPosition - 1;
2762 else
2763 left = leftHit.right() - bestInsertPosition;
2764
2765 if (dir == FUSION_FF || dir == FUSION_RF)
2766 right = rightHit.right() - (read_length - bestInsertPosition);
2767 else
2768 right = rightHit.left() + (read_length - bestInsertPosition) - 1;
2769
2770 uint32_t ref_id1 = leftHit.ref_id();
2771 uint32_t ref_id2 = rightHit.ref_id();
2772
2773 uint32_t temp_dir = dir;
2774 if ((ref_id2 < ref_id1) ||
2775 (ref_id1 == ref_id2 && left > right))
2776 {
2777 uint32_t temp = ref_id1;
2778 ref_id1 = ref_id2;
2779 ref_id2 = temp;
2780
2781 temp = left;
2782 left = right;
2783 right = temp;
2784
2785 if (dir == FUSION_FF)
2786 temp_dir = FUSION_RR;
2787 }
2788
2789 // daehwan - check
2790 #if B_DEBUG
2791 cout << endl << endl;
2792 cout << "read id: " << leftHit.insert_id() << endl;
2793 cout << "sense: " << (leftHit.antisense_align() ? "-" : "+") << endl;
2794 // cout << "difference: " << different_count << endl;
2795 cout << "left ref_id: " << rt.get_name(leftHit.ref_id()) << "\tright ref_id: " << rt.get_name(rightHit.ref_id()) << endl;
2796 cout << read_sequence << endl;
2797 cout << leftGenomicSequence << "\t" << rightGenomicSequence << endl;
2798 cout << "insertion pos: " << bestInsertPosition << endl;
2799 cout << left << ":" << right << endl;
2800 cout << "errors: " << minErrors << endl;
2801 #endif
2802
2803 Fusion fusion(ref_id1, ref_id2, left, right, temp_dir);
2804 FusionSimpleSet::iterator itr = fusions.find(fusion);
2805 if (itr == fusions.end())
2806 {
2807 FusionSimpleStat simpleStat;
2808 simpleStat.count = 1;
2809 simpleStat.edit_dist = total_edit_dist;
2810 simpleStat.skip = false;
2811 simpleStat.left_coincide_with_splice_junction = false;
2812 simpleStat.right_coincide_with_splice_junction = false;
2813 fusions[fusion] = simpleStat;
2814 }
2815 else
2816 {
2817 itr->second.count += 1;
2818 itr->second.edit_dist = min(itr->second.edit_dist, total_edit_dist);
2819 }
2820 }
2821 }
2822
2823 void find_insertions_and_deletions(RefSequenceTable& rt,
2824 ReadStream& reads_file,
2825 vector<HitsForRead>& hits_for_read,
2826 std::set<Deletion>& deletions,
2827 std::set<Insertion>& insertions){
2828
2829 if(hits_for_read.empty()){
2830 return;
2831 }
2832
2833 size_t last_segment = hits_for_read.size()-1;
2834 size_t first_segment = 0;
2835 if(last_segment == first_segment){
2836 return;
2837 }
2838
2839 /*
2840 * We can check up front whether the first or last element is empty
2841 * and avoid doing any more work. Note that the following code requires
2842 * that there be at least one elment in each
2843 */
2844 if(hits_for_read[first_segment].hits.empty() || hits_for_read[last_segment].hits.empty()){
2845 return;
2846 }
2847
2848 /*
2849 * Need to identify the appropriate insert id for this group of reads
2850 */
2851 Read read;
2852 /*bool got_read = get_read_from_stream(hits_for_read.back().insert_id,
2853 reads_file,
2854 FASTQ,
2855 false,
2856 read);*/
2857 bool got_read = reads_file.getRead(hits_for_read.back().insert_id, read);
2858 if(!got_read){
2859 err_die("Error: could not get read# %d from stream!",
2860 (int)hits_for_read.back().insert_id);
2861 //return;
2862 }
2863
2864 /*
2865 * Work through all combinations of mappings for the first and last segment to see if any are indicative
2866 * of a small insertions or deletion
2867 */
2868 HitsForRead& left_segment_hits = hits_for_read[first_segment];
2869 HitsForRead& right_segment_hits = hits_for_read[last_segment];
2870
2871 /*
2872 * If either of the segment match lists is empty, we could try
2873 * to be smarter and work our way in until we find good a segment
2874 * match; however, won't do that for noe.
2875 */
2876 if(left_segment_hits.hits.empty() || right_segment_hits.hits.empty()){
2877 return;
2878 }
2879
2880 seqan::String<char> fullRead, rcRead;
2881 if(color){
2882 fullRead = read.seq.c_str() + 1;
2883 rcRead = fullRead;
2884 seqan::reverseInPlace(rcRead);
2885 }else{
2886 fullRead = read.seq;
2887 rcRead = read.seq;
2888 seqan::convertInPlace(rcRead, seqan::FunctorComplement<Dna>());
2889 seqan::reverseInPlace(rcRead);
2890 }
2891
2892 size_t read_length = seqan::length(fullRead);
2893 for(size_t left_segment_index = 0; left_segment_index < left_segment_hits.hits.size(); left_segment_index++){
2894 for(size_t right_segment_index = 0; right_segment_index < right_segment_hits.hits.size(); right_segment_index++){
2895 BowtieHit* leftHit = &left_segment_hits.hits[left_segment_index];
2896 BowtieHit* rightHit = &right_segment_hits.hits[right_segment_index];
2897 /*
2898 * Now we have found a pair of segment hits to investigate. Need to ensure
2899 * that
2900 * 1. the alignment orientation is consistent
2901 * 2. the distance separation is in the appropriate range
2902 * 3. Both hits are aligned to the same contig
2903 */
2904 if(leftHit->ref_id() != rightHit->ref_id()){
2905 continue;
2906 }
2907
2908 if(leftHit->antisense_align() != rightHit->antisense_align()){
2909 continue;
2910 }
2911
2912 seqan::String<char>* modifiedRead = &fullRead;
2913 /*
2914 * If we are dealing with an antisense alignment, then the left
2915 * read will actually be on the right, fix this now, to simplify
2916 * the rest of the logic, in addition, we will need to use the reverse
2917 * complement of the read sequence
2918 */
2919 if(leftHit->antisense_align()){
2920 BowtieHit * tmp = leftHit;
2921 leftHit = rightHit;
2922 rightHit = tmp;
2923 modifiedRead = &rcRead;
2924 }
2925
2926 size_t apparent_length = rightHit->right() - leftHit->left();
2927 int length_discrepancy = apparent_length - read_length;
2928 if(length_discrepancy > 0 && length_discrepancy <= (int)max_deletion_length){
2929 /*
2930 * Search for a deletion
2931 */
2932 detect_small_deletion(rt, *modifiedRead, *leftHit, *rightHit, deletions);
2933 }
2934 if(length_discrepancy < 0 && length_discrepancy >= -(int)max_insertion_length){
2935 /*
2936 * Search for an insertion
2937 */
2938 detect_small_insertion(rt, *modifiedRead, *leftHit, *rightHit, insertions);
2939 }
2940 }
2941 }
2942 }
2943
2944 /*
2945 */
2946 int map_read_to_contig(const String<char>& contig, const String<char>& read)
2947 {
2948 int contig_len = length(contig);
2949 int read_len = length(read);
2950 int pos = -1;
2951 int mismatch = 3;
2952
2953 for (int i = 0; i < contig_len - read_len; ++i)
2954 {
2955 int temp_mismatch = 0;
2956 for (int j = 0; j < read_len; ++j)
2957 {
2958 if (contig[i+j] != read[j])
2959 ++temp_mismatch;
2960
2961 if (temp_mismatch >= mismatch)
2962 break;
2963 }
2964
2965 if (temp_mismatch < mismatch)
2966 {
2967 pos = i;
2968 mismatch = temp_mismatch;
2969 }
2970 }
2971
2972 return pos;
2973 }
2974
2975
2976 void find_fusions(RefSequenceTable& rt,
2977 ReadStream& reads_file,
2978 vector<HitsForRead>& hits_for_read,
2979 HitStream& partner_hit_stream,
2980 HitStream& seg_partner_hit_stream,
2981 FusionSimpleSet& fusions,
2982 eREAD read_side)
2983 {
2984 if (hits_for_read.empty())
2985 return;
2986
2987 size_t last_segment = hits_for_read.size() - 1;
2988 while (last_segment > 0)
2989 {
2990 if (!hits_for_read[last_segment].hits.empty())
2991 break;
2992
2993 --last_segment;
2994 }
2995
2996 // daehwan
2997 #if 0
2998 if (last_segment == 0 || hits_for_read[0].hits.empty())
2999 {
3000 vector<BowtieHit>& hits = last_segment == 0 ? hits_for_read[0].hits : hits_for_read[1].hits;
3001
3002 for (size_t i = 0; i < hits.size(); ++i)
3003 {
3004 BowtieHit* hit = &hits[i];
3005
3006 static const uint32_t chr_id1 = rt.get_id("chr2");
3007 static const uint32_t chr_id2 = rt.get_id("chr3");
3008
3009 // KPL-4 PPP1R12A 12:80167343-80329235:-1 SEPT10 2:110300380-110371783:-1
3010 // const uint32_t left1 = 80167343, right1 = 80329235, left2 = 110300380, right2 = 110371783;
3011
3012 // VCaP TIA1 2:70436576-70475792:-1 DIRC2 3:122513642-122599986:1
3013 const uint32_t left1 = 70436576, right1 = 70475792, left2 = 122513642, right2 = 122599986;
3014
3015 if ((hit->ref_id() == chr_id1 && hit->left() >= left1 && hit->left() <= right1) ||
3016 (hit->ref_id() == chr_id2 && hit->left() >= left2 && hit->left() <= right2))
3017 {
3018 cout << hit->insert_id() << endl;
3019 break;
3020 #if 0
3021 cout << "insert id: " << hit->insert_id() << "\t num hits: " << hits.size() << endl
3022 << hit->ref_id() << ":" << (hit->antisense_align() ? "-" : "+") << " " << (int)hit->edit_dist() << endl
3023 << hit->left() << "-" << hit->right() << endl
3024 << hit->seq() << endl << endl;
3025 #endif
3026 }
3027 }
3028 }
3029 #endif
3030
3031
3032 size_t first_segment = 0;
3033
3034 if (last_segment == first_segment &&
3035 (hits_for_read[first_segment].hits.empty() || hits_for_read[first_segment].hits[0].end()))
3036 return;
3037
3038 uint32_t insert_id = hits_for_read[last_segment].insert_id;
3039
3040 HitsForRead partner_hit_group;
3041 uint32_t next_order = partner_hit_stream.next_group_id();
3042
3043 bool has_partner = false;
3044 while (insert_id >= next_order && next_order != 0)
3045 {
3046 partner_hit_stream.next_read_hits(partner_hit_group);
3047 next_order = partner_hit_stream.next_group_id();
3048 }
3049
3050 has_partner = insert_id == partner_hit_group.insert_id;
3051
3052 if (!has_partner)
3053 {
3054 next_order = seg_partner_hit_stream.next_group_id();
3055 while (insert_id >= next_order && next_order != 0)
3056 {
3057 seg_partner_hit_stream.next_read_hits(partner_hit_group);
3058 next_order = seg_partner_hit_stream.next_group_id();
3059 }
3060
3061 has_partner = insert_id == partner_hit_group.insert_id;
3062 }
3063
3064 /*
3065 * Need to identify the appropriate insert id for this group of reads
3066 */
3067
3068
3069 Read read;
3070 bool got_read = reads_file.getRead(hits_for_read[last_segment].insert_id, read);
3071 if (!got_read)
3072 return;
3073
3074 HitsForRead partner_hits_for_read;
3075 if (first_segment != last_segment)
3076 partner_hits_for_read = hits_for_read[last_segment];
3077
3078 HitsForRead& left_segment_hits = hits_for_read[first_segment];
3079 HitsForRead& right_segment_hits = partner_hits_for_read;
3080
3081 seqan::String<char> fullRead, rcRead;
3082 fullRead = read.seq;
3083 rcRead = read.seq;
3084 seqan::convertInPlace(rcRead, seqan::FunctorComplement<Dna>());
3085 seqan::reverseInPlace(rcRead);
3086
3087 size_t read_length = seqan::length(fullRead);
3088
3089 // daehwan
3090 bool check_partner = true;
3091 if (first_segment != last_segment)
3092 {
3093 for (size_t i = 0; i < left_segment_hits.hits.size(); ++i)
3094 {
3095 BowtieHit& leftHit = left_segment_hits.hits[i];
3096 for (size_t j = 0; j < right_segment_hits.hits.size(); ++j)
3097 {
3098 BowtieHit& rightHit = right_segment_hits.hits[j];
3099
3100 if (leftHit.ref_id() == rightHit.ref_id() && leftHit.antisense_align() == rightHit.antisense_align())
3101 {
3102 int dist = 0;
3103 if (leftHit.antisense_align())
3104 dist = leftHit.left() - rightHit.right();
3105 else
3106 dist = rightHit.left() - leftHit.right();
3107
3108 if (dist > -max_insertion_length && dist <= (int)fusion_min_dist)
3109 {
3110 check_partner = false;
3111 break;
3112 }
3113 }
3114 }
3115
3116 if (!check_partner)
3117 break;
3118 }
3119 }
3120
3121 if (check_partner && has_partner)
3122 {
3123 for (size_t l = 0; l < left_segment_hits.hits.size(); ++l)
3124 {
3125 BowtieHit& leftHit = left_segment_hits.hits[l];
3126 for (size_t r = 0; r < partner_hit_group.hits.size(); ++r)
3127 {
3128 BowtieHit& rightHit = partner_hit_group.hits[r];
3129
3130 if (leftHit.ref_id() == rightHit.ref_id())
3131 {
3132 if (leftHit.antisense_align() != rightHit.antisense_align())
3133 {
3134 int dist = 0;
3135 if (leftHit.antisense_align())
3136 dist = leftHit.left() - rightHit.right();
3137 else
3138 dist = rightHit.left() - leftHit.right();
3139
3140 if (dist > (int)inner_dist_mean - (int)inner_dist_std_dev && dist <= (int)fusion_min_dist)
3141 continue;
3142 }
3143 }
3144
3145 RefSequenceTable::Sequence* ref_str = rt.get_seq(rightHit.ref_id());
3146
3147 const size_t part_seq_len = inner_dist_std_dev > inner_dist_mean ? inner_dist_std_dev - inner_dist_mean : 0;
3148 const size_t flanking_seq_len = inner_dist_mean + inner_dist_std_dev;
3149
3150 Dna5String right_flanking_seq;
3151 size_t left = 0;
3152 if (rightHit.antisense_align())
3153 {
3154 if (flanking_seq_len <= rightHit.left())
3155 {
3156 left = rightHit.left() - flanking_seq_len;
3157 right_flanking_seq = seqan::infix(*ref_str, left, left + flanking_seq_len + part_seq_len);
3158 }
3159 else
3160 break;
3161 }
3162 else
3163 {
3164 if (part_seq_len <= rightHit.right())
3165 {
3166 left = rightHit.right() - part_seq_len;
3167 right_flanking_seq = seqan::infix(*ref_str, left, left + flanking_seq_len + part_seq_len);
3168 }
3169 else
3170 break;
3171 }
3172
3173 const size_t check_read_len = min(15, segment_length - segment_mismatches - 3);
3174 seqan::String<char> fwd_read = infix(fullRead, read_length - check_read_len, read_length);
3175 seqan::String<char> rev_read = infix(rcRead, 0, check_read_len);
3176
3177 int fwd_pos = map_read_to_contig(right_flanking_seq, fwd_read);
3178
3179 if (fwd_pos >= 0)
3180 {
3181 BowtieHit hit(rightHit.ref_id(), rightHit.ref_id(), rightHit.insert_id(),
3182 left + fwd_pos, check_read_len, false, 0, true);
3183
3184 right_segment_hits.hits.push_back(hit);
3185 }
3186
3187 int rev_pos = map_read_to_contig(right_flanking_seq, rev_read);
3188
3189 if (rev_pos >= 0)
3190 {
3191 BowtieHit hit(rightHit.ref_id(), rightHit.ref_id(), rightHit.insert_id(),
3192 left + rev_pos, check_read_len, true, 0, true);
3193
3194 right_segment_hits.hits.push_back(hit);
3195 }
3196
3197 // daehwan
3198 #if 0
3199 if (fwd_pos >= 0 || rev_pos >= 0)
3200 {
3201 // if (leftHit.insert_id() == 409048 || leftHit.insert_id() == 4341516)
3202 {
3203 cout << "insert id: " << leftHit.insert_id() << endl
3204 << "fwd: " << fwd_read << " " << fwd_pos << endl
3205 << "rev: " << rev_read << " " << rev_pos << endl
3206 << "ref: " << right_flanking_seq << endl;
3207 }
3208 }
3209 #endif
3210 }
3211 }
3212 }
3213
3214 // daehwan - should check this out
3215 static const string chrM = "chrM";
3216 static const uint32_t chrM_id = rt.get_id(chrM);
3217 for (size_t left_segment_index = 0; left_segment_index < left_segment_hits.hits.size(); ++left_segment_index)
3218 {
3219 for (size_t right_segment_index = 0; right_segment_index < right_segment_hits.hits.size(); ++right_segment_index)
3220 {
3221 BowtieHit* leftHit = &left_segment_hits.hits[left_segment_index];
3222 BowtieHit* rightHit = &right_segment_hits.hits[right_segment_index];
3223
3224 // daehwan
3225 if (leftHit->ref_id() == chrM_id || rightHit->ref_id() == chrM_id)
3226 continue;
3227
3228 // daehwan
3229 #if 0
3230 if (leftHit->ref_id() == rightHit->ref_id() && leftHit->ref_id() == 1119738090)
3231 {
3232 const uint32_t left1 = 113951556, right1 = 113977987, left2 = 113548692, right2 = 113754053;
3233 if ((leftHit->left() >= left1 && leftHit->left() <= right1 && rightHit->left() >= left2 && rightHit->left() <= right2) ||
3234 (leftHit->left() >= left2 && leftHit->left() <= right2 && rightHit->left() >= left1 && rightHit->left() <= right1))
3235 {
3236 cout << "insert id: " << leftHit->insert_id() << "\t num hits: " << left_segment_hits.hits.size() << ":" << right_segment_hits.hits.size() << endl
3237 << leftHit->ref_id() << ":" << (leftHit->antisense_align() ? "-" : "+") << " " << (int)leftHit->edit_dist() <<endl
3238 << leftHit->left() << "-" << leftHit->right() << endl
3239 << rightHit->ref_id() << ":" << (rightHit->antisense_align() ? "-" : "+") << " " << (int)rightHit->edit_dist() << endl
3240 << rightHit->left() << "-" << rightHit->right() << endl << endl;
3241 }
3242 }
3243 #endif
3244
3245 if (leftHit->ref_id() == rightHit->ref_id())
3246 {
3247 if (leftHit->antisense_align() == rightHit->antisense_align())
3248 {
3249 int dist = 0;
3250 if (leftHit->antisense_align())
3251 dist = leftHit->left() - rightHit->right();
3252 else
3253 dist = rightHit->left() - leftHit->right();
3254
3255 if (dist >= 0 && dist <= (int)fusion_min_dist)
3256 continue;
3257 }
3258 }
3259
3260 uint32_t dir = FUSION_FF;
3261 seqan::String<char>* modifiedRead = &fullRead;
3262
3263 if (leftHit->antisense_align() == rightHit->antisense_align())
3264 {
3265 if (leftHit->antisense_align())
3266 {
3267 BowtieHit * tmp = leftHit;
3268 leftHit = rightHit;
3269 rightHit = tmp;
3270 modifiedRead = &rcRead;
3271 }
3272 }
3273 else if (leftHit->antisense_align() == false && rightHit->antisense_align() == true)
3274 dir = FUSION_FR;
3275 else
3276 dir = FUSION_RF;
3277
3278 detect_fusion(rt, *modifiedRead, *leftHit, *rightHit, fusions, dir);
3279 }
3280 }
3281 }
3282
3283 void find_gaps(RefSequenceTable& rt,
3284 ReadStream& reads_file,
3285 vector<HitsForRead>& hits_for_read,
3286 HitStream& partner_hit_stream,
3287 HitStream& seg_partner_hit_stream,
3288 std::set<Junction, skip_count_lt>& seg_juncs,
3289 eREAD read_side)
3290 {
3291 if (hits_for_read.empty())
3292 return;
3293
3294 size_t last_segment = hits_for_read.size() - 1;
3295 while (last_segment > 0)
3296 {
3297 if (!hits_for_read[last_segment].hits.empty())
3298 break;
3299
3300 --last_segment;
3301 }
3302
3303 hits_for_read.resize(last_segment + 1);
3304
3305 size_t first_segment = 0;
3306 if (last_segment == first_segment &&
3307 (hits_for_read[first_segment].hits.empty() || hits_for_read[first_segment].hits[0].end()))
3308 return;
3309
3310 uint32_t insert_id = hits_for_read[last_segment].insert_id;
3311
3312 HitsForRead partner_hit_group;
3313 uint32_t next_order = partner_hit_stream.next_group_id();
3314
3315 bool has_partner = false;
3316 while (insert_id >= next_order && next_order != 0)
3317 {
3318 partner_hit_stream.next_read_hits(partner_hit_group);
3319 next_order = partner_hit_stream.next_group_id();
3320 }
3321
3322 has_partner = insert_id == partner_hit_group.insert_id;
3323
3324 if (!has_partner)
3325 {
3326 next_order = seg_partner_hit_stream.next_group_id();
3327 while (insert_id >= next_order && next_order != 0)
3328 {
3329 seg_partner_hit_stream.next_read_hits(partner_hit_group);
3330 next_order = seg_partner_hit_stream.next_group_id();
3331 }
3332
3333 has_partner = insert_id == partner_hit_group.insert_id;
3334 }
3335
3336 Read read;
3337 /*
3338 bool got_read = get_read_from_stream(hits_for_read[last_segment].insert_id,
3339 reads_file,
3340 FASTQ,
3341 false,
3342 read); */
3343 bool got_read = reads_file.getRead(hits_for_read[last_segment].insert_id, read);
3344 if (!got_read) {
3345 err_die("Error: could not get read# %d from stream!",
3346 (int)hits_for_read[last_segment].insert_id);
3347 return;
3348 }
3349
3350 HitsForRead partner_hits_for_read;
3351 if (first_segment != last_segment)
3352 partner_hits_for_read = hits_for_read[last_segment];
3353
3354 HitsForRead& left_segment_hits = hits_for_read[first_segment];
3355 HitsForRead& right_segment_hits = partner_hits_for_read;
3356
3357 bool check_partner = true;
3358 if (first_segment != last_segment)
3359 {
3360 for (size_t i = 0; i < left_segment_hits.hits.size(); ++i)
3361 {
3362 BowtieHit& leftHit = left_segment_hits.hits[i];
3363 for (size_t j = 0; j < right_segment_hits.hits.size(); ++j)
3364 {
3365 BowtieHit& rightHit = right_segment_hits.hits[j];
3366
3367 if (leftHit.ref_id() == rightHit.ref_id() && leftHit.antisense_align() == rightHit.antisense_align())
3368 {
3369 int dist = 0;
3370 if (leftHit.antisense_align())
3371 dist = leftHit.left() - rightHit.right();
3372 else
3373 dist = rightHit.left() - leftHit.right();
3374
3375 if (dist >= min_segment_intron_length && dist < (int)max_segment_intron_length)
3376 {
3377 check_partner = false;
3378 break;
3379 }
3380 }
3381 }
3382
3383 if (!check_partner)
3384 break;
3385 }
3386 }
3387
3388 if (check_partner && has_partner)
3389 {
3390 // empty hits from 1 to num_segments - 1
3391 for (size_t i = first_segment + 1; i < hits_for_read.size(); ++i)
3392 {
3393 hits_for_read[i].hits.clear();
3394 }
3395
3396 seqan::String<char> fullRead, rcRead;
3397 fullRead = read.seq;
3398 rcRead = read.seq;
3399 seqan::convertInPlace(rcRead, seqan::FunctorComplement<Dna>());
3400 seqan::reverseInPlace(rcRead);
3401 size_t read_length = read.seq.length();
3402
3403 for (size_t l = 0; l < left_segment_hits.hits.size(); ++l)
3404 {
3405 BowtieHit& leftHit = left_segment_hits.hits[l];
3406 for (size_t r = 0; r < partner_hit_group.hits.size(); ++r)
3407 {
3408 BowtieHit& rightHit = partner_hit_group.hits[r];
3409 if (leftHit.ref_id() != rightHit.ref_id() || leftHit.antisense_align() == rightHit.antisense_align())
3410 continue;
3411
3412 int dist = 0;
3413 if (leftHit.antisense_align())
3414 dist = leftHit.left() - rightHit.right();
3415 else
3416 dist = rightHit.left() - leftHit.right();
3417
3418 if (dist < min_segment_intron_length && dist >= (int)max_segment_intron_length)
3419 continue;
3420
3421 RefSequenceTable::Sequence* ref_str = rt.get_seq(rightHit.ref_id());
3422 const size_t part_seq_len = inner_dist_std_dev > inner_dist_mean ? inner_dist_std_dev - inner_dist_mean : 0;
3423 const size_t flanking_seq_len = inner_dist_mean + inner_dist_std_dev;
3424
3425 Dna5String right_flanking_seq;
3426 size_t left = 0;
3427 if (rightHit.antisense_align())
3428 {
3429 if (flanking_seq_len <= rightHit.left())
3430 {
3431 left = rightHit.left() - flanking_seq_len;
3432 right_flanking_seq = seqan::infix(*ref_str, left, left + flanking_seq_len + part_seq_len);
3433 }
3434 else
3435 break;
3436 }
3437 else
3438 {
3439 if (part_seq_len <= rightHit.right())
3440 {
3441 left = rightHit.right() - part_seq_len;
3442 right_flanking_seq = seqan::infix(*ref_str, left, left + flanking_seq_len + part_seq_len);
3443 }
3444 else
3445 break;
3446 }
3447
3448 const size_t check_read_len = min(15, segment_length - segment_mismatches - 3);
3449 seqan::String<char> fwd_read = infix(fullRead, read_length - check_read_len, read_length);
3450 seqan::String<char> rev_read = infix(rcRead, 0, check_read_len);
3451
3452 int fwd_pos = map_read_to_contig(right_flanking_seq, fwd_read);
3453 if (fwd_pos >= 0)
3454 {
3455 BowtieHit hit(rightHit.ref_id(), rightHit.ref_id2(), rightHit.insert_id(),
3456 left + fwd_pos, check_read_len, false, 0, true);
3457
3458 hits_for_read[last_segment].hits.push_back(hit);
3459 }
3460
3461 int rev_pos = map_read_to_contig(right_flanking_seq, rev_read);
3462
3463 if (rev_pos >= 0)
3464 {
3465 BowtieHit hit(rightHit.ref_id(), rightHit.ref_id2(), rightHit.insert_id(),
3466 left + rev_pos, check_read_len, true, 0, true);
3467
3468 hits_for_read[last_segment].hits.push_back(hit);
3469 }
3470
3471 // daehwan - for debug purposes
3472 #if B_DEBUG
3473 cerr << "daehwan!!!" << endl
3474 << "insert id: " << rightHit.insert_id() << endl
3475 << "first segment: " << first_segment << ", last_segment: " << last_segment << endl
3476 << right_flanking_seq << " : " << seqan::length(right_flanking_seq) << endl
3477 << fwd_read << " : " << fwd_pos << endl
3478 << rev_read << " : " << rev_pos << endl
3479 << "left: " << leftHit.left() << "-" << leftHit.right() << (leftHit.antisense_align() ? " -" : " +") << endl
3480 << "right: " << rightHit.left() << "-" << rightHit.right() << (rightHit.antisense_align() ? " -" : " +") << endl;
3481 if (fwd_pos >= 0 || rev_pos >= 0)
3482 {
3483 const BowtieHit& hit = hits_for_read[last_segment].hits.back();
3484 cerr << "back: " << hit.left() << "-" << hit.right() << (hit.antisense_align() ? " -" : " +") << endl;
3485 }
3486 #endif
3487 }
3488 }
3489 }
3490
3491 vector<RefSeg> expected_don_acc_windows;
3492 string seq(read.seq);
3493
3494 // ignore segments that map to more than this many places that would otherwise produce
3495 // many false splice junctions.
3496 if (bowtie2)
3497 {
3498 for (size_t s = 0; s < hits_for_read.size(); ++s)
3499 {
3500 if (hits_for_read[s].hits.size() > max_seg_multihits)
3501 return;
3502 }
3503 }
3504
3505 for (size_t s = 0; s < hits_for_read.size(); ++s)
3506 {
3507 HitsForRead& curr = hits_for_read[s];
3508 for (size_t h = 0; h < curr.hits.size(); ++h)
3509 {
3510 bool found_right_seg_partner = s == hits_for_read.size() - 1;
3511 BowtieHit& bh = curr.hits[h];
3512
3513 // "drs" is distant seg right partner
3514 // "rrs" is right of right seg partner
3515 vector<BowtieHit*> drs_bhs;
3516 vector<BowtieHit*> rrs_bhs;
3517
3518 if (s < hits_for_read.size() - 1)
3519 {
3520 // Look for a right partner for the current hit
3521 HitsForRead& right = hits_for_read[s + 1];
3522
3523 for (size_t r = 0; r < right.hits.size(); ++r)
3524 {
3525 BowtieHit& rh = right.hits[r];
3526 if (bh.antisense_align() != rh.antisense_align() || bh.ref_id() != rh.ref_id())
3527 continue;
3528
3529 if ((bh.antisense_align() && rh.right() == bh.left()) ||
3530 (!bh.antisense_align() && bh.right() == rh.left() ))
3531 {
3532 found_right_seg_partner = true;
3533 break;
3534 }
3535
3536 int dist = 0;
3537 if (bh.antisense_align())
3538 dist = bh.left() - rh.right();
3539 else
3540 dist = rh.left() - bh.right();
3541
3542 if (dist >= min_segment_intron_length && dist < (int)max_segment_intron_length)
3543 drs_bhs.push_back(&rh);
3544 }
3545 }
3546
3547 if (!found_right_seg_partner && s < hits_for_read.size() - 2)
3548 {
3549 // Look for a right of right partner for the current hit
3550 HitsForRead& right_right = hits_for_read[s + 2];
3551
3552 for (size_t r = 0; r < right_right.hits.size(); ++r)
3553 {
3554 BowtieHit& rrh = right_right.hits[r];
3555 if (bh.antisense_align() != rrh.antisense_align() || bh.ref_id() != rrh.ref_id())
3556 continue;
3557
3558 int dist = 0;
3559 if (bh.antisense_align())
3560 dist = bh.left() - rrh.right();
3561 else
3562 dist = rrh.left() - bh.right();
3563
3564 if (dist >= min_segment_intron_length + segment_length && dist < (int)max_segment_intron_length + segment_length)
3565 rrs_bhs.push_back(&rrh);
3566 }
3567 }
3568
3569 if (!found_right_seg_partner && (drs_bhs.size() > 0 || rrs_bhs.size() > 0))
3570 {
3571 const int look_bp = 8;
3572 const size_t color_offset = color ? 1 : 0;
3573
3574 vector<BowtieHit*> d_bhs = rrs_bhs.size() > 0 ? rrs_bhs : drs_bhs;
3575 for (size_t r = 0; r < d_bhs.size(); ++r)
3576 {
3577 string support_read;
3578 if (rrs_bhs.size() <= 0)
3579 support_read = seq.substr(color_offset + (s+1) * segment_length - look_bp, look_bp * 2);
3580 else
3581 support_read = seq.substr(color_offset + (s+1) * segment_length - look_bp, segment_length + look_bp * 2);
3582
3583 BowtieHit& d_bh = *(d_bhs[r]);
3584 if (!bh.antisense_align())
3585 {
3586 RefSeg right_seg(bh.ref_id(), POINT_DIR_BOTH, bh.antisense_align(), read_side, 0, 0, support_read);
3587 right_seg.left = max(0, bh.right() - look_bp);
3588 right_seg.right = d_bh.left() + look_bp;
3589 expected_don_acc_windows.push_back(right_seg);
3590 }
3591 else
3592 {
3593 if (color)
3594 reverse(support_read.begin(), support_read.end());
3595 else
3596 reverse_complement(support_read);
3597
3598 RefSeg left_seg(bh.ref_id(), POINT_DIR_BOTH, bh.antisense_align(), read_side, 0, 0, support_read);
3599 left_seg.left = d_bh.right() - look_bp;
3600 left_seg.right = bh.left() + look_bp;
3601 expected_don_acc_windows.push_back(left_seg);
3602 }
3603
3604 // daehwan
3605 #ifdef B_DEBUG2
3606 cout << "insert id: " << bh.insert_id() << endl
3607 << (bh.antisense_align() ? "-" : "+") << endl
3608 << seq << endl
3609 << "(" << s << ") - " << support_read << endl;
3610 #endif
3611 }
3612 }
3613 }
3614 } //for each hits_for_read
3615
3616 juncs_from_ref_segs<RecordSegmentJuncs>(rt,
3617 expected_don_acc_windows,
3618 seg_juncs,
3619 "GT",
3620 "AG",
3621 max_segment_intron_length,
3622 min_segment_intron_length,
3623 max_seg_juncs,
3624 false,
3625 0);
3626
3627 juncs_from_ref_segs<RecordSegmentJuncs>(rt,
3628 expected_don_acc_windows,
3629 seg_juncs,
3630 "GC",
3631 "AG",
3632 max_segment_intron_length,
3633 min_segment_intron_length,
3634 max_seg_juncs,
3635 false,
3636 0);
3637
3638 juncs_from_ref_segs<RecordSegmentJuncs>(rt,
3639 expected_don_acc_windows,
3640 seg_juncs,
3641 "AT",
3642 "AC",
3643 max_segment_intron_length,
3644 min_segment_intron_length,
3645 max_seg_juncs,
3646 false,
3647 0);
3648 }
3649
3650 MerTable mer_table;
3651
3652 int seed_alignments = 0;
3653 int microaligned_segs = 0;
3654
3655 map<RefSeg, vector<string>* > microexon_windows;
3656
3657 bool overlap_in_genome(int ll, int lr, int rl, int rr)
3658 {
3659 if (ll >= rl && ll < rr)
3660 return true;
3661 if (lr > rl && lr < rr)
3662 return true;
3663 if (rl >= ll && rl < lr)
3664 return true;
3665 if (rr > ll && rr < lr)
3666 return true;
3667 return false;
3668 }
3669
3670 void add_to_microexon_windows(uint32_t ref_id,
3671 int left_boundary,
3672 int right_boundary,
3673 const string& dna_str,
3674 eREAD read)
3675 {
3676 RefSeg left_dummy(ref_id, POINT_DIR_DONTCARE, false, read, left_boundary, right_boundary);
3677 RefSeg right_dummy(ref_id, POINT_DIR_DONTCARE, false, read, right_boundary, right_boundary + 1);
3678
3679 map<RefSeg, vector<string>* >::iterator lb = microexon_windows.lower_bound(left_dummy);
3680 map<RefSeg, vector<string>* >::iterator ub = microexon_windows.lower_bound(right_dummy);
3681 vector<string>* new_vec = NULL;
3682 if (lb == microexon_windows.end())
3683 {
3684 microexon_windows.insert(make_pair(left_dummy, new vector<string>(1, dna_str)));
3685 return;
3686 }
3687
3688 map<RefSeg, vector<string>* >::iterator first_to_be_erased = microexon_windows.end();
3689 map<RefSeg, vector<string>* >::iterator last_to_be_erased = ub;
3690
3691 while (lb != ub)
3692 {
3693 // everyone in this range that overlaps with the new interval needs to
3694 // be merged together.
3695 if (overlap_in_genome(lb->first.left, lb->first.right, left_boundary, right_boundary))
3696 {
3697 if (!new_vec)
3698 new_vec = new vector<string>();
3699 if (first_to_be_erased == microexon_windows.end())
3700 first_to_be_erased = lb;
3701 left_dummy.left = min(lb->first.left, left_boundary);
3702 left_dummy.right = max(lb->first.right, right_boundary);
3703
3704 new_vec->insert(new_vec->end(), lb->second->begin(), lb->second->end());
3705 delete lb->second;
3706 }
3707 else if (first_to_be_erased != microexon_windows.end())
3708 {
3709 last_to_be_erased = lb;
3710 }
3711
3712 ++lb;
3713 }
3714
3715 if (first_to_be_erased != microexon_windows.end())
3716 {
3717 microexon_windows.erase(first_to_be_erased, last_to_be_erased);
3718 }
3719
3720 if (!new_vec)
3721 {
3722 // never found an overlapping window, so just add this one and bail
3723 microexon_windows.insert(make_pair(left_dummy, new vector<string>(1, dna_str)));
3724 return;
3725 }
3726 else
3727 {
3728 new_vec->push_back(dna_str);
3729 microexon_windows.insert(make_pair(left_dummy, new_vec));
3730 return;
3731 }
3732
3733 }
3734
3735 void align_microexon_segs(RefSequenceTable& rt,
3736 std::set<Junction, skip_count_lt>& juncs,
3737 int max_juncs,
3738 int half_splice_mer_len)
3739 {
3740 int num_segments = 0;
3741 for (map<RefSeg, vector<string>* >::iterator itr = microexon_windows.begin();
3742 itr != microexon_windows.end(); ++itr)
3743 {
3744 vector<string>& unaligned_segments = *itr->second;
3745 num_segments += unaligned_segments.size();
3746 }
3747
3748 fprintf(stderr, "Aligning %d microexon segments in %lu windows\n",
3749 num_segments, (long unsigned int)microexon_windows.size());
3750
3751 extensions.clear();
3752
3753 size_t splice_mer_len = 2 * half_splice_mer_len;
3754 size_t mer_table_size = 1 << ((splice_mer_len)<<1);
3755
3756 extensions.resize(mer_table_size);
3757
3758 int window_num = 0;
3759 for (map<RefSeg, vector<string>* >::iterator itr = microexon_windows.begin();
3760 itr != microexon_windows.end(); ++itr)
3761 {
3762 window_num++;
3763 if ((window_num % 100) == 0)
3764 fprintf(stderr, "\twindow %d\n",window_num);
3765
3766
3767 stringstream ss(stringstream::in | stringstream::out);
3768
3769 for (size_t j = 0; j < extensions.size(); ++j)
3770 {
3771 extensions[j].clear();
3772 }
3773
3774 vector<string>& unaligned_segments = *itr->second;
3775
3776 for (size_t j = 0; j < unaligned_segments.size(); ++j)
3777 {
3778 stringstream ss(stringstream::in | stringstream::out);
3779 string s;
3780 //cerr << w.unaligned_segments[j];
3781 ss << unaligned_segments[j];
3782 ss >> s;
3783
3784 store_read_extensions(extensions,
3785 half_splice_mer_len,
3786 half_splice_mer_len,
3787 s,
3788 false);
3789 }
3790
3791 vector<RefSeg> segs;
3792 segs.push_back(itr->first);
3793 RefSeg r = itr->first;
3794 r.points_where = POINT_DIR_LEFT;
3795 segs.push_back(r);
3796
3797 juncs_from_ref_segs<RecordExtendableJuncs>(rt,
3798 segs,
3799 juncs,
3800 "GT",
3801 "AG",
3802 max_microexon_stretch,
3803 min_coverage_intron_length,
3804 max_juncs,
3805 false,
3806 half_splice_mer_len);
3807 num_segments += unaligned_segments.size();
3808 delete itr->second;
3809 }
3810 fprintf(stderr, "Checked %d segments against %lu windows for microexon junctions\n",
3811 num_segments, (long unsigned int)microexon_windows.size());
3812 fprintf(stderr, "Found %ld potential microexon junctions\n", (long int)juncs.size());
3813 }
3814
3815 /*
3816 * Easy guys ... this function puts its pants on just like the rest of you -- one leg
3817 * at a time. Except, once its pants are on, it makes gold records. Alright, here we
3818 * go.
3819 */
3820
3821 void look_for_hit_group(RefSequenceTable& rt,
3822 ReadStream& readstream,
3823 ReadStream& readstream_for_segment_search,
3824 ReadStream& readstream_for_indel_discovery,
3825 ReadStream& readstream_for_fusion_discovery,
3826 ReadTable& unmapped_reads,
3827 vector<HitStream>& seg_files,
3828 int curr_file,
3829 uint64_t insert_id,
3830 vector<HitsForRead>& hits_for_read, //<-- collecting segment hits for this read
3831 HitStream& partner_hit_stream_for_segment_search,
3832 HitStream& seg_partner_hit_stream_for_segment_search,
3833 HitStream& partner_hit_stream_for_fusion_discovery,
3834 HitStream& seg_partner_hit_stream_for_fusion_discovery,
3835 std::set<Junction, skip_count_lt>& juncs,
3836 std::set<Deletion>& deletions,
3837 std::set<Insertion>& insertions,
3838 FusionSimpleSet& fusions,
3839 eREAD read_side,
3840 uint32_t begin_id,
3841 uint32_t end_id)
3842 {
3843 HitStream& hit_stream = seg_files[curr_file];
3844 HitsForRead hit_group;
3845 uint32_t order = unmapped_reads.observation_order(insert_id);
3846 int seq_key_len = min((int)min_anchor_len, 6);
3847 while(true) {
3848 uint64_t next_group_id = hit_stream.next_group_id();
3849 uint32_t next_order = unmapped_reads.observation_order(next_group_id);
3850 // If we would have seen the hits by now, stop looking in this stream,
3851 // but forward the search to the next (lower) segment if possible.
3852 if (order < next_order || next_group_id == 0) {
3853 if (curr_file > 0) {
3854 //look for next (lower) segment mappings for this read
3855 look_for_hit_group(rt,
3856 readstream,
3857 readstream_for_segment_search,
3858 readstream_for_indel_discovery,
3859 readstream_for_fusion_discovery,
3860 unmapped_reads,
3861 seg_files,
3862 curr_file - 1,
3863 insert_id,
3864 hits_for_read,
3865 partner_hit_stream_for_segment_search,
3866 seg_partner_hit_stream_for_segment_search,
3867 partner_hit_stream_for_fusion_discovery,
3868 seg_partner_hit_stream_for_fusion_discovery,
3869 juncs,
3870 deletions,
3871 insertions,
3872 fusions,
3873 read_side,
3874 begin_id,
3875 end_id);
3876 }
3877 else
3878 if (insert_id && !no_microexon_search) {
3879 //microexon search
3880 Read read;
3881 // The hits are missing for the leftmost segment, which means
3882 // we should try looking for junctions via seed and extend
3883 // using it (helps find junctions to microexons).
3884 /* bool got_read = get_read_from_stream(insert_id,
3885 readstream.file,
3886 FASTQ,
3887 false,
3888 read); */
3889 bool got_read = readstream.getRead(insert_id, read);
3890 if (!got_read) {
3891 //fprintf(stderr, "Warning: could not get read with insert_id=%d\n", (int)insert_id);
3892 //break; //exit loop
3893 err_die("Error: could not get read with insert_id=%d from file %s\n",
3894 (int)insert_id, readstream.filename());
3895 }
3896 string fwd_read = read.seq;
3897 if (color) // remove the primer and the adjacent color
3898 fwd_read.erase(0, 2);
3899 // make sure there are hits for all the other segs, all the
3900 // way to the root (right-most) one.
3901 int empty_seg = 0;
3902 for (size_t h = 1; h < hits_for_read.size(); ++h) {
3903 if (hits_for_read[h].hits.empty())
3904 empty_seg = h;
3905 }
3906 // if not, no microexon alignment for this segment
3907 if (empty_seg != 0)
3908 break;
3909 fwd_read = fwd_read.substr(0, segment_length);
3910 string rev_read = fwd_read;
3911 //check the reverse
3912 if (color) reverse(rev_read.begin(), rev_read.end());
3913 else reverse_complement(rev_read);
3914 for (size_t h = 0; h < hits_for_read[empty_seg + 1].hits.size(); ++h) {
3915 const BowtieHit& bh = hits_for_read[empty_seg + 1].hits[h];
3916 RefSequenceTable::Sequence* ref_str = rt.get_seq(bh.ref_id());
3917 if (ref_str == NULL)
3918 continue;
3919 int ref_len = length(*ref_str);
3920 int left_boundary;
3921 int right_boundary;
3922 bool antisense = bh.antisense_align();
3923 vector<BowtieHit> empty_seg_hits;
3924 seed_alignments++;
3925 if (antisense) {
3926 left_boundary = max(0, bh.right() - (int)min_anchor_len);
3927 right_boundary = min(ref_len - 2,
3928 left_boundary + max_microexon_stretch);
3929 if (right_boundary - left_boundary < 2 * seq_key_len)
3930 continue;
3931 microaligned_segs++;
3932 add_to_microexon_windows(bh.ref_id(), left_boundary, right_boundary, rev_read, read_side);
3933 }
3934 else {
3935 right_boundary = min(ref_len - 2,
3936 bh.left() + (int)min_anchor_len);
3937 left_boundary = max(0, right_boundary - max_microexon_stretch);
3938 if (right_boundary - left_boundary < 2 * seq_key_len)
3939 continue;
3940 microaligned_segs++;
3941 add_to_microexon_windows(bh.ref_id(), left_boundary, right_boundary, fwd_read, read_side);
3942 }
3943 } //for h
3944 } // !no_microexon_search
3945 break;
3946 }
3947 else if (hit_stream.next_read_hits(hit_group)) {
3948 // if we found hits for the target group in the left stream,
3949 // add them to the accumulating vector and continue the search
3950 if (hit_group.insert_id == insert_id) {
3951 hits_for_read[curr_file] = hit_group;
3952 if (curr_file > 0)
3953 // we need to look left (recursively) for the group we
3954 // just read for this stream.
3955 look_for_hit_group(rt,
3956 readstream,
3957 readstream_for_segment_search,
3958 readstream_for_indel_discovery,
3959 readstream_for_fusion_discovery,
3960 unmapped_reads,
3961 seg_files,
3962 curr_file - 1,
3963 insert_id,
3964 hits_for_read,
3965 partner_hit_stream_for_segment_search,
3966 seg_partner_hit_stream_for_segment_search,
3967 partner_hit_stream_for_fusion_discovery,
3968 seg_partner_hit_stream_for_fusion_discovery,
3969 juncs,
3970 deletions,
3971 insertions,
3972 fusions,
3973 read_side,
3974 begin_id,
3975 end_id);
3976 break;
3977 } //same insert_id (group)
3978 else if (curr_file >= 0) {
3979 // different group, we need to start a whole new
3980 // search for it, with a whole new vector of hits.
3981 vector<HitsForRead> hits_for_new_read(seg_files.size());
3982 hits_for_new_read[curr_file] = hit_group;
3983
3984 if (curr_file > 0)
3985 {
3986 look_for_hit_group(rt,
3987 readstream,
3988 readstream_for_segment_search,
3989 readstream_for_indel_discovery,
3990 readstream_for_fusion_discovery,
3991 unmapped_reads,
3992 seg_files,
3993 curr_file - 1,
3994 hit_group.insert_id,
3995 hits_for_new_read,
3996 partner_hit_stream_for_segment_search,
3997 seg_partner_hit_stream_for_segment_search,
3998 partner_hit_stream_for_fusion_discovery,
3999 seg_partner_hit_stream_for_fusion_discovery,
4000 juncs,
4001 deletions,
4002 insertions,
4003 fusions,
4004 read_side,
4005 begin_id,
4006 end_id);
4007
4008 if (hit_group.insert_id >= begin_id && hit_group.insert_id < end_id)
4009 {
4010 find_insertions_and_deletions(rt,
4011 readstream_for_indel_discovery,
4012 hits_for_new_read,
4013 deletions,
4014 insertions);
4015 find_gaps(rt,
4016 readstream_for_segment_search,
4017 hits_for_new_read,
4018 partner_hit_stream_for_segment_search,
4019 seg_partner_hit_stream_for_segment_search,
4020 juncs,
4021 read_side);
4022 }
4023 }
4024
4025 if (hit_group.insert_id >= begin_id && hit_group.insert_id < end_id)
4026 {
4027 if (fusion_search)
4028 {
4029 find_fusions(rt,
4030 readstream_for_fusion_discovery,
4031 hits_for_new_read,
4032 partner_hit_stream_for_fusion_discovery,
4033 seg_partner_hit_stream_for_fusion_discovery,
4034 fusions,
4035 read_side);
4036 }
4037 }
4038 } //different group
4039 }//got next group
4040 } //while loop
4041 }
4042
4043
4044 uint64_t process_next_hit_group(RefSequenceTable& rt,
4045 ReadStream& readstream,
4046 ReadStream& readstream_for_segment_search,
4047 ReadStream& readstream_for_indel_discovery,
4048 ReadStream& readstream_for_fusion_discovery,
4049 ReadTable& unmapped_reads,
4050 vector<HitStream>& seg_files,
4051 size_t last_file_idx,
4052 HitStream& partner_hit_stream_for_segment_search,
4053 HitStream& seg_partner_hit_stream_for_segment_search,
4054 HitStream& partner_hit_stream_for_fusion_discovery,
4055 HitStream& seg_partner_hit_stream_for_fusion_discovery,
4056 std::set<Junction, skip_count_lt>& juncs,
4057 std::set<Deletion>& deletions,
4058 std::set<Insertion>& insertions,
4059 FusionSimpleSet& fusions,
4060 eREAD read,
4061 uint32_t begin_id = 0,
4062 uint32_t end_id = VMAXINT32)
4063 {
4064 HitStream& last_segmap_hitstream = seg_files[last_file_idx];
4065 HitsForRead hit_group;
4066 bool result = last_segmap_hitstream.next_read_hits(hit_group);
4067 vector<HitsForRead> hits_for_read(seg_files.size());
4068 hits_for_read.back() = hit_group;
4069
4070 if (result && hit_group.insert_id >= end_id)
4071 return 0;
4072
4073 look_for_hit_group(rt,
4074 readstream,
4075 readstream_for_segment_search,
4076 readstream_for_indel_discovery,
4077 readstream_for_fusion_discovery,
4078 unmapped_reads,
4079 seg_files,
4080 (int)last_file_idx - 1,
4081 hit_group.insert_id,
4082 hits_for_read,
4083 partner_hit_stream_for_segment_search,
4084 seg_partner_hit_stream_for_segment_search,
4085 partner_hit_stream_for_fusion_discovery,
4086 seg_partner_hit_stream_for_fusion_discovery,
4087 juncs,
4088 deletions,
4089 insertions,
4090 fusions,
4091 read,
4092 begin_id,
4093 end_id);
4094
4095 if (result)
4096 {
4097 find_insertions_and_deletions(rt,
4098 readstream_for_indel_discovery,
4099 hits_for_read,
4100 deletions,
4101 insertions);
4102
4103 if (fusion_search)
4104 {
4105 find_fusions(rt,
4106 readstream_for_fusion_discovery,
4107 hits_for_read,
4108 partner_hit_stream_for_fusion_discovery,
4109 seg_partner_hit_stream_for_fusion_discovery,
4110 fusions,
4111 read);
4112 }
4113
4114 find_gaps(rt,
4115 readstream_for_segment_search,
4116 hits_for_read,
4117 partner_hit_stream_for_segment_search,
4118 seg_partner_hit_stream_for_segment_search,
4119 juncs,
4120 read);
4121
4122 return hit_group.insert_id;
4123 }
4124
4125 return 0;
4126 }
4127
4128 static const int UNCOVERED = 0;
4129 static const int LOOK_LEFT = 1;
4130 static const int LOOK_RIGHT = 2;
4131
4132 uint8_t get_cov(const vector<uint8_t>& cov, uint32_t c)
4133 {
4134 uint32_t b = c >> 2;
4135 uint32_t r = c & 0x3;
4136 uint8_t s = (r << 1);
4137 uint8_t v = cov[b];
4138 v &= (0x3 << s);
4139 v >>= s;
4140 return v;
4141 }
4142
4143 void build_coverage_map(ReadTable& it, RefSequenceTable& rt,
4144 vector<string>& seg_files, map<uint32_t, vector<bool> >& coverage_map) {
4145 if (!coverage_map.empty()) return;
4146 BAMHitFactory hit_factory(it,rt);
4147
4148 for (size_t f = 0; f < seg_files.size(); ++f)
4149 {
4150 //fprintf(stderr, "Adding hits from segment file %d to coverage map\n", (int)f);
4151 //seg_files[f]->rewind();
4152 HitStream hs(seg_files[f], &hit_factory, false, false, false);
4153 HitsForRead hit_group;
4154 while (hs.next_read_hits(hit_group))
4155 {
4156 for (size_t h = 0; h < hit_group.hits.size(); ++h)
4157 {
4158 BowtieHit& bh = hit_group.hits[h];
4159
4160 pair<map<uint32_t, vector<bool> >::iterator, bool> ret =
4161 coverage_map.insert(make_pair(bh.ref_id(), vector<bool>()));
4162 vector<bool>& ref_cov = ret.first->second;
4163
4164 size_t right_extent = bh.right();
4165
4166 if (right_extent >= ref_cov.size())
4167 {
4168 ref_cov.resize(right_extent + 1, 0);
4169 }
4170
4171 for (uint32_t c = (uint32_t)bh.left(); c < (uint32_t)bh.right(); ++c)
4172 {
4173 ref_cov[c] = true;
4174 //if (ref_cov[c]<255) ref_cov[c]++;
4175 }
4176 }
4177 } //while next_read_hits
4178 }
4179 }
4180
4181 void pair_covered_sites(ReadTable& it,
4182 RefSequenceTable& rt,
4183 vector<string>& segmap_fnames,
4184 std::set<Junction, skip_count_lt>& cov_juncs,
4185 map<uint32_t, vector<bool> >& coverage_map,
4186 size_t half_splice_mer_len)
4187 {
4188 vector<RefSeg> expected_look_left_windows;
4189 vector<RefSeg> expected_look_right_windows;
4190 build_coverage_map(it,rt, segmap_fnames, coverage_map);
4191
4192 static const int extend = 45;
4193 int num_islands = 0;
4194
4195 vector<RefSeg> expected_don_acc_windows;
4196
4197 fprintf(stderr, "Recording coverage islands\n");
4198 size_t cov_bases = 0;
4199 for (map<uint32_t, vector<bool> >::iterator itr = coverage_map.begin();
4200 itr != coverage_map.end();
4201 ++itr)
4202 {
4203 vector<bool>& cov = itr->second;
4204
4205 size_t island_left_edge = 0;
4206 for (size_t c = 1; c < cov.size(); ++c)
4207 {
4208 if (cov[c])
4209 {
4210 cov_bases++;
4211 if (!cov[c - 1])
4212 {
4213 num_islands += 1;
4214 int edge = (int)c - extend;
4215 edge = max(edge, 0);
4216 island_left_edge = edge;
4217 }
4218 }
4219 else
4220 {
4221 if (cov[c - 1])
4222 {
4223 expected_don_acc_windows.push_back(RefSeg(itr->first,
4224 POINT_DIR_LEFT,
4225 false, /* not important */
4226 READ_DONTCARE,
4227 island_left_edge,
4228 c + extend));
4229 expected_don_acc_windows.push_back(RefSeg(itr->first,
4230 POINT_DIR_RIGHT,
4231 false, /* not important */
4232 READ_DONTCARE,
4233 island_left_edge,
4234 c + extend));
4235 }
4236 }
4237 }
4238 }
4239 fprintf(stderr, "Found %d islands covering %ld bases\n", num_islands, (long int)cov_bases);
4240
4241 juncs_from_ref_segs<RecordButterflyJuncs>(rt,
4242 expected_don_acc_windows,
4243 cov_juncs,
4244 "GT",
4245 "AG",
4246 max_coverage_intron_length,
4247 min_coverage_intron_length,
4248 max_cov_juncs,
4249 true,
4250 half_splice_mer_len);
4251 fprintf(stderr, "Found %ld potential intra-island junctions\n", (long int)cov_juncs.size());
4252 }
4253
4254 // daehwan
4255 struct ReadInfo
4256 {
4257 ReadID read_id;
4258 uint32_t left;
4259 uint32_t right;
4260
4261 bool operator<(const ReadInfo& rhs) const
4262 {
4263 if (left != rhs.left)
4264 return left < rhs.left;
4265 if (right != rhs.right)
4266 return right < rhs.right;
4267
4268 return false;
4269 }
4270 };
4271
4272 void capture_island_ends(ReadTable& it,
4273 RefSequenceTable& rt,
4274 vector<string>& segmap_fnames,
4275 std::set<Junction, skip_count_lt>& cov_juncs,
4276 map<uint32_t, vector<bool> >& coverage_map,
4277 size_t half_splice_mer_len)
4278 {
4279 //static int island_repeat_tolerance = 10;
4280 vector<RefSeg> expected_look_left_windows;
4281 vector<RefSeg> expected_look_right_windows;
4282
4283 // daehwan
4284 //#define DEBUG_CHECK_EXONS 1
4285 //#define DEBUG_RANGE_ONLY 1
4286
4287 #ifndef DEBUG_CHECK_EXONS
4288 build_coverage_map(it, rt, segmap_fnames, coverage_map);
4289 #else
4290 //build coverage map here, so we can debug it
4291 #ifdef DEBUG_RANGE_ONLY
4292 static const uint32_t chr14_id = rt.get_id("chr14");
4293 #endif
4294 vector<ReadInfo> hits;
4295 BAMHitFactory hit_factory(it,rt);
4296
4297 for (size_t f = 0; f < seg_files.size(); ++f)
4298 {
4299 fprintf(stderr, "Adding hits from segment file %d to coverage map\n", (int)f);
4300 seg_files[f]->rewind();
4301 FILE* fp = seg_files[f]->file;
4302 //rewind(fp);
4303 HitStream hs(fp, &hit_factory, false, false, false);
4304
4305 HitsForRead hit_group;
4306 while (hs.next_read_hits(hit_group))
4307 {
4308 for (size_t h = 0; h < hit_group.hits.size(); ++h)
4309 {
4310 BowtieHit& bh = hit_group.hits[h];
4311 // daehwan
4312 //if (check_exons) <-- DEBUG_CHECK_EXONS
4313 #ifdef DEBUG_RANGE_ONLY
4314 if (bh.ref_id() != chr14_id)
4315 continue;
4316
4317 // if (bh.left() < 66567028 && bh.right() > 66604392)
4318 if (bh.left() < 66400000 || bh.right() > 66700000)
4319 continue;
4320
4321 ReadInfo read_info;
4322 read_info.read_id = bh.insert_id();
4323 read_info.left = bh.left();
4324 read_info.right = bh.right();
4325 hits.push_back(read_info);
4326 #endif
4327
4328 pair<map<uint32_t, vector<bool> >::iterator, bool> ret =
4329 coverage_map.insert(make_pair(bh.ref_id(), vector<bool>()));
4330 vector<bool>& ref_cov = ret.first->second;
4331
4332 size_t right_extent = bh.right();
4333
4334 if (right_extent >= ref_cov.size())
4335 {
4336 ref_cov.resize(right_extent + 1, 0);
4337 }
4338 for (uint32_t c = (uint32_t)bh.left(); c < (uint32_t)bh.right(); ++c)
4339 {
4340 ref_cov[c] = true;
4341 }
4342 }
4343 }
4344 }
4345
4346 sort(hits.begin(), hits.end());
4347 #endif
4348 // static const int min_cov_length = segment_length + 2;
4349 long covered_bases = 0;
4350 int long_enough_bases = 0;
4351 int left_looking = 0;
4352 int right_looking = 0;
4353 static const int extend = 45;
4354 static const int repeat_tol = 5;
4355
4356 int num_islands = 0;
4357
4358 for (map<uint32_t, vector<bool> >::iterator itr = coverage_map.begin();
4359 itr != coverage_map.end();
4360 ++itr)
4361 {
4362 #ifdef B_DEBUG
4363 fprintf (stderr, "Finding pairings in ref seq %s\n", rt.get_name(itr->first));
4364 #endif
4365 vector<bool>& cov = itr->second;
4366 vector<bool> long_enough(cov.size());
4367
4368 size_t last_uncovered = 0;
4369
4370 static const uint8_t min_cov = 1;
4371
4372 for (size_t c = 1; c < cov.size(); ++c)
4373 {
4374 uint8_t c_cov = cov[c]; //get_cov(cov, c);
4375 if (c_cov < min_cov || c == (cov.size()) - 1)
4376 {
4377 int putative_exon_length = (int)c - (int)last_uncovered;
4378 uint32_t last_pos_cov = cov[c - 1]; //get_cov(cov,c - 1);
4379 if (last_pos_cov >= min_cov && putative_exon_length >= min_cov_length)
4380 {
4381 #ifdef B_DEBUG
4382 fprintf(stderr, "cov. island: %d-%d\n", (int)(last_uncovered + 1), (int)c);
4383 fprintf(stderr, "\t(putative exon length = %d, min_cov_length=%d)\n",putative_exon_length, min_cov_length);
4384 #endif
4385 covered_bases += (c + 1 - last_uncovered);
4386 for (int l = (int)c; l > (int)last_uncovered; --l)
4387 {
4388 long_enough[l] = true;
4389 }
4390 }
4391 last_uncovered = c;
4392 }
4393 }
4394 vector<bool>& ref_cov = long_enough;
4395 vector<uint8_t> cov_state(ref_cov.size(), UNCOVERED);
4396
4397 // daehwan - print islands (exons)
4398 //if (check_exons)
4399 #ifdef DEBUG_CHECK_EXONS
4400 //{
4401 uint32_t left = 0, right = 0;
4402 for (size_t c = 1; c < ref_cov.size(); ++c)
4403 {
4404 if (ref_cov[c] && !ref_cov[c-1])
4405 {
4406 left = c;
4407 cout << "Exon: " << left << "-";
4408 }
4409 else if (!ref_cov[c] && ref_cov[c-1])
4410 {
4411 right = c - 1;
4412 cout << right << endl;
4413 for (size_t k = 0; k < hits.size(); ++k)
4414 {
4415 const ReadInfo& hit = hits[k];
4416 if (hit.right < left)
4417 continue;
4418
4419 if (hit.left > right)
4420 break;
4421
4422 cout << "\t" << hit.read_id << " " << hit.left << "-" << hit.right << endl;
4423 }
4424 }
4425 }
4426 //}
4427 #endif
4428 for (size_t c = 1; c < ref_cov.size(); ++c)
4429 {
4430 if (ref_cov[c])
4431 {
4432 long_enough_bases++;
4433 if (!ref_cov[c - 1])
4434 {
4435 num_islands += 1;
4436
4437 for (int r = (int)c - extend;
4438 r >= 0 && r < (int)c + repeat_tol && r < (int)cov_state.size();
4439 ++r)
4440 {
4441 cov_state[r] |= LOOK_LEFT;
4442 }
4443 }
4444 }
4445 else
4446 {
4447 if (ref_cov[c - 1])
4448 {
4449 for (int l = (int)c - repeat_tol;
4450 l >= 0 && l < (int)c + extend && l < (int)cov_state.size();
4451 ++l)
4452 {
4453 cov_state[l] |= LOOK_RIGHT;
4454 }
4455 }
4456 }
4457 }
4458
4459 RefSeg* curr_look_left = NULL;
4460 RefSeg* curr_look_right = NULL;
4461
4462 for (size_t c = 1; c < cov_state.size(); ++c)
4463 {
4464 if (cov_state[c] & LOOK_LEFT)
4465 {
4466 left_looking++;
4467 if (!(cov_state[c-1] & LOOK_LEFT))
4468 {
4469 expected_look_left_windows.push_back(RefSeg(itr->first,
4470 POINT_DIR_LEFT,
4471 false, /* not important */
4472 READ_DONTCARE,
4473 c,
4474 c + 1));
4475 curr_look_left = &(expected_look_left_windows.back());
4476 }
4477 else if (curr_look_left)
4478 {
4479 curr_look_left->right++;
4480 }
4481 }
4482 else
4483 {
4484 if ((cov_state[c-1] & LOOK_LEFT))
4485 {
4486 curr_look_left = NULL;
4487 }
4488 }
4489
4490 if (cov_state[c] & LOOK_RIGHT)
4491 {
4492 right_looking++;
4493 if (!(cov_state[c-1] & LOOK_RIGHT))
4494 {
4495 expected_look_right_windows.push_back(RefSeg(itr->first,
4496 POINT_DIR_RIGHT,
4497 false, /* not important */
4498 READ_DONTCARE,
4499 c,
4500 c + 1));
4501
4502 curr_look_right = &(expected_look_right_windows.back());
4503 }
4504 else if (curr_look_right)
4505 {
4506 curr_look_right->right++;
4507 }
4508 }
4509 else
4510 {
4511 if ((cov_state[c-1] & LOOK_RIGHT))
4512 {
4513 curr_look_right = NULL;
4514 }
4515 }
4516 }
4517 }
4518
4519 fprintf(stderr, " Map covers %ld bases\n", covered_bases);
4520 fprintf(stderr, " Map covers %d bases in sufficiently long segments\n", long_enough_bases);
4521 fprintf(stderr, " Map contains %d good islands\n", num_islands + 1);
4522 fprintf(stderr, " %d are left looking bases\n", left_looking);
4523 fprintf(stderr, " %d are right looking bases\n", right_looking);
4524
4525 vector<RefSeg> expected_don_acc_windows;
4526 expected_don_acc_windows.insert(expected_don_acc_windows.end(),
4527 expected_look_right_windows.begin(),
4528 expected_look_right_windows.end());
4529
4530 expected_don_acc_windows.insert(expected_don_acc_windows.end(),
4531 expected_look_left_windows.begin(),
4532 expected_look_left_windows.end());
4533
4534 if (!butterfly_search) coverage_map.clear(); //free some memory
4535 juncs_from_ref_segs<RecordExtendableJuncs>(rt,
4536 expected_don_acc_windows,
4537 cov_juncs,
4538 "GT",
4539 "AG",
4540 max_coverage_intron_length,
4541 min_coverage_intron_length,
4542 max_cov_juncs,
4543 true,
4544 half_splice_mer_len);
4545 //fprintf(stderr, "Found %ld potential island-end pairing junctions\n", (long int)cov_juncs.size());
4546 }
4547
4548 void print_juncs(RefSequenceTable& rt, std::set<Junction, skip_count_lt>& juncs, const char* str)
4549 {
4550 fprintf (stderr, "-- %s --\n", str);
4551 for(std::set<Junction>::iterator itr = juncs.begin();
4552 itr != juncs.end();
4553 ++itr)
4554 {
4555 const char* ref_name = rt.get_name(itr->refid);
4556
4557 fprintf(stderr, "%s\t%d\t%d\t%c\n",
4558 ref_name,
4559 itr->left,
4560 itr->right,
4561 itr->antisense ? '-' : '+');
4562 }
4563 fprintf (stderr, "-- done --\n");
4564 }
4565
4566 struct SegmentSearchWorker
4567 {
4568 void operator()()
4569 {
4570 ReadTable it;
4571
4572 ReadStream readstream(reads_fname);
4573 ReadStream readstream_for_segment_search(reads_fname);
4574 ReadStream readstream_for_indel_discovery(reads_fname);
4575 ReadStream readstream_for_fusion_discovery(reads_fname);
4576
4577 if (readstream.file() == NULL ||
4578 readstream_for_segment_search.file() == NULL ||
4579 readstream_for_indel_discovery.file() == NULL ||
4580 readstream_for_fusion_discovery.file() == NULL)
4581 {
4582 fprintf(stderr, "Error: cannot open %s for reading\n",
4583 reads_fname.c_str());
4584 exit(1);
4585 }
4586
4587 if (read_offset > 0)
4588 {
4589 readstream.seek(read_offset);
4590 readstream_for_segment_search.seek(read_offset);
4591 readstream_for_indel_discovery.seek(read_offset);
4592 readstream_for_fusion_discovery.seek(read_offset);
4593 }
4594
4595 vector<BAMHitFactory*> hit_factories;
4596 hit_factories.push_back(new BAMHitFactory(it, *rt));
4597 HitStream partner_hit_stream_for_segment_search(partner_reads_map_fname, hit_factories.back(), false, false, false);
4598 hit_factories.push_back(new BAMHitFactory(it, *rt));
4599 HitStream partner_hit_stream_for_fusion_discovery(partner_reads_map_fname, hit_factories.back(), false, false, false);
4600 if (partner_hit_offset > 0)
4601 {
4602 partner_hit_stream_for_segment_search.seek(partner_hit_offset);
4603 partner_hit_stream_for_fusion_discovery.seek(partner_hit_offset);
4604 }
4605
4606 hit_factories.push_back(new BAMHitFactory(it, *rt));
4607 HitStream seg_partner_hit_stream_for_segment_search(seg_partner_reads_map_fname, hit_factories.back(), false, false, false);
4608 hit_factories.push_back(new BAMHitFactory(it, *rt));
4609 HitStream seg_partner_hit_stream_for_fusion_discovery(seg_partner_reads_map_fname, hit_factories.back(), false, false, false);
4610 if (seg_partner_hit_offset > 0)
4611 {
4612 seg_partner_hit_stream_for_segment_search.seek(seg_partner_hit_offset);
4613 seg_partner_hit_stream_for_fusion_discovery.seek(seg_partner_hit_offset);
4614 }
4615
4616 vector<HitStream> hit_streams;
4617 for (size_t i = 0; i < segmap_fnames->size(); ++i)
4618 {
4619 hit_factories.push_back(new BAMHitFactory(it, *rt));
4620 HitStream hs((*segmap_fnames)[i], hit_factories.back(), false, false, false);
4621 if (seg_offsets[i] > 0)
4622 hs.seek(seg_offsets[i]);
4623
4624 hit_streams.push_back(hs);
4625 }
4626
4627 int num_group = 0;
4628 uint64_t read_id = 0, last_read_id = 0;
4629 while ((read_id = process_next_hit_group(*rt,
4630 readstream,
4631 readstream_for_segment_search,
4632 readstream_for_indel_discovery,
4633 readstream_for_fusion_discovery,
4634 it,
4635 hit_streams,
4636 hit_streams.size() - 1,
4637 partner_hit_stream_for_segment_search,
4638 seg_partner_hit_stream_for_segment_search,
4639 partner_hit_stream_for_fusion_discovery,
4640 seg_partner_hit_stream_for_fusion_discovery,
4641 *juncs, *deletions, *insertions, *fusions,
4642 read,
4643 begin_id, end_id)) != 0)
4644 {
4645 num_group++;
4646 //if (num_group % 500000 == 0)
4647 // fprintf(stderr, "\tProcessed %d root segment groups\n", num_group);
4648
4649 last_read_id = read_id;
4650 }
4651
4652 // "microaligned_segs" is not protected against multi-threading
4653 // fprintf(stderr, "Microaligned %d segments\n", microaligned_segs);
4654
4655 for (size_t i = 0; i < hit_factories.size(); ++i)
4656 delete hit_factories[i];
4657
4658 hit_factories.clear();
4659 }
4660
4661 RefSequenceTable* rt;
4662 string reads_fname;
4663 vector<string>* segmap_fnames;
4664 string partner_reads_map_fname;
4665 string seg_partner_reads_map_fname;
4666 std::set<Junction, skip_count_lt>* juncs;
4667 std::set<Deletion>* deletions;
4668 std::set<Insertion>* insertions;
4669 FusionSimpleSet* fusions;
4670 eREAD read;
4671
4672 uint64_t begin_id;
4673 uint64_t end_id;
4674 int64_t read_offset;
4675 vector<int64_t> seg_offsets;
4676
4677 int64_t partner_hit_offset;
4678 int64_t seg_partner_hit_offset;
4679 };
4680
4681 struct SpliceJunctionCoord
4682 {
4683 uint32_t refid;
4684 int coord;
4685
4686 SpliceJunctionCoord(uint32_t r, int c) :
4687 refid(r),
4688 coord(c)
4689 {}
4690
4691 bool operator< (const SpliceJunctionCoord& r) const
4692 {
4693 if (refid < r.refid)
4694 return true;
4695 else if (refid == r.refid && coord < r.coord)
4696 return true;
4697 else
4698 return false;
4699 }
4700 };
4701
4702 void driver(istream& ref_stream,
4703 FILE* juncs_out,
4704 FILE* insertions_out,
4705 FILE* deletions_out,
4706 FILE* fusions_out,
4707 string& left_reads_fname,
4708 string& left_reads_map_fname,
4709 vector<string>& left_segmap_fnames,
4710 string& right_reads_fname,
4711 string& right_reads_map_fname,
4712 vector<string>& right_segmap_fnames)
4713 {
4714 if (!parallel)
4715 num_threads = 1;
4716
4717 // turn off parallelization in case of the following search methods
4718 if (!no_coverage_search || !no_microexon_search || butterfly_search)
4719 num_threads = 1;
4720
4721 assert (num_threads > 0);
4722 if (left_segmap_fnames.size() == 0)
4723 {
4724 fprintf(stderr, "No hits to process, exiting\n");
4725 exit(0);
4726 }
4727
4728 std::set<Junction, skip_count_lt> vseg_juncs[num_threads];
4729 std::set<Junction, skip_count_lt> cov_juncs;
4730 std::set<Junction, skip_count_lt> butterfly_juncs;
4731
4732 std::set<Junction> juncs;
4733
4734 std::set<Deletion> vdeletions[num_threads];
4735 std::set<Insertion> vinsertions[num_threads];
4736 FusionSimpleSet vfusions[num_threads];
4737
4738 RefSequenceTable rt(sam_header, true);
4739
4740 fprintf (stderr, "Loading reference sequences...\n");
4741 get_seqs(ref_stream, rt, true);
4742
4743 string left_seg_fname_for_segment_search = left_segmap_fnames.back();
4744 string right_seg_fname_for_segment_search;
4745 if (right_segmap_fnames.size() > 0)
4746 right_seg_fname_for_segment_search = right_segmap_fnames.back();
4747
4748 fprintf(stderr, ">> Performing segment-search:\n");
4749
4750 if (left_segmap_fnames.size() > 1)
4751 {
4752 fprintf( stderr, "Loading left segment hits...\n");
4753
4754 vector<uint64_t> read_ids;
4755 vector<vector<int64_t> > offsets;
4756 vector<int64_t> partner_offsets;
4757 vector<int64_t> seg_partner_offsets;
4758 if (num_threads > 1)
4759 {
4760 vector<string> fnames;
4761 fnames.push_back(left_reads_fname);
4762 fnames.insert(fnames.end(), left_segmap_fnames.begin(), left_segmap_fnames.end());
4763 bool enough_data = calculate_offsets(fnames, read_ids, offsets);
4764 if (!enough_data)
4765 num_threads = 1;
4766
4767 if (enough_data && right_reads_map_fname != "")
4768 calculate_offsets_from_ids(right_reads_map_fname, read_ids, partner_offsets);
4769
4770 if (enough_data && right_seg_fname_for_segment_search != "")
4771 calculate_offsets_from_ids(right_seg_fname_for_segment_search, read_ids, seg_partner_offsets);
4772 }
4773
4774 vector<boost::thread*> threads;
4775 for (int i = 0; i < num_threads; ++i)
4776 {
4777 SegmentSearchWorker worker;
4778 worker.rt = &rt;
4779 worker.reads_fname = left_reads_fname;
4780 worker.segmap_fnames = &left_segmap_fnames;
4781 worker.partner_reads_map_fname = right_reads_map_fname;
4782 worker.seg_partner_reads_map_fname = right_seg_fname_for_segment_search;
4783 worker.juncs = &vseg_juncs[i];
4784 worker.deletions = &vdeletions[i];
4785 worker.insertions = &vinsertions[i];
4786 worker.fusions = &vfusions[i];
4787 worker.read = READ_LEFT;
4788 worker.partner_hit_offset = 0;
4789 worker.seg_partner_hit_offset = 0;
4790
4791 if (i == 0)
4792 {
4793 worker.begin_id = 0;
4794 worker.seg_offsets = vector<int64_t>(left_segmap_fnames.size(), 0);
4795 worker.read_offset = 0;
4796 }
4797 else
4798 {
4799 worker.begin_id = read_ids[i-1];
4800 worker.seg_offsets.insert(worker.seg_offsets.end(), offsets[i-1].begin()+1, offsets[i-1].end());
4801 worker.read_offset = offsets[i-1][0];
4802 if (partner_offsets.size() > 0)
4803 worker.partner_hit_offset = partner_offsets[i-1];
4804 if (seg_partner_offsets.size() > 0)
4805 worker.seg_partner_hit_offset = seg_partner_offsets[i-1];
4806 }
4807
4808 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
4809
4810 if (num_threads > 1)
4811 threads.push_back(new boost::thread(worker));
4812 else
4813 worker();
4814 }
4815
4816 for (size_t i = 0; i < threads.size(); ++i)
4817 {
4818 threads[i]->join();
4819 delete threads[i];
4820 threads[i] = NULL;
4821 }
4822 threads.clear();
4823
4824 fprintf( stderr, "done.\n");
4825 }
4826
4827 if (right_segmap_fnames.size() > 1)
4828 {
4829 fprintf( stderr, "Loading right segment hits...\n");
4830
4831 vector<uint64_t> read_ids;
4832 vector<vector<int64_t> > offsets;
4833 vector<int64_t> partner_offsets;
4834 vector<int64_t> seg_partner_offsets;
4835 if (num_threads > 1)
4836 {
4837 vector<string> fnames;
4838 fnames.push_back(right_reads_fname);
4839 fnames.insert(fnames.end(), right_segmap_fnames.begin(), right_segmap_fnames.end());
4840 bool enough_data = calculate_offsets(fnames, read_ids, offsets);
4841 if (!enough_data)
4842 num_threads = 1;
4843
4844 if (enough_data)
4845 calculate_offsets_from_ids(left_reads_map_fname, read_ids, partner_offsets);
4846
4847 if (enough_data)
4848 calculate_offsets_from_ids(left_seg_fname_for_segment_search, read_ids, seg_partner_offsets);
4849 }
4850
4851 vector<boost::thread*> threads;
4852 for (int i = 0; i < num_threads; ++i)
4853 {
4854 SegmentSearchWorker worker;
4855 worker.rt = &rt;
4856 worker.reads_fname = right_reads_fname;
4857 worker.segmap_fnames = &right_segmap_fnames;
4858 worker.partner_reads_map_fname = left_reads_map_fname;
4859 worker.seg_partner_reads_map_fname = left_seg_fname_for_segment_search;
4860 worker.juncs = &vseg_juncs[i];
4861 worker.deletions = &vdeletions[i];
4862 worker.insertions = &vinsertions[i];
4863 worker.fusions = &vfusions[i];
4864 worker.read = READ_RIGHT;
4865 worker.partner_hit_offset = 0;
4866 worker.seg_partner_hit_offset = 0;
4867
4868 if (i == 0)
4869 {
4870 worker.begin_id = 0;
4871 worker.seg_offsets = vector<int64_t>(left_segmap_fnames.size(), 0);
4872 worker.read_offset = 0;
4873 }
4874 else
4875 {
4876 worker.begin_id = read_ids[i-1];
4877 worker.seg_offsets.insert(worker.seg_offsets.end(), offsets[i-1].begin() + 1, offsets[i-1].end());
4878 worker.read_offset = offsets[i-1][0];
4879 if (partner_offsets.size() > 0)
4880 worker.partner_hit_offset = partner_offsets[i-1];
4881 if (seg_partner_offsets.size() > 0)
4882 worker.seg_partner_hit_offset = seg_partner_offsets[i-1];
4883 }
4884
4885 worker.end_id = (i+1 < num_threads) ? read_ids[i] : std::numeric_limits<uint64_t>::max();
4886
4887 if (num_threads > 1)
4888 threads.push_back(new boost::thread(worker));
4889 else
4890 worker();
4891 }
4892
4893 for (size_t i = 0; i < threads.size(); ++i)
4894 {
4895 threads[i]->join();
4896 delete threads[i];
4897 threads[i] = NULL;
4898 }
4899 threads.clear();
4900 fprintf( stderr, "done.\n");
4901 }
4902
4903 std::set<Junction, skip_count_lt> seg_juncs;
4904 std::set<Deletion> deletions;
4905 std::set<Insertion> insertions;
4906
4907 for (int i = 0; i < num_threads; ++i)
4908 {
4909 seg_juncs.insert(vseg_juncs[i].begin(), vseg_juncs[i].end());
4910 deletions.insert(vdeletions[i].begin(), vdeletions[i].end());
4911 insertions.insert(vinsertions[i].begin(), vinsertions[i].end());
4912 }
4913
4914 FusionSimpleSet fusions = vfusions[0];
4915 for (int i = 1; i < num_threads; ++i)
4916 {
4917 merge_with(fusions, vfusions[i]);
4918 }
4919
4920
4921 fprintf(stderr, "\tfound %ld potential split-segment junctions\n", (long int)seg_juncs.size());
4922 fprintf(stderr, "\tfound %ld potential small deletions\n", (long int)deletions.size());
4923 fprintf(stderr, "\tfound %ld potential small insertions\n", (long int)insertions.size());
4924
4925 vector<string> all_segmap_fnames;
4926 for (vector<string>::size_type i = 0; i != left_segmap_fnames.size();i++) {
4927 all_segmap_fnames.push_back( left_segmap_fnames[i] );
4928 }
4929 for (vector<string>::size_type i = 0; i != right_segmap_fnames.size();i++) {
4930 all_segmap_fnames.push_back( right_segmap_fnames[i] );
4931 }
4932
4933 #if 0
4934 // daehwan - check this out as Cole insists on using segments gives better results.
4935 vector<FZPipe*> all_map_files;
4936 if (left_seg_files.size() > 1)
4937 {
4938 all_map_files.push_back(&left_reads_map_file);
4939 }
4940
4941 if (right_seg_files.size() > 1)
4942 {
4943 all_map_files.push_back(&right_reads_map_file);
4944 }
4945
4946 copy(all_seg_files.begin(), all_seg_files.end(), back_inserter(all_map_files));
4947 #endif
4948
4949 ReadTable it;
4950 map<uint32_t, vector<bool> > coverage_map;
4951 if (!no_coverage_search || butterfly_search)
4952 {
4953 if (ium_reads != "")
4954 {
4955 vector<string> ium_read_files;
4956 tokenize(ium_reads,",", ium_read_files);
4957 vector<FZPipe> iums;
4958 string unzcmd=getUnpackCmd(ium_read_files[0],false); //could be BAM file
4959 for (size_t ium = 0; ium < ium_read_files.size(); ++ium)
4960 {
4961 //fprintf (stderr, "Indexing extensions in %s\n", ium_read_files[ium].c_str());
4962 FZPipe ium_file(ium_read_files[ium],unzcmd);
4963 if (ium_file.file==NULL)
4964 {
4965 fprintf (stderr, "Can't open file %s for reading, skipping...\n",ium_read_files[ium].c_str());
4966 continue;
4967 }
4968 iums.push_back(ium_file);
4969 }
4970
4971 index_read_mers(iums, 5);
4972 }
4973 else
4974 { //no unmapped reads
4975 no_coverage_search = true;
4976 butterfly_search = false;
4977 }
4978
4979 if (!no_coverage_search)
4980 { //coverage search
4981 // looking for junctions by island end pairings
4982 fprintf(stderr, ">> Performing coverage-search:\n");
4983 capture_island_ends(it,
4984 rt,
4985 all_segmap_fnames,
4986 cov_juncs,
4987 coverage_map,
4988 5);
4989 fprintf(stderr, "\tfound %d potential junctions\n",(int)cov_juncs.size());
4990 }
4991 } //coverage search or butterfly search
4992
4993 if (butterfly_search)
4994 {
4995 //looking for junctions between and within islands
4996 fprintf(stderr, ">> Performing butterfly-search: \n");
4997 prune_extension_table(butterfly_overhang);
4998 compact_extension_table();
4999 pair_covered_sites(it,
5000 rt,
5001 all_segmap_fnames,
5002 butterfly_juncs,
5003 coverage_map,
5004 5);
5005
5006 fprintf(stderr, "\tfound %d potential junctions\n",(int)butterfly_juncs.size());
5007 }
5008
5009 coverage_map.clear();
5010
5011 std::set<Junction, skip_count_lt> microexon_juncs;
5012 if (!no_microexon_search)
5013 {
5014 fprintf(stderr, ">> Performing microexon-search: \n");
5015 std::set<Junction, skip_count_lt> microexon_juncs;
5016 align_microexon_segs(rt,
5017 microexon_juncs,
5018 max_cov_juncs,
5019 5);
5020 fprintf(stderr, "\tfound %d potential junctions\n",(int)microexon_juncs.size());
5021 juncs.insert(microexon_juncs.begin(), microexon_juncs.end());
5022 }
5023
5024 juncs.insert(cov_juncs.begin(), cov_juncs.end());
5025 juncs.insert(seg_juncs.begin(), seg_juncs.end());
5026 juncs.insert(butterfly_juncs.begin(), butterfly_juncs.end());
5027
5028 //fprintf(stderr, "Reporting potential splice junctions...");
5029 vector<SpliceJunctionCoord> splice_junction_coords;
5030 for(std::set<Junction>::iterator itr = juncs.begin();
5031 itr != juncs.end();
5032 ++itr)
5033 {
5034 const char* ref_name = rt.get_name(itr->refid);
5035
5036 fprintf(juncs_out,
5037 "%s\t%d\t%d\t%c\n",
5038 ref_name,
5039 itr->left,
5040 itr->right,
5041 itr->antisense ? '-' : '+');
5042
5043 if (fusion_search)
5044 {
5045 splice_junction_coords.push_back(SpliceJunctionCoord(itr->refid, itr->left));
5046 splice_junction_coords.push_back(SpliceJunctionCoord(itr->refid, itr->right));
5047 }
5048 }
5049 //close all reading pipes, just to exit cleanly
5050 /*
5051 for (vector<FZPipe*>::size_type i = 0; i != all_segmap_fnames.size();i++) {
5052 all_segmap_fnames[i]->close();
5053 }
5054 */
5055 fprintf(stderr, "Reported %d total potential splices\n", (int)juncs.size());
5056 sort(splice_junction_coords.begin(), splice_junction_coords.end());
5057
5058 fprintf(stderr, "Reporting %u potential deletions...\n", deletions.size());
5059 if(deletions_out){
5060 for(std::set<Deletion>::iterator itr = deletions.begin(); itr != deletions.end(); ++itr){
5061 const char* ref_name = rt.get_name(itr->refid);
5062 /*
5063 * We fix up the left co-ordinate to reference the first deleted base
5064 */
5065 fprintf(deletions_out,
5066 "%s\t%d\t%d\n",
5067 ref_name,
5068 itr->left + 1,
5069 itr->right);
5070 }
5071 fclose(deletions_out);
5072 }else{
5073 fprintf(stderr, "Failed to open deletions file for writing, no deletions reported\n");
5074 }
5075
5076 fprintf(stderr, "Reporting %u potential insertions...\n", insertions.size());
5077 if(insertions_out){
5078 for(std::set<Insertion>::iterator itr = insertions.begin(); itr != insertions.end(); ++itr){
5079 const char* ref_name = rt.get_name(itr->refid);
5080 fprintf(insertions_out,
5081 "%s\t%d\t%d\t%s\n",
5082 ref_name,
5083 itr->left,
5084 itr->left,
5085 itr->sequence.c_str());
5086 }
5087 fclose(insertions_out);
5088 }else{
5089 fprintf(stderr, "Failed to open insertions file for writing, no insertions reported\n");
5090 }
5091 if (fusions_out)
5092 {
5093 // check if a fusion point coincides with splice junctions.
5094 for(FusionSimpleSet::iterator itr = fusions.begin(); itr != fusions.end(); ++itr)
5095 {
5096 const Fusion& fusion = itr->first;
5097 FusionSimpleStat& fusion_stat = itr->second;
5098
5099 bool found = binary_search(splice_junction_coords.begin(),
5100 splice_junction_coords.end(),
5101 SpliceJunctionCoord(fusion.refid1, fusion.left));
5102 if (found)
5103 fusion_stat.left_coincide_with_splice_junction = true;
5104
5105 found = binary_search(splice_junction_coords.begin(),
5106 splice_junction_coords.end(),
5107 SpliceJunctionCoord(fusion.refid2, fusion.right));
5108
5109 if (found)
5110 fusion_stat.right_coincide_with_splice_junction = true;
5111 }
5112
5113 for(FusionSimpleSet::iterator itr = fusions.begin(); itr != fusions.end(); ++itr)
5114 {
5115 const Fusion& fusion = itr->first;
5116 const FusionSimpleStat& fusion_stat = itr->second;
5117
5118 // compare the current fusion with the next fusion, pick up the better one.
5119 FusionSimpleSet::iterator next_itr = itr; ++next_itr;
5120 while (next_itr != fusions.end())
5121 {
5122 const Fusion& next_fusion = next_itr->first;
5123 const FusionSimpleStat& next_fusion_stat = next_itr->second;
5124
5125 uint32_t left_diff = abs((int)fusion.left - (int)next_fusion.left);
5126 if (fusion.refid1 == next_fusion.refid1 && fusion.refid2 == next_fusion.refid2 && left_diff < 10)
5127 {
5128 if (fusion.dir == next_fusion.dir && left_diff == abs((int)fusion.right - (int)next_fusion.right))
5129 {
5130 if (next_fusion_stat.count > fusion_stat.count)
5131 itr->second.skip = true;
5132 else if (next_fusion_stat.count == fusion_stat.count)
5133 {
5134 int curr_count = (int)fusion_stat.left_coincide_with_splice_junction + (int)fusion_stat.right_coincide_with_splice_junction;
5135 int next_count = (int)next_fusion_stat.left_coincide_with_splice_junction + (int)next_fusion_stat.right_coincide_with_splice_junction;
5136
5137 if (curr_count < next_count)
5138 itr->second.skip = true;
5139 else
5140 next_itr->second.skip = true;
5141 }
5142 else
5143 next_itr->second.skip = true;
5144 }
5145
5146 ++next_itr;
5147 }
5148 else
5149 break;
5150 }
5151
5152 if (itr->second.skip)
5153 continue;
5154
5155 const char* ref_name1 = rt.get_name(fusion.refid1);
5156 const char* ref_name2 = rt.get_name(fusion.refid2);
5157
5158 const char* dir = "";
5159 if (fusion.dir == FUSION_FR)
5160 dir = "fr";
5161 else if(fusion.dir == FUSION_RF)
5162 dir = "rf";
5163 else if(fusion.dir == FUSION_RR)
5164 dir = "rr";
5165 else
5166 dir = "ff";
5167
5168 fprintf(fusions_out,
5169 "%s\t%d\t%s\t%d\t%s\n",
5170 ref_name1,
5171 fusion.left,
5172 ref_name2,
5173 fusion.right,
5174 dir);
5175 }
5176 fclose(fusions_out);
5177 }
5178 fprintf(stderr, "Reporting potential fusions...\n");
5179 }
5180
5181 int main(int argc, char** argv)
5182 {
5183 fprintf(stderr, "segment_juncs v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
5184 fprintf(stderr, "---------------------------\n");
5185
5186 int parse_ret = parse_options(argc, argv, print_usage);
5187 if (parse_ret)
5188 return parse_ret;
5189
5190 if(optind >= argc)
5191 {
5192 print_usage();
5193 return 1;
5194 }
5195
5196 string ref_file_name = argv[optind++];
5197
5198 if(optind >= argc)
5199 {
5200 print_usage();
5201 return 1;
5202 }
5203
5204 string juncs_file_name = argv[optind++];
5205
5206 if(optind >= argc)
5207 {
5208 print_usage();
5209 return 1;
5210 }
5211
5212 string insertions_file_name = argv[optind++];
5213 if(optind >= argc)
5214 {
5215 print_usage();
5216 return 1;
5217 }
5218
5219 string deletions_file_name = argv[optind++];
5220 if(optind >= argc)
5221 {
5222 print_usage();
5223 return 1;
5224 }
5225
5226 string fusions_file_name = argv[optind++];
5227 if(optind >= argc)
5228 {
5229 print_usage();
5230 return 1;
5231 }
5232
5233 string left_reads_file_name = argv[optind++];
5234
5235 if(optind >= argc)
5236 {
5237 print_usage();
5238 return 1;
5239 }
5240
5241 string left_reads_map_file_name = argv[optind++];
5242
5243 if(optind >= argc)
5244 {
5245 print_usage();
5246 return 1;
5247 }
5248
5249 string left_segment_map_file_list = argv[optind++];
5250
5251 string right_segment_map_file_list;
5252 string right_reads_file_name;
5253 string right_reads_map_file_name;
5254 if (optind < argc)
5255 {
5256 right_reads_file_name = argv[optind++];
5257
5258 if(optind >= argc)
5259 {
5260 print_usage();
5261 return 1;
5262 }
5263
5264 right_reads_map_file_name = argv[optind++];
5265
5266 if(optind >= argc)
5267 {
5268 print_usage();
5269 return 1;
5270 }
5271 right_segment_map_file_list = argv[optind++];
5272 }
5273
5274 // Open the approppriate files
5275
5276 ifstream ref_stream(ref_file_name.c_str(), ifstream::in);
5277 if (!ref_stream.good())
5278 {
5279 fprintf(stderr, "Error: cannot open %s for reading\n",
5280 ref_file_name.c_str());
5281 exit(1);
5282 }
5283
5284
5285 FILE* juncs_file = fopen(juncs_file_name.c_str(), "w");
5286 if (!juncs_file)
5287 {
5288 fprintf(stderr, "Error: cannot open %s for writing\n",
5289 juncs_file_name.c_str());
5290 exit(1);
5291 }
5292
5293 FILE* insertions_file = fopen(insertions_file_name.c_str(), "w");
5294 if (!insertions_file)
5295 {
5296 fprintf(stderr, "Error: cannot open %s for writing\n",
5297 insertions_file_name.c_str());
5298 exit(1);
5299 }
5300
5301
5302 FILE* deletions_file = fopen(deletions_file_name.c_str(), "w");
5303 if (!deletions_file)
5304 {
5305 fprintf(stderr, "Error: cannot open %s for writing\n",
5306 deletions_file_name.c_str());
5307 exit(1);
5308 }
5309
5310 vector<string> left_segment_map_fnames;
5311 string left_segment_file_for_segment_search;
5312 tokenize(left_segment_map_file_list, ",",left_segment_map_fnames);
5313
5314 FILE* fusions_file = fopen(fusions_file_name.c_str(), "w");
5315 if (!fusions_file)
5316 {
5317 fprintf(stderr, "Error: cannot open %s for writing\n",
5318 fusions_file_name.c_str());
5319 exit(1);
5320 }
5321
5322 //FILE* left_reads_file = fopen(left_reads_file_name.c_str(), "r");
5323 //FILE* left_reads_file_for_indel_discovery = fopen(left_reads_file_name.c_str(),"r");
5324 string unzcmd=getUnpackCmd(left_reads_file_name, false);
5325 FZPipe left_reads_file(left_reads_file_name, unzcmd);
5326 FZPipe left_reads_file_for_segment_search(left_reads_file_name, unzcmd);
5327 FZPipe left_reads_file_for_indel_discovery(left_reads_file_name, unzcmd);
5328 FILE* left_reads_file_for_fusion_discovery = fopen(left_reads_file_name.c_str(),"r");
5329 if (left_reads_file.file==NULL || left_reads_file_for_segment_search.file==NULL ||
5330 left_reads_file_for_indel_discovery.file==NULL || left_reads_file_for_fusion_discovery==NULL)
5331 {
5332 fprintf(stderr, "Error: cannot open %s for reading\n",
5333 left_reads_file_name.c_str());
5334 exit(1);
5335 }
5336
5337 vector<string> right_segment_map_fnames;
5338 string right_segment_file_for_segment_search;
5339 if (right_segment_map_file_list != "")
5340 {
5341 tokenize(right_segment_map_file_list, ",", right_segment_map_fnames);
5342 }
5343
5344 // min_cov_length=20;
5345 if (min_cov_length>segment_length-2) min_cov_length=segment_length-2;
5346 driver(ref_stream,
5347 juncs_file,
5348 insertions_file,
5349 deletions_file,
5350 fusions_file,
5351 left_reads_file_name,
5352 left_reads_map_file_name,
5353 left_segment_map_fnames,
5354 right_reads_file_name,
5355 right_reads_map_file_name,
5356 right_segment_map_fnames);
5357
5358 return 0;
5359 }