ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/long_spanning_reads.cpp
Revision: 135
Committed: Mon Dec 12 22:28:38 2011 UTC (8 years, 5 months ago) by gpertea
File size: 47266 byte(s)
Log Message:
wip - SplicedSAMHitFactory() still not implemented

Line File contents
1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #endif
4
5 /*
6 * long_spanning_reads.cpp
7 * TopHat
8 *
9 * Created by Cole Trapnell on 2/5/09.
10 * Copyright 2009 Cole Trapnell. All rights reserved.
11 *
12 */
13
14 #include <cassert>
15 #include <cstdio>
16 #include <vector>
17 #include <string>
18 #include <map>
19 #include <algorithm>
20 #include <set>
21 #include <cstdlib>
22 #include <cstring>
23 #include <bitset>
24 //#include <stdexcept>
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28 #include <seqan/sequence.h>
29 #include <seqan/find.h>
30 #include <seqan/file.h>
31 #include <seqan/modifier.h>
32 #include <getopt.h>
33
34 #include "common.h"
35 #include "bwt_map.h"
36 #include "tokenize.h"
37 #include "segments.h"
38 #include "reads.h"
39
40 #include "junctions.h"
41 #include "insertions.h"
42 #include "deletions.h"
43
44 using namespace seqan;
45 using namespace std;
46
47 void print_usage()
48 {
49 fprintf(stderr, "Usage: long_spanning_reads <reads.fa/.fq> <possible_juncs1,...,possible_juncsN> <possible_insertions1,...,possible_insertionsN> <possible_deletions1,...,possible_deletionsN> <seg1.bwtout,...,segN.bwtout> [spliced_seg1.bwtout,...,spliced_segN.bwtout]\n");
50 }
51
52 bool key_lt(const pair<uint32_t, HitsForRead>& lhs,
53 const pair<uint32_t, HitsForRead>& rhs)
54 {
55 return lhs.first < rhs.first;
56 }
57
58 void get_seqs(istream& ref_stream,
59 RefSequenceTable& rt,
60 bool keep_seqs = true,
61 bool strip_slash = false)
62 {
63 while(ref_stream.good() &&
64 !ref_stream.eof())
65 {
66 RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
67 string name;
68 readMeta(ref_stream, name, Fasta());
69 string::size_type space_pos = name.find_first_of(" \t\r");
70 if (space_pos != string::npos)
71 {
72 name.resize(space_pos);
73 }
74 seqan::read(ref_stream, *ref_str, Fasta());
75
76 rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
77 }
78 }
79
80
81 void look_right_for_hit_group(ReadTable& unmapped_reads,
82 vector<HitStream>& contig_hits,
83 size_t curr_file,
84 vector<HitStream>& spliced_hits,
85 const HitsForRead& targets,
86 vector<HitsForRead>& seg_hits_for_read)
87 {
88 int right_file = curr_file + 1;
89
90 HitStream& right = contig_hits[right_file];
91 uint64_t curr_next_group_id = targets.insert_id;
92 int curr_order = unmapped_reads.observation_order(curr_next_group_id);
93
94 assert (curr_order != -1);
95 while(true)
96 {
97 HitsForRead hit_group;
98 uint64_t right_next_group_id = right.next_group_id();
99 int right_order = unmapped_reads.observation_order(right_next_group_id);
100
101 // If we would have seen the hits by now, bail out.
102 if (curr_order < right_order || right_order == -1)
103 {
104 break;
105 }
106 if (right.next_read_hits(hit_group))
107 {
108 if (hit_group.insert_id == targets.insert_id)
109 {
110 // Some of the targets may be missing, we need to
111 // process them individually
112 seg_hits_for_read[right_file] = hit_group;
113 break;
114 }
115 }
116 }
117
118 HitsForRead& curr_seg_hits = seg_hits_for_read[right_file];
119
120 if (right_file < (int)spliced_hits.size() && right_file >= 0)
121 {
122 // Scan forward in the spliced hits file for this hit group
123 HitsForRead spliced_group;
124 HitsForRead curr_spliced_group;
125 while (spliced_hits[right_file].next_group_id() > 0 &&
126 spliced_hits[right_file].next_group_id() <= (uint32_t)curr_order)
127 {
128 spliced_hits[right_file].next_read_hits(curr_spliced_group);
129
130 if (curr_spliced_group.insert_id == (uint32_t)curr_order)
131 {
132 spliced_group = curr_spliced_group;
133 break;
134 }
135 }
136
137 if (!spliced_group.hits.empty())
138 {
139 curr_seg_hits.insert_id = spliced_group.insert_id;
140 curr_seg_hits.hits.insert(curr_seg_hits.hits.end(),
141 spliced_group.hits.begin(),
142 spliced_group.hits.end());
143 }
144 }
145
146 if (curr_seg_hits.hits.empty())
147 return;
148 else if (right_file + 1 < (int)contig_hits.size())
149 {
150 look_right_for_hit_group(unmapped_reads,
151 contig_hits,
152 curr_file + 1,
153 spliced_hits,
154 curr_seg_hits,
155 seg_hits_for_read);
156 }
157 }
158
159 BowtieHit merge_chain(RefSequenceTable& rt,
160 const string& read_seq,
161 const string& read_quals,
162 std::set<Junction>& possible_juncs,
163 std::set<Insertion>& possible_insertions,
164 list<BowtieHit>& hit_chain)
165 {
166 bool antisense = hit_chain.front().antisense_align();
167 uint32_t reference_id = hit_chain.front().ref_id();
168 uint64_t insert_id = hit_chain.front().insert_id();
169
170 int left = hit_chain.front().left();
171
172 list<BowtieHit>::iterator prev_hit = hit_chain.begin();
173 list<BowtieHit>::iterator curr_hit = ++(hit_chain.begin());
174
175 string seq;
176 string qual;
177 int old_read_length = 0;
178 int first_seg_length = hit_chain.front().seq().length();
179 for (list<BowtieHit>::iterator i = hit_chain.begin(); i != hit_chain.end(); ++i)
180 {
181 seq += i->seq();
182 qual += i->qual();
183 old_read_length += i->read_len();
184 }
185
186 string rev_read_seq, rev_read_quals;
187 if (color && antisense)
188 {
189 rev_read_seq = read_seq;
190 reverse(rev_read_seq.begin() + 1, rev_read_seq.end());
191
192 rev_read_quals = read_quals;
193 reverse(rev_read_quals.begin(), rev_read_quals.end());
194 }
195
196 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
197 {
198 /*
199 * Note that the gap size may be negative, since left() and right() return
200 * signed integers, this will be OK.
201 */
202 int gap = curr_hit->left() - prev_hit->right();
203 if (gap < -(int)max_insertion_length ||
204 (gap > (int)max_deletion_length &&
205 (gap < min_report_intron_length || gap > max_report_intron_length)))
206 {
207 return BowtieHit();
208 }
209
210 ++prev_hit;
211 ++curr_hit;
212 }
213
214 prev_hit = hit_chain.begin();
215 curr_hit = ++(hit_chain.begin());
216
217 RefSequenceTable::Sequence* ref_str = rt.get_seq(prev_hit->ref_id());
218 if (!ref_str)
219 return BowtieHit();
220
221 int curr_seg_index = 1;
222 while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
223 {
224 /*
225 * This code is assuming that the cigar strings end and start with a match
226 * While segment alignments can actually end with a junction, insertion or deletion, the hope
227 * is that in those cases, the right and left ends of the alignments will correctly
228 * line up, so we won't get to this bit of code
229 */
230 if (prev_hit->cigar().back().opcode != MATCH || curr_hit->cigar().front().opcode != MATCH)
231 {
232 return BowtieHit();
233 }
234
235 if (prev_hit->is_spliced() && curr_hit->is_spliced() && prev_hit->antisense_splice() != curr_hit->antisense_splice())
236 {
237 /*
238 * There is no way that we can splice these together into a valid
239 * alignment
240 */
241 return BowtieHit();
242 }
243
244 bool found_closure = false;
245 /*
246 * Note that antisense_splice is the same for prev and curr
247 */
248 bool antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
249 vector<CigarOp> new_cigar;
250 int new_left = -1;
251 int mismatch = 0;
252
253 /*
254 * Take the length of matched bases in each segment into consideration for closures,
255 * this can be a problem for reads of variable lengths.
256 */
257 int prev_right_end_match_length = prev_hit->cigar().back().length;
258 int curr_left_end_match_length = curr_hit->cigar().front().length;
259
260 if (prev_hit->right() > curr_hit->left())
261 {
262 std::set<Insertion>::iterator lb, ub;
263 /*
264 * Note, this offset is determined by the min-anchor length supplied to
265 * juncs_db, which is currently hard-coded at 3 in tophat.py
266 * as this value determines what sort of read segments
267 * should be able to align directly to the splice sequences
268 */
269 int left_boundary = prev_hit->right() - 4;
270 int right_boundary = curr_hit->left() + 4;
271
272 /*
273 * Create a dummy sequence to represent the maximum possible insertion
274 */
275 std::string maxInsertedSequence = "";
276 maxInsertedSequence.resize(max_insertion_length,'A');
277
278 lb = possible_insertions.upper_bound(Insertion(reference_id, left_boundary, ""));
279 ub = possible_insertions.upper_bound(Insertion(reference_id, right_boundary, maxInsertedSequence));
280
281 int reference_mismatch = 0;
282
283 while (lb != ub && lb != possible_insertions.end())
284 {
285 /*
286 * In the following code, we will check to make sure that the segments have the proper
287 * separation and sequence for the insertions, and generate the appropriate merged bowtie hit
288 * In general, reads with insertions must match the inserted sequence exactly.
289 */
290 if (((int)lb->sequence.size()) == (prev_hit->right() - curr_hit->left()))
291 {
292 /*
293 * Check we have enough matched bases on prev or curr segment.
294 */
295 int insert_to_prev_right = prev_hit->right() - lb->left - 1;
296 int curr_left_to_insert = lb->left - curr_hit->left() + 1;
297 if (insert_to_prev_right > prev_right_end_match_length || curr_left_to_insert > curr_left_end_match_length)
298 {
299 ++lb;
300 continue;
301 }
302
303 /*
304 * Keep track of how many mismatches were made to the genome in the region
305 * where we should actually be matching against the insertion
306 */
307 int this_reference_mismatch = 0;
308 int insertion_mismatch = 0;
309 int insertion_len = lb->sequence.length();
310 const seqan::Dna5String insertionSequence = seqan::Dna5String(lb->sequence);
311
312 /*
313 * First check to see if we need to adjust number of observed errors for the left (prev)
314 * hit. This is only the case if this segment runs into the insertion. To be consistent
315 * with bwt_map.cpp, we will not allow a segment to have errors in the insertion region
316 */
317 string colorSegmentSequence_prev;
318 if (insert_to_prev_right > 0)
319 {
320 const seqan::Dna5String referenceSequence = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
321 const seqan::Dna5String oldSegmentSequence = seqan::Dna5String(prev_hit->seq().substr(prev_hit->seq().length() - insert_to_prev_right));
322
323 if (color)
324 {
325 string color;
326
327 if (antisense)
328 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - insert_to_prev_right - 2, insert_to_prev_right + 1);
329 else
330 color = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right + 1);
331
332 color[0] = prev_hit->seq()[segment_length - insert_to_prev_right - 1];
333 colorSegmentSequence_prev = convert_color_to_bp(color);
334 }
335
336 const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_prev : oldSegmentSequence;
337
338 /*
339 * Scan right in the read until we run out of read
340 */
341 for (int read_index = 0; read_index < insert_to_prev_right; ++read_index)
342 {
343 /*
344 * Any mismatch to the insertion is a failure
345 */
346 if (referenceSequence[read_index] == 'N' || referenceSequence[read_index] != oldSegmentSequence[read_index])
347 {
348 ++this_reference_mismatch;
349 }
350
351 if (read_index < insertion_len)
352 {
353 if (insertionSequence[read_index] == 'N' || insertionSequence[read_index] != newSegmentSequence[read_index])
354 {
355 ++insertion_mismatch;
356 break;
357 }
358 }
359 else
360 {
361 if (referenceSequence[read_index - insertion_len] == 'N' ||
362 referenceSequence[read_index - insertion_len] != newSegmentSequence[read_index])
363 {
364 --this_reference_mismatch;
365 }
366 }
367 }
368 }
369
370 string colorSegmentSequence_curr;
371 if (curr_left_to_insert > 0)
372 {
373 const seqan::Dna5String referenceSequence = seqan::infix(*ref_str, curr_hit->left(), lb->left + 1);
374 const seqan::Dna5String oldSegmentSequence = seqan::Dna5String(curr_hit->seq().substr(0, curr_left_to_insert));
375
376 if (color)
377 {
378 string color;
379 if (antisense)
380 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length), curr_left_to_insert);
381 else
382 color = read_seq.substr(curr_seg_index * segment_length + 2, curr_left_to_insert);
383
384 color.push_back(curr_hit->seq()[curr_left_to_insert]);
385 reverse(color.begin(), color.end());
386 string bp = convert_color_to_bp(color);
387 reverse(bp.begin(), bp.end());
388 colorSegmentSequence_curr = bp;
389 }
390
391 const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_curr : oldSegmentSequence;
392
393 /*
394 * Scan left in the read until
395 * We ran out of read sequence (insertion extends past segment)
396 */
397 for (int read_index = 0; read_index < curr_left_to_insert; ++read_index)
398 {
399 int segmentPosition = curr_left_to_insert - read_index - 1;
400 int insertionPosition = insertion_len - read_index - 1;
401
402 if (referenceSequence[segmentPosition] == 'N' || (referenceSequence[segmentPosition] != oldSegmentSequence[segmentPosition]))
403 {
404 ++this_reference_mismatch;
405 }
406
407 if (read_index < insertion_len)
408 {
409 if (insertionSequence[insertionPosition] == 'N' || (insertionSequence[insertionPosition] != newSegmentSequence[segmentPosition]))
410 {
411 ++insertion_mismatch;
412 break;
413 }
414 }
415 else
416 {
417 if (referenceSequence[segmentPosition + insertion_len] == 'N' ||
418 (referenceSequence[segmentPosition + insertion_len] != newSegmentSequence[segmentPosition]))
419 {
420 --this_reference_mismatch;
421 }
422 }
423 }
424 }
425
426 if (found_closure)
427 {
428 fprintf(stderr, "Warning: multiple closures found for insertion read # %d\n", (int)insert_id);
429 return BowtieHit();
430 }
431
432 if (insertion_mismatch == 0)
433 {
434 reference_mismatch = this_reference_mismatch;
435 mismatch = -reference_mismatch;
436 found_closure = true;
437 new_left = prev_hit->left();
438 new_cigar = prev_hit->cigar();
439
440 /*
441 * Need to make a new insert operation between the two match character that begin
442 * and end the intersection of these two junction. Note that we necessarily assume
443 * that this insertion can't span beyond the boundaries of these reads. That should
444 * probably be better enforced somewhere
445 */
446
447 new_cigar.back().length -= insert_to_prev_right;
448 if (new_cigar.back().length <= 0)
449 new_cigar.pop_back();
450
451 new_cigar.push_back(CigarOp(INS, lb->sequence.size()));
452 vector<CigarOp> new_right_cigar = curr_hit->cigar();
453 new_right_cigar.front().length += (insert_to_prev_right - lb->sequence.size());
454
455 /*
456 * Finish stitching together the new cigar string
457 */
458 size_t c = new_right_cigar.front().length > 0 ? 0 : 1;
459 for (; c < new_right_cigar.size(); ++c)
460 {
461 new_cigar.push_back(new_right_cigar[c]);
462 }
463
464 if (color)
465 {
466 if (insert_to_prev_right > 0)
467 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - insert_to_prev_right, insert_to_prev_right, colorSegmentSequence_prev);
468
469 if (curr_left_to_insert > 0)
470 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, curr_left_to_insert, colorSegmentSequence_curr);
471 }
472 }
473 }
474 ++lb;
475 }
476
477 if (!found_closure)
478 {
479 return BowtieHit();
480 }
481 }
482
483 /*
484 * Stitch segments together using juctions or deletions if necessary.
485 */
486 else if (prev_hit->right() < curr_hit->left())
487 {
488 std::set<Junction>::iterator lb, ub;
489
490 int left_boundary = prev_hit->right() - 4;
491 int right_boundary = curr_hit->left() + 4;
492
493 lb = possible_juncs.upper_bound(Junction(reference_id, left_boundary, right_boundary - 8, true));
494 ub = possible_juncs.lower_bound(Junction(reference_id, left_boundary + 8, right_boundary, false));
495
496 int new_diff_mismatches = 0xff;
497 while (lb != ub && lb != possible_juncs.end())
498 {
499 int dist_to_left = lb->left - prev_hit->right() + 1;
500 int dist_to_right = lb->right - curr_hit->left();
501
502 if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
503 {
504 /*
505 * Check we have enough matched bases on prev or curr segment.
506 */
507 if (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length )
508 {
509 ++lb;
510 continue;
511 }
512
513 Dna5String new_cmp_str, old_cmp_str;
514 int new_mismatch = 0, old_mismatch = 0;
515 string new_patch_str; // this is for colorspace reads
516 if (dist_to_left > 0)
517 {
518 new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb->left + 1);
519 old_cmp_str = seqan::infix(*ref_str, curr_hit->left(), lb->right);
520
521 string new_seq;
522 if (color)
523 {
524 string ref = DnaString_to_string(seqan::infix(*ref_str, prev_hit->right() - 1, lb->left + 1));
525
526 string color, qual;
527 if (antisense)
528 {
529 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
530 qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
531 }
532 else
533 {
534 color = read_seq.substr(1 + curr_seg_index * segment_length, dist_to_left);
535 qual = read_quals.substr(curr_seg_index * segment_length, dist_to_left);
536 }
537
538 BWA_decode(color, qual, ref, new_seq);
539 new_seq = new_seq.substr(1);
540 }
541
542 const string& curr_old_seq = curr_hit->seq();
543 const string& curr_seq = color ? new_seq : curr_hit->seq();
544 for (int i = 0; i < dist_to_left; ++i)
545 {
546 if (curr_seq[i] != new_cmp_str[i])
547 ++new_mismatch;
548
549 if (curr_old_seq[i] != old_cmp_str[i])
550 ++old_mismatch;
551 }
552
553 if (color)
554 new_patch_str = curr_seq.substr(0, dist_to_left);
555 }
556 else if (dist_to_left < 0)
557 {
558 new_cmp_str = seqan::infix(*ref_str, lb->right, curr_hit->left());
559 old_cmp_str = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
560
561 size_t abs_dist = -dist_to_left;
562 string new_seq;
563 if (color)
564 {
565 string ref = DnaString_to_string(seqan::infix(*ref_str, lb->left, lb->left + 1));
566 ref += DnaString_to_string(seqan::infix(*ref_str, lb->right, curr_hit->left()));
567
568 string color, qual;
569 if (antisense)
570 {
571 color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
572 qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
573 }
574 else
575 {
576 color = read_seq.substr(1 + curr_seg_index * segment_length - abs_dist, abs_dist);
577 qual = read_quals.substr(curr_seg_index * segment_length - abs_dist, abs_dist);
578 }
579
580 BWA_decode(color, qual, ref, new_seq);
581 new_seq = new_seq.substr(1);
582 }
583
584 const string& prev_old_seq = prev_hit->seq();
585 size_t prev_old_seq_len = prev_old_seq.length();
586 const string& prev_seq = color ? new_seq : prev_hit->seq();
587 size_t prev_seq_len = prev_seq.length();
588 for (size_t i = 0; i < abs_dist; ++i)
589 {
590 if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
591 ++new_mismatch;
592 if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
593 ++old_mismatch;
594 }
595
596 if (color)
597 new_patch_str = prev_seq.substr(prev_seq_len - abs_dist, abs_dist);
598 }
599
600 int temp_diff_mismatches = new_mismatch - old_mismatch;
601 if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
602 {
603 ++lb;
604 continue;
605 }
606
607 if (color)
608 {
609 /*
610 * We need to recover the origianl sequence.
611 */
612 if (found_closure)
613 {
614 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - 4, 8,
615 prev_hit->seq().substr(prev_hit->seq().length() - 4) + curr_hit->seq().substr(0, 4));
616 }
617
618 if (dist_to_left > 0)
619 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, dist_to_left, new_patch_str);
620 else if (dist_to_left < 0)
621 seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length + dist_to_left, -dist_to_left, new_patch_str);
622 }
623
624 new_diff_mismatches = temp_diff_mismatches;
625
626 new_left = prev_hit->left();
627 new_cigar = prev_hit->cigar();
628
629 int new_left_back_len = new_cigar.back().length;
630 new_left_back_len += dist_to_left;
631
632 vector<CigarOp> new_right_cig = curr_hit->cigar();
633 int new_right_front_len = new_right_cig.front().length;
634 new_right_front_len -= dist_to_right;
635 if (new_left_back_len > 0)
636 new_cigar.back().length = new_left_back_len;
637 else
638 new_cigar.pop_back();
639
640 /*
641 * FIXME, currently just differentiating between a deletion and a
642 * reference skip based on length. However, would probably be better
643 * to denote the difference explicitly, this would allow the user
644 * to supply their own (very large) deletions
645 */
646 if ((lb->right - lb->left - 1) <= max_deletion_length)
647 {
648 new_cigar.push_back(CigarOp(DEL, lb->right - lb->left - 1));
649 antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
650 }
651 else
652 {
653 new_cigar.push_back(CigarOp(REF_SKIP, lb->right - lb->left - 1));
654 antisense_closure = lb->antisense;
655 }
656
657 new_right_cig.front().length = new_right_front_len;
658 size_t c = new_right_front_len > 0 ? 0 : 1;
659 for (; c < new_right_cig.size(); ++c)
660 new_cigar.push_back(new_right_cig[c]);
661
662 mismatch = new_diff_mismatches;
663 found_closure = true;
664 }
665 ++lb;
666 }
667
668 if (!found_closure)
669 {
670 return BowtieHit();
671 }
672 }
673
674 if (found_closure)
675 {
676 bool end = false;
677 BowtieHit merged_hit(reference_id,
678 insert_id,
679 new_left,
680 new_cigar,
681 antisense,
682 antisense_closure,
683 prev_hit->edit_dist() + curr_hit->edit_dist() + mismatch,
684 prev_hit->splice_mms() + curr_hit->splice_mms(),
685 end);
686
687 if (curr_seg_index > 1)
688 merged_hit.seq(seq.substr(first_seg_length + (curr_seg_index - 1) * segment_length, 2 * segment_length));
689 else
690 merged_hit.seq(seq.substr(0, first_seg_length + segment_length));
691
692 prev_hit = hit_chain.erase(prev_hit, ++curr_hit);
693 /*
694 * prev_hit now points PAST the last element removed
695 */
696 prev_hit = hit_chain.insert(prev_hit, merged_hit);
697 /*
698 * merged_hit has been inserted before the old position of
699 * prev_hit. New location of prev_hit is merged_hit
700 */
701 curr_hit = prev_hit;
702 ++curr_hit;
703 ++curr_seg_index;
704 continue;
705 }
706
707 ++prev_hit;
708 ++curr_hit;
709 ++curr_seg_index;
710 }
711
712 bool saw_antisense_splice = false;
713 bool saw_sense_splice = false;
714 vector<CigarOp> long_cigar;
715 int num_mismatches = 0;
716 int num_splice_mms = 0;
717 for (list<BowtieHit>::iterator s = hit_chain.begin(); s != hit_chain.end(); ++s)
718 {
719 num_mismatches += s->edit_dist();
720 num_splice_mms += s->splice_mms();
721
722 /*
723 * Check whether the sequence contains any reference skips. Previously,
724 * this was just a check to see whether the sequence was contiguous; however
725 * we don't want to count an indel event as a splice
726 */
727 bool containsSplice = s->is_spliced();
728 if (containsSplice)
729 {
730 if (s->antisense_splice())
731 {
732 if (saw_sense_splice)
733 return BowtieHit();
734 saw_antisense_splice = true;
735 }
736 else
737 {
738 if (saw_antisense_splice)
739 return BowtieHit();
740 saw_sense_splice = true;
741 }
742 }
743 const vector<CigarOp>& cigar = s->cigar();
744 if (long_cigar.empty())
745 {
746 long_cigar = cigar;
747 }
748 else
749 {
750 CigarOp& last = long_cigar.back();
751 /*
752 * If necessary, merge the back and front
753 * cigar operations
754 */
755 if(last.opcode == cigar[0].opcode){
756 last.length += cigar[0].length;
757 for (size_t b = 1; b < cigar.size(); ++b)
758 {
759 long_cigar.push_back(cigar[b]);
760 }
761 }else{
762 for(size_t b = 0; b < cigar.size(); ++b)
763 {
764 long_cigar.push_back(cigar[b]);
765 }
766 }
767 }
768 }
769
770 bool end = false;
771 BowtieHit new_hit(reference_id,
772 insert_id,
773 left,
774 long_cigar,
775 antisense,
776 saw_antisense_splice,
777 num_mismatches,
778 num_splice_mms,
779 end);
780
781 new_hit.seq(seq);
782 new_hit.qual(qual);
783
784 int new_read_len = new_hit.read_len();
785 if (new_read_len != old_read_length || !new_hit.check_editdist_consistency(rt))
786 {
787 fprintf(stderr, "Warning: malformed closure\n");
788 return BowtieHit();
789 }
790
791 return new_hit;
792 }
793
794 int multi_closure = 0;
795 int anchor_too_short = 0;
796 int gap_too_short = 0;
797
798 bool valid_hit(const BowtieHit& bh)
799 {
800 if (bh.insert_id())
801 {
802 /*
803 * validate the cigar chain - no gaps shorter than an intron, etc.
804 * also,
805 * -Don't start or end with an indel or refskip
806 * -Only a match operation is allowed is allowed
807 * adjacent to an indel or refskip
808 * -Indels should confrom to length restrictions
809 */
810 const CigarOp* prevCig = &(bh.cigar()[0]);
811 const CigarOp* currCig = &(bh.cigar()[1]);
812 for (size_t i = 1; i < bh.cigar().size(); ++i){
813 currCig = &(bh.cigar()[i]);
814 if(currCig->opcode != MATCH && prevCig->opcode != MATCH){
815 return false;
816 }
817 if(currCig->opcode == INS){
818 if(currCig->length > max_insertion_length){
819 return false;
820 }
821 }
822 if(currCig->opcode == DEL){
823 if(currCig->length > max_deletion_length){
824 return false;
825 }
826 }
827 if(currCig->opcode == REF_SKIP){
828 if(currCig->length < (uint64_t)min_report_intron_length){
829 gap_too_short++;
830 return false;
831 }
832 }
833 prevCig = currCig;
834 }
835 if (bh.cigar().front().opcode != MATCH ||
836 bh.cigar().back().opcode != MATCH /* ||
837 (int)bh.cigar().front().length < min_anchor_len||
838 (int)bh.cigar().back().length < min_anchor_len*/ )
839 {
840 anchor_too_short++;
841 return false;
842 }
843 }
844 else
845 {
846 multi_closure++;
847 return false;
848 }
849
850 return true;
851 }
852
853 void merge_segment_chain(RefSequenceTable& rt,
854 const string& read_seq,
855 const string& read_quals,
856 std::set<Junction>& possible_juncs,
857 std::set<Insertion>& possible_insertions,
858 vector<BowtieHit>& hits,
859 vector<BowtieHit>& merged_hits)
860 {
861 if (hits.size() == 0)
862 return;
863
864 BowtieHit bh;
865 if (hits.size() > 1)
866 {
867 list<BowtieHit> hit_chain;
868
869 if (hits.front().antisense_align())
870 copy(hits.rbegin(), hits.rend(), back_inserter(hit_chain));
871 else
872 copy(hits.begin(), hits.end(), back_inserter(hit_chain));
873
874 bh = merge_chain(rt, read_seq, read_quals, possible_juncs, possible_insertions, hit_chain);
875 }
876 else
877 {
878 bh = hits[0];
879 }
880
881 if (valid_hit(bh))
882 merged_hits.push_back(bh);
883 }
884
885 bool dfs_seg_hits(RefSequenceTable& rt,
886 const string& read_seq,
887 const string& read_quals,
888 std::set<Junction>& possible_juncs,
889 std::set<Insertion>& possible_insertions,
890 vector<HitsForRead>& seg_hits_for_read,
891 size_t curr,
892 vector<BowtieHit>& seg_hit_stack,
893 vector<BowtieHit>& joined_hits)
894 {
895 assert (!seg_hit_stack.empty());
896 bool join_success = false;
897
898 if (curr < seg_hits_for_read.size())
899 {
900 for (size_t i = 0; i < seg_hits_for_read[curr].hits.size(); ++i)
901 {
902 BowtieHit& bh = seg_hits_for_read[curr].hits[i];
903 BowtieHit& back = seg_hit_stack.back();
904
905 bool consistent_sense = bh.antisense_align() == back.antisense_align();
906 bool same_contig = bh.ref_id() == back.ref_id();
907
908 // FIXME: when we have stranded reads, we need to fix this condition
909 //bool consistent_strand = (bh.contiguous() || back.contiguous() ||
910 // (bh.antisense_splice() == back.antisense_splice()));
911 if (consistent_sense && same_contig /*&& consistent_strand*/)
912 {
913 if (bh.antisense_align())
914 {
915 unsigned int bh_r = bh.right();
916 unsigned int back_left = seg_hit_stack.back().left();
917 if ((bh_r + max_report_intron_length >= back_left &&
918 back_left >= bh_r) || (back_left + max_insertion_length >= bh_r && back_left < bh_r))
919 {
920 // these hits are compatible, so push bh onto the
921 // stack, recurse, and pop it when done.
922 seg_hit_stack.push_back(bh);
923 bool success = dfs_seg_hits(rt,
924 read_seq,
925 read_quals,
926 possible_juncs,
927 possible_insertions,
928 seg_hits_for_read,
929 curr + 1,
930 seg_hit_stack,
931 joined_hits);
932 if (success)
933 join_success = true;
934
935 seg_hit_stack.pop_back();
936 }
937 }
938 else
939 {
940 unsigned int bh_l = bh.left();
941
942 unsigned int back_right = seg_hit_stack.back().right();
943 if ((back_right + max_report_intron_length >= bh_l &&
944 bh_l >= back_right) || (bh_l + max_insertion_length >= back_right && bh_l < back_right))
945 {
946 // these hits are compatible, so push bh onto the
947 // stack, recurse, and pop it when done.
948 seg_hit_stack.push_back(bh);
949 bool success = dfs_seg_hits(rt,
950 read_seq,
951 read_quals,
952 possible_juncs,
953 possible_insertions,
954 seg_hits_for_read,
955 curr + 1,
956 seg_hit_stack,
957 joined_hits);
958
959 if (success)
960 join_success = true;
961
962 seg_hit_stack.pop_back();
963 }
964 }
965 }
966 }
967 }
968 else
969 {
970 merge_segment_chain(rt,
971 read_seq,
972 read_quals,
973 possible_juncs,
974 possible_insertions,
975 seg_hit_stack,
976 joined_hits);
977 return join_success = true;
978 }
979 return join_success;
980 }
981
982 bool join_segments_for_read(RefSequenceTable& rt,
983 const string& read_seq,
984 const string& read_quals,
985 std::set<Junction>& possible_juncs,
986 std::set<Insertion>& possible_insertions,
987 vector<HitsForRead>& seg_hits_for_read,
988 vector<BowtieHit>& joined_hits)
989 {
990 vector<BowtieHit> seg_hit_stack;
991 bool join_success = false;
992
993 for (size_t i = 0; i < seg_hits_for_read[0].hits.size(); ++i)
994 {
995 BowtieHit& bh = seg_hits_for_read[0].hits[i];
996 seg_hit_stack.push_back(bh);
997 bool success = dfs_seg_hits(rt,
998 read_seq,
999 read_quals,
1000 possible_juncs,
1001 possible_insertions,
1002 seg_hits_for_read,
1003 1,
1004 seg_hit_stack,
1005 joined_hits);
1006 if (success)
1007 join_success = true;
1008 seg_hit_stack.pop_back();
1009 }
1010
1011 return join_success;
1012 }
1013
1014 void join_segment_hits(GBamWriter& bam_writer, std::set<Junction>& possible_juncs,
1015 std::set<Insertion>& possible_insertions,
1016 ReadTable& unmapped_reads,
1017 RefSequenceTable& rt,
1018 //FILE* reads_file,
1019 ReadStream& readstream,
1020 vector<HitStream>& contig_hits,
1021 vector<HitStream>& spliced_hits)
1022 {
1023 uint32_t curr_contig_obs_order = VMAXINT32;
1024 HitStream* first_seg_contig_stream = NULL;
1025 uint64_t next_contig_id = 0;
1026
1027 if (contig_hits.size())
1028 {
1029 first_seg_contig_stream = &(contig_hits.front());
1030 next_contig_id = first_seg_contig_stream->next_group_id();
1031 curr_contig_obs_order = unmapped_reads.observation_order(next_contig_id);
1032 }
1033
1034 HitsForRead curr_hit_group;
1035
1036 uint32_t curr_spliced_obs_order = VMAXINT32;
1037 HitStream* first_seg_spliced_stream = NULL;
1038 uint64_t next_spliced_id = 0;
1039
1040 if (spliced_hits.size())
1041 {
1042 first_seg_spliced_stream = &(spliced_hits.front());
1043 next_spliced_id = first_seg_spliced_stream->next_group_id();
1044 curr_spliced_obs_order = unmapped_reads.observation_order(next_spliced_id);
1045 }
1046
1047 while(curr_contig_obs_order != VMAXINT32 ||
1048 curr_spliced_obs_order != VMAXINT32)
1049 {
1050 uint32_t read_in_process;
1051 vector<HitsForRead> seg_hits_for_read;
1052 seg_hits_for_read.resize(contig_hits.size());
1053
1054 if (curr_contig_obs_order < curr_spliced_obs_order)
1055 {
1056 first_seg_contig_stream->next_read_hits(curr_hit_group);
1057 seg_hits_for_read.front() = curr_hit_group;
1058
1059 next_contig_id = first_seg_contig_stream->next_group_id();
1060 uint32_t next_order = unmapped_reads.observation_order(next_contig_id);
1061
1062 read_in_process = curr_contig_obs_order;
1063 curr_contig_obs_order = next_order;
1064 }
1065 else if (curr_spliced_obs_order < curr_contig_obs_order)
1066 {
1067 first_seg_spliced_stream->next_read_hits(curr_hit_group);
1068 seg_hits_for_read.front() = curr_hit_group;
1069
1070 next_spliced_id = first_seg_spliced_stream->next_group_id();
1071 uint32_t next_order = unmapped_reads.observation_order(next_spliced_id);
1072
1073 read_in_process = curr_spliced_obs_order;
1074 curr_spliced_obs_order = next_order;
1075 }
1076 else if (curr_contig_obs_order == curr_spliced_obs_order &&
1077 curr_contig_obs_order != VMAXINT32 &&
1078 curr_spliced_obs_order != VMAXINT32)
1079 {
1080 first_seg_contig_stream->next_read_hits(curr_hit_group);
1081
1082 HitsForRead curr_spliced_group;
1083 first_seg_spliced_stream->next_read_hits(curr_spliced_group);
1084
1085 curr_hit_group.hits.insert(curr_hit_group.hits.end(),
1086 curr_spliced_group.hits.begin(),
1087 curr_spliced_group.hits.end());
1088 seg_hits_for_read.front() = curr_hit_group;
1089 read_in_process = curr_spliced_obs_order;
1090
1091 next_contig_id = first_seg_contig_stream->next_group_id();
1092 uint32_t next_order = unmapped_reads.observation_order(next_contig_id);
1093
1094 next_spliced_id = first_seg_spliced_stream->next_group_id();
1095 uint32_t next_spliced_order = unmapped_reads.observation_order(next_spliced_id);
1096
1097 curr_spliced_obs_order = next_spliced_order;
1098 curr_contig_obs_order = next_order;
1099 }
1100 else
1101 {
1102 break;
1103 }
1104
1105 if (contig_hits.size() > 1)
1106 {
1107 look_right_for_hit_group(unmapped_reads,
1108 contig_hits,
1109 0,
1110 spliced_hits,
1111 curr_hit_group,
1112 seg_hits_for_read);
1113 }
1114
1115 size_t last_non_empty = seg_hits_for_read.size() - 1;
1116 while(last_non_empty >= 0 && seg_hits_for_read[last_non_empty].hits.empty())
1117 {
1118 --last_non_empty;
1119 }
1120
1121 seg_hits_for_read.resize(last_non_empty + 1);
1122 if (!seg_hits_for_read[last_non_empty].hits[0].end())
1123 continue;
1124
1125 if (!seg_hits_for_read.empty() && !seg_hits_for_read[0].hits.empty())
1126 {
1127 uint64_t insert_id = seg_hits_for_read[0].hits[0].insert_id();
1128 Read read;
1129 /* if (get_read_from_stream(insert_id,
1130 reads_file,
1131 reads_format,
1132 false,
1133 read)) */
1134 if (readstream.getRead(insert_id, read))
1135 {
1136 vector<BowtieHit> joined_hits;
1137 join_segments_for_read(rt,
1138 read.seq.c_str(),
1139 read.qual.c_str(),
1140 possible_juncs,
1141 possible_insertions,
1142 seg_hits_for_read,
1143 joined_hits);
1144
1145 sort(joined_hits.begin(), joined_hits.end());
1146 vector<BowtieHit>::iterator new_end = unique(joined_hits.begin(), joined_hits.end());
1147 joined_hits.erase(new_end, joined_hits.end());
1148
1149 for (size_t i = 0; i < joined_hits.size(); i++)
1150 {
1151 const char* ref_name = rt.get_name(joined_hits[i].ref_id());
1152 if (color && !color_out)
1153 //print_hit(stdout, read_name, joined_hits[i], ref_name, joined_hits[i].seq().c_str(), joined_hits[i].qual().c_str(), true);
1154 print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name, joined_hits[i].seq().c_str(),
1155 joined_hits[i].qual().c_str(), true);
1156 else
1157 print_bamhit(bam_writer, read.name.c_str(), joined_hits[i], ref_name,
1158 read.seq.c_str(), read.qual.c_str(), false);
1159 //print_hit(stdout, read_name, joined_hits[i], ref_name, read_seq, read_quals, false);
1160 }
1161 }
1162 else
1163 {
1164 err_die("Error: could not get read # %d from stream\n",
1165 read_in_process);
1166 }
1167 }
1168 else
1169 {
1170 //fprintf(stderr, "Warning: couldn't join segments for read # %d\n", read_in_process);
1171 }
1172 }
1173 }
1174
1175 void driver(GBamWriter& bam_writer, istream& ref_stream,
1176 vector<FILE*>& possible_juncs_files,
1177 vector<FILE*>& possible_insertions_files,
1178 vector<FILE*>& possible_deletions_files,
1179 vector<FZPipe>& spliced_segmap_files, //.sam.z files
1180 vector<string>& segmap_fnames, //.bam files
1181 ReadStream& readstream)
1182 {
1183 if (segmap_fnames.size() == 0)
1184 {
1185 fprintf(stderr, "No hits to process, exiting\n");
1186 exit(0);
1187 }
1188
1189 RefSequenceTable rt(true, true);
1190 fprintf (stderr, "Loading reference sequences...\n");
1191 get_seqs(ref_stream, rt, true, false);
1192 fprintf (stderr, " reference sequences loaded.\n");
1193 ReadTable it;
1194
1195 bool need_seq = true;
1196 bool need_qual = color;
1197 //rewind(reads_file);
1198 readstream.rewind();
1199
1200 //vector<HitTable> seg_hits;
1201 vector<HitStream> contig_hits;
1202 vector<HitStream> spliced_hits;
1203
1204 vector<HitFactory*> factories;
1205 for (size_t i = 0; i < segmap_fnames.size(); ++i)
1206 {
1207 HitFactory* fac = new BAMHitFactory(it, rt);
1208 factories.push_back(fac);
1209 HitStream hs(segmap_fnames[i], fac, false, false, false, need_seq, need_qual);
1210 contig_hits.push_back(hs);
1211 }
1212
1213 fprintf(stderr, "Loading spliced hits...");
1214
1215 //vector<vector<pair<uint32_t, HitsForRead> > > spliced_hits;
1216 for (size_t i = 0; i < spliced_segmap_files.size(); ++i)
1217 {
1218 int anchor_length = 0;
1219
1220 // we allow read alignments even 1bp overlapping with juctions.
1221 // if (i == 0 || i == spliced_seg_files.size() - 1)
1222 // anchor_length = min_anchor_len;
1223
1224 HitFactory* fac = new SplicedSAMHitFactory(it,
1225 rt,
1226 anchor_length);
1227 factories.push_back(fac);
1228
1229 HitStream hs(spliced_segmap_files[i].file, fac, true, false, false, need_seq, need_qual);
1230 spliced_hits.push_back(hs);
1231
1232 }
1233 fprintf(stderr, "done\n");
1234
1235 fprintf(stderr, "Loading junctions...");
1236 std::set<Junction> possible_juncs;
1237
1238 for (size_t i = 0; i < possible_juncs_files.size(); ++i)
1239 {
1240 char buf[2048];
1241 while(!feof(possible_juncs_files[i]) &&
1242 fgets(buf, sizeof(buf), possible_juncs_files[i]))
1243 {
1244 char junc_ref_name[256];
1245 int left;
1246 int right;
1247 char orientation;
1248 int ret = sscanf(buf, "%s %d %d %c", junc_ref_name, &left, &right, &orientation);
1249 if (ret != 4)
1250 continue;
1251 uint32_t ref_id = rt.get_id(junc_ref_name, NULL, 0);
1252 possible_juncs.insert(Junction(ref_id, left, right, orientation == '-'));
1253 }
1254 }
1255 fprintf(stderr, "done\n");
1256
1257
1258 fprintf(stderr, "Loading deletions...");
1259
1260 for (size_t i = 0; i < possible_deletions_files.size(); ++i)
1261 {
1262 char splice_buf[2048];
1263 FILE* deletions_file = possible_deletions_files[i];
1264 if(!deletions_file){
1265 continue;
1266 }
1267 while(fgets(splice_buf, 2048, deletions_file)){
1268 char* nl = strrchr(splice_buf, '\n');
1269 char* buf = splice_buf;
1270 if (nl) *nl = 0;
1271
1272 char* ref_name = get_token((char**)&buf, "\t");
1273 char* scan_left_coord = get_token((char**)&buf, "\t");
1274 char* scan_right_coord = get_token((char**)&buf, "\t");
1275
1276 if (!scan_left_coord || !scan_right_coord)
1277 {
1278 err_die("Error: malformed deletion coordinate record\n");
1279 }
1280
1281 uint32_t ref_id = rt.get_id(ref_name,NULL,0);
1282 uint32_t left_coord = atoi(scan_left_coord);
1283 uint32_t right_coord = atoi(scan_right_coord);
1284 possible_juncs.insert((Junction)Deletion(ref_id, left_coord - 1,right_coord, false));
1285 }
1286 }
1287 fprintf(stderr, "done\n");
1288
1289
1290 /*
1291 * Read the insertions from the list of insertion
1292 * files into a set
1293 */
1294 std::set<Insertion> possible_insertions;
1295 for (size_t i=0; i < possible_insertions_files.size(); ++i)
1296 {
1297 char splice_buf[2048];
1298 FILE* insertions_file = possible_insertions_files[i];
1299 if(!insertions_file){
1300 continue;
1301 }
1302 while(fgets(splice_buf, 2048, insertions_file)){
1303 char* nl = strrchr(splice_buf, '\n');
1304 char* buf = splice_buf;
1305 if (nl) *nl = 0;
1306
1307 char* ref_name = get_token((char**)&buf, "\t");
1308 char* scan_left_coord = get_token((char**)&buf, "\t");
1309 char* scan_right_coord = get_token((char**)&buf, "\t");
1310 char* scan_sequence = get_token((char**)&buf, "\t");
1311
1312 if (!scan_left_coord || !scan_sequence || !scan_right_coord)
1313 {
1314 err_die("Error: malformed insertion coordinate record\n");
1315 }
1316
1317 uint32_t ref_id = rt.get_id(ref_name,NULL,0);
1318 uint32_t left_coord = atoi(scan_left_coord);
1319 std::string sequence(scan_sequence);
1320 possible_insertions.insert(Insertion(ref_id, left_coord, sequence));
1321 }
1322 }
1323
1324 join_segment_hits(bam_writer, possible_juncs, possible_insertions, it, rt, readstream, contig_hits, spliced_hits);
1325 //join_segment_hits(possible_juncs, possible_insertions, it, rt, reads_file.file, contig_hits, spliced_hits);
1326
1327 /*for (size_t i = 0; i < segmap_fnames.size(); ++i)
1328 segmap_fnames[i].close();
1329 */
1330 for (size_t i = 0; i < spliced_segmap_files.size(); ++i)
1331 spliced_segmap_files[i].close();
1332 readstream.close();
1333
1334 for (size_t fac = 0; fac < factories.size(); ++fac)
1335 {
1336 delete factories[fac];
1337 }
1338 } //driver
1339
1340 int main(int argc, char** argv)
1341 {
1342 fprintf(stderr, "long_spanning_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
1343 fprintf(stderr, "--------------------------------------------\n");
1344
1345 int parse_ret = parse_options(argc, argv, print_usage);
1346 if (parse_ret)
1347 return parse_ret;
1348
1349 if(optind >= argc)
1350 {
1351 print_usage();
1352 return 1;
1353 }
1354
1355 string ref_file_name = argv[optind++];
1356
1357 if(optind >= argc)
1358 {
1359 print_usage();
1360 return 1;
1361 }
1362
1363
1364 string reads_file_name = argv[optind++];
1365
1366 if(optind >= argc)
1367 {
1368 print_usage();
1369 return 1;
1370 }
1371
1372 string juncs_file_list = argv[optind++];
1373
1374 if(optind >= argc)
1375 {
1376 print_usage();
1377 return 1;
1378 }
1379
1380 string insertions_file_list = argv[optind++];
1381 if(optind >= argc)
1382 {
1383 print_usage();
1384 return 1;
1385 }
1386
1387 string deletions_file_list = argv[optind++];
1388
1389 if(optind >= argc)
1390 {
1391 print_usage();
1392 return 1;
1393 }
1394
1395
1396 string segmap_file_list = argv[optind++];
1397 string spliced_segmap_flist;
1398 if(optind < argc)
1399 {
1400 spliced_segmap_flist = argv[optind++];
1401 }
1402
1403 ifstream ref_stream(ref_file_name.c_str(), ifstream::in);
1404 if (!ref_stream.good())
1405 err_die("Error: cannot open %s for reading\n",ref_file_name.c_str());
1406
1407 checkSamHeader();
1408
1409 //FILE* reads_file = fopen(reads_file_name.c_str(), "r");
1410 vector<string> segmap_file_names;
1411 tokenize(segmap_file_list, ",",segmap_file_names);
1412 ReadStream readstream(reads_file_name);
1413 if (readstream.file()==NULL)
1414 err_die("Error: cannot open %s for reading\n",
1415 reads_file_name.c_str());
1416
1417 vector<string> juncs_file_names;
1418 vector<FILE*> juncs_files;
1419 tokenize(juncs_file_list, ",",juncs_file_names);
1420 for (size_t i = 0; i < juncs_file_names.size(); ++i) {
1421 //fprintf(stderr, "Opening %s for reading\n",
1422 // juncs_file_names[i].c_str());
1423 FILE* juncs_file = fopen(juncs_file_names[i].c_str(), "r");
1424 if (juncs_file == NULL) {
1425 fprintf(stderr, "Warning: cannot open %s for reading\n",
1426 juncs_file_names[i].c_str());
1427 continue;
1428 }
1429 juncs_files.push_back(juncs_file);
1430 }
1431
1432 /*
1433 * Read in the deletion file names
1434 */
1435 vector<string> deletions_file_names;
1436 vector<FILE*> deletions_files;
1437 tokenize(deletions_file_list, ",",deletions_file_names);
1438 for (size_t i = 0; i < deletions_file_names.size(); ++i)
1439 {
1440 //fprintf(stderr, "Opening %s for reading\n",
1441 // deletions_file_names[i].c_str());
1442 FILE* deletions_file = fopen(deletions_file_names[i].c_str(), "r");
1443 if (deletions_file == NULL) {
1444 fprintf(stderr, "Warning: cannot open %s for reading\n",
1445 deletions_file_names[i].c_str());
1446 continue;
1447 }
1448 deletions_files.push_back(deletions_file);
1449 }
1450
1451 /*
1452 * Read in the list of filenames that contain
1453 * insertion coordinates
1454 */
1455
1456 vector<string> insertions_file_names;
1457 vector<FILE*> insertions_files;
1458 tokenize(insertions_file_list, ",",insertions_file_names);
1459 for (size_t i = 0; i < insertions_file_names.size(); ++i)
1460 {
1461 //fprintf(stderr, "Opening %s for reading\n",
1462 // insertions_file_names[i].c_str());
1463 FILE* insertions_file = fopen(insertions_file_names[i].c_str(), "r");
1464 if (insertions_file == NULL)
1465 {
1466 fprintf(stderr, "Warning: cannot open %s for reading\n",
1467 insertions_file_names[i].c_str());
1468 continue;
1469 }
1470 insertions_files.push_back(insertions_file);
1471 }
1472
1473 vector<string> spliced_segmap_file_names;
1474 vector<FZPipe> spliced_segmap_files;
1475 //these are headerless .sam.z files
1476 string unzcmd;
1477 tokenize(spliced_segmap_flist, ",",spliced_segmap_file_names);
1478 for (size_t i = 0; i < spliced_segmap_file_names.size(); ++i)
1479 {
1480 fprintf(stderr, "Opening %s for reading\n",
1481 spliced_segmap_file_names[i].c_str());
1482 if (unzcmd.empty())
1483 unzcmd=getUnpackCmd(spliced_segmap_file_names[i],false);
1484 FZPipe spliced_seg_file(spliced_segmap_file_names[i], unzcmd);
1485 if (spliced_seg_file.file == NULL)
1486 {
1487 err_die("Error: cannot open %s for reading\n",
1488 spliced_segmap_file_names[i].c_str());
1489 }
1490 spliced_segmap_files.push_back(spliced_seg_file);
1491 }
1492
1493 if (spliced_segmap_files.size())
1494 {
1495 if (spliced_segmap_files.size() != segmap_file_names.size())
1496 {
1497 err_die("Error: each segment file must have a corresponding spliced segment file\n");
1498 }
1499 }
1500 GBamWriter bam_writer("-", sam_header.c_str());
1501 driver(bam_writer, ref_stream, juncs_files, insertions_files, deletions_files, spliced_segmap_files, segmap_file_names, readstream);
1502 //driver(ref_stream, juncs_files, insertions_files, deletions_files, spliced_segment_files, segment_files, reads_file);
1503 return 0;
1504 }