ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/long_spanning_reads.cpp
Revision: 69
Committed: Mon Sep 19 21:00:41 2011 UTC (9 years, 1 month ago) by gpertea
File size: 47622 byte(s)
Log Message:
wip - implementing SplicedSAMHitFactory::get_hit_from_buf()

Line User Rev File contents
1 gpertea 29 #ifdef HAVE_CONFIG_H
2     #include <config.h>
3     #endif
4    
5     /*
6     * long_spanning_reads.cpp
7     * TopHat
8     *
9     * Created by Cole Trapnell on 2/5/09.
10     * Copyright 2009 Cole Trapnell. All rights reserved.
11     *
12     */
13    
14     #include <cassert>
15     #include <cstdio>
16     #include <vector>
17     #include <string>
18     #include <map>
19     #include <algorithm>
20     #include <set>
21     #include <cstdlib>
22     #include <cstring>
23     #include <bitset>
24     //#include <stdexcept>
25     #include <iostream>
26     #include <fstream>
27     #include <sstream>
28     #include <seqan/sequence.h>
29     #include <seqan/find.h>
30     #include <seqan/file.h>
31     #include <seqan/modifier.h>
32     #include <getopt.h>
33    
34     #include "common.h"
35     #include "bwt_map.h"
36     #include "tokenize.h"
37     #include "segments.h"
38     #include "reads.h"
39    
40     #include "junctions.h"
41     #include "insertions.h"
42     #include "deletions.h"
43    
44     using namespace seqan;
45     using namespace std;
46    
47     void print_usage()
48     {
49     fprintf(stderr, "Usage: long_spanning_reads <reads.fa/.fq> <possible_juncs1,...,possible_juncsN> <possible_insertions1,...,possible_insertionsN> <possible_deletions1,...,possible_deletionsN> <seg1.bwtout,...,segN.bwtout> [spliced_seg1.bwtout,...,spliced_segN.bwtout]\n");
50     }
51    
52     bool key_lt(const pair<uint32_t, HitsForRead>& lhs,
53     const pair<uint32_t, HitsForRead>& rhs)
54     {
55     return lhs.first < rhs.first;
56     }
57    
58     void get_seqs(istream& ref_stream,
59     RefSequenceTable& rt,
60     bool keep_seqs = true,
61     bool strip_slash = false)
62     {
63     while(ref_stream.good() &&
64     !ref_stream.eof())
65     {
66     RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
67     string name;
68     readMeta(ref_stream, name, Fasta());
69     string::size_type space_pos = name.find_first_of(" \t\r");
70     if (space_pos != string::npos)
71     {
72     name.resize(space_pos);
73     }
74     seqan::read(ref_stream, *ref_str, Fasta());
75    
76     rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
77     }
78     }
79    
80    
81     void look_right_for_hit_group(ReadTable& unmapped_reads,
82     vector<HitStream>& contig_hits,
83     size_t curr_file,
84     vector<HitStream>& spliced_hits,
85     const HitsForRead& targets,
86     vector<HitsForRead>& seg_hits_for_read)
87     {
88     int right_file = curr_file + 1;
89    
90     HitStream& right = contig_hits[right_file];
91     uint64_t curr_next_group_id = targets.insert_id;
92     int curr_order = unmapped_reads.observation_order(curr_next_group_id);
93    
94     assert (curr_order != -1);
95     while(true)
96     {
97     HitsForRead hit_group;
98     uint64_t right_next_group_id = right.next_group_id();
99     int right_order = unmapped_reads.observation_order(right_next_group_id);
100    
101     // If we would have seen the hits by now, bail out.
102     if (curr_order < right_order || right_order == -1)
103     {
104     break;
105     }
106     if (right.next_read_hits(hit_group))
107     {
108     if (hit_group.insert_id == targets.insert_id)
109     {
110     // Some of the targets may be missing, we need to
111     // process them individually
112     seg_hits_for_read[right_file] = hit_group;
113     break;
114     }
115     }
116     }
117    
118     HitsForRead& curr_seg_hits = seg_hits_for_read[right_file];
119    
120     if (right_file < (int)spliced_hits.size() && right_file >= 0)
121     {
122     // Scan forward in the spliced hits file for this hit group
123     HitsForRead spliced_group;
124     HitsForRead curr_spliced_group;
125     while (spliced_hits[right_file].next_group_id() > 0 &&
126     spliced_hits[right_file].next_group_id() <= (uint32_t)curr_order)
127     {
128     spliced_hits[right_file].next_read_hits(curr_spliced_group);
129    
130     if (curr_spliced_group.insert_id == (uint32_t)curr_order)
131     {
132     spliced_group = curr_spliced_group;
133     break;
134     }
135     }
136    
137     if (!spliced_group.hits.empty())
138     {
139     curr_seg_hits.insert_id = spliced_group.insert_id;
140     curr_seg_hits.hits.insert(curr_seg_hits.hits.end(),
141     spliced_group.hits.begin(),
142     spliced_group.hits.end());
143     }
144     }
145    
146     if (curr_seg_hits.hits.empty())
147     return;
148     else if (right_file + 1 < (int)contig_hits.size())
149     {
150     look_right_for_hit_group(unmapped_reads,
151     contig_hits,
152     curr_file + 1,
153     spliced_hits,
154     curr_seg_hits,
155     seg_hits_for_read);
156     }
157     }
158    
159     BowtieHit merge_chain(RefSequenceTable& rt,
160     const string& read_seq,
161     const string& read_quals,
162     std::set<Junction>& possible_juncs,
163     std::set<Insertion>& possible_insertions,
164     list<BowtieHit>& hit_chain)
165     {
166     bool antisense = hit_chain.front().antisense_align();
167     uint32_t reference_id = hit_chain.front().ref_id();
168     uint64_t insert_id = hit_chain.front().insert_id();
169    
170     int left = hit_chain.front().left();
171    
172     list<BowtieHit>::iterator prev_hit = hit_chain.begin();
173     list<BowtieHit>::iterator curr_hit = ++(hit_chain.begin());
174    
175     string seq;
176     string qual;
177     int old_read_length = 0;
178     int first_seg_length = hit_chain.front().seq().length();
179     for (list<BowtieHit>::iterator i = hit_chain.begin(); i != hit_chain.end(); ++i)
180     {
181     seq += i->seq();
182     qual += i->qual();
183     old_read_length += i->read_len();
184     }
185    
186     string rev_read_seq, rev_read_quals;
187     if (color && antisense)
188     {
189     rev_read_seq = read_seq;
190     reverse(rev_read_seq.begin() + 1, rev_read_seq.end());
191    
192     rev_read_quals = read_quals;
193     reverse(rev_read_quals.begin(), rev_read_quals.end());
194     }
195    
196     while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
197     {
198     /*
199     * Note that the gap size may be negative, since left() and right() return
200     * signed integers, this will be OK.
201     */
202     int gap = curr_hit->left() - prev_hit->right();
203     if (gap < -(int)max_insertion_length ||
204     (gap > (int)max_deletion_length &&
205     (gap < min_report_intron_length || gap > max_report_intron_length)))
206     {
207     return BowtieHit();
208     }
209    
210     ++prev_hit;
211     ++curr_hit;
212     }
213    
214     prev_hit = hit_chain.begin();
215     curr_hit = ++(hit_chain.begin());
216    
217     RefSequenceTable::Sequence* ref_str = rt.get_seq(prev_hit->ref_id());
218     if (!ref_str)
219     return BowtieHit();
220    
221     int curr_seg_index = 1;
222     while (curr_hit != hit_chain.end() && prev_hit != hit_chain.end())
223     {
224     /*
225     * This code is assuming that the cigar strings end and start with a match
226     * While segment alignments can actually end with a junction, insertion or deletion, the hope
227     * is that in those cases, the right and left ends of the alignments will correctly
228     * line up, so we won't get to this bit of code
229     */
230     if (prev_hit->cigar().back().opcode != MATCH || curr_hit->cigar().front().opcode != MATCH)
231     {
232     return BowtieHit();
233     }
234    
235     if (prev_hit->is_spliced() && curr_hit->is_spliced() && prev_hit->antisense_splice() != curr_hit->antisense_splice())
236     {
237     /*
238     * There is no way that we can splice these together into a valid
239     * alignment
240     */
241     return BowtieHit();
242     }
243    
244     bool found_closure = false;
245     /*
246     * Note that antisense_splice is the same for prev and curr
247     */
248     bool antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
249     vector<CigarOp> new_cigar;
250     int new_left = -1;
251     int mismatch = 0;
252    
253     /*
254     * Take the length of matched bases in each segment into consideration for closures,
255     * this can be a problem for reads of variable lengths.
256     */
257     int prev_right_end_match_length = prev_hit->cigar().back().length;
258     int curr_left_end_match_length = curr_hit->cigar().front().length;
259    
260     if (prev_hit->right() > curr_hit->left())
261     {
262     std::set<Insertion>::iterator lb, ub;
263     /*
264     * Note, this offset is determined by the min-anchor length supplied to
265     * juncs_db, which is currently hard-coded at 3 in tophat.py
266     * as this value determines what sort of read segments
267     * should be able to align directly to the splice sequences
268     */
269     int left_boundary = prev_hit->right() - 4;
270     int right_boundary = curr_hit->left() + 4;
271    
272     /*
273     * Create a dummy sequence to represent the maximum possible insertion
274     */
275     std::string maxInsertedSequence = "";
276     maxInsertedSequence.resize(max_insertion_length,'A');
277    
278     lb = possible_insertions.upper_bound(Insertion(reference_id, left_boundary, ""));
279     ub = possible_insertions.upper_bound(Insertion(reference_id, right_boundary, maxInsertedSequence));
280    
281     int reference_mismatch = 0;
282    
283     while (lb != ub && lb != possible_insertions.end())
284     {
285     /*
286     * In the following code, we will check to make sure that the segments have the proper
287     * separation and sequence for the insertions, and generate the appropriate merged bowtie hit
288     * In general, reads with insertions must match the inserted sequence exactly.
289     */
290     if (((int)lb->sequence.size()) == (prev_hit->right() - curr_hit->left()))
291     {
292     /*
293     * Check we have enough matched bases on prev or curr segment.
294     */
295     int insert_to_prev_right = prev_hit->right() - lb->left - 1;
296     int curr_left_to_insert = lb->left - curr_hit->left() + 1;
297     if (insert_to_prev_right > prev_right_end_match_length || curr_left_to_insert > curr_left_end_match_length)
298     {
299     ++lb;
300     continue;
301     }
302    
303     /*
304     * Keep track of how many mismatches were made to the genome in the region
305     * where we should actually be matching against the insertion
306     */
307     int this_reference_mismatch = 0;
308     int insertion_mismatch = 0;
309     int insertion_len = lb->sequence.length();
310     const seqan::Dna5String insertionSequence = seqan::Dna5String(lb->sequence);
311    
312     /*
313     * First check to see if we need to adjust number of observed errors for the left (prev)
314     * hit. This is only the case if this segment runs into the insertion. To be consistent
315     * with bwt_map.cpp, we will not allow a segment to have errors in the insertion region
316     */
317     string colorSegmentSequence_prev;
318     if (insert_to_prev_right > 0)
319     {
320     const seqan::Dna5String referenceSequence = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
321     const seqan::Dna5String oldSegmentSequence = seqan::Dna5String(prev_hit->seq().substr(prev_hit->seq().length() - insert_to_prev_right));
322    
323     if (color)
324     {
325     string color;
326    
327     if (antisense)
328     color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - insert_to_prev_right - 2, insert_to_prev_right + 1);
329     else
330     color = read_seq.substr(curr_seg_index * segment_length - insert_to_prev_right, insert_to_prev_right + 1);
331    
332     color[0] = prev_hit->seq()[segment_length - insert_to_prev_right - 1];
333     colorSegmentSequence_prev = convert_color_to_bp(color);
334     }
335    
336     const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_prev : oldSegmentSequence;
337    
338     /*
339     * Scan right in the read until we run out of read
340     */
341     for (int read_index = 0; read_index < insert_to_prev_right; ++read_index)
342     {
343     /*
344     * Any mismatch to the insertion is a failure
345     */
346     if (referenceSequence[read_index] == 'N' || referenceSequence[read_index] != oldSegmentSequence[read_index])
347     {
348     ++this_reference_mismatch;
349     }
350    
351     if (read_index < insertion_len)
352     {
353     if (insertionSequence[read_index] == 'N' || insertionSequence[read_index] != newSegmentSequence[read_index])
354     {
355     ++insertion_mismatch;
356     break;
357     }
358     }
359     else
360     {
361     if (referenceSequence[read_index - insertion_len] == 'N' ||
362     referenceSequence[read_index - insertion_len] != newSegmentSequence[read_index])
363     {
364     --this_reference_mismatch;
365     }
366     }
367     }
368     }
369    
370     string colorSegmentSequence_curr;
371     if (curr_left_to_insert > 0)
372     {
373     const seqan::Dna5String referenceSequence = seqan::infix(*ref_str, curr_hit->left(), lb->left + 1);
374     const seqan::Dna5String oldSegmentSequence = seqan::Dna5String(curr_hit->seq().substr(0, curr_left_to_insert));
375    
376     if (color)
377     {
378     string color;
379     if (antisense)
380     color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length), curr_left_to_insert);
381     else
382     color = read_seq.substr(curr_seg_index * segment_length + 2, curr_left_to_insert);
383    
384     color.push_back(curr_hit->seq()[curr_left_to_insert]);
385     reverse(color.begin(), color.end());
386     string bp = convert_color_to_bp(color);
387     reverse(bp.begin(), bp.end());
388     colorSegmentSequence_curr = bp;
389     }
390    
391     const seqan::Dna5String newSegmentSequence = color ? colorSegmentSequence_curr : oldSegmentSequence;
392    
393     /*
394     * Scan left in the read until
395     * We ran out of read sequence (insertion extends past segment)
396     */
397     for (int read_index = 0; read_index < curr_left_to_insert; ++read_index)
398     {
399     int segmentPosition = curr_left_to_insert - read_index - 1;
400     int insertionPosition = insertion_len - read_index - 1;
401    
402     if (referenceSequence[segmentPosition] == 'N' || (referenceSequence[segmentPosition] != oldSegmentSequence[segmentPosition]))
403     {
404     ++this_reference_mismatch;
405     }
406    
407     if (read_index < insertion_len)
408     {
409     if (insertionSequence[insertionPosition] == 'N' || (insertionSequence[insertionPosition] != newSegmentSequence[segmentPosition]))
410     {
411     ++insertion_mismatch;
412     break;
413     }
414     }
415     else
416     {
417     if (referenceSequence[segmentPosition + insertion_len] == 'N' ||
418     (referenceSequence[segmentPosition + insertion_len] != newSegmentSequence[segmentPosition]))
419     {
420     --this_reference_mismatch;
421     }
422     }
423     }
424     }
425    
426     if (found_closure)
427     {
428     fprintf(stderr, "Warning: multiple closures found for insertion read # %d\n", (int)insert_id);
429     return BowtieHit();
430     }
431    
432     if (insertion_mismatch == 0)
433     {
434     reference_mismatch = this_reference_mismatch;
435     mismatch = -reference_mismatch;
436     found_closure = true;
437     new_left = prev_hit->left();
438     new_cigar = prev_hit->cigar();
439    
440     /*
441     * Need to make a new insert operation between the two match character that begin
442     * and end the intersection of these two junction. Note that we necessarily assume
443     * that this insertion can't span beyond the boundaries of these reads. That should
444     * probably be better enforced somewhere
445     */
446    
447     new_cigar.back().length -= insert_to_prev_right;
448     if (new_cigar.back().length <= 0)
449     new_cigar.pop_back();
450    
451     new_cigar.push_back(CigarOp(INS, lb->sequence.size()));
452     vector<CigarOp> new_right_cigar = curr_hit->cigar();
453     new_right_cigar.front().length += (insert_to_prev_right - lb->sequence.size());
454    
455     /*
456     * Finish stitching together the new cigar string
457     */
458     size_t c = new_right_cigar.front().length > 0 ? 0 : 1;
459     for (; c < new_right_cigar.size(); ++c)
460     {
461     new_cigar.push_back(new_right_cigar[c]);
462     }
463    
464     if (color)
465     {
466     if (insert_to_prev_right > 0)
467     seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - insert_to_prev_right, insert_to_prev_right, colorSegmentSequence_prev);
468    
469     if (curr_left_to_insert > 0)
470     seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, curr_left_to_insert, colorSegmentSequence_curr);
471     }
472     }
473     }
474     ++lb;
475     }
476    
477     if (!found_closure)
478     {
479     return BowtieHit();
480     }
481     }
482    
483     /*
484     * Stitch segments together using juctions or deletions if necessary.
485     */
486     else if (prev_hit->right() < curr_hit->left())
487     {
488     std::set<Junction>::iterator lb, ub;
489    
490     int left_boundary = prev_hit->right() - 4;
491     int right_boundary = curr_hit->left() + 4;
492    
493     lb = possible_juncs.upper_bound(Junction(reference_id, left_boundary, right_boundary - 8, true));
494     ub = possible_juncs.lower_bound(Junction(reference_id, left_boundary + 8, right_boundary, false));
495    
496     int new_diff_mismatches = 0xff;
497     while (lb != ub && lb != possible_juncs.end())
498     {
499     int dist_to_left = lb->left - prev_hit->right() + 1;
500     int dist_to_right = lb->right - curr_hit->left();
501    
502     if (abs(dist_to_left) <= 4 && abs(dist_to_right) <= 4 && dist_to_left == dist_to_right)
503     {
504     /*
505     * Check we have enough matched bases on prev or curr segment.
506     */
507     if (dist_to_left > curr_left_end_match_length || -dist_to_left > prev_right_end_match_length )
508     {
509     ++lb;
510     continue;
511     }
512    
513     Dna5String new_cmp_str, old_cmp_str;
514     int new_mismatch = 0, old_mismatch = 0;
515     string new_patch_str; // this is for colorspace reads
516     if (dist_to_left > 0)
517     {
518     new_cmp_str = seqan::infix(*ref_str, prev_hit->right(), lb->left + 1);
519     old_cmp_str = seqan::infix(*ref_str, curr_hit->left(), lb->right);
520    
521     string new_seq;
522     if (color)
523     {
524     string ref = DnaString_to_string(seqan::infix(*ref_str, prev_hit->right() - 1, lb->left + 1));
525    
526     string color, qual;
527     if (antisense)
528     {
529     color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
530     qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1, dist_to_left);
531     }
532     else
533     {
534     color = read_seq.substr(1 + curr_seg_index * segment_length, dist_to_left);
535     qual = read_quals.substr(curr_seg_index * segment_length, dist_to_left);
536     }
537    
538     BWA_decode(color, qual, ref, new_seq);
539     new_seq = new_seq.substr(1);
540     }
541    
542     const string& curr_old_seq = curr_hit->seq();
543     const string& curr_seq = color ? new_seq : curr_hit->seq();
544     for (int i = 0; i < dist_to_left; ++i)
545     {
546     if (curr_seq[i] != new_cmp_str[i])
547     ++new_mismatch;
548    
549     if (curr_old_seq[i] != old_cmp_str[i])
550     ++old_mismatch;
551     }
552    
553     if (color)
554     new_patch_str = curr_seq.substr(0, dist_to_left);
555     }
556     else if (dist_to_left < 0)
557     {
558     new_cmp_str = seqan::infix(*ref_str, lb->right, curr_hit->left());
559     old_cmp_str = seqan::infix(*ref_str, lb->left + 1, prev_hit->right());
560    
561     size_t abs_dist = -dist_to_left;
562     string new_seq;
563     if (color)
564     {
565     string ref = DnaString_to_string(seqan::infix(*ref_str, lb->left, lb->left + 1));
566     ref += DnaString_to_string(seqan::infix(*ref_str, lb->right, curr_hit->left()));
567    
568     string color, qual;
569     if (antisense)
570     {
571     color = rev_read_seq.substr(rev_read_seq.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
572     qual = rev_read_quals.substr(rev_read_quals.length() - (curr_seg_index * segment_length) - 1 - abs_dist, abs_dist);
573     }
574     else
575     {
576     color = read_seq.substr(1 + curr_seg_index * segment_length - abs_dist, abs_dist);
577     qual = read_quals.substr(curr_seg_index * segment_length - abs_dist, abs_dist);
578     }
579    
580     BWA_decode(color, qual, ref, new_seq);
581     new_seq = new_seq.substr(1);
582     }
583    
584     const string& prev_old_seq = prev_hit->seq();
585     size_t prev_old_seq_len = prev_old_seq.length();
586     const string& prev_seq = color ? new_seq : prev_hit->seq();
587     size_t prev_seq_len = prev_seq.length();
588     for (size_t i = 0; i < abs_dist; ++i)
589     {
590     if (prev_seq[prev_seq_len - (abs_dist - i)] != new_cmp_str[i])
591     ++new_mismatch;
592     if (prev_old_seq[prev_old_seq_len - (abs_dist - i)] != old_cmp_str[i])
593     ++old_mismatch;
594     }
595    
596     if (color)
597     new_patch_str = prev_seq.substr(prev_seq_len - abs_dist, abs_dist);
598     }
599    
600     int temp_diff_mismatches = new_mismatch - old_mismatch;
601     if (temp_diff_mismatches >= new_diff_mismatches || new_mismatch >= 2)
602     {
603     ++lb;
604     continue;
605     }
606    
607     if (color)
608     {
609     /*
610     * We need to recover the origianl sequence.
611     */
612     if (found_closure)
613     {
614     seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length - 4, 8,
615     prev_hit->seq().substr(prev_hit->seq().length() - 4) + curr_hit->seq().substr(0, 4));
616     }
617    
618     if (dist_to_left > 0)
619     seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length, dist_to_left, new_patch_str);
620     else if (dist_to_left < 0)
621     seq.replace(first_seg_length + (curr_seg_index - 1) * segment_length + dist_to_left, -dist_to_left, new_patch_str);
622     }
623    
624     new_diff_mismatches = temp_diff_mismatches;
625    
626     new_left = prev_hit->left();
627     new_cigar = prev_hit->cigar();
628    
629     int new_left_back_len = new_cigar.back().length;
630     new_left_back_len += dist_to_left;
631    
632     vector<CigarOp> new_right_cig = curr_hit->cigar();
633     int new_right_front_len = new_right_cig.front().length;
634     new_right_front_len -= dist_to_right;
635     if (new_left_back_len > 0)
636     new_cigar.back().length = new_left_back_len;
637     else
638     new_cigar.pop_back();
639    
640     /*
641     * FIXME, currently just differentiating between a deletion and a
642     * reference skip based on length. However, would probably be better
643     * to denote the difference explicitly, this would allow the user
644     * to supply their own (very large) deletions
645     */
646     if ((lb->right - lb->left - 1) <= max_deletion_length)
647     {
648     new_cigar.push_back(CigarOp(DEL, lb->right - lb->left - 1));
649     antisense_closure = prev_hit->is_spliced() ? prev_hit->antisense_splice() : curr_hit->antisense_splice();
650     }
651     else
652     {
653     new_cigar.push_back(CigarOp(REF_SKIP, lb->right - lb->left - 1));
654     antisense_closure = lb->antisense;
655     }
656    
657     new_right_cig.front().length = new_right_front_len;
658     size_t c = new_right_front_len > 0 ? 0 : 1;
659     for (; c < new_right_cig.size(); ++c)
660     new_cigar.push_back(new_right_cig[c]);
661    
662     mismatch = new_diff_mismatches;
663     found_closure = true;
664     }
665     ++lb;
666     }
667    
668     if (!found_closure)
669     {
670     return BowtieHit();
671     }
672     }
673    
674     if (found_closure)
675     {
676     bool end = false;
677     BowtieHit merged_hit(reference_id,
678     insert_id,
679     new_left,
680     new_cigar,
681     antisense,
682     antisense_closure,
683     prev_hit->edit_dist() + curr_hit->edit_dist() + mismatch,
684     prev_hit->splice_mms() + curr_hit->splice_mms(),
685     end);
686    
687     if (curr_seg_index > 1)
688     merged_hit.seq(seq.substr(first_seg_length + (curr_seg_index - 1) * segment_length, 2 * segment_length));
689     else
690     merged_hit.seq(seq.substr(0, first_seg_length + segment_length));
691    
692     prev_hit = hit_chain.erase(prev_hit, ++curr_hit);
693     /*
694     * prev_hit now points PAST the last element removed
695     */
696     prev_hit = hit_chain.insert(prev_hit, merged_hit);
697     /*
698     * merged_hit has been inserted before the old position of
699     * prev_hit. New location of prev_hit is merged_hit
700     */
701     curr_hit = prev_hit;
702     ++curr_hit;
703     ++curr_seg_index;
704     continue;
705     }
706    
707     ++prev_hit;
708     ++curr_hit;
709     ++curr_seg_index;
710     }
711    
712     bool saw_antisense_splice = false;
713     bool saw_sense_splice = false;
714     vector<CigarOp> long_cigar;
715     int num_mismatches = 0;
716     int num_splice_mms = 0;
717     for (list<BowtieHit>::iterator s = hit_chain.begin(); s != hit_chain.end(); ++s)
718     {
719     num_mismatches += s->edit_dist();
720     num_splice_mms += s->splice_mms();
721    
722     /*
723     * Check whether the sequence contains any reference skips. Previously,
724     * this was just a check to see whether the sequence was contiguous; however
725     * we don't want to count an indel event as a splice
726     */
727     bool containsSplice = s->is_spliced();
728     if (containsSplice)
729     {
730     if (s->antisense_splice())
731     {
732     if (saw_sense_splice)
733     return BowtieHit();
734     saw_antisense_splice = true;
735     }
736     else
737     {
738     if (saw_antisense_splice)
739     return BowtieHit();
740     saw_sense_splice = true;
741     }
742     }
743     const vector<CigarOp>& cigar = s->cigar();
744     if (long_cigar.empty())
745     {
746     long_cigar = cigar;
747     }
748     else
749     {
750     CigarOp& last = long_cigar.back();
751     /*
752     * If necessary, merge the back and front
753     * cigar operations
754     */
755     if(last.opcode == cigar[0].opcode){
756     last.length += cigar[0].length;
757     for (size_t b = 1; b < cigar.size(); ++b)
758     {
759     long_cigar.push_back(cigar[b]);
760     }
761     }else{
762     for(size_t b = 0; b < cigar.size(); ++b)
763     {
764     long_cigar.push_back(cigar[b]);
765     }
766     }
767     }
768     }
769    
770     bool end = false;
771     BowtieHit new_hit(reference_id,
772     insert_id,
773     left,
774     long_cigar,
775     antisense,
776     saw_antisense_splice,
777     num_mismatches,
778     num_splice_mms,
779     end);
780    
781     new_hit.seq(seq);
782     new_hit.qual(qual);
783    
784     int new_read_len = new_hit.read_len();
785     if (new_read_len != old_read_length || !new_hit.check_editdist_consistency(rt))
786     {
787     fprintf(stderr, "Warning: malformed closure\n");
788     return BowtieHit();
789     }
790    
791     return new_hit;
792     }
793    
794     int multi_closure = 0;
795     int anchor_too_short = 0;
796     int gap_too_short = 0;
797    
798     bool valid_hit(const BowtieHit& bh)
799     {
800     if (bh.insert_id())
801     {
802     /*
803     * validate the cigar chain - no gaps shorter than an intron, etc.
804     * also,
805     * -Don't start or end with an indel or refskip
806     * -Only a match operation is allowed is allowed
807     * adjacent to an indel or refskip
808     * -Indels should confrom to length restrictions
809     */
810     const CigarOp* prevCig = &(bh.cigar()[0]);
811     const CigarOp* currCig = &(bh.cigar()[1]);
812     for (size_t i = 1; i < bh.cigar().size(); ++i){
813     currCig = &(bh.cigar()[i]);
814     if(currCig->opcode != MATCH && prevCig->opcode != MATCH){
815     return false;
816     }
817     if(currCig->opcode == INS){
818     if(currCig->length > max_insertion_length){
819     return false;
820     }
821     }
822     if(currCig->opcode == DEL){
823     if(currCig->length > max_deletion_length){
824     return false;
825     }
826     }
827     if(currCig->opcode == REF_SKIP){
828     if(currCig->length < (uint64_t)min_report_intron_length){
829     gap_too_short++;
830     return false;
831     }
832     }
833     prevCig = currCig;
834     }
835     if (bh.cigar().front().opcode != MATCH ||
836     bh.cigar().back().opcode != MATCH /* ||
837     (int)bh.cigar().front().length < min_anchor_len||
838     (int)bh.cigar().back().length < min_anchor_len*/ )
839     {
840     anchor_too_short++;
841     return false;
842     }
843     }
844     else
845     {
846     multi_closure++;
847     return false;
848     }
849    
850     return true;
851     }
852    
853     void merge_segment_chain(RefSequenceTable& rt,
854     const string& read_seq,
855     const string& read_quals,
856     std::set<Junction>& possible_juncs,
857     std::set<Insertion>& possible_insertions,
858     vector<BowtieHit>& hits,
859     vector<BowtieHit>& merged_hits)
860     {
861     if (hits.size() == 0)
862     return;
863    
864     BowtieHit bh;
865     if (hits.size() > 1)
866     {
867     list<BowtieHit> hit_chain;
868    
869     if (hits.front().antisense_align())
870     copy(hits.rbegin(), hits.rend(), back_inserter(hit_chain));
871     else
872     copy(hits.begin(), hits.end(), back_inserter(hit_chain));
873    
874     bh = merge_chain(rt, read_seq, read_quals, possible_juncs, possible_insertions, hit_chain);
875     }
876     else
877     {
878     bh = hits[0];
879     }
880    
881     if (valid_hit(bh))
882     merged_hits.push_back(bh);
883     }
884    
885     bool dfs_seg_hits(RefSequenceTable& rt,
886     const string& read_seq,
887     const string& read_quals,
888     std::set<Junction>& possible_juncs,
889     std::set<Insertion>& possible_insertions,
890     vector<HitsForRead>& seg_hits_for_read,
891     size_t curr,
892     vector<BowtieHit>& seg_hit_stack,
893     vector<BowtieHit>& joined_hits)
894     {
895     assert (!seg_hit_stack.empty());
896     bool join_success = false;
897    
898     if (curr < seg_hits_for_read.size())
899     {
900     for (size_t i = 0; i < seg_hits_for_read[curr].hits.size(); ++i)
901     {
902     BowtieHit& bh = seg_hits_for_read[curr].hits[i];
903     BowtieHit& back = seg_hit_stack.back();
904    
905     bool consistent_sense = bh.antisense_align() == back.antisense_align();
906     bool same_contig = bh.ref_id() == back.ref_id();
907    
908     // FIXME: when we have stranded reads, we need to fix this condition
909     //bool consistent_strand = (bh.contiguous() || back.contiguous() ||
910     // (bh.antisense_splice() == back.antisense_splice()));
911     if (consistent_sense && same_contig /*&& consistent_strand*/)
912     {
913     if (bh.antisense_align())
914     {
915     unsigned int bh_r = bh.right();
916     unsigned int back_left = seg_hit_stack.back().left();
917     if ((bh_r + max_report_intron_length >= back_left &&
918     back_left >= bh_r) || (back_left + max_insertion_length >= bh_r && back_left < bh_r))
919     {
920     // these hits are compatible, so push bh onto the
921     // stack, recurse, and pop it when done.
922     seg_hit_stack.push_back(bh);
923     bool success = dfs_seg_hits(rt,
924     read_seq,
925     read_quals,
926     possible_juncs,
927     possible_insertions,
928     seg_hits_for_read,
929     curr + 1,
930     seg_hit_stack,
931     joined_hits);
932     if (success)
933     join_success = true;
934    
935     seg_hit_stack.pop_back();
936     }
937     }
938     else
939     {
940     unsigned int bh_l = bh.left();
941    
942     unsigned int back_right = seg_hit_stack.back().right();
943     if ((back_right + max_report_intron_length >= bh_l &&
944     bh_l >= back_right) || (bh_l + max_insertion_length >= back_right && bh_l < back_right))
945     {
946     // these hits are compatible, so push bh onto the
947     // stack, recurse, and pop it when done.
948     seg_hit_stack.push_back(bh);
949     bool success = dfs_seg_hits(rt,
950     read_seq,
951     read_quals,
952     possible_juncs,
953     possible_insertions,
954     seg_hits_for_read,
955     curr + 1,
956     seg_hit_stack,
957     joined_hits);
958    
959     if (success)
960     join_success = true;
961    
962     seg_hit_stack.pop_back();
963     }
964     }
965     }
966     }
967     }
968     else
969     {
970     merge_segment_chain(rt,
971     read_seq,
972     read_quals,
973     possible_juncs,
974     possible_insertions,
975     seg_hit_stack,
976     joined_hits);
977     return join_success = true;
978     }
979     return join_success;
980     }
981    
982     bool join_segments_for_read(RefSequenceTable& rt,
983     const string& read_seq,
984     const string& read_quals,
985     std::set<Junction>& possible_juncs,
986     std::set<Insertion>& possible_insertions,
987     vector<HitsForRead>& seg_hits_for_read,
988     vector<BowtieHit>& joined_hits)
989     {
990     vector<BowtieHit> seg_hit_stack;
991     bool join_success = false;
992    
993     for (size_t i = 0; i < seg_hits_for_read[0].hits.size(); ++i)
994     {
995     BowtieHit& bh = seg_hits_for_read[0].hits[i];
996     seg_hit_stack.push_back(bh);
997     bool success = dfs_seg_hits(rt,
998     read_seq,
999     read_quals,
1000     possible_juncs,
1001     possible_insertions,
1002     seg_hits_for_read,
1003     1,
1004     seg_hit_stack,
1005     joined_hits);
1006     if (success)
1007     join_success = true;
1008     seg_hit_stack.pop_back();
1009     }
1010    
1011     return join_success;
1012     }
1013    
1014     void join_segment_hits(GBamWriter& bam_writer, std::set<Junction>& possible_juncs,
1015     std::set<Insertion>& possible_insertions,
1016     ReadTable& unmapped_reads,
1017     RefSequenceTable& rt,
1018     FILE* reads_file,
1019     vector<HitStream>& contig_hits,
1020     vector<HitStream>& spliced_hits)
1021     {
1022     uint32_t curr_contig_obs_order = 0xFFFFFFFF;
1023     HitStream* first_seg_contig_stream = NULL;
1024     uint64_t next_contig_id = 0;
1025    
1026     if (contig_hits.size())
1027     {
1028     first_seg_contig_stream = &(contig_hits.front());
1029     next_contig_id = first_seg_contig_stream->next_group_id();
1030     curr_contig_obs_order = unmapped_reads.observation_order(next_contig_id);
1031     }
1032    
1033     HitsForRead curr_hit_group;
1034    
1035     uint32_t curr_spliced_obs_order = 0xFFFFFFFF;
1036     HitStream* first_seg_spliced_stream = NULL;
1037     uint64_t next_spliced_id = 0;
1038    
1039     if (spliced_hits.size())
1040     {
1041     first_seg_spliced_stream = &(spliced_hits.front());
1042     next_spliced_id = first_seg_spliced_stream->next_group_id();
1043     curr_spliced_obs_order = unmapped_reads.observation_order(next_spliced_id);
1044     }
1045    
1046     while(curr_contig_obs_order != 0xFFFFFFFF ||
1047     curr_spliced_obs_order != 0xFFFFFFFF)
1048     {
1049     uint32_t read_in_process;
1050     vector<HitsForRead> seg_hits_for_read;
1051     seg_hits_for_read.resize(contig_hits.size());
1052    
1053     if (curr_contig_obs_order < curr_spliced_obs_order)
1054     {
1055     first_seg_contig_stream->next_read_hits(curr_hit_group);
1056     seg_hits_for_read.front() = curr_hit_group;
1057    
1058     next_contig_id = first_seg_contig_stream->next_group_id();
1059     uint32_t next_order = unmapped_reads.observation_order(next_contig_id);
1060    
1061     read_in_process = curr_contig_obs_order;
1062     curr_contig_obs_order = next_order;
1063     }
1064     else if (curr_spliced_obs_order < curr_contig_obs_order)
1065     {
1066     first_seg_spliced_stream->next_read_hits(curr_hit_group);
1067     seg_hits_for_read.front() = curr_hit_group;
1068    
1069     next_spliced_id = first_seg_spliced_stream->next_group_id();
1070     uint32_t next_order = unmapped_reads.observation_order(next_spliced_id);
1071    
1072     read_in_process = curr_spliced_obs_order;
1073     curr_spliced_obs_order = next_order;
1074     }
1075     else if (curr_contig_obs_order == curr_spliced_obs_order &&
1076     curr_contig_obs_order != 0xFFFFFFFF &&
1077     curr_spliced_obs_order != 0xFFFFFFFF)
1078     {
1079     first_seg_contig_stream->next_read_hits(curr_hit_group);
1080    
1081     HitsForRead curr_spliced_group;
1082     first_seg_spliced_stream->next_read_hits(curr_spliced_group);
1083    
1084     curr_hit_group.hits.insert(curr_hit_group.hits.end(),
1085     curr_spliced_group.hits.begin(),
1086     curr_spliced_group.hits.end());
1087     seg_hits_for_read.front() = curr_hit_group;
1088     read_in_process = curr_spliced_obs_order;
1089    
1090     next_contig_id = first_seg_contig_stream->next_group_id();
1091     uint32_t next_order = unmapped_reads.observation_order(next_contig_id);
1092    
1093     next_spliced_id = first_seg_spliced_stream->next_group_id();
1094     uint32_t next_spliced_order = unmapped_reads.observation_order(next_spliced_id);
1095    
1096     curr_spliced_obs_order = next_spliced_order;
1097     curr_contig_obs_order = next_order;
1098     }
1099     else
1100     {
1101     break;
1102     }
1103    
1104     if (contig_hits.size() > 1)
1105     {
1106     look_right_for_hit_group(unmapped_reads,
1107     contig_hits,
1108     0,
1109     spliced_hits,
1110     curr_hit_group,
1111     seg_hits_for_read);
1112     }
1113    
1114     size_t last_non_empty = seg_hits_for_read.size() - 1;
1115     while(last_non_empty >= 0 && seg_hits_for_read[last_non_empty].hits.empty())
1116     {
1117     --last_non_empty;
1118     }
1119    
1120     seg_hits_for_read.resize(last_non_empty + 1);
1121     if (!seg_hits_for_read[last_non_empty].hits[0].end())
1122     continue;
1123    
1124     if (!seg_hits_for_read.empty() && !seg_hits_for_read[0].hits.empty())
1125     {
1126     uint64_t insert_id = seg_hits_for_read[0].hits[0].insert_id();
1127     char read_name[256];
1128     char read_seq[256];
1129     char read_alt_name[256];
1130     char read_quals[256];
1131    
1132     if (get_read_from_stream(insert_id,
1133     reads_file,
1134     reads_format,
1135     false,
1136     read_name,
1137     read_seq,
1138     read_alt_name,
1139     read_quals))
1140     {
1141     vector<BowtieHit> joined_hits;
1142     join_segments_for_read(rt,
1143     read_seq,
1144     read_quals,
1145     possible_juncs,
1146     possible_insertions,
1147     seg_hits_for_read,
1148     joined_hits);
1149    
1150     sort(joined_hits.begin(), joined_hits.end());
1151     vector<BowtieHit>::iterator new_end = unique(joined_hits.begin(), joined_hits.end());
1152     joined_hits.erase(new_end, joined_hits.end());
1153    
1154     for (size_t i = 0; i < joined_hits.size(); i++)
1155     {
1156     const char* ref_name = rt.get_name(joined_hits[i].ref_id());
1157     if (color && !color_out)
1158     //print_hit(stdout, read_name, joined_hits[i], ref_name, joined_hits[i].seq().c_str(), joined_hits[i].qual().c_str(), true);
1159     print_bamhit(bam_writer, read_name, joined_hits[i], ref_name, joined_hits[i].seq().c_str(), joined_hits[i].qual().c_str(), true);
1160     else
1161     print_bamhit(bam_writer, read_name, joined_hits[i], ref_name, read_seq, read_quals, false);
1162     //print_hit(stdout, read_name, joined_hits[i], ref_name, read_seq, read_quals, false);
1163     }
1164     }
1165     else
1166     {
1167     err_die("Error: could not get read # %d from stream\n",
1168     read_in_process);
1169     }
1170     }
1171     else
1172     {
1173     //fprintf(stderr, "Warning: couldn't join segments for read # %d\n", read_in_process);
1174     }
1175     }
1176     }
1177    
1178     void driver(GBamWriter& bam_writer, istream& ref_stream,
1179     vector<FILE*>& possible_juncs_files,
1180     vector<FILE*>& possible_insertions_files,
1181     vector<FILE*>& possible_deletions_files,
1182 gpertea 69 vector<FZPipe>& spliced_segmap_files, //.sam.z files
1183     vector<string>& segmap_fnames, //.bam files
1184 gpertea 29 FZPipe& reads_file)
1185     {
1186 gpertea 69 if (segmap_fnames.size() == 0)
1187 gpertea 29 {
1188     fprintf(stderr, "No hits to process, exiting\n");
1189     exit(0);
1190     }
1191    
1192     RefSequenceTable rt(true, true);
1193     fprintf (stderr, "Loading reference sequences...\n");
1194     get_seqs(ref_stream, rt, true, false);
1195     fprintf (stderr, " reference sequences loaded.\n");
1196     ReadTable it;
1197    
1198     bool need_seq = true;
1199     bool need_qual = color;
1200     //rewind(reads_file);
1201     reads_file.rewind();
1202    
1203     //vector<HitTable> seg_hits;
1204     vector<HitStream> contig_hits;
1205     vector<HitStream> spliced_hits;
1206    
1207     vector<HitFactory*> factories;
1208 gpertea 69 for (size_t i = 0; i < segmap_fnames.size(); ++i)
1209 gpertea 29 {
1210 gpertea 69 HitFactory* fac = new BAMHitFactory(it, rt);
1211 gpertea 29 factories.push_back(fac);
1212 gpertea 69 HitStream hs(segmap_fnames[i], fac, false, false, false, need_seq, need_qual);
1213 gpertea 29 contig_hits.push_back(hs);
1214     }
1215    
1216     fprintf(stderr, "Loading spliced hits...");
1217    
1218     //vector<vector<pair<uint32_t, HitsForRead> > > spliced_hits;
1219 gpertea 69 for (size_t i = 0; i < spliced_segmap_files.size(); ++i)
1220 gpertea 29 {
1221     int anchor_length = 0;
1222    
1223     // we allow read alignments even 1bp overlapping with juctions.
1224     // if (i == 0 || i == spliced_seg_files.size() - 1)
1225     // anchor_length = min_anchor_len;
1226    
1227 gpertea 69 HitFactory* fac = new SplicedSAMHitFactory(it,
1228 gpertea 29 rt,
1229     anchor_length);
1230     factories.push_back(fac);
1231    
1232 gpertea 69 HitStream hs(spliced_segmap_files[i].file, fac, true, false, false, need_seq, need_qual);
1233 gpertea 29 spliced_hits.push_back(hs);
1234    
1235     }
1236     fprintf(stderr, "done\n");
1237    
1238     fprintf(stderr, "Loading junctions...");
1239     std::set<Junction> possible_juncs;
1240    
1241     for (size_t i = 0; i < possible_juncs_files.size(); ++i)
1242     {
1243     char buf[2048];
1244     while(!feof(possible_juncs_files[i]) &&
1245     fgets(buf, sizeof(buf), possible_juncs_files[i]))
1246     {
1247     char junc_ref_name[256];
1248     int left;
1249     int right;
1250     char orientation;
1251     int ret = sscanf(buf, "%s %d %d %c", junc_ref_name, &left, &right, &orientation);
1252     if (ret != 4)
1253     continue;
1254     uint32_t ref_id = rt.get_id(junc_ref_name, NULL, 0);
1255     possible_juncs.insert(Junction(ref_id, left, right, orientation == '-'));
1256     }
1257     }
1258     fprintf(stderr, "done\n");
1259    
1260    
1261     fprintf(stderr, "Loading deletions...");
1262    
1263     for (size_t i = 0; i < possible_deletions_files.size(); ++i)
1264     {
1265     char splice_buf[2048];
1266     FILE* deletions_file = possible_deletions_files[i];
1267     if(!deletions_file){
1268     continue;
1269     }
1270     while(fgets(splice_buf, 2048, deletions_file)){
1271     char* nl = strrchr(splice_buf, '\n');
1272     char* buf = splice_buf;
1273     if (nl) *nl = 0;
1274    
1275     char* ref_name = get_token((char**)&buf, "\t");
1276     char* scan_left_coord = get_token((char**)&buf, "\t");
1277     char* scan_right_coord = get_token((char**)&buf, "\t");
1278    
1279     if (!scan_left_coord || !scan_right_coord)
1280     {
1281     err_die("Error: malformed deletion coordinate record\n");
1282     }
1283    
1284     uint32_t ref_id = rt.get_id(ref_name,NULL,0);
1285     uint32_t left_coord = atoi(scan_left_coord);
1286     uint32_t right_coord = atoi(scan_right_coord);
1287     possible_juncs.insert((Junction)Deletion(ref_id, left_coord - 1,right_coord, false));
1288     }
1289     }
1290     fprintf(stderr, "done\n");
1291    
1292    
1293     /*
1294     * Read the insertions from the list of insertion
1295     * files into a set
1296     */
1297     std::set<Insertion> possible_insertions;
1298     for (size_t i=0; i < possible_insertions_files.size(); ++i)
1299     {
1300     char splice_buf[2048];
1301     FILE* insertions_file = possible_insertions_files[i];
1302     if(!insertions_file){
1303     continue;
1304     }
1305     while(fgets(splice_buf, 2048, insertions_file)){
1306     char* nl = strrchr(splice_buf, '\n');
1307     char* buf = splice_buf;
1308     if (nl) *nl = 0;
1309    
1310     char* ref_name = get_token((char**)&buf, "\t");
1311     char* scan_left_coord = get_token((char**)&buf, "\t");
1312     char* scan_right_coord = get_token((char**)&buf, "\t");
1313     char* scan_sequence = get_token((char**)&buf, "\t");
1314    
1315     if (!scan_left_coord || !scan_sequence || !scan_right_coord)
1316     {
1317     err_die("Error: malformed insertion coordinate record\n");
1318     }
1319    
1320     uint32_t ref_id = rt.get_id(ref_name,NULL,0);
1321     uint32_t left_coord = atoi(scan_left_coord);
1322     std::string sequence(scan_sequence);
1323     possible_insertions.insert(Insertion(ref_id, left_coord, sequence));
1324     }
1325     }
1326    
1327     join_segment_hits(bam_writer, possible_juncs, possible_insertions, it, rt, reads_file.file, contig_hits, spliced_hits);
1328     //join_segment_hits(possible_juncs, possible_insertions, it, rt, reads_file.file, contig_hits, spliced_hits);
1329    
1330 gpertea 69 /*for (size_t i = 0; i < segmap_fnames.size(); ++i)
1331     segmap_fnames[i].close();
1332     */
1333     for (size_t i = 0; i < spliced_segmap_files.size(); ++i)
1334     spliced_segmap_files[i].close();
1335 gpertea 29 reads_file.close();
1336    
1337     for (size_t fac = 0; fac < factories.size(); ++fac)
1338     {
1339     delete factories[fac];
1340     }
1341     } //driver
1342    
1343     int main(int argc, char** argv)
1344     {
1345     fprintf(stderr, "long_spanning_reads v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
1346     fprintf(stderr, "--------------------------------------------\n");
1347    
1348     int parse_ret = parse_options(argc, argv, print_usage);
1349     if (parse_ret)
1350     return parse_ret;
1351    
1352     if(optind >= argc)
1353     {
1354     print_usage();
1355     return 1;
1356     }
1357    
1358     string ref_file_name = argv[optind++];
1359    
1360     if(optind >= argc)
1361     {
1362     print_usage();
1363     return 1;
1364     }
1365    
1366    
1367     string reads_file_name = argv[optind++];
1368    
1369     if(optind >= argc)
1370     {
1371     print_usage();
1372     return 1;
1373     }
1374    
1375     string juncs_file_list = argv[optind++];
1376    
1377     if(optind >= argc)
1378     {
1379     print_usage();
1380     return 1;
1381     }
1382    
1383     string insertions_file_list = argv[optind++];
1384     if(optind >= argc)
1385     {
1386     print_usage();
1387     return 1;
1388     }
1389    
1390     string deletions_file_list = argv[optind++];
1391    
1392     if(optind >= argc)
1393     {
1394     print_usage();
1395     return 1;
1396     }
1397    
1398    
1399 gpertea 69 string segmap_file_list = argv[optind++];
1400     string spliced_segmap_flist;
1401 gpertea 29 if(optind < argc)
1402     {
1403 gpertea 69 spliced_segmap_flist = argv[optind++];
1404 gpertea 29 }
1405    
1406     ifstream ref_stream(ref_file_name.c_str(), ifstream::in);
1407     if (!ref_stream.good())
1408     err_die("Error: cannot open %s for reading\n",ref_file_name.c_str());
1409    
1410     checkSamHeader();
1411    
1412     //FILE* reads_file = fopen(reads_file_name.c_str(), "r");
1413 gpertea 69 vector<string> segmap_file_names;
1414     tokenize(segmap_file_list, ",",segmap_file_names);
1415 gpertea 29
1416 gpertea 69 string unzcmd=getUnpackCmd(reads_file_name, spliced_segmap_flist.empty() &&
1417     segmap_file_names.size()<4);
1418 gpertea 29 FZPipe reads_file(reads_file_name, unzcmd);
1419     if (reads_file.file==NULL)
1420     err_die("Error: cannot open %s for reading\n",
1421     reads_file_name.c_str());
1422    
1423     vector<string> juncs_file_names;
1424     vector<FILE*> juncs_files;
1425     tokenize(juncs_file_list, ",",juncs_file_names);
1426     for (size_t i = 0; i < juncs_file_names.size(); ++i) {
1427     //fprintf(stderr, "Opening %s for reading\n",
1428     // juncs_file_names[i].c_str());
1429     FILE* juncs_file = fopen(juncs_file_names[i].c_str(), "r");
1430     if (juncs_file == NULL) {
1431     fprintf(stderr, "Warning: cannot open %s for reading\n",
1432     juncs_file_names[i].c_str());
1433     continue;
1434     }
1435     juncs_files.push_back(juncs_file);
1436     }
1437    
1438     /*
1439     * Read in the deletion file names
1440     */
1441     vector<string> deletions_file_names;
1442     vector<FILE*> deletions_files;
1443     tokenize(deletions_file_list, ",",deletions_file_names);
1444     for (size_t i = 0; i < deletions_file_names.size(); ++i)
1445     {
1446     //fprintf(stderr, "Opening %s for reading\n",
1447     // deletions_file_names[i].c_str());
1448     FILE* deletions_file = fopen(deletions_file_names[i].c_str(), "r");
1449     if (deletions_file == NULL) {
1450     fprintf(stderr, "Warning: cannot open %s for reading\n",
1451     deletions_file_names[i].c_str());
1452     continue;
1453     }
1454     deletions_files.push_back(deletions_file);
1455     }
1456    
1457     /*
1458     * Read in the list of filenames that contain
1459     * insertion coordinates
1460     */
1461    
1462     vector<string> insertions_file_names;
1463     vector<FILE*> insertions_files;
1464     tokenize(insertions_file_list, ",",insertions_file_names);
1465     for (size_t i = 0; i < insertions_file_names.size(); ++i)
1466     {
1467     //fprintf(stderr, "Opening %s for reading\n",
1468     // insertions_file_names[i].c_str());
1469     FILE* insertions_file = fopen(insertions_file_names[i].c_str(), "r");
1470     if (insertions_file == NULL)
1471     {
1472     fprintf(stderr, "Warning: cannot open %s for reading\n",
1473     insertions_file_names[i].c_str());
1474     continue;
1475     }
1476     insertions_files.push_back(insertions_file);
1477     }
1478    
1479 gpertea 69 //vector<FZPipe> segmap_files;
1480     /*
1481     vector<string> segmap_files; //BAM files
1482     for (size_t i = 0; i < segmap_file_names.size(); ++i)
1483 gpertea 29 {
1484 gpertea 69 fprintf(stderr, "Opening %s for reading\n",
1485     segmap_file_names[i].c_str());
1486     FZPipe seg_file(segmap_file_names[i], unzcmd);
1487 gpertea 29 if (seg_file.file == NULL)
1488     {
1489     err_die("Error: cannot open %s for reading\n",
1490 gpertea 69 segmap_file_names[i].c_str());
1491 gpertea 29 }
1492 gpertea 69 segmap_files.push_back(seg_file);
1493 gpertea 29
1494 gpertea 69 }*/
1495    
1496     vector<string> spliced_segmap_file_names;
1497     vector<FZPipe> spliced_segmap_files;
1498     //these are headerless .sam.z files
1499     tokenize(spliced_segmap_flist, ",",spliced_segmap_file_names);
1500     for (size_t i = 0; i < spliced_segmap_file_names.size(); ++i)
1501 gpertea 29 {
1502 gpertea 69 if (i==0) {
1503     unzcmd=getUnpackCmd(spliced_segmap_file_names[i]);
1504     }
1505     fprintf(stderr, "Opening %s for reading\n",
1506     spliced_segmap_file_names[i].c_str());
1507     FZPipe spliced_seg_file(spliced_segmap_file_names[i], unzcmd);
1508 gpertea 29 if (spliced_seg_file.file == NULL)
1509     {
1510     err_die("Error: cannot open %s for reading\n",
1511 gpertea 69 spliced_segmap_file_names[i].c_str());
1512 gpertea 29 }
1513 gpertea 69 spliced_segmap_files.push_back(spliced_seg_file);
1514 gpertea 29 }
1515    
1516 gpertea 69 if (spliced_segmap_files.size())
1517 gpertea 29 {
1518 gpertea 69 if (spliced_segmap_files.size() != segmap_file_names.size())
1519 gpertea 29 {
1520 gpertea 69 err_die("Error: each segment mapping file must have a corresponding spliced segment mapping file\n");
1521 gpertea 29 }
1522     }
1523     GBamWriter bam_writer("-", sam_header.c_str());
1524 gpertea 69 driver(bam_writer, ref_stream, juncs_files, insertions_files, deletions_files, spliced_segmap_files, segmap_file_names, reads_file);
1525 gpertea 29 return 0;
1526     }