ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/insertions.cpp
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (8 years, 4 months ago) by gpertea
File size: 5167 byte(s)
Log Message:
massive update with Daehwan's work

Line User Rev File contents
1 gpertea 29 /*
2     * insertions.cpp
3     * TopHat
4     *
5     * Created by Ryan Kelley on 11/04/2010.
6     *
7     */
8    
9     #ifdef HAVE_CONFIG_H
10     #include <config.h>
11     #else
12     #define PACKAGE_VERSION "INTERNAL"
13     #define SVN_REVISION "XXX"
14     #endif
15    
16    
17     #include <cassert>
18     #include <cstdio>
19     #include <cstring>
20     #include <vector>
21     #include <string>
22     #include <map>
23     #include <algorithm>
24     #include <set>
25     #include <stdexcept>
26     #include <iostream>
27     #include <fstream>
28     #include <seqan/sequence.h>
29     #include <seqan/find.h>
30     #include <seqan/file.h>
31     #include <getopt.h>
32    
33     #include "common.h"
34     #include "bwt_map.h"
35     #include "junctions.h"
36     #include "insertions.h"
37     #include "fragments.h"
38     #include "wiggles.h"
39     #include "tokenize.h"
40     #include "reads.h"
41    
42     #include "inserts.h"
43    
44     /**
45     * Add insertions from an alignment to an InsertionSet.
46     * This will look for insertion in the alignment specified by bh. If the
47     * insertion is already in insertions, it will updated the count. Otherwise,
48     * it will add the insertion to the set and initialize the count to 1.
49     * @param bh The bowtie hit to be used to specify alignment infromation.
50     * @param insertions The InsertionSet that will be updated with the insertion information from teh alignment.
51     */
52     void insertions_from_alignment(const BowtieHit& bh, InsertionSet& insertions){
53    
54     vector<Insertion> new_insertions;
55     insertions_from_spliced_hit(bh, new_insertions);
56    
57     for(size_t i = 0; i < new_insertions.size(); ++i){
58     Insertion insertion = new_insertions[i];
59     InsertionSet::iterator itr = insertions.find(insertion);
60     if (itr != insertions.end()){
61     itr->second += 1;
62     }
63     else{
64 gpertea 135 assert(insertion.refid != VMAXINT32);
65 gpertea 29 insertions[insertion] = 1;
66     }
67     }
68     return;
69     }
70    
71     /**
72     * Print insertions in BED format.
73     * Note, as per the BED-standard (http://genome.ucsc.edu/FAQ/FAQformat)
74     * -The coordinates should be 0-based
75     * -The chromEnd field should not include the actual feature
76     * -The name will be the inserted sequence
77     * -The score will be the number of supporing counts, which is capped at 1,000
78     * By (my) convention, the chromStart will be the last genome postion
79     * before hte insertio.
80     *
81     * <chrom>\t<left>\t<left>\t<inserted sequence>\t<read count>\n
82     * @param insertions_out The output file
83     * @param insertions Maps from insertions to number of supporting reads
84     * @param ref_sequences The table of reference sequences
85     */
86     void print_insertions(FILE* insertions_out, const InsertionSet& insertions, RefSequenceTable& ref_sequences){
87     fprintf(insertions_out, "track name=insertions description=\"TopHat insertions\"\n");
88     for(InsertionSet::const_iterator i = insertions.begin(); i != insertions.end(); ++i){
89     int counts = i->second;
90     if(counts > 1000){
91     counts = 1000;
92     }
93     fprintf(insertions_out, "%s\t%d\t%d\t%s\t%d\n",
94     ref_sequences.get_name(i->first.refid),
95     i->first.left,
96     i->first.left,
97     (i->first.sequence).c_str(),
98     counts);
99     }
100     }
101    
102     /**
103     * Extract a list of insertions from a bowtie hit.
104     * Given a bowtie hit, extract a vector of insertions.
105     * @param bh The bowtie hit to use for alignment information.
106     * @param insertions Used to store the resultant vector of insertions.
107     */
108     void insertions_from_spliced_hit(const BowtieHit& bh, vector<Insertion>& insertions){
109     const vector<CigarOp>& cigar = bh.cigar();
110     unsigned int positionInGenome = bh.left();
111     unsigned int positionInRead = 0;
112    
113 gpertea 154 bool bSawFusion = false;
114 gpertea 29 for(size_t c = 0; c < cigar.size(); ++c){
115 gpertea 154 Insertion insertion;
116     switch(cigar[c].opcode){
117     case REF_SKIP:
118     positionInGenome += cigar[c].length;
119     break;
120     case rEF_SKIP:
121     positionInGenome -= cigar[c].length;
122     break;
123     case MATCH:
124     case mATCH:
125     if (cigar[c].opcode == MATCH)
126     positionInGenome += cigar[c].length;
127     else
128     positionInGenome -= cigar[c].length;
129     positionInRead += cigar[c].length;
130     break;
131     case DEL:
132     positionInGenome += cigar[c].length;
133     break;
134     case dEL:
135     positionInGenome -= cigar[c].length;
136     break;
137     case INS:
138     case iNS:
139     /*
140     * Note that the reported position in the genome from the SAM
141     * alignment is 1-based, since the insertion object is expecting
142     * a 0-based co-ordinate, we need to subtract 1
143     */
144     if (bSawFusion)
145     insertion.refid = bh.ref_id2();
146     else
147     insertion.refid = bh.ref_id();
148 gpertea 29
149 gpertea 154 if (cigar[c].opcode == INS)
150     insertion.left = positionInGenome - 1;
151     else
152     insertion.left = positionInGenome + 1;
153    
154     insertion.sequence = bh.seq().substr(positionInRead, cigar[c].length);
155    
156     insertions.push_back(insertion);
157     positionInRead += cigar[c].length;
158     break;
159     case FUSION_FF:
160     case FUSION_FR:
161     case FUSION_RF:
162     bSawFusion = true;
163     positionInGenome = cigar[c].length;
164     break;
165     default:
166     break;
167 gpertea 29 }
168     }
169     return;
170     }
171    
172 gpertea 154 void merge_with(InsertionSet& insertions, const InsertionSet& other)
173     {
174     for (InsertionSet::const_iterator insertion = other.begin(); insertion != other.end(); ++insertion)
175     {
176     InsertionSet::iterator itr = insertions.find(insertion->first);
177     if (itr != insertions.end())
178     {
179     itr->second += insertion->second;
180     }
181     else
182     {
183     insertions[insertion->first] = insertion->second;
184     }
185     }
186     }