ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/juncs_db.cpp
Revision: 29
Committed: Tue Aug 2 21:24:54 2011 UTC (9 years, 3 months ago) by gpertea
File size: 14704 byte(s)
Log Message:
adding tophat source work

Line User Rev File contents
1 gpertea 29 /*
2     * juncs_db.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 12/12/08.
6     * Copyright 2008 Cole Trapnell. All rights reserved.
7     *
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #endif
13    
14    
15     #include <cassert>
16     #include <cstdio>
17     #include <cstring>
18     #include <vector>
19     #include <string>
20     #include <map>
21     #include <algorithm>
22     #include <set>
23     #include <stdexcept>
24     #include <iostream>
25     #include <fstream>
26     #include <seqan/sequence.h>
27     #include <seqan/find.h>
28     #include <seqan/file.h>
29     #include <getopt.h>
30    
31     #include "common.h"
32     #include "bwt_map.h"
33     #include "tokenize.h"
34     #include "junctions.h"
35     #include "insertions.h"
36     #include "deletions.h"
37    
38     using namespace std;
39     using namespace seqan;
40     using std::set;
41    
42    
43     // Length of the outer dimension of a single insert from the paired-end library
44    
45     static int read_length = -1;
46     void print_usage()
47     {
48     fprintf(stderr, "Usage: juncs_db <min_anchor> <read_length> <splice_coords1,...,splice_coordsN> <insertion_coords1,...,insertion_coordsN> <deletion_coords1,...,deletion_coordsN> <ref.fa>\n");
49     }
50    
51     typedef vector<string> Mapped;
52    
53     bool splice_junc_lt(const pair<size_t, size_t>& lhs,
54     const pair<size_t, size_t>& rhs)
55     {
56     if (lhs.first < rhs.first)
57     return true;
58     else
59     return lhs.second < rhs.second;
60     }
61    
62     /**
63     * Given an insertion set, this code will print FASTA entries
64     * for the surrounding sequence. The names of the FASTA entries
65     * contain information about the exact location and nature of the
66     * insertion. The entry is generally of the form
67     * <contig>|<left end of fasta sequence>|<position of insertion>-<sequence of insertion>|<right end of fasta sequence>|ins|<[fwd|rev]>
68     */
69     template<typename TStr>
70     void print_insertion(const Insertion& insertion,
71     int read_len,
72     TStr& ref_str,
73     const string& ref_name,
74     ostream& splice_db)
75     {
76     int half_splice_len = read_len - min_anchor_len;
77    
78     size_t left_start, right_start;
79     size_t left_end, right_end;
80     if (insertion.left >= 0 && insertion.left <= length(ref_str))
81     {
82     left_start = (int)insertion.left - half_splice_len + 1 >= 0 ? (int)insertion.left - half_splice_len + 1 : 0;
83     left_end = left_start + half_splice_len;
84    
85     right_start = (int)left_end;
86     right_end = right_start + half_splice_len < length(ref_str) ? right_start + half_splice_len : length(ref_str);
87    
88     Infix<RefSequenceTable::Sequence>::Type left_splice = infix(ref_str, left_start, left_end);
89     Infix<RefSequenceTable::Sequence>::Type right_splice = infix(ref_str, right_start, right_end);
90    
91     splice_db << ">" << ref_name << "|" << left_start << "|" << insertion.left << "-" << insertion.sequence
92     << "|" << right_end << "|ins|" << ("fwd") << endl;
93    
94     splice_db << left_splice << insertion.sequence << right_splice << endl;
95     }
96     }
97    
98    
99     template<typename TStr>
100     void print_splice(const Junction& junction,
101     int read_len,
102     const string& tag,
103     TStr& ref_str,
104     const string& ref_name,
105     ostream& splice_db)
106     {
107     // daehwan - this is tentative, let's think about this more :)
108     // int half_splice_len = read_len - min_anchor_len;
109     int half_splice_len = read_len;
110    
111     size_t left_start, right_start;
112     size_t left_end, right_end;
113    
114     if (junction.left >= 0 && junction.left <= length(ref_str) &&
115     junction.right >= 0 && junction.right <= length(ref_str))
116     {
117     left_start = (int)junction.left - half_splice_len + 1 >= 0 ? (int)junction.left - half_splice_len + 1 : 0;
118     left_end = left_start + half_splice_len;
119    
120     right_start = junction.right;
121     right_end = right_start + half_splice_len < length(ref_str) ? right_start + half_splice_len : length(ref_str) - right_start;
122    
123     Infix<RefSequenceTable::Sequence>::Type left_splice = infix(ref_str,
124     left_start,
125     left_end);
126     Infix<RefSequenceTable::Sequence>::Type right_splice = infix(ref_str,
127     right_start,
128     right_end);
129    
130     splice_db << ">" << ref_name << "|" << left_start << "|" << junction.left <<
131     "-" << junction.right << "|" << right_end << "|" << tag << endl;
132    
133     splice_db << left_splice << right_splice << endl;
134     }
135     }
136    
137    
138     /**
139     * Parse an int out of optarg and enforce that it be at least 'lower';
140     * if it is less than 'lower', than output the given error message and
141     * exit with an error and a usage message.
142     */
143     static int parse_oInt(int lower, char* arg, const char *errmsg) {
144     long l;
145     char *endPtr= NULL;
146     l = strtol(arg, &endPtr, 10);
147     if (endPtr != NULL) {
148     if (l < lower) {
149     cerr << errmsg << endl;
150     print_usage();
151     exit(1);
152     }
153     return (int32_t)l;
154     }
155     cerr << errmsg << endl;
156     print_usage();
157     exit(1);
158     return -1;
159     }
160     //
161     //int parse_options(int argc, char** argv)
162     //{
163     // int option_index = 0;
164     // int next_option;
165     // do {
166     // next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
167     // switch (next_option) {
168     // case 'v':
169     // verbose = true;
170     // break;
171     // case -1: /* Done with options. */
172     // break;
173     // default:
174     // print_usage();
175     // return 1;
176     // }
177     // } while(next_option != -1);
178     //
179     // return 0;
180     //}
181    
182     void driver(const vector<FILE*>& splice_coords_files,
183     const vector<FILE*>& insertion_coords_files,
184     const vector<FILE*>& deletion_coords_files,
185     ifstream& ref_stream)
186     {
187     char splice_buf[2048];
188     RefSequenceTable rt(true);
189     JunctionSet junctions;
190     for (size_t i = 0; i < splice_coords_files.size(); ++i)
191     {
192     FILE* splice_coords = splice_coords_files[i];
193     if (!splice_coords)
194     continue;
195     while (fgets(splice_buf, 2048, splice_coords))
196     {
197     char* nl = strrchr(splice_buf, '\n');
198     char* buf = splice_buf;
199     if (nl) *nl = 0;
200    
201     /**
202     Fields are:
203     1) reference name
204     2) left coord of splice (last char of the left exon)
205     3) right coord of splice (first char of the right exon)
206     */
207    
208     char* ref_name = get_token((char**)&buf, "\t");
209     char* scan_left_coord = get_token((char**)&buf, "\t");
210     char* scan_right_coord = get_token((char**)&buf, "\t");
211     char* orientation = get_token((char**)&buf, "\t");
212    
213     if (!scan_left_coord || !scan_right_coord || !orientation)
214     {
215     fprintf(stderr,"Error: malformed splice coordinate record\n");
216     exit(1);
217     }
218     uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
219     uint32_t left_coord = atoi(scan_left_coord);
220     uint32_t right_coord = atoi(scan_right_coord);
221     bool antisense = *orientation == '-';
222     junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord, right_coord, antisense), JunctionStats()));
223     }
224     }
225    
226    
227     /*
228     * Read in the deletion coordinates
229     * and store in a set
230     */
231     std::set<Deletion> deletions;
232     for(size_t i=0; i < deletion_coords_files.size(); ++i){
233     FILE* deletion_coords = deletion_coords_files[i];
234     if(!deletion_coords){
235     continue;
236     }
237     while (fgets(splice_buf, 2048, deletion_coords))
238     {
239     char* nl = strrchr(splice_buf, '\n');
240     char* buf = splice_buf;
241     if (nl) *nl = 0;
242    
243     /**
244     Fields are:
245     1) reference name
246     2) left coord of splice (last char of the left exon)
247     3) right coord of splice (first char of the right exon)
248     */
249    
250     char* ref_name = get_token((char**)&buf, "\t");
251     char* scan_left_coord = get_token((char**)&buf, "\t");
252     char* scan_right_coord = get_token((char**)&buf, "\t");
253    
254     if (!scan_left_coord || !scan_right_coord)
255     {
256     fprintf(stderr,"Error: malformed deletion coordinate record\n");
257     exit(1);
258     }
259    
260     /*
261     * Note that when reading in a deletion, the left co-ord is the position of the
262     * first deleted based. Since we are co-opting the junction data structure, need
263     * to fix up this location
264     */
265     uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
266     uint32_t left_coord = atoi(scan_left_coord);
267     uint32_t right_coord = atoi(scan_right_coord);
268     deletions.insert(Deletion(ref_id, left_coord - 1, right_coord, false));
269     }
270     }
271    
272     /*
273     * Read in the insertion coordinates
274     * and store in a set
275     */
276     std::set<Insertion> insertions;
277     for(size_t i=0; i < insertion_coords_files.size(); ++i){
278     FILE* insertion_coords = insertion_coords_files[i];
279     if(!insertion_coords){
280     continue;
281     }
282     while(fgets(splice_buf, 2048, insertion_coords)){
283     char* nl = strrchr(splice_buf, '\n');
284     char* buf = splice_buf;
285     if (nl) *nl = 0;
286    
287     char* ref_name = get_token((char**)&buf, "\t");
288     char* scan_left_coord = get_token((char**)&buf, "\t");
289     char* scan_right_coord = get_token((char**)&buf, "\t");
290     char* scan_sequence = get_token((char**)&buf, "\t");
291    
292     if (!scan_left_coord || !scan_sequence || !scan_right_coord)
293     {
294     fprintf(stderr,"Error: malformed insertion coordinate record\n");
295     exit(1);
296     }
297    
298     seqan::Dna5String sequence = seqan::Dna5String(scan_sequence);
299     bool containsN = false;
300     for(size_t index = 0; index < seqan::length(sequence); index += 1){
301     /*
302     * Don't allow any ambiguities in the insertion
303     */
304     if(sequence[index] == 'N'){
305     containsN = true;
306     break;
307     }
308     }
309     if(containsN){
310     continue;
311     }
312     seqan::CharString charSequence = sequence;
313     uint32_t ref_id = rt.get_id(ref_name,NULL,0);
314     uint32_t left_coord = atoi(scan_left_coord);
315     insertions.insert(Insertion(ref_id, left_coord, seqan::toCString(charSequence)));
316     }
317     }
318    
319    
320     typedef RefSequenceTable::Sequence Reference;
321    
322     while(ref_stream.good() &&
323     !ref_stream.eof())
324     {
325     Reference ref_str;
326     string name;
327    
328     readMeta(ref_stream, name, Fasta());
329     string::size_type space_pos = name.find_first_of(" \t\r");
330     if (space_pos != string::npos)
331     {
332     name.resize(space_pos);
333     }
334    
335     read(ref_stream, ref_str, Fasta());
336    
337     uint32_t refid = rt.get_id(name, NULL, 0);
338     Junction dummy_left(refid, 0, 0, true);
339     Junction dummy_right(refid, 0xFFFFFFFF, 0xFFFFFFFF, true);
340    
341     pair<JunctionSet::iterator, JunctionSet::iterator> r;
342     r.first = junctions.lower_bound(dummy_left);
343     r.second = junctions.upper_bound(dummy_right);
344    
345     JunctionSet::iterator itr = r.first;
346    
347     while(itr != r.second && itr != junctions.end())
348     {
349     print_splice(itr->first, read_length, itr->first.antisense ? "GTAG|rev" : "GTAG|fwd", ref_str, name, cout);
350     ++itr;
351     }
352     }
353    
354    
355     ref_stream.clear();
356     ref_stream.seekg(0, ios::beg);
357    
358    
359     while(ref_stream.good() &&
360     !ref_stream.eof())
361     {
362     Reference ref_str;
363     string name;
364    
365     readMeta(ref_stream, name, Fasta());
366     string::size_type space_pos = name.find_first_of(" \t\r");
367     if (space_pos != string::npos)
368     {
369     name.resize(space_pos);
370     }
371    
372     read(ref_stream, ref_str, Fasta());
373    
374     uint32_t refid = rt.get_id(name, NULL,0);
375     Deletion dummy_left(refid, 0, 0, true);
376     Deletion dummy_right(refid, 0xFFFFFFFF, 0xFFFFFFFF, true);
377    
378     pair<std::set<Deletion>::iterator, std::set<Deletion>::iterator> r;
379     r.first = deletions.lower_bound(dummy_left);
380     r.second = deletions.upper_bound(dummy_right);
381    
382     std::set<Deletion>::iterator itr = r.first;
383    
384     while(itr != r.second && itr != deletions.end())
385     {
386     print_splice((Junction)*itr, read_length, itr->antisense ? "del|rev" : "del|fwd", ref_str, name, cout);
387     ++itr;
388     }
389     }
390    
391     ref_stream.clear();
392     ref_stream.seekg(0, ios::beg);
393    
394    
395    
396     while(ref_stream.good() &&
397     !ref_stream.eof())
398     {
399     Reference ref_str;
400     string name;
401    
402     readMeta(ref_stream, name, Fasta());
403     string::size_type space_pos = name.find_first_of(" \t\r");
404     if (space_pos != string::npos)
405     {
406     name.resize(space_pos);
407     }
408    
409     read(ref_stream, ref_str, Fasta());
410    
411     uint32_t refid = rt.get_id(name, NULL,0);
412     Insertion dummy_left(refid, 0, "");
413     Insertion dummy_right(refid, 0xFFFFFFFF, "");
414    
415     std::set<Insertion>::iterator itr = insertions.lower_bound(dummy_left);
416     std::set<Insertion>::iterator upper = insertions.upper_bound(dummy_right);
417    
418     while(itr != upper && itr != insertions.end()){
419     print_insertion(*itr, read_length, ref_str, name, cout);
420     ++itr;
421     }
422     }
423    
424     }
425    
426     int main(int argc, char** argv)
427     {
428     fprintf(stderr, "juncs_db v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
429     fprintf(stderr, "---------------------------\n");
430    
431     int parse_ret = parse_options(argc, argv, print_usage);
432     if (parse_ret)
433     return parse_ret;
434    
435     if(optind >= argc)
436     {
437     print_usage();
438     return 1;
439     }
440    
441     min_anchor_len = parse_oInt(3, argv[optind++], "anchor length must be at least 3");
442    
443     if(optind >= argc)
444     {
445     print_usage();
446     return 1;
447     }
448    
449     read_length = parse_oInt(4, argv[optind++], "read length must be at least 4");
450    
451     if(optind >= argc)
452     {
453     print_usage();
454     return 1;
455     }
456    
457     string splice_coords_file_list = argv[optind++];
458     vector<string> splice_coords_file_names;
459     vector<FILE*> coords_files;
460     tokenize(splice_coords_file_list, ",", splice_coords_file_names);
461     for (size_t s = 0; s < splice_coords_file_names.size(); ++s)
462     {
463    
464     FILE* coords_file = fopen(splice_coords_file_names[s].c_str(), "r");
465    
466     if (!coords_file)
467     {
468     fprintf(stderr, "Warning: cannot open %s for reading\n",
469     splice_coords_file_names[s].c_str());
470     continue;
471     }
472     coords_files.push_back(coords_file);
473     }
474     if(optind >= argc)
475     {
476     print_usage();
477     return 1;
478     }
479    
480     /*
481     * Read in the insertion co-ordinates
482     */
483     string insertion_coords_file_list = argv[optind++];
484     vector<string> insertion_coords_file_names;
485     vector<FILE*> insertion_coords_files;
486     tokenize(insertion_coords_file_list, ",", insertion_coords_file_names);
487     for(size_t s = 0; s < insertion_coords_file_names.size(); ++s)
488     {
489     FILE* insertion_coords_file = fopen(insertion_coords_file_names[s].c_str(),"r");
490     if(!insertion_coords_file)
491     {
492     fprintf(stderr, "Warning: cannot open %s for reading\n",
493     insertion_coords_file_names[s].c_str());
494     continue;
495     }
496     insertion_coords_files.push_back(insertion_coords_file);
497     }
498     if(optind >= argc)
499     {
500     print_usage();
501     return 1;
502     }
503    
504    
505     /*
506     * Read in the deletion co-ordinates
507     */
508     string deletion_coords_file_list = argv[optind++];
509     vector<string> deletion_coords_file_names;
510     vector<FILE*> deletion_coords_files;
511     tokenize(deletion_coords_file_list, ",", deletion_coords_file_names);
512     for(size_t s = 0; s < deletion_coords_file_names.size(); ++s)
513     {
514     FILE* deletion_coords_file = fopen(deletion_coords_file_names[s].c_str(),"r");
515     if(!deletion_coords_file)
516     {
517     fprintf(stderr, "Warning: cannot open %s for reading\n",
518     deletion_coords_file_names[s].c_str());
519     continue;
520     }
521     deletion_coords_files.push_back(deletion_coords_file);
522     }
523     if(optind >= argc)
524     {
525     print_usage();
526     return 1;
527     }
528    
529    
530    
531     string ref_file_name = argv[optind++];
532     ifstream ref_stream(ref_file_name.c_str());
533    
534     if (!ref_stream.good())
535     {
536     fprintf(stderr, "Error: cannot open %s for reading\n",
537     ref_file_name.c_str());
538     exit(1);
539     }
540    
541     driver(coords_files, insertion_coords_files, deletion_coords_files, ref_stream);
542     return 0;
543     }