ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/juncs_db.cpp
Revision: 154
Committed: Tue Jan 24 02:29:21 2012 UTC (8 years, 9 months ago) by gpertea
File size: 18572 byte(s)
Log Message:
massive update with Daehwan's work

Line User Rev File contents
1 gpertea 29 /*
2     * juncs_db.cpp
3     * TopHat
4     *
5     * Created by Cole Trapnell on 12/12/08.
6     * Copyright 2008 Cole Trapnell. All rights reserved.
7     *
8     */
9    
10     #ifdef HAVE_CONFIG_H
11     #include <config.h>
12     #endif
13    
14    
15     #include <cassert>
16     #include <cstdio>
17     #include <cstring>
18     #include <vector>
19     #include <string>
20     #include <map>
21     #include <algorithm>
22     #include <set>
23     #include <stdexcept>
24     #include <iostream>
25     #include <fstream>
26     #include <seqan/sequence.h>
27     #include <seqan/find.h>
28     #include <seqan/file.h>
29 gpertea 154 #include <seqan/modifier.h>
30 gpertea 29 #include <getopt.h>
31    
32     #include "common.h"
33     #include "bwt_map.h"
34     #include "tokenize.h"
35     #include "junctions.h"
36     #include "insertions.h"
37     #include "deletions.h"
38 gpertea 154 #include "fusions.h"
39 gpertea 29
40 gpertea 154
41 gpertea 29 using namespace std;
42     using namespace seqan;
43     using std::set;
44    
45    
46     // Length of the outer dimension of a single insert from the paired-end library
47    
48     static int read_length = -1;
49     void print_usage()
50     {
51     fprintf(stderr, "Usage: juncs_db <min_anchor> <read_length> <splice_coords1,...,splice_coordsN> <insertion_coords1,...,insertion_coordsN> <deletion_coords1,...,deletion_coordsN> <ref.fa>\n");
52     }
53    
54     typedef vector<string> Mapped;
55    
56     bool splice_junc_lt(const pair<size_t, size_t>& lhs,
57     const pair<size_t, size_t>& rhs)
58     {
59     if (lhs.first < rhs.first)
60     return true;
61     else
62     return lhs.second < rhs.second;
63     }
64    
65     /**
66     * Given an insertion set, this code will print FASTA entries
67     * for the surrounding sequence. The names of the FASTA entries
68     * contain information about the exact location and nature of the
69     * insertion. The entry is generally of the form
70     * <contig>|<left end of fasta sequence>|<position of insertion>-<sequence of insertion>|<right end of fasta sequence>|ins|<[fwd|rev]>
71     */
72     template<typename TStr>
73     void print_insertion(const Insertion& insertion,
74     int read_len,
75     TStr& ref_str,
76     const string& ref_name,
77     ostream& splice_db)
78     {
79     int half_splice_len = read_len - min_anchor_len;
80    
81     size_t left_start, right_start;
82     size_t left_end, right_end;
83     if (insertion.left >= 0 && insertion.left <= length(ref_str))
84     {
85     left_start = (int)insertion.left - half_splice_len + 1 >= 0 ? (int)insertion.left - half_splice_len + 1 : 0;
86     left_end = left_start + half_splice_len;
87    
88     right_start = (int)left_end;
89     right_end = right_start + half_splice_len < length(ref_str) ? right_start + half_splice_len : length(ref_str);
90    
91     Infix<RefSequenceTable::Sequence>::Type left_splice = infix(ref_str, left_start, left_end);
92     Infix<RefSequenceTable::Sequence>::Type right_splice = infix(ref_str, right_start, right_end);
93    
94     splice_db << ">" << ref_name << "|" << left_start << "|" << insertion.left << "-" << insertion.sequence
95     << "|" << right_end << "|ins|" << ("fwd") << endl;
96    
97     splice_db << left_splice << insertion.sequence << right_splice << endl;
98     }
99     }
100    
101    
102     template<typename TStr>
103     void print_splice(const Junction& junction,
104 gpertea 154 int read_len,
105     const string& tag,
106     TStr& ref_str,
107     const string& ref_name,
108     ostream& splice_db)
109 gpertea 29 {
110     // daehwan - this is tentative, let's think about this more :)
111     // int half_splice_len = read_len - min_anchor_len;
112     int half_splice_len = read_len;
113     size_t left_start, right_start;
114     size_t left_end, right_end;
115    
116     if (junction.left >= 0 && junction.left <= length(ref_str) &&
117     junction.right >= 0 && junction.right <= length(ref_str))
118     {
119     left_start = (int)junction.left - half_splice_len + 1 >= 0 ? (int)junction.left - half_splice_len + 1 : 0;
120     left_end = left_start + half_splice_len;
121    
122     right_start = junction.right;
123     right_end = right_start + half_splice_len < length(ref_str) ? right_start + half_splice_len : length(ref_str) - right_start;
124    
125     Infix<RefSequenceTable::Sequence>::Type left_splice = infix(ref_str,
126     left_start,
127     left_end);
128     Infix<RefSequenceTable::Sequence>::Type right_splice = infix(ref_str,
129     right_start,
130     right_end);
131    
132     splice_db << ">" << ref_name << "|" << left_start << "|" << junction.left <<
133     "-" << junction.right << "|" << right_end << "|" << tag << endl;
134    
135     splice_db << left_splice << right_splice << endl;
136     }
137     }
138    
139 gpertea 154 template<typename TStr>
140     void print_fusion(const Fusion& fusion,
141     int read_len,
142     TStr& left_ref_str,
143     TStr& right_ref_str,
144     const char* left_ref_name,
145     const char* right_ref_name,
146     ostream& fusion_db)
147     {
148     int half_splice_len = read_len - min_anchor_len;
149    
150     size_t left_start, right_start;
151     size_t left_end, right_end;
152    
153     if (fusion.left >= 0 && fusion.left < length(left_ref_str) &&
154     fusion.right >= 0 && fusion.right < length(right_ref_str))
155     {
156     if (fusion.dir == FUSION_FF || fusion.dir == FUSION_FR)
157     {
158     left_start = fusion.left + 1 >= half_splice_len ? fusion.left - half_splice_len + 1 : 0;
159     left_end = left_start + half_splice_len;
160     }
161     else
162     {
163     left_start = fusion.left;
164     left_end = left_start + half_splice_len < length(left_ref_str) ? left_start + half_splice_len : length(left_ref_str);
165     }
166 gpertea 29
167 gpertea 154 if (fusion.dir == FUSION_FF || fusion.dir == FUSION_RF)
168     {
169     right_start = fusion.right;
170     right_end = right_start + half_splice_len < length(right_ref_str) ? right_start + half_splice_len : length(right_ref_str);
171     }
172     else
173     {
174     right_end = fusion.right + 1;
175     right_start = right_end >= half_splice_len ? right_end - half_splice_len : 0;
176     }
177    
178     seqan::Dna5String left_splice = infix(left_ref_str, left_start, left_end);
179     seqan::Dna5String right_splice = infix(right_ref_str, right_start, right_end);
180    
181     if (fusion.dir == FUSION_RF || fusion.dir == FUSION_RR)
182     {
183     seqan::convertInPlace(left_splice, seqan::FunctorComplement<Dna>());
184     seqan::reverseInPlace(left_splice);
185    
186     left_start = left_end - 1;
187     }
188    
189     if (fusion.dir == FUSION_FR || fusion.dir == FUSION_RR)
190     {
191     seqan::convertInPlace(right_splice, seqan::FunctorComplement<Dna>());
192     seqan::reverseInPlace(right_splice);
193    
194     right_end = right_start - 1;
195     }
196    
197     const char* dir = "ff";
198     if (fusion.dir == FUSION_FR)
199     dir = "fr";
200     else if (fusion.dir == FUSION_RF)
201     dir = "rf";
202     else if (fusion.dir == FUSION_RR)
203     dir = "rr";
204    
205     fusion_db << ">" << left_ref_name << "-" << right_ref_name << "|"
206     << left_start << "|"
207     << fusion.left << "-" << fusion.right << "|"
208     << right_end << "|fus|" << dir << endl;
209    
210     fusion_db << left_splice << right_splice << endl;
211     }
212     }
213    
214    
215 gpertea 29 /**
216     * Parse an int out of optarg and enforce that it be at least 'lower';
217     * if it is less than 'lower', than output the given error message and
218     * exit with an error and a usage message.
219     */
220     static int parse_oInt(int lower, char* arg, const char *errmsg) {
221     long l;
222     char *endPtr= NULL;
223     l = strtol(arg, &endPtr, 10);
224     if (endPtr != NULL) {
225     if (l < lower) {
226     cerr << errmsg << endl;
227     print_usage();
228     exit(1);
229     }
230     return (int32_t)l;
231     }
232     cerr << errmsg << endl;
233     print_usage();
234     exit(1);
235     return -1;
236     }
237     //
238     //int parse_options(int argc, char** argv)
239     //{
240     // int option_index = 0;
241     // int next_option;
242     // do {
243     // next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
244     // switch (next_option) {
245     // case 'v':
246     // verbose = true;
247     // break;
248     // case -1: /* Done with options. */
249     // break;
250     // default:
251     // print_usage();
252     // return 1;
253     // }
254     // } while(next_option != -1);
255     //
256     // return 0;
257     //}
258    
259 gpertea 154 void get_seqs(istream& ref_stream,
260     RefSequenceTable& rt,
261     bool keep_seqs = true)
262     {
263     while(ref_stream.good() &&
264     !ref_stream.eof())
265     {
266     RefSequenceTable::Sequence* ref_str = new RefSequenceTable::Sequence();
267     string name;
268     readMeta(ref_stream, name, Fasta());
269     string::size_type space_pos = name.find_first_of(" \t\r");
270     if (space_pos != string::npos)
271     {
272     name.resize(space_pos);
273     }
274     fprintf(stderr, "\tLoading %s...", name.c_str());
275     seqan::read(ref_stream, *ref_str, Fasta());
276     fprintf(stderr, "done\n");
277     rt.get_id(name, keep_seqs ? ref_str : NULL, 0);
278     if (!keep_seqs)
279     delete ref_str;
280     }
281     }
282    
283 gpertea 29 void driver(const vector<FILE*>& splice_coords_files,
284 gpertea 154 const vector<FILE*>& insertion_coords_files,
285     const vector<FILE*>& deletion_coords_files,
286     const vector<FILE*>& fusion_coords_files,
287     ifstream& ref_stream)
288 gpertea 29 {
289     char splice_buf[2048];
290 gpertea 154 RefSequenceTable rt(sam_header, true);
291     get_seqs(ref_stream, rt, true);
292    
293 gpertea 29 JunctionSet junctions;
294     for (size_t i = 0; i < splice_coords_files.size(); ++i)
295     {
296     FILE* splice_coords = splice_coords_files[i];
297     if (!splice_coords)
298     continue;
299     while (fgets(splice_buf, 2048, splice_coords))
300     {
301     char* nl = strrchr(splice_buf, '\n');
302     char* buf = splice_buf;
303     if (nl) *nl = 0;
304    
305     /**
306     Fields are:
307     1) reference name
308     2) left coord of splice (last char of the left exon)
309     3) right coord of splice (first char of the right exon)
310     */
311    
312     char* ref_name = get_token((char**)&buf, "\t");
313     char* scan_left_coord = get_token((char**)&buf, "\t");
314     char* scan_right_coord = get_token((char**)&buf, "\t");
315     char* orientation = get_token((char**)&buf, "\t");
316    
317     if (!scan_left_coord || !scan_right_coord || !orientation)
318     {
319     fprintf(stderr,"Error: malformed splice coordinate record\n");
320     exit(1);
321     }
322     uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
323     uint32_t left_coord = atoi(scan_left_coord);
324     uint32_t right_coord = atoi(scan_right_coord);
325     bool antisense = *orientation == '-';
326     junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord, right_coord, antisense), JunctionStats()));
327     }
328     }
329    
330    
331     /*
332     * Read in the deletion coordinates
333     * and store in a set
334     */
335     std::set<Deletion> deletions;
336     for(size_t i=0; i < deletion_coords_files.size(); ++i){
337     FILE* deletion_coords = deletion_coords_files[i];
338     if(!deletion_coords){
339     continue;
340     }
341     while (fgets(splice_buf, 2048, deletion_coords))
342     {
343     char* nl = strrchr(splice_buf, '\n');
344     char* buf = splice_buf;
345     if (nl) *nl = 0;
346    
347     /**
348     Fields are:
349     1) reference name
350     2) left coord of splice (last char of the left exon)
351     3) right coord of splice (first char of the right exon)
352     */
353    
354     char* ref_name = get_token((char**)&buf, "\t");
355     char* scan_left_coord = get_token((char**)&buf, "\t");
356     char* scan_right_coord = get_token((char**)&buf, "\t");
357    
358     if (!scan_left_coord || !scan_right_coord)
359     {
360     fprintf(stderr,"Error: malformed deletion coordinate record\n");
361     exit(1);
362     }
363    
364     /*
365     * Note that when reading in a deletion, the left co-ord is the position of the
366     * first deleted based. Since we are co-opting the junction data structure, need
367     * to fix up this location
368     */
369     uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
370     uint32_t left_coord = atoi(scan_left_coord);
371     uint32_t right_coord = atoi(scan_right_coord);
372     deletions.insert(Deletion(ref_id, left_coord - 1, right_coord, false));
373     }
374     }
375    
376     /*
377     * Read in the insertion coordinates
378     * and store in a set
379     */
380     std::set<Insertion> insertions;
381     for(size_t i=0; i < insertion_coords_files.size(); ++i){
382     FILE* insertion_coords = insertion_coords_files[i];
383     if(!insertion_coords){
384     continue;
385     }
386     while(fgets(splice_buf, 2048, insertion_coords)){
387     char* nl = strrchr(splice_buf, '\n');
388     char* buf = splice_buf;
389     if (nl) *nl = 0;
390    
391     char* ref_name = get_token((char**)&buf, "\t");
392     char* scan_left_coord = get_token((char**)&buf, "\t");
393     char* scan_right_coord = get_token((char**)&buf, "\t");
394     char* scan_sequence = get_token((char**)&buf, "\t");
395    
396     if (!scan_left_coord || !scan_sequence || !scan_right_coord)
397     {
398     fprintf(stderr,"Error: malformed insertion coordinate record\n");
399     exit(1);
400     }
401    
402     seqan::Dna5String sequence = seqan::Dna5String(scan_sequence);
403     bool containsN = false;
404     for(size_t index = 0; index < seqan::length(sequence); index += 1){
405     /*
406     * Don't allow any ambiguities in the insertion
407     */
408     if(sequence[index] == 'N'){
409     containsN = true;
410     break;
411     }
412     }
413     if(containsN){
414     continue;
415     }
416     seqan::CharString charSequence = sequence;
417     uint32_t ref_id = rt.get_id(ref_name,NULL,0);
418     uint32_t left_coord = atoi(scan_left_coord);
419     insertions.insert(Insertion(ref_id, left_coord, seqan::toCString(charSequence)));
420     }
421     }
422    
423 gpertea 154 std::set<Fusion> fusions;
424     for(size_t i=0; i < fusion_coords_files.size(); ++i){
425     FILE* fusion_coords = fusion_coords_files[i];
426     if(!fusion_coords){
427     continue;
428     }
429     while(fgets(splice_buf, 2048, fusion_coords)){
430     char* nl = strrchr(splice_buf, '\n');
431     char* buf = splice_buf;
432     if (nl) *nl = 0;
433    
434     char* ref_name1 = strsep((char**)&buf, "\t");
435     char* scan_left_coord = strsep((char**)&buf, "\t");
436     char* ref_name2 = strsep((char**)&buf, "\t");
437     char* scan_right_coord = strsep((char**)&buf, "\t");
438     char* scan_dir = strsep((char**)&buf, "\t");
439 gpertea 29
440 gpertea 154 if (!ref_name1 || !scan_left_coord || !ref_name2 || !scan_right_coord || !scan_dir)
441     {
442     fprintf(stderr,"Error: malformed insertion coordinate record\n");
443     exit(1);
444     }
445    
446     uint32_t ref_id1 = rt.get_id(ref_name1, NULL, 0);
447     uint32_t ref_id2 = rt.get_id(ref_name2, NULL, 0);
448     uint32_t left_coord = atoi(scan_left_coord);
449     uint32_t right_coord = atoi(scan_right_coord);
450     uint32_t dir = FUSION_FF;
451     if (strcmp(scan_dir, "fr") == 0)
452     dir = FUSION_FR;
453     else if(strcmp(scan_dir, "rf") == 0)
454     dir = FUSION_RF;
455     else if(strcmp(scan_dir, "rr") == 0)
456     dir = FUSION_RR;
457    
458     fusions.insert(Fusion(ref_id1, ref_id2, left_coord, right_coord, dir));
459 gpertea 29 }
460     }
461    
462 gpertea 154 {
463     JunctionSet::iterator itr = junctions.begin();
464     while(itr != junctions.end())
465     {
466     RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->first.refid);
467     const char* name = rt.get_name(itr->first.refid);
468     print_splice(itr->first, read_length, itr->first.antisense ? "GTAG|rev" : "GTAG|fwd", *ref_str, name, cout);
469     ++itr;
470     }
471     }
472 gpertea 29
473 gpertea 154 {
474     std::set<Deletion>::iterator itr = deletions.begin();
475     while(itr != deletions.end())
476     {
477     RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->refid);
478     const char* name = rt.get_name(itr->refid);
479     print_splice((Junction)*itr, read_length, itr->antisense ? "del|rev" : "del|fwd", *ref_str, name, cout);
480     ++itr;
481     }
482     }
483 gpertea 29
484     {
485 gpertea 154 std::set<Insertion>::iterator itr = insertions.begin();
486     while(itr != insertions.end()){
487     RefSequenceTable::Sequence* ref_str = rt.get_seq(itr->refid);
488     const char* name = rt.get_name(itr->refid);
489     print_insertion(*itr, read_length, *ref_str, name, cout);
490     ++itr;
491     }
492 gpertea 29 }
493    
494     {
495 gpertea 154 std::set<Fusion>::iterator itr = fusions.begin();
496     while(itr != fusions.end()){
497     RefSequenceTable::Sequence* left_ref_str = rt.get_seq(itr->refid1);
498     RefSequenceTable::Sequence* right_ref_str = rt.get_seq(itr->refid2);
499     const char* left_ref_name = rt.get_name(itr->refid1);
500     const char* right_ref_name = rt.get_name(itr->refid2);
501     print_fusion(*itr, read_length, *left_ref_str, *right_ref_str, left_ref_name, right_ref_name, cout);
502     ++itr;
503     }
504 gpertea 29 }
505     }
506    
507     int main(int argc, char** argv)
508     {
509     fprintf(stderr, "juncs_db v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
510     fprintf(stderr, "---------------------------\n");
511    
512     int parse_ret = parse_options(argc, argv, print_usage);
513     if (parse_ret)
514     return parse_ret;
515    
516     if(optind >= argc)
517     {
518     print_usage();
519     return 1;
520     }
521    
522     min_anchor_len = parse_oInt(3, argv[optind++], "anchor length must be at least 3");
523    
524     if(optind >= argc)
525     {
526     print_usage();
527     return 1;
528     }
529    
530     read_length = parse_oInt(4, argv[optind++], "read length must be at least 4");
531    
532     if(optind >= argc)
533     {
534     print_usage();
535     return 1;
536     }
537    
538     string splice_coords_file_list = argv[optind++];
539     vector<string> splice_coords_file_names;
540     vector<FILE*> coords_files;
541     tokenize(splice_coords_file_list, ",", splice_coords_file_names);
542     for (size_t s = 0; s < splice_coords_file_names.size(); ++s)
543     {
544    
545     FILE* coords_file = fopen(splice_coords_file_names[s].c_str(), "r");
546    
547     if (!coords_file)
548     {
549     fprintf(stderr, "Warning: cannot open %s for reading\n",
550     splice_coords_file_names[s].c_str());
551     continue;
552     }
553     coords_files.push_back(coords_file);
554     }
555     if(optind >= argc)
556     {
557     print_usage();
558     return 1;
559     }
560    
561     /*
562     * Read in the insertion co-ordinates
563     */
564     string insertion_coords_file_list = argv[optind++];
565     vector<string> insertion_coords_file_names;
566     vector<FILE*> insertion_coords_files;
567     tokenize(insertion_coords_file_list, ",", insertion_coords_file_names);
568     for(size_t s = 0; s < insertion_coords_file_names.size(); ++s)
569     {
570     FILE* insertion_coords_file = fopen(insertion_coords_file_names[s].c_str(),"r");
571     if(!insertion_coords_file)
572     {
573     fprintf(stderr, "Warning: cannot open %s for reading\n",
574     insertion_coords_file_names[s].c_str());
575     continue;
576     }
577     insertion_coords_files.push_back(insertion_coords_file);
578     }
579     if(optind >= argc)
580     {
581     print_usage();
582     return 1;
583     }
584    
585    
586     /*
587     * Read in the deletion co-ordinates
588     */
589     string deletion_coords_file_list = argv[optind++];
590     vector<string> deletion_coords_file_names;
591     vector<FILE*> deletion_coords_files;
592     tokenize(deletion_coords_file_list, ",", deletion_coords_file_names);
593     for(size_t s = 0; s < deletion_coords_file_names.size(); ++s)
594     {
595     FILE* deletion_coords_file = fopen(deletion_coords_file_names[s].c_str(),"r");
596     if(!deletion_coords_file)
597     {
598     fprintf(stderr, "Warning: cannot open %s for reading\n",
599     deletion_coords_file_names[s].c_str());
600     continue;
601     }
602     deletion_coords_files.push_back(deletion_coords_file);
603     }
604     if(optind >= argc)
605     {
606     print_usage();
607     return 1;
608     }
609    
610    
611 gpertea 154 /*
612     */
613     string fusion_coords_file_list = argv[optind++];
614     vector<string> fusion_coords_file_names;
615     vector<FILE*> fusion_coords_files;
616     tokenize(fusion_coords_file_list, ",", fusion_coords_file_names);
617     for(size_t s = 0; s < fusion_coords_file_names.size(); ++s)
618     {
619     FILE* fusion_coords_file = fopen(fusion_coords_file_names[s].c_str(),"r");
620     if(!fusion_coords_file)
621     {
622     fprintf(stderr, "Warning: cannot open %s for reading\n",
623     fusion_coords_file_names[s].c_str());
624     continue;
625     }
626     fusion_coords_files.push_back(fusion_coords_file);
627     }
628     if(optind >= argc)
629     {
630     print_usage();
631     return 1;
632     }
633    
634 gpertea 29
635     string ref_file_name = argv[optind++];
636     ifstream ref_stream(ref_file_name.c_str());
637    
638     if (!ref_stream.good())
639     {
640     fprintf(stderr, "Error: cannot open %s for reading\n",
641     ref_file_name.c_str());
642     exit(1);
643     }
644    
645 gpertea 154 driver(coords_files, insertion_coords_files, deletion_coords_files, fusion_coords_files, ref_stream);
646 gpertea 29 return 0;
647     }