ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/juncs_db.cpp
Revision: 29
Committed: Tue Aug 2 21:24:54 2011 UTC (9 years, 3 months ago) by gpertea
File size: 14704 byte(s)
Log Message:
adding tophat source work

Line File contents
1 /*
2 * juncs_db.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 12/12/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13
14
15 #include <cassert>
16 #include <cstdio>
17 #include <cstring>
18 #include <vector>
19 #include <string>
20 #include <map>
21 #include <algorithm>
22 #include <set>
23 #include <stdexcept>
24 #include <iostream>
25 #include <fstream>
26 #include <seqan/sequence.h>
27 #include <seqan/find.h>
28 #include <seqan/file.h>
29 #include <getopt.h>
30
31 #include "common.h"
32 #include "bwt_map.h"
33 #include "tokenize.h"
34 #include "junctions.h"
35 #include "insertions.h"
36 #include "deletions.h"
37
38 using namespace std;
39 using namespace seqan;
40 using std::set;
41
42
43 // Length of the outer dimension of a single insert from the paired-end library
44
45 static int read_length = -1;
46 void print_usage()
47 {
48 fprintf(stderr, "Usage: juncs_db <min_anchor> <read_length> <splice_coords1,...,splice_coordsN> <insertion_coords1,...,insertion_coordsN> <deletion_coords1,...,deletion_coordsN> <ref.fa>\n");
49 }
50
51 typedef vector<string> Mapped;
52
53 bool splice_junc_lt(const pair<size_t, size_t>& lhs,
54 const pair<size_t, size_t>& rhs)
55 {
56 if (lhs.first < rhs.first)
57 return true;
58 else
59 return lhs.second < rhs.second;
60 }
61
62 /**
63 * Given an insertion set, this code will print FASTA entries
64 * for the surrounding sequence. The names of the FASTA entries
65 * contain information about the exact location and nature of the
66 * insertion. The entry is generally of the form
67 * <contig>|<left end of fasta sequence>|<position of insertion>-<sequence of insertion>|<right end of fasta sequence>|ins|<[fwd|rev]>
68 */
69 template<typename TStr>
70 void print_insertion(const Insertion& insertion,
71 int read_len,
72 TStr& ref_str,
73 const string& ref_name,
74 ostream& splice_db)
75 {
76 int half_splice_len = read_len - min_anchor_len;
77
78 size_t left_start, right_start;
79 size_t left_end, right_end;
80 if (insertion.left >= 0 && insertion.left <= length(ref_str))
81 {
82 left_start = (int)insertion.left - half_splice_len + 1 >= 0 ? (int)insertion.left - half_splice_len + 1 : 0;
83 left_end = left_start + half_splice_len;
84
85 right_start = (int)left_end;
86 right_end = right_start + half_splice_len < length(ref_str) ? right_start + half_splice_len : length(ref_str);
87
88 Infix<RefSequenceTable::Sequence>::Type left_splice = infix(ref_str, left_start, left_end);
89 Infix<RefSequenceTable::Sequence>::Type right_splice = infix(ref_str, right_start, right_end);
90
91 splice_db << ">" << ref_name << "|" << left_start << "|" << insertion.left << "-" << insertion.sequence
92 << "|" << right_end << "|ins|" << ("fwd") << endl;
93
94 splice_db << left_splice << insertion.sequence << right_splice << endl;
95 }
96 }
97
98
99 template<typename TStr>
100 void print_splice(const Junction& junction,
101 int read_len,
102 const string& tag,
103 TStr& ref_str,
104 const string& ref_name,
105 ostream& splice_db)
106 {
107 // daehwan - this is tentative, let's think about this more :)
108 // int half_splice_len = read_len - min_anchor_len;
109 int half_splice_len = read_len;
110
111 size_t left_start, right_start;
112 size_t left_end, right_end;
113
114 if (junction.left >= 0 && junction.left <= length(ref_str) &&
115 junction.right >= 0 && junction.right <= length(ref_str))
116 {
117 left_start = (int)junction.left - half_splice_len + 1 >= 0 ? (int)junction.left - half_splice_len + 1 : 0;
118 left_end = left_start + half_splice_len;
119
120 right_start = junction.right;
121 right_end = right_start + half_splice_len < length(ref_str) ? right_start + half_splice_len : length(ref_str) - right_start;
122
123 Infix<RefSequenceTable::Sequence>::Type left_splice = infix(ref_str,
124 left_start,
125 left_end);
126 Infix<RefSequenceTable::Sequence>::Type right_splice = infix(ref_str,
127 right_start,
128 right_end);
129
130 splice_db << ">" << ref_name << "|" << left_start << "|" << junction.left <<
131 "-" << junction.right << "|" << right_end << "|" << tag << endl;
132
133 splice_db << left_splice << right_splice << endl;
134 }
135 }
136
137
138 /**
139 * Parse an int out of optarg and enforce that it be at least 'lower';
140 * if it is less than 'lower', than output the given error message and
141 * exit with an error and a usage message.
142 */
143 static int parse_oInt(int lower, char* arg, const char *errmsg) {
144 long l;
145 char *endPtr= NULL;
146 l = strtol(arg, &endPtr, 10);
147 if (endPtr != NULL) {
148 if (l < lower) {
149 cerr << errmsg << endl;
150 print_usage();
151 exit(1);
152 }
153 return (int32_t)l;
154 }
155 cerr << errmsg << endl;
156 print_usage();
157 exit(1);
158 return -1;
159 }
160 //
161 //int parse_options(int argc, char** argv)
162 //{
163 // int option_index = 0;
164 // int next_option;
165 // do {
166 // next_option = getopt_long(argc, argv, short_options, long_options, &option_index);
167 // switch (next_option) {
168 // case 'v':
169 // verbose = true;
170 // break;
171 // case -1: /* Done with options. */
172 // break;
173 // default:
174 // print_usage();
175 // return 1;
176 // }
177 // } while(next_option != -1);
178 //
179 // return 0;
180 //}
181
182 void driver(const vector<FILE*>& splice_coords_files,
183 const vector<FILE*>& insertion_coords_files,
184 const vector<FILE*>& deletion_coords_files,
185 ifstream& ref_stream)
186 {
187 char splice_buf[2048];
188 RefSequenceTable rt(true);
189 JunctionSet junctions;
190 for (size_t i = 0; i < splice_coords_files.size(); ++i)
191 {
192 FILE* splice_coords = splice_coords_files[i];
193 if (!splice_coords)
194 continue;
195 while (fgets(splice_buf, 2048, splice_coords))
196 {
197 char* nl = strrchr(splice_buf, '\n');
198 char* buf = splice_buf;
199 if (nl) *nl = 0;
200
201 /**
202 Fields are:
203 1) reference name
204 2) left coord of splice (last char of the left exon)
205 3) right coord of splice (first char of the right exon)
206 */
207
208 char* ref_name = get_token((char**)&buf, "\t");
209 char* scan_left_coord = get_token((char**)&buf, "\t");
210 char* scan_right_coord = get_token((char**)&buf, "\t");
211 char* orientation = get_token((char**)&buf, "\t");
212
213 if (!scan_left_coord || !scan_right_coord || !orientation)
214 {
215 fprintf(stderr,"Error: malformed splice coordinate record\n");
216 exit(1);
217 }
218 uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
219 uint32_t left_coord = atoi(scan_left_coord);
220 uint32_t right_coord = atoi(scan_right_coord);
221 bool antisense = *orientation == '-';
222 junctions.insert(make_pair<Junction, JunctionStats>(Junction(ref_id, left_coord, right_coord, antisense), JunctionStats()));
223 }
224 }
225
226
227 /*
228 * Read in the deletion coordinates
229 * and store in a set
230 */
231 std::set<Deletion> deletions;
232 for(size_t i=0; i < deletion_coords_files.size(); ++i){
233 FILE* deletion_coords = deletion_coords_files[i];
234 if(!deletion_coords){
235 continue;
236 }
237 while (fgets(splice_buf, 2048, deletion_coords))
238 {
239 char* nl = strrchr(splice_buf, '\n');
240 char* buf = splice_buf;
241 if (nl) *nl = 0;
242
243 /**
244 Fields are:
245 1) reference name
246 2) left coord of splice (last char of the left exon)
247 3) right coord of splice (first char of the right exon)
248 */
249
250 char* ref_name = get_token((char**)&buf, "\t");
251 char* scan_left_coord = get_token((char**)&buf, "\t");
252 char* scan_right_coord = get_token((char**)&buf, "\t");
253
254 if (!scan_left_coord || !scan_right_coord)
255 {
256 fprintf(stderr,"Error: malformed deletion coordinate record\n");
257 exit(1);
258 }
259
260 /*
261 * Note that when reading in a deletion, the left co-ord is the position of the
262 * first deleted based. Since we are co-opting the junction data structure, need
263 * to fix up this location
264 */
265 uint32_t ref_id = rt.get_id(ref_name, NULL, 0);
266 uint32_t left_coord = atoi(scan_left_coord);
267 uint32_t right_coord = atoi(scan_right_coord);
268 deletions.insert(Deletion(ref_id, left_coord - 1, right_coord, false));
269 }
270 }
271
272 /*
273 * Read in the insertion coordinates
274 * and store in a set
275 */
276 std::set<Insertion> insertions;
277 for(size_t i=0; i < insertion_coords_files.size(); ++i){
278 FILE* insertion_coords = insertion_coords_files[i];
279 if(!insertion_coords){
280 continue;
281 }
282 while(fgets(splice_buf, 2048, insertion_coords)){
283 char* nl = strrchr(splice_buf, '\n');
284 char* buf = splice_buf;
285 if (nl) *nl = 0;
286
287 char* ref_name = get_token((char**)&buf, "\t");
288 char* scan_left_coord = get_token((char**)&buf, "\t");
289 char* scan_right_coord = get_token((char**)&buf, "\t");
290 char* scan_sequence = get_token((char**)&buf, "\t");
291
292 if (!scan_left_coord || !scan_sequence || !scan_right_coord)
293 {
294 fprintf(stderr,"Error: malformed insertion coordinate record\n");
295 exit(1);
296 }
297
298 seqan::Dna5String sequence = seqan::Dna5String(scan_sequence);
299 bool containsN = false;
300 for(size_t index = 0; index < seqan::length(sequence); index += 1){
301 /*
302 * Don't allow any ambiguities in the insertion
303 */
304 if(sequence[index] == 'N'){
305 containsN = true;
306 break;
307 }
308 }
309 if(containsN){
310 continue;
311 }
312 seqan::CharString charSequence = sequence;
313 uint32_t ref_id = rt.get_id(ref_name,NULL,0);
314 uint32_t left_coord = atoi(scan_left_coord);
315 insertions.insert(Insertion(ref_id, left_coord, seqan::toCString(charSequence)));
316 }
317 }
318
319
320 typedef RefSequenceTable::Sequence Reference;
321
322 while(ref_stream.good() &&
323 !ref_stream.eof())
324 {
325 Reference ref_str;
326 string name;
327
328 readMeta(ref_stream, name, Fasta());
329 string::size_type space_pos = name.find_first_of(" \t\r");
330 if (space_pos != string::npos)
331 {
332 name.resize(space_pos);
333 }
334
335 read(ref_stream, ref_str, Fasta());
336
337 uint32_t refid = rt.get_id(name, NULL, 0);
338 Junction dummy_left(refid, 0, 0, true);
339 Junction dummy_right(refid, 0xFFFFFFFF, 0xFFFFFFFF, true);
340
341 pair<JunctionSet::iterator, JunctionSet::iterator> r;
342 r.first = junctions.lower_bound(dummy_left);
343 r.second = junctions.upper_bound(dummy_right);
344
345 JunctionSet::iterator itr = r.first;
346
347 while(itr != r.second && itr != junctions.end())
348 {
349 print_splice(itr->first, read_length, itr->first.antisense ? "GTAG|rev" : "GTAG|fwd", ref_str, name, cout);
350 ++itr;
351 }
352 }
353
354
355 ref_stream.clear();
356 ref_stream.seekg(0, ios::beg);
357
358
359 while(ref_stream.good() &&
360 !ref_stream.eof())
361 {
362 Reference ref_str;
363 string name;
364
365 readMeta(ref_stream, name, Fasta());
366 string::size_type space_pos = name.find_first_of(" \t\r");
367 if (space_pos != string::npos)
368 {
369 name.resize(space_pos);
370 }
371
372 read(ref_stream, ref_str, Fasta());
373
374 uint32_t refid = rt.get_id(name, NULL,0);
375 Deletion dummy_left(refid, 0, 0, true);
376 Deletion dummy_right(refid, 0xFFFFFFFF, 0xFFFFFFFF, true);
377
378 pair<std::set<Deletion>::iterator, std::set<Deletion>::iterator> r;
379 r.first = deletions.lower_bound(dummy_left);
380 r.second = deletions.upper_bound(dummy_right);
381
382 std::set<Deletion>::iterator itr = r.first;
383
384 while(itr != r.second && itr != deletions.end())
385 {
386 print_splice((Junction)*itr, read_length, itr->antisense ? "del|rev" : "del|fwd", ref_str, name, cout);
387 ++itr;
388 }
389 }
390
391 ref_stream.clear();
392 ref_stream.seekg(0, ios::beg);
393
394
395
396 while(ref_stream.good() &&
397 !ref_stream.eof())
398 {
399 Reference ref_str;
400 string name;
401
402 readMeta(ref_stream, name, Fasta());
403 string::size_type space_pos = name.find_first_of(" \t\r");
404 if (space_pos != string::npos)
405 {
406 name.resize(space_pos);
407 }
408
409 read(ref_stream, ref_str, Fasta());
410
411 uint32_t refid = rt.get_id(name, NULL,0);
412 Insertion dummy_left(refid, 0, "");
413 Insertion dummy_right(refid, 0xFFFFFFFF, "");
414
415 std::set<Insertion>::iterator itr = insertions.lower_bound(dummy_left);
416 std::set<Insertion>::iterator upper = insertions.upper_bound(dummy_right);
417
418 while(itr != upper && itr != insertions.end()){
419 print_insertion(*itr, read_length, ref_str, name, cout);
420 ++itr;
421 }
422 }
423
424 }
425
426 int main(int argc, char** argv)
427 {
428 fprintf(stderr, "juncs_db v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
429 fprintf(stderr, "---------------------------\n");
430
431 int parse_ret = parse_options(argc, argv, print_usage);
432 if (parse_ret)
433 return parse_ret;
434
435 if(optind >= argc)
436 {
437 print_usage();
438 return 1;
439 }
440
441 min_anchor_len = parse_oInt(3, argv[optind++], "anchor length must be at least 3");
442
443 if(optind >= argc)
444 {
445 print_usage();
446 return 1;
447 }
448
449 read_length = parse_oInt(4, argv[optind++], "read length must be at least 4");
450
451 if(optind >= argc)
452 {
453 print_usage();
454 return 1;
455 }
456
457 string splice_coords_file_list = argv[optind++];
458 vector<string> splice_coords_file_names;
459 vector<FILE*> coords_files;
460 tokenize(splice_coords_file_list, ",", splice_coords_file_names);
461 for (size_t s = 0; s < splice_coords_file_names.size(); ++s)
462 {
463
464 FILE* coords_file = fopen(splice_coords_file_names[s].c_str(), "r");
465
466 if (!coords_file)
467 {
468 fprintf(stderr, "Warning: cannot open %s for reading\n",
469 splice_coords_file_names[s].c_str());
470 continue;
471 }
472 coords_files.push_back(coords_file);
473 }
474 if(optind >= argc)
475 {
476 print_usage();
477 return 1;
478 }
479
480 /*
481 * Read in the insertion co-ordinates
482 */
483 string insertion_coords_file_list = argv[optind++];
484 vector<string> insertion_coords_file_names;
485 vector<FILE*> insertion_coords_files;
486 tokenize(insertion_coords_file_list, ",", insertion_coords_file_names);
487 for(size_t s = 0; s < insertion_coords_file_names.size(); ++s)
488 {
489 FILE* insertion_coords_file = fopen(insertion_coords_file_names[s].c_str(),"r");
490 if(!insertion_coords_file)
491 {
492 fprintf(stderr, "Warning: cannot open %s for reading\n",
493 insertion_coords_file_names[s].c_str());
494 continue;
495 }
496 insertion_coords_files.push_back(insertion_coords_file);
497 }
498 if(optind >= argc)
499 {
500 print_usage();
501 return 1;
502 }
503
504
505 /*
506 * Read in the deletion co-ordinates
507 */
508 string deletion_coords_file_list = argv[optind++];
509 vector<string> deletion_coords_file_names;
510 vector<FILE*> deletion_coords_files;
511 tokenize(deletion_coords_file_list, ",", deletion_coords_file_names);
512 for(size_t s = 0; s < deletion_coords_file_names.size(); ++s)
513 {
514 FILE* deletion_coords_file = fopen(deletion_coords_file_names[s].c_str(),"r");
515 if(!deletion_coords_file)
516 {
517 fprintf(stderr, "Warning: cannot open %s for reading\n",
518 deletion_coords_file_names[s].c_str());
519 continue;
520 }
521 deletion_coords_files.push_back(deletion_coords_file);
522 }
523 if(optind >= argc)
524 {
525 print_usage();
526 return 1;
527 }
528
529
530
531 string ref_file_name = argv[optind++];
532 ifstream ref_stream(ref_file_name.c_str());
533
534 if (!ref_stream.good())
535 {
536 fprintf(stderr, "Error: cannot open %s for reading\n",
537 ref_file_name.c_str());
538 exit(1);
539 }
540
541 driver(coords_files, insertion_coords_files, deletion_coords_files, ref_stream);
542 return 0;
543 }