ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/tophat_cpp/extract_reads.cpp
Revision: 135
Committed: Mon Dec 12 22:28:38 2011 UTC (7 years, 10 months ago) by gpertea
File size: 4125 byte(s)
Log Message:
wip - SplicedSAMHitFactory() still not implemented

Line File contents
1 /*
2 * extract_reads.cpp
3 * TopHat
4 *
5 * Created by Cole Trapnell on 9/25/08.
6 * Copyright 2008 Cole Trapnell. All rights reserved.
7 *
8 */
9
10 #include <stdio.h>
11 #include <iostream>
12 #include <set>
13 #include <algorithm>
14 #include <string>
15 #include <cstring>
16 #include <cassert>
17 #include "reads.h"
18 #include "common.h"
19
20 using namespace std;
21
22 ReadFormat format = FASTA;
23
24 bool invert_selection = false;
25 bool use_alt_name = false;
26
27 struct r_to_l_lexcompare
28 {
29 bool operator()(const string& s1, const string& s2) const
30 {
31 return lexicographical_compare(s1.rbegin(), s1.rend(),
32 s2.rbegin(), s2.rend());
33 }
34 };
35
36
37 typedef set<string, r_to_l_lexcompare> READSET;
38
39 void extract_reads(FILE *fa, const READSET& selected_reads)
40 {
41 int num_reads = 0;
42 int reads_examined = 0;
43 Read read;
44 FLineReader fr(fa);
45 //while(!feof(fa))
46 while(!fr.isEof())
47 {
48 read.clear();
49
50 // Get the next read from the file
51 if (!next_fasta_record(fr, read.name, read.seq, format))
52 break;
53 if (format == FASTQ)
54 {
55 if (!next_fastq_record(fr, read.seq, read.alt_name, read.qual, format))
56 break;
57 }
58 reads_examined++;
59 const string& name_selected = use_alt_name ? read.alt_name : read.name;
60 bool read_in_set = selected_reads.find(name_selected) != selected_reads.end();
61 if ((read_in_set && !invert_selection) || (!read_in_set && invert_selection))
62 {
63 ++num_reads;
64 if (format == FASTA)
65 printf(">%s\n%s\n", read.name.c_str(), read.seq.c_str());
66 else if (format == FASTQ)
67 printf("@%s\n%s\n+%s\n%s\n",
68 read.name.c_str(), read.seq.c_str(),read.alt_name.c_str(),read.qual.c_str());
69
70 }
71 }
72 }
73
74
75 void get_next_chunk(FILE* fp,
76 READSET& selected_reads,
77 unsigned int chunk_size)
78 {
79 static int buf_size = 2048;
80 char buf[buf_size];
81
82 unsigned int id_count = 0;
83 while (!feof(fp) && fgets(buf, buf_size, fp))
84 {
85 // Chomp trailing newline
86 char* nl = strrchr(buf, '\n');
87 if (nl) *nl = 0;
88
89 // Chomp leading whitespace
90 char* p = buf;
91 while(*p && isspace(*p)) ++p;
92 if (!*p)
93 continue;
94
95 selected_reads.insert(p);
96 if (++id_count >= chunk_size)
97 break;
98 }
99 fprintf(stderr, "chunk contains %d read ids\n", id_count);
100 }
101
102
103 static void print_usage()
104 {
105 cerr << "Usage: extract_reads [options] read_ids < reads.f*" << endl;
106 cerr << " -f reads are in FASTA format" << endl;
107 cerr << " -q reads are in FASTQ format" << endl;
108 cerr << " -a Select reads based on alternate name field (FASTQ only)" << endl;
109 cerr << " -r <int> select reads in ARG sized chunks to reduce peak memory." << endl;
110 cerr << " -v invert the selection (i.e. select reads NOT in <read_ids>." << endl;
111 cerr << "\nSelects reads listed in read_ids from the standard input" << endl;
112 }
113
114 int main(int argc, char *argv[])
115 {
116 FILE *fp;
117 int c;
118 unsigned int chunk_size = VMAXINT32;
119 // Parse command line options
120 while ((c = getopt(argc, argv, "hqfr:va")) >= 0) {
121 switch (c) {
122 case 'h':
123 {
124 print_usage();
125 return 1;
126 }
127 case 'q':
128 {
129 format = FASTQ;
130 break;
131 }
132 case 'a':
133 {
134 use_alt_name = true;
135 break;
136 }
137 case 'f':
138 {
139 format = FASTA;
140 break;
141 }
142 case 'r':
143 {
144 chunk_size = parseIntOpt(1,"-r arg must be at least 1", print_usage);
145 break;
146 }
147 case 'v':
148 {
149 invert_selection = true;
150 break;
151 }
152 default: break;
153 }
154 }
155
156 if(optind >= argc) {
157 print_usage();
158 return 1;
159 }
160
161 string selected_reads_name = argv[optind];
162 FILE* f_selected_reads = fopen(selected_reads_name.c_str(),"r");
163 if (!f_selected_reads)
164 {
165 fprintf(stderr, "Error: could not open %s\n", selected_reads_name.c_str());
166 exit(1);
167 }
168
169 fp = stdin;
170 //fp = (strcmp(argv[optind], "-") == 0)? stdin : fopen(argv[optind], "r");
171 assert(fp);
172
173 READSET selected_reads;
174 //unsigned int chunk_number = 1;
175 do {
176 selected_reads.clear();
177 rewind(fp);
178
179 //fprintf(stderr,"Starting chunk # %d\n", chunk_number++);
180 get_next_chunk(f_selected_reads, selected_reads, chunk_size);
181 // Only print to standard out the good reads
182 extract_reads(fp, selected_reads);
183
184 }while(!feof(f_selected_reads));
185
186 fclose(fp);
187 return 0;
188 }