ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/cuffcompare/wigcov.cpp
Revision: 146
Committed: Thu Dec 22 20:41:43 2011 UTC (8 years, 3 months ago) by gpertea
File size: 16488 byte(s)
Log Message:
removed useless operator>

Line File contents
1 #include "GBase.h"
2 #include "GArgs.h"
3 #include "GStr.h"
4 #include "GList.hh"
5 #include "GHash.hh"
6 //#include "gffmrna.h"
7 #include "gff.h"
8 #include <ctype.h>
9
10
11 #define USAGE "Usage:\n\
12 wigcov [-f <input.gff>] [-A] <input.wig>\n\
13 \n\
14 Shows the coverage islands found in <input.wig>, or,\n\
15 if the -f option is given, acts as a filter for the given transcripts,\n\
16 only showing those transcripts overlapping the coverage islands. \n\
17 \n\
18 -A option shows such a transcript even if a single exon overlaps the\n\
19 coverage islands (default: all exons should overlap)\n\
20 "
21 class CovIsland {
22 uint nextp;
23 public:
24 uint start;
25 uint end;
26 CovIsland(uint gstart, int initcov, int cspan=1) {
27 init(gstart, initcov, cspan);
28 nextp=end+1;
29 }
30 CovIsland(GLineReader& covreader, GStr& seqname, uint maxgap=10);
31 //construct a whole island by reading next lines from file
32 //and stopping if a 0-coverage gap of at least maxgap is encountered
33 void init(uint gstart, int initcov, int cspan=1) {
34 start=gstart;
35 end=gstart+cspan-1;
36 nextp=end+1;
37 }
38 void extend(int scov, int cspan=1) {
39 if (scov>255) scov=255;
40 end+=cspan;
41 nextp=end+1;
42 }
43 bool operator==(CovIsland& c){
44 return (start==c.start && this->end==c.end);
45 }
46 bool operator<(CovIsland& c){
47 return (start==c.start) ? (end<c.end) : (start<c.start);
48 }
49 };
50
51 CovIsland::CovIsland(GLineReader& covreader, GStr& gseqname, uint maxgap) {
52 //build a coverage island by parsing the next .wig lines
53 static const char msg_ERR_COVPARSE[] = " Error at parsing coverage file, line: %s\n";
54 char *line;
55 char* fields[5];
56 start=0;
57 end=0;
58 nextp=0;
59 gseqname.clear(); //it will update it
60 while ((line=covreader.nextLine())!=NULL) {
61 GStr l=line; //keep a copy for error messages
62 if (l.length()<3) continue;
63 int tidx=strsplit(line,fields, 4, '\t');
64 if (tidx<4) continue;
65 uint cstart=0, cend=0;
66 int lcov=0;
67 if (!gseqname.is_empty() && gseqname!=fields[0]) {
68 //different genomic seq, return here
69 nextp=MAXUINT;
70 covreader.pushBack(); //leave it for next CovIsland
71 break;
72 }
73
74 if (!(parseUInt(fields[1],cstart) && parseUInt(fields[2],cend) && parseInt(fields[3],lcov)))
75 GError(msg_ERR_COVPARSE, l.chars());
76 cend--;
77 if (lcov==0) { // no coverage segment
78 if (cend-cstart>maxgap) { //gap larger than minimum intron
79 if (end) { nextp=cend+1; break; } //close this CovIsland
80 else continue; //skip initially uncovered region
81 }
82 }// cov gap
83 if (end) {
84 if (cstart>end+1) { //implied gap
85 if (cstart-end-1>maxgap) {
86 nextp=cstart;
87 covreader.pushBack(); //read it next time
88 break; //island end
89 }
90 extend(0, cstart-end+1); //extend through small gap
91 } //implied gap
92 extend(lcov, cend-cstart+1);
93 }
94 else { //new start
95 init(cstart, lcov, cend-cstart+1);
96 if (gseqname.is_empty()) gseqname=fields[0];
97 }
98 }//while coverage lines
99 // -- we just finished putting together a coverage island
100
101 }
102
103 class GLocus:public GSeg {
104 public:
105 int gseq_id; //id of underlying genomic sequence
106 GList<GffObj> mrnas; //list of transcripts for this locus
107 GArray<GSeg> mexons; //list of merged exons in this region
108 GLocus(GffObj* mrna=NULL):mrnas(true,false,false), mexons(true,true) {
109 //this will NOT free mrnas!
110 gseq_id=-1;
111 if (mrna!=NULL) {
112 start=mrna->exons.First()->start;
113 end=mrna->exons.Last()->end;;
114 gseq_id=mrna->gseq_id;
115 GSeg seg;
116 for (int i=0;i<mrna->exons.Count();i++) {
117 seg.start=mrna->exons[i]->start;
118 seg.end=mrna->exons[i]->end;
119 mexons.Add(seg);
120 }
121 mrnas.Add(mrna);
122 mrna->uptr=this;
123 }
124 }
125 void addMerge(GLocus& locus, GffObj* lnkmrna) {
126 //add all the elements of the other locus (merging)
127 //-- merge mexons
128 GArray<int> ovlexons(true,true); //list of locus.mexons indexes overlapping existing mexons
129 int i=0; //index of first mexons with a merge
130 int j=0; //index current mrna exon
131 while (i<mexons.Count() && j<locus.mexons.Count()) {
132 uint istart=mexons[i].start;
133 uint iend=mexons[i].end;
134 uint jstart=locus.mexons[j].start;
135 uint jend=locus.mexons[j].end;
136 if (iend<jstart) { i++; continue; }
137 if (jend<istart) { j++; continue; }
138 //if (mexons[i].overlap(jstart, jend)) {
139 //exon overlap was found :
140 ovlexons.Add(j);
141 //extend mexons[i] as needed
142 if (jstart<istart) mexons[i].start=jstart;
143 if (jend>iend) { //mexons[i] end extend
144 mexons[i].end=jend;
145 //now this could overlap the next mexon(s), so we have to merge them all
146 while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
147 uint nextend=mexons[i+1].end;
148 mexons.Delete(i+1);
149 if (nextend>mexons[i].end) {
150 mexons[i].end=nextend;
151 break; //no need to check next mexons
152 }
153 } //while next mexons merge
154 } // mexons[i] end extend
155 // } //exon overlap
156 j++; //check the next locus.mexon
157 }
158 // -- merge uexons
159 //add to exons:
160 GSeg seg;
161 for (int i=0;i<locus.mexons.Count();i++) {
162 seg.start=locus.mexons[i].start;
163 seg.end=locus.mexons[i].end;
164 if (!ovlexons.Exists(i)) mexons.Add(seg);
165 }
166 // -- add locus.mrnas
167 for (int i=0;i<locus.mrnas.Count();i++) {
168 locus.mrnas[i]->uptr=this;
169 if (locus.mrnas[i]!=lnkmrna)
170 mrnas.Add(locus.mrnas[i]);
171 }
172 // -- adjust start/end as needed
173 if (start>locus.start) start=locus.start;
174 if (end<locus.end) end=locus.end;
175 }
176
177 bool add_mRNA(GffObj* mrna) {
178 if (mrnas.Count()>0 && mrna->gseq_id!=gseq_id) return false; //mrna must be on the same genomic seq
179 //check for exon overlap with existing mexons
180 //also update uexons and mexons accordingly, if mrna is added
181 uint mrna_start=mrna->exons.First()->start;
182 uint mrna_end=mrna->exons.Last()->end;
183 if (mrna_start>end || start>mrna_end) return false;
184 bool hasovl=false;
185 int i=0; //index of first mexons with a merge
186 int j=0; //index current mrna exon
187 GArray<int> ovlexons(true,true); //list of mrna exon indexes overlapping mexons
188 while (i<mexons.Count() && j<mrna->exons.Count()) {
189 uint istart=mexons[i].start;
190 uint iend=mexons[i].end;
191 uint jstart=mrna->exons[j]->start;
192 uint jend=mrna->exons[j]->end;
193 if (iend<jstart) { i++; continue; }
194 if (jend<istart) { j++; continue; }
195 //exon overlap found if we're here:
196 ovlexons.Add(j);
197 hasovl=true;
198 //extend mexons[i] as needed
199 if (jstart<istart) mexons[i].start=jstart;
200 if (jend>iend) { //mexon stretch up
201 mexons[i].end=jend;
202 //now this could overlap the next mexon(s), so we have to merge them all
203 while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
204 uint nextend=mexons[i+1].end;
205 mexons.Delete(i+1);
206 if (nextend>mexons[i].end) {
207 mexons[i].end=nextend;
208 break; //no need to check next mexons
209 }
210 } //while next mexons merge
211 } //possible mexons merge
212
213 j++; //check the next mrna exon
214 }//all vs all exon check loop
215 if (hasovl) {
216 GSeg seg;
217 for (int i=0;i<mrna->exons.Count();i++) {
218 seg.start=mrna->exons[i]->start;
219 seg.end=mrna->exons[i]->end;
220 if (!ovlexons.Exists(i)) mexons.Add(seg);
221 }
222 // add to mrnas
223 mrnas.Add(mrna);
224 // adjust start/end
225 if (start>mrna_start) start=mrna_start;
226 if (end<mrna_end) end=mrna_end;
227 mrna->uptr=this;
228 gseq_id=mrna->gseq_id;
229 }
230 return hasovl;
231 }
232 };
233
234
235 class GSeqData {
236 public:
237 GList<GffObj> mrnas;
238 GList<GLocus> loci;
239 GSeqData():mrnas(true,true,false), loci(true,true,true) {
240 }
241 };
242
243 void openfw(FILE* &f, GArgs& args, char opt) {
244 GStr s=args.getOpt(opt);
245 if (!s.is_empty()) {
246 if (s=='-')
247 f=stdout;
248 else {
249 f=fopen(s,"w");
250 if (f==NULL) GError("Error creating file: %s\n", s.chars());
251 }
252 }
253 }
254
255 // --- globals:
256 GHash<GSeqData> ctghash(true);
257
258 void cluster_mRNAs(GList<GffObj>& mrnas, GList<GLocus>& loci) {
259 //mrnas sorted by start coordinate
260 //and so are the loci
261 for (int t=0;t<mrnas.Count();t++) {
262 GArray<int> mrgloci(true);
263 GffObj* mrna=mrnas[t];
264 int lfound=0; //count of parent loci
265 for (int l=0;l<loci.Count();l++) {
266 if (loci[l]->end<mrna->exons.First()->start) continue;
267 if (loci[l]->add_mRNA(mrna)) {
268 //a parent locus was found
269 lfound++;
270 mrgloci.Add(l);
271 }
272 }//loci loop
273 if (lfound==0) {
274 //create a locus with only this mRNA
275 loci.Add(new GLocus(mrna));
276 }
277 else if (lfound>1) {
278 //more than one loci found parenting this mRNA, merge loci
279 //if (lfound>2) GMessage(" merging %d loci \n",lfound);
280 for (int l=1;l<lfound;l++) {
281 int mlidx=mrgloci[l]-l+1;
282 loci[mrgloci[0]]->addMerge(*loci[mlidx], mrna);
283 loci.Delete(mlidx);
284 }
285 }
286 }//mrnas loop
287 }
288
289 class COvlCovList: public GList<CovIsland> {
290 public:
291 GLocus* ovloc;
292 COvlCovList():GList<CovIsland>(true,true,false) {
293 ovloc=NULL;
294 }
295 void addCov(CovIsland& cov, GLocus* loc=NULL) {
296 //add CovIsland overlapping locus
297 if (this->Count()==0 && loc==NULL)
298 GError("Error: cannot add CovIsland to COvlCovList without a locus");
299 if (loc!=NULL) {
300 if (!loc->overlap(cov.start, cov.end))
301 GError("Error: adding non-overlapping CovIsland %d-%d to COvlCovList for locus %d-%d!\n",
302 cov.start, cov.end, loc->start, loc->end);
303 if (ovloc!=NULL && ovloc!=loc)
304 GError("Error: changing locus when adding new overlapping CovIsland\n");
305 ovloc=loc;
306 }
307 this->Add(new CovIsland(cov));
308 }
309 bool mrnaOvl(GffObj& mrna, bool fullcov) {
310 //TODO: check single or all-exon match
311 int i=0;//cov index
312 int j=0;//exon index
313 int* exonovl=NULL; //exonovl[j]=1 if exon j has overlap
314 GCALLOC(exonovl, mrna.exons.Count()*sizeof(int));
315 bool aovl=false; //at least an ovl
316 while (j<mrna.exons.Count() && i<Count()) {
317 uint jstart=mrna.exons[j]->start;
318 uint jend=mrna.exons[j]->end;
319 uint istart=Get(i)->start;
320 uint iend=Get(i)->end;
321 if (iend<jstart) { i++; continue; } //next covisland
322 if (jend<istart) { j++; continue; } //next exon
323 //we have overlap!
324 exonovl[j]=1;aovl=true;
325 if (!fullcov) { GFREE(exonovl); return true; }
326 // if (iend<jend) i++;
327 // else j++;
328 j++; //check next exon
329 }
330 if (!aovl) { GFREE(exonovl); return false; }
331 if (fullcov)
332 for (j=0;j<mrna.exons.Count();j++) {
333 if (exonovl[j]!=1) { GFREE(exonovl); return false; }
334 }
335 GFREE(exonovl);
336 return true;
337 }
338
339 void ovlCheck(bool fullcov, FILE* fout) {
340 if (Count()==0) return;
341 for (int m=0;m<ovloc->mrnas.Count();m++) {
342 if (ovloc->mrnas[m]->udata==0 && mrnaOvl(*(ovloc->mrnas[m]), fullcov)) {
343 ovloc->mrnas[m]->udata=1;
344 ovloc->mrnas[m]->printGtf(fout);
345 }
346 }//for each mrna in this locus
347 // now clear it, we're done with this
348 ovloc=NULL;
349 Clear();
350 }
351
352 };
353
354
355 void load_mRNAs(GStr& fname) {
356 // reads mRNAs into ctghash and clusters them into loci
357 GMessage("Reading mRNAs..");
358 FILE* f=fopen(fname.chars(),"r");
359 if (f==NULL) GError("Error opening gff file %s\n",fname.chars());
360 GffReader* gffr=new GffReader(f,true);
361 gffr->readAll();
362 for (int k=0;k<gffr->gflst.Count();k++) {
363 GffObj* m=gffr->gflst[k];
364 const char* gseqname=m->getGSeqName();
365 GSeqData* gdata=ctghash.Find(gseqname);
366 if (gdata==NULL) {
367 gdata=new GSeqData();
368 ctghash.shkAdd(gseqname,gdata);
369 }
370 gdata->mrnas.Add(m);
371 }//for each mRNA loaded
372 GMessage(" .. %d mRNAs loaded.\n",gffr->gflst.Count());
373 delete gffr;
374 GMessage("Clustering mRNAs into loci..\n");
375 // -- now for each contig, cluster mRNAs into loci
376 ctghash.startIterate();
377 GSeqData* gdata=NULL;
378 int numloci=0;
379 while ((gdata=ctghash.NextData())!=NULL) {
380 cluster_mRNAs(gdata->mrnas, gdata->loci);
381 numloci+=gdata->loci.Count();
382 }
383 GMessage(" .. %d loci formed.\n",numloci);
384 fclose(f);
385 }
386
387 GList<GLocus>* getCtgLoci(GStr& ctgname) {
388 GSeqData* r=ctghash.Find(ctgname.chars());
389 return (r==NULL ? NULL : &(r->loci));
390 }
391
392 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
393 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
394
395 int minintron = 12; //used for crossing small "exon" gaps
396
397 int main(int argc, char * const argv[]) {
398 GArgs args(argc, argv, "Ahf:o:");
399 int e;
400 if ((e=args.isError())>0)
401 GError("%s\nInvalid argument: %s\n", USAGE, argv[e]);
402 if (args.getOpt('h')!=NULL) GError("%s\n", USAGE);
403 bool mrna_filter=false;
404 GStr s=args.getOpt('f');
405 if (!s.is_empty()) {
406 load_mRNAs(s); //also clusters them into loci
407 mrna_filter=true;
408 }
409 FILE* f_in=NULL;
410 FILE* f_out=NULL;
411 openfw(f_out, args, 'o');
412 if (f_out==NULL) f_out=stdout;
413 if (args.startNonOpt()==0) GError("%s\n", USAGE);
414 char* infile=args.nextNonOpt();
415 f_in=fopen(infile,"r");
416 if (f_in==NULL) GError("Cannot open input file %s!\n",infile);
417 bool fullOvl=(mrna_filter && args.getOpt('A')==NULL);
418 GLineReader covreader(f_in);
419 GStr gseqname;
420
421 if (!mrna_filter) { //simply write the coverage islands and quit
422 while (!covreader.eof()) {
423 CovIsland cov(covreader, gseqname, minintron);
424 if (cov.start==0) continue; //no valid coverage parsed, try next line
425 fprintf(f_out, "%s\t%d\t%d\n", gseqname.chars(),cov.start, cov.end);
426 }//while reading cov islands
427 FWCLOSE(f_out);
428 FRCLOSE(f_in);
429 return 0;
430 }
431
432 GStr prevgseq;
433 /* mrna_filter method:
434 * when switching to a new contig
435 * *skip CovIslands until we find one overlapping the next locus on that contig
436 * *create a new GList<CovIsland> and add CovIslands while they are in range of current locus
437 * * when the next CovIsland falls outside the current locus, do the overlapCheck
438 * between the GList<CovIsland> and every mRNA in the locus
439 * * only output those mRNAs having overlaps for EACH exon with any of GList<CovIsland>
440 */
441 COvlCovList covlst; //CovIslands overlapping current locus
442 GList<GLocus>* ctgloci=NULL;
443 int locidx=0; //current locus being checked
444 while (!covreader.eof()) {
445 CovIsland cov(covreader, gseqname, minintron);
446 if (cov.start==0) continue; //no valid coverage parsed, try next line
447 if (prevgseq!=gseqname) { //contig change?
448 prevgseq=gseqname;
449 covlst.ovlCheck(fullOvl, f_out);
450 ctgloci=getCtgLoci(gseqname);
451 locidx=0;
452 }// contig change
453
454 //entry point when advancing locidx, to recheck the current cov
455 checkNextLocus:
456 if (ctgloci==NULL) {
457 continue; //no loci on this contig, try next
458 }
459 if (locidx>=ctgloci->Count()) {
460 //check out what we got so far
461 //covlst.ovlCheck(fullOvl, f_out);
462 ctgloci=NULL;
463 continue;
464 }
465 //ctgloci !=NULL, there are mRNAs for this contig
466 //if (covlst.Count()==0) { //no overlap found yet
467 GLocus* curloc=ctgloci->Get(locidx);
468 if (curloc->end<cov.start) {
469 //locus is behind current cov, has to catch up
470 //check what we have so far and move on
471 covlst.ovlCheck(fullOvl, f_out);
472 locidx++;
473 goto checkNextLocus; //same CovIsland
474 } //let
475 // -- here we have current locus->end >= cov.start
476 if (cov.end<curloc->start) {
477 //cov behind locus
478 //check what we have so far and move on
479 covlst.ovlCheck(fullOvl, f_out);
480 //try next covisland against the same locus
481 continue; //same locus, next covisland if any
482 }
483 //--here we have overlap of cov with current locus
484 covlst.addCov(cov,curloc);
485 covlst.ovlCheck(fullOvl, f_out);
486 locidx++;
487 goto checkNextLocus; //see if there are more loci overlapping this covisland
488 }//while reading wig lines
489 covlst.ovlCheck(fullOvl,f_out);
490 FWCLOSE(f_out);
491 FRCLOSE(f_in);
492 //GMessage(".. press Enter to exit ..\n");
493 //getchar();
494 }