ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/cuffcompare/wigcov.cpp
Revision: 20
Committed: Mon Jul 18 21:08:36 2011 UTC (8 years, 3 months ago) by gpertea
File size: 16587 byte(s)
Log Message:
added cuffcompare sources

Line File contents
1 #include "GBase.h"
2 #include "GArgs.h"
3 #include "GStr.h"
4 #include "GList.hh"
5 #include "GHash.hh"
6 //#include "gffmrna.h"
7 #include "gff.h"
8 #include <ctype.h>
9
10
11 #define USAGE "Usage:\n\
12 wigcov [-f <input.gff>] [-A] <input.wig>\n\
13 \n\
14 Shows the coverage islands found in <input.wig>, or,\n\
15 if the -f option is given, acts as a filter for the given transcripts,\n\
16 only showing those transcripts overlapping the coverage islands. \n\
17 \n\
18 -A option shows such a transcript even if a single exon overlaps the\n\
19 coverage islands (default: all exons should overlap)\n\
20 "
21 class CovIsland {
22 uint nextp;
23 public:
24 uint start;
25 uint end;
26 CovIsland(uint gstart, int initcov, int cspan=1) {
27 init(gstart, initcov, cspan);
28 nextp=end+1;
29 }
30 CovIsland(GLineReader& covreader, GStr& seqname, uint maxgap=10);
31 //construct a whole island by reading next lines from file
32 //and stopping if a 0-coverage gap of at least maxgap is encountered
33 void init(uint gstart, int initcov, int cspan=1) {
34 start=gstart;
35 end=gstart+cspan-1;
36 nextp=end+1;
37 }
38 void extend(int scov, int cspan=1) {
39 if (scov>255) scov=255;
40 end+=cspan;
41 nextp=end+1;
42 }
43 bool operator==(CovIsland& c){
44 return (start==c.start && this->end==c.end);
45 }
46 bool operator>(CovIsland& c){
47 return (start==c.start) ? (end>c.end) : (start>c.start);
48 }
49 bool operator<(CovIsland& c){
50 return (start==c.start) ? (end<c.end) : (start<c.start);
51 }
52 };
53
54 CovIsland::CovIsland(GLineReader& covreader, GStr& gseqname, uint maxgap) {
55 //build a coverage island by parsing the next .wig lines
56 static const char msg_ERR_COVPARSE[] = " Error at parsing coverage file, line: %s\n";
57 char *line;
58 char* fields[5];
59 start=0;
60 end=0;
61 nextp=0;
62 gseqname.clear(); //it will update it
63 while ((line=covreader.nextLine())!=NULL) {
64 GStr l=line; //keep a copy for error messages
65 if (l.length()<3) continue;
66 int tidx=strsplit(line,fields, 4, '\t');
67 if (tidx<4) continue;
68 uint cstart=0, cend=0;
69 int lcov=0;
70 if (!gseqname.is_empty() && gseqname!=fields[0]) {
71 //different genomic seq, return here
72 nextp=MAXUINT;
73 covreader.pushBack(); //leave it for next CovIsland
74 break;
75 }
76
77 if (!(parseUInt(fields[1],cstart) && parseUInt(fields[2],cend) && parseInt(fields[3],lcov)))
78 GError(msg_ERR_COVPARSE, l.chars());
79 cend--;
80 if (lcov==0) { // no coverage segment
81 if (cend-cstart>maxgap) { //gap larger than minimum intron
82 if (end) { nextp=cend+1; break; } //close this CovIsland
83 else continue; //skip initially uncovered region
84 }
85 }// cov gap
86 if (end) {
87 if (cstart>end+1) { //implied gap
88 if (cstart-end-1>maxgap) {
89 nextp=cstart;
90 covreader.pushBack(); //read it next time
91 break; //island end
92 }
93 extend(0, cstart-end+1); //extend through small gap
94 } //implied gap
95 extend(lcov, cend-cstart+1);
96 }
97 else { //new start
98 init(cstart, lcov, cend-cstart+1);
99 if (gseqname.is_empty()) gseqname=fields[0];
100 }
101 }//while coverage lines
102 // -- we just finished putting together a coverage island
103
104 }
105
106 class GLocus:public GSeg {
107 public:
108 int gseq_id; //id of underlying genomic sequence
109 GList<GffObj> mrnas; //list of transcripts for this locus
110 GArray<GSeg> mexons; //list of merged exons in this region
111 GLocus(GffObj* mrna=NULL):mrnas(true,false,false), mexons(true,true) {
112 //this will NOT free mrnas!
113 gseq_id=-1;
114 if (mrna!=NULL) {
115 start=mrna->exons.First()->start;
116 end=mrna->exons.Last()->end;;
117 gseq_id=mrna->gseq_id;
118 GSeg seg;
119 for (int i=0;i<mrna->exons.Count();i++) {
120 seg.start=mrna->exons[i]->start;
121 seg.end=mrna->exons[i]->end;
122 mexons.Add(seg);
123 }
124 mrnas.Add(mrna);
125 mrna->uptr=this;
126 }
127 }
128 void addMerge(GLocus& locus, GffObj* lnkmrna) {
129 //add all the elements of the other locus (merging)
130 //-- merge mexons
131 GArray<int> ovlexons(true,true); //list of locus.mexons indexes overlapping existing mexons
132 int i=0; //index of first mexons with a merge
133 int j=0; //index current mrna exon
134 while (i<mexons.Count() && j<locus.mexons.Count()) {
135 uint istart=mexons[i].start;
136 uint iend=mexons[i].end;
137 uint jstart=locus.mexons[j].start;
138 uint jend=locus.mexons[j].end;
139 if (iend<jstart) { i++; continue; }
140 if (jend<istart) { j++; continue; }
141 //if (mexons[i].overlap(jstart, jend)) {
142 //exon overlap was found :
143 ovlexons.Add(j);
144 //extend mexons[i] as needed
145 if (jstart<istart) mexons[i].start=jstart;
146 if (jend>iend) { //mexons[i] end extend
147 mexons[i].end=jend;
148 //now this could overlap the next mexon(s), so we have to merge them all
149 while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
150 uint nextend=mexons[i+1].end;
151 mexons.Delete(i+1);
152 if (nextend>mexons[i].end) {
153 mexons[i].end=nextend;
154 break; //no need to check next mexons
155 }
156 } //while next mexons merge
157 } // mexons[i] end extend
158 // } //exon overlap
159 j++; //check the next locus.mexon
160 }
161 // -- merge uexons
162 //add to exons:
163 GSeg seg;
164 for (int i=0;i<locus.mexons.Count();i++) {
165 seg.start=locus.mexons[i].start;
166 seg.end=locus.mexons[i].end;
167 if (!ovlexons.Exists(i)) mexons.Add(seg);
168 }
169 // -- add locus.mrnas
170 for (int i=0;i<locus.mrnas.Count();i++) {
171 locus.mrnas[i]->uptr=this;
172 if (locus.mrnas[i]!=lnkmrna)
173 mrnas.Add(locus.mrnas[i]);
174 }
175 // -- adjust start/end as needed
176 if (start>locus.start) start=locus.start;
177 if (end<locus.end) end=locus.end;
178 }
179
180 bool add_mRNA(GffObj* mrna) {
181 if (mrnas.Count()>0 && mrna->gseq_id!=gseq_id) return false; //mrna must be on the same genomic seq
182 //check for exon overlap with existing mexons
183 //also update uexons and mexons accordingly, if mrna is added
184 uint mrna_start=mrna->exons.First()->start;
185 uint mrna_end=mrna->exons.Last()->end;
186 if (mrna_start>end || start>mrna_end) return false;
187 bool hasovl=false;
188 int i=0; //index of first mexons with a merge
189 int j=0; //index current mrna exon
190 GArray<int> ovlexons(true,true); //list of mrna exon indexes overlapping mexons
191 while (i<mexons.Count() && j<mrna->exons.Count()) {
192 uint istart=mexons[i].start;
193 uint iend=mexons[i].end;
194 uint jstart=mrna->exons[j]->start;
195 uint jend=mrna->exons[j]->end;
196 if (iend<jstart) { i++; continue; }
197 if (jend<istart) { j++; continue; }
198 //exon overlap found if we're here:
199 ovlexons.Add(j);
200 hasovl=true;
201 //extend mexons[i] as needed
202 if (jstart<istart) mexons[i].start=jstart;
203 if (jend>iend) { //mexon stretch up
204 mexons[i].end=jend;
205 //now this could overlap the next mexon(s), so we have to merge them all
206 while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
207 uint nextend=mexons[i+1].end;
208 mexons.Delete(i+1);
209 if (nextend>mexons[i].end) {
210 mexons[i].end=nextend;
211 break; //no need to check next mexons
212 }
213 } //while next mexons merge
214 } //possible mexons merge
215
216 j++; //check the next mrna exon
217 }//all vs all exon check loop
218 if (hasovl) {
219 GSeg seg;
220 for (int i=0;i<mrna->exons.Count();i++) {
221 seg.start=mrna->exons[i]->start;
222 seg.end=mrna->exons[i]->end;
223 if (!ovlexons.Exists(i)) mexons.Add(seg);
224 }
225 // add to mrnas
226 mrnas.Add(mrna);
227 // adjust start/end
228 if (start>mrna_start) start=mrna_start;
229 if (end<mrna_end) end=mrna_end;
230 mrna->uptr=this;
231 gseq_id=mrna->gseq_id;
232 }
233 return hasovl;
234 }
235 };
236
237
238 class GSeqData {
239 public:
240 GList<GffObj> mrnas;
241 GList<GLocus> loci;
242 GSeqData():mrnas(true,true,false), loci(true,true,true) {
243 }
244 };
245
246 void openfw(FILE* &f, GArgs& args, char opt) {
247 GStr s=args.getOpt(opt);
248 if (!s.is_empty()) {
249 if (s=='-')
250 f=stdout;
251 else {
252 f=fopen(s,"w");
253 if (f==NULL) GError("Error creating file: %s\n", s.chars());
254 }
255 }
256 }
257
258 // --- globals:
259 GHash<GSeqData> ctghash(true);
260
261 void cluster_mRNAs(GList<GffObj>& mrnas, GList<GLocus>& loci) {
262 //mrnas sorted by start coordinate
263 //and so are the loci
264 for (int t=0;t<mrnas.Count();t++) {
265 GArray<int> mrgloci(true);
266 GffObj* mrna=mrnas[t];
267 int lfound=0; //count of parent loci
268 for (int l=0;l<loci.Count();l++) {
269 if (loci[l]->end<mrna->exons.First()->start) continue;
270 if (loci[l]->add_mRNA(mrna)) {
271 //a parent locus was found
272 lfound++;
273 mrgloci.Add(l);
274 }
275 }//loci loop
276 if (lfound==0) {
277 //create a locus with only this mRNA
278 loci.Add(new GLocus(mrna));
279 }
280 else if (lfound>1) {
281 //more than one loci found parenting this mRNA, merge loci
282 //if (lfound>2) GMessage(" merging %d loci \n",lfound);
283 for (int l=1;l<lfound;l++) {
284 int mlidx=mrgloci[l]-l+1;
285 loci[mrgloci[0]]->addMerge(*loci[mlidx], mrna);
286 loci.Delete(mlidx);
287 }
288 }
289 }//mrnas loop
290 }
291
292 class COvlCovList: public GList<CovIsland> {
293 public:
294 GLocus* ovloc;
295 COvlCovList():GList<CovIsland>(true,true,false) {
296 ovloc=NULL;
297 }
298 void addCov(CovIsland& cov, GLocus* loc=NULL) {
299 //add CovIsland overlapping locus
300 if (this->Count()==0 && loc==NULL)
301 GError("Error: cannot add CovIsland to COvlCovList without a locus");
302 if (loc!=NULL) {
303 if (!loc->overlap(cov.start, cov.end))
304 GError("Error: adding non-overlapping CovIsland %d-%d to COvlCovList for locus %d-%d!\n",
305 cov.start, cov.end, loc->start, loc->end);
306 if (ovloc!=NULL && ovloc!=loc)
307 GError("Error: changing locus when adding new overlapping CovIsland\n");
308 ovloc=loc;
309 }
310 this->Add(new CovIsland(cov));
311 }
312 bool mrnaOvl(GffObj& mrna, bool fullcov) {
313 //TODO: check single or all-exon match
314 int i=0;//cov index
315 int j=0;//exon index
316 int* exonovl=NULL; //exonovl[j]=1 if exon j has overlap
317 GCALLOC(exonovl, mrna.exons.Count()*sizeof(int));
318 bool aovl=false; //at least an ovl
319 while (j<mrna.exons.Count() && i<Count()) {
320 uint jstart=mrna.exons[j]->start;
321 uint jend=mrna.exons[j]->end;
322 uint istart=Get(i)->start;
323 uint iend=Get(i)->end;
324 if (iend<jstart) { i++; continue; } //next covisland
325 if (jend<istart) { j++; continue; } //next exon
326 //we have overlap!
327 exonovl[j]=1;aovl=true;
328 if (!fullcov) { GFREE(exonovl); return true; }
329 // if (iend<jend) i++;
330 // else j++;
331 j++; //check next exon
332 }
333 if (!aovl) { GFREE(exonovl); return false; }
334 if (fullcov)
335 for (j=0;j<mrna.exons.Count();j++) {
336 if (exonovl[j]!=1) { GFREE(exonovl); return false; }
337 }
338 GFREE(exonovl);
339 return true;
340 }
341
342 void ovlCheck(bool fullcov, FILE* fout) {
343 if (Count()==0) return;
344 for (int m=0;m<ovloc->mrnas.Count();m++) {
345 if (ovloc->mrnas[m]->udata==0 && mrnaOvl(*(ovloc->mrnas[m]), fullcov)) {
346 ovloc->mrnas[m]->udata=1;
347 ovloc->mrnas[m]->printGtf(fout);
348 }
349 }//for each mrna in this locus
350 // now clear it, we're done with this
351 ovloc=NULL;
352 Clear();
353 }
354
355 };
356
357
358 void load_mRNAs(GStr& fname) {
359 // reads mRNAs into ctghash and clusters them into loci
360 GMessage("Reading mRNAs..");
361 FILE* f=fopen(fname.chars(),"r");
362 if (f==NULL) GError("Error opening gff file %s\n",fname.chars());
363 GffReader* gffr=new GffReader(f,true);
364 gffr->readAll();
365 for (int k=0;k<gffr->gflst.Count();k++) {
366 GffObj* m=gffr->gflst[k];
367 const char* gseqname=m->getGSeqName();
368 GSeqData* gdata=ctghash.Find(gseqname);
369 if (gdata==NULL) {
370 gdata=new GSeqData();
371 ctghash.shkAdd(gseqname,gdata);
372 }
373 gdata->mrnas.Add(m);
374 }//for each mRNA loaded
375 GMessage(" .. %d mRNAs loaded.\n",gffr->gflst.Count());
376 delete gffr;
377 GMessage("Clustering mRNAs into loci..\n");
378 // -- now for each contig, cluster mRNAs into loci
379 ctghash.startIterate();
380 GSeqData* gdata=NULL;
381 int numloci=0;
382 while ((gdata=ctghash.NextData())!=NULL) {
383 cluster_mRNAs(gdata->mrnas, gdata->loci);
384 numloci+=gdata->loci.Count();
385 }
386 GMessage(" .. %d loci formed.\n",numloci);
387 fclose(f);
388 }
389
390 GList<GLocus>* getCtgLoci(GStr& ctgname) {
391 GSeqData* r=ctghash.Find(ctgname.chars());
392 return (r==NULL ? NULL : &(r->loci));
393 }
394
395 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
396 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
397
398 int minintron = 12; //used for crossing small "exon" gaps
399
400 int main(int argc, char * const argv[]) {
401 GArgs args(argc, argv, "Ahf:o:");
402 int e;
403 if ((e=args.isError())>0)
404 GError("%s\nInvalid argument: %s\n", USAGE, argv[e]);
405 if (args.getOpt('h')!=NULL) GError("%s\n", USAGE);
406 bool mrna_filter=false;
407 GStr s=args.getOpt('f');
408 if (!s.is_empty()) {
409 load_mRNAs(s); //also clusters them into loci
410 mrna_filter=true;
411 }
412 FILE* f_in=NULL;
413 FILE* f_out=NULL;
414 openfw(f_out, args, 'o');
415 if (f_out==NULL) f_out=stdout;
416 if (args.startNonOpt()==0) GError("%s\n", USAGE);
417 char* infile=args.nextNonOpt();
418 f_in=fopen(infile,"r");
419 if (f_in==NULL) GError("Cannot open input file %s!\n",infile);
420 bool fullOvl=(mrna_filter && args.getOpt('A')==NULL);
421 GLineReader covreader(f_in);
422 GStr gseqname;
423
424 if (!mrna_filter) { //simply write the coverage islands and quit
425 while (!covreader.eof()) {
426 CovIsland cov(covreader, gseqname, minintron);
427 if (cov.start==0) continue; //no valid coverage parsed, try next line
428 fprintf(f_out, "%s\t%d\t%d\n", gseqname.chars(),cov.start, cov.end);
429 }//while reading cov islands
430 FWCLOSE(f_out);
431 FRCLOSE(f_in);
432 return 0;
433 }
434
435 GStr prevgseq;
436 /* mrna_filter method:
437 * when switching to a new contig
438 * *skip CovIslands until we find one overlapping the next locus on that contig
439 * *create a new GList<CovIsland> and add CovIslands while they are in range of current locus
440 * * when the next CovIsland falls outside the current locus, do the overlapCheck
441 * between the GList<CovIsland> and every mRNA in the locus
442 * * only output those mRNAs having overlaps for EACH exon with any of GList<CovIsland>
443 */
444 COvlCovList covlst; //CovIslands overlapping current locus
445 GList<GLocus>* ctgloci=NULL;
446 int locidx=0; //current locus being checked
447 while (!covreader.eof()) {
448 CovIsland cov(covreader, gseqname, minintron);
449 if (cov.start==0) continue; //no valid coverage parsed, try next line
450 if (prevgseq!=gseqname) { //contig change?
451 prevgseq=gseqname;
452 covlst.ovlCheck(fullOvl, f_out);
453 ctgloci=getCtgLoci(gseqname);
454 locidx=0;
455 }// contig change
456
457 //entry point when advancing locidx, to recheck the current cov
458 checkNextLocus:
459 if (ctgloci==NULL) {
460 continue; //no loci on this contig, try next
461 }
462 if (locidx>=ctgloci->Count()) {
463 //check out what we got so far
464 //covlst.ovlCheck(fullOvl, f_out);
465 ctgloci=NULL;
466 continue;
467 }
468 //ctgloci !=NULL, there are mRNAs for this contig
469 //if (covlst.Count()==0) { //no overlap found yet
470 GLocus* curloc=ctgloci->Get(locidx);
471 if (curloc->end<cov.start) {
472 //locus is behind current cov, has to catch up
473 //check what we have so far and move on
474 covlst.ovlCheck(fullOvl, f_out);
475 locidx++;
476 goto checkNextLocus; //same CovIsland
477 } //let
478 // -- here we have current locus->end >= cov.start
479 if (cov.end<curloc->start) {
480 //cov behind locus
481 //check what we have so far and move on
482 covlst.ovlCheck(fullOvl, f_out);
483 //try next covisland against the same locus
484 continue; //same locus, next covisland if any
485 }
486 //--here we have overlap of cov with current locus
487 covlst.addCov(cov,curloc);
488 covlst.ovlCheck(fullOvl, f_out);
489 locidx++;
490 goto checkNextLocus; //see if there are more loci overlapping this covisland
491 }//while reading wig lines
492 covlst.ovlCheck(fullOvl,f_out);
493 FWCLOSE(f_out);
494 FRCLOSE(f_in);
495 //GMessage(".. press Enter to exit ..\n");
496 //getchar();
497 }