ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/cuffcompare/cuffcompare.cpp
Revision: 125
Committed: Tue Dec 6 19:22:50 2011 UTC (7 years, 10 months ago) by gpertea
File size: 99749 byte(s)
Log Message:
minor stats formatting

Line File contents
1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #else
4 #define PACKAGE_VERSION "INTERNAL"
5 #define SVN_REVISION "SVN"
6 #endif
7
8 #include "GArgs.h"
9 #include <ctype.h>
10 #include <errno.h>
11 #include "gtf_tracking.h"
12
13 #ifdef HAVE_CONFIG_H
14 #include "update_check.h"
15 #endif
16
17 #define USAGE "Usage:\n\
18 cuffcompare [-r <reference_mrna.gtf>] [-R] [-T] [-V] [-s <seq_path>] \n\
19 [-o <outprefix>] [-p <cprefix>] \n\
20 {-i <input_gtf_list> | <input1.gtf> [<input2.gtf> .. <inputN.gtf>]}\n\
21 \n\
22 Cuffcompare provides classification, reference annotation mapping and various\n\
23 statistics for Cufflinks transfrags.\n\
24 Cuffcompare clusters and tracks transfrags across multiple samples, writing\n\
25 matching transcripts (intron chains) into <outprefix>.tracking, and a GTF\n\
26 file <outprefix>.combined.gtf containing a nonredundant set of transcripts \n\
27 across all input files (with a single representative transfrag chosen\n\
28 for each clique of matching transfrags across samples).\n\
29 \n\
30 Options:\n\
31 -i provide a text file with a list of Cufflinks GTF files to process instead\n\
32 of expecting them as command line arguments (useful when a large number\n\
33 of GTF files should be processed)\n\
34 \n\
35 -r a set of known mRNAs to use as a reference for assessing \n\
36 the accuracy of mRNAs or gene models given in <input.gtf>\n\
37 \n\
38 -R for -r option, reduce the set of reference transcripts to \n\
39 only those found to overlap any of the input loci\n\
40 -M discard (ignore) single-exon transfrags and reference transcripts\n\
41 -N discard (ignore) single-exon reference transcripts\n\
42 \n\
43 -s <seq_path> can be a multi-fasta file with all the genomic sequences or \n\
44 a directory containing multiple single-fasta files (one file per contig);\n\
45 lower case bases will be used to classify input transcripts as repeats\n\
46 \n\
47 -d max distance (range) for grouping transcript start sites (100)\n\
48 -p the name prefix to use for consensus transcripts in the \n\
49 <outprefix>.combined.gtf file (default: 'TCONS')\n\
50 -C include the \"contained\" transcripts in the .combined.gtf file\n\
51 -G generic GFF input file(s) (do not assume Cufflinks GTF)\n\
52 -T do not generate .tmap and .refmap files for each input file\n\
53 -V verbose processing mode (showing all GFF parsing warnings)\n\
54 "
55 bool debug=false;
56 bool perContigStats=false; // -S to enable stats for every single contig
57 bool generic_GFF=false; //-G, don't assume Cufflinks GTF as input
58 bool showContained=false; // -C
59 bool reduceRefs=false;
60 bool checkFasta=false;
61 bool tmapFiles=true;
62 bool only_spliced_refs=false;
63 int debugCounter=0;
64
65 int polyrun_range=2000; //polymerase run range 2KB
66 double scoreThreshold=0;
67 double exprThreshold=0;
68 char* cprefix=NULL;
69 FILE* ffasta=NULL; //genomic seq file
70 FILE *f_ref=NULL; //reference mRNA GFF, if provided
71 FILE* f_in=NULL; //sequentially, each input GFF file
72 FILE* f_out=NULL; //stdout if not provided
73 GFastaHandler gfasta;
74 int xlocnum=0;
75 int tsscl_num=0; //for tss cluster IDs
76 int protcl_num=0; //for "unique" protein IDs within TSS clusters
77 int tssDist=100;
78 //int total_tcons=0;
79 int total_xloci_alt=0;
80
81 void openfwrite(FILE* &f, GArgs& args, char opt) {
82 GStr s=args.getOpt(opt);
83 if (!s.is_empty()) {
84 if (s=='-')
85 f=stdout;
86 else {
87 f=fopen(s,"w");
88 if (f==NULL) GError("Error creating file: %s\n", s.chars());
89 }
90 }
91 }
92
93 //-- structure to keep track of data from multiple qry input files for a single genomic seq
94 class GSeqTrack {
95 int gseq_id;
96 public:
97 const char* gseq_name;
98 GList<GLocus>* rloci_f; //reference loci for this genomic sequence
99 GList<GLocus>* rloci_r;
100 GList<GXLocus> xloci_f; // extended super-loci across all qry datasets
101 GList<GXLocus> xloci_r; // extended super-loci across all qry datasets
102 GList<GXLocus> xloci_u; // extended super-loci across all qry datasets
103 GSeqData* qdata[MAX_QFILES]; //fixed order array with GSeqData for each qry input
104 //element in array is NULL if a qry file has no transcripts on this genomic sequence
105 int get_gseqid() { return gseq_id; }
106 GSeqTrack(int gid=-1):xloci_f(true,true,false),
107 xloci_r(true,true,false), xloci_u(true,true,false) {
108 gseq_id=gid;
109 if (gseq_id>=0) {
110 gseq_name=GffObj::names->gseqs.getName(gseq_id);
111 }
112 rloci_f=NULL;
113 rloci_r=NULL;
114 for (int i=0;i<MAX_QFILES;i++) qdata[i]=NULL;
115 }
116 bool operator==(GSeqTrack& d){
117 return (gseq_id==d.gseq_id);
118 }
119 bool operator>(GSeqTrack& d){
120 return (gseq_id>d.gseq_id);
121 }
122 bool operator<(GSeqTrack& d){
123 return (gseq_id<d.gseq_id);
124 }
125 };
126
127 char* getFastaFile(int gseq_id);
128
129 // ref globals
130 bool haveRefs=false; //true if a reference annotation (-r) is provide
131
132 GList<GSeqData> ref_data(true,true,true); //list of reference mRNAs and loci data for each genomic seq
133 //each locus will keep track of any superloci which includes it, formed during the analysis
134
135 void processLoci(GSeqData& seqdata, GSeqData* refdata=NULL, int qfidx=0);
136
137 void reportStats(FILE* fout, const char* setname, GSuperLocus& stotal,
138 GSeqData* seqdata=NULL, GSeqData* refdata=NULL);
139
140 GSeqData* getQryData(int gid, GList<GSeqData>& qdata);
141 void trackGData(int qcount, GList<GSeqTrack>& gtracks, GStr& fbasename, FILE** ftr, FILE** frs);
142
143 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
144 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
145
146 FILE* f_mintr=NULL; //missed ref introns
147
148 bool multiexon_only=false;
149 bool multiexonrefs_only=false;
150
151 GHash<GStr> refdescr;
152 void loadRefDescr(const char* fname);
153
154 GList<GStr> qryfiles(false,true,false);
155
156 //list of GSeqTrack data, sorted by gseq_id
157 GList<GSeqTrack> gseqtracks(true,true,true);
158 GSeqTrack* findGSeqTrack(int gsid);
159
160
161 int cmpGTrackByName(const pointer p1, const pointer p2) {
162 return strcmp(((GSeqTrack*)p1)->gseq_name, ((GSeqTrack*)p2)->gseq_name);
163 }
164
165
166 void show_usage() {
167 GMessage("cuffcompare v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
168 GMessage( "-----------------------------\n");
169 GMessage("%s\n", USAGE);
170 }
171
172 int main(int argc, char * const argv[]) {
173 GArgs args(argc, argv, "XDTMNVGSCKRLhp:c:d:s:i:n:r:o:");
174 int e;
175 if ((e=args.isError())>0) {
176 show_usage();
177 GMessage("Invalid argument: %s\n", argv[e]);
178 exit(1);
179 }
180 if (args.getOpt('h')!=NULL){
181 show_usage();
182 exit(1);
183 }
184 showContained=(args.getOpt('C')!=NULL);
185 debug=(args.getOpt('D')!=NULL);
186 tmapFiles=(args.getOpt('T')==NULL);
187 multiexon_only=(args.getOpt('M')!=NULL);
188 multiexonrefs_only=(args.getOpt('N')!=NULL);
189 perContigStats=(args.getOpt('S')!=NULL);
190 checkFasta=(args.getOpt('K')!=NULL);
191 gtf_tracking_verbose=((args.getOpt('V')!=NULL) || debug);
192 FILE* finlst=NULL;
193 GStr s=args.getOpt('i');
194 if (!s.is_empty()) {
195 if (s=='-')
196 finlst=stdin;
197 else {
198 finlst=fopen(s,"r");
199 if (finlst==NULL) GError("Error opening file: %s\n", s.chars());
200 }
201 }
202 int numqryfiles=0;
203 if (finlst) {
204 GLineReader* lr=new GLineReader(finlst);
205 char* l=NULL;
206 while ((l=lr->getLine())!=NULL) {
207 if (strlen(l)<2 || startsWith(l,"# ") || isspace(*l)) continue;
208 if (!fileExists(l)) GError("Error: cannot locate input file: %s\n", l);
209 qryfiles.Add(new GStr(l));
210 }
211 delete lr;
212 //if (qryfiles.Count()>10)
213 gtf_tracking_largeScale=true;
214 }
215 else {
216 numqryfiles=args.startNonOpt();
217 char *infile=NULL;
218 if (numqryfiles>0) {
219 while ((infile=args.nextNonOpt())!=NULL) {
220 if (!fileExists(infile)) GError("Error: cannot locate input file: %s\n", infile);
221 qryfiles.Add(new GStr(infile));
222 } //for each argument
223 }
224 }
225 numqryfiles=qryfiles.Count();
226 if (numqryfiles==0) {
227 show_usage();
228 exit(1);
229 }
230 if (numqryfiles>MAX_QFILES) {
231 GMessage("Error: too many input files (limit set to %d at compile time)\n",MAX_QFILES);
232 GMessage("(if you need to raise this limit set a new value for\nMAX_QFILES in gtf_tracking.h and recompile)\n");
233 exit(0x5000);
234 }
235 #ifdef HAVE_CONFIG_H
236 check_version(PACKAGE_VERSION);
237 #endif
238 gfasta.init(args.getOpt('s'));
239 // determine if -s points to a multi-fasta file or a directory
240 s=args.getOpt('c');
241 if (!s.is_empty()) scoreThreshold=s.asReal();
242 s=args.getOpt('p');
243 if (!s.is_empty()) cprefix=Gstrdup(s.chars());
244 else cprefix=Gstrdup("TCONS");
245 s=args.getOpt('e');
246 if (!s.is_empty()) exprThreshold=s.asReal();
247 s=args.getOpt('d');
248 if (!s.is_empty()) {
249 tssDist=s.asInt();
250 }
251
252 s=args.getOpt('n');
253 if (!s.is_empty()) loadRefDescr(s.chars());
254 s=args.getOpt('r');
255 if (!s.is_empty()) {
256 f_ref=fopen(s,"r");
257 if (f_ref==NULL) GError("Error opening reference gff: %s\n",s.chars());
258 haveRefs=true;
259 if (gtf_tracking_verbose) GMessage("Loading reference transcripts..\n");
260 read_mRNAs(f_ref, ref_data, &ref_data, true, -1, s.chars(), (multiexonrefs_only || multiexon_only));
261 haveRefs=(ref_data.Count()>0);
262 reduceRefs=(args.getOpt('R')!=NULL);
263 if (gtf_tracking_verbose) GMessage("..reference annotation loaded\n");
264 }
265 bool discard_redundant=true; //discard redundant input transfrags
266 generic_GFF=args.getOpt('G');
267 if (generic_GFF) discard_redundant=false; //generic GTF, don't try to discard "redundant" transcripts
268 //if a full pathname is given
269 //the other common output files will still be created in the current directory:
270 // .loci, .tracking, .stats
271 GStr outbasename; //include path, if provided
272 GStr outprefix; //without path and/or extension
273 GStr outstats=args.getOpt('o');
274 if (outstats.is_empty() || outstats=="-") {
275 outstats="cuffcmp";
276 }
277 outbasename=outstats;
278 GStr outext(getFileExt(outstats.chars()));
279 if (outext.is_empty()) {
280 outext="stats";
281 outstats.append(".stats");
282 outbasename=outstats;
283 }
284 else outext.lower();
285 if (outext=="txt" || outext=="out" || outext=="stats" || outext=="summary") {
286 outbasename.cut(outbasename.length()-outext.length()-1);
287 }
288
289 outprefix=outbasename;
290 int di=outprefix.rindex(CHPATHSEP);
291 if (di>=0) outprefix.cut(0,di+1);
292
293 if (debug) { //create a few more files potentially useful for debugging
294 s=outbasename;
295 s.append(".missed_introns.gtf");
296 f_mintr=fopen(s.chars(),"w");
297 if (f_mintr==NULL) GError("Error creating file %s!\n",s.chars());
298 /*
299 s=outbasename;
300 s.append(".noTP_introns.gtf");
301 f_nintr=fopen(s.chars(),"w");
302 s=outbasename;
303 s.append(".wrong_Qintrons.gtf");
304 f_qintr=fopen(s.chars(),"w");
305 */
306 }
307
308 f_out=fopen(outstats, "w");
309 if (f_out==NULL) GError("Error creating output file %s!\n", outstats.chars());
310 if (gtf_tracking_verbose) GMessage("Prefix for output files: %s\n", outprefix.chars());
311 fprintf(f_out, "# Cuffcompare v%s | Command line was:\n#", PACKAGE_VERSION);
312 for (int i=0;i<argc-1;i++)
313 fprintf(f_out, "%s ", argv[i]);
314 fprintf(f_out, "%s\n#\n", argv[argc-1]);
315 //int qfileno=0;
316 GList<GSeqData>** qrysdata=NULL;
317 FILE** tfiles=NULL;
318 FILE** rtfiles=NULL;
319 GMALLOC(qrysdata, numqryfiles*sizeof(GList<GSeqData>*));
320 if (tmapFiles) {
321 GMALLOC(tfiles, numqryfiles*sizeof(FILE*));
322 if (haveRefs) {
323 GMALLOC(rtfiles, numqryfiles*sizeof(FILE*));
324 }
325 }
326 for (int fi=0;fi<qryfiles.Count();fi++) {
327 GStr in_file(qryfiles[fi]->chars());
328 GStr infname(getFileName(qryfiles[fi]->chars())); //file name only
329 GStr indir(qryfiles[fi]->chars());
330 di=indir.rindex(CHPATHSEP);
331 if (di>=0) indir.cut(di+1); //directory path for this input file
332 else indir=""; //current directory
333
334 if (debug || (gtf_tracking_verbose && !gtf_tracking_largeScale))
335 GMessage("Processing qfile #%d: %s\n",fi+1, in_file.chars());
336 if (in_file=="-") { f_in=stdin; in_file="stdin"; }
337 else {
338 f_in=fopen(in_file.chars(),"r");
339 if (f_in==NULL)
340 GError("Cannot open input file %s!\n",in_file.chars());
341 }
342 //f_in is the query gff file to process
343
344 GStr sbase(indir);
345 sbase.append(outprefix);
346 sbase.append(".");
347 sbase.append(infname);
348 if (tmapFiles) {
349 //-- we should keep the infname path, otherwise the remaining file names
350 // may be the same and clobber each other
351 s=sbase;
352 s.append(".tmap");
353 tfiles[fi]=fopen(s.chars(),"w");
354 if (tfiles[fi]==NULL)
355 GError("Error creating file '%s'!\n",s.chars());
356 fprintf(tfiles[fi],"ref_gene_id\tref_id\tclass_code\tcuff_gene_id\tcuff_id\tFMI\tFPKM\tFPKM_conf_lo\tFPKM_conf_hi\tcov\tlen\tmajor_iso_id\tref_match_len\n");
357 if (haveRefs) {
358 s=sbase;
359 s.append(".refmap");
360 rtfiles[fi]=fopen(s.chars(),"w");
361 if (rtfiles[fi]==NULL)
362 GError("Error creating file '%s'!\n",s.chars());
363 fprintf(rtfiles[fi],"ref_gene_id\tref_id\tclass_code\tcuff_id_list\n");
364 }
365 }
366
367 GList<GSeqData>* pdata=new GList<GSeqData>(true,true,true);
368 qrysdata[fi]=pdata;
369 if (gtf_tracking_verbose) GMessage("Loading transcripts from %s..\n",in_file.chars());
370 read_mRNAs(f_in, *pdata, &ref_data, discard_redundant, fi, in_file.chars(), multiexon_only);
371 GSuperLocus gstats;
372 GFaSeqGet *faseq=NULL;
373 for (int g=0;g<pdata->Count();g++) { //for each seqdata related to a genomic sequence
374 int gsid=pdata->Get(g)->get_gseqid();
375 GSeqData* refdata=getRefData(gsid, ref_data);//ref data for this contig
376 if (!gtf_tracking_largeScale)
377 processLoci(*(pdata->Get(g)), refdata, fi);
378 GSeqTrack* seqtrack=findGSeqTrack(gsid); //this will add a gseqtrack if it doesn't exist
379 // for gsid
380 if (refdata!=NULL) {
381 seqtrack->rloci_f=&(refdata->loci_f);
382 seqtrack->rloci_r=&(refdata->loci_r);
383 }
384 seqtrack->qdata[fi]=pdata->Get(g);
385 //will only gather data into stats if perContig==false
386 if (!gtf_tracking_largeScale) reportStats(f_out, getGSeqName(gsid), gstats,
387 pdata->Get(g), refdata);
388 if (faseq!=NULL) delete faseq;
389 } //for each genomic sequence data
390 //there could be genomic sequences with no qry transcripts
391 //but with reference transcripts
392 if (haveRefs && !reduceRefs && !gtf_tracking_largeScale) {
393 for (int r=0;r<ref_data.Count();r++) {
394 GSeqData* refdata=ref_data[r];
395 int gsid=refdata->get_gseqid();
396 if (getQryData(gsid, *pdata)==NULL) {
397 reportStats(f_out, getGSeqName(gsid), gstats, NULL, refdata);
398 }//completely missed all refdata on this contig
399 }
400 }
401 //now report the summary:
402 if (!gtf_tracking_largeScale) reportStats(f_out, in_file.chars(), gstats);
403 if (f_in!=stdin) fclose(f_in);
404 //qfileno++;
405 }//for each input file
406 if (f_mintr!=NULL) fclose(f_mintr);
407 gseqtracks.setSorted(&cmpGTrackByName);
408 if (gtf_tracking_verbose) GMessage("Tracking transcripts across %d query files..\n", numqryfiles);
409 trackGData(numqryfiles, gseqtracks, outbasename, tfiles, rtfiles);
410 fprintf(f_out, "\n Total union super-loci across all input datasets: %d \n", xlocnum);
411 if (numqryfiles>1) {
412 fprintf(f_out, " (%d multi-transcript, ~%.1f transcripts per locus)\n",
413 total_xloci_alt, ((double)(GXConsensus::count))/xlocnum);
414 }
415 if (gtf_tracking_verbose) GMessage("Cleaning up..\n");
416 GFREE(cprefix);
417 // clean up
418 for (int i=0;i<numqryfiles;i++) {
419 delete qrysdata[i];
420 }
421 GFREE(qrysdata);
422 GFREE(tfiles);
423 GFREE(rtfiles);
424 gseqtracks.Clear();
425 FRCLOSE(f_ref);
426 FWCLOSE(f_out);
427 if (gtf_tracking_verbose) GMessage("Done.\n");
428 ref_data.Clear();
429 //getchar();
430 } //main ends here
431
432 void show_exons(FILE* f, GffObj& m) {
433 fprintf(f,"(");
434 int imax=m.exons.Count()-1;
435 for (int i=0;i<=imax;i++) {
436 if (i==imax) fprintf(f,"%d-%d)",m.exons[i]->start, m.exons[i]->end);
437 else fprintf(f,"%d-%d,",m.exons[i]->start, m.exons[i]->end);
438 }
439 }
440
441 bool ichainMatch(GffObj* t, GffObj* r, bool& exonMatch, int fuzz=0) {
442 //t's intron chain is considered matching to reference r
443 //if r chain is the same or a subset of t's chain
444 exonMatch=false;
445 int imax=r->exons.Count()-1;
446 int jmax=t->exons.Count()-1;
447 if (imax==0 || jmax==0) { //single-exon mRNAs
448 if (imax!=jmax) return false;
449 exonMatch=r->exons[0]->coordMatch(t->exons[0],fuzz);
450 /*if (exonMatch) return true;
451 else return (r->exons[0]->start>=t->exons[0]->start &&
452 r->exons[0]->end<=t->exons[0]->end);*/
453 return exonMatch;
454 }
455
456 if (r->exons[imax]->start<t->exons[0]->end ||
457 t->exons[jmax]->start<r->exons[0]->end ) //intron chains do not overlap at all
458 {
459 return false;
460 }
461 //check intron overlaps
462 int i=1;
463 int j=1;
464 bool exmism=false; //any mismatch
465 while (i<=imax && j<=jmax) {
466 uint rstart=r->exons[i-1]->end;
467 uint rend=r->exons[i]->start;
468 uint tstart=t->exons[j-1]->end;
469 uint tend=t->exons[j]->start;
470 if (tend<rstart) { j++; continue; }
471 if (rend<tstart) { i++; continue; }
472 break; //here we have an intron overlap
473 }
474 if (i>1 || i>imax || j>jmax) {
475 return false; //no intron overlaps found at all
476 //or first intron of ref not overlapping
477 }
478 //from now on we expect intron matches up to imax
479 if (i!=j || imax!=jmax) { exmism=true; if (fuzz==0) return false; }
480 for (;i<=imax && j<=jmax;i++,j++) {
481 if (abs((int)(r->exons[i-1]->end-t->exons[j-1]->end))>fuzz ||
482 abs((int)(r->exons[i]->start-t->exons[j]->start))>fuzz) {
483 return false; //just run away
484 }
485 }
486 //if we made it here, we have matching intron chains up to MIN(imax,jmax)
487 if (imax!=jmax) {
488 exmism=true;
489 if (jmax<imax) return false; // qry ichain included in ref ichain
490 else //ref ichain included in qry ichain
491 if (fuzz==0) return false;
492 }
493 if (exmism) {
494 //exonMatch=false; -- it's default
495 return true;
496 }
497 exonMatch = ( abs((int)(r->exons[0]->start-t->exons[0]->start))<=fuzz &&
498 abs((int)(r->exons[imax]->end-t->exons[jmax]->end))<=fuzz );
499 return true;
500 }
501
502
503 void compareLoci2R(GList<GLocus>& loci, GList<GSuperLocus>& cmpdata,
504 GList<GLocus>& refloci, int qfidx) {
505 cmpdata.Clear();//a new list of superloci will be built
506 if (refloci.Count()==0 || loci.Count()==0) return;
507 //reset cmpovl and stats
508 for (int i=0;i<refloci.Count();i++) refloci[i]->creset();
509 //find loci with overlapping refloci
510 //and store cmpovl links both ways for ALL loci and refloci on this strand
511 for (int l=0;l<loci.Count();l++) {
512 GLocus* locus=loci[l];
513 locus->creset();
514 for (int j=0;j<refloci.Count();j++) {
515 //if (refloci[j]->start>locus->end) break;
516 if (refloci[j]->start>locus->end) {
517 if (refloci[j]->start-locus->end > GFF_MAX_LOCUS) break;
518 continue;
519 }
520 if (locus->start>refloci[j]->end) continue;
521 // then we must have overlap here:
522 //if (locus->overlap(refloci[j]->start, refloci[j]->end)) {
523 locus->cmpovl.Add(refloci[j]);
524 refloci[j]->cmpovl.Add(locus);
525 //}
526 }//for each reflocus
527 } //for each locus
528
529 //create corresponding "superloci" from transitive overlapping between loci and ref
530 for (int l=0;l<loci.Count();l++) {
531 if (loci[l]->v!=0) continue; //skip, already processed
532 GSuperLocus* super=new GSuperLocus();
533 super->qfidx=qfidx;
534 //try to find all other loci connected to this locus loci[l]
535 GList<GLocus> lstack(false,false,false); //traversal stack
536 lstack.Push(loci[l]);
537 while (lstack.Count()>0) {
538 GLocus* locus=lstack.Pop();
539 if (locus->v!=0 || locus->cmpovl.Count()==0) continue;
540 super->addQlocus(*locus);
541 locus->v=1;
542 for (int r=0;r<locus->cmpovl.Count();r++) {
543 GLocus* rloc=locus->cmpovl[r];
544 if (rloc->v==0) {
545 super->addRlocus(*rloc);
546 rloc->v=1;
547 for (int ll=0;ll<rloc->cmpovl.Count();ll++) {
548 if (rloc->cmpovl[ll]->v==0) lstack.Push(rloc->cmpovl[ll]);
549 }
550 }
551 } //for each overlapping reflocus
552 } //while linking
553
554 if (super->qloci.Count()==0) {
555 delete super;
556 continue; //try next query loci
557 }
558 //--here we have a "superlocus" region data on both qry and ref
559 // -- analyze mexons matching (base level metrics)
560 cmpdata.Add(super);
561 //make each ref locus keep track of all superloci containing it
562 for (int rl=0;rl<super->rloci.Count();rl++) {
563 super->rloci[rl]->superlst->Add(super);
564 }
565 for (int x=0;x<super->rmexons.Count();x++) {
566 super->rbases_all += super->rmexons[x].end-super->rmexons[x].start+1;
567 }
568 for (int x=0;x<super->qmexons.Count();x++) {
569 super->qbases_all += super->qmexons[x].end-super->qmexons[x].start+1;
570 }
571 int i=0; //locus mexons
572 int j=0; //refmexons
573 while (i<super->qmexons.Count() && j<super->rmexons.Count()) {
574 uint istart=super->qmexons[i].start;
575 uint iend=super->qmexons[i].end;
576 uint jstart=super->rmexons[j].start;
577 uint jend=super->rmexons[j].end;
578 if (iend<jstart) { i++; continue; }
579 if (jend<istart) { j++; continue; }
580 //v--overlap here:
581 uint ovlstart = jstart>istart? jstart : istart;
582 uint ovlend = iend<jend ? iend : jend;
583 uint ovlen=ovlend-ovlstart+1;
584 super->baseTP+=ovlen; //qbases_cov
585 if (iend<jend) i++;
586 else j++;
587 } //while mexons ovl search
588 /* if (reduceRefs) {
589 super->baseFP=super->qbases_all-super->baseTP;
590 super->baseFN=super->rbases_all-super->baseTP;
591 }
592 */
593 // -- exon level comparison:
594 int* qexovl; //flags for qry exons with ref overlap
595 GCALLOC(qexovl,super->quexons.Count()*sizeof(int));
596 int* rexovl; //flags for ref exons with qry overlap
597 GCALLOC(rexovl,super->ruexons.Count()*sizeof(int));
598 for (int i=0;i<super->quexons.Count();i++) {
599 uint istart=super->quexons[i].start;
600 uint iend=super->quexons[i].end;
601 for (int j=0;j<super->ruexons.Count();j++) {
602 uint jstart=super->ruexons[j].start;
603 uint jend=super->ruexons[j].end;
604 if (iend<jstart) break;
605 if (jend<istart) continue;
606 //--- overlap here between quexons[i] and ruexons[j]
607 qexovl[i]++;
608 rexovl[j]++;
609 if (super->quexons[i].coordMatch(&super->ruexons[j],5)) {
610 super->exonATP++;
611 if (super->quexons[i].coordMatch(&super->ruexons[j])) {
612 super->exonTP++;
613 } //exact match
614 } //fuzzy match
615 } //ref uexon loop
616 } //qry uexon loop
617 super->m_exons=0; //ref exons with no query overlap
618 super->w_exons=0; //qry exons with no ref overlap
619 for (int x=0;x<super->quexons.Count();x++)
620 if (qexovl[x]==0) super->w_exons++;
621 for (int x=0;x<super->ruexons.Count();x++)
622 if (rexovl[x]==0) super->m_exons++;
623 GFREE(rexovl);
624 GFREE(qexovl);
625
626 //-- intron level stats:
627 //query:
628 int* qinovl=NULL; //flags for qry introns with at least some ref overlap
629 int* qtpinovl=NULL; //flags for qry introns with perfect ref overlap
630 if (super->qintrons.Count()>0) {
631 GCALLOC(qinovl,super->qintrons.Count()*sizeof(int));
632 GCALLOC(qtpinovl,super->qintrons.Count()*sizeof(int));
633 }
634 //-- reference:
635 int* rinovl=NULL; //flags for ref introns with qry overlap
636 int* rtpinovl=NULL; //ref introns with perfect qry intron overlap
637 if (super->rintrons.Count()>0) {
638 GCALLOC(rinovl,super->rintrons.Count()*sizeof(int));
639 GCALLOC(rtpinovl,super->rintrons.Count()*sizeof(int));
640 }
641 for (int i=0;i<super->qintrons.Count();i++) {
642 uint istart=super->qintrons[i].start;
643 uint iend=super->qintrons[i].end;
644 for (int j=0;j<super->rintrons.Count();j++) {
645 uint jstart=super->rintrons[j].start;
646 uint jend=super->rintrons[j].end;
647 if (iend<jstart) break;
648 if (jend<istart) continue;
649 //--- overlap here between qintrons[i] and rintrons[j]
650 qinovl[i]++;
651 rinovl[j]++;
652 if (super->qintrons[i].coordMatch(&super->rintrons[j],5)) {
653 super->intronATP++;
654 if (super->qintrons[i].coordMatch(&super->rintrons[j])) {
655 super->intronTP++;
656 qtpinovl[i]++;
657 rtpinovl[j]++;
658 } //exact match
659 } //fuzzy match
660 } //ref intron loop
661 } //qry intron loop
662 super->m_introns=0; //ref introns with no query overlap
663 super->w_introns=0; //qry introns with no ref overlap
664 for (int x=0;x<super->qintrons.Count();x++) {
665 if (qinovl[x]==0) { super->w_introns++;
666 //qry introns with no ref intron overlap AT ALL
667 super->i_qwrong.Add(super->qintrons[x]);
668 }
669 else
670 if (qtpinovl[x]==0) {
671 super->i_qnotp.Add(super->qintrons[x]);
672 }
673 }
674 for (int x=0;x<super->rintrons.Count();x++) {
675 if (rinovl[x]==0) { //no intron overlap at all
676 super->m_introns++;
677 super->i_missed.Add(super->rintrons[x]);
678 }
679 else if (rtpinovl[x]==0) { //no perfect intron match
680 super->i_notp.Add(super->rintrons[x]);
681 }
682 }
683 GFREE(rinovl);
684 GFREE(rtpinovl);
685 GFREE(qinovl);
686 GFREE(qtpinovl);
687
688 // ---- now intron-chain and transcript comparison
689 for (int i=0;i<super->qmrnas.Count();i++) {
690 uint istart=super->qmrnas[i]->exons.First()->start;
691 uint iend=super->qmrnas[i]->exons.Last()->end;
692 for (int j=0;j<super->rmrnas.Count();j++) {
693 uint jstart=super->rmrnas[j]->exons.First()->start;
694 uint jend=super->rmrnas[j]->exons.Last()->end;
695 if (iend<jstart) break;
696 if (jend<istart) continue;
697 //--- overlap here --
698 bool exonMatch=false;
699 if ((super->qmrnas[i]->udata & 3) > 1) continue; //already counted a ichainTP for this qry
700 if (ichainMatch(super->qmrnas[i],super->rmrnas[j],exonMatch, 5)) { //fuzzy match
701 GLocus* qlocus=((CTData*)super->qmrnas[i]->uptr)->locus;
702 GLocus* rlocus=((CTData*)super->rmrnas[j]->uptr)->locus;
703 if (super->qmrnas[i]->exons.Count()>1) {
704 super->ichainATP++;
705 qlocus->ichainATP++;
706 rlocus->ichainATP++;
707 }
708 if (exonMatch) {
709 super->mrnaATP++;
710 qlocus->mrnaATP++;
711 rlocus->mrnaATP++;
712 }
713 if (ichainMatch(super->qmrnas[i],super->rmrnas[j],exonMatch)) { //exact match
714 if (super->qmrnas[i]->exons.Count()>1) {
715 super->qmrnas[i]->udata|=1;
716 super->ichainTP++;
717 qlocus->ichainTP++;
718 rlocus->ichainTP++;
719 }
720 if (exonMatch) {
721 super->qmrnas[i]->udata|=2;
722 super->mrnaTP++;
723 qlocus->mrnaTP++;
724 rlocus->mrnaTP++;
725 }
726 } //exact match
727 } //fuzzy match
728 } //ref mrna loop
729 } //qry mrna loop
730 for (int ql=0;ql<super->qloci.Count();ql++) {
731 if (super->qloci[ql]->ichainTP+super->qloci[ql]->mrnaTP >0 )
732 super->locusQTP++;
733 if (super->qloci[ql]->ichainATP+super->qloci[ql]->mrnaATP>0)
734 super->locusAQTP++;
735 }
736 for (int rl=0;rl<super->rloci.Count();rl++) {
737 if (super->rloci[rl]->ichainTP+super->rloci[rl]->mrnaTP >0 )
738 super->locusTP++;
739 if (super->rloci[rl]->ichainATP+super->rloci[rl]->mrnaATP>0)
740 super->locusATP++;
741 }
742
743 }//for each unlinked locus
744
745 }
746
747 //look for qry data for a specific genomic sequence
748 GSeqData* getQryData(int gid, GList<GSeqData>& qdata) {
749 int qi=-1;
750 GSeqData f(gid);
751 GSeqData* q=NULL;
752 if (qdata.Found(&f,qi))
753 q=qdata[qi];
754 return q;
755 }
756
757 const char* findDescr(GffObj* gfobj) {
758 if (refdescr.Count()==0) return NULL;
759 GStr* s=refdescr.Find(gfobj->getID());
760 if (s==NULL) {
761 s=refdescr.Find(gfobj->getGeneName());
762 if (s==NULL) s=refdescr.Find(gfobj->getGeneID());
763 }
764 if (s!=NULL)
765 return s->chars();
766 return NULL;
767 }
768
769 const char* getGeneID(GffObj* gfobj) {
770 //returns anything that might resemble a gene identifier for this transcript
771 //or, if everything fails, returns the transcript ID
772 const char* s=gfobj->getGeneName();
773 if (s) return s;
774 if ((s=gfobj->getGeneID())!=NULL) return s;
775 if ((s=gfobj->getAttr("Name"))!=NULL) return s;
776 return gfobj->getID();
777 }
778
779 const char* getGeneID(GffObj& gfobj) {
780 return getGeneID(&gfobj);
781 }
782
783 void writeLoci(FILE* f, GList<GLocus> & loci) {
784 for (int l=0;l<loci.Count();l++) {
785 GLocus& loc=*(loci[l]);
786 fprintf(f,"%s\t%s[%c]%d-%d\t", loc.mrna_maxcov->getID(),
787 loc.mrna_maxcov->getGSeqName(),
788 loc.mrna_maxcov->strand, loc.start,loc.end);
789 //now print all transcripts in this locus, comma delimited
790 int printfd=0;
791 for (int i=0;i<loc.mrnas.Count();i++) {
792 if (loc.mrnas[i]==loc.mrna_maxcov) continue;
793 if (printfd==0) fprintf(f,"%s",loc.mrnas[i]->getID());
794 else fprintf(f,",%s",loc.mrnas[i]->getID());
795 printfd++;
796 }
797 const char* rdescr=findDescr(loc.mrna_maxcov);
798 if (rdescr==NULL) fprintf(f,"\t\n");
799 else fprintf(f,"\t%s\n",rdescr);
800 }
801 }
802
803 void printXQ1(FILE* f, int qidx, GList<GLocus>& qloci) {
804 int printfd=0;
805 //print
806 for (int i=0;i<qloci.Count();i++) {
807 if (qloci[i]->qfidx!=qidx) continue;
808 for (int j=0;j<qloci[i]->mrnas.Count();j++) {
809 if (printfd==0) fprintf(f,"%s",qloci[i]->mrnas[j]->getID());
810 else fprintf(f,",%s",qloci[i]->mrnas[j]->getID());
811 printfd++;
812 }
813 }
814 if (printfd==0) fprintf(f,"-");
815 }
816
817 void numXLoci(GList<GXLocus>& xloci, int& last_id) {
818 for (int l=0;l<xloci.Count();l++) {
819 if (xloci[l]->qloci.Count()==0) continue; //we never print ref-only xloci
820 last_id++;
821 xloci[l]->id=last_id;
822 }
823 }
824
825
826 class GProtCl {
827 public:
828 GList<GXConsensus> protcl;
829 GProtCl(GXConsensus* c=NULL):protcl(true,false,false) {
830 if (c!=NULL)
831 protcl.Add(c);
832 }
833 bool add_Pcons(GXConsensus* c) {
834 if (c==NULL || c->aalen==0) return false;
835 if (protcl.Count()==0) {
836 protcl.Add(c);
837 return true;
838 }
839 if (protcl[0]->aalen!=c->aalen) return false;
840 if (strcmp(protcl[0]->aa,c->aa)!=0) return false;
841 protcl.Add(c);
842 return true;
843 }
844
845 void addMerge(GProtCl& pcl, GXConsensus* pclnk) {
846 for (int i=0;i<pcl.protcl.Count();i++) {
847 if (pcl.protcl[i]!=pclnk) {
848 protcl.Add(pcl.protcl[i]);
849 }
850 }
851 }
852
853 int aalen() {
854 if (protcl.Count()==0) return 0;
855 return protcl[0]->aalen;
856 }
857 bool operator==(GProtCl& cl) {
858 return this==&cl;
859 }
860 bool operator>(GProtCl& cl) {
861 return (this>&cl);
862 }
863 bool operator<(GProtCl& cl) {
864 return (this<&cl);
865 }
866 };
867
868 class GTssCl:public GSeg { //experiment cluster of ref loci (isoforms)
869 public:
870 uint fstart; //lowest coordinate of the first exon
871 uint fend; //highest coordinate of the first exon
872 GList<GXConsensus> tsscl;
873 GTssCl(GXConsensus* c=NULL):tsscl(true,false,false) {
874 start=0;
875 end=0;
876 fstart=0;
877 fend=0;
878 if (c!=NULL) addFirst(c);
879 }
880
881 void addFirst(GXConsensus* c) {
882 tsscl.Add(c);
883 start=c->start;
884 end=c->end;
885 GffExon* fexon=(c->tcons->strand=='-') ? c->tcons->exons.Last() :
886 c->tcons->exons.First();
887 fstart=fexon->start;
888 fend=fexon->end;
889 }
890 bool add_Xcons(GXConsensus* c) {
891 if (tsscl.Count()==0) {
892 addFirst(c);
893 return true;
894 }
895 //check if it can be added to existing xconsensi
896 uint nfend=0;
897 uint nfstart=0;
898 /*
899 if (tsscl.Get(0)->tcons->getGeneID()!=NULL &&
900 c->tcons->getGeneID()!=NULL &&
901 strcmp(tsscl.Get(0)->tcons->getGeneID(), c->tcons->getGeneID()))
902 //don't tss cluster if they don't have the same GeneID (?)
903 //FIXME: we might not want this if input files are not from Cufflinks
904 // and they could simply lack proper GeneID
905 return false;
906 */
907 if (c->tcons->strand=='-') {
908 //no, the first exons don't have to overlap
909 //if (!c->tcons->exons.Last()->overlap(fstart,fend)) return false;
910 nfstart=c->tcons->exons.Last()->start;
911 nfend=c->tcons->exons.Last()->end;
912 //proximity check for the transcript start:
913 if (nfend>fend+tssDist || fend>nfend+tssDist)
914 return false;
915 }
916 else {
917 //if (!c->tcons->exons.First()->overlap(fstart,fend)) return false;
918 nfstart=c->tcons->exons.First()->start;
919 nfend=c->tcons->exons.First()->end;
920 if (nfstart>fstart+tssDist || fstart>nfstart+tssDist)
921 return false;
922 }
923 // -- if we are here, we can add to tss cluster
924
925 tsscl.Add(c);
926 if (fstart>nfstart) fstart=nfstart;
927 if (fend<nfend) fend=nfend;
928 if (start>c->start) start=c->start;
929 if (end<c->end) end=c->end;
930 return true;
931 }
932
933 void addMerge(GTssCl& cl, GXConsensus* clnk) {
934 for (int i=0;i<cl.tsscl.Count();i++) {
935 if (cl.tsscl[i]==clnk) continue;
936 tsscl.Add(cl.tsscl[i]);
937 }
938 if (fstart>cl.fstart) fstart=cl.fstart;
939 if (fend<cl.fend) fend=cl.fend;
940 if (start>cl.start) start=cl.start;
941 if (end<cl.end) end=cl.end;
942 }
943 };
944 /*
945 class IntArray { //two dimensional int array
946 int* mem;
947 int xsize;
948 int ysize;
949 public:
950 IntArray(int xlen, int ylen) {
951 xsize=xlen;
952 ysize=ylen;
953 GMALLOC(mem, xsize*ysize*sizeof(int));
954 }
955 ~IntArray() {
956 GFREE(mem);
957 }
958 int& data(int x, int y) {
959 return mem[y*xsize+x];
960 }
961 };
962
963 int aa_diff(GXConsensus* c1, GXConsensus* c2) {
964 int diflen=abs(c1->aalen-c2->aalen);
965 if (diflen>=6) return diflen;
966 //obvious case: same CDS
967 if (diflen==0 && strcmp(c1->aa, c2->aa)==0) return 0;
968 //simple edit distance calculation
969 IntArray dist(c1->aalen+1, c2->aalen+1);
970 for (int i=0;i<=c1->aalen;i++) {
971 dist.data(i,0) = i;
972 }
973 for (int j = 0; j <= c2->aalen; j++) {
974 dist.data(0,j) = j;
975 }
976 for (int i = 1; i <= c1->aalen; i++)
977 for (int j = 1; j <= c2->aalen; j++) {
978 dist.data(i,j) = GMIN3( dist.data(i-1,j)+1,
979 dist.data(i,j-1)+1,
980 dist.data(i-1,j-1)+((c1->aa[i-1] == c2->aa[j-1]) ? 0 : 1) );
981 }
982 int r=dist.data(c1->aalen,c2->aalen);
983 return r;
984 }
985 */
986 void printConsGTF(FILE* fc, GXConsensus* xc, int xlocnum) {
987 for (int i=0;i<xc->tcons->exons.Count();i++) {
988 fprintf(fc,
989 "%s\t%s\texon\t%d\t%d\t.\t%c\t.\tgene_id \"XLOC_%06d\"; transcript_id \"%s_%08d\"; exon_number \"%d\";",
990 xc->tcons->getGSeqName(),xc->tcons->getTrackName(),xc->tcons->exons[i]->start, xc->tcons->exons[i]->end, xc->tcons->strand,
991 xlocnum, cprefix, xc->id, i+1);
992 //if (i==0) {
993 const char* gene_name=NULL;
994 if (xc->ref) {
995 gene_name=xc->ref->getGeneName();
996 if (gene_name==NULL) gene_name=xc->ref->getGeneID();
997 if (gene_name) {
998 fprintf (fc, " gene_name \"%s\";", gene_name);
999 }
1000 }
1001 if (!haveRefs) {
1002 if (gene_name==NULL && xc->tcons->getGeneName())
1003 fprintf (fc, " gene_name \"%s\";", xc->tcons->getGeneName());
1004 char* s=xc->tcons->getAttr("nearest_ref", true);
1005 if (s) fprintf(fc, " nearest_ref \"%s\";",s);
1006 s=xc->tcons->getAttr("class_code", true);
1007 if (s) fprintf(fc, " class_code \"%s\";", s);
1008 }
1009 fprintf(fc, " oId \"%s\";",xc->tcons->getID());
1010 if (xc->contained) {
1011 fprintf(fc, " contained_in \"%s_%08d\";", cprefix, xc->contained->id);
1012 }
1013 if (haveRefs) {
1014 if (xc->ref!=NULL)
1015 fprintf(fc, " nearest_ref \"%s\";",xc->ref->getID());
1016 fprintf(fc, " class_code \"%c\";",xc->refcode ? xc->refcode : '.');
1017 }
1018 if (xc->tss_id>0) fprintf(fc, " tss_id \"TSS%d\";",xc->tss_id);
1019 if (xc->p_id>0) fprintf(fc, " p_id \"P%d\";",xc->p_id);
1020 // }
1021 fprintf(fc,"\n");
1022 }
1023 }
1024
1025 void tssCluster(GXLocus& xloc)
1026 {
1027 GList<GTssCl> xpcls(true,true,false);
1028 for (int i=0;i<xloc.tcons.Count();i++)
1029 {
1030 GXConsensus* c=xloc.tcons[i];
1031 //if (c->tcons->exons.Count()<2) continue; //should we skip single-exon transcripts ??
1032 GArray<int> mrgloci(true);
1033 int lfound=0;
1034 for (int l=0;l<xpcls.Count();l++)
1035 {
1036 if (xpcls[l]->end<c->tcons->exons.First()->start) continue;
1037 if (xpcls[l]->start>c->tcons->exons.Last()->end) break;
1038 if (xpcls[l]->add_Xcons(c))
1039 {
1040 lfound++;
1041 mrgloci.Add(l);
1042
1043 }
1044
1045 } // for each xpcluster
1046 if (lfound==0)
1047 {
1048 //create a xpcl with only this xconsensus
1049 xpcls.Add(new GTssCl(c));
1050
1051 }
1052 else if (lfound>1)
1053 {
1054 for (int l=1;l<lfound;l++)
1055 {
1056 int mlidx=mrgloci[l]-l+1;
1057 xpcls[mrgloci[0]]->addMerge(*xpcls[mlidx], c);
1058 xpcls.Delete(mlidx);
1059 }
1060 }
1061
1062 }//for each xconsensus in this xlocus
1063 for (int l=0;l<xpcls.Count();l++)
1064 {
1065 //if (xpcls[l]->tsscl.Count()<2) continue;
1066 tsscl_num++;
1067 for (int i=0;i<xpcls[l]->tsscl.Count();i++)
1068 xpcls[l]->tsscl[i]->tss_id=tsscl_num;
1069 //processTssCl(xcds_num, xpcls[l], faseq);
1070 }
1071 }
1072
1073 void protCluster(GXLocus& xloc, GFaSeqGet *faseq) {
1074 if (!faseq)
1075 return;
1076 GList<GProtCl> xpcls(true,true,false);
1077 for (int i=0;i<xloc.tcons.Count();i++) {
1078 GXConsensus* c=xloc.tcons[i];
1079 if (c->ref==NULL || c->ref->CDstart==0) continue; //no ref or CDS available
1080 if (c->refcode!='=') continue;
1081 //get the CDS translation here
1082 if (c->aa==NULL) {
1083 c->aa=c->ref->getSplicedTr(faseq, true, &c->aalen);
1084 if (c->aalen>0 && c->aa[c->aalen-1]=='.') {
1085 //discard the final stop codon
1086 c->aalen--;
1087 c->aa[c->aalen]=0;
1088 }
1089 }
1090 GArray<int> mrgloci(true);
1091 int lfound=0;
1092 for (int l=0;l<xpcls.Count();l++) {
1093 if (xpcls[l]->aalen()!=c->aalen) continue;
1094 if (xpcls[l]->add_Pcons(c)) {
1095 lfound++;
1096 mrgloci.Add(l);
1097 }
1098 } // for each xpcluster
1099 if (lfound==0) {
1100 //create a xpcl with only this xconsensus
1101 xpcls.Add(new GProtCl(c));
1102 }
1103 else if (lfound>1) {
1104 for (int l=1;l<lfound;l++) {
1105 int mlidx=mrgloci[l]-l+1;
1106 xpcls[mrgloci[0]]->addMerge(*xpcls[mlidx], c);
1107 xpcls.Delete(mlidx);
1108 }
1109 }
1110 }//for each xconsensus in this xlocus
1111 for (int l=0;l<xpcls.Count();l++) {
1112 protcl_num++;
1113 for (int i=0;i<xpcls[l]->protcl.Count();i++)
1114 xpcls[l]->protcl[i]->p_id=protcl_num;
1115 }
1116 for (int i=0;i<xloc.tcons.Count();i++) {
1117 GXConsensus* c=xloc.tcons[i];
1118 if (c->aa!=NULL) { GFREE(c->aa); }
1119 }
1120 }
1121
1122 void printXLoci(FILE* f, FILE* fc, int qcount, GList<GXLocus>& xloci, GFaSeqGet *faseq) {
1123 for (int l=0;l<xloci.Count();l++) {
1124 if (xloci[l]->qloci.Count()==0) continue;
1125 GXLocus& xloc=*(xloci[l]);
1126 xloc.checkContainment();
1127 tssCluster(xloc);//cluster and assign tss_id and cds_id to each xconsensus in xloc
1128 protCluster(xloc,faseq);
1129 for (int c=0;c<xloc.tcons.Count();c++) {
1130 if (showContained || xloc.tcons[c]->contained==NULL)
1131 printConsGTF(fc,xloc.tcons[c],xloc.id);
1132 }
1133 fprintf(f,"XLOC_%06d\t%s[%c]%d-%d\t", xloc.id,
1134 xloc.qloci[0]->mrna_maxcov->getGSeqName(),
1135 xloc.strand, xloc.start,xloc.end);
1136 //now print all transcripts in this locus, comma delimited
1137 //first, ref loci, if any
1138 int printfd=0;
1139 if (xloc.rloci.Count()>0) {
1140 for (int i=0;i<xloc.rloci.Count();i++) {
1141 for (int j=0;j<xloc.rloci[i]->mrnas.Count();j++) {
1142 if (printfd==0) fprintf(f,"%s|%s",getGeneID(xloc.rloci[i]->mrnas[j]),
1143 xloc.rloci[i]->mrnas[j]->getID());
1144 else fprintf(f,",%s|%s",getGeneID(xloc.rloci[i]->mrnas[j]),
1145 xloc.rloci[i]->mrnas[j]->getID());
1146 printfd++;
1147 }
1148 }
1149 }
1150 else {
1151 fprintf(f,"-");
1152 }
1153 //second, all the cufflinks transcripts
1154 for (int qi=0;qi<qcount;qi++) {
1155 fprintf(f,"\t");
1156 printXQ1(f,qi,xloc.qloci);
1157 }
1158 fprintf(f,"\n");
1159 }
1160 }
1161
1162 void writeIntron(FILE* f, char strand, GFaSeqGet* faseq, GSeg& iseg,
1163 GList<GffObj>& mrnas, bool wrong=false) {
1164 //find a ref mrna having this intron
1165 GffObj* rm=NULL;
1166 for (int i=0;i<mrnas.Count();i++) {
1167 GffObj* m=mrnas[i];
1168 if (m->start>iseg.end) break;
1169 if (m->end<iseg.start) continue;
1170 //intron coords overlaps mrna region
1171 for (int j=1;j<m->exons.Count();j++) {
1172 if (iseg.start==m->exons[j-1]->end+1 &&
1173 iseg.end==m->exons[j]->start-1) { rm=m; break; } //match found
1174 }//for each intron
1175 if (rm!=NULL) break;
1176 } //for each ref mrna in this locus
1177 if (rm==NULL) GError("Error: couldn't find ref mrna for intron %d-%d! (BUG)\n",
1178 iseg.start,iseg.end);
1179 int ilen=iseg.end-iseg.start+1;
1180 fprintf(f,"%s\t%s\tintron\t%d\t%d\t.\t%c\t.\t",
1181 rm->getGSeqName(),rm->getTrackName(),iseg.start,iseg.end,strand);
1182 if (faseq!=NULL) {
1183 const char* gseq=faseq->subseq(iseg.start, ilen);
1184 char* cseq=Gstrdup(gseq, gseq+ilen-1);
1185 if (strand=='-') reverseComplement(cseq, ilen);
1186 fprintf(f,"spl=\"%c%c..%c%c\"; ", toupper(cseq[0]),toupper(cseq[1]),
1187 toupper(cseq[ilen-2]),toupper(cseq[ilen-1]));
1188 GFREE(cseq);
1189 }
1190 fprintf(f,"transcript_id \"%s\";", rm->getID());
1191 if (wrong) fprintf(f," noOvl=1;");
1192 fprintf(f,"\n");
1193
1194 }
1195
1196 void reportMIntrons(FILE* fm, FILE* fn, FILE* fq, char strand,
1197 GList<GSuperLocus>& cmpdata) {
1198 if (fm==NULL) return;
1199 for (int l=0;l<cmpdata.Count();l++) {
1200 GSuperLocus *sl=cmpdata[l];
1201 //cache the whole locus sequence if possible
1202 //write these introns and their splice sites into the file
1203 for (int i=0;i<sl->i_missed.Count();i++)
1204 writeIntron(fm, strand, NULL, sl->i_missed[i], sl->rmrnas);
1205 if (fn!=NULL) {
1206 for (int i=0;i<sl->i_notp.Count();i++)
1207 writeIntron(fn, strand, NULL, sl->i_notp[i], sl->rmrnas);
1208 }
1209 if (fq!=NULL) {
1210 for (int i=0;i<sl->i_qwrong.Count();i++) {
1211 writeIntron(fq, strand, NULL, sl->i_qwrong[i], sl->qmrnas, true);
1212 }
1213 for (int i=0;i<sl->i_qnotp.Count();i++) {
1214 writeIntron(fq, strand, NULL, sl->i_qnotp[i], sl->qmrnas);
1215 }
1216 }
1217 }
1218 }
1219
1220
1221 void processLoci(GSeqData& seqdata, GSeqData* refdata, int qfidx) {
1222 //GList<GSeqLoci>& glstloci, GList<GSeqCmpRegs>& cmpdata)
1223
1224 if (refdata!=NULL) {
1225 //if (gtf_tracking_verbose) GMessage(" ..comparing to reference loci..\n") ;
1226 compareLoci2R(seqdata.loci_f, seqdata.gstats_f, refdata->loci_f, qfidx);
1227 compareLoci2R(seqdata.loci_r, seqdata.gstats_r, refdata->loci_r, qfidx);
1228 // -- report
1229
1230 if (f_mintr!=NULL) {
1231 GMessage(" ..reporting missed ref introns..\n");
1232 //reportIntrons(f_mintr, f_nintr, f_qintr, faseq, '+', seqdata.gstats_f);
1233 //reportIntrons(f_mintr, f_nintr, f_qintr, faseq, '-', seqdata.gstats_r);
1234 reportMIntrons(f_mintr, NULL, NULL, '+', seqdata.gstats_f);
1235 reportMIntrons(f_mintr, NULL, NULL, '-', seqdata.gstats_r);
1236 }
1237 }
1238 }
1239
1240 //adjust stats for a list of unoverlapped (completely missed) ref loci
1241 void collectRLocData(GSuperLocus& stats, GLocus& loc) {
1242 stats.total_rmrnas+=loc.mrnas.Count();
1243 stats.total_rexons+=loc.uexons.Count();
1244 stats.total_rintrons+=loc.introns.Count();
1245 stats.total_rmexons+=loc.mexons.Count();
1246 stats.total_richains+=loc.ichains;
1247 stats.m_exons+=loc.uexons.Count();
1248 stats.m_introns+=loc.introns.Count();
1249 stats.total_rloci++;
1250 for (int e=0;e<loc.mexons.Count();e++) {
1251 stats.rbases_all+=loc.mexons[e].end-loc.mexons[e].start+1;
1252 }
1253 }
1254
1255 void collectRData(GSuperLocus& stats, GList<GLocus>& loci) {
1256 for (int l=0;l<loci.Count();l++)
1257 collectRLocData(stats,*loci[l]);
1258 }
1259
1260 //adjust stats for a list of unoverlapped (completely "wrong" or novel) qry loci
1261 void collectQLocData(GSuperLocus& stats, GLocus& loc) {
1262 stats.total_qmrnas+=loc.mrnas.Count();
1263 stats.total_qexons+=loc.uexons.Count();
1264 stats.total_qmexons+=loc.mexons.Count();
1265 stats.total_qintrons+=loc.introns.Count();
1266 stats.total_qichains+=loc.ichains;
1267 stats.total_qloci++;
1268 if (loc.ichains>0 && loc.mrnas.Count()>1)
1269 stats.total_qloci_alt++;
1270 stats.w_exons+=loc.uexons.Count();
1271 stats.w_introns+=loc.introns.Count();
1272 for (int e=0;e<loc.mexons.Count();e++) {
1273 stats.qbases_all+=loc.mexons[e].end-loc.mexons[e].start+1;
1274 }
1275 }
1276
1277 void collectQData(GSuperLocus& stats, GList<GLocus>& loci, GList<GLocus>& nloci) {
1278 for (int l=0;l<loci.Count();l++) {
1279 //this is called when no refdata is given, so all these loci are nloci
1280 nloci.Add(loci[l]);
1281 collectQLocData(stats,*loci[l]);
1282 }
1283 }
1284
1285 void collectQNOvl(GSuperLocus& stats, GList<GLocus>& loci, GList<GLocus>& nloci) {
1286 for (int l=0;l<loci.Count();l++) {
1287 if (loci[l]->cmpovl.Count()==0) {//locus with no ref loci overlaps
1288 stats.w_loci++; //novel/wrong loci
1289 nloci.Add(loci[l]);
1290 collectQLocData(stats,*loci[l]);
1291 }
1292 }
1293 }
1294
1295 void collectQU(GSuperLocus& stats, GList<GLocus>& nloci) {
1296 for (int l=0;l<nloci.Count();l++) {
1297 stats.w_loci++; //novel/wrong loci
1298 collectQLocData(stats, *nloci[l]);
1299 }
1300 }
1301
1302 void printLocus(FILE* f, GLocus& loc, const char* gseqname) {
1303 fprintf(f, "## Locus %s:%d-%d\n",gseqname, loc.start, loc.end);
1304 for (int m=0;m<loc.mrnas.Count();m++) {
1305 loc.mrnas[m]->printGtf(f);
1306 }
1307 }
1308
1309 void collectRNOvl(GSuperLocus& stats, GList<GLocus>& loci) { //, const char* gseqname) {
1310 for (int l=0;l<loci.Count();l++) {
1311 if (loci[l]->cmpovl.Count()==0) {
1312 stats.m_loci++; //missed ref loci
1313 //if (f_mloci!=NULL)
1314 // printLocus(f_mloci,*loci[l], gseqname);
1315 collectRLocData(stats,*loci[l]);
1316 }
1317 }
1318 }
1319
1320
1321 void collectCmpData(GSuperLocus& stats, GList<GSuperLocus>& cmpdata) { //, const char* gseqname) {
1322 for (int c=0;c<cmpdata.Count();c++) {
1323 stats.addStats(*cmpdata[c]);
1324 /*
1325 if (f_nloci!=NULL && cmpdata[c]->locusTP==0 && cmpdata[c]->rloci.Count()>0) {
1326 fprintf(f_nloci, "# Superlocus %s:%d-%d\n",gseqname, cmpdata[c]->start, cmpdata[c]->end);
1327 for (int l=0;l<cmpdata[c]->rloci.Count();l++) {
1328 printLocus(f_nloci,*cmpdata[c]->rloci[l], gseqname);
1329 }
1330 }
1331 */
1332 }
1333 }
1334
1335 void collectStats(GSuperLocus& stats, GSeqData* seqdata, GSeqData* refdata) {
1336 //collect all stats for a single genomic sequence into stats
1337 if (seqdata==NULL) {
1338 if (reduceRefs || refdata==NULL) return;
1339 //special case with completely missed all refs on a contig/chromosome
1340 collectRData(stats, refdata->loci_f);
1341 collectRData(stats, refdata->loci_r);
1342 return;
1343 }
1344 if (refdata==NULL) {//reference data missing on this contig
1345 collectQData(stats, seqdata->loci_f, seqdata->nloci_f);
1346 collectQData(stats, seqdata->loci_r, seqdata->nloci_r);
1347 collectQU(stats, seqdata->nloci_u);
1348 return;
1349 }
1350
1351 /*stats.total_qloci+=seqdata->loci_f.Count();
1352 stats.total_qloci+=seqdata->loci_r.Count();
1353 if (reduceRefs) { //only collect ref loci from superloci
1354
1355 }
1356 else {
1357 stats.total_rloci+=refdata->loci_f.Count();
1358 stats.total_rloci+=refdata->loci_r.Count();
1359 }
1360 */
1361 //collect data for overlapping superloci (already in seqdata->gstats_f/_r)
1362 //char* gseqname=getGSeqName(seqdata->gseq_id);
1363 collectCmpData(stats, seqdata->gstats_f);
1364 collectCmpData(stats, seqdata->gstats_r);
1365 //for non-overlapping qry loci, always add them as false positives FP
1366 collectQNOvl(stats, seqdata->loci_f, seqdata->nloci_f);
1367 collectQNOvl(stats, seqdata->loci_r, seqdata->nloci_r);
1368 collectQU(stats, seqdata->nloci_u);
1369 if (!reduceRefs) { //find ref loci with empty cmpovl and add them
1370 collectRNOvl(stats, refdata->loci_f);
1371 collectRNOvl(stats, refdata->loci_r);
1372 }
1373 }
1374
1375 void reportStats(FILE* fout, const char* setname, GSuperLocus& stotal,
1376 GSeqData* seqdata, GSeqData* refdata) {
1377 GSuperLocus stats;
1378 bool finalSummary=(seqdata==NULL && refdata==NULL);
1379 GSuperLocus *ps=(finalSummary ? &stotal : &stats );
1380 if (!finalSummary) { //collecting contig stats
1381 //gather statistics for all loci/superloci here
1382 collectStats(stats, seqdata, refdata);
1383 stotal.addStats(stats);
1384 if (!perContigStats) return;
1385 }
1386 ps->calcF();
1387 if (seqdata!=NULL) fprintf(fout, "#> Genomic sequence: %s \n", setname);
1388 else fprintf(fout, "\n#= Summary for dataset: %s :\n", setname);
1389
1390 fprintf(fout, "# Query mRNAs : %7d in %7d loci (%d multi-exon transcripts)\n",
1391 ps->total_qmrnas, ps->total_qloci, ps->total_qichains);
1392 fprintf(fout, "# (%d multi-transcript loci, ~%.1f transcripts per locus)\n",
1393 ps->total_qloci_alt, ((double)ps->total_qmrnas/ps->total_qloci));
1394
1395 if (haveRefs) {
1396 fprintf(fout, "# Reference mRNAs : %7d in %7d loci (%d multi-exon)\n",
1397 ps->total_rmrnas, ps->total_rloci, ps->total_richains);
1398 if (ps->baseTP+ps->baseFP==0 || ps->baseTP+ps->baseFN==0) return;
1399 fprintf(fout, "# Corresponding super-loci: %7d\n",ps->total_superloci);
1400
1401 /*if (seqdata!=NULL) {
1402 fprintf(fout, " ( %d/%d on forward/reverse strand)\n",
1403 seqdata->gstats_f.Count(),seqdata->gstats_r.Count());
1404 }*/
1405 fprintf(fout, "#--------------------| Sn | Sp | fSn | fSp \n");
1406 double sp=(100.0*(double)ps->baseTP)/(ps->baseTP+ps->baseFP);
1407 double sn=(100.0*(double)ps->baseTP)/(ps->baseTP+ps->baseFN);
1408 fprintf(fout, " Base level: \t%5.1f\t%5.1f\t - \t - \n",sn, sp);
1409 sp=(100.0*(double)ps->exonTP)/(ps->exonTP+ps->exonFP);
1410 sn=(100.0*(double)ps->exonTP)/(ps->exonTP+ps->exonFN);
1411 double fsp=(100.0*(double)ps->exonATP)/(ps->exonATP+ps->exonAFP);
1412 double fsn=(100.0*(double)ps->exonATP)/(ps->exonATP+ps->exonAFN);
1413 if (fsp>100.0) fsp=100.0;
1414 if (fsn>100.0) fsn=100.0;
1415 fprintf(fout, " Exon level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1416 if (ps->total_rintrons>0) {
1417 //intron level
1418 sp=(100.0*(double)ps->intronTP)/(ps->intronTP+ps->intronFP);
1419 sn=(100.0*(double)ps->intronTP)/(ps->intronTP+ps->intronFN);
1420 fsp=(100.0*(double)ps->intronATP)/(ps->intronATP+ps->intronAFP);
1421 fsn=(100.0*(double)ps->intronATP)/(ps->intronATP+ps->intronAFN);
1422 if (fsp>100.0) fsp=100.0;
1423 if (fsn>100.0) fsn=100.0;
1424 fprintf(fout, " Intron level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1425 //intron chains:
1426 sp=(100.0*(double)ps->ichainTP)/(ps->ichainTP+ps->ichainFP);
1427 sn=(100.0*(double)ps->ichainTP)/(ps->ichainTP+ps->ichainFN);
1428 if (sp>100.0) sp=100.0;
1429 if (sn>100.0) sn=100.0;
1430 fsp=(100.0*(double)ps->ichainATP)/(ps->ichainATP+ps->ichainAFP);
1431 fsn=(100.0*(double)ps->ichainATP)/(ps->ichainATP+ps->ichainAFN);
1432 if (fsp>100.0) fsp=100.0;
1433 if (fsn>100.0) fsn=100.0;
1434 fprintf(fout, "Intron chain level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1435 }
1436 else {
1437 fprintf(fout, " Intron level: \t - \t - \t - \t - \n");
1438 fprintf(fout, "Intron chain level: \t - \t - \t - \t - \n");
1439 }
1440 sp=(100.0*(double)ps->mrnaTP)/(ps->mrnaTP+ps->mrnaFP);
1441 sn=(100.0*(double)ps->mrnaTP)/(ps->mrnaTP+ps->mrnaFN);
1442 fsp=(100.0*(double)ps->mrnaATP)/(ps->mrnaATP+ps->mrnaAFP);
1443 fsn=(100.0*(double)ps->mrnaATP)/(ps->mrnaATP+ps->mrnaAFN);
1444 if (fsp>100.0) fsp=100.0;
1445 if (fsn>100.0) fsn=100.0;
1446 fprintf(fout, " Transcript level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1447 //sp=(100.0*(double)ps->locusTP)/(ps->locusTP+ps->locusFP);
1448 sp=(100.0*(double)ps->locusQTP)/ps->total_qloci;
1449 sn=(100.0*(double)ps->locusTP)/ps->total_rloci; //(ps->locusTP+ps->locusFN);
1450 fsp=(100.0*(double)ps->locusAQTP)/ps->total_qloci; //(ps->locusATP+ps->locusAFP);
1451 fsn=(100.0*(double)ps->locusATP)/ps->total_rloci; //(ps->locusATP+ps->locusAFN);
1452 fprintf(fout, " Locus level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1453 //fprintf(fout, " (locus TP=%d, total ref loci=%d)\n",ps->locusTP, ps->total_rloci);
1454 fprintf(fout, "\nMatching intron chains: %7d\n",ps->ichainTP);
1455 fprintf(fout, " Matching loci: %7d\n",ps->locusTP);
1456 fprintf(fout, "\n");
1457 sn=(100.0*(double)ps->m_exons)/(ps->total_rexons);
1458 fprintf(fout, " Missed exons: %7d/%d\t(%5.1f%%)\n",ps->m_exons, ps->total_rexons, sn);
1459 sn=(100.0*(double)ps->w_exons)/(ps->total_qexons);
1460 fprintf(fout, " Novel exons: %7d/%d\t(%5.1f%%)\n",ps->w_exons, ps->total_qexons,sn);
1461 if (ps->total_rintrons>0) {
1462 sn=(100.0*(double)ps->m_introns)/(ps->total_rintrons);
1463 fprintf(fout, " Missed introns: %7d/%d\t(%5.1f%%)\n",ps->m_introns, ps->total_rintrons, sn);
1464 }
1465 if (ps->total_qintrons>0) {
1466 sn=(100.0*(double)ps->w_introns)/(ps->total_qintrons);
1467 fprintf(fout, " Novel introns: %7d/%d\t(%5.1f%%)\n",ps->w_introns, ps->total_qintrons,sn);
1468 }
1469 if (ps->total_rloci>0) {
1470 sn=(100.0*(double)ps->m_loci)/(ps->total_rloci);
1471 fprintf(fout, " Missed loci: %7d/%d\t(%5.1f%%)\n",ps->m_loci, ps->total_rloci, sn);
1472 }
1473 if (ps->total_qloci>0) {
1474 sn=(100.0*(double)ps->w_loci)/(ps->total_qloci);
1475 fprintf(fout, " Novel loci: %7d/%d\t(%5.1f%%)\n",ps->w_loci, ps->total_qloci,sn);
1476 }
1477
1478 }
1479 }
1480
1481 int inbuf_len=1024; //starting inbuf capacity
1482 char* inbuf=NULL; // incoming buffer for sequence lines.
1483
1484 void loadRefDescr(const char* fname) {
1485 if (inbuf==NULL) { GMALLOC(inbuf, inbuf_len); }
1486 FILE *f=fopen(fname, "rb");
1487 if (f==NULL) GError("Error opening exon file: %s\n",fname);
1488 char* line;
1489 int llen=0;
1490 off_t fpos;
1491 while ((line=fgetline(inbuf, inbuf_len, f, &fpos, &llen))!=NULL) {
1492 if (strlen(line)<=2) continue;
1493 int idlen=strcspn(line,"\t ");
1494 char* p=line+idlen;
1495 if (idlen<llen && idlen>0) {
1496 *p=0;
1497 p++;
1498 refdescr.Add(line, new GStr(p));
1499 }
1500 }
1501 }
1502
1503 GSeqTrack* findGSeqTrack(int gsid) {
1504 GSeqTrack f(gsid);
1505 int fidx=-1;
1506 if (gseqtracks.Found(&f,fidx))
1507 return gseqtracks[fidx];
1508 fidx=gseqtracks.Add(new GSeqTrack(gsid));
1509 return gseqtracks[fidx];
1510 }
1511
1512
1513
1514 GffObj* findRefMatch(GffObj& m, GLocus& rloc, int& ovlen) {
1515 ovlen=0;
1516 CTData* mdata=((CTData*)m.uptr);
1517 if (mdata->eqref!=NULL && ((CTData*)(mdata->eqref->uptr))->locus==&rloc) {
1518 mdata->eqref=mdata->ovls.First()->mrna; //this should be unnecessary
1519 //check it?
1520 return mdata->ovls.First()->mrna;
1521 }
1522 //if (rloc==NULL|| m==NULL) return NULL;
1523 GffObj* ret=NULL;
1524 for (int r=0;r<rloc.mrnas.Count();r++) {
1525 int olen=0;
1526 if (tMatch(m, *(rloc.mrnas[r]),olen, true)) { //return rloc->mrnas[r];
1527 /*
1528 if (ovlen<olen) {
1529 ovlen=olen;
1530 ret=rloc.mrnas[r]; //keep the longest matching ref
1531 //but this is unnecessary, there can be only one matching ref
1532 // because duplicate refs were discarded (unless -G was used)
1533 }
1534 */
1535 mdata->addOvl('=',rloc.mrnas[r], olen);
1536 ret=mdata->ovls.First()->mrna;
1537 //this must be called only for the head of an equivalency chain
1538 CTData* rdata=(CTData*)rloc.mrnas[r]->uptr;
1539 rdata->addOvl('=',&m,olen);
1540 //if (rdata->eqnext==NULL) rdata->eqnext=&m;
1541 }
1542 }
1543 if (ret!=NULL)
1544 mdata->eqref=ret;
1545 return ret;
1546 }
1547
1548
1549 void addXCons(GXLocus* xloc, GffObj* ref, char ovlcode, GffObj* tcons, CEqList* ts) {
1550 GXConsensus* c=new GXConsensus(tcons, ts, ref, ovlcode);
1551 //xloc->tcons.Add(c);
1552 //this will also check c against the other tcons for containment:
1553 xloc->addXCons(c);
1554 }
1555
1556
1557 const uint pre_mrna_threshold = 100;
1558
1559 char getOvlCode(GffObj& m, GffObj& r, int& ovlen) {
1560 ovlen=0;
1561 if (!m.overlap(r.start,r.end)) return 'u';
1562 int jmax=r.exons.Count()-1;
1563
1564 if (m.exons.Count()==1) { //single-exon transfrag
1565 GSeg mseg(m.start, m.end);
1566 if (jmax==0) { //also single-exon ref
1567 ovlen=mseg.overlapLen(r.start,r.end);
1568 int lmax=GMAX(r.covlen, m.covlen);
1569 if (ovlen >= lmax*0.8) return '='; //fuzz matching for single-exon transcripts: 80% of the longer one
1570 //if (m.covlen<=ovlen+12 && m.covlen<r.covlen) return 'c'; //contained
1571 if (m.covlen<r.covlen && ovlen >= m.covlen*0.8) return 'c';
1572 return 'o'; //just plain overlapping
1573 }
1574 //single-exon qry overlaping multi-exon ref
1575 for (int j=0;j<=jmax;j++) {
1576 //check if it's contained by an exon
1577 if (m.start>r.exons[j]->start-8 && m.end<r.exons[j]->end+8)
1578 return 'c';
1579 if (j==jmax) break;
1580 //check if it's contained by an intron
1581 if (m.end<r.exons[j+1]->start && m.start>r.exons[j]->end)
1582 return 'i';
1583 // check if it's a potential pre-mRNA transcript
1584 // (if overlaps an intron at least 10 bases)
1585 uint iovlen=mseg.overlapLen(r.exons[j]->end+1, r.exons[j+1]->start-1);
1586 if (iovlen>=10 && mseg.len()>iovlen+10) return 'e';
1587 }
1588 return 'o'; //plain overlap, uncategorized
1589 } //single-exon transfrag
1590 //-- from here on we have a multi-exon transfrag --
1591 // * check if contained by a ref intron
1592 for (int j=0;j<jmax;j++) {
1593 if (m.end<r.exons[j+1]->start && m.start>r.exons[j]->end)
1594 return 'i';
1595 }
1596 //> check if m's intron chain is a subset of r's intron chain
1597 int imax=m.exons.Count()-1;// imax>0 here
1598 if (m.exons[imax]->start<r.exons[0]->end ||
1599 r.exons[jmax]->start<m.exons[0]->end ) //intron chains do not overlap at all
1600 return 'o'; //but terminal exons do, otherwise we wouldn't be here
1601 int i=1; //index of exon to the right of current qry intron
1602 int j=1; //index of exon to the right of current ref intron
1603 //find first intron overlap
1604 while (i<=imax && j<=jmax) {
1605 if (r.exons[j]->start<m.exons[i-1]->end) { j++; continue; }
1606 if (m.exons[i]->start<r.exons[j-1]->end) { i++; continue; }
1607 break; //here we have an intron overlap
1608 }
1609 if (i>imax || j>jmax)
1610 return 'o'; //no initial intron overlap found
1611 //from here on we check all qry introns against ref introns
1612 bool jmatch=false; //true if at least a junction match is found
1613 bool icmatch=(i==1); //intron chain match - it will be updated as introns are checked
1614 //bool exovli=false; // if any terminal exon of qry extends into a ref intron
1615 int jmstart=j; //index of first intron overlap of reference
1616 int jmend=0; //index of last intron overlap of reference
1617 int imend=0; //index of last intron overlap of query
1618 //check for intron matches
1619 while (i<=imax && j<=jmax) {
1620 uint mstart=m.exons[i-1]->end;
1621 uint mend=m.exons[i]->start;
1622 uint rstart=r.exons[j-1]->end;
1623 uint rend=r.exons[j]->start;
1624 if (rend<mstart) { j++; icmatch=false; continue; } //skipping ref intron, no ichain match
1625 if (mend<rstart) { i++; icmatch=false; continue; } //skipping qry intron, no ichain match
1626 //overlapping introns here, test junction matching
1627 jmend=j; //keep track of last overlapping intron
1628 imend=i;
1629 bool smatch=(mstart==rstart);
1630 bool ematch=(mend==rend);
1631 if (smatch || ematch) jmatch=true;
1632 if (smatch && ematch) { i++; j++; } //perfect match for this intron
1633 else { //at least one junction doesn't match
1634 icmatch=false;
1635 if (mend>rend) j++; else i++;
1636 }
1637 } //while checking intron overlaps
1638
1639 if (icmatch && imend==imax) { // qry intron chain match
1640 if (jmstart==1 && jmend==jmax) return '='; //identical intron chains
1641 // -- qry intron chain is shorter than ref intron chain --
1642 int l_iovh=0; // overhang of leftmost q exon left boundary beyond the end of ref intron to the left
1643 int r_iovh=0; // same type of overhang through the ref intron on the right
1644 if (jmstart>1 && r.exons[jmstart-1]->start>m.start)
1645 l_iovh = r.exons[jmstart-1]->start - m.start;
1646 if (jmend<jmax && m.end > r.exons[jmend]->end)
1647 r_iovh = m.end - r.exons[jmend]->end;
1648 if (l_iovh<4 && r_iovh<4) return 'c';
1649 //TODO? check if any x_iovl>10 and return 'e' to signal an "unspliced intron" ?
1650 // or we can check if any of them are >= the length of the corresponding ref intron on that side
1651 return 'j';
1652 }
1653 /*
1654 if (icmatch && (jmax>=imax)) { //all qry introns match
1655 //but they may overlap
1656 // if ((lbound && lbound > m.exons[0]->start+10) ||
1657 // (j<=jmax && m.exons[i-1]->end > r.exons[j-1]->end+10)) return 'j';
1658 // return 'c';
1659 // }
1660 int code = 'c';
1661 if (lbound)
1662 {
1663 uint ref_boundary = lbound;
1664 uint cuff_boundary = m.exons[0]->start;
1665 if (ref_boundary > (cuff_boundary + pre_mrna_threshold)) // cuff extends a lot
1666 {
1667 code = 'j';
1668 }
1669 if (ref_boundary > cuff_boundary) // cuff extends just a bit into a ref intron
1670 {
1671 code = 'e';
1672 }
1673 }
1674 if (j <= jmax)
1675 {
1676 uint ref_boundary = r.exons[j-1]->end;
1677 uint cuff_boundary = m.exons[i-1]->end;
1678 if (cuff_boundary > (ref_boundary + pre_mrna_threshold)) // cuff extends a lot
1679 {
1680 code = 'j';
1681 }
1682 if (cuff_boundary > ref_boundary) // cuff extends just a bit into a ref intron
1683 {
1684 code = 'e';
1685 }
1686 }
1687 //if ((lbound && lbound > m.exons[0]->start+10) ||
1688 // (j<=jmax && m.exons[i-1]->end > r.exons[j-1]->end+10)) return 'j';
1689 return code;
1690 }
1691
1692 // if (!ichain) // first and last exons more or less match, but there's a different intron somewhere
1693 // {
1694 //
1695 // }
1696
1697 */
1698 return jmatch ? 'j':'o';
1699 }
1700
1701 char getRefOvl(GffObj& m, GLocus& rloc, GffObj*& rovl, int& ovlen) {
1702 rovl=NULL;
1703 ovlen=0;
1704 if (m.start>rloc.end || m.end<rloc.start) {
1705 //see if it's a polymerase run ?
1706 /*
1707 if ((m.strand=='+' && m.end<=rloc.end+polyrun_range) ||
1708 (m.strand=='-' && m.start>=rloc.start-polyrun_range)) {
1709 rovl=rloc.mrna_maxcov;
1710 ((CTData*)m.uptr)->addOvl('p',rloc.mrna_maxcov);
1711 return 'p';
1712 }
1713 */
1714 return 0; //unknown -> intergenic space
1715 }
1716 for (int i=0;i<rloc.mrnas.Count();i++) {
1717 GffObj* r=rloc.mrnas[i];
1718 int olen=0;
1719 char ovlcode=getOvlCode(m,*r,olen);
1720 if (ovlcode!=0) { //has some sort of "overlap" with r
1721 ((CTData*)m.uptr)->addOvl(ovlcode,r,olen);
1722 if (olen>ovlen) ovlen=olen;
1723 if (ovlcode=='c' || ovlcode=='=') //keep match/containment for each reference transcript
1724 ((CTData*)r->uptr)->addOvl(ovlcode,&m,olen);
1725 }
1726 }//for each ref in rloc
1727 // i,j,o
1728 return ((CTData*)m.uptr)->getBestCode();
1729 }
1730
1731 /*
1732 void findTMatches(GTrackLocus& loctrack, int qcount) {
1733 //perform an all vs. all ichain-match for all transcripts across all loctrack[i]->qloci
1734 for (int q=0;q<qcount-1;q++) { //for each qry dataset
1735 if (loctrack[q]==NULL) continue;
1736 for (int qi=0;qi<loctrack[q]->Count();qi++) { // for each transcript in q dataset
1737 GffObj* qi_t=loctrack[q]->Get(qi);
1738 CTData* qi_d=(CTData*)qi_t->uptr;
1739 if ((qi_d->eqdata & EQHEAD_TAG) !=0) { //this is set as an EQ chain head already
1740 //if (qi_t->exons.Count()>1)
1741 continue;
1742 }
1743 for (int n=q+1;n<qcount;n++) { // for every successor dataset
1744 if (loctrack[n]==NULL) continue;
1745 for (int ni=0;ni<loctrack[n]->Count();ni++) {
1746 GffObj* ni_t=loctrack[n]->Get(ni);
1747 CTData* ni_d=(CTData*)ni_t->uptr;
1748 //if (ni_d->eqdata!=0) continue; //already part of an EQ chain
1749 //single exon transfrags have special treatment:
1750 bool s_match=(ni_t->exons.Count()==1 && qi_t->exons.Count()==1);
1751 if (ni_d->eqdata!=0 && !s_match) continue; //already part of an EQ chain
1752 if (ni_d->eqnext!=NULL) continue;
1753 int ovlen=0;
1754
1755 if (qi_d->eqnext!=NULL) {
1756 if (!s_match) continue;
1757 //test all in the EQ list for a match
1758 bool matchFound=false;
1759 CTData* next_eq_d=qi_d;
1760 if (tMatch(*qi_t, *ni_t, ovlen, true)) {
1761 matchFound=true;
1762 }
1763 else {
1764 while (next_eq_d->eqnext!=NULL) {
1765 if (tMatch(*(next_eq_d->eqnext), *ni_t, ovlen, true)) {
1766 matchFound=true;
1767 break;
1768 }
1769 next_eq_d=(CTData*)(next_eq_d->eqnext->uptr);
1770 } //last in the chain
1771 }
1772 if (matchFound) {
1773 //add this to the end of the EQ chain instead
1774 next_eq_d=(CTData*)(qi_d->eqnext->uptr);
1775 while (next_eq_d->eqnext!=NULL) {
1776 next_eq_d=(CTData*)(next_eq_d->eqnext->uptr);
1777 } //last in the chain
1778 next_eq_d->eqnext=ni_t;
1779 ni_d->eqdata=n+1;
1780 ni_d->eqnext=NULL; ///TEST
1781 }
1782 }
1783 else {
1784 if (tMatch(*qi_t,*ni_t, ovlen, true)) {
1785 qi_d->eqnext=ni_t;
1786 ni_d->eqnext=NULL; ///TEST
1787 if (qi_d->eqdata == 0) {//only start of chain is tagged
1788 qi_d->eqdata = ((q+1) | EQHEAD_TAG);
1789 }
1790 ni_d->eqdata=n+1; //EQ chain member only marked with qry# (1-based)
1791 }
1792 } //multi-exon case
1793 } //for each transfrag in the next qry dataset
1794
1795 if (qi_d->eqnext!=NULL && qi_t->exons.Count()>1) break;
1796 //part of a chain already, skip other datasets
1797 } // for each successor dataset
1798 } //for each transcript in qry dataset
1799 } //for each qry dataset
1800 }
1801 */
1802
1803 void findTMatches(GTrackLocus& loctrack, int qcount) {
1804 //perform an all vs. all ichain-match for all transcripts across all loctrack[i]->qloci
1805 for (int q=0;q<qcount-1;q++) { //for each qry dataset
1806 if (loctrack[q]==NULL) continue;
1807 for (int qi=0;qi<loctrack[q]->Count();qi++) { // for each transcript in q dataset
1808 GffObj* qi_t=loctrack[q]->Get(qi);
1809 CTData* qi_d=(CTData*)qi_t->uptr;
1810 if (qi_d->eqlist!=NULL && qi_t->exons.Count()>1) {
1811 continue; //this is part of an EQ chain already
1812 }
1813 for (int n=q+1;n<qcount;n++) { // for every successor dataset
1814 if (loctrack[n]==NULL) continue;
1815 for (int ni=0;ni<loctrack[n]->Count();ni++) {
1816 GffObj* ni_t=loctrack[n]->Get(ni);
1817 CTData* ni_d=(CTData*)ni_t->uptr;
1818 bool singleExon=(ni_t->exons.Count()==1 && qi_t->exons.Count()==1);
1819 if (ni_d->eqlist!=NULL &&
1820 (ni_d->eqlist==qi_d->eqlist || !singleExon)) continue;
1821 int ovlen=0;
1822 if ((ni_d->eqlist==qi_d->eqlist && qi_d->eqlist!=NULL) ||
1823 tMatch(*qi_t,*ni_t, ovlen, singleExon)) {
1824 qi_d->joinEqList(ni_t);
1825 }
1826 }
1827 } // for each successor dataset
1828 } //for each transcript in qry dataset
1829 } //for each qry dataset
1830 }
1831
1832
1833 int cmpTData_qset(const pointer* p1, const pointer* p2) {
1834 CTData* d1=(CTData*)(((GffObj*)p1)->uptr);
1835 CTData* d2=(CTData*)(((GffObj*)p2)->uptr);
1836 return (d1->qset - d2->qset);
1837 }
1838
1839 void printITrack(FILE* ft, GList<GffObj>& mrnas, int qcount, int& cnum) {
1840 for (int i=0;i<mrnas.Count();i++) {
1841 GffObj& qt=*(mrnas[i]);
1842 CTData* qtdata=(CTData*)qt.uptr;
1843 int qfidx=qtdata->qset;
1844 char ovlcode=qtdata->classcode;
1845 //GList<GffObj> eqchain(false,false,false);
1846 CEqList* eqchain=qtdata->eqlist;
1847 GffObj* ref=NULL; //related ref -- it doesn't have to be fully matching
1848 GffObj* eqref=NULL; //fully ichain-matching ref
1849 GffObj* tcons=NULL; //"consensus" (largest) transcript for a clique
1850 int tmaxcov=0;
1851 //eqchain.Add(&qt);
1852 eqref=qtdata->eqref;
1853 if (qtdata->ovls.Count()>0 && qtdata->ovls[0]->mrna!=NULL) {
1854 //if it has ovlcode with a ref
1855 ref=qtdata->ovls[0]->mrna;
1856 //consistency check: qtdata->ovls[0]->code==ovlcode
1857 // -- let tcons be a transfrag, not a ref transcript
1858 //tcons=eqref;
1859 //if (tcons!=NULL) tmaxcov=tcons->covlen;
1860 }
1861 //chain pre-check
1862 if (tcons==NULL || mrnas[i]->covlen>tmaxcov) {
1863 tcons=mrnas[i];
1864 tmaxcov=tcons->covlen;
1865 }
1866 if (qtdata->isEqHead()) {//head of a equivalency chain
1867 //check if all transcripts in this chain have the same ovlcode
1868 for (int k=0;k<qtdata->eqlist->Count();k++) {
1869 GffObj* m=qtdata->eqlist->Get(k);
1870 if (m->covlen>tmaxcov) {
1871 tmaxcov=m->covlen;
1872 tcons=m;
1873 }
1874 if (ovlcode!='=' && ovlcode!='.' && ((CTData*)m->uptr)->getBestCode()!=ovlcode) {
1875 ovlcode='.'; //non-uniform ovlcode
1876 }
1877 }
1878 /*
1879 GffObj* m=mrnas[i];
1880 while (((CTData*)m->uptr)->eqnext!=NULL) {
1881 m=((CTData*)m->uptr)->eqnext;
1882 eqchain.Add(m);
1883 if (m->covlen>tmaxcov) {
1884 tmaxcov=m->covlen;
1885 tcons=m;
1886 }
1887 if (ovlcode!='=' && ovlcode!='.' && ((CTData*)m->uptr)->getBestCode()!=ovlcode) {
1888 ovlcode='.'; //non-uniform ovlcode
1889 //break;
1890 }
1891 } //while elements in chain
1892 */
1893
1894 }//chain check
1895 //if (ovlcode=='p') ref=NULL; //ignore polymerase runs?
1896 if (ovlcode==0 || ovlcode=='-') ovlcode = (ref==NULL) ? 'u' : '.';
1897 //-- print columns 1 and 2 as LOCUS_ID and TCONS_ID
1898 //bool chainHead=(qtdata->eqnext!=NULL && ((qtdata->eqdata & EQHEAD_TAG)!=0));
1899 bool chainHead=qtdata->isEqHead();
1900 //bool noChain=((qtdata->eqdata & EQCHAIN_TAGMASK)==0);
1901 bool noChain=(eqchain==NULL);
1902 if (chainHead || noChain) {
1903 cnum++;
1904 if (ft!=NULL) fprintf(ft,"%s_%08d\t",cprefix,cnum);
1905 GXLocus* xloc=qtdata->locus->xlocus;
1906 if (xloc!=NULL) {
1907 if (ft!=NULL) fprintf(ft, "XLOC_%06d\t",xloc->id);
1908 if (tcons->exons.Count()>1) {
1909 //! only multi-exon mRNAs are counted for multi-transcript xloci !
1910 xloc->num_mtcons++;
1911 if (xloc->num_mtcons==2)
1912 total_xloci_alt++;
1913 }
1914 }
1915 else {
1916 //should NEVER happen!
1917 int fidx=qtdata->qset;
1918 GError("Error: no XLocus created for transcript %s (file %s) [%d, %d], on %s%c:%d-%d\n", qt.getID(),
1919 qryfiles[qtdata->locus->qfidx]->chars(), qtdata->locus->qfidx, fidx, qt.getGSeqName(), qt.strand, qt.start, qt.end);
1920 }
1921 addXCons(xloc, ref, ovlcode, tcons, eqchain);
1922 } // if chain head or uniq entry (not part of a chain)
1923 if (ft==NULL) continue;
1924 if (chainHead) {
1925 //this is the start of a equivalence class as a printing chain
1926 if (ref!=NULL) fprintf(ft,"%s|%s\t%c", getGeneID(ref),ref->getID(), ovlcode);
1927 else fprintf(ft,"-\t%c", ovlcode);
1928 GffObj* m=mrnas[i];
1929 CTData* mdata=(CTData*)m->uptr;
1930
1931 int lastpq=-1;
1932 /*
1933 for (int ptab=mdata->qset-lastpq; ptab>0;ptab--)
1934 if (ptab>1) fprintf(ft,"\t-");
1935 else fprintf(ft,"\t");
1936 lastpq=mdata->qset;
1937 fprintf(ft,"q%d:%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|%d", lastpq+1, getGeneID(m), m->getID(),
1938 iround(m->gscore/10), mdata->FPKM, mdata->conf_lo, mdata->conf_hi, mdata->cov, m->covlen);
1939 //traverse linked list of matching transcripts
1940 while (mdata->eqnext!=NULL) {
1941 m=mdata->eqnext;
1942 mdata=(CTData*)m->uptr;
1943 for (int ptab=mdata->qset-lastpq;ptab>0;ptab--)
1944 if (ptab>1) fprintf(ft,"\t-");
1945 else fprintf(ft,"\t");
1946 lastpq = mdata->qset;
1947 fprintf(ft,"q%d:%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|%d", lastpq+1, getGeneID(m), m->getID(),
1948 iround(m->gscore/10), mdata->FPKM,mdata->conf_lo,mdata->conf_hi,mdata->cov, m->covlen);
1949 } //traverse and print row
1950 */
1951 eqchain->setUnique(false);
1952 eqchain->setSorted((GCompareProc*) cmpTData_qset);
1953
1954 for (int k=0;k<eqchain->Count();k++) {
1955 m=eqchain->Get(k);
1956 mdata=(CTData*)m->uptr;
1957 if (mdata->qset==lastpq) {
1958 //shouldn't happen, unless this input set is messed up (has duplicates/redundant transfrags)
1959 fprintf(ft,",%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|%d", getGeneID(m), m->getID(),
1960 iround(m->gscore/10), mdata->FPKM,mdata->conf_lo,mdata->conf_hi,mdata->cov, m->covlen);
1961 continue;
1962 }
1963 for (int ptab=mdata->qset-lastpq;ptab>0;ptab--)
1964 if (ptab>1) fprintf(ft,"\t-");
1965 else fprintf(ft,"\t");
1966 lastpq = mdata->qset;
1967 fprintf(ft,"q%d:%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|%d", lastpq+1, getGeneID(m), m->getID(),
1968 iround(m->gscore/10), mdata->FPKM,mdata->conf_lo,mdata->conf_hi,mdata->cov, m->covlen);
1969 }
1970 for (int ptab=qcount-lastpq-1;ptab>0;ptab--)
1971 fprintf(ft,"\t-");
1972 fprintf(ft,"\n");
1973 continue;
1974 } //start of eq class (printing chain)
1975
1976 if (eqchain!=NULL) continue; //part of a matching chain, dealt with previously
1977
1978 //--------- not in an ichain-matching class, print as singleton
1979
1980 if (ref!=NULL) fprintf(ft,"%s|%s\t%c",getGeneID(ref), ref->getID(), ovlcode);
1981 else fprintf(ft,"-\t%c",ovlcode);
1982 for (int ptab=qfidx;ptab>=0;ptab--)
1983 if (ptab>0) fprintf(ft,"\t-");
1984 else fprintf(ft,"\t");
1985 fprintf(ft,"q%d:%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|-",qfidx+1, getGeneID(qt), qt.getID(),iround(qt.gscore/10),
1986 qtdata->FPKM, qtdata->conf_lo,qtdata->conf_hi,qtdata->cov);
1987 for (int ptab=qcount-qfidx-1;ptab>0;ptab--)
1988 fprintf(ft,"\t-");
1989 fprintf(ft,"\n");
1990 } //for each transcript
1991 }
1992
1993
1994 void findTRMatch(GTrackLocus& loctrack, int qcount, GLocus& rloc) {
1995 //requires loctrack to be already populated with overlapping qloci by findTMatches()
1996 // which also found (and tagged) all matching qry transcripts
1997 for (int q=0;q<qcount;q++) { //for each qry dataset
1998 if (loctrack[q]==NULL) continue;
1999 for (int qi=0;qi<loctrack[q]->Count();qi++) { // for each transcript in q dataset
2000 //if (loctrack[q]->cl[qi]->exons.Count()<2) continue; //skip single-exon transcripts
2001 GffObj& qt=*(loctrack[q]->Get(qi));
2002 CTData* qtdata=(CTData*)qt.uptr;
2003 GffObj* rmatch=NULL; //== ref match for this row
2004 int rovlen=0;
2005 //if (qtdata->eqnext!=NULL && ((qtdata->eqdata & EQHEAD_TAG)!=0)) {
2006 if (qtdata->isEqHead()) {
2007 //EQ chain head -- transfrag equivalency list starts here
2008 if (qtdata->eqref==NULL) { //find rloc overlap
2009 if (qt.overlap(rloc.start, rloc.end)) {
2010 rmatch=findRefMatch(qt, rloc, rovlen);
2011 }
2012 } else rmatch=qtdata->eqref;
2013 if (rmatch!=NULL) {
2014 /*
2015 GffObj* m=loctrack[q]->Get(qi);
2016 //traverse linked list of matching transcripts
2017 while (((CTData*)m->uptr)->eqnext!=NULL) {
2018 m=((CTData*)m->uptr)->eqnext;
2019 if (rmatch!=NULL) {
2020 ((CTData*)m->uptr)->addOvl('=',rmatch,rovlen);
2021 }
2022 } //traverse qry data sets
2023 continue;
2024 }
2025 */
2026 for (int k=0;k<qtdata->eqlist->Count();k++) {
2027 GffObj* m=qtdata->eqlist->Get(k);
2028 ((CTData*)m->uptr)->addOvl('=',rmatch,rovlen);
2029 continue;
2030 }
2031 }
2032 //if (rmatch!=NULL) continue;
2033 } //equivalence class (chain of intron-matching)
2034 //if ((qtdata->eqdata & EQCHAIN_TAGMASK)!=0) continue; //part of a matching chain, dealt with previously
2035
2036 //--------- qry mrna not in a '=' matching clique
2037 if (qtdata->eqref==NULL) { //find any rloc overlap
2038 if (qt.overlap(rloc.start, rloc.end)) {
2039 rmatch=findRefMatch(qt, rloc, rovlen);
2040 if (rmatch==NULL) {
2041 //not an ichain match, look for other codes
2042 GffObj* rovl=NULL;
2043 int rovlen=0;
2044 //char ovlcode=
2045 getRefOvl(qt, rloc,rovl,rovlen);
2046 }
2047 }
2048 }
2049 else rmatch=qtdata->eqref;
2050 } //for each qry transcript
2051 }//for each qry dataset
2052 }
2053
2054
2055 bool inPolyRun(char strand, GffObj& m, GList<GLocus>* rloci, int& rlocidx) {
2056 //we are only here if there is no actual overlap between m and any locus in rloci
2057 if (rloci==NULL || rloci->Count()==0) return false; // || m.exons.Count()>1
2058 if (strand=='-') {
2059 rlocidx=qsearch_loci(m.end, *rloci);
2060 //returns index of locus starting just ABOVE m.end
2061 // or -1 if last locus start <= m.end
2062 GLocus* rloc=NULL;
2063 if (rlocidx<0) return false;
2064 while (rlocidx<rloci->Count()) {
2065 rloc=rloci->Get(rlocidx);
2066 if (rloc->start>m.end+polyrun_range) break;
2067 if (rloc->start+6>m.end) return true;
2068 rlocidx++;
2069 }
2070 }
2071 else { // strand == '+' (or '.' ?)
2072 rlocidx=qsearch_loci(m.end, *rloci);
2073 GLocus* rloc=NULL;
2074 //returns index of closest locus starting ABOVE m.end
2075 // or -1 if last locus start <= m.end
2076 if (rlocidx<0) rlocidx=rloci->Count(); //this may actually start below m.end
2077 while ((--rlocidx)>=0) {
2078 rloc=rloci->Get(rlocidx);
2079 if (m.start>rloc->start+GFF_MAX_LOCUS) break;
2080 if (m.start+6>rloc->end && m.start<rloc->end+polyrun_range) return true;
2081 }
2082 }
2083 return false;
2084 }
2085
2086 CTData* getBestOvl(GffObj& m) {
2087 //CTData* mdata=(CTData*)m.uptr;
2088 //return mdata->getBestCode();
2089 if ( ((CTData*)m.uptr)->ovls.Count()>0)
2090 return (CTData*)m.uptr;
2091 return NULL;
2092 }
2093
2094 void reclass_XStrand(GList<GffObj>& mrnas, GList<GLocus>* rloci) {
2095 //checking for relationship with ref transcripts on opposite strand
2096 if (rloci==NULL || rloci->Count()<1) return;
2097 int j=0;//current rloci index
2098 for (int i=0;i<mrnas.Count();i++) {
2099 GffObj& m=*mrnas[i];
2100 char ovlcode=((CTData*)m.uptr)->getBestCode();
2101 if (ovlcode>47 && strchr("=cjeo",ovlcode)!=NULL) continue;
2102 GLocus* rloc=rloci->Get(j);
2103 if (rloc->start>m.end) continue; //check next transfrag
2104 while (m.start>rloc->end && j+1<rloci->Count()) {
2105 j++;
2106 rloc=rloci->Get(j);
2107 }
2108 if (rloc->start>m.end) continue; //check next transfrag
2109 //m overlaps rloc:
2110 //check if m has a fuzzy intron overlap -> 's' (shadow, mapped on the wrong strand)
2111 // then if m is contained within an intron -> 'i'
2112 // otherwise it's just a plain cross-strand overlap: 'x'
2113 int jm=0;
2114 do { //while rloci overlap this transfrag (m)
2115 rloc=rloci->Get(j+jm);
2116 bool is_shadow=false;
2117 GffObj* sovl=NULL;
2118 bool is_intraintron=false;
2119 GffObj* iovl=NULL;
2120 if (rloc->introns.Count()>0) {
2121 for (int n=0;n<rloc->introns.Count();n++) {
2122 GISeg& rintron=rloc->introns[n];
2123 if (rintron.start>m.end) break;
2124 if (m.start>rintron.end) continue;
2125 //overlap between m and intron
2126 if (m.end<=rintron.end && m.start>=rintron.start) {
2127 is_intraintron=true;
2128 if (iovl==NULL || iovl->covlen<rintron.t->covlen) iovl=rintron.t;
2129 continue;
2130 }
2131 //check if any intron of m has a fuzz-match with rintron
2132 for (int e=1;e<m.exons.Count();e++) {
2133 GSeg mintron(m.exons[e-1]->end+1,m.exons[e]->start-1);
2134 if (rintron.coordMatch(&mintron,10)) {
2135 is_shadow=true;
2136 if (sovl==NULL || sovl->covlen<rintron.t->covlen) sovl=rintron.t;
2137 break;
2138 }
2139 } //for each m intron
2140 } //for each intron of rloc
2141 }//rloc has introns
2142 bool xcode=true;
2143 if (is_shadow) { ((CTData*)m.uptr)->addOvl('s', sovl); xcode=false; }
2144 // else
2145 if (ovlcode!='i' && is_intraintron) { ((CTData*)m.uptr)->addOvl('i', iovl); xcode=false; }
2146 if (xcode) {
2147 // just plain overlap, find the overlapping mrna in rloc
2148 GffObj* maxovl=NULL;
2149 int ovlen=0;
2150 GffObj* max_lovl=NULL; //max len ref transcript
2151 // having no exon overlap but simply range overlap (interleaved exons)
2152 for (int ri=0;ri<rloc->mrnas.Count();ri++) {
2153 if (!m.overlap(*(rloc->mrnas[ri]))) continue;
2154 int o=m.exonOverlapLen(*(rloc->mrnas[ri]));
2155 if (o>0) {
2156 if (o>ovlen) {
2157 ovlen=o;
2158 maxovl=rloc->mrnas[ri];
2159 }
2160 }
2161 else { //no exon overlap, but still overlapping (interleaved exons)
2162 if (max_lovl==NULL || max_lovl->covlen<rloc->mrnas[ri]->covlen)
2163 max_lovl=rloc->mrnas[ri];
2164 }
2165 }
2166 if (maxovl) ((CTData*)m.uptr)->addOvl('x',maxovl);
2167 else if (max_lovl) ((CTData*)m.uptr)->addOvl('x',max_lovl);
2168 } //'x'
2169 jm++;
2170 } while (j+jm<rloci->Count() && rloci->Get(j+jm)->overlap(m));
2171 } //for each transfrag
2172 }
2173
2174 void reclass_mRNAs(char strand, GList<GffObj>& mrnas, GList<GLocus>* rloci, GFaSeqGet *faseq) {
2175 int rlocidx=-1;
2176 for (int i=0;i<mrnas.Count();i++) {
2177 GffObj& m=*mrnas[i];
2178 char ovlcode=((CTData*)m.uptr)->getBestCode();
2179 //if (ovlcode=='u' || ovlcode=='i' || ovlcode==0) {
2180 if (ovlcode=='u' || ovlcode<47) {
2181 //check for overlaps with ref transcripts on the other strand
2182 if (m.exons.Count()==1 && inPolyRun(strand, m, rloci, rlocidx)) {
2183 ((CTData*)m.uptr)->addOvl('p',rloci->Get(rlocidx)->mrna_maxcov);
2184 }
2185 else { //check for repeat content
2186 if (faseq!=NULL) {
2187 int seqlen;
2188 char* seq=m.getSpliced(faseq, false, &seqlen);
2189 //get percentage of lowercase
2190 int numlc=0;
2191 for (int c=0;c<seqlen;c++) if (seq[c]>='a') numlc++;
2192 if (numlc > seqlen/2)
2193 ((CTData*)m.uptr)->addOvl('r');
2194 GFREE(seq);
2195 }
2196 }
2197 } //for unassigned class
2198 }//for each mrna
2199
2200 }
2201
2202 void reclassLoci(char strand, GList<GLocus>& qloci, GList<GLocus>* rloci, GFaSeqGet *faseq) {
2203 for (int ql=0;ql<qloci.Count();ql++) {
2204 reclass_mRNAs(strand, qloci[ql]->mrnas, rloci, faseq);
2205 //find closest upstream ref locus for this q locus
2206 } //for each locus
2207 }
2208
2209 //for a single genomic sequence, all qry data and ref data is stored in gtrack
2210 //check for all 'u' transfrags if they are repeat ('r') or polymerase run 'p' or anything else
2211 void umrnaReclass(int qcount, GSeqTrack& gtrack, FILE** ftr, GFaSeqGet* faseq=NULL) {
2212 for (int q=0;q<qcount;q++) {
2213 if (gtrack.qdata[q]==NULL) continue; //no transcripts in this q dataset for this genomic seq
2214 reclassLoci('+', gtrack.qdata[q]->loci_f, gtrack.rloci_f, faseq);
2215 reclassLoci('-', gtrack.qdata[q]->loci_r, gtrack.rloci_r, faseq);
2216 reclass_mRNAs('+', gtrack.qdata[q]->umrnas, gtrack.rloci_f, faseq);
2217 reclass_mRNAs('-', gtrack.qdata[q]->umrnas, gtrack.rloci_r, faseq);
2218 //and also check for special cases with cross-strand overlaps:
2219 reclass_XStrand(gtrack.qdata[q]->mrnas_f, gtrack.rloci_r);
2220 reclass_XStrand(gtrack.qdata[q]->mrnas_r, gtrack.rloci_f);
2221 // print all tmap data here here:
2222 for (int i=0;i<gtrack.qdata[q]->tdata.Count();i++) {
2223 CTData* mdata=gtrack.qdata[q]->tdata[i];
2224 if (mdata->mrna==NULL) continue; //invalidated -- removed earlier
2225 //GLocus* rlocus=NULL;
2226 mdata->classcode='u';
2227 GffObj* ref=NULL;
2228 if (mdata->ovls.Count()>0) {
2229 mdata->classcode=mdata->ovls[0]->code;
2230 ref=mdata->ovls[0]->mrna;
2231 }
2232 //if (mdata->classcode<33) mdata->classcode='u';
2233 if (mdata->classcode<47) mdata->classcode='u'; // if 0, '-' or '.'
2234 if (tmapFiles) {
2235 char ref_match_len[2048];
2236 if (ref!=NULL) {
2237 sprintf(ref_match_len, "%d",ref->covlen);
2238 fprintf(ftr[q],"%s\t%s\t",getGeneID(ref),ref->getID());
2239 //rlocus=((CTData*)(ref->uptr))->locus;
2240 }
2241 else {
2242 fprintf(ftr[q],"-\t-\t");
2243 strcpy(ref_match_len, "-");
2244 }
2245 //fprintf(ftr[q],"%c\t%s\t%d\t%8.6f\t%8.6f\t%d\n", ovlcode, mdata->mrna->getID(),
2246 // iround(mdata->mrna->gscore/10), mdata->FPKM, mdata->cov, mdata->mrna->covlen);
2247 const char* mlocname = (mdata->locus!=NULL) ? mdata->locus->mrna_maxcov->getID() : mdata->mrna->getID();
2248 fprintf(ftr[q],"%c\t%s\t%s\t%d\t%8.6f\t%8.6f\t%8.6f\t%8.6f\t%d\t%s\t%s\n", mdata->classcode, getGeneID(mdata->mrna), mdata->mrna->getID(),
2249 iround(mdata->mrna->gscore/10), mdata->FPKM, mdata->conf_lo,mdata->conf_hi, mdata->cov, mdata->mrna->covlen, mlocname, ref_match_len);
2250 }
2251 } //for each tdata
2252 } //for each qdata
2253 }
2254
2255 void buildXLoci(GTrackLocus& loctrack, int qcount, GSeqTrack& gtrack, char strand,
2256 GList<GXLocus>* retxloci=NULL) {
2257 GList<GXLocus>* dest_xloci=NULL;
2258 GList<GXLocus> tmpxloci(true,false,true); //local set of newly created xloci
2259 GList<GXLocus>* xloci=&tmpxloci;
2260 if (strand=='+') {
2261 dest_xloci=& gtrack.xloci_f;
2262 }
2263 else if (strand=='-') {
2264 dest_xloci = & gtrack.xloci_r;
2265 }
2266 else dest_xloci= & gtrack.xloci_u;
2267
2268 if (retxloci==NULL) {
2269 //if no return set of build xloci was given
2270 //take it as a directive to work directly on the global xloci
2271 xloci=dest_xloci;
2272 dest_xloci=NULL;
2273 }
2274 for (int q=-1;q<qcount;q++) {
2275 GList<GLocus>* wrkloci=NULL;
2276 if (q<0) {
2277 if (loctrack.rloci.Count()==0) continue;
2278 //loci=new GList<GLocus>(true,false,false);
2279 //loci->Add(loctrack.rloc);
2280 wrkloci = &(loctrack.rloci);
2281 }
2282 else {
2283 if (loctrack[q]==NULL) continue;
2284 wrkloci = &(loctrack[q]->qloci);
2285 }
2286
2287 for (int t=0;t<wrkloci->Count();t++) {
2288 GLocus* loc=wrkloci->Get(t);
2289 int xfound=0; //count of parent xloci
2290 if (loc->xlocus!=NULL) continue; //already assigned a superlocus
2291 GArray<int> mrgxloci(true);
2292 for (int xl=0;xl<xloci->Count();xl++) {
2293 GXLocus& xloc=*(xloci->Get(xl));
2294 if (xloc.start>loc->end) {
2295 if (xloc.start-loc->end > GFF_MAX_LOCUS) break;
2296 continue;
2297 }
2298 if (loc->start>xloc.end) continue;
2299 if (xloc.add_Locus(loc)) {
2300 xfound++;
2301 mrgxloci.Add(xl);
2302 }
2303 } //for each existing Xlocus
2304 if (xfound==0) {
2305 xloci->Add(new GXLocus(loc));
2306 }
2307 else {
2308 int il=mrgxloci[0];
2309 GXLocus& xloc=*(xloci->Get(il));
2310 if (xfound>1) {
2311 for (int l=1;l<xfound;l++) {
2312 int mlidx=mrgxloci[l]-l+1;
2313 xloc.addMerge(*(xloci->Get(mlidx)));
2314 GXLocus* ldel=xloci->Get(mlidx);
2315 xloci->Delete(mlidx);
2316 if (retxloci!=NULL)
2317 delete ldel;
2318 }
2319 }
2320 //in case xloc.start was decreased, bubble-down until it's in the proper order
2321 while (il>0 && xloc<*(xloci->Get(il-1))) {
2322 il--;
2323 xloci->Swap(il,il+1);
2324 }
2325 } //at least one locus is being merged
2326 }//for each locus
2327 }//for each set of loci in the region (refs and each qry set)
2328 //-- add xloci to the global set of xloci unless retxloci was given,
2329 if (retxloci!=NULL) retxloci->Add(*xloci);
2330 else dest_xloci->Add(*xloci);
2331 }
2332
2333 void singleQData(GList<GLocus>& qloci, GList<GTrackLocus>& loctracks) {
2334 for (int i=0;i<qloci.Count();i++) {
2335 if (qloci[i]->t_ptr==NULL) {
2336 GTrackLocus* tloc=new GTrackLocus();
2337 tloc->addQLocus(qloci[i],0);
2338 loctracks.Add(tloc);
2339 }
2340 }
2341 }
2342
2343 void recheckUmrnas(GSeqData* gseqdata, GList<GffObj>& mrnas,
2344 GList<GLocus>& loci, GList<GLocus>& nloci, GList<GLocus>& oloci) {
2345 GList<GLocus> reassignedLocs(false,false);
2346 for (int u=0;u<gseqdata->umrnas.Count();u++) {
2347 for (int l=0;l<oloci.Count();l++) {
2348 if (gseqdata->umrnas[u]==NULL) break;
2349 if (gseqdata->umrnas[u]->end<oloci[l]->start) break; //try next umrna
2350 if (oloci[l]->end<gseqdata->umrnas[u]->start) continue; //try next locus
2351 if (gseqdata->umrnas[u]->strand=='+' || gseqdata->umrnas[u]->strand=='-') {
2352 gseqdata->umrnas.Forget(u);
2353 continue; //already reassigned earlier
2354 }
2355 //umrna overlaps locus region
2356 GffObj* umrna=gseqdata->umrnas[u];
2357 for (int m=0;m<oloci[l]->mrnas.Count();m++) {
2358 if (oloci[l]->mrnas[m]->exonOverlap(umrna)) {
2359 gseqdata->umrnas.Forget(u);
2360 CTData* umdata=((CTData*)umrna->uptr);
2361 //must be in a Loci anyway
2362 if (umdata==NULL || umdata->locus==NULL)
2363 GError("Error: no locus pointer for umrna %s!\n",umrna->getID());
2364 for (int i=0;i<umdata->locus->mrnas.Count();i++) {
2365 GffObj* um=umdata->locus->mrnas[i];
2366 um->strand=oloci[l]->mrnas[m]->strand;
2367 }
2368 reassignedLocs.Add(umdata->locus);
2369 break;
2370 }
2371 } //for each mrna in locus
2372 } //for each locus
2373 } //for each umrna
2374 if (reassignedLocs.Count()>0) {
2375 gseqdata->umrnas.Pack();
2376 gseqdata->nloci_u.setFreeItem(false);
2377 for (int i=0;i<reassignedLocs.Count();i++) {
2378 GLocus* loc=reassignedLocs[i];
2379 for (int m=0;m<loc->mrnas.Count();m++) {
2380 mrnas.Add(loc->mrnas[m]);
2381 }
2382 loci.Add(loc);
2383 nloci.Add(loc);
2384 gseqdata->nloci_u.Remove(loc);
2385 }
2386 gseqdata->nloci_u.setFreeItem(true);
2387 }
2388 }
2389
2390 void umrnasXStrand(GList<GXLocus>& xloci, GSeqTrack& gtrack) {
2391 //try to determine the strand of unoriented transfrags based on possible overlaps
2392 //with other, oriented transfrags
2393 for (int x=0;x<xloci.Count();x++) {
2394 if (xloci[x]->strand=='.') continue;
2395 if (xloci[x]->qloci.Count()==0) continue;
2396 //go through all qloci in this xlocus
2397 for (int l = 0; l < xloci[x]->qloci.Count(); l++) {
2398 char locstrand=xloci[x]->qloci[l]->mrna_maxcov->strand;
2399 if (locstrand=='.') {
2400 //this is a umrna cluster
2401 GLocus* qloc=xloci[x]->qloci[l];
2402 //we don't really need to update loci lists (loci_f, nloci_f etc.)
2403 /*
2404 if (xloci[x]->strand=='+') {
2405 }
2406 else { // - strand
2407 }
2408 */
2409 for (int i=0;i<qloc->mrnas.Count();i++) {
2410 qloc->mrnas[i]->strand=xloci[x]->strand;
2411 int uidx=gtrack.qdata[qloc->qfidx]->umrnas.IndexOf(qloc->mrnas[i]);
2412 if (uidx>=0) {
2413 gtrack.qdata[qloc->qfidx]->umrnas.Forget(uidx);
2414 gtrack.qdata[qloc->qfidx]->umrnas.Delete(uidx);
2415 if (xloci[x]->strand=='+')
2416 gtrack.qdata[qloc->qfidx]->mrnas_f.Add(qloc->mrnas[i]);
2417 else
2418 gtrack.qdata[qloc->qfidx]->mrnas_r.Add(qloc->mrnas[i]);
2419 }
2420 }
2421 } //unknown strand
2422 } //for each xloci[x].qloci (l)
2423
2424 } //for each xloci (x)
2425 }
2426
2427 //cluster loci across all datasets
2428 void xclusterLoci(int qcount, char strand, GSeqTrack& gtrack) {
2429 //gtrack holds data for all input qry datasets for a chromosome/contig
2430 //cluster QLoci
2431 GList<GTrackLocus> loctracks(true,true,false);
2432 //all vs all clustering across all qry data sets + ref
2433 //one-strand set of loci from all datasets + ref loci
2434 GList<GLocus>* wrkloci=NULL;
2435 //build xloci without references first
2436 //then add references only if they overlap an existing xloci
2437
2438 int nq=0;
2439 for (int q=0;q<=qcount+1;q++) {
2440 bool refcheck=false;
2441 if (q==qcount) { // check the unoriented loci for each query file
2442 while (nq<qcount &&
2443 (gtrack.qdata[nq]==NULL || gtrack.qdata[nq]->nloci_u.Count()==0))
2444 nq++; //skip query files with no unoriented loci
2445 if (nq<qcount) {
2446 wrkloci=&(gtrack.qdata[nq]->nloci_u);
2447 nq++;
2448 if (nq<qcount) q--; //so we can fetch the next nq in the next q cycle
2449 }
2450 else continue; //no more q files with unoriented loci
2451 }
2452 else if (q==qcount+1) { // check the reference loci
2453 if (strand=='+') wrkloci=gtrack.rloci_f;
2454 else wrkloci=gtrack.rloci_r;
2455
2456 if (wrkloci==NULL) break; //no ref loci here
2457 refcheck=true;
2458 }
2459 else {
2460 if (gtrack.qdata[q]==NULL) continue;
2461 if (strand=='+') wrkloci=&(gtrack.qdata[q]->loci_f);
2462 else wrkloci=&(gtrack.qdata[q]->loci_r);
2463 }
2464 // now do the all-vs-all clustering thing:
2465 for (int t=0;t<wrkloci->Count();t++) {
2466 GLocus* loc=wrkloci->Get(t);
2467 int xfound=0; //count of parent loctracks
2468 if (loc->t_ptr!=NULL) continue; //already assigned a loctrack
2469 GArray<int> mrgloctracks(true);
2470 for (int xl=0;xl<loctracks.Count();xl++) {
2471 GTrackLocus& trackloc=*loctracks[xl];
2472 if (trackloc.start>loc->end) break;
2473 if (loc->start>trackloc.end) continue;
2474 if (trackloc.add_Locus(loc)) {
2475 xfound++;
2476 mrgloctracks.Add(xl);
2477 }
2478 } //for each existing Xlocus
2479 if (xfound==0) {
2480 if (!refcheck) //we really don't care about ref-only clusters
2481 loctracks.Add(new GTrackLocus(loc));
2482 }
2483 else {
2484 int il=mrgloctracks[0];
2485 GTrackLocus& tloc=*(loctracks.Get(il));
2486 if (xfound>1) {
2487 for (int l=1;l<xfound;l++) {
2488 int mlidx=mrgloctracks[l]-l+1;
2489 tloc.addMerge(loctracks[mlidx], qcount, loc);
2490 loctracks.Delete(mlidx);
2491 }
2492 }
2493 //in case tloc.start was decreased, bubble-down 'til it's in the proper place
2494 while (il>0 && tloc<*(loctracks[il-1])) {
2495 il--;
2496 loctracks.Swap(il,il+1);
2497 }
2498 } //at least one locus found
2499 }//for each wrklocus
2500 } //for each set of loci (q)
2501 //loctracks is now set with all x-clusters on this strand
2502 for (int i=0;i<loctracks.Count();i++) {
2503 if (!loctracks[i]->hasQloci) continue; //we really don't care here about reference-only clusters
2504 GTrackLocus& loctrack=*loctracks[i];
2505 findTMatches(loctrack, qcount); //find matching transfrags in this xcluster
2506 for (int rl=0; rl < loctrack.rloci.Count(); rl++) {
2507 findTRMatch(loctrack, qcount, *(loctrack.rloci[rl]));
2508 //find matching reference annotation for this xcluster and assign class codes to transfrags
2509 }
2510 GList<GXLocus> xloci(false,false,false);
2511 buildXLoci(loctrack, qcount, gtrack, strand, &xloci);
2512 //the newly created xloci are in xloci
2513 umrnasXStrand(xloci, gtrack);
2514 //also merge these xloci into the global list of xloci
2515 for (int l=0; l < xloci.Count(); l++) {
2516 if (xloci[l]->strand=='+') {
2517 gtrack.xloci_f.Add(xloci[l]);
2518 }
2519 else if (xloci[l]->strand=='-') {
2520 gtrack.xloci_r.Add(xloci[l]);
2521 }
2522 else gtrack.xloci_u.Add(xloci[l]);
2523 }
2524 }//for each xcluster
2525 }
2526
2527
2528 void printRefMap(FILE** frs, int qcount, GList<GLocus>* rloci) {
2529 if (rloci==NULL) return;
2530
2531 for (int l=0;l<rloci->Count(); l++) {
2532 for (int r=0;r<rloci->Get(l)->mrnas.Count(); r++) {
2533 GffObj& ref = *(rloci->Get(l)->mrnas[r]);
2534 CTData* refdata = ((CTData*)ref.uptr);
2535 GStr* clist = new GStr[qcount];
2536 GStr* eqlist = new GStr[qcount];
2537 for (int i = 0; i<refdata->ovls.Count(); i++) {
2538 GffObj* m=refdata->ovls[i]->mrna;
2539 char ovlcode=refdata->ovls[i]->code;
2540 if (m==NULL) {
2541 GMessage("Warning: NULL mRNA found for ref %s with ovlcode '%c'\n",
2542 ref.getID(), refdata->ovls[i]->code);
2543 continue;
2544 }
2545 int qfidx = ((CTData*)m->uptr)->qset;
2546 if (ovlcode == '=') {
2547 eqlist[qfidx].append(getGeneID(m));
2548 eqlist[qfidx].append('|');
2549 eqlist[qfidx].append(m->getID());
2550 eqlist[qfidx].append(',');
2551 }
2552 else if (ovlcode == 'c') {
2553 clist[qfidx].append(getGeneID(m));
2554 clist[qfidx].append('|');
2555 clist[qfidx].append(m->getID());
2556 clist[qfidx].append(',');
2557 }
2558 }//for each reference overlap
2559 for (int q=0;q<qcount;q++) {
2560 if (!eqlist[q].is_empty()) {
2561 eqlist[q].trimR(',');
2562 fprintf(frs[q],"%s\t%s\t=\t%s\n", getGeneID(ref), ref.getID(),eqlist[q].chars());
2563 }
2564 if (!clist[q].is_empty()) {
2565 clist[q].trimR(',');
2566 fprintf(frs[q],"%s\t%s\tc\t%s\n",getGeneID(ref), ref.getID(),clist[q].chars());
2567 }
2568 }
2569 delete[] clist;
2570 delete[] eqlist;
2571 }// ref loop
2572 }//ref locus loop
2573 }
2574
2575
2576
2577 void trackGData(int qcount, GList<GSeqTrack>& gtracks, GStr& fbasename, FILE** ftr, FILE** frs) {
2578 FILE* f_ltrack=NULL;
2579 FILE* f_itrack=NULL;
2580 FILE* f_ctrack=NULL;
2581 FILE* f_xloci=NULL;
2582 int cnum=0; //consensus numbering for printITrack()
2583 GStr s=fbasename;
2584 //if (qcount>1 || generic_GFF) { //doesn't make much sense for only 1 query file
2585 s.append(".tracking");
2586 f_itrack=fopen(s.chars(),"w");
2587 if (f_itrack==NULL) GError("Error creating file %s !\n",s.chars());
2588 // }
2589 s=fbasename;
2590 s.append(".combined.gtf");
2591 f_ctrack=fopen(s.chars(),"w");
2592 if (f_ctrack==NULL) GError("Error creating file %s !\n",s.chars());
2593
2594 s=fbasename;
2595 s.append(".loci");
2596 f_xloci=fopen(s.chars(),"w");
2597 if (f_xloci==NULL) GError("Error creating file %s !\n",s.chars());
2598 for (int g=0;g<gtracks.Count();g++) { //for each genomic sequence
2599 GSeqTrack& gseqtrack=*gtracks[g];
2600
2601 xclusterLoci(qcount, '+', gseqtrack);
2602 xclusterLoci(qcount, '-', gseqtrack);
2603
2604 //count XLoci, setting their id
2605 numXLoci(gseqtrack.xloci_f, xlocnum);
2606 numXLoci(gseqtrack.xloci_r, xlocnum);
2607 numXLoci(gseqtrack.xloci_u, xlocnum);
2608 //transcript accounting: for all those transcripts with 'u' or 0 class code
2609 // we have to check for polymerase runs 'p' or repeats 'r'
2610
2611 GFaSeqGet *faseq=gfasta.fetch(gseqtrack.get_gseqid(), checkFasta);
2612
2613 umrnaReclass(qcount, gseqtrack, ftr, faseq);
2614
2615 // print transcript tracking (ichain_tracking)
2616 //if (qcount>1)
2617 for (int q=0;q<qcount;q++) {
2618 if (gseqtrack.qdata[q]==NULL) continue;
2619 printITrack(f_itrack, gseqtrack.qdata[q]->mrnas_f, qcount, cnum);
2620 printITrack(f_itrack, gseqtrack.qdata[q]->mrnas_r, qcount, cnum);
2621 //just for the sake of completion:
2622 printITrack(f_itrack, gseqtrack.qdata[q]->umrnas, qcount, cnum);
2623 }
2624 //print XLoci and XConsensi within each xlocus
2625 //also TSS clustering and protein ID assignment for XConsensi
2626 printXLoci(f_xloci, f_ctrack, qcount, gseqtrack.xloci_f, faseq);
2627 printXLoci(f_xloci, f_ctrack, qcount, gseqtrack.xloci_r, faseq);
2628 printXLoci(f_xloci, f_ctrack, qcount, gseqtrack.xloci_u, faseq);
2629 if (tmapFiles && haveRefs) {
2630 printRefMap(frs, qcount, gseqtrack.rloci_f);
2631 printRefMap(frs, qcount, gseqtrack.rloci_r);
2632 }
2633 delete faseq;
2634 }
2635 if (tmapFiles) {
2636 for (int q=0;q<qcount;q++) {
2637 fclose(ftr[q]);
2638 if (haveRefs) fclose(frs[q]);
2639 }
2640 }
2641 if (f_ltrack!=NULL) fclose(f_ltrack);
2642 if (f_itrack!=NULL) fclose(f_itrack);
2643 if (f_ctrack!=NULL) fclose(f_ctrack);
2644 if (f_xloci!=NULL) fclose(f_xloci);
2645
2646 }

Properties

Name Value
svn:executable *