ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/cuffcompare/cuffcompare.cpp
Revision: 53
Committed: Thu Sep 8 02:43:56 2011 UTC (8 years, 1 month ago) by gpertea
File size: 97679 byte(s)
Log Message:
Line File contents
1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #else
4 #define PACKAGE_VERSION "INTERNAL"
5 #define SVN_REVISION "SVN"
6 #endif
7
8 #include "GArgs.h"
9 #include <ctype.h>
10 #include <errno.h>
11 #include "gtf_tracking.h"
12
13 #ifdef HAVE_CONFIG_H
14 #include "update_check.h"
15 #endif
16
17 #define USAGE "Usage:\n\
18 cuffcompare [-r <reference_mrna.gtf>] [-R] [-T] [-V] [-s <seq_path>] \n\
19 [-o <outprefix>] [-p <cprefix>] \n\
20 {-i <input_gtf_list> | <input1.gtf> [<input2.gtf> .. <inputN.gtf>]}\n\
21 \n\
22 Cuffcompare provides classification, reference annotation mapping and various\n\
23 statistics for Cufflinks transfrags.\n\
24 Cuffcompare clusters and tracks transfrags across multiple samples, writing\n\
25 matching transcripts (intron chains) into <outprefix>.tracking, and a GTF\n\
26 file <outprefix>.combined.gtf containing a nonredundant set of transcripts \n\
27 across all input files (with a single representative transfrag chosen\n\
28 for each clique of matching transfrags across samples).\n\
29 \n\
30 Options:\n\
31 -i provide a text file with a list of Cufflinks GTF files to process instead\n\
32 of expecting them as command line arguments (useful when a large number\n\
33 of GTF files should be processed)\n\
34 \n\
35 -r a set of known mRNAs to use as a reference for assessing \n\
36 the accuracy of mRNAs or gene models given in <input.gtf>\n\
37 \n\
38 -R for -r option, reduce the set of reference transcripts to \n\
39 only those found to overlap any of the input loci\n\
40 -M discard (ignore) single-exon transfrags and reference transcripts\n\
41 -N discard (ignore) single-exon reference transcripts\n\
42 \n\
43 -s <seq_path> can be a multi-fasta file with all the genomic sequences or \n\
44 a directory containing multiple single-fasta files (one file per contig);\n\
45 lower case bases will be used to classify input transcripts as repeats\n\
46 \n\
47 -d max distance (range) for grouping transcript start sites (100)\n\
48 -p the name prefix to use for consensus transcripts in the \n\
49 <outprefix>.combined.gtf file (default: 'TCONS')\n\
50 -C include the \"contained\" transcripts in the .combined.gtf file\n\
51 -G generic GFF input file(s) (do not assume Cufflinks GTF)\n\
52 -T do not generate .tmap and .refmap files for each input file\n\
53 -V verbose processing mode (showing all GFF parsing warnings)\n\
54 "
55 bool debug=false;
56 bool perContigStats=false; // -S to enable stats for every single contig
57 bool generic_GFF=false; //-G, don't assume Cufflinks GTF as input
58 bool showContained=false; // -C
59 bool reduceRefs=false;
60 bool checkFasta=false;
61 bool tmapFiles=true;
62 bool only_spliced_refs=false;
63 int debugCounter=0;
64
65 int polyrun_range=2000; //polymerase run range 2KB
66 double scoreThreshold=0;
67 double exprThreshold=0;
68 char* cprefix=NULL;
69 FILE* ffasta=NULL; //genomic seq file
70 FILE *f_ref=NULL; //reference mRNA GFF, if provided
71 FILE* f_in=NULL; //sequentially, each input GFF file
72 FILE* f_out=NULL; //stdout if not provided
73 GFastaHandler gfasta;
74 int xlocnum=0;
75 int tsscl_num=0; //for tss cluster IDs
76 int protcl_num=0; //for "unique" protein IDs within TSS clusters
77 int tssDist=100;
78 //int total_tcons=0;
79 int total_xloci_alt=0;
80
81 void openfwrite(FILE* &f, GArgs& args, char opt) {
82 GStr s=args.getOpt(opt);
83 if (!s.is_empty()) {
84 if (s=='-')
85 f=stdout;
86 else {
87 f=fopen(s,"w");
88 if (f==NULL) GError("Error creating file: %s\n", s.chars());
89 }
90 }
91 }
92
93 //-- structure to keep track of data from multiple qry input files for a single genomic seq
94 class GSeqTrack {
95 int gseq_id;
96 public:
97 const char* gseq_name;
98 GList<GLocus>* rloci_f; //reference loci for this genomic sequence
99 GList<GLocus>* rloci_r;
100 GList<GXLocus> xloci_f; // extended super-loci across all qry datasets
101 GList<GXLocus> xloci_r; // extended super-loci across all qry datasets
102 GList<GXLocus> xloci_u; // extended super-loci across all qry datasets
103 GSeqData* qdata[MAX_QFILES]; //fixed order array with GSeqData for each qry input
104 //element in array is NULL if a qry file has no transcripts on this genomic sequence
105 int get_gseqid() { return gseq_id; }
106 GSeqTrack(int gid=-1):xloci_f(true,true,false),
107 xloci_r(true,true,false), xloci_u(true,true,false) {
108 gseq_id=gid;
109 if (gseq_id>=0) {
110 gseq_name=GffObj::names->gseqs.getName(gseq_id);
111 }
112 rloci_f=NULL;
113 rloci_r=NULL;
114 for (int i=0;i<MAX_QFILES;i++) qdata[i]=NULL;
115 }
116 bool operator==(GSeqTrack& d){
117 return (gseq_id==d.gseq_id);
118 }
119 bool operator>(GSeqTrack& d){
120 return (gseq_id>d.gseq_id);
121 }
122 bool operator<(GSeqTrack& d){
123 return (gseq_id<d.gseq_id);
124 }
125 };
126
127 char* getFastaFile(int gseq_id);
128
129 // ref globals
130 bool haveRefs=false; //true if a reference annotation (-r) is provide
131
132 GList<GSeqData> ref_data(true,true,true); //list of reference mRNAs and loci data for each genomic seq
133 //each locus will keep track of any superloci which includes it, formed during the analysis
134
135 void processLoci(GSeqData& seqdata, GSeqData* refdata=NULL, int qfidx=0);
136
137 void reportStats(FILE* fout, const char* setname, GSuperLocus& stotal,
138 GSeqData* seqdata=NULL, GSeqData* refdata=NULL);
139
140 GSeqData* getQryData(int gid, GList<GSeqData>& qdata);
141 void trackGData(int qcount, GList<GSeqTrack>& gtracks, GStr& fbasename, FILE** ftr, FILE** frs);
142
143 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
144 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
145
146 FILE* f_mintr=NULL; //missed ref introns
147
148 bool multiexon_only=false;
149 bool multiexonrefs_only=false;
150
151 GHash<GStr> refdescr;
152 void loadRefDescr(const char* fname);
153
154 GList<GStr> qryfiles(false,true,false);
155
156 //list of GSeqTrack data, sorted by gseq_id
157 GList<GSeqTrack> gseqtracks(true,true,true);
158 GSeqTrack* findGSeqTrack(int gsid);
159
160
161 int cmpGTrackByName(const pointer p1, const pointer p2) {
162 return strcmp(((GSeqTrack*)p1)->gseq_name, ((GSeqTrack*)p2)->gseq_name);
163 }
164
165
166 void show_usage() {
167 GMessage("cuffcompare v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
168 GMessage( "-----------------------------\n");
169 GMessage("%s\n", USAGE);
170 }
171
172 int main(int argc, char * const argv[]) {
173 GArgs args(argc, argv, "XDTMNVGSCKRLhp:c:d:s:i:n:r:o:");
174 int e;
175 if ((e=args.isError())>0) {
176 show_usage();
177 GMessage("Invalid argument: %s\n", argv[e]);
178 exit(1);
179 }
180 if (args.getOpt('h')!=NULL){
181 show_usage();
182 exit(1);
183 }
184 showContained=(args.getOpt('C')!=NULL);
185 debug=(args.getOpt('D')!=NULL);
186 tmapFiles=(args.getOpt('T')==NULL);
187 multiexon_only=(args.getOpt('M')!=NULL);
188 multiexonrefs_only=(args.getOpt('N')!=NULL);
189 perContigStats=(args.getOpt('S')!=NULL);
190 checkFasta=(args.getOpt('K')!=NULL);
191 gtf_tracking_verbose=((args.getOpt('V')!=NULL) || debug);
192 FILE* finlst=NULL;
193 GStr s=args.getOpt('i');
194 if (!s.is_empty()) {
195 if (s=='-')
196 finlst=stdin;
197 else {
198 finlst=fopen(s,"r");
199 if (finlst==NULL) GError("Error opening file: %s\n", s.chars());
200 }
201 }
202 int numqryfiles=0;
203 if (finlst) {
204 GLineReader* lr=new GLineReader(finlst);
205 char* l=NULL;
206 while ((l=lr->getLine())!=NULL) {
207 if (strlen(l)<2 || startsWith(l,"# ") || isspace(*l)) continue;
208 if (!fileExists(l)) GError("Error: cannot locate input file: %s\n", l);
209 qryfiles.Add(new GStr(l));
210 }
211 delete lr;
212 //if (qryfiles.Count()>10)
213 gtf_tracking_largeScale=true;
214 }
215 else {
216 numqryfiles=args.startNonOpt();
217 char *infile=NULL;
218 if (numqryfiles>0) {
219 while ((infile=args.nextNonOpt())!=NULL) {
220 if (!fileExists(infile)) GError("Error: cannot locate input file: %s\n", infile);
221 qryfiles.Add(new GStr(infile));
222 } //for each argument
223 }
224 }
225 numqryfiles=qryfiles.Count();
226 if (numqryfiles==0) {
227 show_usage();
228 exit(1);
229 }
230 if (numqryfiles>MAX_QFILES) {
231 GMessage("Error: too many input files (limit set to %d at compile time)\n",MAX_QFILES);
232 GMessage("(if you need to raise this limit set a new value for\nMAX_QFILES in gtf_tracking.h and recompile)\n");
233 exit(0x5000);
234 }
235 #ifdef HAVE_CONFIG_H
236 check_version(PACKAGE_VERSION);
237 #endif
238 gfasta.init(args.getOpt('s'));
239 // determine if -s points to a multi-fasta file or a directory
240 s=args.getOpt('c');
241 if (!s.is_empty()) scoreThreshold=s.asReal();
242 s=args.getOpt('p');
243 if (!s.is_empty()) cprefix=Gstrdup(s.chars());
244 else cprefix=Gstrdup("TCONS");
245 s=args.getOpt('e');
246 if (!s.is_empty()) exprThreshold=s.asReal();
247 s=args.getOpt('d');
248 if (!s.is_empty()) {
249 tssDist=s.asInt();
250 }
251
252 s=args.getOpt('n');
253 if (!s.is_empty()) loadRefDescr(s.chars());
254 s=args.getOpt('r');
255 if (!s.is_empty()) {
256 f_ref=fopen(s,"r");
257 if (f_ref==NULL) GError("Error opening reference gff: %s\n",s.chars());
258 haveRefs=true;
259 if (gtf_tracking_verbose) GMessage("Loading reference transcripts..\n");
260 read_mRNAs(f_ref, ref_data, &ref_data, true, -1, s.chars(), (multiexonrefs_only || multiexon_only));
261 haveRefs=(ref_data.Count()>0);
262 reduceRefs=(args.getOpt('R')!=NULL);
263 if (gtf_tracking_verbose) GMessage("..reference annotation loaded\n");
264 }
265 bool discard_redundant=true; //discard redundant input transfrags
266 generic_GFF=args.getOpt('G');
267 if (generic_GFF) discard_redundant=false; //generic GTF, don't try to discard "redundant" transcripts
268 //if a full pathname is given
269 //the other common output files will still be created in the current directory:
270 // .loci, .tracking, .stats
271 GStr outbasename; //include path, if provided
272 GStr outprefix; //without path and/or extension
273 GStr outstats=args.getOpt('o');
274 if (outstats.is_empty() || outstats=="-") {
275 outstats="cuffcmp";
276 }
277 outbasename=outstats;
278 GStr outext(getFileExt(outstats.chars()));
279 if (outext.is_empty()) {
280 outext="stats";
281 outstats.append(".stats");
282 outbasename=outstats;
283 }
284 else outext.lower();
285 if (outext=="txt" || outext=="out" || outext=="stats" || outext=="summary") {
286 outbasename.cut(outbasename.length()-outext.length()-1);
287 }
288
289 outprefix=outbasename;
290 int di=outprefix.rindex(CHPATHSEP);
291 if (di>=0) outprefix.cut(0,di+1);
292
293 if (debug) { //create a few more files potentially useful for debugging
294 s=outbasename;
295 s.append(".missed_introns.gtf");
296 f_mintr=fopen(s.chars(),"w");
297 if (f_mintr==NULL) GError("Error creating file %s!\n",s.chars());
298 /*
299 s=outbasename;
300 s.append(".noTP_introns.gtf");
301 f_nintr=fopen(s.chars(),"w");
302 s=outbasename;
303 s.append(".wrong_Qintrons.gtf");
304 f_qintr=fopen(s.chars(),"w");
305 */
306 }
307
308 f_out=fopen(outstats, "w");
309 if (f_out==NULL) GError("Error creating output file %s!\n", outstats.chars());
310 if (gtf_tracking_verbose) GMessage("Prefix for output files: %s\n", outprefix.chars());
311 fprintf(f_out, "# Cuffcompare v%s | Command line was:\n#", PACKAGE_VERSION);
312 for (int i=0;i<argc-1;i++)
313 fprintf(f_out, "%s ", argv[i]);
314 fprintf(f_out, "%s\n#\n", argv[argc-1]);
315 //int qfileno=0;
316 GList<GSeqData>** qrysdata=NULL;
317 FILE** tfiles=NULL;
318 FILE** rtfiles=NULL;
319 GMALLOC(qrysdata, numqryfiles*sizeof(GList<GSeqData>*));
320 if (tmapFiles) {
321 GMALLOC(tfiles, numqryfiles*sizeof(FILE*));
322 if (haveRefs) {
323 GMALLOC(rtfiles, numqryfiles*sizeof(FILE*));
324 }
325 }
326 for (int fi=0;fi<qryfiles.Count();fi++) {
327 GStr in_file(qryfiles[fi]->chars());
328 GStr infname(getFileName(qryfiles[fi]->chars())); //file name only
329 GStr indir(qryfiles[fi]->chars());
330 di=indir.rindex(CHPATHSEP);
331 if (di>=0) indir.cut(di+1); //directory path for this input file
332 else indir=""; //current directory
333
334 if (debug || (gtf_tracking_verbose && !gtf_tracking_largeScale))
335 GMessage("Processing qfile #%d: %s\n",fi+1, in_file.chars());
336 if (in_file=="-") { f_in=stdin; in_file="stdin"; }
337 else {
338 f_in=fopen(in_file.chars(),"r");
339 if (f_in==NULL)
340 GError("Cannot open input file %s!\n",in_file.chars());
341 }
342 //f_in is the query gff file to process
343
344 GStr sbase(indir);
345 sbase.append(outprefix);
346 sbase.append(".");
347 sbase.append(infname);
348 if (tmapFiles) {
349 //-- we should keep the infname path, otherwise the remaining file names
350 // may be the same and clobber each other
351 s=sbase;
352 s.append(".tmap");
353 tfiles[fi]=fopen(s.chars(),"w");
354 if (tfiles[fi]==NULL)
355 GError("Error creating file '%s'!\n",s.chars());
356 fprintf(tfiles[fi],"ref_gene_id\tref_id\tclass_code\tcuff_gene_id\tcuff_id\tFMI\tFPKM\tFPKM_conf_lo\tFPKM_conf_hi\tcov\tlen\tmajor_iso_id\tref_match_len\n");
357 if (haveRefs) {
358 s=sbase;
359 s.append(".refmap");
360 rtfiles[fi]=fopen(s.chars(),"w");
361 if (rtfiles[fi]==NULL)
362 GError("Error creating file '%s'!\n",s.chars());
363 fprintf(rtfiles[fi],"ref_gene_id\tref_id\tclass_code\tcuff_id_list\n");
364 }
365 }
366
367 GList<GSeqData>* pdata=new GList<GSeqData>(true,true,true);
368 qrysdata[fi]=pdata;
369 if (gtf_tracking_verbose) GMessage("Loading transcripts from %s..\n",in_file.chars());
370 read_mRNAs(f_in, *pdata, &ref_data, discard_redundant, fi, in_file.chars(), multiexon_only);
371 GSuperLocus gstats;
372 GFaSeqGet *faseq=NULL;
373 for (int g=0;g<pdata->Count();g++) { //for each seqdata related to a genomic sequence
374 int gsid=pdata->Get(g)->get_gseqid();
375 GSeqData* refdata=getRefData(gsid, ref_data);//ref data for this contig
376 if (!gtf_tracking_largeScale)
377 processLoci(*(pdata->Get(g)), refdata, fi);
378 GSeqTrack* seqtrack=findGSeqTrack(gsid); //this will add a gseqtrack if it doesn't exist
379 // for gsid
380 if (refdata!=NULL) {
381 seqtrack->rloci_f=&(refdata->loci_f);
382 seqtrack->rloci_r=&(refdata->loci_r);
383 }
384 seqtrack->qdata[fi]=pdata->Get(g);
385 //will only gather data into stats if perContig==false
386 if (!gtf_tracking_largeScale) reportStats(f_out, getGSeqName(gsid), gstats,
387 pdata->Get(g), refdata);
388 if (faseq!=NULL) delete faseq;
389 } //for each genomic sequence data
390 //there could be genomic sequences with no qry transcripts
391 //but with reference transcripts
392 if (haveRefs && !reduceRefs && !gtf_tracking_largeScale) {
393 for (int r=0;r<ref_data.Count();r++) {
394 GSeqData* refdata=ref_data[r];
395 int gsid=refdata->get_gseqid();
396 if (getQryData(gsid, *pdata)==NULL) {
397 reportStats(f_out, getGSeqName(gsid), gstats, NULL, refdata);
398 }//completely missed all refdata on this contig
399 }
400 }
401 //now report the summary:
402 if (!gtf_tracking_largeScale) reportStats(f_out, in_file.chars(), gstats);
403 if (f_in!=stdin) fclose(f_in);
404 //qfileno++;
405 }//for each input file
406 if (f_mintr!=NULL) fclose(f_mintr);
407 gseqtracks.setSorted(&cmpGTrackByName);
408 if (gtf_tracking_verbose) GMessage("Tracking transcripts across %d query files..\n", numqryfiles);
409 trackGData(numqryfiles, gseqtracks, outbasename, tfiles, rtfiles);
410 fprintf(f_out, "\n Total union super-loci across all input datasets: %d \n", xlocnum);
411 if (numqryfiles>1) {
412 fprintf(f_out, " (%d multi-transcript, ~%.1f transcripts per locus)\n",
413 total_xloci_alt, ((double)(GXConsensus::count))/xlocnum);
414 }
415 if (gtf_tracking_verbose) GMessage("Cleaning up..\n");
416 GFREE(cprefix);
417 // clean up
418 for (int i=0;i<numqryfiles;i++) {
419 delete qrysdata[i];
420 }
421 GFREE(qrysdata);
422 GFREE(tfiles);
423 GFREE(rtfiles);
424 gseqtracks.Clear();
425 FRCLOSE(f_ref);
426 FWCLOSE(f_out);
427 if (gtf_tracking_verbose) GMessage("Done.\n");
428 ref_data.Clear();
429 //getchar();
430 } //main ends here
431
432 void show_exons(FILE* f, GffObj& m) {
433 fprintf(f,"(");
434 int imax=m.exons.Count()-1;
435 for (int i=0;i<=imax;i++) {
436 if (i==imax) fprintf(f,"%d-%d)",m.exons[i]->start, m.exons[i]->end);
437 else fprintf(f,"%d-%d,",m.exons[i]->start, m.exons[i]->end);
438 }
439 }
440
441 bool ichainMatch(GffObj* t, GffObj* r, bool& exonMatch, int fuzz=0) {
442 //t's intron chain is considered matching to reference r
443 //if r chain is the same or a subset of t's chain
444 exonMatch=false;
445 int imax=r->exons.Count()-1;
446 int jmax=t->exons.Count()-1;
447 if (imax==0 || jmax==0) { //single-exon mRNAs
448 if (imax!=jmax) return false;
449 exonMatch=r->exons[0]->coordMatch(t->exons[0],fuzz);
450 /*if (exonMatch) return true;
451 else return (r->exons[0]->start>=t->exons[0]->start &&
452 r->exons[0]->end<=t->exons[0]->end);*/
453 return exonMatch;
454 }
455
456 if (r->exons[imax]->start<t->exons[0]->end ||
457 t->exons[jmax]->start<r->exons[0]->end ) //intron chains do not overlap at all
458 {
459 return false;
460 }
461 //check intron overlaps
462 int i=1;
463 int j=1;
464 bool exmism=false; //any mismatch
465 while (i<=imax && j<=jmax) {
466 uint rstart=r->exons[i-1]->end;
467 uint rend=r->exons[i]->start;
468 uint tstart=t->exons[j-1]->end;
469 uint tend=t->exons[j]->start;
470 if (tend<rstart) { j++; continue; }
471 if (rend<tstart) { i++; continue; }
472 break; //here we have an intron overlap
473 }
474 if (i>1 || i>imax || j>jmax) {
475 return false; //no intron overlaps found at all
476 //or first intron of ref not overlapping
477 }
478 //from now on we expect intron matches up to imax
479 if (i!=j || imax!=jmax) { exmism=true; if (fuzz==0) return false; }
480 for (;i<=imax && j<=jmax;i++,j++) {
481 if (abs((int)(r->exons[i-1]->end-t->exons[j-1]->end))>fuzz ||
482 abs((int)(r->exons[i]->start-t->exons[j]->start))>fuzz) {
483 return false; //just run away
484 }
485 }
486 //if we made it here, we have matching intron chains up to MIN(imax,jmax)
487 if (imax!=jmax) {
488 exmism=true;
489 if (jmax<imax) return false; // qry ichain included in ref ichain
490 else //ref ichain included in qry ichain
491 if (fuzz==0) return false;
492 }
493 if (exmism) {
494 //exonMatch=false; -- it's default
495 return true;
496 }
497 exonMatch = ( abs((int)(r->exons[0]->start-t->exons[0]->start))<=fuzz &&
498 abs((int)(r->exons[imax]->end-t->exons[jmax]->end))<=fuzz );
499 return true;
500 }
501
502
503 void compareLoci2R(GList<GLocus>& loci, GList<GSuperLocus>& cmpdata,
504 GList<GLocus>& refloci, int qfidx) {
505 cmpdata.Clear();//a new list of superloci will be built
506 if (refloci.Count()==0 || loci.Count()==0) return;
507 //reset cmpovl and stats
508 for (int i=0;i<refloci.Count();i++) refloci[i]->creset();
509 //find loci with overlapping refloci
510 //and store cmpovl links both ways for ALL loci and refloci on this strand
511 for (int l=0;l<loci.Count();l++) {
512 GLocus* locus=loci[l];
513 locus->creset();
514 for (int j=0;j<refloci.Count();j++) {
515 //if (refloci[j]->start>locus->end) break;
516 if (refloci[j]->start>locus->end) {
517 if (refloci[j]->start-locus->end > GFF_MAX_LOCUS) break;
518 continue;
519 }
520 if (locus->start>refloci[j]->end) continue;
521 // then we must have overlap here:
522 //if (locus->overlap(refloci[j]->start, refloci[j]->end)) {
523 locus->cmpovl.Add(refloci[j]);
524 refloci[j]->cmpovl.Add(locus);
525 //}
526 }//for each reflocus
527 } //for each locus
528
529 //create corresponding "superloci" from transitive overlapping between loci and ref
530 for (int l=0;l<loci.Count();l++) {
531 if (loci[l]->v!=0) continue; //skip, already processed
532 GSuperLocus* super=new GSuperLocus();
533 super->qfidx=qfidx;
534 //try to find all other loci connected to this locus loci[l]
535 GList<GLocus> lstack(false,false,false); //traversal stack
536 lstack.Push(loci[l]);
537 while (lstack.Count()>0) {
538 GLocus* locus=lstack.Pop();
539 if (locus->v!=0 || locus->cmpovl.Count()==0) continue;
540 super->addQlocus(*locus);
541 locus->v=1;
542 for (int r=0;r<locus->cmpovl.Count();r++) {
543 GLocus* rloc=locus->cmpovl[r];
544 if (rloc->v==0) {
545 super->addRlocus(*rloc);
546 rloc->v=1;
547 for (int ll=0;ll<rloc->cmpovl.Count();ll++) {
548 if (rloc->cmpovl[ll]->v==0) lstack.Push(rloc->cmpovl[ll]);
549 }
550 }
551 } //for each overlapping reflocus
552 } //while linking
553
554 if (super->qloci.Count()==0) {
555 delete super;
556 continue; //try next query loci
557 }
558 //--here we have a "superlocus" region data on both qry and ref
559 // -- analyze mexons matching (base level metrics)
560 cmpdata.Add(super);
561 //make each ref locus keep track of all superloci containing it
562 for (int rl=0;rl<super->rloci.Count();rl++) {
563 super->rloci[rl]->superlst->Add(super);
564 }
565 for (int x=0;x<super->rmexons.Count();x++) {
566 super->rbases_all += super->rmexons[x].end-super->rmexons[x].start+1;
567 }
568 for (int x=0;x<super->qmexons.Count();x++) {
569 super->qbases_all += super->qmexons[x].end-super->qmexons[x].start+1;
570 }
571 int i=0; //locus mexons
572 int j=0; //refmexons
573 while (i<super->qmexons.Count() && j<super->rmexons.Count()) {
574 uint istart=super->qmexons[i].start;
575 uint iend=super->qmexons[i].end;
576 uint jstart=super->rmexons[j].start;
577 uint jend=super->rmexons[j].end;
578 if (iend<jstart) { i++; continue; }
579 if (jend<istart) { j++; continue; }
580 //v--overlap here:
581 uint ovlstart = jstart>istart? jstart : istart;
582 uint ovlend = iend<jend ? iend : jend;
583 uint ovlen=ovlend-ovlstart+1;
584 super->baseTP+=ovlen; //qbases_cov
585 if (iend<jend) i++;
586 else j++;
587 } //while mexons ovl search
588 /* if (reduceRefs) {
589 super->baseFP=super->qbases_all-super->baseTP;
590 super->baseFN=super->rbases_all-super->baseTP;
591 }
592 */
593 // -- exon level comparison:
594 int* qexovl; //flags for qry exons with ref overlap
595 GCALLOC(qexovl,super->quexons.Count()*sizeof(int));
596 int* rexovl; //flags for ref exons with qry overlap
597 GCALLOC(rexovl,super->ruexons.Count()*sizeof(int));
598 for (int i=0;i<super->quexons.Count();i++) {
599 uint istart=super->quexons[i].start;
600 uint iend=super->quexons[i].end;
601 for (int j=0;j<super->ruexons.Count();j++) {
602 uint jstart=super->ruexons[j].start;
603 uint jend=super->ruexons[j].end;
604 if (iend<jstart) break;
605 if (jend<istart) continue;
606 //--- overlap here between quexons[i] and ruexons[j]
607 qexovl[i]++;
608 rexovl[j]++;
609 if (super->quexons[i].coordMatch(&super->ruexons[j],5)) {
610 super->exonATP++;
611 if (super->quexons[i].coordMatch(&super->ruexons[j])) {
612 super->exonTP++;
613 } //exact match
614 } //fuzzy match
615 } //ref uexon loop
616 } //qry uexon loop
617 super->m_exons=0; //ref exons with no query overlap
618 super->w_exons=0; //qry exons with no ref overlap
619 for (int x=0;x<super->quexons.Count();x++)
620 if (qexovl[x]==0) super->w_exons++;
621 for (int x=0;x<super->ruexons.Count();x++)
622 if (rexovl[x]==0) super->m_exons++;
623 GFREE(rexovl);
624 GFREE(qexovl);
625
626 //-- intron level stats:
627 //query:
628 int* qinovl=NULL; //flags for qry introns with at least some ref overlap
629 int* qtpinovl=NULL; //flags for qry introns with perfect ref overlap
630 if (super->qintrons.Count()>0) {
631 GCALLOC(qinovl,super->qintrons.Count()*sizeof(int));
632 GCALLOC(qtpinovl,super->qintrons.Count()*sizeof(int));
633 }
634 //-- reference:
635 int* rinovl=NULL; //flags for ref introns with qry overlap
636 int* rtpinovl=NULL; //ref introns with perfect qry intron overlap
637 if (super->rintrons.Count()>0) {
638 GCALLOC(rinovl,super->rintrons.Count()*sizeof(int));
639 GCALLOC(rtpinovl,super->rintrons.Count()*sizeof(int));
640 }
641 for (int i=0;i<super->qintrons.Count();i++) {
642 uint istart=super->qintrons[i].start;
643 uint iend=super->qintrons[i].end;
644 for (int j=0;j<super->rintrons.Count();j++) {
645 uint jstart=super->rintrons[j].start;
646 uint jend=super->rintrons[j].end;
647 if (iend<jstart) break;
648 if (jend<istart) continue;
649 //--- overlap here between qintrons[i] and rintrons[j]
650 qinovl[i]++;
651 rinovl[j]++;
652 if (super->qintrons[i].coordMatch(&super->rintrons[j],5)) {
653 super->intronATP++;
654 if (super->qintrons[i].coordMatch(&super->rintrons[j])) {
655 super->intronTP++;
656 qtpinovl[i]++;
657 rtpinovl[j]++;
658 } //exact match
659 } //fuzzy match
660 } //ref intron loop
661 } //qry intron loop
662 super->m_introns=0; //ref introns with no query overlap
663 super->w_introns=0; //qry introns with no ref overlap
664 for (int x=0;x<super->qintrons.Count();x++) {
665 if (qinovl[x]==0) { super->w_introns++;
666 //qry introns with no ref intron overlap AT ALL
667 super->i_qwrong.Add(super->qintrons[x]);
668 }
669 else
670 if (qtpinovl[x]==0) {
671 super->i_qnotp.Add(super->qintrons[x]);
672 }
673 }
674 for (int x=0;x<super->rintrons.Count();x++) {
675 if (rinovl[x]==0) { //no intron overlap at all
676 super->m_introns++;
677 super->i_missed.Add(super->rintrons[x]);
678 }
679 else if (rtpinovl[x]==0) { //no perfect intron match
680 super->i_notp.Add(super->rintrons[x]);
681 }
682 }
683 GFREE(rinovl);
684 GFREE(rtpinovl);
685 GFREE(qinovl);
686 GFREE(qtpinovl);
687
688 // ---- now intron-chain and transcript comparison
689 for (int i=0;i<super->qmrnas.Count();i++) {
690 uint istart=super->qmrnas[i]->exons.First()->start;
691 uint iend=super->qmrnas[i]->exons.Last()->end;
692 for (int j=0;j<super->rmrnas.Count();j++) {
693 uint jstart=super->rmrnas[j]->exons.First()->start;
694 uint jend=super->rmrnas[j]->exons.Last()->end;
695 if (iend<jstart) break;
696 if (jend<istart) continue;
697 //--- overlap here --
698 bool exonMatch=false;
699 if ((super->qmrnas[i]->udata & 3) > 1) continue; //already counted a ichainTP for this qry
700 if (ichainMatch(super->qmrnas[i],super->rmrnas[j],exonMatch, 5)) { //fuzzy match
701 GLocus* qlocus=((CTData*)super->qmrnas[i]->uptr)->locus;
702 if (super->qmrnas[i]->exons.Count()>1) {
703 super->ichainATP++;
704 qlocus->ichainATP++;
705 }
706 if (exonMatch) {
707 super->mrnaATP++;
708 qlocus->mrnaATP++;
709 }
710 if (ichainMatch(super->qmrnas[i],super->rmrnas[j],exonMatch)) { //exact match
711 if (super->qmrnas[i]->exons.Count()>1) {
712 super->qmrnas[i]->udata|=1;
713 super->ichainTP++;
714 qlocus->ichainTP++;
715 }
716 if (exonMatch) {
717 super->qmrnas[i]->udata|=2;
718 super->mrnaTP++;
719 qlocus->mrnaTP++;
720 }
721 } //exact match
722 } //fuzzy match
723 } //ref mrna loop
724 } //qry mrna loop
725 for (int ql=0;ql<super->qloci.Count();ql++) {
726 if (super->qloci[ql]->ichainTP+super->qloci[ql]->mrnaTP >0 )
727 super->locusTP++;
728 if (super->qloci[ql]->ichainATP+super->qloci[ql]->mrnaATP>0)
729 super->locusATP++;
730 }
731
732 }//for each unlinked locus
733
734 }
735
736 //look for qry data for a specific genomic sequence
737 GSeqData* getQryData(int gid, GList<GSeqData>& qdata) {
738 int qi=-1;
739 GSeqData f(gid);
740 GSeqData* q=NULL;
741 if (qdata.Found(&f,qi))
742 q=qdata[qi];
743 return q;
744 }
745
746 const char* findDescr(GffObj* gfobj) {
747 if (refdescr.Count()==0) return NULL;
748 GStr* s=refdescr.Find(gfobj->getID());
749 if (s==NULL) {
750 s=refdescr.Find(gfobj->getGeneName());
751 if (s==NULL) s=refdescr.Find(gfobj->getGeneID());
752 }
753 if (s!=NULL)
754 return s->chars();
755 return NULL;
756 }
757
758 const char* getGeneID(GffObj* gfobj) {
759 //returns anything that might resemble a gene identifier for this transcript
760 //or, if everything fails, returns the transcript ID
761 const char* s=gfobj->getGeneName();
762 if (s) return s;
763 if ((s=gfobj->getGeneID())!=NULL) return s;
764 if ((s=gfobj->getAttr("Name"))!=NULL) return s;
765 return gfobj->getID();
766 }
767
768 const char* getGeneID(GffObj& gfobj) {
769 return getGeneID(&gfobj);
770 }
771
772 void writeLoci(FILE* f, GList<GLocus> & loci) {
773 for (int l=0;l<loci.Count();l++) {
774 GLocus& loc=*(loci[l]);
775 fprintf(f,"%s\t%s[%c]%d-%d\t", loc.mrna_maxcov->getID(),
776 loc.mrna_maxcov->getGSeqName(),
777 loc.mrna_maxcov->strand, loc.start,loc.end);
778 //now print all transcripts in this locus, comma delimited
779 int printfd=0;
780 for (int i=0;i<loc.mrnas.Count();i++) {
781 if (loc.mrnas[i]==loc.mrna_maxcov) continue;
782 if (printfd==0) fprintf(f,"%s",loc.mrnas[i]->getID());
783 else fprintf(f,",%s",loc.mrnas[i]->getID());
784 printfd++;
785 }
786 const char* rdescr=findDescr(loc.mrna_maxcov);
787 if (rdescr==NULL) fprintf(f,"\t\n");
788 else fprintf(f,"\t%s\n",rdescr);
789 }
790 }
791
792 void printXQ1(FILE* f, int qidx, GList<GLocus>& qloci) {
793 int printfd=0;
794 //print
795 for (int i=0;i<qloci.Count();i++) {
796 if (qloci[i]->qfidx!=qidx) continue;
797 for (int j=0;j<qloci[i]->mrnas.Count();j++) {
798 if (printfd==0) fprintf(f,"%s",qloci[i]->mrnas[j]->getID());
799 else fprintf(f,",%s",qloci[i]->mrnas[j]->getID());
800 printfd++;
801 }
802 }
803 if (printfd==0) fprintf(f,"-");
804 }
805
806 void numXLoci(GList<GXLocus>& xloci, int& last_id) {
807 for (int l=0;l<xloci.Count();l++) {
808 if (xloci[l]->qloci.Count()==0) continue; //we never print ref-only xloci
809 last_id++;
810 xloci[l]->id=last_id;
811 }
812 }
813
814
815 class GProtCl {
816 public:
817 GList<GXConsensus> protcl;
818 GProtCl(GXConsensus* c=NULL):protcl(true,false,false) {
819 if (c!=NULL)
820 protcl.Add(c);
821 }
822 bool add_Pcons(GXConsensus* c) {
823 if (c==NULL || c->aalen==0) return false;
824 if (protcl.Count()==0) {
825 protcl.Add(c);
826 return true;
827 }
828 if (protcl[0]->aalen!=c->aalen) return false;
829 if (strcmp(protcl[0]->aa,c->aa)!=0) return false;
830 protcl.Add(c);
831 return true;
832 }
833
834 void addMerge(GProtCl& pcl, GXConsensus* pclnk) {
835 for (int i=0;i<pcl.protcl.Count();i++) {
836 if (pcl.protcl[i]!=pclnk) {
837 protcl.Add(pcl.protcl[i]);
838 }
839 }
840 }
841
842 int aalen() {
843 if (protcl.Count()==0) return 0;
844 return protcl[0]->aalen;
845 }
846 bool operator==(GProtCl& cl) {
847 return this==&cl;
848 }
849 bool operator>(GProtCl& cl) {
850 return (this>&cl);
851 }
852 bool operator<(GProtCl& cl) {
853 return (this<&cl);
854 }
855 };
856
857 class GTssCl:public GSeg { //experiment cluster of ref loci (isoforms)
858 public:
859 uint fstart; //lowest coordinate of the first exon
860 uint fend; //highest coordinate of the first exon
861 GList<GXConsensus> tsscl;
862 GTssCl(GXConsensus* c=NULL):tsscl(true,false,false) {
863 start=0;
864 end=0;
865 fstart=0;
866 fend=0;
867 if (c!=NULL) addFirst(c);
868 }
869
870 void addFirst(GXConsensus* c) {
871 tsscl.Add(c);
872 start=c->start;
873 end=c->end;
874 GffExon* fexon=(c->tcons->strand=='-') ? c->tcons->exons.Last() :
875 c->tcons->exons.First();
876 fstart=fexon->start;
877 fend=fexon->end;
878 }
879 bool add_Xcons(GXConsensus* c) {
880 if (tsscl.Count()==0) {
881 addFirst(c);
882 return true;
883 }
884 //check if it can be added to existing xconsensi
885 uint nfend=0;
886 uint nfstart=0;
887 if (tsscl.Get(0)->tcons->getGeneID()!=NULL &&
888 c->tcons->getGeneID()!=NULL &&
889 strcmp(tsscl.Get(0)->tcons->getGeneID(), c->tcons->getGeneID()))
890 //don't tss cluster if they don't have the same GeneID (?)
891 //FIXME: we might not want this if input files are not from Cufflinks
892 // and they could simply lack proper GeneID
893 return false;
894
895 if (c->tcons->strand=='-') {
896 //no, the first exons don't have to overlap
897 //if (!c->tcons->exons.Last()->overlap(fstart,fend)) return false;
898 nfstart=c->tcons->exons.Last()->start;
899 nfend=c->tcons->exons.Last()->end;
900 //proximity check for the transcript start:
901 if (nfend>fend+tssDist || fend>nfend+tssDist)
902 return false;
903 }
904 else {
905 //if (!c->tcons->exons.First()->overlap(fstart,fend)) return false;
906 nfstart=c->tcons->exons.First()->start;
907 nfend=c->tcons->exons.First()->end;
908 if (nfstart>fstart+tssDist || fstart>nfstart+tssDist)
909 return false;
910 }
911 // -- if we are here, we can add to tss cluster
912
913 tsscl.Add(c);
914 if (fstart>nfstart) fstart=nfstart;
915 if (fend<nfend) fend=nfend;
916 if (start>c->start) start=c->start;
917 if (end<c->end) end=c->end;
918 return true;
919 }
920
921 void addMerge(GTssCl& cl, GXConsensus* clnk) {
922 for (int i=0;i<cl.tsscl.Count();i++) {
923 if (cl.tsscl[i]==clnk) continue;
924 tsscl.Add(cl.tsscl[i]);
925 }
926 if (fstart>cl.fstart) fstart=cl.fstart;
927 if (fend<cl.fend) fend=cl.fend;
928 if (start>cl.start) start=cl.start;
929 if (end<cl.end) end=cl.end;
930 }
931 };
932 /*
933 class IntArray { //two dimensional int array
934 int* mem;
935 int xsize;
936 int ysize;
937 public:
938 IntArray(int xlen, int ylen) {
939 xsize=xlen;
940 ysize=ylen;
941 GMALLOC(mem, xsize*ysize*sizeof(int));
942 }
943 ~IntArray() {
944 GFREE(mem);
945 }
946 int& data(int x, int y) {
947 return mem[y*xsize+x];
948 }
949 };
950
951 int aa_diff(GXConsensus* c1, GXConsensus* c2) {
952 int diflen=abs(c1->aalen-c2->aalen);
953 if (diflen>=6) return diflen;
954 //obvious case: same CDS
955 if (diflen==0 && strcmp(c1->aa, c2->aa)==0) return 0;
956 //simple edit distance calculation
957 IntArray dist(c1->aalen+1, c2->aalen+1);
958 for (int i=0;i<=c1->aalen;i++) {
959 dist.data(i,0) = i;
960 }
961 for (int j = 0; j <= c2->aalen; j++) {
962 dist.data(0,j) = j;
963 }
964 for (int i = 1; i <= c1->aalen; i++)
965 for (int j = 1; j <= c2->aalen; j++) {
966 dist.data(i,j) = GMIN3( dist.data(i-1,j)+1,
967 dist.data(i,j-1)+1,
968 dist.data(i-1,j-1)+((c1->aa[i-1] == c2->aa[j-1]) ? 0 : 1) );
969 }
970 int r=dist.data(c1->aalen,c2->aalen);
971 return r;
972 }
973 */
974 void printConsGTF(FILE* fc, GXConsensus* xc, int xlocnum) {
975 for (int i=0;i<xc->tcons->exons.Count();i++) {
976 fprintf(fc,
977 "%s\t%s\texon\t%d\t%d\t.\t%c\t.\tgene_id \"XLOC_%06d\"; transcript_id \"%s_%08d\"; exon_number \"%d\";",
978 xc->tcons->getGSeqName(),xc->tcons->getTrackName(),xc->tcons->exons[i]->start, xc->tcons->exons[i]->end, xc->tcons->strand,
979 xlocnum, cprefix, xc->id, i+1);
980 //if (i==0) {
981 const char* gene_name=NULL;
982 if (xc->ref) {
983 gene_name=xc->ref->getGeneName();
984 if (gene_name==NULL) gene_name=xc->ref->getGeneID();
985 if (gene_name) {
986 fprintf (fc, " gene_name \"%s\";", gene_name);
987 }
988 }
989 if (!haveRefs) {
990 if (gene_name==NULL && xc->tcons->getGeneName())
991 fprintf (fc, " gene_name \"%s\";", xc->tcons->getGeneName());
992 char* s=xc->tcons->getAttr("nearest_ref", true);
993 if (s) fprintf(fc, " nearest_ref \"%s\";",s);
994 s=xc->tcons->getAttr("class_code", true);
995 if (s) fprintf(fc, " class_code \"%s\";", s);
996 }
997 fprintf(fc, " oId \"%s\";",xc->tcons->getID());
998 if (xc->contained) {
999 fprintf(fc, " contained_in \"%s_%08d\";", cprefix, xc->contained->id);
1000 }
1001 if (haveRefs) {
1002 if (xc->ref!=NULL)
1003 fprintf(fc, " nearest_ref \"%s\";",xc->ref->getID());
1004 fprintf(fc, " class_code \"%c\";",xc->refcode ? xc->refcode : '.');
1005 }
1006 if (xc->tss_id>0) fprintf(fc, " tss_id \"TSS%d\";",xc->tss_id);
1007 if (xc->p_id>0) fprintf(fc, " p_id \"P%d\";",xc->p_id);
1008 // }
1009 fprintf(fc,"\n");
1010 }
1011 }
1012
1013 void tssCluster(GXLocus& xloc)
1014 {
1015 GList<GTssCl> xpcls(true,true,false);
1016 for (int i=0;i<xloc.tcons.Count();i++)
1017 {
1018 GXConsensus* c=xloc.tcons[i];
1019 //if (c->tcons->exons.Count()<2) continue; //should we skip single-exon transcripts ??
1020 GArray<int> mrgloci(true);
1021 int lfound=0;
1022 for (int l=0;l<xpcls.Count();l++)
1023 {
1024 if (xpcls[l]->end<c->tcons->exons.First()->start) continue;
1025 if (xpcls[l]->start>c->tcons->exons.Last()->end) break;
1026 if (xpcls[l]->add_Xcons(c))
1027 {
1028 lfound++;
1029 mrgloci.Add(l);
1030
1031 }
1032
1033 } // for each xpcluster
1034 if (lfound==0)
1035 {
1036 //create a xpcl with only this xconsensus
1037 xpcls.Add(new GTssCl(c));
1038
1039 }
1040 else if (lfound>1)
1041 {
1042 for (int l=1;l<lfound;l++)
1043 {
1044 int mlidx=mrgloci[l]-l+1;
1045 xpcls[mrgloci[0]]->addMerge(*xpcls[mlidx], c);
1046 xpcls.Delete(mlidx);
1047 }
1048 }
1049
1050 }//for each xconsensus in this xlocus
1051 for (int l=0;l<xpcls.Count();l++)
1052 {
1053 //if (xpcls[l]->tsscl.Count()<2) continue;
1054 tsscl_num++;
1055 for (int i=0;i<xpcls[l]->tsscl.Count();i++)
1056 xpcls[l]->tsscl[i]->tss_id=tsscl_num;
1057 //processTssCl(xcds_num, xpcls[l], faseq);
1058 }
1059 }
1060
1061 void protCluster(GXLocus& xloc, GFaSeqGet *faseq) {
1062 if (!faseq)
1063 return;
1064 GList<GProtCl> xpcls(true,true,false);
1065 for (int i=0;i<xloc.tcons.Count();i++) {
1066 GXConsensus* c=xloc.tcons[i];
1067 if (c->ref==NULL || c->ref->CDstart==0) continue; //no ref or CDS available
1068 if (c->refcode!='=') continue;
1069 //get the CDS translation here
1070 if (c->aa==NULL) {
1071 c->aa=c->ref->getSplicedTr(faseq, true, &c->aalen);
1072 if (c->aalen>0 && c->aa[c->aalen-1]=='.') {
1073 //discard the final stop codon
1074 c->aalen--;
1075 c->aa[c->aalen]=0;
1076 }
1077 }
1078 GArray<int> mrgloci(true);
1079 int lfound=0;
1080 for (int l=0;l<xpcls.Count();l++) {
1081 if (xpcls[l]->aalen()!=c->aalen) continue;
1082 if (xpcls[l]->add_Pcons(c)) {
1083 lfound++;
1084 mrgloci.Add(l);
1085 }
1086 } // for each xpcluster
1087 if (lfound==0) {
1088 //create a xpcl with only this xconsensus
1089 xpcls.Add(new GProtCl(c));
1090 }
1091 else if (lfound>1) {
1092 for (int l=1;l<lfound;l++) {
1093 int mlidx=mrgloci[l]-l+1;
1094 xpcls[mrgloci[0]]->addMerge(*xpcls[mlidx], c);
1095 xpcls.Delete(mlidx);
1096 }
1097 }
1098 }//for each xconsensus in this xlocus
1099 for (int l=0;l<xpcls.Count();l++) {
1100 protcl_num++;
1101 for (int i=0;i<xpcls[l]->protcl.Count();i++)
1102 xpcls[l]->protcl[i]->p_id=protcl_num;
1103 }
1104 for (int i=0;i<xloc.tcons.Count();i++) {
1105 GXConsensus* c=xloc.tcons[i];
1106 if (c->aa!=NULL) { GFREE(c->aa); }
1107 }
1108 }
1109
1110 void printXLoci(FILE* f, FILE* fc, int qcount, GList<GXLocus>& xloci, GFaSeqGet *faseq) {
1111 for (int l=0;l<xloci.Count();l++) {
1112 if (xloci[l]->qloci.Count()==0) continue;
1113 GXLocus& xloc=*(xloci[l]);
1114 xloc.checkContainment();
1115 tssCluster(xloc);//cluster and assign tss_id and cds_id to each xconsensus in xloc
1116 protCluster(xloc,faseq);
1117 for (int c=0;c<xloc.tcons.Count();c++) {
1118 if (showContained || xloc.tcons[c]->contained==NULL)
1119 printConsGTF(fc,xloc.tcons[c],xloc.id);
1120 }
1121 fprintf(f,"XLOC_%06d\t%s[%c]%d-%d\t", xloc.id,
1122 xloc.qloci[0]->mrna_maxcov->getGSeqName(),
1123 xloc.strand, xloc.start,xloc.end);
1124 //now print all transcripts in this locus, comma delimited
1125 //first, ref loci, if any
1126 int printfd=0;
1127 if (xloc.rloci.Count()>0) {
1128 for (int i=0;i<xloc.rloci.Count();i++) {
1129 for (int j=0;j<xloc.rloci[i]->mrnas.Count();j++) {
1130 if (printfd==0) fprintf(f,"%s|%s",getGeneID(xloc.rloci[i]->mrnas[j]),
1131 xloc.rloci[i]->mrnas[j]->getID());
1132 else fprintf(f,",%s|%s",getGeneID(xloc.rloci[i]->mrnas[j]),
1133 xloc.rloci[i]->mrnas[j]->getID());
1134 printfd++;
1135 }
1136 }
1137 }
1138 else {
1139 fprintf(f,"-");
1140 }
1141 //second, all the cufflinks transcripts
1142 for (int qi=0;qi<qcount;qi++) {
1143 fprintf(f,"\t");
1144 printXQ1(f,qi,xloc.qloci);
1145 }
1146 fprintf(f,"\n");
1147 }
1148 }
1149
1150 void writeIntron(FILE* f, char strand, GFaSeqGet* faseq, GSeg& iseg,
1151 GList<GffObj>& mrnas, bool wrong=false) {
1152 //find a ref mrna having this intron
1153 GffObj* rm=NULL;
1154 for (int i=0;i<mrnas.Count();i++) {
1155 GffObj* m=mrnas[i];
1156 if (m->start>iseg.end) break;
1157 if (m->end<iseg.start) continue;
1158 //intron coords overlaps mrna region
1159 for (int j=1;j<m->exons.Count();j++) {
1160 if (iseg.start==m->exons[j-1]->end+1 &&
1161 iseg.end==m->exons[j]->start-1) { rm=m; break; } //match found
1162 }//for each intron
1163 if (rm!=NULL) break;
1164 } //for each ref mrna in this locus
1165 if (rm==NULL) GError("Error: couldn't find ref mrna for intron %d-%d! (BUG)\n",
1166 iseg.start,iseg.end);
1167 int ilen=iseg.end-iseg.start+1;
1168 fprintf(f,"%s\t%s\tintron\t%d\t%d\t.\t%c\t.\t",
1169 rm->getGSeqName(),rm->getTrackName(),iseg.start,iseg.end,strand);
1170 if (faseq!=NULL) {
1171 const char* gseq=faseq->subseq(iseg.start, ilen);
1172 char* cseq=Gstrdup(gseq, gseq+ilen-1);
1173 if (strand=='-') reverseComplement(cseq, ilen);
1174 fprintf(f,"spl=\"%c%c..%c%c\"; ", toupper(cseq[0]),toupper(cseq[1]),
1175 toupper(cseq[ilen-2]),toupper(cseq[ilen-1]));
1176 GFREE(cseq);
1177 }
1178 fprintf(f,"transcript_id \"%s\";", rm->getID());
1179 if (wrong) fprintf(f," noOvl=1;");
1180 fprintf(f,"\n");
1181
1182 }
1183
1184 void reportMIntrons(FILE* fm, FILE* fn, FILE* fq, char strand,
1185 GList<GSuperLocus>& cmpdata) {
1186 if (fm==NULL) return;
1187 for (int l=0;l<cmpdata.Count();l++) {
1188 GSuperLocus *sl=cmpdata[l];
1189 //cache the whole locus sequence if possible
1190 //write these introns and their splice sites into the file
1191 for (int i=0;i<sl->i_missed.Count();i++)
1192 writeIntron(fm, strand, NULL, sl->i_missed[i], sl->rmrnas);
1193 if (fn!=NULL) {
1194 for (int i=0;i<sl->i_notp.Count();i++)
1195 writeIntron(fn, strand, NULL, sl->i_notp[i], sl->rmrnas);
1196 }
1197 if (fq!=NULL) {
1198 for (int i=0;i<sl->i_qwrong.Count();i++) {
1199 writeIntron(fq, strand, NULL, sl->i_qwrong[i], sl->qmrnas, true);
1200 }
1201 for (int i=0;i<sl->i_qnotp.Count();i++) {
1202 writeIntron(fq, strand, NULL, sl->i_qnotp[i], sl->qmrnas);
1203 }
1204 }
1205 }
1206 }
1207
1208
1209 void processLoci(GSeqData& seqdata, GSeqData* refdata, int qfidx) {
1210 //GList<GSeqLoci>& glstloci, GList<GSeqCmpRegs>& cmpdata)
1211
1212 if (refdata!=NULL) {
1213 //if (gtf_tracking_verbose) GMessage(" ..comparing to reference loci..\n") ;
1214 compareLoci2R(seqdata.loci_f, seqdata.gstats_f, refdata->loci_f, qfidx);
1215 compareLoci2R(seqdata.loci_r, seqdata.gstats_r, refdata->loci_r, qfidx);
1216 // -- report
1217
1218 if (f_mintr!=NULL) {
1219 GMessage(" ..reporting missed ref introns..\n");
1220 //reportIntrons(f_mintr, f_nintr, f_qintr, faseq, '+', seqdata.gstats_f);
1221 //reportIntrons(f_mintr, f_nintr, f_qintr, faseq, '-', seqdata.gstats_r);
1222 reportMIntrons(f_mintr, NULL, NULL, '+', seqdata.gstats_f);
1223 reportMIntrons(f_mintr, NULL, NULL, '-', seqdata.gstats_r);
1224 }
1225 }
1226 }
1227
1228 //adjust stats for a list of unoverlapped (completely missed) ref loci
1229 void collectRLocData(GSuperLocus& stats, GLocus& loc) {
1230 stats.total_rmrnas+=loc.mrnas.Count();
1231 stats.total_rexons+=loc.uexons.Count();
1232 stats.total_rintrons+=loc.introns.Count();
1233 stats.total_rmexons+=loc.mexons.Count();
1234 stats.total_richains+=loc.ichains;
1235 stats.m_exons+=loc.uexons.Count();
1236 stats.m_introns+=loc.introns.Count();
1237 stats.total_rloci++;
1238 for (int e=0;e<loc.mexons.Count();e++) {
1239 stats.rbases_all+=loc.mexons[e].end-loc.mexons[e].start+1;
1240 }
1241 }
1242
1243 void collectRData(GSuperLocus& stats, GList<GLocus>& loci) {
1244 for (int l=0;l<loci.Count();l++)
1245 collectRLocData(stats,*loci[l]);
1246 }
1247
1248 //adjust stats for a list of unoverlapped (completely "wrong" or novel) qry loci
1249 void collectQLocData(GSuperLocus& stats, GLocus& loc) {
1250 stats.total_qmrnas+=loc.mrnas.Count();
1251 stats.total_qexons+=loc.uexons.Count();
1252 stats.total_qmexons+=loc.mexons.Count();
1253 stats.total_qintrons+=loc.introns.Count();
1254 stats.total_qichains+=loc.ichains;
1255 stats.total_qloci++;
1256 if (loc.ichains>0 && loc.mrnas.Count()>1)
1257 stats.total_qloci_alt++;
1258 stats.w_exons+=loc.uexons.Count();
1259 stats.w_introns+=loc.introns.Count();
1260 for (int e=0;e<loc.mexons.Count();e++) {
1261 stats.qbases_all+=loc.mexons[e].end-loc.mexons[e].start+1;
1262 }
1263 }
1264
1265 void collectQData(GSuperLocus& stats, GList<GLocus>& loci, GList<GLocus>& nloci) {
1266 for (int l=0;l<loci.Count();l++) {
1267 //this is called when no refdata is given, so all these loci are nloci
1268 nloci.Add(loci[l]);
1269 collectQLocData(stats,*loci[l]);
1270 }
1271 }
1272
1273 void collectQNOvl(GSuperLocus& stats, GList<GLocus>& loci, GList<GLocus>& nloci) {
1274 for (int l=0;l<loci.Count();l++) {
1275 if (loci[l]->cmpovl.Count()==0) {//locus with no ref loci overlaps
1276 stats.w_loci++; //novel/wrong loci
1277 nloci.Add(loci[l]);
1278 collectQLocData(stats,*loci[l]);
1279 }
1280 }
1281 }
1282
1283 void collectQU(GSuperLocus& stats, GList<GLocus>& nloci) {
1284 for (int l=0;l<nloci.Count();l++) {
1285 stats.w_loci++; //novel/wrong loci
1286 collectQLocData(stats, *nloci[l]);
1287 }
1288 }
1289
1290 void printLocus(FILE* f, GLocus& loc, const char* gseqname) {
1291 fprintf(f, "## Locus %s:%d-%d\n",gseqname, loc.start, loc.end);
1292 for (int m=0;m<loc.mrnas.Count();m++) {
1293 loc.mrnas[m]->printGtf(f);
1294 }
1295 }
1296
1297 void collectRNOvl(GSuperLocus& stats, GList<GLocus>& loci) { //, const char* gseqname) {
1298 for (int l=0;l<loci.Count();l++) {
1299 if (loci[l]->cmpovl.Count()==0) {
1300 stats.m_loci++; //missed ref loci
1301 //if (f_mloci!=NULL)
1302 // printLocus(f_mloci,*loci[l], gseqname);
1303 collectRLocData(stats,*loci[l]);
1304 }
1305 }
1306 }
1307
1308
1309 void collectCmpData(GSuperLocus& stats, GList<GSuperLocus>& cmpdata) { //, const char* gseqname) {
1310 for (int c=0;c<cmpdata.Count();c++) {
1311 stats.addStats(*cmpdata[c]);
1312 /*
1313 if (f_nloci!=NULL && cmpdata[c]->locusTP==0 && cmpdata[c]->rloci.Count()>0) {
1314 fprintf(f_nloci, "# Superlocus %s:%d-%d\n",gseqname, cmpdata[c]->start, cmpdata[c]->end);
1315 for (int l=0;l<cmpdata[c]->rloci.Count();l++) {
1316 printLocus(f_nloci,*cmpdata[c]->rloci[l], gseqname);
1317 }
1318 }
1319 */
1320 }
1321 }
1322
1323 void collectStats(GSuperLocus& stats, GSeqData* seqdata, GSeqData* refdata) {
1324 //collect all stats for a single genomic sequence into stats
1325 if (seqdata==NULL) {
1326 if (reduceRefs || refdata==NULL) return;
1327 //special case with completely missed all refs on a contig/chromosome
1328 collectRData(stats, refdata->loci_f);
1329 collectRData(stats, refdata->loci_r);
1330 return;
1331 }
1332 if (refdata==NULL) {//reference data missing on this contig
1333 collectQData(stats, seqdata->loci_f, seqdata->nloci_f);
1334 collectQData(stats, seqdata->loci_r, seqdata->nloci_r);
1335 collectQU(stats, seqdata->nloci_u);
1336 return;
1337 }
1338
1339 /*stats.total_qloci+=seqdata->loci_f.Count();
1340 stats.total_qloci+=seqdata->loci_r.Count();
1341 if (reduceRefs) { //only collect ref loci from superloci
1342
1343 }
1344 else {
1345 stats.total_rloci+=refdata->loci_f.Count();
1346 stats.total_rloci+=refdata->loci_r.Count();
1347 }
1348 */
1349 //collect data for overlapping superloci (already in seqdata->gstats_f/_r)
1350 //char* gseqname=getGSeqName(seqdata->gseq_id);
1351 collectCmpData(stats, seqdata->gstats_f);
1352 collectCmpData(stats, seqdata->gstats_r);
1353 //for non-overlapping qry loci, always add them as false positives FP
1354 collectQNOvl(stats, seqdata->loci_f, seqdata->nloci_f);
1355 collectQNOvl(stats, seqdata->loci_r, seqdata->nloci_r);
1356 collectQU(stats, seqdata->nloci_u);
1357 if (!reduceRefs) { //find ref loci with empty cmpovl and add them
1358 collectRNOvl(stats, refdata->loci_f);
1359 collectRNOvl(stats, refdata->loci_r);
1360 }
1361 }
1362
1363 void reportStats(FILE* fout, const char* setname, GSuperLocus& stotal,
1364 GSeqData* seqdata, GSeqData* refdata) {
1365 GSuperLocus stats;
1366 bool finalSummary=(seqdata==NULL && refdata==NULL);
1367 GSuperLocus *ps=(finalSummary ? &stotal : &stats );
1368 if (!finalSummary) { //collecting contig stats
1369 //gather statistics for all loci/superloci here
1370 collectStats(stats, seqdata, refdata);
1371 stotal.addStats(stats);
1372 if (!perContigStats) return;
1373 }
1374 ps->calcF();
1375 if (seqdata!=NULL) fprintf(fout, "#> Genomic sequence: %s \n", setname);
1376 else fprintf(fout, "\n#= Summary for dataset: %s :\n", setname);
1377
1378 fprintf(fout, "# Query mRNAs : %7d in %7d loci (%d multi-exon transcripts)\n",
1379 ps->total_qmrnas, ps->total_qloci, ps->total_qichains);
1380 fprintf(fout, "# (%d multi-transcript loci, ~%.1f transcripts per locus)\n",
1381 ps->total_qloci_alt, ((double)ps->total_qmrnas/ps->total_qloci));
1382
1383 if (haveRefs) {
1384 fprintf(fout, "# Reference mRNAs : %7d in %7d loci (%d multi-exon)\n",
1385 ps->total_rmrnas, ps->total_rloci, ps->total_richains);
1386 if (ps->baseTP+ps->baseFP==0 || ps->baseTP+ps->baseFN==0) return;
1387 fprintf(fout, "# Corresponding super-loci: %7d\n",ps->total_superloci);
1388
1389 /*if (seqdata!=NULL) {
1390 fprintf(fout, " ( %d/%d on forward/reverse strand)\n",
1391 seqdata->gstats_f.Count(),seqdata->gstats_r.Count());
1392 }*/
1393 fprintf(fout, "#--------------------| Sn | Sp | fSn | fSp \n");
1394 double sp=(100.0*(double)ps->baseTP)/(ps->baseTP+ps->baseFP);
1395 double sn=(100.0*(double)ps->baseTP)/(ps->baseTP+ps->baseFN);
1396 fprintf(fout, " Base level: \t%5.1f\t%5.1f\t - \t - \n",sn, sp);
1397 sp=(100.0*(double)ps->exonTP)/(ps->exonTP+ps->exonFP);
1398 sn=(100.0*(double)ps->exonTP)/(ps->exonTP+ps->exonFN);
1399 double fsp=(100.0*(double)ps->exonATP)/(ps->exonATP+ps->exonAFP);
1400 double fsn=(100.0*(double)ps->exonATP)/(ps->exonATP+ps->exonAFN);
1401 if (fsp>100.0) fsp=100.0;
1402 if (fsn>100.0) fsn=100.0;
1403 fprintf(fout, " Exon level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1404 if (ps->total_rintrons>0) {
1405 //intron level
1406 sp=(100.0*(double)ps->intronTP)/(ps->intronTP+ps->intronFP);
1407 sn=(100.0*(double)ps->intronTP)/(ps->intronTP+ps->intronFN);
1408 fsp=(100.0*(double)ps->intronATP)/(ps->intronATP+ps->intronAFP);
1409 fsn=(100.0*(double)ps->intronATP)/(ps->intronATP+ps->intronAFN);
1410 if (fsp>100.0) fsp=100.0;
1411 if (fsn>100.0) fsn=100.0;
1412 fprintf(fout, " Intron level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1413 //intron chains:
1414 sp=(100.0*(double)ps->ichainTP)/(ps->ichainTP+ps->ichainFP);
1415 sn=(100.0*(double)ps->ichainTP)/(ps->ichainTP+ps->ichainFN);
1416 if (sp>100.0) sp=100.0;
1417 if (sn>100.0) sn=100.0;
1418 fsp=(100.0*(double)ps->ichainATP)/(ps->ichainATP+ps->ichainAFP);
1419 fsn=(100.0*(double)ps->ichainATP)/(ps->ichainATP+ps->ichainAFN);
1420 if (fsp>100.0) fsp=100.0;
1421 if (fsn>100.0) fsn=100.0;
1422 fprintf(fout, "Intron chain level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1423 }
1424 else {
1425 fprintf(fout, " Intron level: \t - \t - \t - \t - \n");
1426 fprintf(fout, "Intron chain level: \t - \t - \t - \t - \n");
1427 }
1428 sp=(100.0*(double)ps->mrnaTP)/(ps->mrnaTP+ps->mrnaFP);
1429 sn=(100.0*(double)ps->mrnaTP)/(ps->mrnaTP+ps->mrnaFN);
1430 fsp=(100.0*(double)ps->mrnaATP)/(ps->mrnaATP+ps->mrnaAFP);
1431 fsn=(100.0*(double)ps->mrnaATP)/(ps->mrnaATP+ps->mrnaAFN);
1432 if (fsp>100.0) fsp=100.0;
1433 if (fsn>100.0) fsn=100.0;
1434 fprintf(fout, " Transcript level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp,fsn,fsp);
1435 sp=(100.0*(double)ps->locusTP)/(ps->locusTP+ps->locusFP);
1436 sn=(100.0*(double)ps->locusTP)/(ps->locusTP+ps->locusFN);
1437 fsp=(100.0*(double)ps->locusATP)/(ps->locusATP+ps->locusAFP);
1438 fsn=(100.0*(double)ps->locusATP)/(ps->locusATP+ps->locusAFN);
1439 fprintf(fout, " Locus level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1440 sn=(100.0*(double)ps->m_exons)/(ps->total_rexons);
1441 fprintf(fout, " Missed exons:\t%d/%d (%5.1f%%)\n",ps->m_exons, ps->total_rexons, sn);
1442 sn=(100.0*(double)ps->w_exons)/(ps->total_qexons);
1443 fprintf(fout, " Wrong exons:\t%d/%d (%5.1f%%)\n",ps->w_exons, ps->total_qexons,sn);
1444 if (ps->total_rintrons>0) {
1445 sn=(100.0*(double)ps->m_introns)/(ps->total_rintrons);
1446 fprintf(fout, " Missed introns:\t%d/%d (%5.1f%%)\n",ps->m_introns, ps->total_rintrons, sn);
1447 }
1448 if (ps->total_qintrons>0) {
1449 sn=(100.0*(double)ps->w_introns)/(ps->total_qintrons);
1450 fprintf(fout, " Wrong introns:\t%d/%d (%5.1f%%)\n",ps->w_introns, ps->total_qintrons,sn);
1451 }
1452 if (ps->total_rloci>0) {
1453 sn=(100.0*(double)ps->m_loci)/(ps->total_rloci);
1454 fprintf(fout, " Missed loci:\t%d/%d (%5.1f%%)\n",ps->m_loci, ps->total_rloci, sn);
1455 }
1456 if (ps->total_qloci>0) {
1457 sn=(100.0*(double)ps->w_loci)/(ps->total_qloci);
1458 fprintf(fout, " Wrong loci:\t%d/%d (%5.1f%%)\n",ps->w_loci, ps->total_qloci,sn);
1459 }
1460
1461 }
1462 }
1463
1464 int inbuf_len=1024; //starting inbuf capacity
1465 char* inbuf=NULL; // incoming buffer for sequence lines.
1466
1467 void loadRefDescr(const char* fname) {
1468 if (inbuf==NULL) { GMALLOC(inbuf, inbuf_len); }
1469 FILE *f=fopen(fname, "rb");
1470 if (f==NULL) GError("Error opening exon file: %s\n",fname);
1471 char* line;
1472 int llen=0;
1473 off_t fpos;
1474 while ((line=fgetline(inbuf, inbuf_len, f, &fpos, &llen))!=NULL) {
1475 if (strlen(line)<=2) continue;
1476 int idlen=strcspn(line,"\t ");
1477 char* p=line+idlen;
1478 if (idlen<llen && idlen>0) {
1479 *p=0;
1480 p++;
1481 refdescr.Add(line, new GStr(p));
1482 }
1483 }
1484 }
1485
1486 GSeqTrack* findGSeqTrack(int gsid) {
1487 GSeqTrack f(gsid);
1488 int fidx=-1;
1489 if (gseqtracks.Found(&f,fidx))
1490 return gseqtracks[fidx];
1491 fidx=gseqtracks.Add(new GSeqTrack(gsid));
1492 return gseqtracks[fidx];
1493 }
1494
1495
1496
1497 GffObj* findRefMatch(GffObj& m, GLocus& rloc, int& ovlen) {
1498 ovlen=0;
1499 CTData* mdata=((CTData*)m.uptr);
1500 if (mdata->eqref!=NULL && ((CTData*)(mdata->eqref->uptr))->locus==&rloc) {
1501 mdata->eqref=mdata->ovls.First()->mrna; //this should be unnecessary
1502 //check it?
1503 return mdata->ovls.First()->mrna;
1504 }
1505 //if (rloc==NULL|| m==NULL) return NULL;
1506 GffObj* ret=NULL;
1507 for (int r=0;r<rloc.mrnas.Count();r++) {
1508 int olen=0;
1509 if (tMatch(m, *(rloc.mrnas[r]),olen, true)) { //return rloc->mrnas[r];
1510 /*
1511 if (ovlen<olen) {
1512 ovlen=olen;
1513 ret=rloc.mrnas[r]; //keep the longest matching ref
1514 //but this is unnecessary, there can be only one matching ref
1515 // because duplicate refs were discarded (unless -G was used)
1516 }
1517 */
1518 mdata->addOvl('=',rloc.mrnas[r], olen);
1519 ret=mdata->ovls.First()->mrna;
1520 //this must be called only for the head of an equivalency chain
1521 CTData* rdata=(CTData*)rloc.mrnas[r]->uptr;
1522 rdata->addOvl('=',&m,olen);
1523 //if (rdata->eqnext==NULL) rdata->eqnext=&m;
1524 }
1525 }
1526 if (ret!=NULL)
1527 mdata->eqref=ret;
1528 return ret;
1529 }
1530
1531
1532 void addXCons(GXLocus* xloc, GffObj* ref, char ovlcode, GffObj* tcons, CEqList* ts) {
1533 GXConsensus* c=new GXConsensus(tcons, ts, ref, ovlcode);
1534 //xloc->tcons.Add(c);
1535 //this will also check c against the other tcons for containment:
1536 xloc->addXCons(c);
1537 }
1538
1539
1540 const uint pre_mrna_threshold = 100;
1541
1542 char getOvlCode(GffObj& m, GffObj& r, int& ovlen) {
1543 ovlen=0;
1544 if (!m.overlap(r.start,r.end)) return 'u';
1545 int jmax=r.exons.Count()-1;
1546
1547 if (m.exons.Count()==1) { //single-exon transfrag
1548 GSeg mseg(m.start, m.end);
1549 if (jmax==0) { //also single-exon ref
1550 ovlen=mseg.overlapLen(r.start,r.end);
1551 int lmax=GMAX(r.covlen, m.covlen);
1552 if (ovlen >= lmax*0.8) return '='; //fuzz matching for single-exon transcripts: 80% of the longer one
1553 //if (m.covlen<=ovlen+12 && m.covlen<r.covlen) return 'c'; //contained
1554 if (m.covlen<r.covlen && ovlen >= m.covlen*0.8) return 'c';
1555 return 'o'; //just plain overlapping
1556 }
1557 //single-exon qry overlaping multi-exon ref
1558 for (int j=0;j<=jmax;j++) {
1559 //check if it's contained by an exon
1560 if (m.start>r.exons[j]->start-8 && m.end<r.exons[j]->end+8)
1561 return 'c';
1562 if (j==jmax) break;
1563 //check if it's contained by an intron
1564 if (m.end<r.exons[j+1]->start && m.start>r.exons[j]->end)
1565 return 'i';
1566 // check if it's a potential pre-mRNA transcript
1567 // (if overlaps an intron at least 10 bases)
1568 uint iovlen=mseg.overlapLen(r.exons[j]->end+1, r.exons[j+1]->start-1);
1569 if (iovlen>=10 && mseg.len()>iovlen+10) return 'e';
1570 }
1571 return 'o'; //plain overlap, uncategorized
1572 } //single-exon transfrag
1573 //-- from here on we have a multi-exon transfrag --
1574 // * check if contained by a ref intron
1575 for (int j=0;j<jmax;j++) {
1576 if (m.end<r.exons[j+1]->start && m.start>r.exons[j]->end)
1577 return 'i';
1578 }
1579 //> check if m's intron chain is a subset of r's intron chain
1580 int imax=m.exons.Count()-1;// imax>0 here
1581 if (m.exons[imax]->start<r.exons[0]->end ||
1582 r.exons[jmax]->start<m.exons[0]->end ) //intron chains do not overlap at all
1583 return 'o'; //but terminal exons do, otherwise we wouldn't be here
1584 int i=1; //index of exon to the right of current qry intron
1585 int j=1; //index of exon to the right of current ref intron
1586 //find first intron overlap
1587 while (i<=imax && j<=jmax) {
1588 if (r.exons[j]->start<m.exons[i-1]->end) { j++; continue; }
1589 if (m.exons[i]->start<r.exons[j-1]->end) { i++; continue; }
1590 break; //here we have an intron overlap
1591 }
1592 if (i>imax || j>jmax)
1593 return 'o'; //no initial intron overlap found
1594 //from here on we check all qry introns against ref introns
1595 bool jmatch=false; //true if at least a junction match is found
1596 bool icmatch=(i==1); //intron chain match - it will be updated as introns are checked
1597 //bool exovli=false; // if any terminal exon of qry extends into a ref intron
1598 int jmstart=j; //index of first intron overlap of reference
1599 int jmend=0; //index of last intron overlap of reference
1600 int imend=0; //index of last intron overlap of query
1601 //check for intron matches
1602 while (i<=imax && j<=jmax) {
1603 uint mstart=m.exons[i-1]->end;
1604 uint mend=m.exons[i]->start;
1605 uint rstart=r.exons[j-1]->end;
1606 uint rend=r.exons[j]->start;
1607 if (rend<mstart) { j++; icmatch=false; continue; } //skipping ref intron, no ichain match
1608 if (mend<rstart) { i++; icmatch=false; continue; } //skipping qry intron, no ichain match
1609 //overlapping introns here, test junction matching
1610 jmend=j; //keep track of last overlapping intron
1611 imend=i;
1612 bool smatch=(mstart==rstart);
1613 bool ematch=(mend==rend);
1614 if (smatch || ematch) jmatch=true;
1615 if (smatch && ematch) { i++; j++; } //perfect match for this intron
1616 else { //at least one junction doesn't match
1617 icmatch=false;
1618 if (mend>rend) j++; else i++;
1619 }
1620 } //while checking intron overlaps
1621
1622 if (icmatch && imend==imax) { // qry intron chain match
1623 if (jmstart==1 && jmend==jmax) return '='; //identical intron chains
1624 // -- qry intron chain is shorter than ref intron chain --
1625 int l_iovh=0; // overhang of leftmost q exon left boundary beyond the end of ref intron to the left
1626 int r_iovh=0; // same type of overhang through the ref intron on the right
1627 if (jmstart>1 && r.exons[jmstart-1]->start>m.start)
1628 l_iovh = r.exons[jmstart-1]->start - m.start;
1629 if (jmend<jmax && m.end > r.exons[jmend]->end)
1630 r_iovh = m.end - r.exons[jmend]->end;
1631 if (l_iovh<4 && r_iovh<4) return 'c';
1632 //TODO? check if any x_iovl>10 and return 'e' to signal an "unspliced intron" ?
1633 // or we can check if any of them are >= the length of the corresponding ref intron on that side
1634 return 'j';
1635 }
1636 /*
1637 if (icmatch && (jmax>=imax)) { //all qry introns match
1638 //but they may overlap
1639 // if ((lbound && lbound > m.exons[0]->start+10) ||
1640 // (j<=jmax && m.exons[i-1]->end > r.exons[j-1]->end+10)) return 'j';
1641 // return 'c';
1642 // }
1643 int code = 'c';
1644 if (lbound)
1645 {
1646 uint ref_boundary = lbound;
1647 uint cuff_boundary = m.exons[0]->start;
1648 if (ref_boundary > (cuff_boundary + pre_mrna_threshold)) // cuff extends a lot
1649 {
1650 code = 'j';
1651 }
1652 if (ref_boundary > cuff_boundary) // cuff extends just a bit into a ref intron
1653 {
1654 code = 'e';
1655 }
1656 }
1657 if (j <= jmax)
1658 {
1659 uint ref_boundary = r.exons[j-1]->end;
1660 uint cuff_boundary = m.exons[i-1]->end;
1661 if (cuff_boundary > (ref_boundary + pre_mrna_threshold)) // cuff extends a lot
1662 {
1663 code = 'j';
1664 }
1665 if (cuff_boundary > ref_boundary) // cuff extends just a bit into a ref intron
1666 {
1667 code = 'e';
1668 }
1669 }
1670 //if ((lbound && lbound > m.exons[0]->start+10) ||
1671 // (j<=jmax && m.exons[i-1]->end > r.exons[j-1]->end+10)) return 'j';
1672 return code;
1673 }
1674
1675 // if (!ichain) // first and last exons more or less match, but there's a different intron somewhere
1676 // {
1677 //
1678 // }
1679
1680 */
1681 return jmatch ? 'j':'o';
1682 }
1683
1684 char getRefOvl(GffObj& m, GLocus& rloc, GffObj*& rovl, int& ovlen) {
1685 rovl=NULL;
1686 ovlen=0;
1687 if (m.start>rloc.end || m.end<rloc.start) {
1688 //see if it's a polymerase run ?
1689 /*
1690 if ((m.strand=='+' && m.end<=rloc.end+polyrun_range) ||
1691 (m.strand=='-' && m.start>=rloc.start-polyrun_range)) {
1692 rovl=rloc.mrna_maxcov;
1693 ((CTData*)m.uptr)->addOvl('p',rloc.mrna_maxcov);
1694 return 'p';
1695 }
1696 */
1697 return 0; //unknown -> intergenic space
1698 }
1699 for (int i=0;i<rloc.mrnas.Count();i++) {
1700 GffObj* r=rloc.mrnas[i];
1701 int olen=0;
1702 char ovlcode=getOvlCode(m,*r,olen);
1703 if (ovlcode!=0) { //has some sort of "overlap" with r
1704 ((CTData*)m.uptr)->addOvl(ovlcode,r,olen);
1705 if (olen>ovlen) ovlen=olen;
1706 if (ovlcode=='c' || ovlcode=='=') //keep match/containment for each reference transcript
1707 ((CTData*)r->uptr)->addOvl(ovlcode,&m,olen);
1708 }
1709 }//for each ref in rloc
1710 // i,j,o
1711 return ((CTData*)m.uptr)->getBestCode();
1712 }
1713
1714 /*
1715 void findTMatches(GTrackLocus& loctrack, int qcount) {
1716 //perform an all vs. all ichain-match for all transcripts across all loctrack[i]->qloci
1717 for (int q=0;q<qcount-1;q++) { //for each qry dataset
1718 if (loctrack[q]==NULL) continue;
1719 for (int qi=0;qi<loctrack[q]->Count();qi++) { // for each transcript in q dataset
1720 GffObj* qi_t=loctrack[q]->Get(qi);
1721 CTData* qi_d=(CTData*)qi_t->uptr;
1722 if ((qi_d->eqdata & EQHEAD_TAG) !=0) { //this is set as an EQ chain head already
1723 //if (qi_t->exons.Count()>1)
1724 continue;
1725 }
1726 for (int n=q+1;n<qcount;n++) { // for every successor dataset
1727 if (loctrack[n]==NULL) continue;
1728 for (int ni=0;ni<loctrack[n]->Count();ni++) {
1729 GffObj* ni_t=loctrack[n]->Get(ni);
1730 CTData* ni_d=(CTData*)ni_t->uptr;
1731 //if (ni_d->eqdata!=0) continue; //already part of an EQ chain
1732 //single exon transfrags have special treatment:
1733 bool s_match=(ni_t->exons.Count()==1 && qi_t->exons.Count()==1);
1734 if (ni_d->eqdata!=0 && !s_match) continue; //already part of an EQ chain
1735 if (ni_d->eqnext!=NULL) continue;
1736 int ovlen=0;
1737
1738 if (qi_d->eqnext!=NULL) {
1739 if (!s_match) continue;
1740 //test all in the EQ list for a match
1741 bool matchFound=false;
1742 CTData* next_eq_d=qi_d;
1743 if (tMatch(*qi_t, *ni_t, ovlen, true)) {
1744 matchFound=true;
1745 }
1746 else {
1747 while (next_eq_d->eqnext!=NULL) {
1748 if (tMatch(*(next_eq_d->eqnext), *ni_t, ovlen, true)) {
1749 matchFound=true;
1750 break;
1751 }
1752 next_eq_d=(CTData*)(next_eq_d->eqnext->uptr);
1753 } //last in the chain
1754 }
1755 if (matchFound) {
1756 //add this to the end of the EQ chain instead
1757 next_eq_d=(CTData*)(qi_d->eqnext->uptr);
1758 while (next_eq_d->eqnext!=NULL) {
1759 next_eq_d=(CTData*)(next_eq_d->eqnext->uptr);
1760 } //last in the chain
1761 next_eq_d->eqnext=ni_t;
1762 ni_d->eqdata=n+1;
1763 ni_d->eqnext=NULL; ///TEST
1764 }
1765 }
1766 else {
1767 if (tMatch(*qi_t,*ni_t, ovlen, true)) {
1768 qi_d->eqnext=ni_t;
1769 ni_d->eqnext=NULL; ///TEST
1770 if (qi_d->eqdata == 0) {//only start of chain is tagged
1771 qi_d->eqdata = ((q+1) | EQHEAD_TAG);
1772 }
1773 ni_d->eqdata=n+1; //EQ chain member only marked with qry# (1-based)
1774 }
1775 } //multi-exon case
1776 } //for each transfrag in the next qry dataset
1777
1778 if (qi_d->eqnext!=NULL && qi_t->exons.Count()>1) break;
1779 //part of a chain already, skip other datasets
1780 } // for each successor dataset
1781 } //for each transcript in qry dataset
1782 } //for each qry dataset
1783 }
1784 */
1785
1786 void findTMatches(GTrackLocus& loctrack, int qcount) {
1787 //perform an all vs. all ichain-match for all transcripts across all loctrack[i]->qloci
1788 for (int q=0;q<qcount-1;q++) { //for each qry dataset
1789 if (loctrack[q]==NULL) continue;
1790 for (int qi=0;qi<loctrack[q]->Count();qi++) { // for each transcript in q dataset
1791 GffObj* qi_t=loctrack[q]->Get(qi);
1792 CTData* qi_d=(CTData*)qi_t->uptr;
1793 if (qi_d->eqlist!=NULL && qi_t->exons.Count()>1) {
1794 continue; //this is part of an EQ chain already
1795 }
1796 for (int n=q+1;n<qcount;n++) { // for every successor dataset
1797 if (loctrack[n]==NULL) continue;
1798 for (int ni=0;ni<loctrack[n]->Count();ni++) {
1799 GffObj* ni_t=loctrack[n]->Get(ni);
1800 CTData* ni_d=(CTData*)ni_t->uptr;
1801 bool singleExon=(ni_t->exons.Count()==1 && qi_t->exons.Count()==1);
1802 if (ni_d->eqlist!=NULL &&
1803 (ni_d->eqlist==qi_d->eqlist || !singleExon)) continue;
1804 int ovlen=0;
1805 if ((ni_d->eqlist==qi_d->eqlist && qi_d->eqlist!=NULL) ||
1806 tMatch(*qi_t,*ni_t, ovlen, singleExon)) {
1807 qi_d->joinEqList(ni_t);
1808 }
1809 }
1810 } // for each successor dataset
1811 } //for each transcript in qry dataset
1812 } //for each qry dataset
1813 }
1814
1815
1816 int cmpTData_qset(const pointer* p1, const pointer* p2) {
1817 CTData* d1=(CTData*)(((GffObj*)p1)->uptr);
1818 CTData* d2=(CTData*)(((GffObj*)p2)->uptr);
1819 return (d1->qset - d2->qset);
1820 }
1821
1822 void printITrack(FILE* ft, GList<GffObj>& mrnas, int qcount, int& cnum) {
1823 for (int i=0;i<mrnas.Count();i++) {
1824 GffObj& qt=*(mrnas[i]);
1825 CTData* qtdata=(CTData*)qt.uptr;
1826 int qfidx=qtdata->qset;
1827 char ovlcode=qtdata->classcode;
1828 //GList<GffObj> eqchain(false,false,false);
1829 CEqList* eqchain=qtdata->eqlist;
1830 GffObj* ref=NULL; //related ref -- it doesn't have to be fully matching
1831 GffObj* eqref=NULL; //fully ichain-matching ref
1832 GffObj* tcons=NULL; //"consensus" (largest) transcript for a clique
1833 int tmaxcov=0;
1834 //eqchain.Add(&qt);
1835 eqref=qtdata->eqref;
1836 if (qtdata->ovls.Count()>0 && qtdata->ovls[0]->mrna!=NULL) {
1837 //if it has ovlcode with a ref
1838 ref=qtdata->ovls[0]->mrna;
1839 //consistency check: qtdata->ovls[0]->code==ovlcode
1840 // -- let tcons be a transfrag, not a ref transcript
1841 //tcons=eqref;
1842 //if (tcons!=NULL) tmaxcov=tcons->covlen;
1843 }
1844 //chain pre-check
1845 if (tcons==NULL || mrnas[i]->covlen>tmaxcov) {
1846 tcons=mrnas[i];
1847 tmaxcov=tcons->covlen;
1848 }
1849 if (qtdata->isEqHead()) {//head of a equivalency chain
1850 //check if all transcripts in this chain have the same ovlcode
1851 for (int k=0;k<qtdata->eqlist->Count();k++) {
1852 GffObj* m=qtdata->eqlist->Get(k);
1853 if (m->covlen>tmaxcov) {
1854 tmaxcov=m->covlen;
1855 tcons=m;
1856 }
1857 if (ovlcode!='=' && ovlcode!='.' && ((CTData*)m->uptr)->getBestCode()!=ovlcode) {
1858 ovlcode='.'; //non-uniform ovlcode
1859 }
1860 }
1861 /*
1862 GffObj* m=mrnas[i];
1863 while (((CTData*)m->uptr)->eqnext!=NULL) {
1864 m=((CTData*)m->uptr)->eqnext;
1865 eqchain.Add(m);
1866 if (m->covlen>tmaxcov) {
1867 tmaxcov=m->covlen;
1868 tcons=m;
1869 }
1870 if (ovlcode!='=' && ovlcode!='.' && ((CTData*)m->uptr)->getBestCode()!=ovlcode) {
1871 ovlcode='.'; //non-uniform ovlcode
1872 //break;
1873 }
1874 } //while elements in chain
1875 */
1876
1877 }//chain check
1878 //if (ovlcode=='p') ref=NULL; //ignore polymerase runs?
1879 if (ovlcode==0 || ovlcode=='-') ovlcode = (ref==NULL) ? 'u' : '.';
1880 //-- print columns 1 and 2 as LOCUS_ID and TCONS_ID
1881 //bool chainHead=(qtdata->eqnext!=NULL && ((qtdata->eqdata & EQHEAD_TAG)!=0));
1882 bool chainHead=qtdata->isEqHead();
1883 //bool noChain=((qtdata->eqdata & EQCHAIN_TAGMASK)==0);
1884 bool noChain=(eqchain==NULL);
1885 if (chainHead || noChain) {
1886 cnum++;
1887 if (ft!=NULL) fprintf(ft,"%s_%08d\t",cprefix,cnum);
1888 GXLocus* xloc=qtdata->locus->xlocus;
1889 if (xloc!=NULL) {
1890 if (ft!=NULL) fprintf(ft, "XLOC_%06d\t",xloc->id);
1891 if (tcons->exons.Count()>1) {
1892 //! only multi-exon mRNAs are counted for multi-transcript xloci !
1893 xloc->num_mtcons++;
1894 if (xloc->num_mtcons==2)
1895 total_xloci_alt++;
1896 }
1897 }
1898 else {
1899 //should NEVER happen!
1900 int fidx=qtdata->qset;
1901 GError("Error: no XLocus created for transcript %s (file %s) [%d, %d], on %s%c:%d-%d\n", qt.getID(),
1902 qryfiles[qtdata->locus->qfidx]->chars(), qtdata->locus->qfidx, fidx, qt.getGSeqName(), qt.strand, qt.start, qt.end);
1903 }
1904 addXCons(xloc, ref, ovlcode, tcons, eqchain);
1905 } // if chain head or uniq entry (not part of a chain)
1906 if (ft==NULL) continue;
1907 if (chainHead) {
1908 //this is the start of a equivalence class as a printing chain
1909 if (ref!=NULL) fprintf(ft,"%s|%s\t%c", getGeneID(ref),ref->getID(), ovlcode);
1910 else fprintf(ft,"-\t%c", ovlcode);
1911 GffObj* m=mrnas[i];
1912 CTData* mdata=(CTData*)m->uptr;
1913
1914 int lastpq=-1;
1915 /*
1916 for (int ptab=mdata->qset-lastpq; ptab>0;ptab--)
1917 if (ptab>1) fprintf(ft,"\t-");
1918 else fprintf(ft,"\t");
1919 lastpq=mdata->qset;
1920 fprintf(ft,"q%d:%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|%d", lastpq+1, getGeneID(m), m->getID(),
1921 iround(m->gscore/10), mdata->FPKM, mdata->conf_lo, mdata->conf_hi, mdata->cov, m->covlen);
1922 //traverse linked list of matching transcripts
1923 while (mdata->eqnext!=NULL) {
1924 m=mdata->eqnext;
1925 mdata=(CTData*)m->uptr;
1926 for (int ptab=mdata->qset-lastpq;ptab>0;ptab--)
1927 if (ptab>1) fprintf(ft,"\t-");
1928 else fprintf(ft,"\t");
1929 lastpq = mdata->qset;
1930 fprintf(ft,"q%d:%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|%d", lastpq+1, getGeneID(m), m->getID(),
1931 iround(m->gscore/10), mdata->FPKM,mdata->conf_lo,mdata->conf_hi,mdata->cov, m->covlen);
1932 } //traverse and print row
1933 */
1934 eqchain->setUnique(false);
1935 eqchain->setSorted((GCompareProc*) cmpTData_qset);
1936
1937 for (int k=0;k<eqchain->Count();k++) {
1938 m=eqchain->Get(k);
1939 mdata=(CTData*)m->uptr;
1940 if (mdata->qset==lastpq) {
1941 //shouldn't happen, unless this input set is messed up (has duplicates/redundant transfrags)
1942 fprintf(ft,",%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|%d", getGeneID(m), m->getID(),
1943 iround(m->gscore/10), mdata->FPKM,mdata->conf_lo,mdata->conf_hi,mdata->cov, m->covlen);
1944 continue;
1945 }
1946 for (int ptab=mdata->qset-lastpq;ptab>0;ptab--)
1947 if (ptab>1) fprintf(ft,"\t-");
1948 else fprintf(ft,"\t");
1949 lastpq = mdata->qset;
1950 fprintf(ft,"q%d:%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|%d", lastpq+1, getGeneID(m), m->getID(),
1951 iround(m->gscore/10), mdata->FPKM,mdata->conf_lo,mdata->conf_hi,mdata->cov, m->covlen);
1952 }
1953 for (int ptab=qcount-lastpq-1;ptab>0;ptab--)
1954 fprintf(ft,"\t-");
1955 fprintf(ft,"\n");
1956 continue;
1957 } //start of eq class (printing chain)
1958
1959 if (eqchain!=NULL) continue; //part of a matching chain, dealt with previously
1960
1961 //--------- not in an ichain-matching class, print as singleton
1962
1963 if (ref!=NULL) fprintf(ft,"%s|%s\t%c",getGeneID(ref), ref->getID(), ovlcode);
1964 else fprintf(ft,"-\t%c",ovlcode);
1965 for (int ptab=qfidx;ptab>=0;ptab--)
1966 if (ptab>0) fprintf(ft,"\t-");
1967 else fprintf(ft,"\t");
1968 fprintf(ft,"q%d:%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|-",qfidx+1, getGeneID(qt), qt.getID(),iround(qt.gscore/10),
1969 qtdata->FPKM, qtdata->conf_lo,qtdata->conf_hi,qtdata->cov);
1970 for (int ptab=qcount-qfidx-1;ptab>0;ptab--)
1971 fprintf(ft,"\t-");
1972 fprintf(ft,"\n");
1973 } //for each transcript
1974 }
1975
1976
1977 void findTRMatch(GTrackLocus& loctrack, int qcount, GLocus& rloc) {
1978 //requires loctrack to be already populated with overlapping qloci by findTMatches()
1979 // which also found (and tagged) all matching qry transcripts
1980 for (int q=0;q<qcount;q++) { //for each qry dataset
1981 if (loctrack[q]==NULL) continue;
1982 for (int qi=0;qi<loctrack[q]->Count();qi++) { // for each transcript in q dataset
1983 //if (loctrack[q]->cl[qi]->exons.Count()<2) continue; //skip single-exon transcripts
1984 GffObj& qt=*(loctrack[q]->Get(qi));
1985 CTData* qtdata=(CTData*)qt.uptr;
1986 GffObj* rmatch=NULL; //== ref match for this row
1987 int rovlen=0;
1988 //if (qtdata->eqnext!=NULL && ((qtdata->eqdata & EQHEAD_TAG)!=0)) {
1989 if (qtdata->isEqHead()) {
1990 //EQ chain head -- transfrag equivalency list starts here
1991 if (qtdata->eqref==NULL) { //find rloc overlap
1992 if (qt.overlap(rloc.start, rloc.end)) {
1993 rmatch=findRefMatch(qt, rloc, rovlen);
1994 }
1995 } else rmatch=qtdata->eqref;
1996 if (rmatch!=NULL) {
1997 /*
1998 GffObj* m=loctrack[q]->Get(qi);
1999 //traverse linked list of matching transcripts
2000 while (((CTData*)m->uptr)->eqnext!=NULL) {
2001 m=((CTData*)m->uptr)->eqnext;
2002 if (rmatch!=NULL) {
2003 ((CTData*)m->uptr)->addOvl('=',rmatch,rovlen);
2004 }
2005 } //traverse qry data sets
2006 continue;
2007 }
2008 */
2009 for (int k=0;k<qtdata->eqlist->Count();k++) {
2010 GffObj* m=qtdata->eqlist->Get(k);
2011 ((CTData*)m->uptr)->addOvl('=',rmatch,rovlen);
2012 continue;
2013 }
2014 }
2015 //if (rmatch!=NULL) continue;
2016 } //equivalence class (chain of intron-matching)
2017 //if ((qtdata->eqdata & EQCHAIN_TAGMASK)!=0) continue; //part of a matching chain, dealt with previously
2018
2019 //--------- qry mrna not in a '=' matching clique
2020 if (qtdata->eqref==NULL) { //find any rloc overlap
2021 if (qt.overlap(rloc.start, rloc.end)) {
2022 rmatch=findRefMatch(qt, rloc, rovlen);
2023 if (rmatch==NULL) {
2024 //not an ichain match, look for other codes
2025 GffObj* rovl=NULL;
2026 int rovlen=0;
2027 //char ovlcode=
2028 getRefOvl(qt, rloc,rovl,rovlen);
2029 }
2030 }
2031 }
2032 else rmatch=qtdata->eqref;
2033 } //for each qry transcript
2034 }//for each qry dataset
2035 }
2036
2037
2038 bool inPolyRun(char strand, GffObj& m, GList<GLocus>* rloci, int& rlocidx) {
2039 //we are only here if there is no actual overlap between m and any locus in rloci
2040 if (rloci==NULL || rloci->Count()==0) return false; // || m.exons.Count()>1
2041 if (strand=='-') {
2042 rlocidx=qsearch_loci(m.end, *rloci);
2043 //returns index of locus starting just ABOVE m.end
2044 // or -1 if last locus start <= m.end
2045 GLocus* rloc=NULL;
2046 if (rlocidx<0) return false;
2047 while (rlocidx<rloci->Count()) {
2048 rloc=rloci->Get(rlocidx);
2049 if (rloc->start>m.end+polyrun_range) break;
2050 if (rloc->start+6>m.end) return true;
2051 rlocidx++;
2052 }
2053 }
2054 else { // strand == '+' (or '.' ?)
2055 rlocidx=qsearch_loci(m.end, *rloci);
2056 GLocus* rloc=NULL;
2057 //returns index of closest locus starting ABOVE m.end
2058 // or -1 if last locus start <= m.end
2059 if (rlocidx<0) rlocidx=rloci->Count(); //this may actually start below m.end
2060 while ((--rlocidx)>=0) {
2061 rloc=rloci->Get(rlocidx);
2062 if (m.start>rloc->start+GFF_MAX_LOCUS) break;
2063 if (m.start+6>rloc->end && m.start<rloc->end+polyrun_range) return true;
2064 }
2065 }
2066 return false;
2067 }
2068
2069 CTData* getBestOvl(GffObj& m) {
2070 //CTData* mdata=(CTData*)m.uptr;
2071 //return mdata->getBestCode();
2072 if ( ((CTData*)m.uptr)->ovls.Count()>0)
2073 return (CTData*)m.uptr;
2074 return NULL;
2075 }
2076
2077 void reclass_XStrand(GList<GffObj>& mrnas, GList<GLocus>* rloci) {
2078 //checking for relationship with ref transcripts on opposite strand
2079 if (rloci==NULL || rloci->Count()<1) return;
2080 int j=0;//current rloci index
2081 for (int i=0;i<mrnas.Count();i++) {
2082 GffObj& m=*mrnas[i];
2083 char ovlcode=((CTData*)m.uptr)->getBestCode();
2084 if (ovlcode>47 && strchr("=cjeo",ovlcode)!=NULL) continue;
2085 GLocus* rloc=rloci->Get(j);
2086 if (rloc->start>m.end) continue; //check next transfrag
2087 while (m.start>rloc->end && j+1<rloci->Count()) {
2088 j++;
2089 rloc=rloci->Get(j);
2090 }
2091 if (rloc->start>m.end) continue; //check next transfrag
2092 //m overlaps rloc:
2093 //check if m has a fuzzy intron overlap -> 's' (shadow, mapped on the wrong strand)
2094 // then if m is contained within an intron -> 'i'
2095 // otherwise it's just a plain cross-strand overlap: 'x'
2096 int jm=0;
2097 do { //while rloci overlap this transfrag (m)
2098 rloc=rloci->Get(j+jm);
2099 bool is_shadow=false;
2100 GffObj* sovl=NULL;
2101 bool is_intraintron=false;
2102 GffObj* iovl=NULL;
2103 if (rloc->introns.Count()>0) {
2104 for (int n=0;n<rloc->introns.Count();n++) {
2105 GISeg& rintron=rloc->introns[n];
2106 if (rintron.start>m.end) break;
2107 if (m.start>rintron.end) continue;
2108 //overlap between m and intron
2109 if (m.end<=rintron.end && m.start>=rintron.start) {
2110 is_intraintron=true;
2111 if (iovl==NULL || iovl->covlen<rintron.t->covlen) iovl=rintron.t;
2112 continue;
2113 }
2114 //check if any intron of m has a fuzz-match with rintron
2115 for (int e=1;e<m.exons.Count();e++) {
2116 GSeg mintron(m.exons[e-1]->end+1,m.exons[e]->start-1);
2117 if (rintron.coordMatch(&mintron,10)) {
2118 is_shadow=true;
2119 if (sovl==NULL || sovl->covlen<rintron.t->covlen) sovl=rintron.t;
2120 break;
2121 }
2122 } //for each m intron
2123 } //for each intron of rloc
2124 }//rloc has introns
2125 bool xcode=true;
2126 if (is_shadow) { ((CTData*)m.uptr)->addOvl('s', sovl); xcode=false; }
2127 // else
2128 if (ovlcode!='i' && is_intraintron) { ((CTData*)m.uptr)->addOvl('i', iovl); xcode=false; }
2129 if (xcode) {
2130 // just plain overlap, find the overlapping mrna in rloc
2131 GffObj* maxovl=NULL;
2132 int ovlen=0;
2133 for (int ri=0;ri<rloc->mrnas.Count();ri++) {
2134 int o=m.exonOverlapLen(*(rloc->mrnas[ri]));
2135 if (o>ovlen) {
2136 ovlen=o;
2137 maxovl=rloc->mrnas[ri];
2138 }
2139 }
2140 if (maxovl!=NULL) ((CTData*)m.uptr)->addOvl('x',maxovl);
2141 } //'x'
2142 jm++;
2143 } while (j+jm<rloci->Count() && rloci->Get(j+jm)->overlap(m));
2144 } //for each transfrag
2145 }
2146
2147 void reclass_mRNAs(char strand, GList<GffObj>& mrnas, GList<GLocus>* rloci, GFaSeqGet *faseq) {
2148 int rlocidx=-1;
2149 for (int i=0;i<mrnas.Count();i++) {
2150 GffObj& m=*mrnas[i];
2151 char ovlcode=((CTData*)m.uptr)->getBestCode();
2152 //if (ovlcode=='u' || ovlcode=='i' || ovlcode==0) {
2153 if (ovlcode=='u' || ovlcode<47) {
2154 //check for overlaps with ref transcripts on the other strand
2155 if (m.exons.Count()==1 && inPolyRun(strand, m, rloci, rlocidx)) {
2156 ((CTData*)m.uptr)->addOvl('p',rloci->Get(rlocidx)->mrna_maxcov);
2157 }
2158 else { //check for repeat content
2159 if (faseq!=NULL) {
2160 int seqlen;
2161 char* seq=m.getSpliced(faseq, false, &seqlen);
2162 //get percentage of lowercase
2163 int numlc=0;
2164 for (int c=0;c<seqlen;c++) if (seq[c]>='a') numlc++;
2165 if (numlc > seqlen/2)
2166 ((CTData*)m.uptr)->addOvl('r');
2167 GFREE(seq);
2168 }
2169 }
2170 } //for unassigned class
2171 }//for each mrna
2172
2173 }
2174
2175 void reclassLoci(char strand, GList<GLocus>& qloci, GList<GLocus>* rloci, GFaSeqGet *faseq) {
2176 for (int ql=0;ql<qloci.Count();ql++) {
2177 reclass_mRNAs(strand, qloci[ql]->mrnas, rloci, faseq);
2178 //find closest upstream ref locus for this q locus
2179 } //for each locus
2180 }
2181
2182 //for a single genomic sequence, all qry data and ref data is stored in gtrack
2183 //check for all 'u' transfrags if they are repeat ('r') or polymerase run 'p' or anything else
2184 void umrnaReclass(int qcount, GSeqTrack& gtrack, FILE** ftr, GFaSeqGet* faseq=NULL) {
2185 for (int q=0;q<qcount;q++) {
2186 if (gtrack.qdata[q]==NULL) continue; //no transcripts in this q dataset for this genomic seq
2187 reclassLoci('+', gtrack.qdata[q]->loci_f, gtrack.rloci_f, faseq);
2188 reclassLoci('-', gtrack.qdata[q]->loci_r, gtrack.rloci_r, faseq);
2189 reclass_mRNAs('+', gtrack.qdata[q]->umrnas, gtrack.rloci_f, faseq);
2190 reclass_mRNAs('-', gtrack.qdata[q]->umrnas, gtrack.rloci_r, faseq);
2191 //and also check for special cases with cross-strand overlaps:
2192 reclass_XStrand(gtrack.qdata[q]->mrnas_f, gtrack.rloci_r);
2193 reclass_XStrand(gtrack.qdata[q]->mrnas_r, gtrack.rloci_f);
2194 // print all tmap data here here:
2195 for (int i=0;i<gtrack.qdata[q]->tdata.Count();i++) {
2196 CTData* mdata=gtrack.qdata[q]->tdata[i];
2197 if (mdata->mrna==NULL) continue; //invalidated -- removed earlier
2198 //GLocus* rlocus=NULL;
2199 mdata->classcode='u';
2200 GffObj* ref=NULL;
2201 if (mdata->ovls.Count()>0) {
2202 mdata->classcode=mdata->ovls[0]->code;
2203 ref=mdata->ovls[0]->mrna;
2204 }
2205 //if (mdata->classcode<33) mdata->classcode='u';
2206 if (mdata->classcode<47) mdata->classcode='u'; // if 0, '-' or '.'
2207 if (tmapFiles) {
2208 char ref_match_len[2048];
2209 if (ref!=NULL) {
2210 sprintf(ref_match_len, "%d",ref->covlen);
2211 fprintf(ftr[q],"%s\t%s\t",getGeneID(ref),ref->getID());
2212 //rlocus=((CTData*)(ref->uptr))->locus;
2213 }
2214 else {
2215 fprintf(ftr[q],"-\t-\t");
2216 strcpy(ref_match_len, "-");
2217 }
2218 //fprintf(ftr[q],"%c\t%s\t%d\t%8.6f\t%8.6f\t%d\n", ovlcode, mdata->mrna->getID(),
2219 // iround(mdata->mrna->gscore/10), mdata->FPKM, mdata->cov, mdata->mrna->covlen);
2220 const char* mlocname = (mdata->locus!=NULL) ? mdata->locus->mrna_maxcov->getID() : mdata->mrna->getID();
2221 fprintf(ftr[q],"%c\t%s\t%s\t%d\t%8.6f\t%8.6f\t%8.6f\t%8.6f\t%d\t%s\t%s\n", mdata->classcode, getGeneID(mdata->mrna), mdata->mrna->getID(),
2222 iround(mdata->mrna->gscore/10), mdata->FPKM, mdata->conf_lo,mdata->conf_hi, mdata->cov, mdata->mrna->covlen, mlocname, ref_match_len);
2223 }
2224 } //for each tdata
2225 } //for each qdata
2226 }
2227
2228 void buildXLoci(GTrackLocus& loctrack, int qcount, GSeqTrack& gtrack, char strand,
2229 GList<GXLocus>* retxloci=NULL) {
2230 GList<GXLocus>* dest_xloci=NULL;
2231 GList<GXLocus> tmpxloci(true,false,true); //local set of newly created xloci
2232 GList<GXLocus>* xloci=&tmpxloci;
2233 if (strand=='+') {
2234 dest_xloci=& gtrack.xloci_f;
2235 }
2236 else if (strand=='-') {
2237 dest_xloci = & gtrack.xloci_r;
2238 }
2239 else dest_xloci= & gtrack.xloci_u;
2240
2241 if (retxloci==NULL) {
2242 //if no return set of build xloci was given
2243 //take it as a directive to work directly on the global xloci
2244 xloci=dest_xloci;
2245 dest_xloci=NULL;
2246 }
2247 for (int q=-1;q<qcount;q++) {
2248 GList<GLocus>* wrkloci=NULL;
2249 if (q<0) {
2250 if (loctrack.rloci.Count()==0) continue;
2251 //loci=new GList<GLocus>(true,false,false);
2252 //loci->Add(loctrack.rloc);
2253 wrkloci = &(loctrack.rloci);
2254 }
2255 else {
2256 if (loctrack[q]==NULL) continue;
2257 wrkloci = &(loctrack[q]->qloci);
2258 }
2259
2260 for (int t=0;t<wrkloci->Count();t++) {
2261 GLocus* loc=wrkloci->Get(t);
2262 int xfound=0; //count of parent xloci
2263 if (loc->xlocus!=NULL) continue; //already assigned a superlocus
2264 GArray<int> mrgxloci(true);
2265 for (int xl=0;xl<xloci->Count();xl++) {
2266 GXLocus& xloc=*(xloci->Get(xl));
2267 if (xloc.start>loc->end) {
2268 if (xloc.start-loc->end > GFF_MAX_LOCUS) break;
2269 continue;
2270 }
2271 if (loc->start>xloc.end) continue;
2272 if (xloc.add_Locus(loc)) {
2273 xfound++;
2274 mrgxloci.Add(xl);
2275 }
2276 } //for each existing Xlocus
2277 //if (xfound<0) continue;
2278 if (xfound==0) {
2279 //GXLocus *newxloc=new GXLocus(loc);
2280 xloci->Add(new GXLocus(loc));
2281 }
2282 else if (xfound>1) {
2283 for (int l=1;l<xfound;l++) {
2284 int mlidx=mrgxloci[l]-l+1;
2285 xloci->Get(mrgxloci[0])->addMerge(*(xloci->Get(mlidx)));
2286 GXLocus* ldel=xloci->Get(mlidx);
2287 xloci->Delete(mlidx);
2288 if (retxloci!=NULL)
2289 delete ldel;
2290 }
2291 }
2292 }//for each locus
2293 }//for each set of loci in the region (refs and each qry set)
2294 //-- add xloci to the global set of xloci unless retxloci was given,
2295 if (retxloci!=NULL) retxloci->Add(*xloci);
2296 else dest_xloci->Add(*xloci);
2297 }
2298
2299 void singleQData(GList<GLocus>& qloci, GList<GTrackLocus>& loctracks) {
2300 for (int i=0;i<qloci.Count();i++) {
2301 if (qloci[i]->t_ptr==NULL) {
2302 GTrackLocus* tloc=new GTrackLocus();
2303 tloc->addQLocus(qloci[i],0);
2304 loctracks.Add(tloc);
2305 }
2306 }
2307 }
2308
2309 void recheckUmrnas(GSeqData* gseqdata, GList<GffObj>& mrnas,
2310 GList<GLocus>& loci, GList<GLocus>& nloci, GList<GLocus>& oloci) {
2311 GList<GLocus> reassignedLocs(false,false);
2312 for (int u=0;u<gseqdata->umrnas.Count();u++) {
2313 for (int l=0;l<oloci.Count();l++) {
2314 if (gseqdata->umrnas[u]==NULL) break;
2315 if (gseqdata->umrnas[u]->end<oloci[l]->start) break; //try next umrna
2316 if (oloci[l]->end<gseqdata->umrnas[u]->start) continue; //try next locus
2317 if (gseqdata->umrnas[u]->strand=='+' || gseqdata->umrnas[u]->strand=='-') {
2318 gseqdata->umrnas.Forget(u);
2319 continue; //already reassigned earlier
2320 }
2321 //umrna overlaps locus region
2322 GffObj* umrna=gseqdata->umrnas[u];
2323 for (int m=0;m<oloci[l]->mrnas.Count();m++) {
2324 if (oloci[l]->mrnas[m]->exonOverlap(umrna)) {
2325 gseqdata->umrnas.Forget(u);
2326 CTData* umdata=((CTData*)umrna->uptr);
2327 //must be in a Loci anyway
2328 if (umdata==NULL || umdata->locus==NULL)
2329 GError("Error: no locus pointer for umrna %s!\n",umrna->getID());
2330 for (int i=0;i<umdata->locus->mrnas.Count();i++) {
2331 GffObj* um=umdata->locus->mrnas[i];
2332 um->strand=oloci[l]->mrnas[m]->strand;
2333 }
2334 reassignedLocs.Add(umdata->locus);
2335 break;
2336 }
2337 } //for each mrna in locus
2338 } //for each locus
2339 } //for each umrna
2340 if (reassignedLocs.Count()>0) {
2341 gseqdata->umrnas.Pack();
2342 gseqdata->nloci_u.setFreeItem(false);
2343 for (int i=0;i<reassignedLocs.Count();i++) {
2344 GLocus* loc=reassignedLocs[i];
2345 for (int m=0;m<loc->mrnas.Count();m++) {
2346 mrnas.Add(loc->mrnas[m]);
2347 }
2348 loci.Add(loc);
2349 nloci.Add(loc);
2350 gseqdata->nloci_u.Remove(loc);
2351 }
2352 gseqdata->nloci_u.setFreeItem(true);
2353 }
2354 }
2355
2356 void umrnasXStrand(GList<GXLocus>& xloci, GSeqTrack& gtrack) {
2357 //try to determine the strand of unoriented transfrags based on possible overlaps
2358 //with other, oriented transfrags
2359 for (int x=0;x<xloci.Count();x++) {
2360 if (xloci[x]->strand=='.') continue;
2361 if (xloci[x]->qloci.Count()==0) continue;
2362 //go through all qloci in this xlocus
2363 for (int l = 0; l < xloci[x]->qloci.Count(); l++) {
2364 char locstrand=xloci[x]->qloci[l]->mrna_maxcov->strand;
2365 if (locstrand=='.') {
2366 //this is a umrna cluster
2367 GLocus* qloc=xloci[x]->qloci[l];
2368 //we don't really need to update loci lists (loci_f, nloci_f etc.)
2369 /*
2370 if (xloci[x]->strand=='+') {
2371 }
2372 else { // - strand
2373 }
2374 */
2375 for (int i=0;i<qloc->mrnas.Count();i++) {
2376 qloc->mrnas[i]->strand=xloci[x]->strand;
2377 int uidx=gtrack.qdata[qloc->qfidx]->umrnas.IndexOf(qloc->mrnas[i]);
2378 if (uidx>=0) {
2379 gtrack.qdata[qloc->qfidx]->umrnas.Forget(uidx);
2380 gtrack.qdata[qloc->qfidx]->umrnas.Delete(uidx);
2381 if (xloci[x]->strand=='+')
2382 gtrack.qdata[qloc->qfidx]->mrnas_f.Add(qloc->mrnas[i]);
2383 else
2384 gtrack.qdata[qloc->qfidx]->mrnas_r.Add(qloc->mrnas[i]);
2385 }
2386 }
2387 } //unknown strand
2388 } //for each xloci[x].qloci (l)
2389
2390 } //for each xloci (x)
2391 }
2392
2393 //cluster loci across all datasets
2394 void xclusterLoci(int qcount, char strand, GSeqTrack& gtrack) {
2395 //gtrack holds data for all input qry datasets for a chromosome/contig
2396 //cluster QLoci
2397 GList<GTrackLocus> loctracks(true,true,false);
2398 //all vs all clustering across all qry data sets + ref
2399 //one-strand set of loci from all datasets + ref loci
2400 GList<GLocus>* wrkloci=NULL;
2401 //build xloci without references first
2402 //then add references only if they overlap an existing xloci
2403
2404 int nq=0;
2405 for (int q=0;q<=qcount+1;q++) {
2406 bool refcheck=false;
2407 if (q==qcount) { // check the unoriented loci for each query file
2408 while (nq<qcount &&
2409 (gtrack.qdata[nq]==NULL || gtrack.qdata[nq]->nloci_u.Count()==0))
2410 nq++; //skip query files with no unoriented loci
2411 if (nq<qcount) {
2412 wrkloci=&(gtrack.qdata[nq]->nloci_u);
2413 nq++;
2414 if (nq<qcount) q--; //so we can fetch the next nq in the next q cycle
2415 }
2416 else continue; //no more q files with unoriented loci
2417 }
2418 else if (q==qcount+1) { // check the reference loci
2419 if (strand=='+') wrkloci=gtrack.rloci_f;
2420 else wrkloci=gtrack.rloci_r;
2421
2422 if (wrkloci==NULL) break; //no ref loci here
2423 refcheck=true;
2424 }
2425 else {
2426 if (gtrack.qdata[q]==NULL) continue;
2427 if (strand=='+') wrkloci=&(gtrack.qdata[q]->loci_f);
2428 else wrkloci=&(gtrack.qdata[q]->loci_r);
2429 }
2430 // now do the all-vs-all clustering thing:
2431 for (int t=0;t<wrkloci->Count();t++) {
2432 GLocus* loc=wrkloci->Get(t);
2433 int xfound=0; //count of parent loctracks
2434 if (loc->t_ptr!=NULL) continue; //already assigned a loctrack
2435 GArray<int> mrgloctracks(true);
2436 for (int xl=0;xl<loctracks.Count();xl++) {
2437 GTrackLocus& trackloc=*loctracks[xl];
2438 if (trackloc.start>loc->end) break;
2439 if (loc->start>trackloc.end) continue;
2440 if (trackloc.add_Locus(loc)) {
2441 xfound++;
2442 mrgloctracks.Add(xl);
2443 }
2444 } //for each existing Xlocus
2445 if (xfound==0) {
2446 if (!refcheck) //we really don't care about ref-only clusters
2447 loctracks.Add(new GTrackLocus(loc));
2448 }
2449 else if (xfound>1) {
2450 for (int l=1;l<xfound;l++) {
2451 int mlidx=mrgloctracks[l]-l+1;
2452 loctracks.Get(mrgloctracks[0])->addMerge(loctracks[mlidx], qcount, loc);
2453 loctracks.Delete(mlidx);
2454 }
2455 }
2456 }//for each wrklocus
2457 } //for each set of loci (q)
2458 //loctracks is now set with all x-clusters on this strand
2459 for (int i=0;i<loctracks.Count();i++) {
2460 if (!loctracks[i]->hasQloci) continue; //we really don't care here about reference-only clusters
2461 GTrackLocus& loctrack=*loctracks[i];
2462 findTMatches(loctrack, qcount); //find matching transfrags in this xcluster
2463 for (int rl=0; rl < loctrack.rloci.Count(); rl++) {
2464 findTRMatch(loctrack, qcount, *(loctrack.rloci[rl]));
2465 //find matching reference annotation for this xcluster and assign class codes to transfrags
2466 }
2467 GList<GXLocus> xloci(false,false,false);
2468 buildXLoci(loctrack, qcount, gtrack, strand, &xloci);
2469 //the newly created xloci are in xloci
2470 umrnasXStrand(xloci, gtrack);
2471 //also merge these xloci into the global list of xloci
2472 for (int l=0; l < xloci.Count(); l++) {
2473 if (xloci[l]->strand=='+') {
2474 gtrack.xloci_f.Add(xloci[l]);
2475 }
2476 else if (xloci[l]->strand=='-') {
2477 gtrack.xloci_r.Add(xloci[l]);
2478 }
2479 else gtrack.xloci_u.Add(xloci[l]);
2480 }
2481 }//for each xcluster
2482 }
2483
2484
2485 void printRefMap(FILE** frs, int qcount, GList<GLocus>* rloci) {
2486 if (rloci==NULL) return;
2487
2488 for (int l=0;l<rloci->Count(); l++) {
2489 for (int r=0;r<rloci->Get(l)->mrnas.Count(); r++) {
2490 GffObj& ref = *(rloci->Get(l)->mrnas[r]);
2491 CTData* refdata = ((CTData*)ref.uptr);
2492 GStr* clist = new GStr[qcount];
2493 GStr* eqlist = new GStr[qcount];
2494 for (int i = 0; i<refdata->ovls.Count(); i++) {
2495 GffObj* m=refdata->ovls[i]->mrna;
2496 char ovlcode=refdata->ovls[i]->code;
2497 if (m==NULL) {
2498 GMessage("Warning: NULL mRNA found for ref %s with ovlcode '%c'\n",
2499 ref.getID(), refdata->ovls[i]->code);
2500 continue;
2501 }
2502 int qfidx = ((CTData*)m->uptr)->qset;
2503 if (ovlcode == '=') {
2504 eqlist[qfidx].append(getGeneID(m));
2505 eqlist[qfidx].append('|');
2506 eqlist[qfidx].append(m->getID());
2507 eqlist[qfidx].append(',');
2508 }
2509 else if (ovlcode == 'c') {
2510 clist[qfidx].append(getGeneID(m));
2511 clist[qfidx].append('|');
2512 clist[qfidx].append(m->getID());
2513 clist[qfidx].append(',');
2514 }
2515 }//for each reference overlap
2516 for (int q=0;q<qcount;q++) {
2517 if (!eqlist[q].is_empty()) {
2518 eqlist[q].trimR(',');
2519 fprintf(frs[q],"%s\t%s\t=\t%s\n", getGeneID(ref), ref.getID(),eqlist[q].chars());
2520 }
2521 if (!clist[q].is_empty()) {
2522 clist[q].trimR(',');
2523 fprintf(frs[q],"%s\t%s\tc\t%s\n",getGeneID(ref), ref.getID(),clist[q].chars());
2524 }
2525 }
2526 delete[] clist;
2527 delete[] eqlist;
2528 }// ref loop
2529 }//ref locus loop
2530 }
2531
2532
2533
2534 void trackGData(int qcount, GList<GSeqTrack>& gtracks, GStr& fbasename, FILE** ftr, FILE** frs) {
2535 FILE* f_ltrack=NULL;
2536 FILE* f_itrack=NULL;
2537 FILE* f_ctrack=NULL;
2538 FILE* f_xloci=NULL;
2539 int cnum=0; //consensus numbering for printITrack()
2540 GStr s=fbasename;
2541 //if (qcount>1 || generic_GFF) { //doesn't make much sense for only 1 query file
2542 s.append(".tracking");
2543 f_itrack=fopen(s.chars(),"w");
2544 if (f_itrack==NULL) GError("Error creating file %s !\n",s.chars());
2545 // }
2546 s=fbasename;
2547 s.append(".combined.gtf");
2548 f_ctrack=fopen(s.chars(),"w");
2549 if (f_ctrack==NULL) GError("Error creating file %s !\n",s.chars());
2550
2551 s=fbasename;
2552 s.append(".loci");
2553 f_xloci=fopen(s.chars(),"w");
2554 if (f_xloci==NULL) GError("Error creating file %s !\n",s.chars());
2555 for (int g=0;g<gtracks.Count();g++) { //for each genomic sequence
2556 GSeqTrack& gseqtrack=*gtracks[g];
2557
2558 xclusterLoci(qcount, '+', gseqtrack);
2559 xclusterLoci(qcount, '-', gseqtrack);
2560
2561 //count XLoci, setting their id
2562 numXLoci(gseqtrack.xloci_f, xlocnum);
2563 numXLoci(gseqtrack.xloci_r, xlocnum);
2564 numXLoci(gseqtrack.xloci_u, xlocnum);
2565 //transcript accounting: for all those transcripts with 'u' or 0 class code
2566 // we have to check for polymerase runs 'p' or repeats 'r'
2567
2568 GFaSeqGet *faseq=gfasta.fetch(gseqtrack.get_gseqid(), checkFasta);
2569
2570 umrnaReclass(qcount, gseqtrack, ftr, faseq);
2571
2572 // print transcript tracking (ichain_tracking)
2573 //if (qcount>1)
2574 for (int q=0;q<qcount;q++) {
2575 if (gseqtrack.qdata[q]==NULL) continue;
2576 printITrack(f_itrack, gseqtrack.qdata[q]->mrnas_f, qcount, cnum);
2577 printITrack(f_itrack, gseqtrack.qdata[q]->mrnas_r, qcount, cnum);
2578 //just for the sake of completion:
2579 printITrack(f_itrack, gseqtrack.qdata[q]->umrnas, qcount, cnum);
2580 }
2581 //print XLoci and XConsensi within each xlocus
2582 //also TSS clustering and protein ID assignment for XConsensi
2583 printXLoci(f_xloci, f_ctrack, qcount, gseqtrack.xloci_f, faseq);
2584 printXLoci(f_xloci, f_ctrack, qcount, gseqtrack.xloci_r, faseq);
2585 printXLoci(f_xloci, f_ctrack, qcount, gseqtrack.xloci_u, faseq);
2586 if (tmapFiles && haveRefs) {
2587 printRefMap(frs, qcount, gseqtrack.rloci_f);
2588 printRefMap(frs, qcount, gseqtrack.rloci_r);
2589 }
2590 delete faseq;
2591 }
2592 if (tmapFiles) {
2593 for (int q=0;q<qcount;q++) {
2594 fclose(ftr[q]);
2595 if (haveRefs) fclose(frs[q]);
2596 }
2597 }
2598 if (f_ltrack!=NULL) fclose(f_ltrack);
2599 if (f_itrack!=NULL) fclose(f_itrack);
2600 if (f_ctrack!=NULL) fclose(f_ctrack);
2601 if (f_xloci!=NULL) fclose(f_xloci);
2602
2603 }

Properties

Name Value
svn:executable *