ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/cuffcompare/cuffcompare.cpp
Revision: 146
Committed: Thu Dec 22 20:41:43 2011 UTC (8 years, 3 months ago) by gpertea
File size: 99614 byte(s)
Log Message:
removed useless operator>

Line File contents
1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #else
4 #define PACKAGE_VERSION "INTERNAL"
5 #define SVN_REVISION "SVN"
6 #endif
7
8 #include "GArgs.h"
9 #include <ctype.h>
10 #include <errno.h>
11 #include "gtf_tracking.h"
12
13 #ifdef HAVE_CONFIG_H
14 #include "update_check.h"
15 #endif
16
17 #define USAGE "Usage:\n\
18 cuffcompare [-r <reference_mrna.gtf>] [-R] [-T] [-V] [-s <seq_path>] \n\
19 [-o <outprefix>] [-p <cprefix>] \n\
20 {-i <input_gtf_list> | <input1.gtf> [<input2.gtf> .. <inputN.gtf>]}\n\
21 \n\
22 Cuffcompare provides classification, reference annotation mapping and various\n\
23 statistics for Cufflinks transfrags.\n\
24 Cuffcompare clusters and tracks transfrags across multiple samples, writing\n\
25 matching transcripts (intron chains) into <outprefix>.tracking, and a GTF\n\
26 file <outprefix>.combined.gtf containing a nonredundant set of transcripts \n\
27 across all input files (with a single representative transfrag chosen\n\
28 for each clique of matching transfrags across samples).\n\
29 \n\
30 Options:\n\
31 -i provide a text file with a list of Cufflinks GTF files to process instead\n\
32 of expecting them as command line arguments (useful when a large number\n\
33 of GTF files should be processed)\n\
34 \n\
35 -r a set of known mRNAs to use as a reference for assessing \n\
36 the accuracy of mRNAs or gene models given in <input.gtf>\n\
37 \n\
38 -R for -r option, reduce the set of reference transcripts to \n\
39 only those found to overlap any of the input loci\n\
40 -M discard (ignore) single-exon transfrags and reference transcripts\n\
41 -N discard (ignore) single-exon reference transcripts\n\
42 \n\
43 -s <seq_path> can be a multi-fasta file with all the genomic sequences or \n\
44 a directory containing multiple single-fasta files (one file per contig);\n\
45 lower case bases will be used to classify input transcripts as repeats\n\
46 \n\
47 -d max distance (range) for grouping transcript start sites (100)\n\
48 -p the name prefix to use for consensus transcripts in the \n\
49 <outprefix>.combined.gtf file (default: 'TCONS')\n\
50 -C include the \"contained\" transcripts in the .combined.gtf file\n\
51 -G generic GFF input file(s) (do not assume Cufflinks GTF)\n\
52 -T do not generate .tmap and .refmap files for each input file\n\
53 -V verbose processing mode (showing all GFF parsing warnings)\n\
54 "
55 bool debug=false;
56 bool perContigStats=false; // -S to enable stats for every single contig
57 bool generic_GFF=false; //-G, don't assume Cufflinks GTF as input
58 bool showContained=false; // -C
59 bool reduceRefs=false;
60 bool checkFasta=false;
61 bool tmapFiles=true;
62 bool only_spliced_refs=false;
63 int debugCounter=0;
64
65 int polyrun_range=2000; //polymerase run range 2KB
66 double scoreThreshold=0;
67 double exprThreshold=0;
68 char* cprefix=NULL;
69 FILE* ffasta=NULL; //genomic seq file
70 FILE *f_ref=NULL; //reference mRNA GFF, if provided
71 FILE* f_in=NULL; //sequentially, each input GFF file
72 FILE* f_out=NULL; //stdout if not provided
73 GFastaHandler gfasta;
74 int xlocnum=0;
75 int tsscl_num=0; //for tss cluster IDs
76 int protcl_num=0; //for "unique" protein IDs within TSS clusters
77 int tssDist=100;
78 //int total_tcons=0;
79 int total_xloci_alt=0;
80
81 void openfwrite(FILE* &f, GArgs& args, char opt) {
82 GStr s=args.getOpt(opt);
83 if (!s.is_empty()) {
84 if (s=='-')
85 f=stdout;
86 else {
87 f=fopen(s,"w");
88 if (f==NULL) GError("Error creating file: %s\n", s.chars());
89 }
90 }
91 }
92
93 //-- structure to keep track of data from multiple qry input files for a single genomic seq
94 class GSeqTrack {
95 int gseq_id;
96 public:
97 const char* gseq_name;
98 GList<GLocus>* rloci_f; //reference loci for this genomic sequence
99 GList<GLocus>* rloci_r;
100 GList<GXLocus> xloci_f; // extended super-loci across all qry datasets
101 GList<GXLocus> xloci_r; // extended super-loci across all qry datasets
102 GList<GXLocus> xloci_u; // extended super-loci across all qry datasets
103 GSeqData* qdata[MAX_QFILES]; //fixed order array with GSeqData for each qry input
104 //element in array is NULL if a qry file has no transcripts on this genomic sequence
105 int get_gseqid() { return gseq_id; }
106 GSeqTrack(int gid=-1):xloci_f(true,true,false),
107 xloci_r(true,true,false), xloci_u(true,true,false) {
108 gseq_id=gid;
109 if (gseq_id>=0) {
110 gseq_name=GffObj::names->gseqs.getName(gseq_id);
111 }
112 rloci_f=NULL;
113 rloci_r=NULL;
114 for (int i=0;i<MAX_QFILES;i++) qdata[i]=NULL;
115 }
116 bool operator==(GSeqTrack& d){
117 return (gseq_id==d.gseq_id);
118 }
119 bool operator<(GSeqTrack& d){
120 return (gseq_id<d.gseq_id);
121 }
122 };
123
124 char* getFastaFile(int gseq_id);
125
126 // ref globals
127 bool haveRefs=false; //true if a reference annotation (-r) is provide
128
129 GList<GSeqData> ref_data(true,true,true); //list of reference mRNAs and loci data for each genomic seq
130 //each locus will keep track of any superloci which includes it, formed during the analysis
131
132 void processLoci(GSeqData& seqdata, GSeqData* refdata=NULL, int qfidx=0);
133
134 void reportStats(FILE* fout, const char* setname, GSuperLocus& stotal,
135 GSeqData* seqdata=NULL, GSeqData* refdata=NULL);
136
137 GSeqData* getQryData(int gid, GList<GSeqData>& qdata);
138 void trackGData(int qcount, GList<GSeqTrack>& gtracks, GStr& fbasename, FILE** ftr, FILE** frs);
139
140 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
141 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
142
143 FILE* f_mintr=NULL; //missed ref introns
144
145 bool multiexon_only=false;
146 bool multiexonrefs_only=false;
147
148 GHash<GStr> refdescr;
149 void loadRefDescr(const char* fname);
150
151 GList<GStr> qryfiles(false,true,false);
152
153 //list of GSeqTrack data, sorted by gseq_id
154 GList<GSeqTrack> gseqtracks(true,true,true);
155 GSeqTrack* findGSeqTrack(int gsid);
156
157
158 int cmpGTrackByName(const pointer p1, const pointer p2) {
159 return strcmp(((GSeqTrack*)p1)->gseq_name, ((GSeqTrack*)p2)->gseq_name);
160 }
161
162
163 void show_usage() {
164 GMessage("cuffcompare v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
165 GMessage( "-----------------------------\n");
166 GMessage("%s\n", USAGE);
167 }
168
169 int main(int argc, char * const argv[]) {
170 GArgs args(argc, argv, "XDTMNVGSCKRLhp:c:d:s:i:n:r:o:");
171 int e;
172 if ((e=args.isError())>0) {
173 show_usage();
174 GMessage("Invalid argument: %s\n", argv[e]);
175 exit(1);
176 }
177 if (args.getOpt('h')!=NULL){
178 show_usage();
179 exit(1);
180 }
181 showContained=(args.getOpt('C')!=NULL);
182 debug=(args.getOpt('D')!=NULL);
183 tmapFiles=(args.getOpt('T')==NULL);
184 multiexon_only=(args.getOpt('M')!=NULL);
185 multiexonrefs_only=(args.getOpt('N')!=NULL);
186 perContigStats=(args.getOpt('S')!=NULL);
187 checkFasta=(args.getOpt('K')!=NULL);
188 gtf_tracking_verbose=((args.getOpt('V')!=NULL) || debug);
189 FILE* finlst=NULL;
190 GStr s=args.getOpt('i');
191 if (!s.is_empty()) {
192 if (s=='-')
193 finlst=stdin;
194 else {
195 finlst=fopen(s,"r");
196 if (finlst==NULL) GError("Error opening file: %s\n", s.chars());
197 }
198 }
199 int numqryfiles=0;
200 if (finlst) {
201 GLineReader* lr=new GLineReader(finlst);
202 char* l=NULL;
203 while ((l=lr->getLine())!=NULL) {
204 if (strlen(l)<2 || startsWith(l,"# ") || isspace(*l)) continue;
205 if (!fileExists(l)) GError("Error: cannot locate input file: %s\n", l);
206 qryfiles.Add(new GStr(l));
207 }
208 delete lr;
209 //if (qryfiles.Count()>10)
210 gtf_tracking_largeScale=true;
211 }
212 else {
213 numqryfiles=args.startNonOpt();
214 char *infile=NULL;
215 if (numqryfiles>0) {
216 while ((infile=args.nextNonOpt())!=NULL) {
217 if (!fileExists(infile)) GError("Error: cannot locate input file: %s\n", infile);
218 qryfiles.Add(new GStr(infile));
219 } //for each argument
220 }
221 }
222 numqryfiles=qryfiles.Count();
223 if (numqryfiles==0) {
224 show_usage();
225 exit(1);
226 }
227 if (numqryfiles>MAX_QFILES) {
228 GMessage("Error: too many input files (limit set to %d at compile time)\n",MAX_QFILES);
229 GMessage("(if you need to raise this limit set a new value for\nMAX_QFILES in gtf_tracking.h and recompile)\n");
230 exit(0x5000);
231 }
232 #ifdef HAVE_CONFIG_H
233 check_version(PACKAGE_VERSION);
234 #endif
235 gfasta.init(args.getOpt('s'));
236 // determine if -s points to a multi-fasta file or a directory
237 s=args.getOpt('c');
238 if (!s.is_empty()) scoreThreshold=s.asReal();
239 s=args.getOpt('p');
240 if (!s.is_empty()) cprefix=Gstrdup(s.chars());
241 else cprefix=Gstrdup("TCONS");
242 s=args.getOpt('e');
243 if (!s.is_empty()) exprThreshold=s.asReal();
244 s=args.getOpt('d');
245 if (!s.is_empty()) {
246 tssDist=s.asInt();
247 }
248
249 s=args.getOpt('n');
250 if (!s.is_empty()) loadRefDescr(s.chars());
251 s=args.getOpt('r');
252 if (!s.is_empty()) {
253 f_ref=fopen(s,"r");
254 if (f_ref==NULL) GError("Error opening reference gff: %s\n",s.chars());
255 haveRefs=true;
256 if (gtf_tracking_verbose) GMessage("Loading reference transcripts..\n");
257 read_mRNAs(f_ref, ref_data, &ref_data, true, -1, s.chars(), (multiexonrefs_only || multiexon_only));
258 haveRefs=(ref_data.Count()>0);
259 reduceRefs=(args.getOpt('R')!=NULL);
260 if (gtf_tracking_verbose) GMessage("..reference annotation loaded\n");
261 }
262 bool discard_redundant=true; //discard redundant input transfrags
263 generic_GFF=args.getOpt('G');
264 if (generic_GFF) discard_redundant=false; //generic GTF, don't try to discard "redundant" transcripts
265 //if a full pathname is given
266 //the other common output files will still be created in the current directory:
267 // .loci, .tracking, .stats
268 GStr outbasename; //include path, if provided
269 GStr outprefix; //without path and/or extension
270 GStr outstats=args.getOpt('o');
271 if (outstats.is_empty() || outstats=="-") {
272 outstats="cuffcmp";
273 }
274 outbasename=outstats;
275 GStr outext(getFileExt(outstats.chars()));
276 if (outext.is_empty()) {
277 outext="stats";
278 outstats.append(".stats");
279 outbasename=outstats;
280 }
281 else outext.lower();
282 if (outext=="txt" || outext=="out" || outext=="stats" || outext=="summary") {
283 outbasename.cut(outbasename.length()-outext.length()-1);
284 }
285
286 outprefix=outbasename;
287 int di=outprefix.rindex(CHPATHSEP);
288 if (di>=0) outprefix.cut(0,di+1);
289
290 if (debug) { //create a few more files potentially useful for debugging
291 s=outbasename;
292 s.append(".missed_introns.gtf");
293 f_mintr=fopen(s.chars(),"w");
294 if (f_mintr==NULL) GError("Error creating file %s!\n",s.chars());
295 /*
296 s=outbasename;
297 s.append(".noTP_introns.gtf");
298 f_nintr=fopen(s.chars(),"w");
299 s=outbasename;
300 s.append(".wrong_Qintrons.gtf");
301 f_qintr=fopen(s.chars(),"w");
302 */
303 }
304
305 f_out=fopen(outstats, "w");
306 if (f_out==NULL) GError("Error creating output file %s!\n", outstats.chars());
307 if (gtf_tracking_verbose) GMessage("Prefix for output files: %s\n", outprefix.chars());
308 fprintf(f_out, "# Cuffcompare v%s | Command line was:\n#", PACKAGE_VERSION);
309 for (int i=0;i<argc-1;i++)
310 fprintf(f_out, "%s ", argv[i]);
311 fprintf(f_out, "%s\n#\n", argv[argc-1]);
312 //int qfileno=0;
313 GList<GSeqData>** qrysdata=NULL;
314 FILE** tfiles=NULL;
315 FILE** rtfiles=NULL;
316 GMALLOC(qrysdata, numqryfiles*sizeof(GList<GSeqData>*));
317 if (tmapFiles) {
318 GMALLOC(tfiles, numqryfiles*sizeof(FILE*));
319 if (haveRefs) {
320 GMALLOC(rtfiles, numqryfiles*sizeof(FILE*));
321 }
322 }
323 for (int fi=0;fi<qryfiles.Count();fi++) {
324 GStr in_file(qryfiles[fi]->chars());
325 GStr infname(getFileName(qryfiles[fi]->chars())); //file name only
326 GStr indir(qryfiles[fi]->chars());
327 di=indir.rindex(CHPATHSEP);
328 if (di>=0) indir.cut(di+1); //directory path for this input file
329 else indir=""; //current directory
330
331 if (debug || (gtf_tracking_verbose && !gtf_tracking_largeScale))
332 GMessage("Processing qfile #%d: %s\n",fi+1, in_file.chars());
333 if (in_file=="-") { f_in=stdin; in_file="stdin"; }
334 else {
335 f_in=fopen(in_file.chars(),"r");
336 if (f_in==NULL)
337 GError("Cannot open input file %s!\n",in_file.chars());
338 }
339 //f_in is the query gff file to process
340
341 GStr sbase(indir);
342 sbase.append(outprefix);
343 sbase.append(".");
344 sbase.append(infname);
345 if (tmapFiles) {
346 //-- we should keep the infname path, otherwise the remaining file names
347 // may be the same and clobber each other
348 s=sbase;
349 s.append(".tmap");
350 tfiles[fi]=fopen(s.chars(),"w");
351 if (tfiles[fi]==NULL)
352 GError("Error creating file '%s'!\n",s.chars());
353 fprintf(tfiles[fi],"ref_gene_id\tref_id\tclass_code\tcuff_gene_id\tcuff_id\tFMI\tFPKM\tFPKM_conf_lo\tFPKM_conf_hi\tcov\tlen\tmajor_iso_id\tref_match_len\n");
354 if (haveRefs) {
355 s=sbase;
356 s.append(".refmap");
357 rtfiles[fi]=fopen(s.chars(),"w");
358 if (rtfiles[fi]==NULL)
359 GError("Error creating file '%s'!\n",s.chars());
360 fprintf(rtfiles[fi],"ref_gene_id\tref_id\tclass_code\tcuff_id_list\n");
361 }
362 }
363
364 GList<GSeqData>* pdata=new GList<GSeqData>(true,true,true);
365 qrysdata[fi]=pdata;
366 if (gtf_tracking_verbose) GMessage("Loading transcripts from %s..\n",in_file.chars());
367 read_mRNAs(f_in, *pdata, &ref_data, discard_redundant, fi, in_file.chars(), multiexon_only);
368 GSuperLocus gstats;
369 GFaSeqGet *faseq=NULL;
370 for (int g=0;g<pdata->Count();g++) { //for each seqdata related to a genomic sequence
371 int gsid=pdata->Get(g)->get_gseqid();
372 GSeqData* refdata=getRefData(gsid, ref_data);//ref data for this contig
373 if (!gtf_tracking_largeScale)
374 processLoci(*(pdata->Get(g)), refdata, fi);
375 GSeqTrack* seqtrack=findGSeqTrack(gsid); //this will add a gseqtrack if it doesn't exist
376 // for gsid
377 if (refdata!=NULL) {
378 seqtrack->rloci_f=&(refdata->loci_f);
379 seqtrack->rloci_r=&(refdata->loci_r);
380 }
381 seqtrack->qdata[fi]=pdata->Get(g);
382 //will only gather data into stats if perContig==false
383 if (!gtf_tracking_largeScale) reportStats(f_out, getGSeqName(gsid), gstats,
384 pdata->Get(g), refdata);
385 if (faseq!=NULL) delete faseq;
386 } //for each genomic sequence data
387 //there could be genomic sequences with no qry transcripts
388 //but with reference transcripts
389 if (haveRefs && !reduceRefs && !gtf_tracking_largeScale) {
390 for (int r=0;r<ref_data.Count();r++) {
391 GSeqData* refdata=ref_data[r];
392 int gsid=refdata->get_gseqid();
393 if (getQryData(gsid, *pdata)==NULL) {
394 reportStats(f_out, getGSeqName(gsid), gstats, NULL, refdata);
395 }//completely missed all refdata on this contig
396 }
397 }
398 //now report the summary:
399 if (!gtf_tracking_largeScale) reportStats(f_out, in_file.chars(), gstats);
400 if (f_in!=stdin) fclose(f_in);
401 //qfileno++;
402 }//for each input file
403 if (f_mintr!=NULL) fclose(f_mintr);
404 gseqtracks.setSorted(&cmpGTrackByName);
405 if (gtf_tracking_verbose) GMessage("Tracking transcripts across %d query files..\n", numqryfiles);
406 trackGData(numqryfiles, gseqtracks, outbasename, tfiles, rtfiles);
407 fprintf(f_out, "\n Total union super-loci across all input datasets: %d \n", xlocnum);
408 if (numqryfiles>1) {
409 fprintf(f_out, " (%d multi-transcript, ~%.1f transcripts per locus)\n",
410 total_xloci_alt, ((double)(GXConsensus::count))/xlocnum);
411 }
412 if (gtf_tracking_verbose) GMessage("Cleaning up..\n");
413 GFREE(cprefix);
414 // clean up
415 for (int i=0;i<numqryfiles;i++) {
416 delete qrysdata[i];
417 }
418 GFREE(qrysdata);
419 GFREE(tfiles);
420 GFREE(rtfiles);
421 gseqtracks.Clear();
422 FRCLOSE(f_ref);
423 FWCLOSE(f_out);
424 if (gtf_tracking_verbose) GMessage("Done.\n");
425 ref_data.Clear();
426 //getchar();
427 } //main ends here
428
429 void show_exons(FILE* f, GffObj& m) {
430 fprintf(f,"(");
431 int imax=m.exons.Count()-1;
432 for (int i=0;i<=imax;i++) {
433 if (i==imax) fprintf(f,"%d-%d)",m.exons[i]->start, m.exons[i]->end);
434 else fprintf(f,"%d-%d,",m.exons[i]->start, m.exons[i]->end);
435 }
436 }
437
438 bool ichainMatch(GffObj* t, GffObj* r, bool& exonMatch, int fuzz=0) {
439 //t's intron chain is considered matching to reference r
440 //if r chain is the same or a subset of t's chain
441 exonMatch=false;
442 int imax=r->exons.Count()-1;
443 int jmax=t->exons.Count()-1;
444 if (imax==0 || jmax==0) { //single-exon mRNAs
445 if (imax!=jmax) return false;
446 exonMatch=r->exons[0]->coordMatch(t->exons[0],fuzz);
447 /*if (exonMatch) return true;
448 else return (r->exons[0]->start>=t->exons[0]->start &&
449 r->exons[0]->end<=t->exons[0]->end);*/
450 return exonMatch;
451 }
452
453 if (r->exons[imax]->start<t->exons[0]->end ||
454 t->exons[jmax]->start<r->exons[0]->end ) //intron chains do not overlap at all
455 {
456 return false;
457 }
458 //check intron overlaps
459 int i=1;
460 int j=1;
461 bool exmism=false; //any mismatch
462 while (i<=imax && j<=jmax) {
463 uint rstart=r->exons[i-1]->end;
464 uint rend=r->exons[i]->start;
465 uint tstart=t->exons[j-1]->end;
466 uint tend=t->exons[j]->start;
467 if (tend<rstart) { j++; continue; }
468 if (rend<tstart) { i++; continue; }
469 break; //here we have an intron overlap
470 }
471 if (i>1 || i>imax || j>jmax) {
472 return false; //no intron overlaps found at all
473 //or first intron of ref not overlapping
474 }
475 //from now on we expect intron matches up to imax
476 if (i!=j || imax!=jmax) { exmism=true; if (fuzz==0) return false; }
477 for (;i<=imax && j<=jmax;i++,j++) {
478 if (abs((int)(r->exons[i-1]->end-t->exons[j-1]->end))>fuzz ||
479 abs((int)(r->exons[i]->start-t->exons[j]->start))>fuzz) {
480 return false; //just run away
481 }
482 }
483 //if we made it here, we have matching intron chains up to MIN(imax,jmax)
484 if (imax!=jmax) {
485 exmism=true;
486 if (jmax<imax) return false; // qry ichain included in ref ichain
487 else //ref ichain included in qry ichain
488 if (fuzz==0) return false;
489 }
490 if (exmism) {
491 //exonMatch=false; -- it's default
492 return true;
493 }
494 exonMatch = ( abs((int)(r->exons[0]->start-t->exons[0]->start))<=fuzz &&
495 abs((int)(r->exons[imax]->end-t->exons[jmax]->end))<=fuzz );
496 return true;
497 }
498
499
500 void compareLoci2R(GList<GLocus>& loci, GList<GSuperLocus>& cmpdata,
501 GList<GLocus>& refloci, int qfidx) {
502 cmpdata.Clear();//a new list of superloci will be built
503 if (refloci.Count()==0 || loci.Count()==0) return;
504 //reset cmpovl and stats
505 for (int i=0;i<refloci.Count();i++) refloci[i]->creset();
506 //find loci with overlapping refloci
507 //and store cmpovl links both ways for ALL loci and refloci on this strand
508 for (int l=0;l<loci.Count();l++) {
509 GLocus* locus=loci[l];
510 locus->creset();
511 for (int j=0;j<refloci.Count();j++) {
512 //if (refloci[j]->start>locus->end) break;
513 if (refloci[j]->start>locus->end) {
514 if (refloci[j]->start-locus->end > GFF_MAX_LOCUS) break;
515 continue;
516 }
517 if (locus->start>refloci[j]->end) continue;
518 // then we must have overlap here:
519 //if (locus->overlap(refloci[j]->start, refloci[j]->end)) {
520 locus->cmpovl.Add(refloci[j]);
521 refloci[j]->cmpovl.Add(locus);
522 //}
523 }//for each reflocus
524 } //for each locus
525
526 //create corresponding "superloci" from transitive overlapping between loci and ref
527 for (int l=0;l<loci.Count();l++) {
528 if (loci[l]->v!=0) continue; //skip, already processed
529 GSuperLocus* super=new GSuperLocus();
530 super->qfidx=qfidx;
531 //try to find all other loci connected to this locus loci[l]
532 GList<GLocus> lstack(false,false,false); //traversal stack
533 lstack.Push(loci[l]);
534 while (lstack.Count()>0) {
535 GLocus* locus=lstack.Pop();
536 if (locus->v!=0 || locus->cmpovl.Count()==0) continue;
537 super->addQlocus(*locus);
538 locus->v=1;
539 for (int r=0;r<locus->cmpovl.Count();r++) {
540 GLocus* rloc=locus->cmpovl[r];
541 if (rloc->v==0) {
542 super->addRlocus(*rloc);
543 rloc->v=1;
544 for (int ll=0;ll<rloc->cmpovl.Count();ll++) {
545 if (rloc->cmpovl[ll]->v==0) lstack.Push(rloc->cmpovl[ll]);
546 }
547 }
548 } //for each overlapping reflocus
549 } //while linking
550
551 if (super->qloci.Count()==0) {
552 delete super;
553 continue; //try next query loci
554 }
555 //--here we have a "superlocus" region data on both qry and ref
556 // -- analyze mexons matching (base level metrics)
557 cmpdata.Add(super);
558 //make each ref locus keep track of all superloci containing it
559 for (int rl=0;rl<super->rloci.Count();rl++) {
560 super->rloci[rl]->superlst->Add(super);
561 }
562 for (int x=0;x<super->rmexons.Count();x++) {
563 super->rbases_all += super->rmexons[x].end-super->rmexons[x].start+1;
564 }
565 for (int x=0;x<super->qmexons.Count();x++) {
566 super->qbases_all += super->qmexons[x].end-super->qmexons[x].start+1;
567 }
568 int i=0; //locus mexons
569 int j=0; //refmexons
570 while (i<super->qmexons.Count() && j<super->rmexons.Count()) {
571 uint istart=super->qmexons[i].start;
572 uint iend=super->qmexons[i].end;
573 uint jstart=super->rmexons[j].start;
574 uint jend=super->rmexons[j].end;
575 if (iend<jstart) { i++; continue; }
576 if (jend<istart) { j++; continue; }
577 //v--overlap here:
578 uint ovlstart = jstart>istart? jstart : istart;
579 uint ovlend = iend<jend ? iend : jend;
580 uint ovlen=ovlend-ovlstart+1;
581 super->baseTP+=ovlen; //qbases_cov
582 if (iend<jend) i++;
583 else j++;
584 } //while mexons ovl search
585 /* if (reduceRefs) {
586 super->baseFP=super->qbases_all-super->baseTP;
587 super->baseFN=super->rbases_all-super->baseTP;
588 }
589 */
590 // -- exon level comparison:
591 int* qexovl; //flags for qry exons with ref overlap
592 GCALLOC(qexovl,super->quexons.Count()*sizeof(int));
593 int* rexovl; //flags for ref exons with qry overlap
594 GCALLOC(rexovl,super->ruexons.Count()*sizeof(int));
595 for (int i=0;i<super->quexons.Count();i++) {
596 uint istart=super->quexons[i].start;
597 uint iend=super->quexons[i].end;
598 for (int j=0;j<super->ruexons.Count();j++) {
599 uint jstart=super->ruexons[j].start;
600 uint jend=super->ruexons[j].end;
601 if (iend<jstart) break;
602 if (jend<istart) continue;
603 //--- overlap here between quexons[i] and ruexons[j]
604 qexovl[i]++;
605 rexovl[j]++;
606 if (super->quexons[i].coordMatch(&super->ruexons[j],5)) {
607 super->exonATP++;
608 if (super->quexons[i].coordMatch(&super->ruexons[j])) {
609 super->exonTP++;
610 } //exact match
611 } //fuzzy match
612 } //ref uexon loop
613 } //qry uexon loop
614 super->m_exons=0; //ref exons with no query overlap
615 super->w_exons=0; //qry exons with no ref overlap
616 for (int x=0;x<super->quexons.Count();x++)
617 if (qexovl[x]==0) super->w_exons++;
618 for (int x=0;x<super->ruexons.Count();x++)
619 if (rexovl[x]==0) super->m_exons++;
620 GFREE(rexovl);
621 GFREE(qexovl);
622
623 //-- intron level stats:
624 //query:
625 int* qinovl=NULL; //flags for qry introns with at least some ref overlap
626 int* qtpinovl=NULL; //flags for qry introns with perfect ref overlap
627 if (super->qintrons.Count()>0) {
628 GCALLOC(qinovl,super->qintrons.Count()*sizeof(int));
629 GCALLOC(qtpinovl,super->qintrons.Count()*sizeof(int));
630 }
631 //-- reference:
632 int* rinovl=NULL; //flags for ref introns with qry overlap
633 int* rtpinovl=NULL; //ref introns with perfect qry intron overlap
634 if (super->rintrons.Count()>0) {
635 GCALLOC(rinovl,super->rintrons.Count()*sizeof(int));
636 GCALLOC(rtpinovl,super->rintrons.Count()*sizeof(int));
637 }
638 for (int i=0;i<super->qintrons.Count();i++) {
639 uint istart=super->qintrons[i].start;
640 uint iend=super->qintrons[i].end;
641 for (int j=0;j<super->rintrons.Count();j++) {
642 uint jstart=super->rintrons[j].start;
643 uint jend=super->rintrons[j].end;
644 if (iend<jstart) break;
645 if (jend<istart) continue;
646 //--- overlap here between qintrons[i] and rintrons[j]
647 qinovl[i]++;
648 rinovl[j]++;
649 if (super->qintrons[i].coordMatch(&super->rintrons[j],5)) {
650 super->intronATP++;
651 if (super->qintrons[i].coordMatch(&super->rintrons[j])) {
652 super->intronTP++;
653 qtpinovl[i]++;
654 rtpinovl[j]++;
655 } //exact match
656 } //fuzzy match
657 } //ref intron loop
658 } //qry intron loop
659 super->m_introns=0; //ref introns with no query overlap
660 super->w_introns=0; //qry introns with no ref overlap
661 for (int x=0;x<super->qintrons.Count();x++) {
662 if (qinovl[x]==0) { super->w_introns++;
663 //qry introns with no ref intron overlap AT ALL
664 super->i_qwrong.Add(super->qintrons[x]);
665 }
666 else
667 if (qtpinovl[x]==0) {
668 super->i_qnotp.Add(super->qintrons[x]);
669 }
670 }
671 for (int x=0;x<super->rintrons.Count();x++) {
672 if (rinovl[x]==0) { //no intron overlap at all
673 super->m_introns++;
674 super->i_missed.Add(super->rintrons[x]);
675 }
676 else if (rtpinovl[x]==0) { //no perfect intron match
677 super->i_notp.Add(super->rintrons[x]);
678 }
679 }
680 GFREE(rinovl);
681 GFREE(rtpinovl);
682 GFREE(qinovl);
683 GFREE(qtpinovl);
684
685 // ---- now intron-chain and transcript comparison
686 for (int i=0;i<super->qmrnas.Count();i++) {
687 uint istart=super->qmrnas[i]->exons.First()->start;
688 uint iend=super->qmrnas[i]->exons.Last()->end;
689 for (int j=0;j<super->rmrnas.Count();j++) {
690 uint jstart=super->rmrnas[j]->exons.First()->start;
691 uint jend=super->rmrnas[j]->exons.Last()->end;
692 if (iend<jstart) break;
693 if (jend<istart) continue;
694 //--- overlap here --
695 bool exonMatch=false;
696 if ((super->qmrnas[i]->udata & 3) > 1) continue; //already counted a ichainTP for this qry
697 if (ichainMatch(super->qmrnas[i],super->rmrnas[j],exonMatch, 5)) { //fuzzy match
698 GLocus* qlocus=((CTData*)super->qmrnas[i]->uptr)->locus;
699 GLocus* rlocus=((CTData*)super->rmrnas[j]->uptr)->locus;
700 if (super->qmrnas[i]->exons.Count()>1) {
701 super->ichainATP++;
702 qlocus->ichainATP++;
703 rlocus->ichainATP++;
704 }
705 if (exonMatch) {
706 super->mrnaATP++;
707 qlocus->mrnaATP++;
708 rlocus->mrnaATP++;
709 }
710 if (ichainMatch(super->qmrnas[i],super->rmrnas[j],exonMatch)) { //exact match
711 if (super->qmrnas[i]->exons.Count()>1) {
712 super->qmrnas[i]->udata|=1;
713 super->ichainTP++;
714 qlocus->ichainTP++;
715 rlocus->ichainTP++;
716 }
717 if (exonMatch) {
718 super->qmrnas[i]->udata|=2;
719 super->mrnaTP++;
720 qlocus->mrnaTP++;
721 rlocus->mrnaTP++;
722 }
723 } //exact match
724 } //fuzzy match
725 } //ref mrna loop
726 } //qry mrna loop
727 for (int ql=0;ql<super->qloci.Count();ql++) {
728 if (super->qloci[ql]->ichainTP+super->qloci[ql]->mrnaTP >0 )
729 super->locusQTP++;
730 if (super->qloci[ql]->ichainATP+super->qloci[ql]->mrnaATP>0)
731 super->locusAQTP++;
732 }
733 for (int rl=0;rl<super->rloci.Count();rl++) {
734 if (super->rloci[rl]->ichainTP+super->rloci[rl]->mrnaTP >0 )
735 super->locusTP++;
736 if (super->rloci[rl]->ichainATP+super->rloci[rl]->mrnaATP>0)
737 super->locusATP++;
738 }
739
740 }//for each unlinked locus
741
742 }
743
744 //look for qry data for a specific genomic sequence
745 GSeqData* getQryData(int gid, GList<GSeqData>& qdata) {
746 int qi=-1;
747 GSeqData f(gid);
748 GSeqData* q=NULL;
749 if (qdata.Found(&f,qi))
750 q=qdata[qi];
751 return q;
752 }
753
754 const char* findDescr(GffObj* gfobj) {
755 if (refdescr.Count()==0) return NULL;
756 GStr* s=refdescr.Find(gfobj->getID());
757 if (s==NULL) {
758 s=refdescr.Find(gfobj->getGeneName());
759 if (s==NULL) s=refdescr.Find(gfobj->getGeneID());
760 }
761 if (s!=NULL)
762 return s->chars();
763 return NULL;
764 }
765
766 const char* getGeneID(GffObj* gfobj) {
767 //returns anything that might resemble a gene identifier for this transcript
768 //or, if everything fails, returns the transcript ID
769 const char* s=gfobj->getGeneName();
770 if (s) return s;
771 if ((s=gfobj->getGeneID())!=NULL) return s;
772 if ((s=gfobj->getAttr("Name"))!=NULL) return s;
773 return gfobj->getID();
774 }
775
776 const char* getGeneID(GffObj& gfobj) {
777 return getGeneID(&gfobj);
778 }
779
780 void writeLoci(FILE* f, GList<GLocus> & loci) {
781 for (int l=0;l<loci.Count();l++) {
782 GLocus& loc=*(loci[l]);
783 fprintf(f,"%s\t%s[%c]%d-%d\t", loc.mrna_maxcov->getID(),
784 loc.mrna_maxcov->getGSeqName(),
785 loc.mrna_maxcov->strand, loc.start,loc.end);
786 //now print all transcripts in this locus, comma delimited
787 int printfd=0;
788 for (int i=0;i<loc.mrnas.Count();i++) {
789 if (loc.mrnas[i]==loc.mrna_maxcov) continue;
790 if (printfd==0) fprintf(f,"%s",loc.mrnas[i]->getID());
791 else fprintf(f,",%s",loc.mrnas[i]->getID());
792 printfd++;
793 }
794 const char* rdescr=findDescr(loc.mrna_maxcov);
795 if (rdescr==NULL) fprintf(f,"\t\n");
796 else fprintf(f,"\t%s\n",rdescr);
797 }
798 }
799
800 void printXQ1(FILE* f, int qidx, GList<GLocus>& qloci) {
801 int printfd=0;
802 //print
803 for (int i=0;i<qloci.Count();i++) {
804 if (qloci[i]->qfidx!=qidx) continue;
805 for (int j=0;j<qloci[i]->mrnas.Count();j++) {
806 if (printfd==0) fprintf(f,"%s",qloci[i]->mrnas[j]->getID());
807 else fprintf(f,",%s",qloci[i]->mrnas[j]->getID());
808 printfd++;
809 }
810 }
811 if (printfd==0) fprintf(f,"-");
812 }
813
814 void numXLoci(GList<GXLocus>& xloci, int& last_id) {
815 for (int l=0;l<xloci.Count();l++) {
816 if (xloci[l]->qloci.Count()==0) continue; //we never print ref-only xloci
817 last_id++;
818 xloci[l]->id=last_id;
819 }
820 }
821
822
823 class GProtCl {
824 public:
825 GList<GXConsensus> protcl;
826 GProtCl(GXConsensus* c=NULL):protcl(true,false,false) {
827 if (c!=NULL)
828 protcl.Add(c);
829 }
830 bool add_Pcons(GXConsensus* c) {
831 if (c==NULL || c->aalen==0) return false;
832 if (protcl.Count()==0) {
833 protcl.Add(c);
834 return true;
835 }
836 if (protcl[0]->aalen!=c->aalen) return false;
837 if (strcmp(protcl[0]->aa,c->aa)!=0) return false;
838 protcl.Add(c);
839 return true;
840 }
841
842 void addMerge(GProtCl& pcl, GXConsensus* pclnk) {
843 for (int i=0;i<pcl.protcl.Count();i++) {
844 if (pcl.protcl[i]!=pclnk) {
845 protcl.Add(pcl.protcl[i]);
846 }
847 }
848 }
849
850 int aalen() {
851 if (protcl.Count()==0) return 0;
852 return protcl[0]->aalen;
853 }
854 bool operator==(GProtCl& cl) {
855 return this==&cl;
856 }
857 bool operator<(GProtCl& cl) {
858 return (this<&cl);
859 }
860 };
861
862 class GTssCl:public GSeg { //experiment cluster of ref loci (isoforms)
863 public:
864 uint fstart; //lowest coordinate of the first exon
865 uint fend; //highest coordinate of the first exon
866 GList<GXConsensus> tsscl;
867 GTssCl(GXConsensus* c=NULL):tsscl(true,false,false) {
868 start=0;
869 end=0;
870 fstart=0;
871 fend=0;
872 if (c!=NULL) addFirst(c);
873 }
874
875 void addFirst(GXConsensus* c) {
876 tsscl.Add(c);
877 start=c->start;
878 end=c->end;
879 GffExon* fexon=(c->tcons->strand=='-') ? c->tcons->exons.Last() :
880 c->tcons->exons.First();
881 fstart=fexon->start;
882 fend=fexon->end;
883 }
884 bool add_Xcons(GXConsensus* c) {
885 if (tsscl.Count()==0) {
886 addFirst(c);
887 return true;
888 }
889 //check if it can be added to existing xconsensi
890 uint nfend=0;
891 uint nfstart=0;
892 /*
893 if (tsscl.Get(0)->tcons->getGeneID()!=NULL &&
894 c->tcons->getGeneID()!=NULL &&
895 strcmp(tsscl.Get(0)->tcons->getGeneID(), c->tcons->getGeneID()))
896 //don't tss cluster if they don't have the same GeneID (?)
897 //FIXME: we might not want this if input files are not from Cufflinks
898 // and they could simply lack proper GeneID
899 return false;
900 */
901 if (c->tcons->strand=='-') {
902 //no, the first exons don't have to overlap
903 //if (!c->tcons->exons.Last()->overlap(fstart,fend)) return false;
904 nfstart=c->tcons->exons.Last()->start;
905 nfend=c->tcons->exons.Last()->end;
906 //proximity check for the transcript start:
907 if (nfend>fend+tssDist || fend>nfend+tssDist)
908 return false;
909 }
910 else {
911 //if (!c->tcons->exons.First()->overlap(fstart,fend)) return false;
912 nfstart=c->tcons->exons.First()->start;
913 nfend=c->tcons->exons.First()->end;
914 if (nfstart>fstart+tssDist || fstart>nfstart+tssDist)
915 return false;
916 }
917 // -- if we are here, we can add to tss cluster
918
919 tsscl.Add(c);
920 if (fstart>nfstart) fstart=nfstart;
921 if (fend<nfend) fend=nfend;
922 if (start>c->start) start=c->start;
923 if (end<c->end) end=c->end;
924 return true;
925 }
926
927 void addMerge(GTssCl& cl, GXConsensus* clnk) {
928 for (int i=0;i<cl.tsscl.Count();i++) {
929 if (cl.tsscl[i]==clnk) continue;
930 tsscl.Add(cl.tsscl[i]);
931 }
932 if (fstart>cl.fstart) fstart=cl.fstart;
933 if (fend<cl.fend) fend=cl.fend;
934 if (start>cl.start) start=cl.start;
935 if (end<cl.end) end=cl.end;
936 }
937 };
938 /*
939 class IntArray { //two dimensional int array
940 int* mem;
941 int xsize;
942 int ysize;
943 public:
944 IntArray(int xlen, int ylen) {
945 xsize=xlen;
946 ysize=ylen;
947 GMALLOC(mem, xsize*ysize*sizeof(int));
948 }
949 ~IntArray() {
950 GFREE(mem);
951 }
952 int& data(int x, int y) {
953 return mem[y*xsize+x];
954 }
955 };
956
957 int aa_diff(GXConsensus* c1, GXConsensus* c2) {
958 int diflen=abs(c1->aalen-c2->aalen);
959 if (diflen>=6) return diflen;
960 //obvious case: same CDS
961 if (diflen==0 && strcmp(c1->aa, c2->aa)==0) return 0;
962 //simple edit distance calculation
963 IntArray dist(c1->aalen+1, c2->aalen+1);
964 for (int i=0;i<=c1->aalen;i++) {
965 dist.data(i,0) = i;
966 }
967 for (int j = 0; j <= c2->aalen; j++) {
968 dist.data(0,j) = j;
969 }
970 for (int i = 1; i <= c1->aalen; i++)
971 for (int j = 1; j <= c2->aalen; j++) {
972 dist.data(i,j) = GMIN3( dist.data(i-1,j)+1,
973 dist.data(i,j-1)+1,
974 dist.data(i-1,j-1)+((c1->aa[i-1] == c2->aa[j-1]) ? 0 : 1) );
975 }
976 int r=dist.data(c1->aalen,c2->aalen);
977 return r;
978 }
979 */
980 void printConsGTF(FILE* fc, GXConsensus* xc, int xlocnum) {
981 for (int i=0;i<xc->tcons->exons.Count();i++) {
982 fprintf(fc,
983 "%s\t%s\texon\t%d\t%d\t.\t%c\t.\tgene_id \"XLOC_%06d\"; transcript_id \"%s_%08d\"; exon_number \"%d\";",
984 xc->tcons->getGSeqName(),xc->tcons->getTrackName(),xc->tcons->exons[i]->start, xc->tcons->exons[i]->end, xc->tcons->strand,
985 xlocnum, cprefix, xc->id, i+1);
986 //if (i==0) {
987 const char* gene_name=NULL;
988 if (xc->ref) {
989 gene_name=xc->ref->getGeneName();
990 if (gene_name==NULL) gene_name=xc->ref->getGeneID();
991 if (gene_name) {
992 fprintf (fc, " gene_name \"%s\";", gene_name);
993 }
994 }
995 if (!haveRefs) {
996 if (gene_name==NULL && xc->tcons->getGeneName())
997 fprintf (fc, " gene_name \"%s\";", xc->tcons->getGeneName());
998 char* s=xc->tcons->getAttr("nearest_ref", true);
999 if (s) fprintf(fc, " nearest_ref \"%s\";",s);
1000 s=xc->tcons->getAttr("class_code", true);
1001 if (s) fprintf(fc, " class_code \"%s\";", s);
1002 }
1003 fprintf(fc, " oId \"%s\";",xc->tcons->getID());
1004 if (xc->contained) {
1005 fprintf(fc, " contained_in \"%s_%08d\";", cprefix, xc->contained->id);
1006 }
1007 if (haveRefs) {
1008 if (xc->ref!=NULL)
1009 fprintf(fc, " nearest_ref \"%s\";",xc->ref->getID());
1010 fprintf(fc, " class_code \"%c\";",xc->refcode ? xc->refcode : '.');
1011 }
1012 if (xc->tss_id>0) fprintf(fc, " tss_id \"TSS%d\";",xc->tss_id);
1013 if (xc->p_id>0) fprintf(fc, " p_id \"P%d\";",xc->p_id);
1014 // }
1015 fprintf(fc,"\n");
1016 }
1017 }
1018
1019 void tssCluster(GXLocus& xloc)
1020 {
1021 GList<GTssCl> xpcls(true,true,false);
1022 for (int i=0;i<xloc.tcons.Count();i++)
1023 {
1024 GXConsensus* c=xloc.tcons[i];
1025 //if (c->tcons->exons.Count()<2) continue; //should we skip single-exon transcripts ??
1026 GArray<int> mrgloci(true);
1027 int lfound=0;
1028 for (int l=0;l<xpcls.Count();l++)
1029 {
1030 if (xpcls[l]->end<c->tcons->exons.First()->start) continue;
1031 if (xpcls[l]->start>c->tcons->exons.Last()->end) break;
1032 if (xpcls[l]->add_Xcons(c))
1033 {
1034 lfound++;
1035 mrgloci.Add(l);
1036
1037 }
1038
1039 } // for each xpcluster
1040 if (lfound==0)
1041 {
1042 //create a xpcl with only this xconsensus
1043 xpcls.Add(new GTssCl(c));
1044
1045 }
1046 else if (lfound>1)
1047 {
1048 for (int l=1;l<lfound;l++)
1049 {
1050 int mlidx=mrgloci[l]-l+1;
1051 xpcls[mrgloci[0]]->addMerge(*xpcls[mlidx], c);
1052 xpcls.Delete(mlidx);
1053 }
1054 }
1055
1056 }//for each xconsensus in this xlocus
1057 for (int l=0;l<xpcls.Count();l++)
1058 {
1059 //if (xpcls[l]->tsscl.Count()<2) continue;
1060 tsscl_num++;
1061 for (int i=0;i<xpcls[l]->tsscl.Count();i++)
1062 xpcls[l]->tsscl[i]->tss_id=tsscl_num;
1063 //processTssCl(xcds_num, xpcls[l], faseq);
1064 }
1065 }
1066
1067 void protCluster(GXLocus& xloc, GFaSeqGet *faseq) {
1068 if (!faseq)
1069 return;
1070 GList<GProtCl> xpcls(true,true,false);
1071 for (int i=0;i<xloc.tcons.Count();i++) {
1072 GXConsensus* c=xloc.tcons[i];
1073 if (c->ref==NULL || c->ref->CDstart==0) continue; //no ref or CDS available
1074 if (c->refcode!='=') continue;
1075 //get the CDS translation here
1076 if (c->aa==NULL) {
1077 c->aa=c->ref->getSplicedTr(faseq, true, &c->aalen);
1078 if (c->aalen>0 && c->aa[c->aalen-1]=='.') {
1079 //discard the final stop codon
1080 c->aalen--;
1081 c->aa[c->aalen]=0;
1082 }
1083 }
1084 GArray<int> mrgloci(true);
1085 int lfound=0;
1086 for (int l=0;l<xpcls.Count();l++) {
1087 if (xpcls[l]->aalen()!=c->aalen) continue;
1088 if (xpcls[l]->add_Pcons(c)) {
1089 lfound++;
1090 mrgloci.Add(l);
1091 }
1092 } // for each xpcluster
1093 if (lfound==0) {
1094 //create a xpcl with only this xconsensus
1095 xpcls.Add(new GProtCl(c));
1096 }
1097 else if (lfound>1) {
1098 for (int l=1;l<lfound;l++) {
1099 int mlidx=mrgloci[l]-l+1;
1100 xpcls[mrgloci[0]]->addMerge(*xpcls[mlidx], c);
1101 xpcls.Delete(mlidx);
1102 }
1103 }
1104 }//for each xconsensus in this xlocus
1105 for (int l=0;l<xpcls.Count();l++) {
1106 protcl_num++;
1107 for (int i=0;i<xpcls[l]->protcl.Count();i++)
1108 xpcls[l]->protcl[i]->p_id=protcl_num;
1109 }
1110 for (int i=0;i<xloc.tcons.Count();i++) {
1111 GXConsensus* c=xloc.tcons[i];
1112 if (c->aa!=NULL) { GFREE(c->aa); }
1113 }
1114 }
1115
1116 void printXLoci(FILE* f, FILE* fc, int qcount, GList<GXLocus>& xloci, GFaSeqGet *faseq) {
1117 for (int l=0;l<xloci.Count();l++) {
1118 if (xloci[l]->qloci.Count()==0) continue;
1119 GXLocus& xloc=*(xloci[l]);
1120 xloc.checkContainment();
1121 tssCluster(xloc);//cluster and assign tss_id and cds_id to each xconsensus in xloc
1122 protCluster(xloc,faseq);
1123 for (int c=0;c<xloc.tcons.Count();c++) {
1124 if (showContained || xloc.tcons[c]->contained==NULL)
1125 printConsGTF(fc,xloc.tcons[c],xloc.id);
1126 }
1127 fprintf(f,"XLOC_%06d\t%s[%c]%d-%d\t", xloc.id,
1128 xloc.qloci[0]->mrna_maxcov->getGSeqName(),
1129 xloc.strand, xloc.start,xloc.end);
1130 //now print all transcripts in this locus, comma delimited
1131 //first, ref loci, if any
1132 int printfd=0;
1133 if (xloc.rloci.Count()>0) {
1134 for (int i=0;i<xloc.rloci.Count();i++) {
1135 for (int j=0;j<xloc.rloci[i]->mrnas.Count();j++) {
1136 if (printfd==0) fprintf(f,"%s|%s",getGeneID(xloc.rloci[i]->mrnas[j]),
1137 xloc.rloci[i]->mrnas[j]->getID());
1138 else fprintf(f,",%s|%s",getGeneID(xloc.rloci[i]->mrnas[j]),
1139 xloc.rloci[i]->mrnas[j]->getID());
1140 printfd++;
1141 }
1142 }
1143 }
1144 else {
1145 fprintf(f,"-");
1146 }
1147 //second, all the cufflinks transcripts
1148 for (int qi=0;qi<qcount;qi++) {
1149 fprintf(f,"\t");
1150 printXQ1(f,qi,xloc.qloci);
1151 }
1152 fprintf(f,"\n");
1153 }
1154 }
1155
1156 void writeIntron(FILE* f, char strand, GFaSeqGet* faseq, GSeg& iseg,
1157 GList<GffObj>& mrnas, bool wrong=false) {
1158 //find a ref mrna having this intron
1159 GffObj* rm=NULL;
1160 for (int i=0;i<mrnas.Count();i++) {
1161 GffObj* m=mrnas[i];
1162 if (m->start>iseg.end) break;
1163 if (m->end<iseg.start) continue;
1164 //intron coords overlaps mrna region
1165 for (int j=1;j<m->exons.Count();j++) {
1166 if (iseg.start==m->exons[j-1]->end+1 &&
1167 iseg.end==m->exons[j]->start-1) { rm=m; break; } //match found
1168 }//for each intron
1169 if (rm!=NULL) break;
1170 } //for each ref mrna in this locus
1171 if (rm==NULL) GError("Error: couldn't find ref mrna for intron %d-%d! (BUG)\n",
1172 iseg.start,iseg.end);
1173 int ilen=iseg.end-iseg.start+1;
1174 fprintf(f,"%s\t%s\tintron\t%d\t%d\t.\t%c\t.\t",
1175 rm->getGSeqName(),rm->getTrackName(),iseg.start,iseg.end,strand);
1176 if (faseq!=NULL) {
1177 const char* gseq=faseq->subseq(iseg.start, ilen);
1178 char* cseq=Gstrdup(gseq, gseq+ilen-1);
1179 if (strand=='-') reverseComplement(cseq, ilen);
1180 fprintf(f,"spl=\"%c%c..%c%c\"; ", toupper(cseq[0]),toupper(cseq[1]),
1181 toupper(cseq[ilen-2]),toupper(cseq[ilen-1]));
1182 GFREE(cseq);
1183 }
1184 fprintf(f,"transcript_id \"%s\";", rm->getID());
1185 if (wrong) fprintf(f," noOvl=1;");
1186 fprintf(f,"\n");
1187
1188 }
1189
1190 void reportMIntrons(FILE* fm, FILE* fn, FILE* fq, char strand,
1191 GList<GSuperLocus>& cmpdata) {
1192 if (fm==NULL) return;
1193 for (int l=0;l<cmpdata.Count();l++) {
1194 GSuperLocus *sl=cmpdata[l];
1195 //cache the whole locus sequence if possible
1196 //write these introns and their splice sites into the file
1197 for (int i=0;i<sl->i_missed.Count();i++)
1198 writeIntron(fm, strand, NULL, sl->i_missed[i], sl->rmrnas);
1199 if (fn!=NULL) {
1200 for (int i=0;i<sl->i_notp.Count();i++)
1201 writeIntron(fn, strand, NULL, sl->i_notp[i], sl->rmrnas);
1202 }
1203 if (fq!=NULL) {
1204 for (int i=0;i<sl->i_qwrong.Count();i++) {
1205 writeIntron(fq, strand, NULL, sl->i_qwrong[i], sl->qmrnas, true);
1206 }
1207 for (int i=0;i<sl->i_qnotp.Count();i++) {
1208 writeIntron(fq, strand, NULL, sl->i_qnotp[i], sl->qmrnas);
1209 }
1210 }
1211 }
1212 }
1213
1214
1215 void processLoci(GSeqData& seqdata, GSeqData* refdata, int qfidx) {
1216 //GList<GSeqLoci>& glstloci, GList<GSeqCmpRegs>& cmpdata)
1217
1218 if (refdata!=NULL) {
1219 //if (gtf_tracking_verbose) GMessage(" ..comparing to reference loci..\n") ;
1220 compareLoci2R(seqdata.loci_f, seqdata.gstats_f, refdata->loci_f, qfidx);
1221 compareLoci2R(seqdata.loci_r, seqdata.gstats_r, refdata->loci_r, qfidx);
1222 // -- report
1223
1224 if (f_mintr!=NULL) {
1225 GMessage(" ..reporting missed ref introns..\n");
1226 //reportIntrons(f_mintr, f_nintr, f_qintr, faseq, '+', seqdata.gstats_f);
1227 //reportIntrons(f_mintr, f_nintr, f_qintr, faseq, '-', seqdata.gstats_r);
1228 reportMIntrons(f_mintr, NULL, NULL, '+', seqdata.gstats_f);
1229 reportMIntrons(f_mintr, NULL, NULL, '-', seqdata.gstats_r);
1230 }
1231 }
1232 }
1233
1234 //adjust stats for a list of unoverlapped (completely missed) ref loci
1235 void collectRLocData(GSuperLocus& stats, GLocus& loc) {
1236 stats.total_rmrnas+=loc.mrnas.Count();
1237 stats.total_rexons+=loc.uexons.Count();
1238 stats.total_rintrons+=loc.introns.Count();
1239 stats.total_rmexons+=loc.mexons.Count();
1240 stats.total_richains+=loc.ichains;
1241 stats.m_exons+=loc.uexons.Count();
1242 stats.m_introns+=loc.introns.Count();
1243 stats.total_rloci++;
1244 for (int e=0;e<loc.mexons.Count();e++) {
1245 stats.rbases_all+=loc.mexons[e].end-loc.mexons[e].start+1;
1246 }
1247 }
1248
1249 void collectRData(GSuperLocus& stats, GList<GLocus>& loci) {
1250 for (int l=0;l<loci.Count();l++)
1251 collectRLocData(stats,*loci[l]);
1252 }
1253
1254 //adjust stats for a list of unoverlapped (completely "wrong" or novel) qry loci
1255 void collectQLocData(GSuperLocus& stats, GLocus& loc) {
1256 stats.total_qmrnas+=loc.mrnas.Count();
1257 stats.total_qexons+=loc.uexons.Count();
1258 stats.total_qmexons+=loc.mexons.Count();
1259 stats.total_qintrons+=loc.introns.Count();
1260 stats.total_qichains+=loc.ichains;
1261 stats.total_qloci++;
1262 if (loc.ichains>0 && loc.mrnas.Count()>1)
1263 stats.total_qloci_alt++;
1264 stats.w_exons+=loc.uexons.Count();
1265 stats.w_introns+=loc.introns.Count();
1266 for (int e=0;e<loc.mexons.Count();e++) {
1267 stats.qbases_all+=loc.mexons[e].end-loc.mexons[e].start+1;
1268 }
1269 }
1270
1271 void collectQData(GSuperLocus& stats, GList<GLocus>& loci, GList<GLocus>& nloci) {
1272 for (int l=0;l<loci.Count();l++) {
1273 //this is called when no refdata is given, so all these loci are nloci
1274 nloci.Add(loci[l]);
1275 collectQLocData(stats,*loci[l]);
1276 }
1277 }
1278
1279 void collectQNOvl(GSuperLocus& stats, GList<GLocus>& loci, GList<GLocus>& nloci) {
1280 for (int l=0;l<loci.Count();l++) {
1281 if (loci[l]->cmpovl.Count()==0) {//locus with no ref loci overlaps
1282 stats.w_loci++; //novel/wrong loci
1283 nloci.Add(loci[l]);
1284 collectQLocData(stats,*loci[l]);
1285 }
1286 }
1287 }
1288
1289 void collectQU(GSuperLocus& stats, GList<GLocus>& nloci) {
1290 for (int l=0;l<nloci.Count();l++) {
1291 stats.w_loci++; //novel/wrong loci
1292 collectQLocData(stats, *nloci[l]);
1293 }
1294 }
1295
1296 void printLocus(FILE* f, GLocus& loc, const char* gseqname) {
1297 fprintf(f, "## Locus %s:%d-%d\n",gseqname, loc.start, loc.end);
1298 for (int m=0;m<loc.mrnas.Count();m++) {
1299 loc.mrnas[m]->printGtf(f);
1300 }
1301 }
1302
1303 void collectRNOvl(GSuperLocus& stats, GList<GLocus>& loci) { //, const char* gseqname) {
1304 for (int l=0;l<loci.Count();l++) {
1305 if (loci[l]->cmpovl.Count()==0) {
1306 stats.m_loci++; //missed ref loci
1307 //if (f_mloci!=NULL)
1308 // printLocus(f_mloci,*loci[l], gseqname);
1309 collectRLocData(stats,*loci[l]);
1310 }
1311 }
1312 }
1313
1314
1315 void collectCmpData(GSuperLocus& stats, GList<GSuperLocus>& cmpdata) { //, const char* gseqname) {
1316 for (int c=0;c<cmpdata.Count();c++) {
1317 stats.addStats(*cmpdata[c]);
1318 /*
1319 if (f_nloci!=NULL && cmpdata[c]->locusTP==0 && cmpdata[c]->rloci.Count()>0) {
1320 fprintf(f_nloci, "# Superlocus %s:%d-%d\n",gseqname, cmpdata[c]->start, cmpdata[c]->end);
1321 for (int l=0;l<cmpdata[c]->rloci.Count();l++) {
1322 printLocus(f_nloci,*cmpdata[c]->rloci[l], gseqname);
1323 }
1324 }
1325 */
1326 }
1327 }
1328
1329 void collectStats(GSuperLocus& stats, GSeqData* seqdata, GSeqData* refdata) {
1330 //collect all stats for a single genomic sequence into stats
1331 if (seqdata==NULL) {
1332 if (reduceRefs || refdata==NULL) return;
1333 //special case with completely missed all refs on a contig/chromosome
1334 collectRData(stats, refdata->loci_f);
1335 collectRData(stats, refdata->loci_r);
1336 return;
1337 }
1338 if (refdata==NULL) {//reference data missing on this contig
1339 collectQData(stats, seqdata->loci_f, seqdata->nloci_f);
1340 collectQData(stats, seqdata->loci_r, seqdata->nloci_r);
1341 collectQU(stats, seqdata->nloci_u);
1342 return;
1343 }
1344
1345 /*stats.total_qloci+=seqdata->loci_f.Count();
1346 stats.total_qloci+=seqdata->loci_r.Count();
1347 if (reduceRefs) { //only collect ref loci from superloci
1348
1349 }
1350 else {
1351 stats.total_rloci+=refdata->loci_f.Count();
1352 stats.total_rloci+=refdata->loci_r.Count();
1353 }
1354 */
1355 //collect data for overlapping superloci (already in seqdata->gstats_f/_r)
1356 //char* gseqname=getGSeqName(seqdata->gseq_id);
1357 collectCmpData(stats, seqdata->gstats_f);
1358 collectCmpData(stats, seqdata->gstats_r);
1359 //for non-overlapping qry loci, always add them as false positives FP
1360 collectQNOvl(stats, seqdata->loci_f, seqdata->nloci_f);
1361 collectQNOvl(stats, seqdata->loci_r, seqdata->nloci_r);
1362 collectQU(stats, seqdata->nloci_u);
1363 if (!reduceRefs) { //find ref loci with empty cmpovl and add them
1364 collectRNOvl(stats, refdata->loci_f);
1365 collectRNOvl(stats, refdata->loci_r);
1366 }
1367 }
1368
1369 void reportStats(FILE* fout, const char* setname, GSuperLocus& stotal,
1370 GSeqData* seqdata, GSeqData* refdata) {
1371 GSuperLocus stats;
1372 bool finalSummary=(seqdata==NULL && refdata==NULL);
1373 GSuperLocus *ps=(finalSummary ? &stotal : &stats );
1374 if (!finalSummary) { //collecting contig stats
1375 //gather statistics for all loci/superloci here
1376 collectStats(stats, seqdata, refdata);
1377 stotal.addStats(stats);
1378 if (!perContigStats) return;
1379 }
1380 ps->calcF();
1381 if (seqdata!=NULL) fprintf(fout, "#> Genomic sequence: %s \n", setname);
1382 else fprintf(fout, "\n#= Summary for dataset: %s :\n", setname);
1383
1384 fprintf(fout, "# Query mRNAs : %7d in %7d loci (%d multi-exon transcripts)\n",
1385 ps->total_qmrnas, ps->total_qloci, ps->total_qichains);
1386 fprintf(fout, "# (%d multi-transcript loci, ~%.1f transcripts per locus)\n",
1387 ps->total_qloci_alt, ((double)ps->total_qmrnas/ps->total_qloci));
1388
1389 if (haveRefs) {
1390 fprintf(fout, "# Reference mRNAs : %7d in %7d loci (%d multi-exon)\n",
1391 ps->total_rmrnas, ps->total_rloci, ps->total_richains);
1392 if (ps->baseTP+ps->baseFP==0 || ps->baseTP+ps->baseFN==0) return;
1393 fprintf(fout, "# Corresponding super-loci: %7d\n",ps->total_superloci);
1394
1395 /*if (seqdata!=NULL) {
1396 fprintf(fout, " ( %d/%d on forward/reverse strand)\n",
1397 seqdata->gstats_f.Count(),seqdata->gstats_r.Count());
1398 }*/
1399 fprintf(fout, "#--------------------| Sn | Sp | fSn | fSp \n");
1400 double sp=(100.0*(double)ps->baseTP)/(ps->baseTP+ps->baseFP);
1401 double sn=(100.0*(double)ps->baseTP)/(ps->baseTP+ps->baseFN);
1402 fprintf(fout, " Base level: \t%5.1f\t%5.1f\t - \t - \n",sn, sp);
1403 sp=(100.0*(double)ps->exonTP)/(ps->exonTP+ps->exonFP);
1404 sn=(100.0*(double)ps->exonTP)/(ps->exonTP+ps->exonFN);
1405 double fsp=(100.0*(double)ps->exonATP)/(ps->exonATP+ps->exonAFP);
1406 double fsn=(100.0*(double)ps->exonATP)/(ps->exonATP+ps->exonAFN);
1407 if (fsp>100.0) fsp=100.0;
1408 if (fsn>100.0) fsn=100.0;
1409 fprintf(fout, " Exon level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1410 if (ps->total_rintrons>0) {
1411 //intron level
1412 sp=(100.0*(double)ps->intronTP)/(ps->intronTP+ps->intronFP);
1413 sn=(100.0*(double)ps->intronTP)/(ps->intronTP+ps->intronFN);
1414 fsp=(100.0*(double)ps->intronATP)/(ps->intronATP+ps->intronAFP);
1415 fsn=(100.0*(double)ps->intronATP)/(ps->intronATP+ps->intronAFN);
1416 if (fsp>100.0) fsp=100.0;
1417 if (fsn>100.0) fsn=100.0;
1418 fprintf(fout, " Intron level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1419 //intron chains:
1420 sp=(100.0*(double)ps->ichainTP)/(ps->ichainTP+ps->ichainFP);
1421 sn=(100.0*(double)ps->ichainTP)/(ps->ichainTP+ps->ichainFN);
1422 if (sp>100.0) sp=100.0;
1423 if (sn>100.0) sn=100.0;
1424 fsp=(100.0*(double)ps->ichainATP)/(ps->ichainATP+ps->ichainAFP);
1425 fsn=(100.0*(double)ps->ichainATP)/(ps->ichainATP+ps->ichainAFN);
1426 if (fsp>100.0) fsp=100.0;
1427 if (fsn>100.0) fsn=100.0;
1428 fprintf(fout, "Intron chain level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1429 }
1430 else {
1431 fprintf(fout, " Intron level: \t - \t - \t - \t - \n");
1432 fprintf(fout, "Intron chain level: \t - \t - \t - \t - \n");
1433 }
1434 sp=(100.0*(double)ps->mrnaTP)/(ps->mrnaTP+ps->mrnaFP);
1435 sn=(100.0*(double)ps->mrnaTP)/(ps->mrnaTP+ps->mrnaFN);
1436 fsp=(100.0*(double)ps->mrnaATP)/(ps->mrnaATP+ps->mrnaAFP);
1437 fsn=(100.0*(double)ps->mrnaATP)/(ps->mrnaATP+ps->mrnaAFN);
1438 if (fsp>100.0) fsp=100.0;
1439 if (fsn>100.0) fsn=100.0;
1440 fprintf(fout, " Transcript level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1441 //sp=(100.0*(double)ps->locusTP)/(ps->locusTP+ps->locusFP);
1442 sp=(100.0*(double)ps->locusQTP)/ps->total_qloci;
1443 sn=(100.0*(double)ps->locusTP)/ps->total_rloci; //(ps->locusTP+ps->locusFN);
1444 fsp=(100.0*(double)ps->locusAQTP)/ps->total_qloci; //(ps->locusATP+ps->locusAFP);
1445 fsn=(100.0*(double)ps->locusATP)/ps->total_rloci; //(ps->locusATP+ps->locusAFN);
1446 fprintf(fout, " Locus level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1447 //fprintf(fout, " (locus TP=%d, total ref loci=%d)\n",ps->locusTP, ps->total_rloci);
1448 fprintf(fout, "\nMatching intron chains: %7d\n",ps->ichainTP);
1449 fprintf(fout, " Matching loci: %7d\n",ps->locusTP);
1450 fprintf(fout, "\n");
1451 sn=(100.0*(double)ps->m_exons)/(ps->total_rexons);
1452 fprintf(fout, " Missed exons: %7d/%d\t(%5.1f%%)\n",ps->m_exons, ps->total_rexons, sn);
1453 sn=(100.0*(double)ps->w_exons)/(ps->total_qexons);
1454 fprintf(fout, " Novel exons: %7d/%d\t(%5.1f%%)\n",ps->w_exons, ps->total_qexons,sn);
1455 if (ps->total_rintrons>0) {
1456 sn=(100.0*(double)ps->m_introns)/(ps->total_rintrons);
1457 fprintf(fout, " Missed introns: %7d/%d\t(%5.1f%%)\n",ps->m_introns, ps->total_rintrons, sn);
1458 }
1459 if (ps->total_qintrons>0) {
1460 sn=(100.0*(double)ps->w_introns)/(ps->total_qintrons);
1461 fprintf(fout, " Novel introns: %7d/%d\t(%5.1f%%)\n",ps->w_introns, ps->total_qintrons,sn);
1462 }
1463 if (ps->total_rloci>0) {
1464 sn=(100.0*(double)ps->m_loci)/(ps->total_rloci);
1465 fprintf(fout, " Missed loci: %7d/%d\t(%5.1f%%)\n",ps->m_loci, ps->total_rloci, sn);
1466 }
1467 if (ps->total_qloci>0) {
1468 sn=(100.0*(double)ps->w_loci)/(ps->total_qloci);
1469 fprintf(fout, " Novel loci: %7d/%d\t(%5.1f%%)\n",ps->w_loci, ps->total_qloci,sn);
1470 }
1471
1472 }
1473 }
1474
1475 int inbuf_len=1024; //starting inbuf capacity
1476 char* inbuf=NULL; // incoming buffer for sequence lines.
1477
1478 void loadRefDescr(const char* fname) {
1479 if (inbuf==NULL) { GMALLOC(inbuf, inbuf_len); }
1480 FILE *f=fopen(fname, "rb");
1481 if (f==NULL) GError("Error opening exon file: %s\n",fname);
1482 char* line;
1483 int llen=0;
1484 off_t fpos;
1485 while ((line=fgetline(inbuf, inbuf_len, f, &fpos, &llen))!=NULL) {
1486 if (strlen(line)<=2) continue;
1487 int idlen=strcspn(line,"\t ");
1488 char* p=line+idlen;
1489 if (idlen<llen && idlen>0) {
1490 *p=0;
1491 p++;
1492 refdescr.Add(line, new GStr(p));
1493 }
1494 }
1495 }
1496
1497 GSeqTrack* findGSeqTrack(int gsid) {
1498 GSeqTrack f(gsid);
1499 int fidx=-1;
1500 if (gseqtracks.Found(&f,fidx))
1501 return gseqtracks[fidx];
1502 fidx=gseqtracks.Add(new GSeqTrack(gsid));
1503 return gseqtracks[fidx];
1504 }
1505
1506
1507
1508 GffObj* findRefMatch(GffObj& m, GLocus& rloc, int& ovlen) {
1509 ovlen=0;
1510 CTData* mdata=((CTData*)m.uptr);
1511 if (mdata->eqref!=NULL && ((CTData*)(mdata->eqref->uptr))->locus==&rloc) {
1512 mdata->eqref=mdata->ovls.First()->mrna; //this should be unnecessary
1513 //check it?
1514 return mdata->ovls.First()->mrna;
1515 }
1516 //if (rloc==NULL|| m==NULL) return NULL;
1517 GffObj* ret=NULL;
1518 for (int r=0;r<rloc.mrnas.Count();r++) {
1519 int olen=0;
1520 if (tMatch(m, *(rloc.mrnas[r]),olen, true)) { //return rloc->mrnas[r];
1521 /*
1522 if (ovlen<olen) {
1523 ovlen=olen;
1524 ret=rloc.mrnas[r]; //keep the longest matching ref
1525 //but this is unnecessary, there can be only one matching ref
1526 // because duplicate refs were discarded (unless -G was used)
1527 }
1528 */
1529 mdata->addOvl('=',rloc.mrnas[r], olen);
1530 ret=mdata->ovls.First()->mrna;
1531 //this must be called only for the head of an equivalency chain
1532 CTData* rdata=(CTData*)rloc.mrnas[r]->uptr;
1533 rdata->addOvl('=',&m,olen);
1534 //if (rdata->eqnext==NULL) rdata->eqnext=&m;
1535 }
1536 }
1537 if (ret!=NULL)
1538 mdata->eqref=ret;
1539 return ret;
1540 }
1541
1542
1543 void addXCons(GXLocus* xloc, GffObj* ref, char ovlcode, GffObj* tcons, CEqList* ts) {
1544 GXConsensus* c=new GXConsensus(tcons, ts, ref, ovlcode);
1545 //xloc->tcons.Add(c);
1546 //this will also check c against the other tcons for containment:
1547 xloc->addXCons(c);
1548 }
1549
1550
1551 const uint pre_mrna_threshold = 100;
1552
1553 char getOvlCode(GffObj& m, GffObj& r, int& ovlen) {
1554 ovlen=0;
1555 if (!m.overlap(r.start,r.end)) return 'u';
1556 int jmax=r.exons.Count()-1;
1557
1558 if (m.exons.Count()==1) { //single-exon transfrag
1559 GSeg mseg(m.start, m.end);
1560 if (jmax==0) { //also single-exon ref
1561 ovlen=mseg.overlapLen(r.start,r.end);
1562 int lmax=GMAX(r.covlen, m.covlen);
1563 if (ovlen >= lmax*0.8) return '='; //fuzz matching for single-exon transcripts: 80% of the longer one
1564 //if (m.covlen<=ovlen+12 && m.covlen<r.covlen) return 'c'; //contained
1565 if (m.covlen<r.covlen && ovlen >= m.covlen*0.8) return 'c';
1566 return 'o'; //just plain overlapping
1567 }
1568 //single-exon qry overlaping multi-exon ref
1569 for (int j=0;j<=jmax;j++) {
1570 //check if it's contained by an exon
1571 if (m.start>r.exons[j]->start-8 && m.end<r.exons[j]->end+8)
1572 return 'c';
1573 if (j==jmax) break;
1574 //check if it's contained by an intron
1575 if (m.end<r.exons[j+1]->start && m.start>r.exons[j]->end)
1576 return 'i';
1577 // check if it's a potential pre-mRNA transcript
1578 // (if overlaps an intron at least 10 bases)
1579 uint iovlen=mseg.overlapLen(r.exons[j]->end+1, r.exons[j+1]->start-1);
1580 if (iovlen>=10 && mseg.len()>iovlen+10) return 'e';
1581 }
1582 return 'o'; //plain overlap, uncategorized
1583 } //single-exon transfrag
1584 //-- from here on we have a multi-exon transfrag --
1585 // * check if contained by a ref intron
1586 for (int j=0;j<jmax;j++) {
1587 if (m.end<r.exons[j+1]->start && m.start>r.exons[j]->end)
1588 return 'i';
1589 }
1590 //> check if m's intron chain is a subset of r's intron chain
1591 int imax=m.exons.Count()-1;// imax>0 here
1592 if (m.exons[imax]->start<r.exons[0]->end ||
1593 r.exons[jmax]->start<m.exons[0]->end ) //intron chains do not overlap at all
1594 return 'o'; //but terminal exons do, otherwise we wouldn't be here
1595 int i=1; //index of exon to the right of current qry intron
1596 int j=1; //index of exon to the right of current ref intron
1597 //find first intron overlap
1598 while (i<=imax && j<=jmax) {
1599 if (r.exons[j]->start<m.exons[i-1]->end) { j++; continue; }
1600 if (m.exons[i]->start<r.exons[j-1]->end) { i++; continue; }
1601 break; //here we have an intron overlap
1602 }
1603 if (i>imax || j>jmax)
1604 return 'o'; //no initial intron overlap found
1605 //from here on we check all qry introns against ref introns
1606 bool jmatch=false; //true if at least a junction match is found
1607 bool icmatch=(i==1); //intron chain match - it will be updated as introns are checked
1608 //bool exovli=false; // if any terminal exon of qry extends into a ref intron
1609 int jmstart=j; //index of first intron overlap of reference
1610 int jmend=0; //index of last intron overlap of reference
1611 int imend=0; //index of last intron overlap of query
1612 //check for intron matches
1613 while (i<=imax && j<=jmax) {
1614 uint mstart=m.exons[i-1]->end;
1615 uint mend=m.exons[i]->start;
1616 uint rstart=r.exons[j-1]->end;
1617 uint rend=r.exons[j]->start;
1618 if (rend<mstart) { j++; icmatch=false; continue; } //skipping ref intron, no ichain match
1619 if (mend<rstart) { i++; icmatch=false; continue; } //skipping qry intron, no ichain match
1620 //overlapping introns here, test junction matching
1621 jmend=j; //keep track of last overlapping intron
1622 imend=i;
1623 bool smatch=(mstart==rstart);
1624 bool ematch=(mend==rend);
1625 if (smatch || ematch) jmatch=true;
1626 if (smatch && ematch) { i++; j++; } //perfect match for this intron
1627 else { //at least one junction doesn't match
1628 icmatch=false;
1629 if (mend>rend) j++; else i++;
1630 }
1631 } //while checking intron overlaps
1632
1633 if (icmatch && imend==imax) { // qry intron chain match
1634 if (jmstart==1 && jmend==jmax) return '='; //identical intron chains
1635 // -- qry intron chain is shorter than ref intron chain --
1636 int l_iovh=0; // overhang of leftmost q exon left boundary beyond the end of ref intron to the left
1637 int r_iovh=0; // same type of overhang through the ref intron on the right
1638 if (jmstart>1 && r.exons[jmstart-1]->start>m.start)
1639 l_iovh = r.exons[jmstart-1]->start - m.start;
1640 if (jmend<jmax && m.end > r.exons[jmend]->end)
1641 r_iovh = m.end - r.exons[jmend]->end;
1642 if (l_iovh<4 && r_iovh<4) return 'c';
1643 //TODO? check if any x_iovl>10 and return 'e' to signal an "unspliced intron" ?
1644 // or we can check if any of them are >= the length of the corresponding ref intron on that side
1645 return 'j';
1646 }
1647 /*
1648 if (icmatch && (jmax>=imax)) { //all qry introns match
1649 //but they may overlap
1650 // if ((lbound && lbound > m.exons[0]->start+10) ||
1651 // (j<=jmax && m.exons[i-1]->end > r.exons[j-1]->end+10)) return 'j';
1652 // return 'c';
1653 // }
1654 int code = 'c';
1655 if (lbound)
1656 {
1657 uint ref_boundary = lbound;
1658 uint cuff_boundary = m.exons[0]->start;
1659 if (ref_boundary > (cuff_boundary + pre_mrna_threshold)) // cuff extends a lot
1660 {
1661 code = 'j';
1662 }
1663 if (ref_boundary > cuff_boundary) // cuff extends just a bit into a ref intron
1664 {
1665 code = 'e';
1666 }
1667 }
1668 if (j <= jmax)
1669 {
1670 uint ref_boundary = r.exons[j-1]->end;
1671 uint cuff_boundary = m.exons[i-1]->end;
1672 if (cuff_boundary > (ref_boundary + pre_mrna_threshold)) // cuff extends a lot
1673 {
1674 code = 'j';
1675 }
1676 if (cuff_boundary > ref_boundary) // cuff extends just a bit into a ref intron
1677 {
1678 code = 'e';
1679 }
1680 }
1681 //if ((lbound && lbound > m.exons[0]->start+10) ||
1682 // (j<=jmax && m.exons[i-1]->end > r.exons[j-1]->end+10)) return 'j';
1683 return code;
1684 }
1685
1686 // if (!ichain) // first and last exons more or less match, but there's a different intron somewhere
1687 // {
1688 //
1689 // }
1690
1691 */
1692 return jmatch ? 'j':'o';
1693 }
1694
1695 char getRefOvl(GffObj& m, GLocus& rloc, GffObj*& rovl, int& ovlen) {
1696 rovl=NULL;
1697 ovlen=0;
1698 if (m.start>rloc.end || m.end<rloc.start) {
1699 //see if it's a polymerase run ?
1700 /*
1701 if ((m.strand=='+' && m.end<=rloc.end+polyrun_range) ||
1702 (m.strand=='-' && m.start>=rloc.start-polyrun_range)) {
1703 rovl=rloc.mrna_maxcov;
1704 ((CTData*)m.uptr)->addOvl('p',rloc.mrna_maxcov);
1705 return 'p';
1706 }
1707 */
1708 return 0; //unknown -> intergenic space
1709 }
1710 for (int i=0;i<rloc.mrnas.Count();i++) {
1711 GffObj* r=rloc.mrnas[i];
1712 int olen=0;
1713 char ovlcode=getOvlCode(m,*r,olen);
1714 if (ovlcode!=0) { //has some sort of "overlap" with r
1715 ((CTData*)m.uptr)->addOvl(ovlcode,r,olen);
1716 if (olen>ovlen) ovlen=olen;
1717 if (ovlcode=='c' || ovlcode=='=') //keep match/containment for each reference transcript
1718 ((CTData*)r->uptr)->addOvl(ovlcode,&m,olen);
1719 }
1720 }//for each ref in rloc
1721 // i,j,o
1722 return ((CTData*)m.uptr)->getBestCode();
1723 }
1724
1725 /*
1726 void findTMatches(GTrackLocus& loctrack, int qcount) {
1727 //perform an all vs. all ichain-match for all transcripts across all loctrack[i]->qloci
1728 for (int q=0;q<qcount-1;q++) { //for each qry dataset
1729 if (loctrack[q]==NULL) continue;
1730 for (int qi=0;qi<loctrack[q]->Count();qi++) { // for each transcript in q dataset
1731 GffObj* qi_t=loctrack[q]->Get(qi);
1732 CTData* qi_d=(CTData*)qi_t->uptr;
1733 if ((qi_d->eqdata & EQHEAD_TAG) !=0) { //this is set as an EQ chain head already
1734 //if (qi_t->exons.Count()>1)
1735 continue;
1736 }
1737 for (int n=q+1;n<qcount;n++) { // for every successor dataset
1738 if (loctrack[n]==NULL) continue;
1739 for (int ni=0;ni<loctrack[n]->Count();ni++) {
1740 GffObj* ni_t=loctrack[n]->Get(ni);
1741 CTData* ni_d=(CTData*)ni_t->uptr;
1742 //if (ni_d->eqdata!=0) continue; //already part of an EQ chain
1743 //single exon transfrags have special treatment:
1744 bool s_match=(ni_t->exons.Count()==1 && qi_t->exons.Count()==1);
1745 if (ni_d->eqdata!=0 && !s_match) continue; //already part of an EQ chain
1746 if (ni_d->eqnext!=NULL) continue;
1747 int ovlen=0;
1748
1749 if (qi_d->eqnext!=NULL) {
1750 if (!s_match) continue;
1751 //test all in the EQ list for a match
1752 bool matchFound=false;
1753 CTData* next_eq_d=qi_d;
1754 if (tMatch(*qi_t, *ni_t, ovlen, true)) {
1755 matchFound=true;
1756 }
1757 else {
1758 while (next_eq_d->eqnext!=NULL) {
1759 if (tMatch(*(next_eq_d->eqnext), *ni_t, ovlen, true)) {
1760 matchFound=true;
1761 break;
1762 }
1763 next_eq_d=(CTData*)(next_eq_d->eqnext->uptr);
1764 } //last in the chain
1765 }
1766 if (matchFound) {
1767 //add this to the end of the EQ chain instead
1768 next_eq_d=(CTData*)(qi_d->eqnext->uptr);
1769 while (next_eq_d->eqnext!=NULL) {
1770 next_eq_d=(CTData*)(next_eq_d->eqnext->uptr);
1771 } //last in the chain
1772 next_eq_d->eqnext=ni_t;
1773 ni_d->eqdata=n+1;
1774 ni_d->eqnext=NULL; ///TEST
1775 }
1776 }
1777 else {
1778 if (tMatch(*qi_t,*ni_t, ovlen, true)) {
1779 qi_d->eqnext=ni_t;
1780 ni_d->eqnext=NULL; ///TEST
1781 if (qi_d->eqdata == 0) {//only start of chain is tagged
1782 qi_d->eqdata = ((q+1) | EQHEAD_TAG);
1783 }
1784 ni_d->eqdata=n+1; //EQ chain member only marked with qry# (1-based)
1785 }
1786 } //multi-exon case
1787 } //for each transfrag in the next qry dataset
1788
1789 if (qi_d->eqnext!=NULL && qi_t->exons.Count()>1) break;
1790 //part of a chain already, skip other datasets
1791 } // for each successor dataset
1792 } //for each transcript in qry dataset
1793 } //for each qry dataset
1794 }
1795 */
1796
1797 void findTMatches(GTrackLocus& loctrack, int qcount) {
1798 //perform an all vs. all ichain-match for all transcripts across all loctrack[i]->qloci
1799 for (int q=0;q<qcount-1;q++) { //for each qry dataset
1800 if (loctrack[q]==NULL) continue;
1801 for (int qi=0;qi<loctrack[q]->Count();qi++) { // for each transcript in q dataset
1802 GffObj* qi_t=loctrack[q]->Get(qi);
1803 CTData* qi_d=(CTData*)qi_t->uptr;
1804 if (qi_d->eqlist!=NULL && qi_t->exons.Count()>1) {
1805 continue; //this is part of an EQ chain already
1806 }
1807 for (int n=q+1;n<qcount;n++) { // for every successor dataset
1808 if (loctrack[n]==NULL) continue;
1809 for (int ni=0;ni<loctrack[n]->Count();ni++) {
1810 GffObj* ni_t=loctrack[n]->Get(ni);
1811 CTData* ni_d=(CTData*)ni_t->uptr;
1812 bool singleExon=(ni_t->exons.Count()==1 && qi_t->exons.Count()==1);
1813 if (ni_d->eqlist!=NULL &&
1814 (ni_d->eqlist==qi_d->eqlist || !singleExon)) continue;
1815 int ovlen=0;
1816 if ((ni_d->eqlist==qi_d->eqlist && qi_d->eqlist!=NULL) ||
1817 tMatch(*qi_t,*ni_t, ovlen, singleExon)) {
1818 qi_d->joinEqList(ni_t);
1819 }
1820 }
1821 } // for each successor dataset
1822 } //for each transcript in qry dataset
1823 } //for each qry dataset
1824 }
1825
1826
1827 int cmpTData_qset(const pointer* p1, const pointer* p2) {
1828 CTData* d1=(CTData*)(((GffObj*)p1)->uptr);
1829 CTData* d2=(CTData*)(((GffObj*)p2)->uptr);
1830 return (d1->qset - d2->qset);
1831 }
1832
1833 void printITrack(FILE* ft, GList<GffObj>& mrnas, int qcount, int& cnum) {
1834 for (int i=0;i<mrnas.Count();i++) {
1835 GffObj& qt=*(mrnas[i]);
1836 CTData* qtdata=(CTData*)qt.uptr;
1837 int qfidx=qtdata->qset;
1838 char ovlcode=qtdata->classcode;
1839 //GList<GffObj> eqchain(false,false,false);
1840 CEqList* eqchain=qtdata->eqlist;
1841 GffObj* ref=NULL; //related ref -- it doesn't have to be fully matching
1842 GffObj* eqref=NULL; //fully ichain-matching ref
1843 GffObj* tcons=NULL; //"consensus" (largest) transcript for a clique
1844 int tmaxcov=0;
1845 //eqchain.Add(&qt);
1846 eqref=qtdata->eqref;
1847 if (qtdata->ovls.Count()>0 && qtdata->ovls[0]->mrna!=NULL) {
1848 //if it has ovlcode with a ref
1849 ref=qtdata->ovls[0]->mrna;
1850 //consistency check: qtdata->ovls[0]->code==ovlcode
1851 // -- let tcons be a transfrag, not a ref transcript
1852 //tcons=eqref;
1853 //if (tcons!=NULL) tmaxcov=tcons->covlen;
1854 }
1855 //chain pre-check
1856 if (tcons==NULL || mrnas[i]->covlen>tmaxcov) {
1857 tcons=mrnas[i];
1858 tmaxcov=tcons->covlen;
1859 }
1860 if (qtdata->isEqHead()) {//head of a equivalency chain
1861 //check if all transcripts in this chain have the same ovlcode
1862 for (int k=0;k<qtdata->eqlist->Count();k++) {
1863 GffObj* m=qtdata->eqlist->Get(k);
1864 if (m->covlen>tmaxcov) {
1865 tmaxcov=m->covlen;
1866 tcons=m;
1867 }
1868 if (ovlcode!='=' && ovlcode!='.' && ((CTData*)m->uptr)->getBestCode()!=ovlcode) {
1869 ovlcode='.'; //non-uniform ovlcode
1870 }
1871 }
1872 /*
1873 GffObj* m=mrnas[i];
1874 while (((CTData*)m->uptr)->eqnext!=NULL) {
1875 m=((CTData*)m->uptr)->eqnext;
1876 eqchain.Add(m);
1877 if (m->covlen>tmaxcov) {
1878 tmaxcov=m->covlen;
1879 tcons=m;
1880 }
1881 if (ovlcode!='=' && ovlcode!='.' && ((CTData*)m->uptr)->getBestCode()!=ovlcode) {
1882 ovlcode='.'; //non-uniform ovlcode
1883 //break;
1884 }
1885 } //while elements in chain
1886 */
1887
1888 }//chain check
1889 //if (ovlcode=='p') ref=NULL; //ignore polymerase runs?
1890 if (ovlcode==0 || ovlcode=='-') ovlcode = (ref==NULL) ? 'u' : '.';
1891 //-- print columns 1 and 2 as LOCUS_ID and TCONS_ID
1892 //bool chainHead=(qtdata->eqnext!=NULL && ((qtdata->eqdata & EQHEAD_TAG)!=0));
1893 bool chainHead=qtdata->isEqHead();
1894 //bool noChain=((qtdata->eqdata & EQCHAIN_TAGMASK)==0);
1895 bool noChain=(eqchain==NULL);
1896 if (chainHead || noChain) {
1897 cnum++;
1898 if (ft!=NULL) fprintf(ft,"%s_%08d\t",cprefix,cnum);
1899 GXLocus* xloc=qtdata->locus->xlocus;
1900 if (xloc!=NULL) {
1901 if (ft!=NULL) fprintf(ft, "XLOC_%06d\t",xloc->id);
1902 if (tcons->exons.Count()>1) {
1903 //! only multi-exon mRNAs are counted for multi-transcript xloci !
1904 xloc->num_mtcons++;
1905 if (xloc->num_mtcons==2)
1906 total_xloci_alt++;
1907 }
1908 }
1909 else {
1910 //should NEVER happen!
1911 int fidx=qtdata->qset;
1912 GError("Error: no XLocus created for transcript %s (file %s) [%d, %d], on %s%c:%d-%d\n", qt.getID(),
1913 qryfiles[qtdata->locus->qfidx]->chars(), qtdata->locus->qfidx, fidx, qt.getGSeqName(), qt.strand, qt.start, qt.end);
1914 }
1915 addXCons(xloc, ref, ovlcode, tcons, eqchain);
1916 } // if chain head or uniq entry (not part of a chain)
1917 if (ft==NULL) continue;
1918 if (chainHead) {
1919 //this is the start of a equivalence class as a printing chain
1920 if (ref!=NULL) fprintf(ft,"%s|%s\t%c", getGeneID(ref),ref->getID(), ovlcode);
1921 else fprintf(ft,"-\t%c", ovlcode);
1922 GffObj* m=mrnas[i];
1923 CTData* mdata=(CTData*)m->uptr;
1924
1925 int lastpq=-1;
1926 /*
1927 for (int ptab=mdata->qset-lastpq; ptab>0;ptab--)
1928 if (ptab>1) fprintf(ft,"\t-");
1929 else fprintf(ft,"\t");
1930 lastpq=mdata->qset;
1931 fprintf(ft,"q%d:%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|%d", lastpq+1, getGeneID(m), m->getID(),
1932 iround(m->gscore/10), mdata->FPKM, mdata->conf_lo, mdata->conf_hi, mdata->cov, m->covlen);
1933 //traverse linked list of matching transcripts
1934 while (mdata->eqnext!=NULL) {
1935 m=mdata->eqnext;
1936 mdata=(CTData*)m->uptr;
1937 for (int ptab=mdata->qset-lastpq;ptab>0;ptab--)
1938 if (ptab>1) fprintf(ft,"\t-");
1939 else fprintf(ft,"\t");
1940 lastpq = mdata->qset;
1941 fprintf(ft,"q%d:%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|%d", lastpq+1, getGeneID(m), m->getID(),
1942 iround(m->gscore/10), mdata->FPKM,mdata->conf_lo,mdata->conf_hi,mdata->cov, m->covlen);
1943 } //traverse and print row
1944 */
1945 eqchain->setUnique(false);
1946 eqchain->setSorted((GCompareProc*) cmpTData_qset);
1947
1948 for (int k=0;k<eqchain->Count();k++) {
1949 m=eqchain->Get(k);
1950 mdata=(CTData*)m->uptr;
1951 if (mdata->qset==lastpq) {
1952 //shouldn't happen, unless this input set is messed up (has duplicates/redundant transfrags)
1953 fprintf(ft,",%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|%d", getGeneID(m), m->getID(),
1954 iround(m->gscore/10), mdata->FPKM,mdata->conf_lo,mdata->conf_hi,mdata->cov, m->covlen);
1955 continue;
1956 }
1957 for (int ptab=mdata->qset-lastpq;ptab>0;ptab--)
1958 if (ptab>1) fprintf(ft,"\t-");
1959 else fprintf(ft,"\t");
1960 lastpq = mdata->qset;
1961 fprintf(ft,"q%d:%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|%d", lastpq+1, getGeneID(m), m->getID(),
1962 iround(m->gscore/10), mdata->FPKM,mdata->conf_lo,mdata->conf_hi,mdata->cov, m->covlen);
1963 }
1964 for (int ptab=qcount-lastpq-1;ptab>0;ptab--)
1965 fprintf(ft,"\t-");
1966 fprintf(ft,"\n");
1967 continue;
1968 } //start of eq class (printing chain)
1969
1970 if (eqchain!=NULL) continue; //part of a matching chain, dealt with previously
1971
1972 //--------- not in an ichain-matching class, print as singleton
1973
1974 if (ref!=NULL) fprintf(ft,"%s|%s\t%c",getGeneID(ref), ref->getID(), ovlcode);
1975 else fprintf(ft,"-\t%c",ovlcode);
1976 for (int ptab=qfidx;ptab>=0;ptab--)
1977 if (ptab>0) fprintf(ft,"\t-");
1978 else fprintf(ft,"\t");
1979 fprintf(ft,"q%d:%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|-",qfidx+1, getGeneID(qt), qt.getID(),iround(qt.gscore/10),
1980 qtdata->FPKM, qtdata->conf_lo,qtdata->conf_hi,qtdata->cov);
1981 for (int ptab=qcount-qfidx-1;ptab>0;ptab--)
1982 fprintf(ft,"\t-");
1983 fprintf(ft,"\n");
1984 } //for each transcript
1985 }
1986
1987
1988 void findTRMatch(GTrackLocus& loctrack, int qcount, GLocus& rloc) {
1989 //requires loctrack to be already populated with overlapping qloci by findTMatches()
1990 // which also found (and tagged) all matching qry transcripts
1991 for (int q=0;q<qcount;q++) { //for each qry dataset
1992 if (loctrack[q]==NULL) continue;
1993 for (int qi=0;qi<loctrack[q]->Count();qi++) { // for each transcript in q dataset
1994 //if (loctrack[q]->cl[qi]->exons.Count()<2) continue; //skip single-exon transcripts
1995 GffObj& qt=*(loctrack[q]->Get(qi));
1996 CTData* qtdata=(CTData*)qt.uptr;
1997 GffObj* rmatch=NULL; //== ref match for this row
1998 int rovlen=0;
1999 //if (qtdata->eqnext!=NULL && ((qtdata->eqdata & EQHEAD_TAG)!=0)) {
2000 if (qtdata->isEqHead()) {
2001 //EQ chain head -- transfrag equivalency list starts here
2002 if (qtdata->eqref==NULL) { //find rloc overlap
2003 if (qt.overlap(rloc.start, rloc.end)) {
2004 rmatch=findRefMatch(qt, rloc, rovlen);
2005 }
2006 } else rmatch=qtdata->eqref;
2007 if (rmatch!=NULL) {
2008 /*
2009 GffObj* m=loctrack[q]->Get(qi);
2010 //traverse linked list of matching transcripts
2011 while (((CTData*)m->uptr)->eqnext!=NULL) {
2012 m=((CTData*)m->uptr)->eqnext;
2013 if (rmatch!=NULL) {
2014 ((CTData*)m->uptr)->addOvl('=',rmatch,rovlen);
2015 }
2016 } //traverse qry data sets
2017 continue;
2018 }
2019 */
2020 for (int k=0;k<qtdata->eqlist->Count();k++) {
2021 GffObj* m=qtdata->eqlist->Get(k);
2022 ((CTData*)m->uptr)->addOvl('=',rmatch,rovlen);
2023 continue;
2024 }
2025 }
2026 //if (rmatch!=NULL) continue;
2027 } //equivalence class (chain of intron-matching)
2028 //if ((qtdata->eqdata & EQCHAIN_TAGMASK)!=0) continue; //part of a matching chain, dealt with previously
2029
2030 //--------- qry mrna not in a '=' matching clique
2031 if (qtdata->eqref==NULL) { //find any rloc overlap
2032 if (qt.overlap(rloc.start, rloc.end)) {
2033 rmatch=findRefMatch(qt, rloc, rovlen);
2034 if (rmatch==NULL) {
2035 //not an ichain match, look for other codes
2036 GffObj* rovl=NULL;
2037 int rovlen=0;
2038 //char ovlcode=
2039 getRefOvl(qt, rloc,rovl,rovlen);
2040 }
2041 }
2042 }
2043 else rmatch=qtdata->eqref;
2044 } //for each qry transcript
2045 }//for each qry dataset
2046 }
2047
2048
2049 bool inPolyRun(char strand, GffObj& m, GList<GLocus>* rloci, int& rlocidx) {
2050 //we are only here if there is no actual overlap between m and any locus in rloci
2051 if (rloci==NULL || rloci->Count()==0) return false; // || m.exons.Count()>1
2052 if (strand=='-') {
2053 rlocidx=qsearch_loci(m.end, *rloci);
2054 //returns index of locus starting just ABOVE m.end
2055 // or -1 if last locus start <= m.end
2056 GLocus* rloc=NULL;
2057 if (rlocidx<0) return false;
2058 while (rlocidx<rloci->Count()) {
2059 rloc=rloci->Get(rlocidx);
2060 if (rloc->start>m.end+polyrun_range) break;
2061 if (rloc->start+6>m.end) return true;
2062 rlocidx++;
2063 }
2064 }
2065 else { // strand == '+' (or '.' ?)
2066 rlocidx=qsearch_loci(m.end, *rloci);
2067 GLocus* rloc=NULL;
2068 //returns index of closest locus starting ABOVE m.end
2069 // or -1 if last locus start <= m.end
2070 if (rlocidx<0) rlocidx=rloci->Count(); //this may actually start below m.end
2071 while ((--rlocidx)>=0) {
2072 rloc=rloci->Get(rlocidx);
2073 if (m.start>rloc->start+GFF_MAX_LOCUS) break;
2074 if (m.start+6>rloc->end && m.start<rloc->end+polyrun_range) return true;
2075 }
2076 }
2077 return false;
2078 }
2079
2080 CTData* getBestOvl(GffObj& m) {
2081 //CTData* mdata=(CTData*)m.uptr;
2082 //return mdata->getBestCode();
2083 if ( ((CTData*)m.uptr)->ovls.Count()>0)
2084 return (CTData*)m.uptr;
2085 return NULL;
2086 }
2087
2088 void reclass_XStrand(GList<GffObj>& mrnas, GList<GLocus>* rloci) {
2089 //checking for relationship with ref transcripts on opposite strand
2090 if (rloci==NULL || rloci->Count()<1) return;
2091 int j=0;//current rloci index
2092 for (int i=0;i<mrnas.Count();i++) {
2093 GffObj& m=*mrnas[i];
2094 char ovlcode=((CTData*)m.uptr)->getBestCode();
2095 if (ovlcode>47 && strchr("=cjeo",ovlcode)!=NULL) continue;
2096 GLocus* rloc=rloci->Get(j);
2097 if (rloc->start>m.end) continue; //check next transfrag
2098 while (m.start>rloc->end && j+1<rloci->Count()) {
2099 j++;
2100 rloc=rloci->Get(j);
2101 }
2102 if (rloc->start>m.end) continue; //check next transfrag
2103 //m overlaps rloc:
2104 //check if m has a fuzzy intron overlap -> 's' (shadow, mapped on the wrong strand)
2105 // then if m is contained within an intron -> 'i'
2106 // otherwise it's just a plain cross-strand overlap: 'x'
2107 int jm=0;
2108 do { //while rloci overlap this transfrag (m)
2109 rloc=rloci->Get(j+jm);
2110 bool is_shadow=false;
2111 GffObj* sovl=NULL;
2112 bool is_intraintron=false;
2113 GffObj* iovl=NULL;
2114 if (rloc->introns.Count()>0) {
2115 for (int n=0;n<rloc->introns.Count();n++) {
2116 GISeg& rintron=rloc->introns[n];
2117 if (rintron.start>m.end) break;
2118 if (m.start>rintron.end) continue;
2119 //overlap between m and intron
2120 if (m.end<=rintron.end && m.start>=rintron.start) {
2121 is_intraintron=true;
2122 if (iovl==NULL || iovl->covlen<rintron.t->covlen) iovl=rintron.t;
2123 continue;
2124 }
2125 //check if any intron of m has a fuzz-match with rintron
2126 for (int e=1;e<m.exons.Count();e++) {
2127 GSeg mintron(m.exons[e-1]->end+1,m.exons[e]->start-1);
2128 if (rintron.coordMatch(&mintron,10)) {
2129 is_shadow=true;
2130 if (sovl==NULL || sovl->covlen<rintron.t->covlen) sovl=rintron.t;
2131 break;
2132 }
2133 } //for each m intron
2134 } //for each intron of rloc
2135 }//rloc has introns
2136 bool xcode=true;
2137 if (is_shadow) { ((CTData*)m.uptr)->addOvl('s', sovl); xcode=false; }
2138 // else
2139 if (ovlcode!='i' && is_intraintron) { ((CTData*)m.uptr)->addOvl('i', iovl); xcode=false; }
2140 if (xcode) {
2141 // just plain overlap, find the overlapping mrna in rloc
2142 GffObj* maxovl=NULL;
2143 int ovlen=0;
2144 GffObj* max_lovl=NULL; //max len ref transcript
2145 // having no exon overlap but simply range overlap (interleaved exons)
2146 for (int ri=0;ri<rloc->mrnas.Count();ri++) {
2147 if (!m.overlap(*(rloc->mrnas[ri]))) continue;
2148 int o=m.exonOverlapLen(*(rloc->mrnas[ri]));
2149 if (o>0) {
2150 if (o>ovlen) {
2151 ovlen=o;
2152 maxovl=rloc->mrnas[ri];
2153 }
2154 }
2155 else { //no exon overlap, but still overlapping (interleaved exons)
2156 if (max_lovl==NULL || max_lovl->covlen<rloc->mrnas[ri]->covlen)
2157 max_lovl=rloc->mrnas[ri];
2158 }
2159 }
2160 if (maxovl) ((CTData*)m.uptr)->addOvl('x',maxovl);
2161 else if (max_lovl) ((CTData*)m.uptr)->addOvl('x',max_lovl);
2162 } //'x'
2163 jm++;
2164 } while (j+jm<rloci->Count() && rloci->Get(j+jm)->overlap(m));
2165 } //for each transfrag
2166 }
2167
2168 void reclass_mRNAs(char strand, GList<GffObj>& mrnas, GList<GLocus>* rloci, GFaSeqGet *faseq) {
2169 int rlocidx=-1;
2170 for (int i=0;i<mrnas.Count();i++) {
2171 GffObj& m=*mrnas[i];
2172 char ovlcode=((CTData*)m.uptr)->getBestCode();
2173 //if (ovlcode=='u' || ovlcode=='i' || ovlcode==0) {
2174 if (ovlcode=='u' || ovlcode<47) {
2175 //check for overlaps with ref transcripts on the other strand
2176 if (m.exons.Count()==1 && inPolyRun(strand, m, rloci, rlocidx)) {
2177 ((CTData*)m.uptr)->addOvl('p',rloci->Get(rlocidx)->mrna_maxcov);
2178 }
2179 else { //check for repeat content
2180 if (faseq!=NULL) {
2181 int seqlen;
2182 char* seq=m.getSpliced(faseq, false, &seqlen);
2183 //get percentage of lowercase
2184 int numlc=0;
2185 for (int c=0;c<seqlen;c++) if (seq[c]>='a') numlc++;
2186 if (numlc > seqlen/2)
2187 ((CTData*)m.uptr)->addOvl('r');
2188 GFREE(seq);
2189 }
2190 }
2191 } //for unassigned class
2192 }//for each mrna
2193
2194 }
2195
2196 void reclassLoci(char strand, GList<GLocus>& qloci, GList<GLocus>* rloci, GFaSeqGet *faseq) {
2197 for (int ql=0;ql<qloci.Count();ql++) {
2198 reclass_mRNAs(strand, qloci[ql]->mrnas, rloci, faseq);
2199 //find closest upstream ref locus for this q locus
2200 } //for each locus
2201 }
2202
2203 //for a single genomic sequence, all qry data and ref data is stored in gtrack
2204 //check for all 'u' transfrags if they are repeat ('r') or polymerase run 'p' or anything else
2205 void umrnaReclass(int qcount, GSeqTrack& gtrack, FILE** ftr, GFaSeqGet* faseq=NULL) {
2206 for (int q=0;q<qcount;q++) {
2207 if (gtrack.qdata[q]==NULL) continue; //no transcripts in this q dataset for this genomic seq
2208 reclassLoci('+', gtrack.qdata[q]->loci_f, gtrack.rloci_f, faseq);
2209 reclassLoci('-', gtrack.qdata[q]->loci_r, gtrack.rloci_r, faseq);
2210 reclass_mRNAs('+', gtrack.qdata[q]->umrnas, gtrack.rloci_f, faseq);
2211 reclass_mRNAs('-', gtrack.qdata[q]->umrnas, gtrack.rloci_r, faseq);
2212 //and also check for special cases with cross-strand overlaps:
2213 reclass_XStrand(gtrack.qdata[q]->mrnas_f, gtrack.rloci_r);
2214 reclass_XStrand(gtrack.qdata[q]->mrnas_r, gtrack.rloci_f);
2215 // print all tmap data here here:
2216 for (int i=0;i<gtrack.qdata[q]->tdata.Count();i++) {
2217 CTData* mdata=gtrack.qdata[q]->tdata[i];
2218 if (mdata->mrna==NULL) continue; //invalidated -- removed earlier
2219 //GLocus* rlocus=NULL;
2220 mdata->classcode='u';
2221 GffObj* ref=NULL;
2222 if (mdata->ovls.Count()>0) {
2223 mdata->classcode=mdata->ovls[0]->code;
2224 ref=mdata->ovls[0]->mrna;
2225 }
2226 //if (mdata->classcode<33) mdata->classcode='u';
2227 if (mdata->classcode<47) mdata->classcode='u'; // if 0, '-' or '.'
2228 if (tmapFiles) {
2229 char ref_match_len[2048];
2230 if (ref!=NULL) {
2231 sprintf(ref_match_len, "%d",ref->covlen);
2232 fprintf(ftr[q],"%s\t%s\t",getGeneID(ref),ref->getID());
2233 //rlocus=((CTData*)(ref->uptr))->locus;
2234 }
2235 else {
2236 fprintf(ftr[q],"-\t-\t");
2237 strcpy(ref_match_len, "-");
2238 }
2239 //fprintf(ftr[q],"%c\t%s\t%d\t%8.6f\t%8.6f\t%d\n", ovlcode, mdata->mrna->getID(),
2240 // iround(mdata->mrna->gscore/10), mdata->FPKM, mdata->cov, mdata->mrna->covlen);
2241 const char* mlocname = (mdata->locus!=NULL) ? mdata->locus->mrna_maxcov->getID() : mdata->mrna->getID();
2242 fprintf(ftr[q],"%c\t%s\t%s\t%d\t%8.6f\t%8.6f\t%8.6f\t%8.6f\t%d\t%s\t%s\n", mdata->classcode, getGeneID(mdata->mrna), mdata->mrna->getID(),
2243 iround(mdata->mrna->gscore/10), mdata->FPKM, mdata->conf_lo,mdata->conf_hi, mdata->cov, mdata->mrna->covlen, mlocname, ref_match_len);
2244 }
2245 } //for each tdata
2246 } //for each qdata
2247 }
2248
2249 void buildXLoci(GTrackLocus& loctrack, int qcount, GSeqTrack& gtrack, char strand,
2250 GList<GXLocus>* retxloci=NULL) {
2251 GList<GXLocus>* dest_xloci=NULL;
2252 GList<GXLocus> tmpxloci(true,false,true); //local set of newly created xloci
2253 GList<GXLocus>* xloci=&tmpxloci;
2254 if (strand=='+') {
2255 dest_xloci=& gtrack.xloci_f;
2256 }
2257 else if (strand=='-') {
2258 dest_xloci = & gtrack.xloci_r;
2259 }
2260 else dest_xloci= & gtrack.xloci_u;
2261
2262 if (retxloci==NULL) {
2263 //if no return set of build xloci was given
2264 //take it as a directive to work directly on the global xloci
2265 xloci=dest_xloci;
2266 dest_xloci=NULL;
2267 }
2268 for (int q=-1;q<qcount;q++) {
2269 GList<GLocus>* wrkloci=NULL;
2270 if (q<0) {
2271 if (loctrack.rloci.Count()==0) continue;
2272 //loci=new GList<GLocus>(true,false,false);
2273 //loci->Add(loctrack.rloc);
2274 wrkloci = &(loctrack.rloci);
2275 }
2276 else {
2277 if (loctrack[q]==NULL) continue;
2278 wrkloci = &(loctrack[q]->qloci);
2279 }
2280
2281 for (int t=0;t<wrkloci->Count();t++) {
2282 GLocus* loc=wrkloci->Get(t);
2283 int xfound=0; //count of parent xloci
2284 if (loc->xlocus!=NULL) continue; //already assigned a superlocus
2285 GArray<int> mrgxloci(true);
2286 for (int xl=0;xl<xloci->Count();xl++) {
2287 GXLocus& xloc=*(xloci->Get(xl));
2288 if (xloc.start>loc->end) {
2289 if (xloc.start-loc->end > GFF_MAX_LOCUS) break;
2290 continue;
2291 }
2292 if (loc->start>xloc.end) continue;
2293 if (xloc.add_Locus(loc)) {
2294 xfound++;
2295 mrgxloci.Add(xl);
2296 }
2297 } //for each existing Xlocus
2298 if (xfound==0) {
2299 xloci->Add(new GXLocus(loc));
2300 }
2301 else {
2302 int il=mrgxloci[0];
2303 GXLocus& xloc=*(xloci->Get(il));
2304 if (xfound>1) {
2305 for (int l=1;l<xfound;l++) {
2306 int mlidx=mrgxloci[l]-l+1;
2307 xloc.addMerge(*(xloci->Get(mlidx)));
2308 GXLocus* ldel=xloci->Get(mlidx);
2309 xloci->Delete(mlidx);
2310 if (retxloci!=NULL)
2311 delete ldel;
2312 }
2313 }
2314 //in case xloc.start was decreased, bubble-down until it's in the proper order
2315 while (il>0 && xloc<*(xloci->Get(il-1))) {
2316 il--;
2317 xloci->Swap(il,il+1);
2318 }
2319 } //at least one locus is being merged
2320 }//for each locus
2321 }//for each set of loci in the region (refs and each qry set)
2322 //-- add xloci to the global set of xloci unless retxloci was given,
2323 if (retxloci!=NULL) retxloci->Add(*xloci);
2324 else dest_xloci->Add(*xloci);
2325 }
2326
2327 void singleQData(GList<GLocus>& qloci, GList<GTrackLocus>& loctracks) {
2328 for (int i=0;i<qloci.Count();i++) {
2329 if (qloci[i]->t_ptr==NULL) {
2330 GTrackLocus* tloc=new GTrackLocus();
2331 tloc->addQLocus(qloci[i],0);
2332 loctracks.Add(tloc);
2333 }
2334 }
2335 }
2336
2337 void recheckUmrnas(GSeqData* gseqdata, GList<GffObj>& mrnas,
2338 GList<GLocus>& loci, GList<GLocus>& nloci, GList<GLocus>& oloci) {
2339 GList<GLocus> reassignedLocs(false,false);
2340 for (int u=0;u<gseqdata->umrnas.Count();u++) {
2341 for (int l=0;l<oloci.Count();l++) {
2342 if (gseqdata->umrnas[u]==NULL) break;
2343 if (gseqdata->umrnas[u]->end<oloci[l]->start) break; //try next umrna
2344 if (oloci[l]->end<gseqdata->umrnas[u]->start) continue; //try next locus
2345 if (gseqdata->umrnas[u]->strand=='+' || gseqdata->umrnas[u]->strand=='-') {
2346 gseqdata->umrnas.Forget(u);
2347 continue; //already reassigned earlier
2348 }
2349 //umrna overlaps locus region
2350 GffObj* umrna=gseqdata->umrnas[u];
2351 for (int m=0;m<oloci[l]->mrnas.Count();m++) {
2352 if (oloci[l]->mrnas[m]->exonOverlap(umrna)) {
2353 gseqdata->umrnas.Forget(u);
2354 CTData* umdata=((CTData*)umrna->uptr);
2355 //must be in a Loci anyway
2356 if (umdata==NULL || umdata->locus==NULL)
2357 GError("Error: no locus pointer for umrna %s!\n",umrna->getID());
2358 for (int i=0;i<umdata->locus->mrnas.Count();i++) {
2359 GffObj* um=umdata->locus->mrnas[i];
2360 um->strand=oloci[l]->mrnas[m]->strand;
2361 }
2362 reassignedLocs.Add(umdata->locus);
2363 break;
2364 }
2365 } //for each mrna in locus
2366 } //for each locus
2367 } //for each umrna
2368 if (reassignedLocs.Count()>0) {
2369 gseqdata->umrnas.Pack();
2370 gseqdata->nloci_u.setFreeItem(false);
2371 for (int i=0;i<reassignedLocs.Count();i++) {
2372 GLocus* loc=reassignedLocs[i];
2373 for (int m=0;m<loc->mrnas.Count();m++) {
2374 mrnas.Add(loc->mrnas[m]);
2375 }
2376 loci.Add(loc);
2377 nloci.Add(loc);
2378 gseqdata->nloci_u.Remove(loc);
2379 }
2380 gseqdata->nloci_u.setFreeItem(true);
2381 }
2382 }
2383
2384 void umrnasXStrand(GList<GXLocus>& xloci, GSeqTrack& gtrack) {
2385 //try to determine the strand of unoriented transfrags based on possible overlaps
2386 //with other, oriented transfrags
2387 for (int x=0;x<xloci.Count();x++) {
2388 if (xloci[x]->strand=='.') continue;
2389 if (xloci[x]->qloci.Count()==0) continue;
2390 //go through all qloci in this xlocus
2391 for (int l = 0; l < xloci[x]->qloci.Count(); l++) {
2392 char locstrand=xloci[x]->qloci[l]->mrna_maxcov->strand;
2393 if (locstrand=='.') {
2394 //this is a umrna cluster
2395 GLocus* qloc=xloci[x]->qloci[l];
2396 //we don't really need to update loci lists (loci_f, nloci_f etc.)
2397 /*
2398 if (xloci[x]->strand=='+') {
2399 }
2400 else { // - strand
2401 }
2402 */
2403 for (int i=0;i<qloc->mrnas.Count();i++) {
2404 qloc->mrnas[i]->strand=xloci[x]->strand;
2405 int uidx=gtrack.qdata[qloc->qfidx]->umrnas.IndexOf(qloc->mrnas[i]);
2406 if (uidx>=0) {
2407 gtrack.qdata[qloc->qfidx]->umrnas.Forget(uidx);
2408 gtrack.qdata[qloc->qfidx]->umrnas.Delete(uidx);
2409 if (xloci[x]->strand=='+')
2410 gtrack.qdata[qloc->qfidx]->mrnas_f.Add(qloc->mrnas[i]);
2411 else
2412 gtrack.qdata[qloc->qfidx]->mrnas_r.Add(qloc->mrnas[i]);
2413 }
2414 }
2415 } //unknown strand
2416 } //for each xloci[x].qloci (l)
2417
2418 } //for each xloci (x)
2419 }
2420
2421 //cluster loci across all datasets
2422 void xclusterLoci(int qcount, char strand, GSeqTrack& gtrack) {
2423 //gtrack holds data for all input qry datasets for a chromosome/contig
2424 //cluster QLoci
2425 GList<GTrackLocus> loctracks(true,true,false);
2426 //all vs all clustering across all qry data sets + ref
2427 //one-strand set of loci from all datasets + ref loci
2428 GList<GLocus>* wrkloci=NULL;
2429 //build xloci without references first
2430 //then add references only if they overlap an existing xloci
2431
2432 int nq=0;
2433 for (int q=0;q<=qcount+1;q++) {
2434 bool refcheck=false;
2435 if (q==qcount) { // check the unoriented loci for each query file
2436 while (nq<qcount &&
2437 (gtrack.qdata[nq]==NULL || gtrack.qdata[nq]->nloci_u.Count()==0))
2438 nq++; //skip query files with no unoriented loci
2439 if (nq<qcount) {
2440 wrkloci=&(gtrack.qdata[nq]->nloci_u);
2441 nq++;
2442 if (nq<qcount) q--; //so we can fetch the next nq in the next q cycle
2443 }
2444 else continue; //no more q files with unoriented loci
2445 }
2446 else if (q==qcount+1) { // check the reference loci
2447 if (strand=='+') wrkloci=gtrack.rloci_f;
2448 else wrkloci=gtrack.rloci_r;
2449
2450 if (wrkloci==NULL) break; //no ref loci here
2451 refcheck=true;
2452 }
2453 else {
2454 if (gtrack.qdata[q]==NULL) continue;
2455 if (strand=='+') wrkloci=&(gtrack.qdata[q]->loci_f);
2456 else wrkloci=&(gtrack.qdata[q]->loci_r);
2457 }
2458 // now do the all-vs-all clustering thing:
2459 for (int t=0;t<wrkloci->Count();t++) {
2460 GLocus* loc=wrkloci->Get(t);
2461 int xfound=0; //count of parent loctracks
2462 if (loc->t_ptr!=NULL) continue; //already assigned a loctrack
2463 GArray<int> mrgloctracks(true);
2464 for (int xl=0;xl<loctracks.Count();xl++) {
2465 GTrackLocus& trackloc=*loctracks[xl];
2466 if (trackloc.start>loc->end) break;
2467 if (loc->start>trackloc.end) continue;
2468 if (trackloc.add_Locus(loc)) {
2469 xfound++;
2470 mrgloctracks.Add(xl);
2471 }
2472 } //for each existing Xlocus
2473 if (xfound==0) {
2474 if (!refcheck) //we really don't care about ref-only clusters
2475 loctracks.Add(new GTrackLocus(loc));
2476 }
2477 else {
2478 int il=mrgloctracks[0];
2479 GTrackLocus& tloc=*(loctracks.Get(il));
2480 if (xfound>1) {
2481 for (int l=1;l<xfound;l++) {
2482 int mlidx=mrgloctracks[l]-l+1;
2483 tloc.addMerge(loctracks[mlidx], qcount, loc);
2484 loctracks.Delete(mlidx);
2485 }
2486 }
2487 //in case tloc.start was decreased, bubble-down 'til it's in the proper place
2488 while (il>0 && tloc<*(loctracks[il-1])) {
2489 il--;
2490 loctracks.Swap(il,il+1);
2491 }
2492 } //at least one locus found
2493 }//for each wrklocus
2494 } //for each set of loci (q)
2495 //loctracks is now set with all x-clusters on this strand
2496 for (int i=0;i<loctracks.Count();i++) {
2497 if (!loctracks[i]->hasQloci) continue; //we really don't care here about reference-only clusters
2498 GTrackLocus& loctrack=*loctracks[i];
2499 findTMatches(loctrack, qcount); //find matching transfrags in this xcluster
2500 for (int rl=0; rl < loctrack.rloci.Count(); rl++) {
2501 findTRMatch(loctrack, qcount, *(loctrack.rloci[rl]));
2502 //find matching reference annotation for this xcluster and assign class codes to transfrags
2503 }
2504 GList<GXLocus> xloci(false,false,false);
2505 buildXLoci(loctrack, qcount, gtrack, strand, &xloci);
2506 //the newly created xloci are in xloci
2507 umrnasXStrand(xloci, gtrack);
2508 //also merge these xloci into the global list of xloci
2509 for (int l=0; l < xloci.Count(); l++) {
2510 if (xloci[l]->strand=='+') {
2511 gtrack.xloci_f.Add(xloci[l]);
2512 }
2513 else if (xloci[l]->strand=='-') {
2514 gtrack.xloci_r.Add(xloci[l]);
2515 }
2516 else gtrack.xloci_u.Add(xloci[l]);
2517 }
2518 }//for each xcluster
2519 }
2520
2521
2522 void printRefMap(FILE** frs, int qcount, GList<GLocus>* rloci) {
2523 if (rloci==NULL) return;
2524
2525 for (int l=0;l<rloci->Count(); l++) {
2526 for (int r=0;r<rloci->Get(l)->mrnas.Count(); r++) {
2527 GffObj& ref = *(rloci->Get(l)->mrnas[r]);
2528 CTData* refdata = ((CTData*)ref.uptr);
2529 GStr* clist = new GStr[qcount];
2530 GStr* eqlist = new GStr[qcount];
2531 for (int i = 0; i<refdata->ovls.Count(); i++) {
2532 GffObj* m=refdata->ovls[i]->mrna;
2533 char ovlcode=refdata->ovls[i]->code;
2534 if (m==NULL) {
2535 GMessage("Warning: NULL mRNA found for ref %s with ovlcode '%c'\n",
2536 ref.getID(), refdata->ovls[i]->code);
2537 continue;
2538 }
2539 int qfidx = ((CTData*)m->uptr)->qset;
2540 if (ovlcode == '=') {
2541 eqlist[qfidx].append(getGeneID(m));
2542 eqlist[qfidx].append('|');
2543 eqlist[qfidx].append(m->getID());
2544 eqlist[qfidx].append(',');
2545 }
2546 else if (ovlcode == 'c') {
2547 clist[qfidx].append(getGeneID(m));
2548 clist[qfidx].append('|');
2549 clist[qfidx].append(m->getID());
2550 clist[qfidx].append(',');
2551 }
2552 }//for each reference overlap
2553 for (int q=0;q<qcount;q++) {
2554 if (!eqlist[q].is_empty()) {
2555 eqlist[q].trimR(',');
2556 fprintf(frs[q],"%s\t%s\t=\t%s\n", getGeneID(ref), ref.getID(),eqlist[q].chars());
2557 }
2558 if (!clist[q].is_empty()) {
2559 clist[q].trimR(',');
2560 fprintf(frs[q],"%s\t%s\tc\t%s\n",getGeneID(ref), ref.getID(),clist[q].chars());
2561 }
2562 }
2563 delete[] clist;
2564 delete[] eqlist;
2565 }// ref loop
2566 }//ref locus loop
2567 }
2568
2569
2570
2571 void trackGData(int qcount, GList<GSeqTrack>& gtracks, GStr& fbasename, FILE** ftr, FILE** frs) {
2572 FILE* f_ltrack=NULL;
2573 FILE* f_itrack=NULL;
2574 FILE* f_ctrack=NULL;
2575 FILE* f_xloci=NULL;
2576 int cnum=0; //consensus numbering for printITrack()
2577 GStr s=fbasename;
2578 //if (qcount>1 || generic_GFF) { //doesn't make much sense for only 1 query file
2579 s.append(".tracking");
2580 f_itrack=fopen(s.chars(),"w");
2581 if (f_itrack==NULL) GError("Error creating file %s !\n",s.chars());
2582 // }
2583 s=fbasename;
2584 s.append(".combined.gtf");
2585 f_ctrack=fopen(s.chars(),"w");
2586 if (f_ctrack==NULL) GError("Error creating file %s !\n",s.chars());
2587
2588 s=fbasename;
2589 s.append(".loci");
2590 f_xloci=fopen(s.chars(),"w");
2591 if (f_xloci==NULL) GError("Error creating file %s !\n",s.chars());
2592 for (int g=0;g<gtracks.Count();g++) { //for each genomic sequence
2593 GSeqTrack& gseqtrack=*gtracks[g];
2594
2595 xclusterLoci(qcount, '+', gseqtrack);
2596 xclusterLoci(qcount, '-', gseqtrack);
2597
2598 //count XLoci, setting their id
2599 numXLoci(gseqtrack.xloci_f, xlocnum);
2600 numXLoci(gseqtrack.xloci_r, xlocnum);
2601 numXLoci(gseqtrack.xloci_u, xlocnum);
2602 //transcript accounting: for all those transcripts with 'u' or 0 class code
2603 // we have to check for polymerase runs 'p' or repeats 'r'
2604
2605 GFaSeqGet *faseq=gfasta.fetch(gseqtrack.get_gseqid(), checkFasta);
2606
2607 umrnaReclass(qcount, gseqtrack, ftr, faseq);
2608
2609 // print transcript tracking (ichain_tracking)
2610 //if (qcount>1)
2611 for (int q=0;q<qcount;q++) {
2612 if (gseqtrack.qdata[q]==NULL) continue;
2613 printITrack(f_itrack, gseqtrack.qdata[q]->mrnas_f, qcount, cnum);
2614 printITrack(f_itrack, gseqtrack.qdata[q]->mrnas_r, qcount, cnum);
2615 //just for the sake of completion:
2616 printITrack(f_itrack, gseqtrack.qdata[q]->umrnas, qcount, cnum);
2617 }
2618 //print XLoci and XConsensi within each xlocus
2619 //also TSS clustering and protein ID assignment for XConsensi
2620 printXLoci(f_xloci, f_ctrack, qcount, gseqtrack.xloci_f, faseq);
2621 printXLoci(f_xloci, f_ctrack, qcount, gseqtrack.xloci_r, faseq);
2622 printXLoci(f_xloci, f_ctrack, qcount, gseqtrack.xloci_u, faseq);
2623 if (tmapFiles && haveRefs) {
2624 printRefMap(frs, qcount, gseqtrack.rloci_f);
2625 printRefMap(frs, qcount, gseqtrack.rloci_r);
2626 }
2627 delete faseq;
2628 }
2629 if (tmapFiles) {
2630 for (int q=0;q<qcount;q++) {
2631 fclose(ftr[q]);
2632 if (haveRefs) fclose(frs[q]);
2633 }
2634 }
2635 if (f_ltrack!=NULL) fclose(f_ltrack);
2636 if (f_itrack!=NULL) fclose(f_itrack);
2637 if (f_ctrack!=NULL) fclose(f_ctrack);
2638 if (f_xloci!=NULL) fclose(f_xloci);
2639
2640 }

Properties

Name Value
svn:executable *