ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/cuffcompare/cuffcompare.cpp
Revision: 72
Committed: Tue Sep 27 01:20:21 2011 UTC (8 years ago) by gpertea
File size: 98271 byte(s)
Log Message:
reversed gene_id requirement for TSS clustering

Line File contents
1 #ifdef HAVE_CONFIG_H
2 #include <config.h>
3 #else
4 #define PACKAGE_VERSION "INTERNAL"
5 #define SVN_REVISION "SVN"
6 #endif
7
8 #include "GArgs.h"
9 #include <ctype.h>
10 #include <errno.h>
11 #include "gtf_tracking.h"
12
13 #ifdef HAVE_CONFIG_H
14 #include "update_check.h"
15 #endif
16
17 #define USAGE "Usage:\n\
18 cuffcompare [-r <reference_mrna.gtf>] [-R] [-T] [-V] [-s <seq_path>] \n\
19 [-o <outprefix>] [-p <cprefix>] \n\
20 {-i <input_gtf_list> | <input1.gtf> [<input2.gtf> .. <inputN.gtf>]}\n\
21 \n\
22 Cuffcompare provides classification, reference annotation mapping and various\n\
23 statistics for Cufflinks transfrags.\n\
24 Cuffcompare clusters and tracks transfrags across multiple samples, writing\n\
25 matching transcripts (intron chains) into <outprefix>.tracking, and a GTF\n\
26 file <outprefix>.combined.gtf containing a nonredundant set of transcripts \n\
27 across all input files (with a single representative transfrag chosen\n\
28 for each clique of matching transfrags across samples).\n\
29 \n\
30 Options:\n\
31 -i provide a text file with a list of Cufflinks GTF files to process instead\n\
32 of expecting them as command line arguments (useful when a large number\n\
33 of GTF files should be processed)\n\
34 \n\
35 -r a set of known mRNAs to use as a reference for assessing \n\
36 the accuracy of mRNAs or gene models given in <input.gtf>\n\
37 \n\
38 -R for -r option, reduce the set of reference transcripts to \n\
39 only those found to overlap any of the input loci\n\
40 -M discard (ignore) single-exon transfrags and reference transcripts\n\
41 -N discard (ignore) single-exon reference transcripts\n\
42 \n\
43 -s <seq_path> can be a multi-fasta file with all the genomic sequences or \n\
44 a directory containing multiple single-fasta files (one file per contig);\n\
45 lower case bases will be used to classify input transcripts as repeats\n\
46 \n\
47 -d max distance (range) for grouping transcript start sites (100)\n\
48 -p the name prefix to use for consensus transcripts in the \n\
49 <outprefix>.combined.gtf file (default: 'TCONS')\n\
50 -C include the \"contained\" transcripts in the .combined.gtf file\n\
51 -G generic GFF input file(s) (do not assume Cufflinks GTF)\n\
52 -T do not generate .tmap and .refmap files for each input file\n\
53 -V verbose processing mode (showing all GFF parsing warnings)\n\
54 "
55 bool debug=false;
56 bool perContigStats=false; // -S to enable stats for every single contig
57 bool generic_GFF=false; //-G, don't assume Cufflinks GTF as input
58 bool showContained=false; // -C
59 bool reduceRefs=false;
60 bool checkFasta=false;
61 bool tmapFiles=true;
62 bool only_spliced_refs=false;
63 int debugCounter=0;
64
65 int polyrun_range=2000; //polymerase run range 2KB
66 double scoreThreshold=0;
67 double exprThreshold=0;
68 char* cprefix=NULL;
69 FILE* ffasta=NULL; //genomic seq file
70 FILE *f_ref=NULL; //reference mRNA GFF, if provided
71 FILE* f_in=NULL; //sequentially, each input GFF file
72 FILE* f_out=NULL; //stdout if not provided
73 GFastaHandler gfasta;
74 int xlocnum=0;
75 int tsscl_num=0; //for tss cluster IDs
76 int protcl_num=0; //for "unique" protein IDs within TSS clusters
77 int tssDist=100;
78 //int total_tcons=0;
79 int total_xloci_alt=0;
80
81 void openfwrite(FILE* &f, GArgs& args, char opt) {
82 GStr s=args.getOpt(opt);
83 if (!s.is_empty()) {
84 if (s=='-')
85 f=stdout;
86 else {
87 f=fopen(s,"w");
88 if (f==NULL) GError("Error creating file: %s\n", s.chars());
89 }
90 }
91 }
92
93 //-- structure to keep track of data from multiple qry input files for a single genomic seq
94 class GSeqTrack {
95 int gseq_id;
96 public:
97 const char* gseq_name;
98 GList<GLocus>* rloci_f; //reference loci for this genomic sequence
99 GList<GLocus>* rloci_r;
100 GList<GXLocus> xloci_f; // extended super-loci across all qry datasets
101 GList<GXLocus> xloci_r; // extended super-loci across all qry datasets
102 GList<GXLocus> xloci_u; // extended super-loci across all qry datasets
103 GSeqData* qdata[MAX_QFILES]; //fixed order array with GSeqData for each qry input
104 //element in array is NULL if a qry file has no transcripts on this genomic sequence
105 int get_gseqid() { return gseq_id; }
106 GSeqTrack(int gid=-1):xloci_f(true,true,false),
107 xloci_r(true,true,false), xloci_u(true,true,false) {
108 gseq_id=gid;
109 if (gseq_id>=0) {
110 gseq_name=GffObj::names->gseqs.getName(gseq_id);
111 }
112 rloci_f=NULL;
113 rloci_r=NULL;
114 for (int i=0;i<MAX_QFILES;i++) qdata[i]=NULL;
115 }
116 bool operator==(GSeqTrack& d){
117 return (gseq_id==d.gseq_id);
118 }
119 bool operator>(GSeqTrack& d){
120 return (gseq_id>d.gseq_id);
121 }
122 bool operator<(GSeqTrack& d){
123 return (gseq_id<d.gseq_id);
124 }
125 };
126
127 char* getFastaFile(int gseq_id);
128
129 // ref globals
130 bool haveRefs=false; //true if a reference annotation (-r) is provide
131
132 GList<GSeqData> ref_data(true,true,true); //list of reference mRNAs and loci data for each genomic seq
133 //each locus will keep track of any superloci which includes it, formed during the analysis
134
135 void processLoci(GSeqData& seqdata, GSeqData* refdata=NULL, int qfidx=0);
136
137 void reportStats(FILE* fout, const char* setname, GSuperLocus& stotal,
138 GSeqData* seqdata=NULL, GSeqData* refdata=NULL);
139
140 GSeqData* getQryData(int gid, GList<GSeqData>& qdata);
141 void trackGData(int qcount, GList<GSeqTrack>& gtracks, GStr& fbasename, FILE** ftr, FILE** frs);
142
143 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
144 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
145
146 FILE* f_mintr=NULL; //missed ref introns
147
148 bool multiexon_only=false;
149 bool multiexonrefs_only=false;
150
151 GHash<GStr> refdescr;
152 void loadRefDescr(const char* fname);
153
154 GList<GStr> qryfiles(false,true,false);
155
156 //list of GSeqTrack data, sorted by gseq_id
157 GList<GSeqTrack> gseqtracks(true,true,true);
158 GSeqTrack* findGSeqTrack(int gsid);
159
160
161 int cmpGTrackByName(const pointer p1, const pointer p2) {
162 return strcmp(((GSeqTrack*)p1)->gseq_name, ((GSeqTrack*)p2)->gseq_name);
163 }
164
165
166 void show_usage() {
167 GMessage("cuffcompare v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
168 GMessage( "-----------------------------\n");
169 GMessage("%s\n", USAGE);
170 }
171
172 int main(int argc, char * const argv[]) {
173 GArgs args(argc, argv, "XDTMNVGSCKRLhp:c:d:s:i:n:r:o:");
174 int e;
175 if ((e=args.isError())>0) {
176 show_usage();
177 GMessage("Invalid argument: %s\n", argv[e]);
178 exit(1);
179 }
180 if (args.getOpt('h')!=NULL){
181 show_usage();
182 exit(1);
183 }
184 showContained=(args.getOpt('C')!=NULL);
185 debug=(args.getOpt('D')!=NULL);
186 tmapFiles=(args.getOpt('T')==NULL);
187 multiexon_only=(args.getOpt('M')!=NULL);
188 multiexonrefs_only=(args.getOpt('N')!=NULL);
189 perContigStats=(args.getOpt('S')!=NULL);
190 checkFasta=(args.getOpt('K')!=NULL);
191 gtf_tracking_verbose=((args.getOpt('V')!=NULL) || debug);
192 FILE* finlst=NULL;
193 GStr s=args.getOpt('i');
194 if (!s.is_empty()) {
195 if (s=='-')
196 finlst=stdin;
197 else {
198 finlst=fopen(s,"r");
199 if (finlst==NULL) GError("Error opening file: %s\n", s.chars());
200 }
201 }
202 int numqryfiles=0;
203 if (finlst) {
204 GLineReader* lr=new GLineReader(finlst);
205 char* l=NULL;
206 while ((l=lr->getLine())!=NULL) {
207 if (strlen(l)<2 || startsWith(l,"# ") || isspace(*l)) continue;
208 if (!fileExists(l)) GError("Error: cannot locate input file: %s\n", l);
209 qryfiles.Add(new GStr(l));
210 }
211 delete lr;
212 //if (qryfiles.Count()>10)
213 gtf_tracking_largeScale=true;
214 }
215 else {
216 numqryfiles=args.startNonOpt();
217 char *infile=NULL;
218 if (numqryfiles>0) {
219 while ((infile=args.nextNonOpt())!=NULL) {
220 if (!fileExists(infile)) GError("Error: cannot locate input file: %s\n", infile);
221 qryfiles.Add(new GStr(infile));
222 } //for each argument
223 }
224 }
225 numqryfiles=qryfiles.Count();
226 if (numqryfiles==0) {
227 show_usage();
228 exit(1);
229 }
230 if (numqryfiles>MAX_QFILES) {
231 GMessage("Error: too many input files (limit set to %d at compile time)\n",MAX_QFILES);
232 GMessage("(if you need to raise this limit set a new value for\nMAX_QFILES in gtf_tracking.h and recompile)\n");
233 exit(0x5000);
234 }
235 #ifdef HAVE_CONFIG_H
236 check_version(PACKAGE_VERSION);
237 #endif
238 gfasta.init(args.getOpt('s'));
239 // determine if -s points to a multi-fasta file or a directory
240 s=args.getOpt('c');
241 if (!s.is_empty()) scoreThreshold=s.asReal();
242 s=args.getOpt('p');
243 if (!s.is_empty()) cprefix=Gstrdup(s.chars());
244 else cprefix=Gstrdup("TCONS");
245 s=args.getOpt('e');
246 if (!s.is_empty()) exprThreshold=s.asReal();
247 s=args.getOpt('d');
248 if (!s.is_empty()) {
249 tssDist=s.asInt();
250 }
251
252 s=args.getOpt('n');
253 if (!s.is_empty()) loadRefDescr(s.chars());
254 s=args.getOpt('r');
255 if (!s.is_empty()) {
256 f_ref=fopen(s,"r");
257 if (f_ref==NULL) GError("Error opening reference gff: %s\n",s.chars());
258 haveRefs=true;
259 if (gtf_tracking_verbose) GMessage("Loading reference transcripts..\n");
260 read_mRNAs(f_ref, ref_data, &ref_data, true, -1, s.chars(), (multiexonrefs_only || multiexon_only));
261 haveRefs=(ref_data.Count()>0);
262 reduceRefs=(args.getOpt('R')!=NULL);
263 if (gtf_tracking_verbose) GMessage("..reference annotation loaded\n");
264 }
265 bool discard_redundant=true; //discard redundant input transfrags
266 generic_GFF=args.getOpt('G');
267 if (generic_GFF) discard_redundant=false; //generic GTF, don't try to discard "redundant" transcripts
268 //if a full pathname is given
269 //the other common output files will still be created in the current directory:
270 // .loci, .tracking, .stats
271 GStr outbasename; //include path, if provided
272 GStr outprefix; //without path and/or extension
273 GStr outstats=args.getOpt('o');
274 if (outstats.is_empty() || outstats=="-") {
275 outstats="cuffcmp";
276 }
277 outbasename=outstats;
278 GStr outext(getFileExt(outstats.chars()));
279 if (outext.is_empty()) {
280 outext="stats";
281 outstats.append(".stats");
282 outbasename=outstats;
283 }
284 else outext.lower();
285 if (outext=="txt" || outext=="out" || outext=="stats" || outext=="summary") {
286 outbasename.cut(outbasename.length()-outext.length()-1);
287 }
288
289 outprefix=outbasename;
290 int di=outprefix.rindex(CHPATHSEP);
291 if (di>=0) outprefix.cut(0,di+1);
292
293 if (debug) { //create a few more files potentially useful for debugging
294 s=outbasename;
295 s.append(".missed_introns.gtf");
296 f_mintr=fopen(s.chars(),"w");
297 if (f_mintr==NULL) GError("Error creating file %s!\n",s.chars());
298 /*
299 s=outbasename;
300 s.append(".noTP_introns.gtf");
301 f_nintr=fopen(s.chars(),"w");
302 s=outbasename;
303 s.append(".wrong_Qintrons.gtf");
304 f_qintr=fopen(s.chars(),"w");
305 */
306 }
307
308 f_out=fopen(outstats, "w");
309 if (f_out==NULL) GError("Error creating output file %s!\n", outstats.chars());
310 if (gtf_tracking_verbose) GMessage("Prefix for output files: %s\n", outprefix.chars());
311 fprintf(f_out, "# Cuffcompare v%s | Command line was:\n#", PACKAGE_VERSION);
312 for (int i=0;i<argc-1;i++)
313 fprintf(f_out, "%s ", argv[i]);
314 fprintf(f_out, "%s\n#\n", argv[argc-1]);
315 //int qfileno=0;
316 GList<GSeqData>** qrysdata=NULL;
317 FILE** tfiles=NULL;
318 FILE** rtfiles=NULL;
319 GMALLOC(qrysdata, numqryfiles*sizeof(GList<GSeqData>*));
320 if (tmapFiles) {
321 GMALLOC(tfiles, numqryfiles*sizeof(FILE*));
322 if (haveRefs) {
323 GMALLOC(rtfiles, numqryfiles*sizeof(FILE*));
324 }
325 }
326 for (int fi=0;fi<qryfiles.Count();fi++) {
327 GStr in_file(qryfiles[fi]->chars());
328 GStr infname(getFileName(qryfiles[fi]->chars())); //file name only
329 GStr indir(qryfiles[fi]->chars());
330 di=indir.rindex(CHPATHSEP);
331 if (di>=0) indir.cut(di+1); //directory path for this input file
332 else indir=""; //current directory
333
334 if (debug || (gtf_tracking_verbose && !gtf_tracking_largeScale))
335 GMessage("Processing qfile #%d: %s\n",fi+1, in_file.chars());
336 if (in_file=="-") { f_in=stdin; in_file="stdin"; }
337 else {
338 f_in=fopen(in_file.chars(),"r");
339 if (f_in==NULL)
340 GError("Cannot open input file %s!\n",in_file.chars());
341 }
342 //f_in is the query gff file to process
343
344 GStr sbase(indir);
345 sbase.append(outprefix);
346 sbase.append(".");
347 sbase.append(infname);
348 if (tmapFiles) {
349 //-- we should keep the infname path, otherwise the remaining file names
350 // may be the same and clobber each other
351 s=sbase;
352 s.append(".tmap");
353 tfiles[fi]=fopen(s.chars(),"w");
354 if (tfiles[fi]==NULL)
355 GError("Error creating file '%s'!\n",s.chars());
356 fprintf(tfiles[fi],"ref_gene_id\tref_id\tclass_code\tcuff_gene_id\tcuff_id\tFMI\tFPKM\tFPKM_conf_lo\tFPKM_conf_hi\tcov\tlen\tmajor_iso_id\tref_match_len\n");
357 if (haveRefs) {
358 s=sbase;
359 s.append(".refmap");
360 rtfiles[fi]=fopen(s.chars(),"w");
361 if (rtfiles[fi]==NULL)
362 GError("Error creating file '%s'!\n",s.chars());
363 fprintf(rtfiles[fi],"ref_gene_id\tref_id\tclass_code\tcuff_id_list\n");
364 }
365 }
366
367 GList<GSeqData>* pdata=new GList<GSeqData>(true,true,true);
368 qrysdata[fi]=pdata;
369 if (gtf_tracking_verbose) GMessage("Loading transcripts from %s..\n",in_file.chars());
370 read_mRNAs(f_in, *pdata, &ref_data, discard_redundant, fi, in_file.chars(), multiexon_only);
371 GSuperLocus gstats;
372 GFaSeqGet *faseq=NULL;
373 for (int g=0;g<pdata->Count();g++) { //for each seqdata related to a genomic sequence
374 int gsid=pdata->Get(g)->get_gseqid();
375 GSeqData* refdata=getRefData(gsid, ref_data);//ref data for this contig
376 if (!gtf_tracking_largeScale)
377 processLoci(*(pdata->Get(g)), refdata, fi);
378 GSeqTrack* seqtrack=findGSeqTrack(gsid); //this will add a gseqtrack if it doesn't exist
379 // for gsid
380 if (refdata!=NULL) {
381 seqtrack->rloci_f=&(refdata->loci_f);
382 seqtrack->rloci_r=&(refdata->loci_r);
383 }
384 seqtrack->qdata[fi]=pdata->Get(g);
385 //will only gather data into stats if perContig==false
386 if (!gtf_tracking_largeScale) reportStats(f_out, getGSeqName(gsid), gstats,
387 pdata->Get(g), refdata);
388 if (faseq!=NULL) delete faseq;
389 } //for each genomic sequence data
390 //there could be genomic sequences with no qry transcripts
391 //but with reference transcripts
392 if (haveRefs && !reduceRefs && !gtf_tracking_largeScale) {
393 for (int r=0;r<ref_data.Count();r++) {
394 GSeqData* refdata=ref_data[r];
395 int gsid=refdata->get_gseqid();
396 if (getQryData(gsid, *pdata)==NULL) {
397 reportStats(f_out, getGSeqName(gsid), gstats, NULL, refdata);
398 }//completely missed all refdata on this contig
399 }
400 }
401 //now report the summary:
402 if (!gtf_tracking_largeScale) reportStats(f_out, in_file.chars(), gstats);
403 if (f_in!=stdin) fclose(f_in);
404 //qfileno++;
405 }//for each input file
406 if (f_mintr!=NULL) fclose(f_mintr);
407 gseqtracks.setSorted(&cmpGTrackByName);
408 if (gtf_tracking_verbose) GMessage("Tracking transcripts across %d query files..\n", numqryfiles);
409 trackGData(numqryfiles, gseqtracks, outbasename, tfiles, rtfiles);
410 fprintf(f_out, "\n Total union super-loci across all input datasets: %d \n", xlocnum);
411 if (numqryfiles>1) {
412 fprintf(f_out, " (%d multi-transcript, ~%.1f transcripts per locus)\n",
413 total_xloci_alt, ((double)(GXConsensus::count))/xlocnum);
414 }
415 if (gtf_tracking_verbose) GMessage("Cleaning up..\n");
416 GFREE(cprefix);
417 // clean up
418 for (int i=0;i<numqryfiles;i++) {
419 delete qrysdata[i];
420 }
421 GFREE(qrysdata);
422 GFREE(tfiles);
423 GFREE(rtfiles);
424 gseqtracks.Clear();
425 FRCLOSE(f_ref);
426 FWCLOSE(f_out);
427 if (gtf_tracking_verbose) GMessage("Done.\n");
428 ref_data.Clear();
429 //getchar();
430 } //main ends here
431
432 void show_exons(FILE* f, GffObj& m) {
433 fprintf(f,"(");
434 int imax=m.exons.Count()-1;
435 for (int i=0;i<=imax;i++) {
436 if (i==imax) fprintf(f,"%d-%d)",m.exons[i]->start, m.exons[i]->end);
437 else fprintf(f,"%d-%d,",m.exons[i]->start, m.exons[i]->end);
438 }
439 }
440
441 bool ichainMatch(GffObj* t, GffObj* r, bool& exonMatch, int fuzz=0) {
442 //t's intron chain is considered matching to reference r
443 //if r chain is the same or a subset of t's chain
444 exonMatch=false;
445 int imax=r->exons.Count()-1;
446 int jmax=t->exons.Count()-1;
447 if (imax==0 || jmax==0) { //single-exon mRNAs
448 if (imax!=jmax) return false;
449 exonMatch=r->exons[0]->coordMatch(t->exons[0],fuzz);
450 /*if (exonMatch) return true;
451 else return (r->exons[0]->start>=t->exons[0]->start &&
452 r->exons[0]->end<=t->exons[0]->end);*/
453 return exonMatch;
454 }
455
456 if (r->exons[imax]->start<t->exons[0]->end ||
457 t->exons[jmax]->start<r->exons[0]->end ) //intron chains do not overlap at all
458 {
459 return false;
460 }
461 //check intron overlaps
462 int i=1;
463 int j=1;
464 bool exmism=false; //any mismatch
465 while (i<=imax && j<=jmax) {
466 uint rstart=r->exons[i-1]->end;
467 uint rend=r->exons[i]->start;
468 uint tstart=t->exons[j-1]->end;
469 uint tend=t->exons[j]->start;
470 if (tend<rstart) { j++; continue; }
471 if (rend<tstart) { i++; continue; }
472 break; //here we have an intron overlap
473 }
474 if (i>1 || i>imax || j>jmax) {
475 return false; //no intron overlaps found at all
476 //or first intron of ref not overlapping
477 }
478 //from now on we expect intron matches up to imax
479 if (i!=j || imax!=jmax) { exmism=true; if (fuzz==0) return false; }
480 for (;i<=imax && j<=jmax;i++,j++) {
481 if (abs((int)(r->exons[i-1]->end-t->exons[j-1]->end))>fuzz ||
482 abs((int)(r->exons[i]->start-t->exons[j]->start))>fuzz) {
483 return false; //just run away
484 }
485 }
486 //if we made it here, we have matching intron chains up to MIN(imax,jmax)
487 if (imax!=jmax) {
488 exmism=true;
489 if (jmax<imax) return false; // qry ichain included in ref ichain
490 else //ref ichain included in qry ichain
491 if (fuzz==0) return false;
492 }
493 if (exmism) {
494 //exonMatch=false; -- it's default
495 return true;
496 }
497 exonMatch = ( abs((int)(r->exons[0]->start-t->exons[0]->start))<=fuzz &&
498 abs((int)(r->exons[imax]->end-t->exons[jmax]->end))<=fuzz );
499 return true;
500 }
501
502
503 void compareLoci2R(GList<GLocus>& loci, GList<GSuperLocus>& cmpdata,
504 GList<GLocus>& refloci, int qfidx) {
505 cmpdata.Clear();//a new list of superloci will be built
506 if (refloci.Count()==0 || loci.Count()==0) return;
507 //reset cmpovl and stats
508 for (int i=0;i<refloci.Count();i++) refloci[i]->creset();
509 //find loci with overlapping refloci
510 //and store cmpovl links both ways for ALL loci and refloci on this strand
511 for (int l=0;l<loci.Count();l++) {
512 GLocus* locus=loci[l];
513 locus->creset();
514 for (int j=0;j<refloci.Count();j++) {
515 //if (refloci[j]->start>locus->end) break;
516 if (refloci[j]->start>locus->end) {
517 if (refloci[j]->start-locus->end > GFF_MAX_LOCUS) break;
518 continue;
519 }
520 if (locus->start>refloci[j]->end) continue;
521 // then we must have overlap here:
522 //if (locus->overlap(refloci[j]->start, refloci[j]->end)) {
523 locus->cmpovl.Add(refloci[j]);
524 refloci[j]->cmpovl.Add(locus);
525 //}
526 }//for each reflocus
527 } //for each locus
528
529 //create corresponding "superloci" from transitive overlapping between loci and ref
530 for (int l=0;l<loci.Count();l++) {
531 if (loci[l]->v!=0) continue; //skip, already processed
532 GSuperLocus* super=new GSuperLocus();
533 super->qfidx=qfidx;
534 //try to find all other loci connected to this locus loci[l]
535 GList<GLocus> lstack(false,false,false); //traversal stack
536 lstack.Push(loci[l]);
537 while (lstack.Count()>0) {
538 GLocus* locus=lstack.Pop();
539 if (locus->v!=0 || locus->cmpovl.Count()==0) continue;
540 super->addQlocus(*locus);
541 locus->v=1;
542 for (int r=0;r<locus->cmpovl.Count();r++) {
543 GLocus* rloc=locus->cmpovl[r];
544 if (rloc->v==0) {
545 super->addRlocus(*rloc);
546 rloc->v=1;
547 for (int ll=0;ll<rloc->cmpovl.Count();ll++) {
548 if (rloc->cmpovl[ll]->v==0) lstack.Push(rloc->cmpovl[ll]);
549 }
550 }
551 } //for each overlapping reflocus
552 } //while linking
553
554 if (super->qloci.Count()==0) {
555 delete super;
556 continue; //try next query loci
557 }
558 //--here we have a "superlocus" region data on both qry and ref
559 // -- analyze mexons matching (base level metrics)
560 cmpdata.Add(super);
561 //make each ref locus keep track of all superloci containing it
562 for (int rl=0;rl<super->rloci.Count();rl++) {
563 super->rloci[rl]->superlst->Add(super);
564 }
565 for (int x=0;x<super->rmexons.Count();x++) {
566 super->rbases_all += super->rmexons[x].end-super->rmexons[x].start+1;
567 }
568 for (int x=0;x<super->qmexons.Count();x++) {
569 super->qbases_all += super->qmexons[x].end-super->qmexons[x].start+1;
570 }
571 int i=0; //locus mexons
572 int j=0; //refmexons
573 while (i<super->qmexons.Count() && j<super->rmexons.Count()) {
574 uint istart=super->qmexons[i].start;
575 uint iend=super->qmexons[i].end;
576 uint jstart=super->rmexons[j].start;
577 uint jend=super->rmexons[j].end;
578 if (iend<jstart) { i++; continue; }
579 if (jend<istart) { j++; continue; }
580 //v--overlap here:
581 uint ovlstart = jstart>istart? jstart : istart;
582 uint ovlend = iend<jend ? iend : jend;
583 uint ovlen=ovlend-ovlstart+1;
584 super->baseTP+=ovlen; //qbases_cov
585 if (iend<jend) i++;
586 else j++;
587 } //while mexons ovl search
588 /* if (reduceRefs) {
589 super->baseFP=super->qbases_all-super->baseTP;
590 super->baseFN=super->rbases_all-super->baseTP;
591 }
592 */
593 // -- exon level comparison:
594 int* qexovl; //flags for qry exons with ref overlap
595 GCALLOC(qexovl,super->quexons.Count()*sizeof(int));
596 int* rexovl; //flags for ref exons with qry overlap
597 GCALLOC(rexovl,super->ruexons.Count()*sizeof(int));
598 for (int i=0;i<super->quexons.Count();i++) {
599 uint istart=super->quexons[i].start;
600 uint iend=super->quexons[i].end;
601 for (int j=0;j<super->ruexons.Count();j++) {
602 uint jstart=super->ruexons[j].start;
603 uint jend=super->ruexons[j].end;
604 if (iend<jstart) break;
605 if (jend<istart) continue;
606 //--- overlap here between quexons[i] and ruexons[j]
607 qexovl[i]++;
608 rexovl[j]++;
609 if (super->quexons[i].coordMatch(&super->ruexons[j],5)) {
610 super->exonATP++;
611 if (super->quexons[i].coordMatch(&super->ruexons[j])) {
612 super->exonTP++;
613 } //exact match
614 } //fuzzy match
615 } //ref uexon loop
616 } //qry uexon loop
617 super->m_exons=0; //ref exons with no query overlap
618 super->w_exons=0; //qry exons with no ref overlap
619 for (int x=0;x<super->quexons.Count();x++)
620 if (qexovl[x]==0) super->w_exons++;
621 for (int x=0;x<super->ruexons.Count();x++)
622 if (rexovl[x]==0) super->m_exons++;
623 GFREE(rexovl);
624 GFREE(qexovl);
625
626 //-- intron level stats:
627 //query:
628 int* qinovl=NULL; //flags for qry introns with at least some ref overlap
629 int* qtpinovl=NULL; //flags for qry introns with perfect ref overlap
630 if (super->qintrons.Count()>0) {
631 GCALLOC(qinovl,super->qintrons.Count()*sizeof(int));
632 GCALLOC(qtpinovl,super->qintrons.Count()*sizeof(int));
633 }
634 //-- reference:
635 int* rinovl=NULL; //flags for ref introns with qry overlap
636 int* rtpinovl=NULL; //ref introns with perfect qry intron overlap
637 if (super->rintrons.Count()>0) {
638 GCALLOC(rinovl,super->rintrons.Count()*sizeof(int));
639 GCALLOC(rtpinovl,super->rintrons.Count()*sizeof(int));
640 }
641 for (int i=0;i<super->qintrons.Count();i++) {
642 uint istart=super->qintrons[i].start;
643 uint iend=super->qintrons[i].end;
644 for (int j=0;j<super->rintrons.Count();j++) {
645 uint jstart=super->rintrons[j].start;
646 uint jend=super->rintrons[j].end;
647 if (iend<jstart) break;
648 if (jend<istart) continue;
649 //--- overlap here between qintrons[i] and rintrons[j]
650 qinovl[i]++;
651 rinovl[j]++;
652 if (super->qintrons[i].coordMatch(&super->rintrons[j],5)) {
653 super->intronATP++;
654 if (super->qintrons[i].coordMatch(&super->rintrons[j])) {
655 super->intronTP++;
656 qtpinovl[i]++;
657 rtpinovl[j]++;
658 } //exact match
659 } //fuzzy match
660 } //ref intron loop
661 } //qry intron loop
662 super->m_introns=0; //ref introns with no query overlap
663 super->w_introns=0; //qry introns with no ref overlap
664 for (int x=0;x<super->qintrons.Count();x++) {
665 if (qinovl[x]==0) { super->w_introns++;
666 //qry introns with no ref intron overlap AT ALL
667 super->i_qwrong.Add(super->qintrons[x]);
668 }
669 else
670 if (qtpinovl[x]==0) {
671 super->i_qnotp.Add(super->qintrons[x]);
672 }
673 }
674 for (int x=0;x<super->rintrons.Count();x++) {
675 if (rinovl[x]==0) { //no intron overlap at all
676 super->m_introns++;
677 super->i_missed.Add(super->rintrons[x]);
678 }
679 else if (rtpinovl[x]==0) { //no perfect intron match
680 super->i_notp.Add(super->rintrons[x]);
681 }
682 }
683 GFREE(rinovl);
684 GFREE(rtpinovl);
685 GFREE(qinovl);
686 GFREE(qtpinovl);
687
688 // ---- now intron-chain and transcript comparison
689 for (int i=0;i<super->qmrnas.Count();i++) {
690 uint istart=super->qmrnas[i]->exons.First()->start;
691 uint iend=super->qmrnas[i]->exons.Last()->end;
692 for (int j=0;j<super->rmrnas.Count();j++) {
693 uint jstart=super->rmrnas[j]->exons.First()->start;
694 uint jend=super->rmrnas[j]->exons.Last()->end;
695 if (iend<jstart) break;
696 if (jend<istart) continue;
697 //--- overlap here --
698 bool exonMatch=false;
699 if ((super->qmrnas[i]->udata & 3) > 1) continue; //already counted a ichainTP for this qry
700 if (ichainMatch(super->qmrnas[i],super->rmrnas[j],exonMatch, 5)) { //fuzzy match
701 GLocus* qlocus=((CTData*)super->qmrnas[i]->uptr)->locus;
702 if (super->qmrnas[i]->exons.Count()>1) {
703 super->ichainATP++;
704 qlocus->ichainATP++;
705 }
706 if (exonMatch) {
707 super->mrnaATP++;
708 qlocus->mrnaATP++;
709 }
710 if (ichainMatch(super->qmrnas[i],super->rmrnas[j],exonMatch)) { //exact match
711 if (super->qmrnas[i]->exons.Count()>1) {
712 super->qmrnas[i]->udata|=1;
713 super->ichainTP++;
714 qlocus->ichainTP++;
715 }
716 if (exonMatch) {
717 super->qmrnas[i]->udata|=2;
718 super->mrnaTP++;
719 qlocus->mrnaTP++;
720 }
721 } //exact match
722 } //fuzzy match
723 } //ref mrna loop
724 } //qry mrna loop
725 for (int ql=0;ql<super->qloci.Count();ql++) {
726 if (super->qloci[ql]->ichainTP+super->qloci[ql]->mrnaTP >0 )
727 super->locusTP++;
728 if (super->qloci[ql]->ichainATP+super->qloci[ql]->mrnaATP>0)
729 super->locusATP++;
730 }
731
732 }//for each unlinked locus
733
734 }
735
736 //look for qry data for a specific genomic sequence
737 GSeqData* getQryData(int gid, GList<GSeqData>& qdata) {
738 int qi=-1;
739 GSeqData f(gid);
740 GSeqData* q=NULL;
741 if (qdata.Found(&f,qi))
742 q=qdata[qi];
743 return q;
744 }
745
746 const char* findDescr(GffObj* gfobj) {
747 if (refdescr.Count()==0) return NULL;
748 GStr* s=refdescr.Find(gfobj->getID());
749 if (s==NULL) {
750 s=refdescr.Find(gfobj->getGeneName());
751 if (s==NULL) s=refdescr.Find(gfobj->getGeneID());
752 }
753 if (s!=NULL)
754 return s->chars();
755 return NULL;
756 }
757
758 const char* getGeneID(GffObj* gfobj) {
759 //returns anything that might resemble a gene identifier for this transcript
760 //or, if everything fails, returns the transcript ID
761 const char* s=gfobj->getGeneName();
762 if (s) return s;
763 if ((s=gfobj->getGeneID())!=NULL) return s;
764 if ((s=gfobj->getAttr("Name"))!=NULL) return s;
765 return gfobj->getID();
766 }
767
768 const char* getGeneID(GffObj& gfobj) {
769 return getGeneID(&gfobj);
770 }
771
772 void writeLoci(FILE* f, GList<GLocus> & loci) {
773 for (int l=0;l<loci.Count();l++) {
774 GLocus& loc=*(loci[l]);
775 fprintf(f,"%s\t%s[%c]%d-%d\t", loc.mrna_maxcov->getID(),
776 loc.mrna_maxcov->getGSeqName(),
777 loc.mrna_maxcov->strand, loc.start,loc.end);
778 //now print all transcripts in this locus, comma delimited
779 int printfd=0;
780 for (int i=0;i<loc.mrnas.Count();i++) {
781 if (loc.mrnas[i]==loc.mrna_maxcov) continue;
782 if (printfd==0) fprintf(f,"%s",loc.mrnas[i]->getID());
783 else fprintf(f,",%s",loc.mrnas[i]->getID());
784 printfd++;
785 }
786 const char* rdescr=findDescr(loc.mrna_maxcov);
787 if (rdescr==NULL) fprintf(f,"\t\n");
788 else fprintf(f,"\t%s\n",rdescr);
789 }
790 }
791
792 void printXQ1(FILE* f, int qidx, GList<GLocus>& qloci) {
793 int printfd=0;
794 //print
795 for (int i=0;i<qloci.Count();i++) {
796 if (qloci[i]->qfidx!=qidx) continue;
797 for (int j=0;j<qloci[i]->mrnas.Count();j++) {
798 if (printfd==0) fprintf(f,"%s",qloci[i]->mrnas[j]->getID());
799 else fprintf(f,",%s",qloci[i]->mrnas[j]->getID());
800 printfd++;
801 }
802 }
803 if (printfd==0) fprintf(f,"-");
804 }
805
806 void numXLoci(GList<GXLocus>& xloci, int& last_id) {
807 for (int l=0;l<xloci.Count();l++) {
808 if (xloci[l]->qloci.Count()==0) continue; //we never print ref-only xloci
809 last_id++;
810 xloci[l]->id=last_id;
811 }
812 }
813
814
815 class GProtCl {
816 public:
817 GList<GXConsensus> protcl;
818 GProtCl(GXConsensus* c=NULL):protcl(true,false,false) {
819 if (c!=NULL)
820 protcl.Add(c);
821 }
822 bool add_Pcons(GXConsensus* c) {
823 if (c==NULL || c->aalen==0) return false;
824 if (protcl.Count()==0) {
825 protcl.Add(c);
826 return true;
827 }
828 if (protcl[0]->aalen!=c->aalen) return false;
829 if (strcmp(protcl[0]->aa,c->aa)!=0) return false;
830 protcl.Add(c);
831 return true;
832 }
833
834 void addMerge(GProtCl& pcl, GXConsensus* pclnk) {
835 for (int i=0;i<pcl.protcl.Count();i++) {
836 if (pcl.protcl[i]!=pclnk) {
837 protcl.Add(pcl.protcl[i]);
838 }
839 }
840 }
841
842 int aalen() {
843 if (protcl.Count()==0) return 0;
844 return protcl[0]->aalen;
845 }
846 bool operator==(GProtCl& cl) {
847 return this==&cl;
848 }
849 bool operator>(GProtCl& cl) {
850 return (this>&cl);
851 }
852 bool operator<(GProtCl& cl) {
853 return (this<&cl);
854 }
855 };
856
857 class GTssCl:public GSeg { //experiment cluster of ref loci (isoforms)
858 public:
859 uint fstart; //lowest coordinate of the first exon
860 uint fend; //highest coordinate of the first exon
861 GList<GXConsensus> tsscl;
862 GTssCl(GXConsensus* c=NULL):tsscl(true,false,false) {
863 start=0;
864 end=0;
865 fstart=0;
866 fend=0;
867 if (c!=NULL) addFirst(c);
868 }
869
870 void addFirst(GXConsensus* c) {
871 tsscl.Add(c);
872 start=c->start;
873 end=c->end;
874 GffExon* fexon=(c->tcons->strand=='-') ? c->tcons->exons.Last() :
875 c->tcons->exons.First();
876 fstart=fexon->start;
877 fend=fexon->end;
878 }
879 bool add_Xcons(GXConsensus* c) {
880 if (tsscl.Count()==0) {
881 addFirst(c);
882 return true;
883 }
884 //check if it can be added to existing xconsensi
885 uint nfend=0;
886 uint nfstart=0;
887 /*
888 if (tsscl.Get(0)->tcons->getGeneID()!=NULL &&
889 c->tcons->getGeneID()!=NULL &&
890 strcmp(tsscl.Get(0)->tcons->getGeneID(), c->tcons->getGeneID()))
891 //don't tss cluster if they don't have the same GeneID (?)
892 //FIXME: we might not want this if input files are not from Cufflinks
893 // and they could simply lack proper GeneID
894 return false;
895 */
896 if (c->tcons->strand=='-') {
897 //no, the first exons don't have to overlap
898 //if (!c->tcons->exons.Last()->overlap(fstart,fend)) return false;
899 nfstart=c->tcons->exons.Last()->start;
900 nfend=c->tcons->exons.Last()->end;
901 //proximity check for the transcript start:
902 if (nfend>fend+tssDist || fend>nfend+tssDist)
903 return false;
904 }
905 else {
906 //if (!c->tcons->exons.First()->overlap(fstart,fend)) return false;
907 nfstart=c->tcons->exons.First()->start;
908 nfend=c->tcons->exons.First()->end;
909 if (nfstart>fstart+tssDist || fstart>nfstart+tssDist)
910 return false;
911 }
912 // -- if we are here, we can add to tss cluster
913
914 tsscl.Add(c);
915 if (fstart>nfstart) fstart=nfstart;
916 if (fend<nfend) fend=nfend;
917 if (start>c->start) start=c->start;
918 if (end<c->end) end=c->end;
919 return true;
920 }
921
922 void addMerge(GTssCl& cl, GXConsensus* clnk) {
923 for (int i=0;i<cl.tsscl.Count();i++) {
924 if (cl.tsscl[i]==clnk) continue;
925 tsscl.Add(cl.tsscl[i]);
926 }
927 if (fstart>cl.fstart) fstart=cl.fstart;
928 if (fend<cl.fend) fend=cl.fend;
929 if (start>cl.start) start=cl.start;
930 if (end<cl.end) end=cl.end;
931 }
932 };
933 /*
934 class IntArray { //two dimensional int array
935 int* mem;
936 int xsize;
937 int ysize;
938 public:
939 IntArray(int xlen, int ylen) {
940 xsize=xlen;
941 ysize=ylen;
942 GMALLOC(mem, xsize*ysize*sizeof(int));
943 }
944 ~IntArray() {
945 GFREE(mem);
946 }
947 int& data(int x, int y) {
948 return mem[y*xsize+x];
949 }
950 };
951
952 int aa_diff(GXConsensus* c1, GXConsensus* c2) {
953 int diflen=abs(c1->aalen-c2->aalen);
954 if (diflen>=6) return diflen;
955 //obvious case: same CDS
956 if (diflen==0 && strcmp(c1->aa, c2->aa)==0) return 0;
957 //simple edit distance calculation
958 IntArray dist(c1->aalen+1, c2->aalen+1);
959 for (int i=0;i<=c1->aalen;i++) {
960 dist.data(i,0) = i;
961 }
962 for (int j = 0; j <= c2->aalen; j++) {
963 dist.data(0,j) = j;
964 }
965 for (int i = 1; i <= c1->aalen; i++)
966 for (int j = 1; j <= c2->aalen; j++) {
967 dist.data(i,j) = GMIN3( dist.data(i-1,j)+1,
968 dist.data(i,j-1)+1,
969 dist.data(i-1,j-1)+((c1->aa[i-1] == c2->aa[j-1]) ? 0 : 1) );
970 }
971 int r=dist.data(c1->aalen,c2->aalen);
972 return r;
973 }
974 */
975 void printConsGTF(FILE* fc, GXConsensus* xc, int xlocnum) {
976 for (int i=0;i<xc->tcons->exons.Count();i++) {
977 fprintf(fc,
978 "%s\t%s\texon\t%d\t%d\t.\t%c\t.\tgene_id \"XLOC_%06d\"; transcript_id \"%s_%08d\"; exon_number \"%d\";",
979 xc->tcons->getGSeqName(),xc->tcons->getTrackName(),xc->tcons->exons[i]->start, xc->tcons->exons[i]->end, xc->tcons->strand,
980 xlocnum, cprefix, xc->id, i+1);
981 //if (i==0) {
982 const char* gene_name=NULL;
983 if (xc->ref) {
984 gene_name=xc->ref->getGeneName();
985 if (gene_name==NULL) gene_name=xc->ref->getGeneID();
986 if (gene_name) {
987 fprintf (fc, " gene_name \"%s\";", gene_name);
988 }
989 }
990 if (!haveRefs) {
991 if (gene_name==NULL && xc->tcons->getGeneName())
992 fprintf (fc, " gene_name \"%s\";", xc->tcons->getGeneName());
993 char* s=xc->tcons->getAttr("nearest_ref", true);
994 if (s) fprintf(fc, " nearest_ref \"%s\";",s);
995 s=xc->tcons->getAttr("class_code", true);
996 if (s) fprintf(fc, " class_code \"%s\";", s);
997 }
998 fprintf(fc, " oId \"%s\";",xc->tcons->getID());
999 if (xc->contained) {
1000 fprintf(fc, " contained_in \"%s_%08d\";", cprefix, xc->contained->id);
1001 }
1002 if (haveRefs) {
1003 if (xc->ref!=NULL)
1004 fprintf(fc, " nearest_ref \"%s\";",xc->ref->getID());
1005 fprintf(fc, " class_code \"%c\";",xc->refcode ? xc->refcode : '.');
1006 }
1007 if (xc->tss_id>0) fprintf(fc, " tss_id \"TSS%d\";",xc->tss_id);
1008 if (xc->p_id>0) fprintf(fc, " p_id \"P%d\";",xc->p_id);
1009 // }
1010 fprintf(fc,"\n");
1011 }
1012 }
1013
1014 void tssCluster(GXLocus& xloc)
1015 {
1016 GList<GTssCl> xpcls(true,true,false);
1017 for (int i=0;i<xloc.tcons.Count();i++)
1018 {
1019 GXConsensus* c=xloc.tcons[i];
1020 //if (c->tcons->exons.Count()<2) continue; //should we skip single-exon transcripts ??
1021 GArray<int> mrgloci(true);
1022 int lfound=0;
1023 for (int l=0;l<xpcls.Count();l++)
1024 {
1025 if (xpcls[l]->end<c->tcons->exons.First()->start) continue;
1026 if (xpcls[l]->start>c->tcons->exons.Last()->end) break;
1027 if (xpcls[l]->add_Xcons(c))
1028 {
1029 lfound++;
1030 mrgloci.Add(l);
1031
1032 }
1033
1034 } // for each xpcluster
1035 if (lfound==0)
1036 {
1037 //create a xpcl with only this xconsensus
1038 xpcls.Add(new GTssCl(c));
1039
1040 }
1041 else if (lfound>1)
1042 {
1043 for (int l=1;l<lfound;l++)
1044 {
1045 int mlidx=mrgloci[l]-l+1;
1046 xpcls[mrgloci[0]]->addMerge(*xpcls[mlidx], c);
1047 xpcls.Delete(mlidx);
1048 }
1049 }
1050
1051 }//for each xconsensus in this xlocus
1052 for (int l=0;l<xpcls.Count();l++)
1053 {
1054 //if (xpcls[l]->tsscl.Count()<2) continue;
1055 tsscl_num++;
1056 for (int i=0;i<xpcls[l]->tsscl.Count();i++)
1057 xpcls[l]->tsscl[i]->tss_id=tsscl_num;
1058 //processTssCl(xcds_num, xpcls[l], faseq);
1059 }
1060 }
1061
1062 void protCluster(GXLocus& xloc, GFaSeqGet *faseq) {
1063 if (!faseq)
1064 return;
1065 GList<GProtCl> xpcls(true,true,false);
1066 for (int i=0;i<xloc.tcons.Count();i++) {
1067 GXConsensus* c=xloc.tcons[i];
1068 if (c->ref==NULL || c->ref->CDstart==0) continue; //no ref or CDS available
1069 if (c->refcode!='=') continue;
1070 //get the CDS translation here
1071 if (c->aa==NULL) {
1072 c->aa=c->ref->getSplicedTr(faseq, true, &c->aalen);
1073 if (c->aalen>0 && c->aa[c->aalen-1]=='.') {
1074 //discard the final stop codon
1075 c->aalen--;
1076 c->aa[c->aalen]=0;
1077 }
1078 }
1079 GArray<int> mrgloci(true);
1080 int lfound=0;
1081 for (int l=0;l<xpcls.Count();l++) {
1082 if (xpcls[l]->aalen()!=c->aalen) continue;
1083 if (xpcls[l]->add_Pcons(c)) {
1084 lfound++;
1085 mrgloci.Add(l);
1086 }
1087 } // for each xpcluster
1088 if (lfound==0) {
1089 //create a xpcl with only this xconsensus
1090 xpcls.Add(new GProtCl(c));
1091 }
1092 else if (lfound>1) {
1093 for (int l=1;l<lfound;l++) {
1094 int mlidx=mrgloci[l]-l+1;
1095 xpcls[mrgloci[0]]->addMerge(*xpcls[mlidx], c);
1096 xpcls.Delete(mlidx);
1097 }
1098 }
1099 }//for each xconsensus in this xlocus
1100 for (int l=0;l<xpcls.Count();l++) {
1101 protcl_num++;
1102 for (int i=0;i<xpcls[l]->protcl.Count();i++)
1103 xpcls[l]->protcl[i]->p_id=protcl_num;
1104 }
1105 for (int i=0;i<xloc.tcons.Count();i++) {
1106 GXConsensus* c=xloc.tcons[i];
1107 if (c->aa!=NULL) { GFREE(c->aa); }
1108 }
1109 }
1110
1111 void printXLoci(FILE* f, FILE* fc, int qcount, GList<GXLocus>& xloci, GFaSeqGet *faseq) {
1112 for (int l=0;l<xloci.Count();l++) {
1113 if (xloci[l]->qloci.Count()==0) continue;
1114 GXLocus& xloc=*(xloci[l]);
1115 xloc.checkContainment();
1116 tssCluster(xloc);//cluster and assign tss_id and cds_id to each xconsensus in xloc
1117 protCluster(xloc,faseq);
1118 for (int c=0;c<xloc.tcons.Count();c++) {
1119 if (showContained || xloc.tcons[c]->contained==NULL)
1120 printConsGTF(fc,xloc.tcons[c],xloc.id);
1121 }
1122 fprintf(f,"XLOC_%06d\t%s[%c]%d-%d\t", xloc.id,
1123 xloc.qloci[0]->mrna_maxcov->getGSeqName(),
1124 xloc.strand, xloc.start,xloc.end);
1125 //now print all transcripts in this locus, comma delimited
1126 //first, ref loci, if any
1127 int printfd=0;
1128 if (xloc.rloci.Count()>0) {
1129 for (int i=0;i<xloc.rloci.Count();i++) {
1130 for (int j=0;j<xloc.rloci[i]->mrnas.Count();j++) {
1131 if (printfd==0) fprintf(f,"%s|%s",getGeneID(xloc.rloci[i]->mrnas[j]),
1132 xloc.rloci[i]->mrnas[j]->getID());
1133 else fprintf(f,",%s|%s",getGeneID(xloc.rloci[i]->mrnas[j]),
1134 xloc.rloci[i]->mrnas[j]->getID());
1135 printfd++;
1136 }
1137 }
1138 }
1139 else {
1140 fprintf(f,"-");
1141 }
1142 //second, all the cufflinks transcripts
1143 for (int qi=0;qi<qcount;qi++) {
1144 fprintf(f,"\t");
1145 printXQ1(f,qi,xloc.qloci);
1146 }
1147 fprintf(f,"\n");
1148 }
1149 }
1150
1151 void writeIntron(FILE* f, char strand, GFaSeqGet* faseq, GSeg& iseg,
1152 GList<GffObj>& mrnas, bool wrong=false) {
1153 //find a ref mrna having this intron
1154 GffObj* rm=NULL;
1155 for (int i=0;i<mrnas.Count();i++) {
1156 GffObj* m=mrnas[i];
1157 if (m->start>iseg.end) break;
1158 if (m->end<iseg.start) continue;
1159 //intron coords overlaps mrna region
1160 for (int j=1;j<m->exons.Count();j++) {
1161 if (iseg.start==m->exons[j-1]->end+1 &&
1162 iseg.end==m->exons[j]->start-1) { rm=m; break; } //match found
1163 }//for each intron
1164 if (rm!=NULL) break;
1165 } //for each ref mrna in this locus
1166 if (rm==NULL) GError("Error: couldn't find ref mrna for intron %d-%d! (BUG)\n",
1167 iseg.start,iseg.end);
1168 int ilen=iseg.end-iseg.start+1;
1169 fprintf(f,"%s\t%s\tintron\t%d\t%d\t.\t%c\t.\t",
1170 rm->getGSeqName(),rm->getTrackName(),iseg.start,iseg.end,strand);
1171 if (faseq!=NULL) {
1172 const char* gseq=faseq->subseq(iseg.start, ilen);
1173 char* cseq=Gstrdup(gseq, gseq+ilen-1);
1174 if (strand=='-') reverseComplement(cseq, ilen);
1175 fprintf(f,"spl=\"%c%c..%c%c\"; ", toupper(cseq[0]),toupper(cseq[1]),
1176 toupper(cseq[ilen-2]),toupper(cseq[ilen-1]));
1177 GFREE(cseq);
1178 }
1179 fprintf(f,"transcript_id \"%s\";", rm->getID());
1180 if (wrong) fprintf(f," noOvl=1;");
1181 fprintf(f,"\n");
1182
1183 }
1184
1185 void reportMIntrons(FILE* fm, FILE* fn, FILE* fq, char strand,
1186 GList<GSuperLocus>& cmpdata) {
1187 if (fm==NULL) return;
1188 for (int l=0;l<cmpdata.Count();l++) {
1189 GSuperLocus *sl=cmpdata[l];
1190 //cache the whole locus sequence if possible
1191 //write these introns and their splice sites into the file
1192 for (int i=0;i<sl->i_missed.Count();i++)
1193 writeIntron(fm, strand, NULL, sl->i_missed[i], sl->rmrnas);
1194 if (fn!=NULL) {
1195 for (int i=0;i<sl->i_notp.Count();i++)
1196 writeIntron(fn, strand, NULL, sl->i_notp[i], sl->rmrnas);
1197 }
1198 if (fq!=NULL) {
1199 for (int i=0;i<sl->i_qwrong.Count();i++) {
1200 writeIntron(fq, strand, NULL, sl->i_qwrong[i], sl->qmrnas, true);
1201 }
1202 for (int i=0;i<sl->i_qnotp.Count();i++) {
1203 writeIntron(fq, strand, NULL, sl->i_qnotp[i], sl->qmrnas);
1204 }
1205 }
1206 }
1207 }
1208
1209
1210 void processLoci(GSeqData& seqdata, GSeqData* refdata, int qfidx) {
1211 //GList<GSeqLoci>& glstloci, GList<GSeqCmpRegs>& cmpdata)
1212
1213 if (refdata!=NULL) {
1214 //if (gtf_tracking_verbose) GMessage(" ..comparing to reference loci..\n") ;
1215 compareLoci2R(seqdata.loci_f, seqdata.gstats_f, refdata->loci_f, qfidx);
1216 compareLoci2R(seqdata.loci_r, seqdata.gstats_r, refdata->loci_r, qfidx);
1217 // -- report
1218
1219 if (f_mintr!=NULL) {
1220 GMessage(" ..reporting missed ref introns..\n");
1221 //reportIntrons(f_mintr, f_nintr, f_qintr, faseq, '+', seqdata.gstats_f);
1222 //reportIntrons(f_mintr, f_nintr, f_qintr, faseq, '-', seqdata.gstats_r);
1223 reportMIntrons(f_mintr, NULL, NULL, '+', seqdata.gstats_f);
1224 reportMIntrons(f_mintr, NULL, NULL, '-', seqdata.gstats_r);
1225 }
1226 }
1227 }
1228
1229 //adjust stats for a list of unoverlapped (completely missed) ref loci
1230 void collectRLocData(GSuperLocus& stats, GLocus& loc) {
1231 stats.total_rmrnas+=loc.mrnas.Count();
1232 stats.total_rexons+=loc.uexons.Count();
1233 stats.total_rintrons+=loc.introns.Count();
1234 stats.total_rmexons+=loc.mexons.Count();
1235 stats.total_richains+=loc.ichains;
1236 stats.m_exons+=loc.uexons.Count();
1237 stats.m_introns+=loc.introns.Count();
1238 stats.total_rloci++;
1239 for (int e=0;e<loc.mexons.Count();e++) {
1240 stats.rbases_all+=loc.mexons[e].end-loc.mexons[e].start+1;
1241 }
1242 }
1243
1244 void collectRData(GSuperLocus& stats, GList<GLocus>& loci) {
1245 for (int l=0;l<loci.Count();l++)
1246 collectRLocData(stats,*loci[l]);
1247 }
1248
1249 //adjust stats for a list of unoverlapped (completely "wrong" or novel) qry loci
1250 void collectQLocData(GSuperLocus& stats, GLocus& loc) {
1251 stats.total_qmrnas+=loc.mrnas.Count();
1252 stats.total_qexons+=loc.uexons.Count();
1253 stats.total_qmexons+=loc.mexons.Count();
1254 stats.total_qintrons+=loc.introns.Count();
1255 stats.total_qichains+=loc.ichains;
1256 stats.total_qloci++;
1257 if (loc.ichains>0 && loc.mrnas.Count()>1)
1258 stats.total_qloci_alt++;
1259 stats.w_exons+=loc.uexons.Count();
1260 stats.w_introns+=loc.introns.Count();
1261 for (int e=0;e<loc.mexons.Count();e++) {
1262 stats.qbases_all+=loc.mexons[e].end-loc.mexons[e].start+1;
1263 }
1264 }
1265
1266 void collectQData(GSuperLocus& stats, GList<GLocus>& loci, GList<GLocus>& nloci) {
1267 for (int l=0;l<loci.Count();l++) {
1268 //this is called when no refdata is given, so all these loci are nloci
1269 nloci.Add(loci[l]);
1270 collectQLocData(stats,*loci[l]);
1271 }
1272 }
1273
1274 void collectQNOvl(GSuperLocus& stats, GList<GLocus>& loci, GList<GLocus>& nloci) {
1275 for (int l=0;l<loci.Count();l++) {
1276 if (loci[l]->cmpovl.Count()==0) {//locus with no ref loci overlaps
1277 stats.w_loci++; //novel/wrong loci
1278 nloci.Add(loci[l]);
1279 collectQLocData(stats,*loci[l]);
1280 }
1281 }
1282 }
1283
1284 void collectQU(GSuperLocus& stats, GList<GLocus>& nloci) {
1285 for (int l=0;l<nloci.Count();l++) {
1286 stats.w_loci++; //novel/wrong loci
1287 collectQLocData(stats, *nloci[l]);
1288 }
1289 }
1290
1291 void printLocus(FILE* f, GLocus& loc, const char* gseqname) {
1292 fprintf(f, "## Locus %s:%d-%d\n",gseqname, loc.start, loc.end);
1293 for (int m=0;m<loc.mrnas.Count();m++) {
1294 loc.mrnas[m]->printGtf(f);
1295 }
1296 }
1297
1298 void collectRNOvl(GSuperLocus& stats, GList<GLocus>& loci) { //, const char* gseqname) {
1299 for (int l=0;l<loci.Count();l++) {
1300 if (loci[l]->cmpovl.Count()==0) {
1301 stats.m_loci++; //missed ref loci
1302 //if (f_mloci!=NULL)
1303 // printLocus(f_mloci,*loci[l], gseqname);
1304 collectRLocData(stats,*loci[l]);
1305 }
1306 }
1307 }
1308
1309
1310 void collectCmpData(GSuperLocus& stats, GList<GSuperLocus>& cmpdata) { //, const char* gseqname) {
1311 for (int c=0;c<cmpdata.Count();c++) {
1312 stats.addStats(*cmpdata[c]);
1313 /*
1314 if (f_nloci!=NULL && cmpdata[c]->locusTP==0 && cmpdata[c]->rloci.Count()>0) {
1315 fprintf(f_nloci, "# Superlocus %s:%d-%d\n",gseqname, cmpdata[c]->start, cmpdata[c]->end);
1316 for (int l=0;l<cmpdata[c]->rloci.Count();l++) {
1317 printLocus(f_nloci,*cmpdata[c]->rloci[l], gseqname);
1318 }
1319 }
1320 */
1321 }
1322 }
1323
1324 void collectStats(GSuperLocus& stats, GSeqData* seqdata, GSeqData* refdata) {
1325 //collect all stats for a single genomic sequence into stats
1326 if (seqdata==NULL) {
1327 if (reduceRefs || refdata==NULL) return;
1328 //special case with completely missed all refs on a contig/chromosome
1329 collectRData(stats, refdata->loci_f);
1330 collectRData(stats, refdata->loci_r);
1331 return;
1332 }
1333 if (refdata==NULL) {//reference data missing on this contig
1334 collectQData(stats, seqdata->loci_f, seqdata->nloci_f);
1335 collectQData(stats, seqdata->loci_r, seqdata->nloci_r);
1336 collectQU(stats, seqdata->nloci_u);
1337 return;
1338 }
1339
1340 /*stats.total_qloci+=seqdata->loci_f.Count();
1341 stats.total_qloci+=seqdata->loci_r.Count();
1342 if (reduceRefs) { //only collect ref loci from superloci
1343
1344 }
1345 else {
1346 stats.total_rloci+=refdata->loci_f.Count();
1347 stats.total_rloci+=refdata->loci_r.Count();
1348 }
1349 */
1350 //collect data for overlapping superloci (already in seqdata->gstats_f/_r)
1351 //char* gseqname=getGSeqName(seqdata->gseq_id);
1352 collectCmpData(stats, seqdata->gstats_f);
1353 collectCmpData(stats, seqdata->gstats_r);
1354 //for non-overlapping qry loci, always add them as false positives FP
1355 collectQNOvl(stats, seqdata->loci_f, seqdata->nloci_f);
1356 collectQNOvl(stats, seqdata->loci_r, seqdata->nloci_r);
1357 collectQU(stats, seqdata->nloci_u);
1358 if (!reduceRefs) { //find ref loci with empty cmpovl and add them
1359 collectRNOvl(stats, refdata->loci_f);
1360 collectRNOvl(stats, refdata->loci_r);
1361 }
1362 }
1363
1364 void reportStats(FILE* fout, const char* setname, GSuperLocus& stotal,
1365 GSeqData* seqdata, GSeqData* refdata) {
1366 GSuperLocus stats;
1367 bool finalSummary=(seqdata==NULL && refdata==NULL);
1368 GSuperLocus *ps=(finalSummary ? &stotal : &stats );
1369 if (!finalSummary) { //collecting contig stats
1370 //gather statistics for all loci/superloci here
1371 collectStats(stats, seqdata, refdata);
1372 stotal.addStats(stats);
1373 if (!perContigStats) return;
1374 }
1375 ps->calcF();
1376 if (seqdata!=NULL) fprintf(fout, "#> Genomic sequence: %s \n", setname);
1377 else fprintf(fout, "\n#= Summary for dataset: %s :\n", setname);
1378
1379 fprintf(fout, "# Query mRNAs : %7d in %7d loci (%d multi-exon transcripts)\n",
1380 ps->total_qmrnas, ps->total_qloci, ps->total_qichains);
1381 fprintf(fout, "# (%d multi-transcript loci, ~%.1f transcripts per locus)\n",
1382 ps->total_qloci_alt, ((double)ps->total_qmrnas/ps->total_qloci));
1383
1384 if (haveRefs) {
1385 fprintf(fout, "# Reference mRNAs : %7d in %7d loci (%d multi-exon)\n",
1386 ps->total_rmrnas, ps->total_rloci, ps->total_richains);
1387 if (ps->baseTP+ps->baseFP==0 || ps->baseTP+ps->baseFN==0) return;
1388 fprintf(fout, "# Corresponding super-loci: %7d\n",ps->total_superloci);
1389
1390 /*if (seqdata!=NULL) {
1391 fprintf(fout, " ( %d/%d on forward/reverse strand)\n",
1392 seqdata->gstats_f.Count(),seqdata->gstats_r.Count());
1393 }*/
1394 fprintf(fout, "#--------------------| Sn | Sp | fSn | fSp \n");
1395 double sp=(100.0*(double)ps->baseTP)/(ps->baseTP+ps->baseFP);
1396 double sn=(100.0*(double)ps->baseTP)/(ps->baseTP+ps->baseFN);
1397 fprintf(fout, " Base level: \t%5.1f\t%5.1f\t - \t - \n",sn, sp);
1398 sp=(100.0*(double)ps->exonTP)/(ps->exonTP+ps->exonFP);
1399 sn=(100.0*(double)ps->exonTP)/(ps->exonTP+ps->exonFN);
1400 double fsp=(100.0*(double)ps->exonATP)/(ps->exonATP+ps->exonAFP);
1401 double fsn=(100.0*(double)ps->exonATP)/(ps->exonATP+ps->exonAFN);
1402 if (fsp>100.0) fsp=100.0;
1403 if (fsn>100.0) fsn=100.0;
1404 fprintf(fout, " Exon level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1405 if (ps->total_rintrons>0) {
1406 //intron level
1407 sp=(100.0*(double)ps->intronTP)/(ps->intronTP+ps->intronFP);
1408 sn=(100.0*(double)ps->intronTP)/(ps->intronTP+ps->intronFN);
1409 fsp=(100.0*(double)ps->intronATP)/(ps->intronATP+ps->intronAFP);
1410 fsn=(100.0*(double)ps->intronATP)/(ps->intronATP+ps->intronAFN);
1411 if (fsp>100.0) fsp=100.0;
1412 if (fsn>100.0) fsn=100.0;
1413 fprintf(fout, " Intron level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1414 //intron chains:
1415 sp=(100.0*(double)ps->ichainTP)/(ps->ichainTP+ps->ichainFP);
1416 sn=(100.0*(double)ps->ichainTP)/(ps->ichainTP+ps->ichainFN);
1417 if (sp>100.0) sp=100.0;
1418 if (sn>100.0) sn=100.0;
1419 fsp=(100.0*(double)ps->ichainATP)/(ps->ichainATP+ps->ichainAFP);
1420 fsn=(100.0*(double)ps->ichainATP)/(ps->ichainATP+ps->ichainAFN);
1421 if (fsp>100.0) fsp=100.0;
1422 if (fsn>100.0) fsn=100.0;
1423 fprintf(fout, "Intron chain level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1424 }
1425 else {
1426 fprintf(fout, " Intron level: \t - \t - \t - \t - \n");
1427 fprintf(fout, "Intron chain level: \t - \t - \t - \t - \n");
1428 }
1429 sp=(100.0*(double)ps->mrnaTP)/(ps->mrnaTP+ps->mrnaFP);
1430 sn=(100.0*(double)ps->mrnaTP)/(ps->mrnaTP+ps->mrnaFN);
1431 fsp=(100.0*(double)ps->mrnaATP)/(ps->mrnaATP+ps->mrnaAFP);
1432 fsn=(100.0*(double)ps->mrnaATP)/(ps->mrnaATP+ps->mrnaAFN);
1433 if (fsp>100.0) fsp=100.0;
1434 if (fsn>100.0) fsn=100.0;
1435 fprintf(fout, " Transcript level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp,fsn,fsp);
1436 sp=(100.0*(double)ps->locusTP)/(ps->locusTP+ps->locusFP);
1437 sn=(100.0*(double)ps->locusTP)/(ps->locusTP+ps->locusFN);
1438 fsp=(100.0*(double)ps->locusATP)/(ps->locusATP+ps->locusAFP);
1439 fsn=(100.0*(double)ps->locusATP)/(ps->locusATP+ps->locusAFN);
1440 fprintf(fout, " Locus level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1441 sn=(100.0*(double)ps->m_exons)/(ps->total_rexons);
1442 fprintf(fout, " Missed exons:\t%d/%d (%5.1f%%)\n",ps->m_exons, ps->total_rexons, sn);
1443 sn=(100.0*(double)ps->w_exons)/(ps->total_qexons);
1444 fprintf(fout, " Wrong exons:\t%d/%d (%5.1f%%)\n",ps->w_exons, ps->total_qexons,sn);
1445 if (ps->total_rintrons>0) {
1446 sn=(100.0*(double)ps->m_introns)/(ps->total_rintrons);
1447 fprintf(fout, " Missed introns:\t%d/%d (%5.1f%%)\n",ps->m_introns, ps->total_rintrons, sn);
1448 }
1449 if (ps->total_qintrons>0) {
1450 sn=(100.0*(double)ps->w_introns)/(ps->total_qintrons);
1451 fprintf(fout, " Wrong introns:\t%d/%d (%5.1f%%)\n",ps->w_introns, ps->total_qintrons,sn);
1452 }
1453 if (ps->total_rloci>0) {
1454 sn=(100.0*(double)ps->m_loci)/(ps->total_rloci);
1455 fprintf(fout, " Missed loci:\t%d/%d (%5.1f%%)\n",ps->m_loci, ps->total_rloci, sn);
1456 }
1457 if (ps->total_qloci>0) {
1458 sn=(100.0*(double)ps->w_loci)/(ps->total_qloci);
1459 fprintf(fout, " Wrong loci:\t%d/%d (%5.1f%%)\n",ps->w_loci, ps->total_qloci,sn);
1460 }
1461
1462 }
1463 }
1464
1465 int inbuf_len=1024; //starting inbuf capacity
1466 char* inbuf=NULL; // incoming buffer for sequence lines.
1467
1468 void loadRefDescr(const char* fname) {
1469 if (inbuf==NULL) { GMALLOC(inbuf, inbuf_len); }
1470 FILE *f=fopen(fname, "rb");
1471 if (f==NULL) GError("Error opening exon file: %s\n",fname);
1472 char* line;
1473 int llen=0;
1474 off_t fpos;
1475 while ((line=fgetline(inbuf, inbuf_len, f, &fpos, &llen))!=NULL) {
1476 if (strlen(line)<=2) continue;
1477 int idlen=strcspn(line,"\t ");
1478 char* p=line+idlen;
1479 if (idlen<llen && idlen>0) {
1480 *p=0;
1481 p++;
1482 refdescr.Add(line, new GStr(p));
1483 }
1484 }
1485 }
1486
1487 GSeqTrack* findGSeqTrack(int gsid) {
1488 GSeqTrack f(gsid);
1489 int fidx=-1;
1490 if (gseqtracks.Found(&f,fidx))
1491 return gseqtracks[fidx];
1492 fidx=gseqtracks.Add(new GSeqTrack(gsid));
1493 return gseqtracks[fidx];
1494 }
1495
1496
1497
1498 GffObj* findRefMatch(GffObj& m, GLocus& rloc, int& ovlen) {
1499 ovlen=0;
1500 CTData* mdata=((CTData*)m.uptr);
1501 if (mdata->eqref!=NULL && ((CTData*)(mdata->eqref->uptr))->locus==&rloc) {
1502 mdata->eqref=mdata->ovls.First()->mrna; //this should be unnecessary
1503 //check it?
1504 return mdata->ovls.First()->mrna;
1505 }
1506 //if (rloc==NULL|| m==NULL) return NULL;
1507 GffObj* ret=NULL;
1508 for (int r=0;r<rloc.mrnas.Count();r++) {
1509 int olen=0;
1510 if (tMatch(m, *(rloc.mrnas[r]),olen, true)) { //return rloc->mrnas[r];
1511 /*
1512 if (ovlen<olen) {
1513 ovlen=olen;
1514 ret=rloc.mrnas[r]; //keep the longest matching ref
1515 //but this is unnecessary, there can be only one matching ref
1516 // because duplicate refs were discarded (unless -G was used)
1517 }
1518 */
1519 mdata->addOvl('=',rloc.mrnas[r], olen);
1520 ret=mdata->ovls.First()->mrna;
1521 //this must be called only for the head of an equivalency chain
1522 CTData* rdata=(CTData*)rloc.mrnas[r]->uptr;
1523 rdata->addOvl('=',&m,olen);
1524 //if (rdata->eqnext==NULL) rdata->eqnext=&m;
1525 }
1526 }
1527 if (ret!=NULL)
1528 mdata->eqref=ret;
1529 return ret;
1530 }
1531
1532
1533 void addXCons(GXLocus* xloc, GffObj* ref, char ovlcode, GffObj* tcons, CEqList* ts) {
1534 GXConsensus* c=new GXConsensus(tcons, ts, ref, ovlcode);
1535 //xloc->tcons.Add(c);
1536 //this will also check c against the other tcons for containment:
1537 xloc->addXCons(c);
1538 }
1539
1540
1541 const uint pre_mrna_threshold = 100;
1542
1543 char getOvlCode(GffObj& m, GffObj& r, int& ovlen) {
1544 ovlen=0;
1545 if (!m.overlap(r.start,r.end)) return 'u';
1546 int jmax=r.exons.Count()-1;
1547
1548 if (m.exons.Count()==1) { //single-exon transfrag
1549 GSeg mseg(m.start, m.end);
1550 if (jmax==0) { //also single-exon ref
1551 ovlen=mseg.overlapLen(r.start,r.end);
1552 int lmax=GMAX(r.covlen, m.covlen);
1553 if (ovlen >= lmax*0.8) return '='; //fuzz matching for single-exon transcripts: 80% of the longer one
1554 //if (m.covlen<=ovlen+12 && m.covlen<r.covlen) return 'c'; //contained
1555 if (m.covlen<r.covlen && ovlen >= m.covlen*0.8) return 'c';
1556 return 'o'; //just plain overlapping
1557 }
1558 //single-exon qry overlaping multi-exon ref
1559 for (int j=0;j<=jmax;j++) {
1560 //check if it's contained by an exon
1561 if (m.start>r.exons[j]->start-8 && m.end<r.exons[j]->end+8)
1562 return 'c';
1563 if (j==jmax) break;
1564 //check if it's contained by an intron
1565 if (m.end<r.exons[j+1]->start && m.start>r.exons[j]->end)
1566 return 'i';
1567 // check if it's a potential pre-mRNA transcript
1568 // (if overlaps an intron at least 10 bases)
1569 uint iovlen=mseg.overlapLen(r.exons[j]->end+1, r.exons[j+1]->start-1);
1570 if (iovlen>=10 && mseg.len()>iovlen+10) return 'e';
1571 }
1572 return 'o'; //plain overlap, uncategorized
1573 } //single-exon transfrag
1574 //-- from here on we have a multi-exon transfrag --
1575 // * check if contained by a ref intron
1576 for (int j=0;j<jmax;j++) {
1577 if (m.end<r.exons[j+1]->start && m.start>r.exons[j]->end)
1578 return 'i';
1579 }
1580 //> check if m's intron chain is a subset of r's intron chain
1581 int imax=m.exons.Count()-1;// imax>0 here
1582 if (m.exons[imax]->start<r.exons[0]->end ||
1583 r.exons[jmax]->start<m.exons[0]->end ) //intron chains do not overlap at all
1584 return 'o'; //but terminal exons do, otherwise we wouldn't be here
1585 int i=1; //index of exon to the right of current qry intron
1586 int j=1; //index of exon to the right of current ref intron
1587 //find first intron overlap
1588 while (i<=imax && j<=jmax) {
1589 if (r.exons[j]->start<m.exons[i-1]->end) { j++; continue; }
1590 if (m.exons[i]->start<r.exons[j-1]->end) { i++; continue; }
1591 break; //here we have an intron overlap
1592 }
1593 if (i>imax || j>jmax)
1594 return 'o'; //no initial intron overlap found
1595 //from here on we check all qry introns against ref introns
1596 bool jmatch=false; //true if at least a junction match is found
1597 bool icmatch=(i==1); //intron chain match - it will be updated as introns are checked
1598 //bool exovli=false; // if any terminal exon of qry extends into a ref intron
1599 int jmstart=j; //index of first intron overlap of reference
1600 int jmend=0; //index of last intron overlap of reference
1601 int imend=0; //index of last intron overlap of query
1602 //check for intron matches
1603 while (i<=imax && j<=jmax) {
1604 uint mstart=m.exons[i-1]->end;
1605 uint mend=m.exons[i]->start;
1606 uint rstart=r.exons[j-1]->end;
1607 uint rend=r.exons[j]->start;
1608 if (rend<mstart) { j++; icmatch=false; continue; } //skipping ref intron, no ichain match
1609 if (mend<rstart) { i++; icmatch=false; continue; } //skipping qry intron, no ichain match
1610 //overlapping introns here, test junction matching
1611 jmend=j; //keep track of last overlapping intron
1612 imend=i;
1613 bool smatch=(mstart==rstart);
1614 bool ematch=(mend==rend);
1615 if (smatch || ematch) jmatch=true;
1616 if (smatch && ematch) { i++; j++; } //perfect match for this intron
1617 else { //at least one junction doesn't match
1618 icmatch=false;
1619 if (mend>rend) j++; else i++;
1620 }
1621 } //while checking intron overlaps
1622
1623 if (icmatch && imend==imax) { // qry intron chain match
1624 if (jmstart==1 && jmend==jmax) return '='; //identical intron chains
1625 // -- qry intron chain is shorter than ref intron chain --
1626 int l_iovh=0; // overhang of leftmost q exon left boundary beyond the end of ref intron to the left
1627 int r_iovh=0; // same type of overhang through the ref intron on the right
1628 if (jmstart>1 && r.exons[jmstart-1]->start>m.start)
1629 l_iovh = r.exons[jmstart-1]->start - m.start;
1630 if (jmend<jmax && m.end > r.exons[jmend]->end)
1631 r_iovh = m.end - r.exons[jmend]->end;
1632 if (l_iovh<4 && r_iovh<4) return 'c';
1633 //TODO? check if any x_iovl>10 and return 'e' to signal an "unspliced intron" ?
1634 // or we can check if any of them are >= the length of the corresponding ref intron on that side
1635 return 'j';
1636 }
1637 /*
1638 if (icmatch && (jmax>=imax)) { //all qry introns match
1639 //but they may overlap
1640 // if ((lbound && lbound > m.exons[0]->start+10) ||
1641 // (j<=jmax && m.exons[i-1]->end > r.exons[j-1]->end+10)) return 'j';
1642 // return 'c';
1643 // }
1644 int code = 'c';
1645 if (lbound)
1646 {
1647 uint ref_boundary = lbound;
1648 uint cuff_boundary = m.exons[0]->start;
1649 if (ref_boundary > (cuff_boundary + pre_mrna_threshold)) // cuff extends a lot
1650 {
1651 code = 'j';
1652 }
1653 if (ref_boundary > cuff_boundary) // cuff extends just a bit into a ref intron
1654 {
1655 code = 'e';
1656 }
1657 }
1658 if (j <= jmax)
1659 {
1660 uint ref_boundary = r.exons[j-1]->end;
1661 uint cuff_boundary = m.exons[i-1]->end;
1662 if (cuff_boundary > (ref_boundary + pre_mrna_threshold)) // cuff extends a lot
1663 {
1664 code = 'j';
1665 }
1666 if (cuff_boundary > ref_boundary) // cuff extends just a bit into a ref intron
1667 {
1668 code = 'e';
1669 }
1670 }
1671 //if ((lbound && lbound > m.exons[0]->start+10) ||
1672 // (j<=jmax && m.exons[i-1]->end > r.exons[j-1]->end+10)) return 'j';
1673 return code;
1674 }
1675
1676 // if (!ichain) // first and last exons more or less match, but there's a different intron somewhere
1677 // {
1678 //
1679 // }
1680
1681 */
1682 return jmatch ? 'j':'o';
1683 }
1684
1685 char getRefOvl(GffObj& m, GLocus& rloc, GffObj*& rovl, int& ovlen) {
1686 rovl=NULL;
1687 ovlen=0;
1688 if (m.start>rloc.end || m.end<rloc.start) {
1689 //see if it's a polymerase run ?
1690 /*
1691 if ((m.strand=='+' && m.end<=rloc.end+polyrun_range) ||
1692 (m.strand=='-' && m.start>=rloc.start-polyrun_range)) {
1693 rovl=rloc.mrna_maxcov;
1694 ((CTData*)m.uptr)->addOvl('p',rloc.mrna_maxcov);
1695 return 'p';
1696 }
1697 */
1698 return 0; //unknown -> intergenic space
1699 }
1700 for (int i=0;i<rloc.mrnas.Count();i++) {
1701 GffObj* r=rloc.mrnas[i];
1702 int olen=0;
1703 char ovlcode=getOvlCode(m,*r,olen);
1704 if (ovlcode!=0) { //has some sort of "overlap" with r
1705 ((CTData*)m.uptr)->addOvl(ovlcode,r,olen);
1706 if (olen>ovlen) ovlen=olen;
1707 if (ovlcode=='c' || ovlcode=='=') //keep match/containment for each reference transcript
1708 ((CTData*)r->uptr)->addOvl(ovlcode,&m,olen);
1709 }
1710 }//for each ref in rloc
1711 // i,j,o
1712 return ((CTData*)m.uptr)->getBestCode();
1713 }
1714
1715 /*
1716 void findTMatches(GTrackLocus& loctrack, int qcount) {
1717 //perform an all vs. all ichain-match for all transcripts across all loctrack[i]->qloci
1718 for (int q=0;q<qcount-1;q++) { //for each qry dataset
1719 if (loctrack[q]==NULL) continue;
1720 for (int qi=0;qi<loctrack[q]->Count();qi++) { // for each transcript in q dataset
1721 GffObj* qi_t=loctrack[q]->Get(qi);
1722 CTData* qi_d=(CTData*)qi_t->uptr;
1723 if ((qi_d->eqdata & EQHEAD_TAG) !=0) { //this is set as an EQ chain head already
1724 //if (qi_t->exons.Count()>1)
1725 continue;
1726 }
1727 for (int n=q+1;n<qcount;n++) { // for every successor dataset
1728 if (loctrack[n]==NULL) continue;
1729 for (int ni=0;ni<loctrack[n]->Count();ni++) {
1730 GffObj* ni_t=loctrack[n]->Get(ni);
1731 CTData* ni_d=(CTData*)ni_t->uptr;
1732 //if (ni_d->eqdata!=0) continue; //already part of an EQ chain
1733 //single exon transfrags have special treatment:
1734 bool s_match=(ni_t->exons.Count()==1 && qi_t->exons.Count()==1);
1735 if (ni_d->eqdata!=0 && !s_match) continue; //already part of an EQ chain
1736 if (ni_d->eqnext!=NULL) continue;
1737 int ovlen=0;
1738
1739 if (qi_d->eqnext!=NULL) {
1740 if (!s_match) continue;
1741 //test all in the EQ list for a match
1742 bool matchFound=false;
1743 CTData* next_eq_d=qi_d;
1744 if (tMatch(*qi_t, *ni_t, ovlen, true)) {
1745 matchFound=true;
1746 }
1747 else {
1748 while (next_eq_d->eqnext!=NULL) {
1749 if (tMatch(*(next_eq_d->eqnext), *ni_t, ovlen, true)) {
1750 matchFound=true;
1751 break;
1752 }
1753 next_eq_d=(CTData*)(next_eq_d->eqnext->uptr);
1754 } //last in the chain
1755 }
1756 if (matchFound) {
1757 //add this to the end of the EQ chain instead
1758 next_eq_d=(CTData*)(qi_d->eqnext->uptr);
1759 while (next_eq_d->eqnext!=NULL) {
1760 next_eq_d=(CTData*)(next_eq_d->eqnext->uptr);
1761 } //last in the chain
1762 next_eq_d->eqnext=ni_t;
1763 ni_d->eqdata=n+1;
1764 ni_d->eqnext=NULL; ///TEST
1765 }
1766 }
1767 else {
1768 if (tMatch(*qi_t,*ni_t, ovlen, true)) {
1769 qi_d->eqnext=ni_t;
1770 ni_d->eqnext=NULL; ///TEST
1771 if (qi_d->eqdata == 0) {//only start of chain is tagged
1772 qi_d->eqdata = ((q+1) | EQHEAD_TAG);
1773 }
1774 ni_d->eqdata=n+1; //EQ chain member only marked with qry# (1-based)
1775 }
1776 } //multi-exon case
1777 } //for each transfrag in the next qry dataset
1778
1779 if (qi_d->eqnext!=NULL && qi_t->exons.Count()>1) break;
1780 //part of a chain already, skip other datasets
1781 } // for each successor dataset
1782 } //for each transcript in qry dataset
1783 } //for each qry dataset
1784 }
1785 */
1786
1787 void findTMatches(GTrackLocus& loctrack, int qcount) {
1788 //perform an all vs. all ichain-match for all transcripts across all loctrack[i]->qloci
1789 for (int q=0;q<qcount-1;q++) { //for each qry dataset
1790 if (loctrack[q]==NULL) continue;
1791 for (int qi=0;qi<loctrack[q]->Count();qi++) { // for each transcript in q dataset
1792 GffObj* qi_t=loctrack[q]->Get(qi);
1793 CTData* qi_d=(CTData*)qi_t->uptr;
1794 if (qi_d->eqlist!=NULL && qi_t->exons.Count()>1) {
1795 continue; //this is part of an EQ chain already
1796 }
1797 for (int n=q+1;n<qcount;n++) { // for every successor dataset
1798 if (loctrack[n]==NULL) continue;
1799 for (int ni=0;ni<loctrack[n]->Count();ni++) {
1800 GffObj* ni_t=loctrack[n]->Get(ni);
1801 CTData* ni_d=(CTData*)ni_t->uptr;
1802 bool singleExon=(ni_t->exons.Count()==1 && qi_t->exons.Count()==1);
1803 if (ni_d->eqlist!=NULL &&
1804 (ni_d->eqlist==qi_d->eqlist || !singleExon)) continue;
1805 int ovlen=0;
1806 if ((ni_d->eqlist==qi_d->eqlist && qi_d->eqlist!=NULL) ||
1807 tMatch(*qi_t,*ni_t, ovlen, singleExon)) {
1808 qi_d->joinEqList(ni_t);
1809 }
1810 }
1811 } // for each successor dataset
1812 } //for each transcript in qry dataset
1813 } //for each qry dataset
1814 }
1815
1816
1817 int cmpTData_qset(const pointer* p1, const pointer* p2) {
1818 CTData* d1=(CTData*)(((GffObj*)p1)->uptr);
1819 CTData* d2=(CTData*)(((GffObj*)p2)->uptr);
1820 return (d1->qset - d2->qset);
1821 }
1822
1823 void printITrack(FILE* ft, GList<GffObj>& mrnas, int qcount, int& cnum) {
1824 for (int i=0;i<mrnas.Count();i++) {
1825 GffObj& qt=*(mrnas[i]);
1826 CTData* qtdata=(CTData*)qt.uptr;
1827 int qfidx=qtdata->qset;
1828 char ovlcode=qtdata->classcode;
1829 //GList<GffObj> eqchain(false,false,false);
1830 CEqList* eqchain=qtdata->eqlist;
1831 GffObj* ref=NULL; //related ref -- it doesn't have to be fully matching
1832 GffObj* eqref=NULL; //fully ichain-matching ref
1833 GffObj* tcons=NULL; //"consensus" (largest) transcript for a clique
1834 int tmaxcov=0;
1835 //eqchain.Add(&qt);
1836 eqref=qtdata->eqref;
1837 if (qtdata->ovls.Count()>0 && qtdata->ovls[0]->mrna!=NULL) {
1838 //if it has ovlcode with a ref
1839 ref=qtdata->ovls[0]->mrna;
1840 //consistency check: qtdata->ovls[0]->code==ovlcode
1841 // -- let tcons be a transfrag, not a ref transcript
1842 //tcons=eqref;
1843 //if (tcons!=NULL) tmaxcov=tcons->covlen;
1844 }
1845 //chain pre-check
1846 if (tcons==NULL || mrnas[i]->covlen>tmaxcov) {
1847 tcons=mrnas[i];
1848 tmaxcov=tcons->covlen;
1849 }
1850 if (qtdata->isEqHead()) {//head of a equivalency chain
1851 //check if all transcripts in this chain have the same ovlcode
1852 for (int k=0;k<qtdata->eqlist->Count();k++) {
1853 GffObj* m=qtdata->eqlist->Get(k);
1854 if (m->covlen>tmaxcov) {
1855 tmaxcov=m->covlen;
1856 tcons=m;
1857 }
1858 if (ovlcode!='=' && ovlcode!='.' && ((CTData*)m->uptr)->getBestCode()!=ovlcode) {
1859 ovlcode='.'; //non-uniform ovlcode
1860 }
1861 }
1862 /*
1863 GffObj* m=mrnas[i];
1864 while (((CTData*)m->uptr)->eqnext!=NULL) {
1865 m=((CTData*)m->uptr)->eqnext;
1866 eqchain.Add(m);
1867 if (m->covlen>tmaxcov) {
1868 tmaxcov=m->covlen;
1869 tcons=m;
1870 }
1871 if (ovlcode!='=' && ovlcode!='.' && ((CTData*)m->uptr)->getBestCode()!=ovlcode) {
1872 ovlcode='.'; //non-uniform ovlcode
1873 //break;
1874 }
1875 } //while elements in chain
1876 */
1877
1878 }//chain check
1879 //if (ovlcode=='p') ref=NULL; //ignore polymerase runs?
1880 if (ovlcode==0 || ovlcode=='-') ovlcode = (ref==NULL) ? 'u' : '.';
1881 //-- print columns 1 and 2 as LOCUS_ID and TCONS_ID
1882 //bool chainHead=(qtdata->eqnext!=NULL && ((qtdata->eqdata & EQHEAD_TAG)!=0));
1883 bool chainHead=qtdata->isEqHead();
1884 //bool noChain=((qtdata->eqdata & EQCHAIN_TAGMASK)==0);
1885 bool noChain=(eqchain==NULL);
1886 if (chainHead || noChain) {
1887 cnum++;
1888 if (ft!=NULL) fprintf(ft,"%s_%08d\t",cprefix,cnum);
1889 GXLocus* xloc=qtdata->locus->xlocus;
1890 if (xloc!=NULL) {
1891 if (ft!=NULL) fprintf(ft, "XLOC_%06d\t",xloc->id);
1892 if (tcons->exons.Count()>1) {
1893 //! only multi-exon mRNAs are counted for multi-transcript xloci !
1894 xloc->num_mtcons++;
1895 if (xloc->num_mtcons==2)
1896 total_xloci_alt++;
1897 }
1898 }
1899 else {
1900 //should NEVER happen!
1901 int fidx=qtdata->qset;
1902 GError("Error: no XLocus created for transcript %s (file %s) [%d, %d], on %s%c:%d-%d\n", qt.getID(),
1903 qryfiles[qtdata->locus->qfidx]->chars(), qtdata->locus->qfidx, fidx, qt.getGSeqName(), qt.strand, qt.start, qt.end);
1904 }
1905 addXCons(xloc, ref, ovlcode, tcons, eqchain);
1906 } // if chain head or uniq entry (not part of a chain)
1907 if (ft==NULL) continue;
1908 if (chainHead) {
1909 //this is the start of a equivalence class as a printing chain
1910 if (ref!=NULL) fprintf(ft,"%s|%s\t%c", getGeneID(ref),ref->getID(), ovlcode);
1911 else fprintf(ft,"-\t%c", ovlcode);
1912 GffObj* m=mrnas[i];
1913 CTData* mdata=(CTData*)m->uptr;
1914
1915 int lastpq=-1;
1916 /*
1917 for (int ptab=mdata->qset-lastpq; ptab>0;ptab--)
1918 if (ptab>1) fprintf(ft,"\t-");
1919 else fprintf(ft,"\t");
1920 lastpq=mdata->qset;
1921 fprintf(ft,"q%d:%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|%d", lastpq+1, getGeneID(m), m->getID(),
1922 iround(m->gscore/10), mdata->FPKM, mdata->conf_lo, mdata->conf_hi, mdata->cov, m->covlen);
1923 //traverse linked list of matching transcripts
1924 while (mdata->eqnext!=NULL) {
1925 m=mdata->eqnext;
1926 mdata=(CTData*)m->uptr;
1927 for (int ptab=mdata->qset-lastpq;ptab>0;ptab--)
1928 if (ptab>1) fprintf(ft,"\t-");
1929 else fprintf(ft,"\t");
1930 lastpq = mdata->qset;
1931 fprintf(ft,"q%d:%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|%d", lastpq+1, getGeneID(m), m->getID(),
1932 iround(m->gscore/10), mdata->FPKM,mdata->conf_lo,mdata->conf_hi,mdata->cov, m->covlen);
1933 } //traverse and print row
1934 */
1935 eqchain->setUnique(false);
1936 eqchain->setSorted((GCompareProc*) cmpTData_qset);
1937
1938 for (int k=0;k<eqchain->Count();k++) {
1939 m=eqchain->Get(k);
1940 mdata=(CTData*)m->uptr;
1941 if (mdata->qset==lastpq) {
1942 //shouldn't happen, unless this input set is messed up (has duplicates/redundant transfrags)
1943 fprintf(ft,",%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|%d", getGeneID(m), m->getID(),
1944 iround(m->gscore/10), mdata->FPKM,mdata->conf_lo,mdata->conf_hi,mdata->cov, m->covlen);
1945 continue;
1946 }
1947 for (int ptab=mdata->qset-lastpq;ptab>0;ptab--)
1948 if (ptab>1) fprintf(ft,"\t-");
1949 else fprintf(ft,"\t");
1950 lastpq = mdata->qset;
1951 fprintf(ft,"q%d:%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|%d", lastpq+1, getGeneID(m), m->getID(),
1952 iround(m->gscore/10), mdata->FPKM,mdata->conf_lo,mdata->conf_hi,mdata->cov, m->covlen);
1953 }
1954 for (int ptab=qcount-lastpq-1;ptab>0;ptab--)
1955 fprintf(ft,"\t-");
1956 fprintf(ft,"\n");
1957 continue;
1958 } //start of eq class (printing chain)
1959
1960 if (eqchain!=NULL) continue; //part of a matching chain, dealt with previously
1961
1962 //--------- not in an ichain-matching class, print as singleton
1963
1964 if (ref!=NULL) fprintf(ft,"%s|%s\t%c",getGeneID(ref), ref->getID(), ovlcode);
1965 else fprintf(ft,"-\t%c",ovlcode);
1966 for (int ptab=qfidx;ptab>=0;ptab--)
1967 if (ptab>0) fprintf(ft,"\t-");
1968 else fprintf(ft,"\t");
1969 fprintf(ft,"q%d:%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|-",qfidx+1, getGeneID(qt), qt.getID(),iround(qt.gscore/10),
1970 qtdata->FPKM, qtdata->conf_lo,qtdata->conf_hi,qtdata->cov);
1971 for (int ptab=qcount-qfidx-1;ptab>0;ptab--)
1972 fprintf(ft,"\t-");
1973 fprintf(ft,"\n");
1974 } //for each transcript
1975 }
1976
1977
1978 void findTRMatch(GTrackLocus& loctrack, int qcount, GLocus& rloc) {
1979 //requires loctrack to be already populated with overlapping qloci by findTMatches()
1980 // which also found (and tagged) all matching qry transcripts
1981 for (int q=0;q<qcount;q++) { //for each qry dataset
1982 if (loctrack[q]==NULL) continue;
1983 for (int qi=0;qi<loctrack[q]->Count();qi++) { // for each transcript in q dataset
1984 //if (loctrack[q]->cl[qi]->exons.Count()<2) continue; //skip single-exon transcripts
1985 GffObj& qt=*(loctrack[q]->Get(qi));
1986 CTData* qtdata=(CTData*)qt.uptr;
1987 GffObj* rmatch=NULL; //== ref match for this row
1988 int rovlen=0;
1989 //if (qtdata->eqnext!=NULL && ((qtdata->eqdata & EQHEAD_TAG)!=0)) {
1990 if (qtdata->isEqHead()) {
1991 //EQ chain head -- transfrag equivalency list starts here
1992 if (qtdata->eqref==NULL) { //find rloc overlap
1993 if (qt.overlap(rloc.start, rloc.end)) {
1994 rmatch=findRefMatch(qt, rloc, rovlen);
1995 }
1996 } else rmatch=qtdata->eqref;
1997 if (rmatch!=NULL) {
1998 /*
1999 GffObj* m=loctrack[q]->Get(qi);
2000 //traverse linked list of matching transcripts
2001 while (((CTData*)m->uptr)->eqnext!=NULL) {
2002 m=((CTData*)m->uptr)->eqnext;
2003 if (rmatch!=NULL) {
2004 ((CTData*)m->uptr)->addOvl('=',rmatch,rovlen);
2005 }
2006 } //traverse qry data sets
2007 continue;
2008 }
2009 */
2010 for (int k=0;k<qtdata->eqlist->Count();k++) {
2011 GffObj* m=qtdata->eqlist->Get(k);
2012 ((CTData*)m->uptr)->addOvl('=',rmatch,rovlen);
2013 continue;
2014 }
2015 }
2016 //if (rmatch!=NULL) continue;
2017 } //equivalence class (chain of intron-matching)
2018 //if ((qtdata->eqdata & EQCHAIN_TAGMASK)!=0) continue; //part of a matching chain, dealt with previously
2019
2020 //--------- qry mrna not in a '=' matching clique
2021 if (qtdata->eqref==NULL) { //find any rloc overlap
2022 if (qt.overlap(rloc.start, rloc.end)) {
2023 rmatch=findRefMatch(qt, rloc, rovlen);
2024 if (rmatch==NULL) {
2025 //not an ichain match, look for other codes
2026 GffObj* rovl=NULL;
2027 int rovlen=0;
2028 //char ovlcode=
2029 getRefOvl(qt, rloc,rovl,rovlen);
2030 }
2031 }
2032 }
2033 else rmatch=qtdata->eqref;
2034 } //for each qry transcript
2035 }//for each qry dataset
2036 }
2037
2038
2039 bool inPolyRun(char strand, GffObj& m, GList<GLocus>* rloci, int& rlocidx) {
2040 //we are only here if there is no actual overlap between m and any locus in rloci
2041 if (rloci==NULL || rloci->Count()==0) return false; // || m.exons.Count()>1
2042 if (strand=='-') {
2043 rlocidx=qsearch_loci(m.end, *rloci);
2044 //returns index of locus starting just ABOVE m.end
2045 // or -1 if last locus start <= m.end
2046 GLocus* rloc=NULL;
2047 if (rlocidx<0) return false;
2048 while (rlocidx<rloci->Count()) {
2049 rloc=rloci->Get(rlocidx);
2050 if (rloc->start>m.end+polyrun_range) break;
2051 if (rloc->start+6>m.end) return true;
2052 rlocidx++;
2053 }
2054 }
2055 else { // strand == '+' (or '.' ?)
2056 rlocidx=qsearch_loci(m.end, *rloci);
2057 GLocus* rloc=NULL;
2058 //returns index of closest locus starting ABOVE m.end
2059 // or -1 if last locus start <= m.end
2060 if (rlocidx<0) rlocidx=rloci->Count(); //this may actually start below m.end
2061 while ((--rlocidx)>=0) {
2062 rloc=rloci->Get(rlocidx);
2063 if (m.start>rloc->start+GFF_MAX_LOCUS) break;
2064 if (m.start+6>rloc->end && m.start<rloc->end+polyrun_range) return true;
2065 }
2066 }
2067 return false;
2068 }
2069
2070 CTData* getBestOvl(GffObj& m) {
2071 //CTData* mdata=(CTData*)m.uptr;
2072 //return mdata->getBestCode();
2073 if ( ((CTData*)m.uptr)->ovls.Count()>0)
2074 return (CTData*)m.uptr;
2075 return NULL;
2076 }
2077
2078 void reclass_XStrand(GList<GffObj>& mrnas, GList<GLocus>* rloci) {
2079 //checking for relationship with ref transcripts on opposite strand
2080 if (rloci==NULL || rloci->Count()<1) return;
2081 int j=0;//current rloci index
2082 for (int i=0;i<mrnas.Count();i++) {
2083 GffObj& m=*mrnas[i];
2084 char ovlcode=((CTData*)m.uptr)->getBestCode();
2085 if (ovlcode>47 && strchr("=cjeo",ovlcode)!=NULL) continue;
2086 GLocus* rloc=rloci->Get(j);
2087 if (rloc->start>m.end) continue; //check next transfrag
2088 while (m.start>rloc->end && j+1<rloci->Count()) {
2089 j++;
2090 rloc=rloci->Get(j);
2091 }
2092 if (rloc->start>m.end) continue; //check next transfrag
2093 //m overlaps rloc:
2094 //check if m has a fuzzy intron overlap -> 's' (shadow, mapped on the wrong strand)
2095 // then if m is contained within an intron -> 'i'
2096 // otherwise it's just a plain cross-strand overlap: 'x'
2097 int jm=0;
2098 do { //while rloci overlap this transfrag (m)
2099 rloc=rloci->Get(j+jm);
2100 bool is_shadow=false;
2101 GffObj* sovl=NULL;
2102 bool is_intraintron=false;
2103 GffObj* iovl=NULL;
2104 if (rloc->introns.Count()>0) {
2105 for (int n=0;n<rloc->introns.Count();n++) {
2106 GISeg& rintron=rloc->introns[n];
2107 if (rintron.start>m.end) break;
2108 if (m.start>rintron.end) continue;
2109 //overlap between m and intron
2110 if (m.end<=rintron.end && m.start>=rintron.start) {
2111 is_intraintron=true;
2112 if (iovl==NULL || iovl->covlen<rintron.t->covlen) iovl=rintron.t;
2113 continue;
2114 }
2115 //check if any intron of m has a fuzz-match with rintron
2116 for (int e=1;e<m.exons.Count();e++) {
2117 GSeg mintron(m.exons[e-1]->end+1,m.exons[e]->start-1);
2118 if (rintron.coordMatch(&mintron,10)) {
2119 is_shadow=true;
2120 if (sovl==NULL || sovl->covlen<rintron.t->covlen) sovl=rintron.t;
2121 break;
2122 }
2123 } //for each m intron
2124 } //for each intron of rloc
2125 }//rloc has introns
2126 bool xcode=true;
2127 if (is_shadow) { ((CTData*)m.uptr)->addOvl('s', sovl); xcode=false; }
2128 // else
2129 if (ovlcode!='i' && is_intraintron) { ((CTData*)m.uptr)->addOvl('i', iovl); xcode=false; }
2130 if (xcode) {
2131 // just plain overlap, find the overlapping mrna in rloc
2132 GffObj* maxovl=NULL;
2133 int ovlen=0;
2134 for (int ri=0;ri<rloc->mrnas.Count();ri++) {
2135 int o=m.exonOverlapLen(*(rloc->mrnas[ri]));
2136 if (o>ovlen) {
2137 ovlen=o;
2138 maxovl=rloc->mrnas[ri];
2139 }
2140 }
2141 if (maxovl!=NULL) ((CTData*)m.uptr)->addOvl('x',maxovl);
2142 } //'x'
2143 jm++;
2144 } while (j+jm<rloci->Count() && rloci->Get(j+jm)->overlap(m));
2145 } //for each transfrag
2146 }
2147
2148 void reclass_mRNAs(char strand, GList<GffObj>& mrnas, GList<GLocus>* rloci, GFaSeqGet *faseq) {
2149 int rlocidx=-1;
2150 for (int i=0;i<mrnas.Count();i++) {
2151 GffObj& m=*mrnas[i];
2152 char ovlcode=((CTData*)m.uptr)->getBestCode();
2153 //if (ovlcode=='u' || ovlcode=='i' || ovlcode==0) {
2154 if (ovlcode=='u' || ovlcode<47) {
2155 //check for overlaps with ref transcripts on the other strand
2156 if (m.exons.Count()==1 && inPolyRun(strand, m, rloci, rlocidx)) {
2157 ((CTData*)m.uptr)->addOvl('p',rloci->Get(rlocidx)->mrna_maxcov);
2158 }
2159 else { //check for repeat content
2160 if (faseq!=NULL) {
2161 int seqlen;
2162 char* seq=m.getSpliced(faseq, false, &seqlen);
2163 //get percentage of lowercase
2164 int numlc=0;
2165 for (int c=0;c<seqlen;c++) if (seq[c]>='a') numlc++;
2166 if (numlc > seqlen/2)
2167 ((CTData*)m.uptr)->addOvl('r');
2168 GFREE(seq);
2169 }
2170 }
2171 } //for unassigned class
2172 }//for each mrna
2173
2174 }
2175
2176 void reclassLoci(char strand, GList<GLocus>& qloci, GList<GLocus>* rloci, GFaSeqGet *faseq) {
2177 for (int ql=0;ql<qloci.Count();ql++) {
2178 reclass_mRNAs(strand, qloci[ql]->mrnas, rloci, faseq);
2179 //find closest upstream ref locus for this q locus
2180 } //for each locus
2181 }
2182
2183 //for a single genomic sequence, all qry data and ref data is stored in gtrack
2184 //check for all 'u' transfrags if they are repeat ('r') or polymerase run 'p' or anything else
2185 void umrnaReclass(int qcount, GSeqTrack& gtrack, FILE** ftr, GFaSeqGet* faseq=NULL) {
2186 for (int q=0;q<qcount;q++) {
2187 if (gtrack.qdata[q]==NULL) continue; //no transcripts in this q dataset for this genomic seq
2188 reclassLoci('+', gtrack.qdata[q]->loci_f, gtrack.rloci_f, faseq);
2189 reclassLoci('-', gtrack.qdata[q]->loci_r, gtrack.rloci_r, faseq);
2190 reclass_mRNAs('+', gtrack.qdata[q]->umrnas, gtrack.rloci_f, faseq);
2191 reclass_mRNAs('-', gtrack.qdata[q]->umrnas, gtrack.rloci_r, faseq);
2192 //and also check for special cases with cross-strand overlaps:
2193 reclass_XStrand(gtrack.qdata[q]->mrnas_f, gtrack.rloci_r);
2194 reclass_XStrand(gtrack.qdata[q]->mrnas_r, gtrack.rloci_f);
2195 // print all tmap data here here:
2196 for (int i=0;i<gtrack.qdata[q]->tdata.Count();i++) {
2197 CTData* mdata=gtrack.qdata[q]->tdata[i];
2198 if (mdata->mrna==NULL) continue; //invalidated -- removed earlier
2199 //GLocus* rlocus=NULL;
2200 mdata->classcode='u';
2201 GffObj* ref=NULL;
2202 if (mdata->ovls.Count()>0) {
2203 mdata->classcode=mdata->ovls[0]->code;
2204 ref=mdata->ovls[0]->mrna;
2205 }
2206 //if (mdata->classcode<33) mdata->classcode='u';
2207 if (mdata->classcode<47) mdata->classcode='u'; // if 0, '-' or '.'
2208 if (tmapFiles) {
2209 char ref_match_len[2048];
2210 if (ref!=NULL) {
2211 sprintf(ref_match_len, "%d",ref->covlen);
2212 fprintf(ftr[q],"%s\t%s\t",getGeneID(ref),ref->getID());
2213 //rlocus=((CTData*)(ref->uptr))->locus;
2214 }
2215 else {
2216 fprintf(ftr[q],"-\t-\t");
2217 strcpy(ref_match_len, "-");
2218 }
2219 //fprintf(ftr[q],"%c\t%s\t%d\t%8.6f\t%8.6f\t%d\n", ovlcode, mdata->mrna->getID(),
2220 // iround(mdata->mrna->gscore/10), mdata->FPKM, mdata->cov, mdata->mrna->covlen);
2221 const char* mlocname = (mdata->locus!=NULL) ? mdata->locus->mrna_maxcov->getID() : mdata->mrna->getID();
2222 fprintf(ftr[q],"%c\t%s\t%s\t%d\t%8.6f\t%8.6f\t%8.6f\t%8.6f\t%d\t%s\t%s\n", mdata->classcode, getGeneID(mdata->mrna), mdata->mrna->getID(),
2223 iround(mdata->mrna->gscore/10), mdata->FPKM, mdata->conf_lo,mdata->conf_hi, mdata->cov, mdata->mrna->covlen, mlocname, ref_match_len);
2224 }
2225 } //for each tdata
2226 } //for each qdata
2227 }
2228
2229 void buildXLoci(GTrackLocus& loctrack, int qcount, GSeqTrack& gtrack, char strand,
2230 GList<GXLocus>* retxloci=NULL) {
2231 GList<GXLocus>* dest_xloci=NULL;
2232 GList<GXLocus> tmpxloci(true,false,true); //local set of newly created xloci
2233 GList<GXLocus>* xloci=&tmpxloci;
2234 if (strand=='+') {
2235 dest_xloci=& gtrack.xloci_f;
2236 }
2237 else if (strand=='-') {
2238 dest_xloci = & gtrack.xloci_r;
2239 }
2240 else dest_xloci= & gtrack.xloci_u;
2241
2242 if (retxloci==NULL) {
2243 //if no return set of build xloci was given
2244 //take it as a directive to work directly on the global xloci
2245 xloci=dest_xloci;
2246 dest_xloci=NULL;
2247 }
2248 for (int q=-1;q<qcount;q++) {
2249 GList<GLocus>* wrkloci=NULL;
2250 if (q<0) {
2251 if (loctrack.rloci.Count()==0) continue;
2252 //loci=new GList<GLocus>(true,false,false);
2253 //loci->Add(loctrack.rloc);
2254 wrkloci = &(loctrack.rloci);
2255 }
2256 else {
2257 if (loctrack[q]==NULL) continue;
2258 wrkloci = &(loctrack[q]->qloci);
2259 }
2260
2261 for (int t=0;t<wrkloci->Count();t++) {
2262 GLocus* loc=wrkloci->Get(t);
2263 int xfound=0; //count of parent xloci
2264 if (loc->xlocus!=NULL) continue; //already assigned a superlocus
2265 GArray<int> mrgxloci(true);
2266 for (int xl=0;xl<xloci->Count();xl++) {
2267 GXLocus& xloc=*(xloci->Get(xl));
2268 if (xloc.start>loc->end) {
2269 if (xloc.start-loc->end > GFF_MAX_LOCUS) break;
2270 continue;
2271 }
2272 if (loc->start>xloc.end) continue;
2273 if (xloc.add_Locus(loc)) {
2274 xfound++;
2275 mrgxloci.Add(xl);
2276 }
2277 } //for each existing Xlocus
2278 if (xfound==0) {
2279 xloci->Add(new GXLocus(loc));
2280 }
2281 else {
2282 int il=mrgxloci[0];
2283 GXLocus& xloc=*(xloci->Get(il));
2284 if (xfound>1) {
2285 for (int l=1;l<xfound;l++) {
2286 int mlidx=mrgxloci[l]-l+1;
2287 xloc.addMerge(*(xloci->Get(mlidx)));
2288 GXLocus* ldel=xloci->Get(mlidx);
2289 xloci->Delete(mlidx);
2290 if (retxloci!=NULL)
2291 delete ldel;
2292 }
2293 }
2294 //in case xloc.start was decreased, bubble-down until it's in the proper order
2295 while (il>0 && xloc<*(xloci->Get(il-1))) {
2296 il--;
2297 xloci->Swap(il,il+1);
2298 }
2299 } //at least one locus is being merged
2300 }//for each locus
2301 }//for each set of loci in the region (refs and each qry set)
2302 //-- add xloci to the global set of xloci unless retxloci was given,
2303 if (retxloci!=NULL) retxloci->Add(*xloci);
2304 else dest_xloci->Add(*xloci);
2305 }
2306
2307 void singleQData(GList<GLocus>& qloci, GList<GTrackLocus>& loctracks) {
2308 for (int i=0;i<qloci.Count();i++) {
2309 if (qloci[i]->t_ptr==NULL) {
2310 GTrackLocus* tloc=new GTrackLocus();
2311 tloc->addQLocus(qloci[i],0);
2312 loctracks.Add(tloc);
2313 }
2314 }
2315 }
2316
2317 void recheckUmrnas(GSeqData* gseqdata, GList<GffObj>& mrnas,
2318 GList<GLocus>& loci, GList<GLocus>& nloci, GList<GLocus>& oloci) {
2319 GList<GLocus> reassignedLocs(false,false);
2320 for (int u=0;u<gseqdata->umrnas.Count();u++) {
2321 for (int l=0;l<oloci.Count();l++) {
2322 if (gseqdata->umrnas[u]==NULL) break;
2323 if (gseqdata->umrnas[u]->end<oloci[l]->start) break; //try next umrna
2324 if (oloci[l]->end<gseqdata->umrnas[u]->start) continue; //try next locus
2325 if (gseqdata->umrnas[u]->strand=='+' || gseqdata->umrnas[u]->strand=='-') {
2326 gseqdata->umrnas.Forget(u);
2327 continue; //already reassigned earlier
2328 }
2329 //umrna overlaps locus region
2330 GffObj* umrna=gseqdata->umrnas[u];
2331 for (int m=0;m<oloci[l]->mrnas.Count();m++) {
2332 if (oloci[l]->mrnas[m]->exonOverlap(umrna)) {
2333 gseqdata->umrnas.Forget(u);
2334 CTData* umdata=((CTData*)umrna->uptr);
2335 //must be in a Loci anyway
2336 if (umdata==NULL || umdata->locus==NULL)
2337 GError("Error: no locus pointer for umrna %s!\n",umrna->getID());
2338 for (int i=0;i<umdata->locus->mrnas.Count();i++) {
2339 GffObj* um=umdata->locus->mrnas[i];
2340 um->strand=oloci[l]->mrnas[m]->strand;
2341 }
2342 reassignedLocs.Add(umdata->locus);
2343 break;
2344 }
2345 } //for each mrna in locus
2346 } //for each locus
2347 } //for each umrna
2348 if (reassignedLocs.Count()>0) {
2349 gseqdata->umrnas.Pack();
2350 gseqdata->nloci_u.setFreeItem(false);
2351 for (int i=0;i<reassignedLocs.Count();i++) {
2352 GLocus* loc=reassignedLocs[i];
2353 for (int m=0;m<loc->mrnas.Count();m++) {
2354 mrnas.Add(loc->mrnas[m]);
2355 }
2356 loci.Add(loc);
2357 nloci.Add(loc);
2358 gseqdata->nloci_u.Remove(loc);
2359 }
2360 gseqdata->nloci_u.setFreeItem(true);
2361 }
2362 }
2363
2364 void umrnasXStrand(GList<GXLocus>& xloci, GSeqTrack& gtrack) {
2365 //try to determine the strand of unoriented transfrags based on possible overlaps
2366 //with other, oriented transfrags
2367 for (int x=0;x<xloci.Count();x++) {
2368 if (xloci[x]->strand=='.') continue;
2369 if (xloci[x]->qloci.Count()==0) continue;
2370 //go through all qloci in this xlocus
2371 for (int l = 0; l < xloci[x]->qloci.Count(); l++) {
2372 char locstrand=xloci[x]->qloci[l]->mrna_maxcov->strand;
2373 if (locstrand=='.') {
2374 //this is a umrna cluster
2375 GLocus* qloc=xloci[x]->qloci[l];
2376 //we don't really need to update loci lists (loci_f, nloci_f etc.)
2377 /*
2378 if (xloci[x]->strand=='+') {
2379 }
2380 else { // - strand
2381 }
2382 */
2383 for (int i=0;i<qloc->mrnas.Count();i++) {
2384 qloc->mrnas[i]->strand=xloci[x]->strand;
2385 int uidx=gtrack.qdata[qloc->qfidx]->umrnas.IndexOf(qloc->mrnas[i]);
2386 if (uidx>=0) {
2387 gtrack.qdata[qloc->qfidx]->umrnas.Forget(uidx);
2388 gtrack.qdata[qloc->qfidx]->umrnas.Delete(uidx);
2389 if (xloci[x]->strand=='+')
2390 gtrack.qdata[qloc->qfidx]->mrnas_f.Add(qloc->mrnas[i]);
2391 else
2392 gtrack.qdata[qloc->qfidx]->mrnas_r.Add(qloc->mrnas[i]);
2393 }
2394 }
2395 } //unknown strand
2396 } //for each xloci[x].qloci (l)
2397
2398 } //for each xloci (x)
2399 }
2400
2401 //cluster loci across all datasets
2402 void xclusterLoci(int qcount, char strand, GSeqTrack& gtrack) {
2403 //gtrack holds data for all input qry datasets for a chromosome/contig
2404 //cluster QLoci
2405 GList<GTrackLocus> loctracks(true,true,false);
2406 //all vs all clustering across all qry data sets + ref
2407 //one-strand set of loci from all datasets + ref loci
2408 GList<GLocus>* wrkloci=NULL;
2409 //build xloci without references first
2410 //then add references only if they overlap an existing xloci
2411
2412 int nq=0;
2413 for (int q=0;q<=qcount+1;q++) {
2414 bool refcheck=false;
2415 if (q==qcount) { // check the unoriented loci for each query file
2416 while (nq<qcount &&
2417 (gtrack.qdata[nq]==NULL || gtrack.qdata[nq]->nloci_u.Count()==0))
2418 nq++; //skip query files with no unoriented loci
2419 if (nq<qcount) {
2420 wrkloci=&(gtrack.qdata[nq]->nloci_u);
2421 nq++;
2422 if (nq<qcount) q--; //so we can fetch the next nq in the next q cycle
2423 }
2424 else continue; //no more q files with unoriented loci
2425 }
2426 else if (q==qcount+1) { // check the reference loci
2427 if (strand=='+') wrkloci=gtrack.rloci_f;
2428 else wrkloci=gtrack.rloci_r;
2429
2430 if (wrkloci==NULL) break; //no ref loci here
2431 refcheck=true;
2432 }
2433 else {
2434 if (gtrack.qdata[q]==NULL) continue;
2435 if (strand=='+') wrkloci=&(gtrack.qdata[q]->loci_f);
2436 else wrkloci=&(gtrack.qdata[q]->loci_r);
2437 }
2438 // now do the all-vs-all clustering thing:
2439 for (int t=0;t<wrkloci->Count();t++) {
2440 GLocus* loc=wrkloci->Get(t);
2441 int xfound=0; //count of parent loctracks
2442 if (loc->t_ptr!=NULL) continue; //already assigned a loctrack
2443 GArray<int> mrgloctracks(true);
2444 for (int xl=0;xl<loctracks.Count();xl++) {
2445 GTrackLocus& trackloc=*loctracks[xl];
2446 if (trackloc.start>loc->end) break;
2447 if (loc->start>trackloc.end) continue;
2448 if (trackloc.add_Locus(loc)) {
2449 xfound++;
2450 mrgloctracks.Add(xl);
2451 }
2452 } //for each existing Xlocus
2453 if (xfound==0) {
2454 if (!refcheck) //we really don't care about ref-only clusters
2455 loctracks.Add(new GTrackLocus(loc));
2456 }
2457 else {
2458 int il=mrgloctracks[0];
2459 GTrackLocus& tloc=*(loctracks.Get(il));
2460 if (xfound>1) {
2461 for (int l=1;l<xfound;l++) {
2462 int mlidx=mrgloctracks[l]-l+1;
2463 tloc.addMerge(loctracks[mlidx], qcount, loc);
2464 loctracks.Delete(mlidx);
2465 }
2466 }
2467 //in case tloc.start was decreased, bubble-down 'til it's in the proper place
2468 while (il>0 && tloc<*(loctracks[il-1])) {
2469 il--;
2470 loctracks.Swap(il,il+1);
2471 }
2472 } //at least one locus found
2473 }//for each wrklocus
2474 } //for each set of loci (q)
2475 //loctracks is now set with all x-clusters on this strand
2476 for (int i=0;i<loctracks.Count();i++) {
2477 if (!loctracks[i]->hasQloci) continue; //we really don't care here about reference-only clusters
2478 GTrackLocus& loctrack=*loctracks[i];
2479 findTMatches(loctrack, qcount); //find matching transfrags in this xcluster
2480 for (int rl=0; rl < loctrack.rloci.Count(); rl++) {
2481 findTRMatch(loctrack, qcount, *(loctrack.rloci[rl]));
2482 //find matching reference annotation for this xcluster and assign class codes to transfrags
2483 }
2484 GList<GXLocus> xloci(false,false,false);
2485 buildXLoci(loctrack, qcount, gtrack, strand, &xloci);
2486 //the newly created xloci are in xloci
2487 umrnasXStrand(xloci, gtrack);
2488 //also merge these xloci into the global list of xloci
2489 for (int l=0; l < xloci.Count(); l++) {
2490 if (xloci[l]->strand=='+') {
2491 gtrack.xloci_f.Add(xloci[l]);
2492 }
2493 else if (xloci[l]->strand=='-') {
2494 gtrack.xloci_r.Add(xloci[l]);
2495 }
2496 else gtrack.xloci_u.Add(xloci[l]);
2497 }
2498 }//for each xcluster
2499 }
2500
2501
2502 void printRefMap(FILE** frs, int qcount, GList<GLocus>* rloci) {
2503 if (rloci==NULL) return;
2504
2505 for (int l=0;l<rloci->Count(); l++) {
2506 for (int r=0;r<rloci->Get(l)->mrnas.Count(); r++) {
2507 GffObj& ref = *(rloci->Get(l)->mrnas[r]);
2508 CTData* refdata = ((CTData*)ref.uptr);
2509 GStr* clist = new GStr[qcount];
2510 GStr* eqlist = new GStr[qcount];
2511 for (int i = 0; i<refdata->ovls.Count(); i++) {
2512 GffObj* m=refdata->ovls[i]->mrna;
2513 char ovlcode=refdata->ovls[i]->code;
2514 if (m==NULL) {
2515 GMessage("Warning: NULL mRNA found for ref %s with ovlcode '%c'\n",
2516 ref.getID(), refdata->ovls[i]->code);
2517 continue;
2518 }
2519 int qfidx = ((CTData*)m->uptr)->qset;
2520 if (ovlcode == '=') {
2521 eqlist[qfidx].append(getGeneID(m));
2522 eqlist[qfidx].append('|');
2523 eqlist[qfidx].append(m->getID());
2524 eqlist[qfidx].append(',');
2525 }
2526 else if (ovlcode == 'c') {
2527 clist[qfidx].append(getGeneID(m));
2528 clist[qfidx].append('|');
2529 clist[qfidx].append(m->getID());
2530 clist[qfidx].append(',');
2531 }
2532 }//for each reference overlap
2533 for (int q=0;q<qcount;q++) {
2534 if (!eqlist[q].is_empty()) {
2535 eqlist[q].trimR(',');
2536 fprintf(frs[q],"%s\t%s\t=\t%s\n", getGeneID(ref), ref.getID(),eqlist[q].chars());
2537 }
2538 if (!clist[q].is_empty()) {
2539 clist[q].trimR(',');
2540 fprintf(frs[q],"%s\t%s\tc\t%s\n",getGeneID(ref), ref.getID(),clist[q].chars());
2541 }
2542 }
2543 delete[] clist;
2544 delete[] eqlist;
2545 }// ref loop
2546 }//ref locus loop
2547 }
2548
2549
2550
2551 void trackGData(int qcount, GList<GSeqTrack>& gtracks, GStr& fbasename, FILE** ftr, FILE** frs) {
2552 FILE* f_ltrack=NULL;
2553 FILE* f_itrack=NULL;
2554 FILE* f_ctrack=NULL;
2555 FILE* f_xloci=NULL;
2556 int cnum=0; //consensus numbering for printITrack()
2557 GStr s=fbasename;
2558 //if (qcount>1 || generic_GFF) { //doesn't make much sense for only 1 query file
2559 s.append(".tracking");
2560 f_itrack=fopen(s.chars(),"w");
2561 if (f_itrack==NULL) GError("Error creating file %s !\n",s.chars());
2562 // }
2563 s=fbasename;
2564 s.append(".combined.gtf");
2565 f_ctrack=fopen(s.chars(),"w");
2566 if (f_ctrack==NULL) GError("Error creating file %s !\n",s.chars());
2567
2568 s=fbasename;
2569 s.append(".loci");
2570 f_xloci=fopen(s.chars(),"w");
2571 if (f_xloci==NULL) GError("Error creating file %s !\n",s.chars());
2572 for (int g=0;g<gtracks.Count();g++) { //for each genomic sequence
2573 GSeqTrack& gseqtrack=*gtracks[g];
2574
2575 xclusterLoci(qcount, '+', gseqtrack);
2576 xclusterLoci(qcount, '-', gseqtrack);
2577
2578 //count XLoci, setting their id
2579 numXLoci(gseqtrack.xloci_f, xlocnum);
2580 numXLoci(gseqtrack.xloci_r, xlocnum);
2581 numXLoci(gseqtrack.xloci_u, xlocnum);
2582 //transcript accounting: for all those transcripts with 'u' or 0 class code
2583 // we have to check for polymerase runs 'p' or repeats 'r'
2584
2585 GFaSeqGet *faseq=gfasta.fetch(gseqtrack.get_gseqid(), checkFasta);
2586
2587 umrnaReclass(qcount, gseqtrack, ftr, faseq);
2588
2589 // print transcript tracking (ichain_tracking)
2590 //if (qcount>1)
2591 for (int q=0;q<qcount;q++) {
2592 if (gseqtrack.qdata[q]==NULL) continue;
2593 printITrack(f_itrack, gseqtrack.qdata[q]->mrnas_f, qcount, cnum);
2594 printITrack(f_itrack, gseqtrack.qdata[q]->mrnas_r, qcount, cnum);
2595 //just for the sake of completion:
2596 printITrack(f_itrack, gseqtrack.qdata[q]->umrnas, qcount, cnum);
2597 }
2598 //print XLoci and XConsensi within each xlocus
2599 //also TSS clustering and protein ID assignment for XConsensi
2600 printXLoci(f_xloci, f_ctrack, qcount, gseqtrack.xloci_f, faseq);
2601 printXLoci(f_xloci, f_ctrack, qcount, gseqtrack.xloci_r, faseq);
2602 printXLoci(f_xloci, f_ctrack, qcount, gseqtrack.xloci_u, faseq);
2603 if (tmapFiles && haveRefs) {
2604 printRefMap(frs, qcount, gseqtrack.rloci_f);
2605 printRefMap(frs, qcount, gseqtrack.rloci_r);
2606 }
2607 delete faseq;
2608 }
2609 if (tmapFiles) {
2610 for (int q=0;q<qcount;q++) {
2611 fclose(ftr[q]);
2612 if (haveRefs) fclose(frs[q]);
2613 }
2614 }
2615 if (f_ltrack!=NULL) fclose(f_ltrack);
2616 if (f_itrack!=NULL) fclose(f_itrack);
2617 if (f_ctrack!=NULL) fclose(f_ctrack);
2618 if (f_xloci!=NULL) fclose(f_xloci);
2619
2620 }

Properties

Name Value
svn:executable *