ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/cuffcompare/cuffcompare.cpp
Revision: 118
Committed: Sun Nov 27 21:14:57 2011 UTC (8 years, 11 months ago) by gpertea
File size: 99456 byte(s)
Log Message:
fix locus Sn calculation

Line User Rev File contents
1 gpertea 20 #ifdef HAVE_CONFIG_H
2     #include <config.h>
3     #else
4     #define PACKAGE_VERSION "INTERNAL"
5     #define SVN_REVISION "SVN"
6     #endif
7    
8     #include "GArgs.h"
9     #include <ctype.h>
10     #include <errno.h>
11     #include "gtf_tracking.h"
12    
13     #ifdef HAVE_CONFIG_H
14     #include "update_check.h"
15     #endif
16    
17     #define USAGE "Usage:\n\
18     cuffcompare [-r <reference_mrna.gtf>] [-R] [-T] [-V] [-s <seq_path>] \n\
19     [-o <outprefix>] [-p <cprefix>] \n\
20     {-i <input_gtf_list> | <input1.gtf> [<input2.gtf> .. <inputN.gtf>]}\n\
21     \n\
22     Cuffcompare provides classification, reference annotation mapping and various\n\
23     statistics for Cufflinks transfrags.\n\
24     Cuffcompare clusters and tracks transfrags across multiple samples, writing\n\
25     matching transcripts (intron chains) into <outprefix>.tracking, and a GTF\n\
26     file <outprefix>.combined.gtf containing a nonredundant set of transcripts \n\
27     across all input files (with a single representative transfrag chosen\n\
28     for each clique of matching transfrags across samples).\n\
29     \n\
30     Options:\n\
31     -i provide a text file with a list of Cufflinks GTF files to process instead\n\
32     of expecting them as command line arguments (useful when a large number\n\
33     of GTF files should be processed)\n\
34     \n\
35     -r a set of known mRNAs to use as a reference for assessing \n\
36     the accuracy of mRNAs or gene models given in <input.gtf>\n\
37     \n\
38     -R for -r option, reduce the set of reference transcripts to \n\
39     only those found to overlap any of the input loci\n\
40     -M discard (ignore) single-exon transfrags and reference transcripts\n\
41     -N discard (ignore) single-exon reference transcripts\n\
42     \n\
43     -s <seq_path> can be a multi-fasta file with all the genomic sequences or \n\
44     a directory containing multiple single-fasta files (one file per contig);\n\
45     lower case bases will be used to classify input transcripts as repeats\n\
46     \n\
47     -d max distance (range) for grouping transcript start sites (100)\n\
48     -p the name prefix to use for consensus transcripts in the \n\
49     <outprefix>.combined.gtf file (default: 'TCONS')\n\
50     -C include the \"contained\" transcripts in the .combined.gtf file\n\
51     -G generic GFF input file(s) (do not assume Cufflinks GTF)\n\
52     -T do not generate .tmap and .refmap files for each input file\n\
53     -V verbose processing mode (showing all GFF parsing warnings)\n\
54     "
55     bool debug=false;
56     bool perContigStats=false; // -S to enable stats for every single contig
57     bool generic_GFF=false; //-G, don't assume Cufflinks GTF as input
58     bool showContained=false; // -C
59     bool reduceRefs=false;
60     bool checkFasta=false;
61     bool tmapFiles=true;
62     bool only_spliced_refs=false;
63     int debugCounter=0;
64    
65     int polyrun_range=2000; //polymerase run range 2KB
66     double scoreThreshold=0;
67     double exprThreshold=0;
68     char* cprefix=NULL;
69     FILE* ffasta=NULL; //genomic seq file
70     FILE *f_ref=NULL; //reference mRNA GFF, if provided
71     FILE* f_in=NULL; //sequentially, each input GFF file
72     FILE* f_out=NULL; //stdout if not provided
73     GFastaHandler gfasta;
74     int xlocnum=0;
75     int tsscl_num=0; //for tss cluster IDs
76     int protcl_num=0; //for "unique" protein IDs within TSS clusters
77     int tssDist=100;
78     //int total_tcons=0;
79     int total_xloci_alt=0;
80    
81     void openfwrite(FILE* &f, GArgs& args, char opt) {
82     GStr s=args.getOpt(opt);
83     if (!s.is_empty()) {
84     if (s=='-')
85     f=stdout;
86     else {
87     f=fopen(s,"w");
88     if (f==NULL) GError("Error creating file: %s\n", s.chars());
89     }
90     }
91     }
92    
93     //-- structure to keep track of data from multiple qry input files for a single genomic seq
94     class GSeqTrack {
95     int gseq_id;
96     public:
97     const char* gseq_name;
98     GList<GLocus>* rloci_f; //reference loci for this genomic sequence
99     GList<GLocus>* rloci_r;
100     GList<GXLocus> xloci_f; // extended super-loci across all qry datasets
101     GList<GXLocus> xloci_r; // extended super-loci across all qry datasets
102     GList<GXLocus> xloci_u; // extended super-loci across all qry datasets
103     GSeqData* qdata[MAX_QFILES]; //fixed order array with GSeqData for each qry input
104     //element in array is NULL if a qry file has no transcripts on this genomic sequence
105     int get_gseqid() { return gseq_id; }
106     GSeqTrack(int gid=-1):xloci_f(true,true,false),
107     xloci_r(true,true,false), xloci_u(true,true,false) {
108     gseq_id=gid;
109     if (gseq_id>=0) {
110     gseq_name=GffObj::names->gseqs.getName(gseq_id);
111     }
112     rloci_f=NULL;
113     rloci_r=NULL;
114     for (int i=0;i<MAX_QFILES;i++) qdata[i]=NULL;
115     }
116     bool operator==(GSeqTrack& d){
117     return (gseq_id==d.gseq_id);
118     }
119     bool operator>(GSeqTrack& d){
120     return (gseq_id>d.gseq_id);
121     }
122     bool operator<(GSeqTrack& d){
123     return (gseq_id<d.gseq_id);
124     }
125     };
126    
127     char* getFastaFile(int gseq_id);
128    
129     // ref globals
130     bool haveRefs=false; //true if a reference annotation (-r) is provide
131    
132     GList<GSeqData> ref_data(true,true,true); //list of reference mRNAs and loci data for each genomic seq
133     //each locus will keep track of any superloci which includes it, formed during the analysis
134    
135     void processLoci(GSeqData& seqdata, GSeqData* refdata=NULL, int qfidx=0);
136    
137     void reportStats(FILE* fout, const char* setname, GSuperLocus& stotal,
138     GSeqData* seqdata=NULL, GSeqData* refdata=NULL);
139    
140     GSeqData* getQryData(int gid, GList<GSeqData>& qdata);
141     void trackGData(int qcount, GList<GSeqTrack>& gtracks, GStr& fbasename, FILE** ftr, FILE** frs);
142    
143     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
144     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
145    
146     FILE* f_mintr=NULL; //missed ref introns
147    
148     bool multiexon_only=false;
149     bool multiexonrefs_only=false;
150    
151     GHash<GStr> refdescr;
152     void loadRefDescr(const char* fname);
153    
154     GList<GStr> qryfiles(false,true,false);
155    
156     //list of GSeqTrack data, sorted by gseq_id
157     GList<GSeqTrack> gseqtracks(true,true,true);
158     GSeqTrack* findGSeqTrack(int gsid);
159    
160    
161     int cmpGTrackByName(const pointer p1, const pointer p2) {
162     return strcmp(((GSeqTrack*)p1)->gseq_name, ((GSeqTrack*)p2)->gseq_name);
163     }
164    
165    
166     void show_usage() {
167     GMessage("cuffcompare v%s (%s)\n", PACKAGE_VERSION, SVN_REVISION);
168     GMessage( "-----------------------------\n");
169     GMessage("%s\n", USAGE);
170     }
171    
172     int main(int argc, char * const argv[]) {
173     GArgs args(argc, argv, "XDTMNVGSCKRLhp:c:d:s:i:n:r:o:");
174     int e;
175     if ((e=args.isError())>0) {
176     show_usage();
177     GMessage("Invalid argument: %s\n", argv[e]);
178     exit(1);
179     }
180     if (args.getOpt('h')!=NULL){
181     show_usage();
182     exit(1);
183     }
184     showContained=(args.getOpt('C')!=NULL);
185     debug=(args.getOpt('D')!=NULL);
186     tmapFiles=(args.getOpt('T')==NULL);
187     multiexon_only=(args.getOpt('M')!=NULL);
188     multiexonrefs_only=(args.getOpt('N')!=NULL);
189     perContigStats=(args.getOpt('S')!=NULL);
190     checkFasta=(args.getOpt('K')!=NULL);
191     gtf_tracking_verbose=((args.getOpt('V')!=NULL) || debug);
192     FILE* finlst=NULL;
193     GStr s=args.getOpt('i');
194     if (!s.is_empty()) {
195     if (s=='-')
196     finlst=stdin;
197     else {
198     finlst=fopen(s,"r");
199     if (finlst==NULL) GError("Error opening file: %s\n", s.chars());
200     }
201     }
202     int numqryfiles=0;
203     if (finlst) {
204     GLineReader* lr=new GLineReader(finlst);
205     char* l=NULL;
206     while ((l=lr->getLine())!=NULL) {
207     if (strlen(l)<2 || startsWith(l,"# ") || isspace(*l)) continue;
208     if (!fileExists(l)) GError("Error: cannot locate input file: %s\n", l);
209     qryfiles.Add(new GStr(l));
210     }
211     delete lr;
212     //if (qryfiles.Count()>10)
213     gtf_tracking_largeScale=true;
214     }
215     else {
216     numqryfiles=args.startNonOpt();
217     char *infile=NULL;
218     if (numqryfiles>0) {
219     while ((infile=args.nextNonOpt())!=NULL) {
220     if (!fileExists(infile)) GError("Error: cannot locate input file: %s\n", infile);
221     qryfiles.Add(new GStr(infile));
222     } //for each argument
223     }
224     }
225     numqryfiles=qryfiles.Count();
226     if (numqryfiles==0) {
227     show_usage();
228     exit(1);
229     }
230     if (numqryfiles>MAX_QFILES) {
231     GMessage("Error: too many input files (limit set to %d at compile time)\n",MAX_QFILES);
232     GMessage("(if you need to raise this limit set a new value for\nMAX_QFILES in gtf_tracking.h and recompile)\n");
233     exit(0x5000);
234     }
235     #ifdef HAVE_CONFIG_H
236     check_version(PACKAGE_VERSION);
237     #endif
238     gfasta.init(args.getOpt('s'));
239     // determine if -s points to a multi-fasta file or a directory
240     s=args.getOpt('c');
241     if (!s.is_empty()) scoreThreshold=s.asReal();
242     s=args.getOpt('p');
243     if (!s.is_empty()) cprefix=Gstrdup(s.chars());
244     else cprefix=Gstrdup("TCONS");
245     s=args.getOpt('e');
246     if (!s.is_empty()) exprThreshold=s.asReal();
247     s=args.getOpt('d');
248     if (!s.is_empty()) {
249     tssDist=s.asInt();
250     }
251    
252     s=args.getOpt('n');
253     if (!s.is_empty()) loadRefDescr(s.chars());
254     s=args.getOpt('r');
255     if (!s.is_empty()) {
256     f_ref=fopen(s,"r");
257     if (f_ref==NULL) GError("Error opening reference gff: %s\n",s.chars());
258     haveRefs=true;
259     if (gtf_tracking_verbose) GMessage("Loading reference transcripts..\n");
260     read_mRNAs(f_ref, ref_data, &ref_data, true, -1, s.chars(), (multiexonrefs_only || multiexon_only));
261     haveRefs=(ref_data.Count()>0);
262     reduceRefs=(args.getOpt('R')!=NULL);
263     if (gtf_tracking_verbose) GMessage("..reference annotation loaded\n");
264     }
265     bool discard_redundant=true; //discard redundant input transfrags
266     generic_GFF=args.getOpt('G');
267     if (generic_GFF) discard_redundant=false; //generic GTF, don't try to discard "redundant" transcripts
268     //if a full pathname is given
269     //the other common output files will still be created in the current directory:
270     // .loci, .tracking, .stats
271     GStr outbasename; //include path, if provided
272     GStr outprefix; //without path and/or extension
273     GStr outstats=args.getOpt('o');
274 gpertea 52 if (outstats.is_empty() || outstats=="-") {
275 gpertea 20 outstats="cuffcmp";
276     }
277     outbasename=outstats;
278     GStr outext(getFileExt(outstats.chars()));
279     if (outext.is_empty()) {
280     outext="stats";
281     outstats.append(".stats");
282     outbasename=outstats;
283     }
284     else outext.lower();
285     if (outext=="txt" || outext=="out" || outext=="stats" || outext=="summary") {
286     outbasename.cut(outbasename.length()-outext.length()-1);
287     }
288    
289     outprefix=outbasename;
290     int di=outprefix.rindex(CHPATHSEP);
291     if (di>=0) outprefix.cut(0,di+1);
292    
293     if (debug) { //create a few more files potentially useful for debugging
294     s=outbasename;
295     s.append(".missed_introns.gtf");
296     f_mintr=fopen(s.chars(),"w");
297     if (f_mintr==NULL) GError("Error creating file %s!\n",s.chars());
298     /*
299     s=outbasename;
300     s.append(".noTP_introns.gtf");
301     f_nintr=fopen(s.chars(),"w");
302     s=outbasename;
303     s.append(".wrong_Qintrons.gtf");
304     f_qintr=fopen(s.chars(),"w");
305     */
306     }
307    
308     f_out=fopen(outstats, "w");
309     if (f_out==NULL) GError("Error creating output file %s!\n", outstats.chars());
310     if (gtf_tracking_verbose) GMessage("Prefix for output files: %s\n", outprefix.chars());
311     fprintf(f_out, "# Cuffcompare v%s | Command line was:\n#", PACKAGE_VERSION);
312     for (int i=0;i<argc-1;i++)
313     fprintf(f_out, "%s ", argv[i]);
314     fprintf(f_out, "%s\n#\n", argv[argc-1]);
315     //int qfileno=0;
316     GList<GSeqData>** qrysdata=NULL;
317     FILE** tfiles=NULL;
318     FILE** rtfiles=NULL;
319     GMALLOC(qrysdata, numqryfiles*sizeof(GList<GSeqData>*));
320     if (tmapFiles) {
321     GMALLOC(tfiles, numqryfiles*sizeof(FILE*));
322     if (haveRefs) {
323     GMALLOC(rtfiles, numqryfiles*sizeof(FILE*));
324     }
325     }
326     for (int fi=0;fi<qryfiles.Count();fi++) {
327     GStr in_file(qryfiles[fi]->chars());
328     GStr infname(getFileName(qryfiles[fi]->chars())); //file name only
329     GStr indir(qryfiles[fi]->chars());
330     di=indir.rindex(CHPATHSEP);
331     if (di>=0) indir.cut(di+1); //directory path for this input file
332     else indir=""; //current directory
333    
334     if (debug || (gtf_tracking_verbose && !gtf_tracking_largeScale))
335     GMessage("Processing qfile #%d: %s\n",fi+1, in_file.chars());
336     if (in_file=="-") { f_in=stdin; in_file="stdin"; }
337     else {
338     f_in=fopen(in_file.chars(),"r");
339     if (f_in==NULL)
340     GError("Cannot open input file %s!\n",in_file.chars());
341     }
342     //f_in is the query gff file to process
343    
344     GStr sbase(indir);
345     sbase.append(outprefix);
346     sbase.append(".");
347     sbase.append(infname);
348     if (tmapFiles) {
349     //-- we should keep the infname path, otherwise the remaining file names
350     // may be the same and clobber each other
351     s=sbase;
352     s.append(".tmap");
353     tfiles[fi]=fopen(s.chars(),"w");
354     if (tfiles[fi]==NULL)
355     GError("Error creating file '%s'!\n",s.chars());
356     fprintf(tfiles[fi],"ref_gene_id\tref_id\tclass_code\tcuff_gene_id\tcuff_id\tFMI\tFPKM\tFPKM_conf_lo\tFPKM_conf_hi\tcov\tlen\tmajor_iso_id\tref_match_len\n");
357     if (haveRefs) {
358     s=sbase;
359     s.append(".refmap");
360     rtfiles[fi]=fopen(s.chars(),"w");
361     if (rtfiles[fi]==NULL)
362     GError("Error creating file '%s'!\n",s.chars());
363     fprintf(rtfiles[fi],"ref_gene_id\tref_id\tclass_code\tcuff_id_list\n");
364     }
365     }
366    
367     GList<GSeqData>* pdata=new GList<GSeqData>(true,true,true);
368     qrysdata[fi]=pdata;
369     if (gtf_tracking_verbose) GMessage("Loading transcripts from %s..\n",in_file.chars());
370     read_mRNAs(f_in, *pdata, &ref_data, discard_redundant, fi, in_file.chars(), multiexon_only);
371     GSuperLocus gstats;
372     GFaSeqGet *faseq=NULL;
373     for (int g=0;g<pdata->Count();g++) { //for each seqdata related to a genomic sequence
374     int gsid=pdata->Get(g)->get_gseqid();
375     GSeqData* refdata=getRefData(gsid, ref_data);//ref data for this contig
376     if (!gtf_tracking_largeScale)
377     processLoci(*(pdata->Get(g)), refdata, fi);
378     GSeqTrack* seqtrack=findGSeqTrack(gsid); //this will add a gseqtrack if it doesn't exist
379     // for gsid
380     if (refdata!=NULL) {
381     seqtrack->rloci_f=&(refdata->loci_f);
382     seqtrack->rloci_r=&(refdata->loci_r);
383     }
384     seqtrack->qdata[fi]=pdata->Get(g);
385     //will only gather data into stats if perContig==false
386     if (!gtf_tracking_largeScale) reportStats(f_out, getGSeqName(gsid), gstats,
387     pdata->Get(g), refdata);
388     if (faseq!=NULL) delete faseq;
389     } //for each genomic sequence data
390     //there could be genomic sequences with no qry transcripts
391     //but with reference transcripts
392     if (haveRefs && !reduceRefs && !gtf_tracking_largeScale) {
393     for (int r=0;r<ref_data.Count();r++) {
394     GSeqData* refdata=ref_data[r];
395     int gsid=refdata->get_gseqid();
396     if (getQryData(gsid, *pdata)==NULL) {
397     reportStats(f_out, getGSeqName(gsid), gstats, NULL, refdata);
398     }//completely missed all refdata on this contig
399     }
400     }
401     //now report the summary:
402     if (!gtf_tracking_largeScale) reportStats(f_out, in_file.chars(), gstats);
403     if (f_in!=stdin) fclose(f_in);
404     //qfileno++;
405     }//for each input file
406     if (f_mintr!=NULL) fclose(f_mintr);
407     gseqtracks.setSorted(&cmpGTrackByName);
408     if (gtf_tracking_verbose) GMessage("Tracking transcripts across %d query files..\n", numqryfiles);
409     trackGData(numqryfiles, gseqtracks, outbasename, tfiles, rtfiles);
410     fprintf(f_out, "\n Total union super-loci across all input datasets: %d \n", xlocnum);
411     if (numqryfiles>1) {
412     fprintf(f_out, " (%d multi-transcript, ~%.1f transcripts per locus)\n",
413     total_xloci_alt, ((double)(GXConsensus::count))/xlocnum);
414     }
415     if (gtf_tracking_verbose) GMessage("Cleaning up..\n");
416     GFREE(cprefix);
417     // clean up
418     for (int i=0;i<numqryfiles;i++) {
419     delete qrysdata[i];
420     }
421     GFREE(qrysdata);
422     GFREE(tfiles);
423     GFREE(rtfiles);
424     gseqtracks.Clear();
425     FRCLOSE(f_ref);
426     FWCLOSE(f_out);
427     if (gtf_tracking_verbose) GMessage("Done.\n");
428     ref_data.Clear();
429     //getchar();
430     } //main ends here
431    
432     void show_exons(FILE* f, GffObj& m) {
433     fprintf(f,"(");
434     int imax=m.exons.Count()-1;
435     for (int i=0;i<=imax;i++) {
436     if (i==imax) fprintf(f,"%d-%d)",m.exons[i]->start, m.exons[i]->end);
437     else fprintf(f,"%d-%d,",m.exons[i]->start, m.exons[i]->end);
438     }
439     }
440    
441     bool ichainMatch(GffObj* t, GffObj* r, bool& exonMatch, int fuzz=0) {
442     //t's intron chain is considered matching to reference r
443     //if r chain is the same or a subset of t's chain
444     exonMatch=false;
445     int imax=r->exons.Count()-1;
446     int jmax=t->exons.Count()-1;
447     if (imax==0 || jmax==0) { //single-exon mRNAs
448     if (imax!=jmax) return false;
449     exonMatch=r->exons[0]->coordMatch(t->exons[0],fuzz);
450     /*if (exonMatch) return true;
451     else return (r->exons[0]->start>=t->exons[0]->start &&
452     r->exons[0]->end<=t->exons[0]->end);*/
453     return exonMatch;
454     }
455    
456     if (r->exons[imax]->start<t->exons[0]->end ||
457     t->exons[jmax]->start<r->exons[0]->end ) //intron chains do not overlap at all
458     {
459     return false;
460     }
461     //check intron overlaps
462     int i=1;
463     int j=1;
464     bool exmism=false; //any mismatch
465     while (i<=imax && j<=jmax) {
466     uint rstart=r->exons[i-1]->end;
467     uint rend=r->exons[i]->start;
468     uint tstart=t->exons[j-1]->end;
469     uint tend=t->exons[j]->start;
470     if (tend<rstart) { j++; continue; }
471     if (rend<tstart) { i++; continue; }
472     break; //here we have an intron overlap
473     }
474     if (i>1 || i>imax || j>jmax) {
475     return false; //no intron overlaps found at all
476     //or first intron of ref not overlapping
477     }
478     //from now on we expect intron matches up to imax
479     if (i!=j || imax!=jmax) { exmism=true; if (fuzz==0) return false; }
480     for (;i<=imax && j<=jmax;i++,j++) {
481     if (abs((int)(r->exons[i-1]->end-t->exons[j-1]->end))>fuzz ||
482     abs((int)(r->exons[i]->start-t->exons[j]->start))>fuzz) {
483     return false; //just run away
484     }
485     }
486     //if we made it here, we have matching intron chains up to MIN(imax,jmax)
487     if (imax!=jmax) {
488     exmism=true;
489     if (jmax<imax) return false; // qry ichain included in ref ichain
490     else //ref ichain included in qry ichain
491     if (fuzz==0) return false;
492     }
493     if (exmism) {
494     //exonMatch=false; -- it's default
495     return true;
496     }
497     exonMatch = ( abs((int)(r->exons[0]->start-t->exons[0]->start))<=fuzz &&
498     abs((int)(r->exons[imax]->end-t->exons[jmax]->end))<=fuzz );
499     return true;
500     }
501    
502    
503     void compareLoci2R(GList<GLocus>& loci, GList<GSuperLocus>& cmpdata,
504     GList<GLocus>& refloci, int qfidx) {
505     cmpdata.Clear();//a new list of superloci will be built
506     if (refloci.Count()==0 || loci.Count()==0) return;
507     //reset cmpovl and stats
508     for (int i=0;i<refloci.Count();i++) refloci[i]->creset();
509     //find loci with overlapping refloci
510     //and store cmpovl links both ways for ALL loci and refloci on this strand
511     for (int l=0;l<loci.Count();l++) {
512     GLocus* locus=loci[l];
513     locus->creset();
514     for (int j=0;j<refloci.Count();j++) {
515     //if (refloci[j]->start>locus->end) break;
516     if (refloci[j]->start>locus->end) {
517     if (refloci[j]->start-locus->end > GFF_MAX_LOCUS) break;
518     continue;
519     }
520     if (locus->start>refloci[j]->end) continue;
521     // then we must have overlap here:
522     //if (locus->overlap(refloci[j]->start, refloci[j]->end)) {
523     locus->cmpovl.Add(refloci[j]);
524     refloci[j]->cmpovl.Add(locus);
525     //}
526     }//for each reflocus
527     } //for each locus
528    
529     //create corresponding "superloci" from transitive overlapping between loci and ref
530     for (int l=0;l<loci.Count();l++) {
531     if (loci[l]->v!=0) continue; //skip, already processed
532     GSuperLocus* super=new GSuperLocus();
533     super->qfidx=qfidx;
534     //try to find all other loci connected to this locus loci[l]
535     GList<GLocus> lstack(false,false,false); //traversal stack
536     lstack.Push(loci[l]);
537     while (lstack.Count()>0) {
538     GLocus* locus=lstack.Pop();
539     if (locus->v!=0 || locus->cmpovl.Count()==0) continue;
540     super->addQlocus(*locus);
541     locus->v=1;
542     for (int r=0;r<locus->cmpovl.Count();r++) {
543     GLocus* rloc=locus->cmpovl[r];
544     if (rloc->v==0) {
545     super->addRlocus(*rloc);
546     rloc->v=1;
547     for (int ll=0;ll<rloc->cmpovl.Count();ll++) {
548     if (rloc->cmpovl[ll]->v==0) lstack.Push(rloc->cmpovl[ll]);
549     }
550     }
551     } //for each overlapping reflocus
552     } //while linking
553    
554     if (super->qloci.Count()==0) {
555     delete super;
556     continue; //try next query loci
557     }
558     //--here we have a "superlocus" region data on both qry and ref
559     // -- analyze mexons matching (base level metrics)
560     cmpdata.Add(super);
561     //make each ref locus keep track of all superloci containing it
562     for (int rl=0;rl<super->rloci.Count();rl++) {
563     super->rloci[rl]->superlst->Add(super);
564     }
565     for (int x=0;x<super->rmexons.Count();x++) {
566     super->rbases_all += super->rmexons[x].end-super->rmexons[x].start+1;
567     }
568     for (int x=0;x<super->qmexons.Count();x++) {
569     super->qbases_all += super->qmexons[x].end-super->qmexons[x].start+1;
570     }
571     int i=0; //locus mexons
572     int j=0; //refmexons
573     while (i<super->qmexons.Count() && j<super->rmexons.Count()) {
574     uint istart=super->qmexons[i].start;
575     uint iend=super->qmexons[i].end;
576     uint jstart=super->rmexons[j].start;
577     uint jend=super->rmexons[j].end;
578     if (iend<jstart) { i++; continue; }
579     if (jend<istart) { j++; continue; }
580     //v--overlap here:
581     uint ovlstart = jstart>istart? jstart : istart;
582     uint ovlend = iend<jend ? iend : jend;
583     uint ovlen=ovlend-ovlstart+1;
584     super->baseTP+=ovlen; //qbases_cov
585     if (iend<jend) i++;
586     else j++;
587     } //while mexons ovl search
588     /* if (reduceRefs) {
589     super->baseFP=super->qbases_all-super->baseTP;
590     super->baseFN=super->rbases_all-super->baseTP;
591     }
592     */
593     // -- exon level comparison:
594     int* qexovl; //flags for qry exons with ref overlap
595     GCALLOC(qexovl,super->quexons.Count()*sizeof(int));
596     int* rexovl; //flags for ref exons with qry overlap
597     GCALLOC(rexovl,super->ruexons.Count()*sizeof(int));
598     for (int i=0;i<super->quexons.Count();i++) {
599     uint istart=super->quexons[i].start;
600     uint iend=super->quexons[i].end;
601     for (int j=0;j<super->ruexons.Count();j++) {
602     uint jstart=super->ruexons[j].start;
603     uint jend=super->ruexons[j].end;
604     if (iend<jstart) break;
605     if (jend<istart) continue;
606     //--- overlap here between quexons[i] and ruexons[j]
607     qexovl[i]++;
608     rexovl[j]++;
609     if (super->quexons[i].coordMatch(&super->ruexons[j],5)) {
610     super->exonATP++;
611     if (super->quexons[i].coordMatch(&super->ruexons[j])) {
612     super->exonTP++;
613     } //exact match
614     } //fuzzy match
615     } //ref uexon loop
616     } //qry uexon loop
617     super->m_exons=0; //ref exons with no query overlap
618     super->w_exons=0; //qry exons with no ref overlap
619     for (int x=0;x<super->quexons.Count();x++)
620     if (qexovl[x]==0) super->w_exons++;
621     for (int x=0;x<super->ruexons.Count();x++)
622     if (rexovl[x]==0) super->m_exons++;
623     GFREE(rexovl);
624     GFREE(qexovl);
625    
626     //-- intron level stats:
627     //query:
628     int* qinovl=NULL; //flags for qry introns with at least some ref overlap
629     int* qtpinovl=NULL; //flags for qry introns with perfect ref overlap
630     if (super->qintrons.Count()>0) {
631     GCALLOC(qinovl,super->qintrons.Count()*sizeof(int));
632     GCALLOC(qtpinovl,super->qintrons.Count()*sizeof(int));
633     }
634     //-- reference:
635     int* rinovl=NULL; //flags for ref introns with qry overlap
636     int* rtpinovl=NULL; //ref introns with perfect qry intron overlap
637     if (super->rintrons.Count()>0) {
638     GCALLOC(rinovl,super->rintrons.Count()*sizeof(int));
639     GCALLOC(rtpinovl,super->rintrons.Count()*sizeof(int));
640     }
641     for (int i=0;i<super->qintrons.Count();i++) {
642     uint istart=super->qintrons[i].start;
643     uint iend=super->qintrons[i].end;
644     for (int j=0;j<super->rintrons.Count();j++) {
645     uint jstart=super->rintrons[j].start;
646     uint jend=super->rintrons[j].end;
647     if (iend<jstart) break;
648     if (jend<istart) continue;
649     //--- overlap here between qintrons[i] and rintrons[j]
650     qinovl[i]++;
651     rinovl[j]++;
652     if (super->qintrons[i].coordMatch(&super->rintrons[j],5)) {
653     super->intronATP++;
654     if (super->qintrons[i].coordMatch(&super->rintrons[j])) {
655     super->intronTP++;
656     qtpinovl[i]++;
657     rtpinovl[j]++;
658     } //exact match
659     } //fuzzy match
660     } //ref intron loop
661     } //qry intron loop
662     super->m_introns=0; //ref introns with no query overlap
663     super->w_introns=0; //qry introns with no ref overlap
664     for (int x=0;x<super->qintrons.Count();x++) {
665     if (qinovl[x]==0) { super->w_introns++;
666     //qry introns with no ref intron overlap AT ALL
667     super->i_qwrong.Add(super->qintrons[x]);
668     }
669     else
670     if (qtpinovl[x]==0) {
671     super->i_qnotp.Add(super->qintrons[x]);
672     }
673     }
674     for (int x=0;x<super->rintrons.Count();x++) {
675     if (rinovl[x]==0) { //no intron overlap at all
676     super->m_introns++;
677     super->i_missed.Add(super->rintrons[x]);
678     }
679     else if (rtpinovl[x]==0) { //no perfect intron match
680     super->i_notp.Add(super->rintrons[x]);
681     }
682     }
683     GFREE(rinovl);
684     GFREE(rtpinovl);
685     GFREE(qinovl);
686     GFREE(qtpinovl);
687    
688     // ---- now intron-chain and transcript comparison
689     for (int i=0;i<super->qmrnas.Count();i++) {
690     uint istart=super->qmrnas[i]->exons.First()->start;
691     uint iend=super->qmrnas[i]->exons.Last()->end;
692     for (int j=0;j<super->rmrnas.Count();j++) {
693     uint jstart=super->rmrnas[j]->exons.First()->start;
694     uint jend=super->rmrnas[j]->exons.Last()->end;
695     if (iend<jstart) break;
696     if (jend<istart) continue;
697     //--- overlap here --
698     bool exonMatch=false;
699     if ((super->qmrnas[i]->udata & 3) > 1) continue; //already counted a ichainTP for this qry
700     if (ichainMatch(super->qmrnas[i],super->rmrnas[j],exonMatch, 5)) { //fuzzy match
701     GLocus* qlocus=((CTData*)super->qmrnas[i]->uptr)->locus;
702 gpertea 118 GLocus* rlocus=((CTData*)super->rmrnas[j]->uptr)->locus;
703 gpertea 20 if (super->qmrnas[i]->exons.Count()>1) {
704     super->ichainATP++;
705     qlocus->ichainATP++;
706 gpertea 118 rlocus->ichainATP++;
707 gpertea 20 }
708     if (exonMatch) {
709     super->mrnaATP++;
710     qlocus->mrnaATP++;
711 gpertea 118 rlocus->mrnaATP++;
712 gpertea 20 }
713     if (ichainMatch(super->qmrnas[i],super->rmrnas[j],exonMatch)) { //exact match
714     if (super->qmrnas[i]->exons.Count()>1) {
715     super->qmrnas[i]->udata|=1;
716     super->ichainTP++;
717     qlocus->ichainTP++;
718 gpertea 118 rlocus->ichainTP++;
719 gpertea 20 }
720     if (exonMatch) {
721     super->qmrnas[i]->udata|=2;
722     super->mrnaTP++;
723     qlocus->mrnaTP++;
724 gpertea 118 rlocus->mrnaTP++;
725 gpertea 20 }
726     } //exact match
727     } //fuzzy match
728     } //ref mrna loop
729     } //qry mrna loop
730     for (int ql=0;ql<super->qloci.Count();ql++) {
731     if (super->qloci[ql]->ichainTP+super->qloci[ql]->mrnaTP >0 )
732 gpertea 118 super->locusQTP++;
733     if (super->qloci[ql]->ichainATP+super->qloci[ql]->mrnaATP>0)
734     super->locusAQTP++;
735     }
736     for (int rl=0;rl<super->rloci.Count();rl++) {
737     if (super->rloci[rl]->ichainTP+super->rloci[rl]->mrnaTP >0 )
738 gpertea 20 super->locusTP++;
739 gpertea 118 if (super->rloci[rl]->ichainATP+super->rloci[rl]->mrnaATP>0)
740 gpertea 20 super->locusATP++;
741     }
742    
743     }//for each unlinked locus
744    
745     }
746    
747     //look for qry data for a specific genomic sequence
748     GSeqData* getQryData(int gid, GList<GSeqData>& qdata) {
749     int qi=-1;
750     GSeqData f(gid);
751     GSeqData* q=NULL;
752     if (qdata.Found(&f,qi))
753     q=qdata[qi];
754     return q;
755     }
756    
757     const char* findDescr(GffObj* gfobj) {
758     if (refdescr.Count()==0) return NULL;
759     GStr* s=refdescr.Find(gfobj->getID());
760     if (s==NULL) {
761     s=refdescr.Find(gfobj->getGeneName());
762     if (s==NULL) s=refdescr.Find(gfobj->getGeneID());
763     }
764     if (s!=NULL)
765     return s->chars();
766     return NULL;
767     }
768    
769     const char* getGeneID(GffObj* gfobj) {
770     //returns anything that might resemble a gene identifier for this transcript
771     //or, if everything fails, returns the transcript ID
772     const char* s=gfobj->getGeneName();
773     if (s) return s;
774     if ((s=gfobj->getGeneID())!=NULL) return s;
775     if ((s=gfobj->getAttr("Name"))!=NULL) return s;
776     return gfobj->getID();
777     }
778    
779     const char* getGeneID(GffObj& gfobj) {
780     return getGeneID(&gfobj);
781     }
782    
783     void writeLoci(FILE* f, GList<GLocus> & loci) {
784     for (int l=0;l<loci.Count();l++) {
785     GLocus& loc=*(loci[l]);
786     fprintf(f,"%s\t%s[%c]%d-%d\t", loc.mrna_maxcov->getID(),
787     loc.mrna_maxcov->getGSeqName(),
788     loc.mrna_maxcov->strand, loc.start,loc.end);
789     //now print all transcripts in this locus, comma delimited
790     int printfd=0;
791     for (int i=0;i<loc.mrnas.Count();i++) {
792     if (loc.mrnas[i]==loc.mrna_maxcov) continue;
793     if (printfd==0) fprintf(f,"%s",loc.mrnas[i]->getID());
794     else fprintf(f,",%s",loc.mrnas[i]->getID());
795     printfd++;
796     }
797     const char* rdescr=findDescr(loc.mrna_maxcov);
798     if (rdescr==NULL) fprintf(f,"\t\n");
799     else fprintf(f,"\t%s\n",rdescr);
800     }
801     }
802    
803     void printXQ1(FILE* f, int qidx, GList<GLocus>& qloci) {
804     int printfd=0;
805     //print
806     for (int i=0;i<qloci.Count();i++) {
807     if (qloci[i]->qfidx!=qidx) continue;
808     for (int j=0;j<qloci[i]->mrnas.Count();j++) {
809     if (printfd==0) fprintf(f,"%s",qloci[i]->mrnas[j]->getID());
810     else fprintf(f,",%s",qloci[i]->mrnas[j]->getID());
811     printfd++;
812     }
813     }
814     if (printfd==0) fprintf(f,"-");
815     }
816    
817     void numXLoci(GList<GXLocus>& xloci, int& last_id) {
818     for (int l=0;l<xloci.Count();l++) {
819     if (xloci[l]->qloci.Count()==0) continue; //we never print ref-only xloci
820     last_id++;
821     xloci[l]->id=last_id;
822     }
823     }
824    
825    
826     class GProtCl {
827     public:
828     GList<GXConsensus> protcl;
829     GProtCl(GXConsensus* c=NULL):protcl(true,false,false) {
830     if (c!=NULL)
831     protcl.Add(c);
832     }
833     bool add_Pcons(GXConsensus* c) {
834     if (c==NULL || c->aalen==0) return false;
835     if (protcl.Count()==0) {
836     protcl.Add(c);
837     return true;
838     }
839     if (protcl[0]->aalen!=c->aalen) return false;
840     if (strcmp(protcl[0]->aa,c->aa)!=0) return false;
841     protcl.Add(c);
842     return true;
843     }
844    
845     void addMerge(GProtCl& pcl, GXConsensus* pclnk) {
846     for (int i=0;i<pcl.protcl.Count();i++) {
847     if (pcl.protcl[i]!=pclnk) {
848     protcl.Add(pcl.protcl[i]);
849     }
850     }
851     }
852    
853     int aalen() {
854     if (protcl.Count()==0) return 0;
855     return protcl[0]->aalen;
856     }
857     bool operator==(GProtCl& cl) {
858     return this==&cl;
859     }
860     bool operator>(GProtCl& cl) {
861     return (this>&cl);
862     }
863     bool operator<(GProtCl& cl) {
864     return (this<&cl);
865     }
866     };
867    
868     class GTssCl:public GSeg { //experiment cluster of ref loci (isoforms)
869     public:
870     uint fstart; //lowest coordinate of the first exon
871     uint fend; //highest coordinate of the first exon
872     GList<GXConsensus> tsscl;
873     GTssCl(GXConsensus* c=NULL):tsscl(true,false,false) {
874     start=0;
875     end=0;
876     fstart=0;
877     fend=0;
878     if (c!=NULL) addFirst(c);
879     }
880    
881     void addFirst(GXConsensus* c) {
882     tsscl.Add(c);
883     start=c->start;
884     end=c->end;
885     GffExon* fexon=(c->tcons->strand=='-') ? c->tcons->exons.Last() :
886     c->tcons->exons.First();
887     fstart=fexon->start;
888     fend=fexon->end;
889     }
890     bool add_Xcons(GXConsensus* c) {
891     if (tsscl.Count()==0) {
892     addFirst(c);
893     return true;
894     }
895     //check if it can be added to existing xconsensi
896     uint nfend=0;
897     uint nfstart=0;
898 gpertea 72 /*
899 gpertea 21 if (tsscl.Get(0)->tcons->getGeneID()!=NULL &&
900     c->tcons->getGeneID()!=NULL &&
901     strcmp(tsscl.Get(0)->tcons->getGeneID(), c->tcons->getGeneID()))
902     //don't tss cluster if they don't have the same GeneID (?)
903 gpertea 27 //FIXME: we might not want this if input files are not from Cufflinks
904     // and they could simply lack proper GeneID
905 gpertea 21 return false;
906 gpertea 72 */
907 gpertea 20 if (c->tcons->strand=='-') {
908     //no, the first exons don't have to overlap
909     //if (!c->tcons->exons.Last()->overlap(fstart,fend)) return false;
910     nfstart=c->tcons->exons.Last()->start;
911     nfend=c->tcons->exons.Last()->end;
912     //proximity check for the transcript start:
913 gpertea 21 if (nfend>fend+tssDist || fend>nfend+tssDist)
914     return false;
915 gpertea 20 }
916     else {
917     //if (!c->tcons->exons.First()->overlap(fstart,fend)) return false;
918     nfstart=c->tcons->exons.First()->start;
919     nfend=c->tcons->exons.First()->end;
920 gpertea 21 if (nfstart>fstart+tssDist || fstart>nfstart+tssDist)
921     return false;
922 gpertea 20 }
923     // -- if we are here, we can add to tss cluster
924    
925     tsscl.Add(c);
926     if (fstart>nfstart) fstart=nfstart;
927     if (fend<nfend) fend=nfend;
928     if (start>c->start) start=c->start;
929     if (end<c->end) end=c->end;
930     return true;
931     }
932    
933     void addMerge(GTssCl& cl, GXConsensus* clnk) {
934     for (int i=0;i<cl.tsscl.Count();i++) {
935     if (cl.tsscl[i]==clnk) continue;
936     tsscl.Add(cl.tsscl[i]);
937     }
938     if (fstart>cl.fstart) fstart=cl.fstart;
939     if (fend<cl.fend) fend=cl.fend;
940     if (start>cl.start) start=cl.start;
941     if (end<cl.end) end=cl.end;
942     }
943     };
944     /*
945     class IntArray { //two dimensional int array
946     int* mem;
947     int xsize;
948     int ysize;
949     public:
950     IntArray(int xlen, int ylen) {
951     xsize=xlen;
952     ysize=ylen;
953     GMALLOC(mem, xsize*ysize*sizeof(int));
954     }
955     ~IntArray() {
956     GFREE(mem);
957     }
958     int& data(int x, int y) {
959     return mem[y*xsize+x];
960     }
961     };
962    
963     int aa_diff(GXConsensus* c1, GXConsensus* c2) {
964     int diflen=abs(c1->aalen-c2->aalen);
965     if (diflen>=6) return diflen;
966     //obvious case: same CDS
967     if (diflen==0 && strcmp(c1->aa, c2->aa)==0) return 0;
968     //simple edit distance calculation
969     IntArray dist(c1->aalen+1, c2->aalen+1);
970     for (int i=0;i<=c1->aalen;i++) {
971     dist.data(i,0) = i;
972     }
973     for (int j = 0; j <= c2->aalen; j++) {
974     dist.data(0,j) = j;
975     }
976     for (int i = 1; i <= c1->aalen; i++)
977     for (int j = 1; j <= c2->aalen; j++) {
978     dist.data(i,j) = GMIN3( dist.data(i-1,j)+1,
979     dist.data(i,j-1)+1,
980     dist.data(i-1,j-1)+((c1->aa[i-1] == c2->aa[j-1]) ? 0 : 1) );
981     }
982     int r=dist.data(c1->aalen,c2->aalen);
983     return r;
984     }
985     */
986     void printConsGTF(FILE* fc, GXConsensus* xc, int xlocnum) {
987     for (int i=0;i<xc->tcons->exons.Count();i++) {
988     fprintf(fc,
989     "%s\t%s\texon\t%d\t%d\t.\t%c\t.\tgene_id \"XLOC_%06d\"; transcript_id \"%s_%08d\"; exon_number \"%d\";",
990     xc->tcons->getGSeqName(),xc->tcons->getTrackName(),xc->tcons->exons[i]->start, xc->tcons->exons[i]->end, xc->tcons->strand,
991     xlocnum, cprefix, xc->id, i+1);
992     //if (i==0) {
993     const char* gene_name=NULL;
994     if (xc->ref) {
995     gene_name=xc->ref->getGeneName();
996     if (gene_name==NULL) gene_name=xc->ref->getGeneID();
997     if (gene_name) {
998     fprintf (fc, " gene_name \"%s\";", gene_name);
999     }
1000     }
1001     if (!haveRefs) {
1002     if (gene_name==NULL && xc->tcons->getGeneName())
1003     fprintf (fc, " gene_name \"%s\";", xc->tcons->getGeneName());
1004     char* s=xc->tcons->getAttr("nearest_ref", true);
1005     if (s) fprintf(fc, " nearest_ref \"%s\";",s);
1006     s=xc->tcons->getAttr("class_code", true);
1007     if (s) fprintf(fc, " class_code \"%s\";", s);
1008     }
1009     fprintf(fc, " oId \"%s\";",xc->tcons->getID());
1010     if (xc->contained) {
1011     fprintf(fc, " contained_in \"%s_%08d\";", cprefix, xc->contained->id);
1012     }
1013     if (haveRefs) {
1014     if (xc->ref!=NULL)
1015     fprintf(fc, " nearest_ref \"%s\";",xc->ref->getID());
1016     fprintf(fc, " class_code \"%c\";",xc->refcode ? xc->refcode : '.');
1017     }
1018     if (xc->tss_id>0) fprintf(fc, " tss_id \"TSS%d\";",xc->tss_id);
1019     if (xc->p_id>0) fprintf(fc, " p_id \"P%d\";",xc->p_id);
1020     // }
1021     fprintf(fc,"\n");
1022     }
1023     }
1024    
1025 gpertea 21 void tssCluster(GXLocus& xloc)
1026     {
1027     GList<GTssCl> xpcls(true,true,false);
1028     for (int i=0;i<xloc.tcons.Count();i++)
1029     {
1030     GXConsensus* c=xloc.tcons[i];
1031     //if (c->tcons->exons.Count()<2) continue; //should we skip single-exon transcripts ??
1032     GArray<int> mrgloci(true);
1033     int lfound=0;
1034     for (int l=0;l<xpcls.Count();l++)
1035     {
1036     if (xpcls[l]->end<c->tcons->exons.First()->start) continue;
1037     if (xpcls[l]->start>c->tcons->exons.Last()->end) break;
1038     if (xpcls[l]->add_Xcons(c))
1039     {
1040     lfound++;
1041     mrgloci.Add(l);
1042    
1043 gpertea 20 }
1044 gpertea 21
1045 gpertea 20 } // for each xpcluster
1046 gpertea 21 if (lfound==0)
1047     {
1048     //create a xpcl with only this xconsensus
1049     xpcls.Add(new GTssCl(c));
1050    
1051 gpertea 20 }
1052 gpertea 21 else if (lfound>1)
1053     {
1054     for (int l=1;l<lfound;l++)
1055     {
1056     int mlidx=mrgloci[l]-l+1;
1057     xpcls[mrgloci[0]]->addMerge(*xpcls[mlidx], c);
1058     xpcls.Delete(mlidx);
1059     }
1060 gpertea 20 }
1061 gpertea 21
1062 gpertea 20 }//for each xconsensus in this xlocus
1063 gpertea 21 for (int l=0;l<xpcls.Count();l++)
1064     {
1065     //if (xpcls[l]->tsscl.Count()<2) continue;
1066     tsscl_num++;
1067     for (int i=0;i<xpcls[l]->tsscl.Count();i++)
1068     xpcls[l]->tsscl[i]->tss_id=tsscl_num;
1069     //processTssCl(xcds_num, xpcls[l], faseq);
1070     }
1071 gpertea 20 }
1072    
1073     void protCluster(GXLocus& xloc, GFaSeqGet *faseq) {
1074     if (!faseq)
1075     return;
1076     GList<GProtCl> xpcls(true,true,false);
1077     for (int i=0;i<xloc.tcons.Count();i++) {
1078     GXConsensus* c=xloc.tcons[i];
1079     if (c->ref==NULL || c->ref->CDstart==0) continue; //no ref or CDS available
1080     if (c->refcode!='=') continue;
1081     //get the CDS translation here
1082     if (c->aa==NULL) {
1083     c->aa=c->ref->getSplicedTr(faseq, true, &c->aalen);
1084     if (c->aalen>0 && c->aa[c->aalen-1]=='.') {
1085     //discard the final stop codon
1086     c->aalen--;
1087     c->aa[c->aalen]=0;
1088     }
1089     }
1090     GArray<int> mrgloci(true);
1091     int lfound=0;
1092     for (int l=0;l<xpcls.Count();l++) {
1093     if (xpcls[l]->aalen()!=c->aalen) continue;
1094     if (xpcls[l]->add_Pcons(c)) {
1095     lfound++;
1096     mrgloci.Add(l);
1097     }
1098     } // for each xpcluster
1099     if (lfound==0) {
1100     //create a xpcl with only this xconsensus
1101     xpcls.Add(new GProtCl(c));
1102     }
1103     else if (lfound>1) {
1104     for (int l=1;l<lfound;l++) {
1105     int mlidx=mrgloci[l]-l+1;
1106     xpcls[mrgloci[0]]->addMerge(*xpcls[mlidx], c);
1107     xpcls.Delete(mlidx);
1108     }
1109     }
1110     }//for each xconsensus in this xlocus
1111     for (int l=0;l<xpcls.Count();l++) {
1112     protcl_num++;
1113     for (int i=0;i<xpcls[l]->protcl.Count();i++)
1114     xpcls[l]->protcl[i]->p_id=protcl_num;
1115     }
1116     for (int i=0;i<xloc.tcons.Count();i++) {
1117     GXConsensus* c=xloc.tcons[i];
1118     if (c->aa!=NULL) { GFREE(c->aa); }
1119     }
1120     }
1121    
1122     void printXLoci(FILE* f, FILE* fc, int qcount, GList<GXLocus>& xloci, GFaSeqGet *faseq) {
1123     for (int l=0;l<xloci.Count();l++) {
1124     if (xloci[l]->qloci.Count()==0) continue;
1125     GXLocus& xloc=*(xloci[l]);
1126     xloc.checkContainment();
1127     tssCluster(xloc);//cluster and assign tss_id and cds_id to each xconsensus in xloc
1128     protCluster(xloc,faseq);
1129     for (int c=0;c<xloc.tcons.Count();c++) {
1130     if (showContained || xloc.tcons[c]->contained==NULL)
1131     printConsGTF(fc,xloc.tcons[c],xloc.id);
1132     }
1133     fprintf(f,"XLOC_%06d\t%s[%c]%d-%d\t", xloc.id,
1134     xloc.qloci[0]->mrna_maxcov->getGSeqName(),
1135     xloc.strand, xloc.start,xloc.end);
1136     //now print all transcripts in this locus, comma delimited
1137     //first, ref loci, if any
1138     int printfd=0;
1139     if (xloc.rloci.Count()>0) {
1140     for (int i=0;i<xloc.rloci.Count();i++) {
1141     for (int j=0;j<xloc.rloci[i]->mrnas.Count();j++) {
1142     if (printfd==0) fprintf(f,"%s|%s",getGeneID(xloc.rloci[i]->mrnas[j]),
1143     xloc.rloci[i]->mrnas[j]->getID());
1144     else fprintf(f,",%s|%s",getGeneID(xloc.rloci[i]->mrnas[j]),
1145     xloc.rloci[i]->mrnas[j]->getID());
1146     printfd++;
1147     }
1148     }
1149     }
1150     else {
1151     fprintf(f,"-");
1152     }
1153     //second, all the cufflinks transcripts
1154     for (int qi=0;qi<qcount;qi++) {
1155     fprintf(f,"\t");
1156     printXQ1(f,qi,xloc.qloci);
1157     }
1158     fprintf(f,"\n");
1159     }
1160     }
1161    
1162     void writeIntron(FILE* f, char strand, GFaSeqGet* faseq, GSeg& iseg,
1163     GList<GffObj>& mrnas, bool wrong=false) {
1164     //find a ref mrna having this intron
1165     GffObj* rm=NULL;
1166     for (int i=0;i<mrnas.Count();i++) {
1167     GffObj* m=mrnas[i];
1168     if (m->start>iseg.end) break;
1169     if (m->end<iseg.start) continue;
1170     //intron coords overlaps mrna region
1171     for (int j=1;j<m->exons.Count();j++) {
1172     if (iseg.start==m->exons[j-1]->end+1 &&
1173     iseg.end==m->exons[j]->start-1) { rm=m; break; } //match found
1174     }//for each intron
1175     if (rm!=NULL) break;
1176     } //for each ref mrna in this locus
1177     if (rm==NULL) GError("Error: couldn't find ref mrna for intron %d-%d! (BUG)\n",
1178     iseg.start,iseg.end);
1179     int ilen=iseg.end-iseg.start+1;
1180     fprintf(f,"%s\t%s\tintron\t%d\t%d\t.\t%c\t.\t",
1181     rm->getGSeqName(),rm->getTrackName(),iseg.start,iseg.end,strand);
1182     if (faseq!=NULL) {
1183     const char* gseq=faseq->subseq(iseg.start, ilen);
1184     char* cseq=Gstrdup(gseq, gseq+ilen-1);
1185     if (strand=='-') reverseComplement(cseq, ilen);
1186     fprintf(f,"spl=\"%c%c..%c%c\"; ", toupper(cseq[0]),toupper(cseq[1]),
1187     toupper(cseq[ilen-2]),toupper(cseq[ilen-1]));
1188     GFREE(cseq);
1189     }
1190     fprintf(f,"transcript_id \"%s\";", rm->getID());
1191     if (wrong) fprintf(f," noOvl=1;");
1192     fprintf(f,"\n");
1193    
1194     }
1195    
1196     void reportMIntrons(FILE* fm, FILE* fn, FILE* fq, char strand,
1197     GList<GSuperLocus>& cmpdata) {
1198     if (fm==NULL) return;
1199     for (int l=0;l<cmpdata.Count();l++) {
1200     GSuperLocus *sl=cmpdata[l];
1201     //cache the whole locus sequence if possible
1202     //write these introns and their splice sites into the file
1203     for (int i=0;i<sl->i_missed.Count();i++)
1204     writeIntron(fm, strand, NULL, sl->i_missed[i], sl->rmrnas);
1205     if (fn!=NULL) {
1206     for (int i=0;i<sl->i_notp.Count();i++)
1207     writeIntron(fn, strand, NULL, sl->i_notp[i], sl->rmrnas);
1208     }
1209     if (fq!=NULL) {
1210     for (int i=0;i<sl->i_qwrong.Count();i++) {
1211     writeIntron(fq, strand, NULL, sl->i_qwrong[i], sl->qmrnas, true);
1212     }
1213     for (int i=0;i<sl->i_qnotp.Count();i++) {
1214     writeIntron(fq, strand, NULL, sl->i_qnotp[i], sl->qmrnas);
1215     }
1216     }
1217     }
1218     }
1219    
1220    
1221     void processLoci(GSeqData& seqdata, GSeqData* refdata, int qfidx) {
1222     //GList<GSeqLoci>& glstloci, GList<GSeqCmpRegs>& cmpdata)
1223    
1224     if (refdata!=NULL) {
1225     //if (gtf_tracking_verbose) GMessage(" ..comparing to reference loci..\n") ;
1226     compareLoci2R(seqdata.loci_f, seqdata.gstats_f, refdata->loci_f, qfidx);
1227     compareLoci2R(seqdata.loci_r, seqdata.gstats_r, refdata->loci_r, qfidx);
1228     // -- report
1229    
1230     if (f_mintr!=NULL) {
1231     GMessage(" ..reporting missed ref introns..\n");
1232     //reportIntrons(f_mintr, f_nintr, f_qintr, faseq, '+', seqdata.gstats_f);
1233     //reportIntrons(f_mintr, f_nintr, f_qintr, faseq, '-', seqdata.gstats_r);
1234     reportMIntrons(f_mintr, NULL, NULL, '+', seqdata.gstats_f);
1235     reportMIntrons(f_mintr, NULL, NULL, '-', seqdata.gstats_r);
1236     }
1237     }
1238     }
1239    
1240     //adjust stats for a list of unoverlapped (completely missed) ref loci
1241     void collectRLocData(GSuperLocus& stats, GLocus& loc) {
1242     stats.total_rmrnas+=loc.mrnas.Count();
1243     stats.total_rexons+=loc.uexons.Count();
1244     stats.total_rintrons+=loc.introns.Count();
1245     stats.total_rmexons+=loc.mexons.Count();
1246     stats.total_richains+=loc.ichains;
1247     stats.m_exons+=loc.uexons.Count();
1248     stats.m_introns+=loc.introns.Count();
1249     stats.total_rloci++;
1250     for (int e=0;e<loc.mexons.Count();e++) {
1251     stats.rbases_all+=loc.mexons[e].end-loc.mexons[e].start+1;
1252     }
1253     }
1254    
1255     void collectRData(GSuperLocus& stats, GList<GLocus>& loci) {
1256     for (int l=0;l<loci.Count();l++)
1257     collectRLocData(stats,*loci[l]);
1258     }
1259    
1260     //adjust stats for a list of unoverlapped (completely "wrong" or novel) qry loci
1261     void collectQLocData(GSuperLocus& stats, GLocus& loc) {
1262     stats.total_qmrnas+=loc.mrnas.Count();
1263     stats.total_qexons+=loc.uexons.Count();
1264     stats.total_qmexons+=loc.mexons.Count();
1265     stats.total_qintrons+=loc.introns.Count();
1266     stats.total_qichains+=loc.ichains;
1267     stats.total_qloci++;
1268     if (loc.ichains>0 && loc.mrnas.Count()>1)
1269     stats.total_qloci_alt++;
1270     stats.w_exons+=loc.uexons.Count();
1271     stats.w_introns+=loc.introns.Count();
1272     for (int e=0;e<loc.mexons.Count();e++) {
1273     stats.qbases_all+=loc.mexons[e].end-loc.mexons[e].start+1;
1274     }
1275     }
1276    
1277     void collectQData(GSuperLocus& stats, GList<GLocus>& loci, GList<GLocus>& nloci) {
1278     for (int l=0;l<loci.Count();l++) {
1279     //this is called when no refdata is given, so all these loci are nloci
1280     nloci.Add(loci[l]);
1281     collectQLocData(stats,*loci[l]);
1282     }
1283     }
1284    
1285     void collectQNOvl(GSuperLocus& stats, GList<GLocus>& loci, GList<GLocus>& nloci) {
1286     for (int l=0;l<loci.Count();l++) {
1287     if (loci[l]->cmpovl.Count()==0) {//locus with no ref loci overlaps
1288     stats.w_loci++; //novel/wrong loci
1289     nloci.Add(loci[l]);
1290     collectQLocData(stats,*loci[l]);
1291     }
1292     }
1293     }
1294    
1295     void collectQU(GSuperLocus& stats, GList<GLocus>& nloci) {
1296     for (int l=0;l<nloci.Count();l++) {
1297     stats.w_loci++; //novel/wrong loci
1298     collectQLocData(stats, *nloci[l]);
1299     }
1300     }
1301    
1302     void printLocus(FILE* f, GLocus& loc, const char* gseqname) {
1303     fprintf(f, "## Locus %s:%d-%d\n",gseqname, loc.start, loc.end);
1304     for (int m=0;m<loc.mrnas.Count();m++) {
1305     loc.mrnas[m]->printGtf(f);
1306     }
1307     }
1308    
1309     void collectRNOvl(GSuperLocus& stats, GList<GLocus>& loci) { //, const char* gseqname) {
1310     for (int l=0;l<loci.Count();l++) {
1311     if (loci[l]->cmpovl.Count()==0) {
1312     stats.m_loci++; //missed ref loci
1313     //if (f_mloci!=NULL)
1314     // printLocus(f_mloci,*loci[l], gseqname);
1315     collectRLocData(stats,*loci[l]);
1316     }
1317     }
1318     }
1319    
1320    
1321     void collectCmpData(GSuperLocus& stats, GList<GSuperLocus>& cmpdata) { //, const char* gseqname) {
1322     for (int c=0;c<cmpdata.Count();c++) {
1323     stats.addStats(*cmpdata[c]);
1324     /*
1325     if (f_nloci!=NULL && cmpdata[c]->locusTP==0 && cmpdata[c]->rloci.Count()>0) {
1326     fprintf(f_nloci, "# Superlocus %s:%d-%d\n",gseqname, cmpdata[c]->start, cmpdata[c]->end);
1327     for (int l=0;l<cmpdata[c]->rloci.Count();l++) {
1328     printLocus(f_nloci,*cmpdata[c]->rloci[l], gseqname);
1329     }
1330     }
1331     */
1332     }
1333     }
1334    
1335     void collectStats(GSuperLocus& stats, GSeqData* seqdata, GSeqData* refdata) {
1336     //collect all stats for a single genomic sequence into stats
1337     if (seqdata==NULL) {
1338     if (reduceRefs || refdata==NULL) return;
1339     //special case with completely missed all refs on a contig/chromosome
1340     collectRData(stats, refdata->loci_f);
1341     collectRData(stats, refdata->loci_r);
1342     return;
1343     }
1344     if (refdata==NULL) {//reference data missing on this contig
1345     collectQData(stats, seqdata->loci_f, seqdata->nloci_f);
1346     collectQData(stats, seqdata->loci_r, seqdata->nloci_r);
1347     collectQU(stats, seqdata->nloci_u);
1348     return;
1349     }
1350    
1351     /*stats.total_qloci+=seqdata->loci_f.Count();
1352     stats.total_qloci+=seqdata->loci_r.Count();
1353     if (reduceRefs) { //only collect ref loci from superloci
1354    
1355     }
1356     else {
1357     stats.total_rloci+=refdata->loci_f.Count();
1358     stats.total_rloci+=refdata->loci_r.Count();
1359     }
1360     */
1361     //collect data for overlapping superloci (already in seqdata->gstats_f/_r)
1362     //char* gseqname=getGSeqName(seqdata->gseq_id);
1363     collectCmpData(stats, seqdata->gstats_f);
1364     collectCmpData(stats, seqdata->gstats_r);
1365     //for non-overlapping qry loci, always add them as false positives FP
1366     collectQNOvl(stats, seqdata->loci_f, seqdata->nloci_f);
1367     collectQNOvl(stats, seqdata->loci_r, seqdata->nloci_r);
1368     collectQU(stats, seqdata->nloci_u);
1369     if (!reduceRefs) { //find ref loci with empty cmpovl and add them
1370     collectRNOvl(stats, refdata->loci_f);
1371     collectRNOvl(stats, refdata->loci_r);
1372     }
1373     }
1374    
1375     void reportStats(FILE* fout, const char* setname, GSuperLocus& stotal,
1376     GSeqData* seqdata, GSeqData* refdata) {
1377     GSuperLocus stats;
1378     bool finalSummary=(seqdata==NULL && refdata==NULL);
1379     GSuperLocus *ps=(finalSummary ? &stotal : &stats );
1380     if (!finalSummary) { //collecting contig stats
1381     //gather statistics for all loci/superloci here
1382     collectStats(stats, seqdata, refdata);
1383     stotal.addStats(stats);
1384     if (!perContigStats) return;
1385     }
1386     ps->calcF();
1387     if (seqdata!=NULL) fprintf(fout, "#> Genomic sequence: %s \n", setname);
1388     else fprintf(fout, "\n#= Summary for dataset: %s :\n", setname);
1389    
1390     fprintf(fout, "# Query mRNAs : %7d in %7d loci (%d multi-exon transcripts)\n",
1391     ps->total_qmrnas, ps->total_qloci, ps->total_qichains);
1392     fprintf(fout, "# (%d multi-transcript loci, ~%.1f transcripts per locus)\n",
1393     ps->total_qloci_alt, ((double)ps->total_qmrnas/ps->total_qloci));
1394    
1395     if (haveRefs) {
1396     fprintf(fout, "# Reference mRNAs : %7d in %7d loci (%d multi-exon)\n",
1397     ps->total_rmrnas, ps->total_rloci, ps->total_richains);
1398     if (ps->baseTP+ps->baseFP==0 || ps->baseTP+ps->baseFN==0) return;
1399     fprintf(fout, "# Corresponding super-loci: %7d\n",ps->total_superloci);
1400    
1401     /*if (seqdata!=NULL) {
1402     fprintf(fout, " ( %d/%d on forward/reverse strand)\n",
1403     seqdata->gstats_f.Count(),seqdata->gstats_r.Count());
1404     }*/
1405     fprintf(fout, "#--------------------| Sn | Sp | fSn | fSp \n");
1406     double sp=(100.0*(double)ps->baseTP)/(ps->baseTP+ps->baseFP);
1407     double sn=(100.0*(double)ps->baseTP)/(ps->baseTP+ps->baseFN);
1408     fprintf(fout, " Base level: \t%5.1f\t%5.1f\t - \t - \n",sn, sp);
1409     sp=(100.0*(double)ps->exonTP)/(ps->exonTP+ps->exonFP);
1410     sn=(100.0*(double)ps->exonTP)/(ps->exonTP+ps->exonFN);
1411     double fsp=(100.0*(double)ps->exonATP)/(ps->exonATP+ps->exonAFP);
1412     double fsn=(100.0*(double)ps->exonATP)/(ps->exonATP+ps->exonAFN);
1413     if (fsp>100.0) fsp=100.0;
1414     if (fsn>100.0) fsn=100.0;
1415     fprintf(fout, " Exon level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1416     if (ps->total_rintrons>0) {
1417     //intron level
1418     sp=(100.0*(double)ps->intronTP)/(ps->intronTP+ps->intronFP);
1419     sn=(100.0*(double)ps->intronTP)/(ps->intronTP+ps->intronFN);
1420     fsp=(100.0*(double)ps->intronATP)/(ps->intronATP+ps->intronAFP);
1421     fsn=(100.0*(double)ps->intronATP)/(ps->intronATP+ps->intronAFN);
1422     if (fsp>100.0) fsp=100.0;
1423     if (fsn>100.0) fsn=100.0;
1424     fprintf(fout, " Intron level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1425     //intron chains:
1426     sp=(100.0*(double)ps->ichainTP)/(ps->ichainTP+ps->ichainFP);
1427     sn=(100.0*(double)ps->ichainTP)/(ps->ichainTP+ps->ichainFN);
1428     if (sp>100.0) sp=100.0;
1429     if (sn>100.0) sn=100.0;
1430     fsp=(100.0*(double)ps->ichainATP)/(ps->ichainATP+ps->ichainAFP);
1431     fsn=(100.0*(double)ps->ichainATP)/(ps->ichainATP+ps->ichainAFN);
1432     if (fsp>100.0) fsp=100.0;
1433     if (fsn>100.0) fsn=100.0;
1434     fprintf(fout, "Intron chain level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1435     }
1436     else {
1437     fprintf(fout, " Intron level: \t - \t - \t - \t - \n");
1438     fprintf(fout, "Intron chain level: \t - \t - \t - \t - \n");
1439     }
1440     sp=(100.0*(double)ps->mrnaTP)/(ps->mrnaTP+ps->mrnaFP);
1441     sn=(100.0*(double)ps->mrnaTP)/(ps->mrnaTP+ps->mrnaFN);
1442     fsp=(100.0*(double)ps->mrnaATP)/(ps->mrnaATP+ps->mrnaAFP);
1443     fsn=(100.0*(double)ps->mrnaATP)/(ps->mrnaATP+ps->mrnaAFN);
1444     if (fsp>100.0) fsp=100.0;
1445     if (fsn>100.0) fsn=100.0;
1446 gpertea 118 fprintf(fout, " Transcript level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1447     //sp=(100.0*(double)ps->locusTP)/(ps->locusTP+ps->locusFP);
1448     sp=(100.0*(double)ps->locusQTP)/ps->total_qloci;
1449     sn=(100.0*(double)ps->locusTP)/ps->total_rloci; //(ps->locusTP+ps->locusFN);
1450     fsp=(100.0*(double)ps->locusAQTP)/ps->total_qloci; //(ps->locusATP+ps->locusAFP);
1451     fsn=(100.0*(double)ps->locusATP)/ps->total_rloci; //(ps->locusATP+ps->locusAFN);
1452 gpertea 20 fprintf(fout, " Locus level: \t%5.1f\t%5.1f\t%5.1f\t%5.1f\n",sn, sp, fsn, fsp);
1453     sn=(100.0*(double)ps->m_exons)/(ps->total_rexons);
1454     fprintf(fout, " Missed exons:\t%d/%d (%5.1f%%)\n",ps->m_exons, ps->total_rexons, sn);
1455     sn=(100.0*(double)ps->w_exons)/(ps->total_qexons);
1456     fprintf(fout, " Wrong exons:\t%d/%d (%5.1f%%)\n",ps->w_exons, ps->total_qexons,sn);
1457     if (ps->total_rintrons>0) {
1458     sn=(100.0*(double)ps->m_introns)/(ps->total_rintrons);
1459     fprintf(fout, " Missed introns:\t%d/%d (%5.1f%%)\n",ps->m_introns, ps->total_rintrons, sn);
1460     }
1461     if (ps->total_qintrons>0) {
1462     sn=(100.0*(double)ps->w_introns)/(ps->total_qintrons);
1463     fprintf(fout, " Wrong introns:\t%d/%d (%5.1f%%)\n",ps->w_introns, ps->total_qintrons,sn);
1464     }
1465     if (ps->total_rloci>0) {
1466     sn=(100.0*(double)ps->m_loci)/(ps->total_rloci);
1467     fprintf(fout, " Missed loci:\t%d/%d (%5.1f%%)\n",ps->m_loci, ps->total_rloci, sn);
1468     }
1469     if (ps->total_qloci>0) {
1470     sn=(100.0*(double)ps->w_loci)/(ps->total_qloci);
1471     fprintf(fout, " Wrong loci:\t%d/%d (%5.1f%%)\n",ps->w_loci, ps->total_qloci,sn);
1472     }
1473    
1474     }
1475     }
1476    
1477     int inbuf_len=1024; //starting inbuf capacity
1478     char* inbuf=NULL; // incoming buffer for sequence lines.
1479    
1480     void loadRefDescr(const char* fname) {
1481     if (inbuf==NULL) { GMALLOC(inbuf, inbuf_len); }
1482     FILE *f=fopen(fname, "rb");
1483     if (f==NULL) GError("Error opening exon file: %s\n",fname);
1484     char* line;
1485     int llen=0;
1486     off_t fpos;
1487     while ((line=fgetline(inbuf, inbuf_len, f, &fpos, &llen))!=NULL) {
1488     if (strlen(line)<=2) continue;
1489     int idlen=strcspn(line,"\t ");
1490     char* p=line+idlen;
1491     if (idlen<llen && idlen>0) {
1492     *p=0;
1493     p++;
1494     refdescr.Add(line, new GStr(p));
1495     }
1496     }
1497     }
1498    
1499     GSeqTrack* findGSeqTrack(int gsid) {
1500     GSeqTrack f(gsid);
1501     int fidx=-1;
1502     if (gseqtracks.Found(&f,fidx))
1503     return gseqtracks[fidx];
1504     fidx=gseqtracks.Add(new GSeqTrack(gsid));
1505     return gseqtracks[fidx];
1506     }
1507    
1508    
1509    
1510     GffObj* findRefMatch(GffObj& m, GLocus& rloc, int& ovlen) {
1511     ovlen=0;
1512     CTData* mdata=((CTData*)m.uptr);
1513     if (mdata->eqref!=NULL && ((CTData*)(mdata->eqref->uptr))->locus==&rloc) {
1514     mdata->eqref=mdata->ovls.First()->mrna; //this should be unnecessary
1515     //check it?
1516     return mdata->ovls.First()->mrna;
1517     }
1518     //if (rloc==NULL|| m==NULL) return NULL;
1519     GffObj* ret=NULL;
1520     for (int r=0;r<rloc.mrnas.Count();r++) {
1521     int olen=0;
1522     if (tMatch(m, *(rloc.mrnas[r]),olen, true)) { //return rloc->mrnas[r];
1523     /*
1524     if (ovlen<olen) {
1525     ovlen=olen;
1526     ret=rloc.mrnas[r]; //keep the longest matching ref
1527     //but this is unnecessary, there can be only one matching ref
1528     // because duplicate refs were discarded (unless -G was used)
1529     }
1530     */
1531     mdata->addOvl('=',rloc.mrnas[r], olen);
1532     ret=mdata->ovls.First()->mrna;
1533     //this must be called only for the head of an equivalency chain
1534     CTData* rdata=(CTData*)rloc.mrnas[r]->uptr;
1535     rdata->addOvl('=',&m,olen);
1536     //if (rdata->eqnext==NULL) rdata->eqnext=&m;
1537     }
1538     }
1539     if (ret!=NULL)
1540     mdata->eqref=ret;
1541     return ret;
1542     }
1543    
1544    
1545     void addXCons(GXLocus* xloc, GffObj* ref, char ovlcode, GffObj* tcons, CEqList* ts) {
1546     GXConsensus* c=new GXConsensus(tcons, ts, ref, ovlcode);
1547     //xloc->tcons.Add(c);
1548     //this will also check c against the other tcons for containment:
1549     xloc->addXCons(c);
1550     }
1551    
1552    
1553     const uint pre_mrna_threshold = 100;
1554    
1555     char getOvlCode(GffObj& m, GffObj& r, int& ovlen) {
1556     ovlen=0;
1557     if (!m.overlap(r.start,r.end)) return 'u';
1558     int jmax=r.exons.Count()-1;
1559    
1560     if (m.exons.Count()==1) { //single-exon transfrag
1561     GSeg mseg(m.start, m.end);
1562     if (jmax==0) { //also single-exon ref
1563     ovlen=mseg.overlapLen(r.start,r.end);
1564     int lmax=GMAX(r.covlen, m.covlen);
1565     if (ovlen >= lmax*0.8) return '='; //fuzz matching for single-exon transcripts: 80% of the longer one
1566     //if (m.covlen<=ovlen+12 && m.covlen<r.covlen) return 'c'; //contained
1567     if (m.covlen<r.covlen && ovlen >= m.covlen*0.8) return 'c';
1568     return 'o'; //just plain overlapping
1569     }
1570     //single-exon qry overlaping multi-exon ref
1571     for (int j=0;j<=jmax;j++) {
1572     //check if it's contained by an exon
1573     if (m.start>r.exons[j]->start-8 && m.end<r.exons[j]->end+8)
1574     return 'c';
1575     if (j==jmax) break;
1576     //check if it's contained by an intron
1577     if (m.end<r.exons[j+1]->start && m.start>r.exons[j]->end)
1578     return 'i';
1579     // check if it's a potential pre-mRNA transcript
1580     // (if overlaps an intron at least 10 bases)
1581     uint iovlen=mseg.overlapLen(r.exons[j]->end+1, r.exons[j+1]->start-1);
1582     if (iovlen>=10 && mseg.len()>iovlen+10) return 'e';
1583     }
1584     return 'o'; //plain overlap, uncategorized
1585     } //single-exon transfrag
1586     //-- from here on we have a multi-exon transfrag --
1587     // * check if contained by a ref intron
1588     for (int j=0;j<jmax;j++) {
1589     if (m.end<r.exons[j+1]->start && m.start>r.exons[j]->end)
1590     return 'i';
1591     }
1592     //> check if m's intron chain is a subset of r's intron chain
1593     int imax=m.exons.Count()-1;// imax>0 here
1594     if (m.exons[imax]->start<r.exons[0]->end ||
1595     r.exons[jmax]->start<m.exons[0]->end ) //intron chains do not overlap at all
1596     return 'o'; //but terminal exons do, otherwise we wouldn't be here
1597     int i=1; //index of exon to the right of current qry intron
1598     int j=1; //index of exon to the right of current ref intron
1599     //find first intron overlap
1600     while (i<=imax && j<=jmax) {
1601     if (r.exons[j]->start<m.exons[i-1]->end) { j++; continue; }
1602     if (m.exons[i]->start<r.exons[j-1]->end) { i++; continue; }
1603     break; //here we have an intron overlap
1604     }
1605     if (i>imax || j>jmax)
1606     return 'o'; //no initial intron overlap found
1607     //from here on we check all qry introns against ref introns
1608     bool jmatch=false; //true if at least a junction match is found
1609     bool icmatch=(i==1); //intron chain match - it will be updated as introns are checked
1610     //bool exovli=false; // if any terminal exon of qry extends into a ref intron
1611     int jmstart=j; //index of first intron overlap of reference
1612     int jmend=0; //index of last intron overlap of reference
1613     int imend=0; //index of last intron overlap of query
1614     //check for intron matches
1615     while (i<=imax && j<=jmax) {
1616     uint mstart=m.exons[i-1]->end;
1617     uint mend=m.exons[i]->start;
1618     uint rstart=r.exons[j-1]->end;
1619     uint rend=r.exons[j]->start;
1620     if (rend<mstart) { j++; icmatch=false; continue; } //skipping ref intron, no ichain match
1621     if (mend<rstart) { i++; icmatch=false; continue; } //skipping qry intron, no ichain match
1622     //overlapping introns here, test junction matching
1623     jmend=j; //keep track of last overlapping intron
1624     imend=i;
1625     bool smatch=(mstart==rstart);
1626     bool ematch=(mend==rend);
1627     if (smatch || ematch) jmatch=true;
1628     if (smatch && ematch) { i++; j++; } //perfect match for this intron
1629     else { //at least one junction doesn't match
1630     icmatch=false;
1631     if (mend>rend) j++; else i++;
1632     }
1633     } //while checking intron overlaps
1634    
1635     if (icmatch && imend==imax) { // qry intron chain match
1636     if (jmstart==1 && jmend==jmax) return '='; //identical intron chains
1637     // -- qry intron chain is shorter than ref intron chain --
1638 gpertea 49 int l_iovh=0; // overhang of leftmost q exon left boundary beyond the end of ref intron to the left
1639     int r_iovh=0; // same type of overhang through the ref intron on the right
1640     if (jmstart>1 && r.exons[jmstart-1]->start>m.start)
1641     l_iovh = r.exons[jmstart-1]->start - m.start;
1642     if (jmend<jmax && m.end > r.exons[jmend]->end)
1643     r_iovh = m.end - r.exons[jmend]->end;
1644     if (l_iovh<4 && r_iovh<4) return 'c';
1645 gpertea 20 //TODO? check if any x_iovl>10 and return 'e' to signal an "unspliced intron" ?
1646     // or we can check if any of them are >= the length of the corresponding ref intron on that side
1647     return 'j';
1648     }
1649     /*
1650     if (icmatch && (jmax>=imax)) { //all qry introns match
1651     //but they may overlap
1652     // if ((lbound && lbound > m.exons[0]->start+10) ||
1653     // (j<=jmax && m.exons[i-1]->end > r.exons[j-1]->end+10)) return 'j';
1654     // return 'c';
1655     // }
1656     int code = 'c';
1657     if (lbound)
1658     {
1659     uint ref_boundary = lbound;
1660     uint cuff_boundary = m.exons[0]->start;
1661     if (ref_boundary > (cuff_boundary + pre_mrna_threshold)) // cuff extends a lot
1662     {
1663     code = 'j';
1664     }
1665     if (ref_boundary > cuff_boundary) // cuff extends just a bit into a ref intron
1666     {
1667     code = 'e';
1668     }
1669     }
1670     if (j <= jmax)
1671     {
1672     uint ref_boundary = r.exons[j-1]->end;
1673     uint cuff_boundary = m.exons[i-1]->end;
1674     if (cuff_boundary > (ref_boundary + pre_mrna_threshold)) // cuff extends a lot
1675     {
1676     code = 'j';
1677     }
1678     if (cuff_boundary > ref_boundary) // cuff extends just a bit into a ref intron
1679     {
1680     code = 'e';
1681     }
1682     }
1683     //if ((lbound && lbound > m.exons[0]->start+10) ||
1684     // (j<=jmax && m.exons[i-1]->end > r.exons[j-1]->end+10)) return 'j';
1685     return code;
1686     }
1687    
1688     // if (!ichain) // first and last exons more or less match, but there's a different intron somewhere
1689     // {
1690     //
1691     // }
1692    
1693     */
1694     return jmatch ? 'j':'o';
1695     }
1696    
1697     char getRefOvl(GffObj& m, GLocus& rloc, GffObj*& rovl, int& ovlen) {
1698     rovl=NULL;
1699     ovlen=0;
1700     if (m.start>rloc.end || m.end<rloc.start) {
1701     //see if it's a polymerase run ?
1702     /*
1703     if ((m.strand=='+' && m.end<=rloc.end+polyrun_range) ||
1704     (m.strand=='-' && m.start>=rloc.start-polyrun_range)) {
1705     rovl=rloc.mrna_maxcov;
1706     ((CTData*)m.uptr)->addOvl('p',rloc.mrna_maxcov);
1707     return 'p';
1708     }
1709     */
1710     return 0; //unknown -> intergenic space
1711     }
1712     for (int i=0;i<rloc.mrnas.Count();i++) {
1713     GffObj* r=rloc.mrnas[i];
1714     int olen=0;
1715     char ovlcode=getOvlCode(m,*r,olen);
1716     if (ovlcode!=0) { //has some sort of "overlap" with r
1717     ((CTData*)m.uptr)->addOvl(ovlcode,r,olen);
1718     if (olen>ovlen) ovlen=olen;
1719     if (ovlcode=='c' || ovlcode=='=') //keep match/containment for each reference transcript
1720     ((CTData*)r->uptr)->addOvl(ovlcode,&m,olen);
1721     }
1722     }//for each ref in rloc
1723     // i,j,o
1724     return ((CTData*)m.uptr)->getBestCode();
1725     }
1726    
1727     /*
1728     void findTMatches(GTrackLocus& loctrack, int qcount) {
1729     //perform an all vs. all ichain-match for all transcripts across all loctrack[i]->qloci
1730     for (int q=0;q<qcount-1;q++) { //for each qry dataset
1731     if (loctrack[q]==NULL) continue;
1732     for (int qi=0;qi<loctrack[q]->Count();qi++) { // for each transcript in q dataset
1733     GffObj* qi_t=loctrack[q]->Get(qi);
1734     CTData* qi_d=(CTData*)qi_t->uptr;
1735     if ((qi_d->eqdata & EQHEAD_TAG) !=0) { //this is set as an EQ chain head already
1736     //if (qi_t->exons.Count()>1)
1737     continue;
1738     }
1739     for (int n=q+1;n<qcount;n++) { // for every successor dataset
1740     if (loctrack[n]==NULL) continue;
1741     for (int ni=0;ni<loctrack[n]->Count();ni++) {
1742     GffObj* ni_t=loctrack[n]->Get(ni);
1743     CTData* ni_d=(CTData*)ni_t->uptr;
1744     //if (ni_d->eqdata!=0) continue; //already part of an EQ chain
1745     //single exon transfrags have special treatment:
1746     bool s_match=(ni_t->exons.Count()==1 && qi_t->exons.Count()==1);
1747     if (ni_d->eqdata!=0 && !s_match) continue; //already part of an EQ chain
1748     if (ni_d->eqnext!=NULL) continue;
1749     int ovlen=0;
1750    
1751     if (qi_d->eqnext!=NULL) {
1752     if (!s_match) continue;
1753     //test all in the EQ list for a match
1754     bool matchFound=false;
1755     CTData* next_eq_d=qi_d;
1756     if (tMatch(*qi_t, *ni_t, ovlen, true)) {
1757     matchFound=true;
1758     }
1759     else {
1760     while (next_eq_d->eqnext!=NULL) {
1761     if (tMatch(*(next_eq_d->eqnext), *ni_t, ovlen, true)) {
1762     matchFound=true;
1763     break;
1764     }
1765     next_eq_d=(CTData*)(next_eq_d->eqnext->uptr);
1766     } //last in the chain
1767     }
1768     if (matchFound) {
1769     //add this to the end of the EQ chain instead
1770     next_eq_d=(CTData*)(qi_d->eqnext->uptr);
1771     while (next_eq_d->eqnext!=NULL) {
1772     next_eq_d=(CTData*)(next_eq_d->eqnext->uptr);
1773     } //last in the chain
1774     next_eq_d->eqnext=ni_t;
1775     ni_d->eqdata=n+1;
1776     ni_d->eqnext=NULL; ///TEST
1777     }
1778     }
1779     else {
1780     if (tMatch(*qi_t,*ni_t, ovlen, true)) {
1781     qi_d->eqnext=ni_t;
1782     ni_d->eqnext=NULL; ///TEST
1783     if (qi_d->eqdata == 0) {//only start of chain is tagged
1784     qi_d->eqdata = ((q+1) | EQHEAD_TAG);
1785     }
1786     ni_d->eqdata=n+1; //EQ chain member only marked with qry# (1-based)
1787     }
1788     } //multi-exon case
1789     } //for each transfrag in the next qry dataset
1790    
1791     if (qi_d->eqnext!=NULL && qi_t->exons.Count()>1) break;
1792     //part of a chain already, skip other datasets
1793     } // for each successor dataset
1794     } //for each transcript in qry dataset
1795     } //for each qry dataset
1796     }
1797     */
1798    
1799     void findTMatches(GTrackLocus& loctrack, int qcount) {
1800     //perform an all vs. all ichain-match for all transcripts across all loctrack[i]->qloci
1801     for (int q=0;q<qcount-1;q++) { //for each qry dataset
1802     if (loctrack[q]==NULL) continue;
1803     for (int qi=0;qi<loctrack[q]->Count();qi++) { // for each transcript in q dataset
1804     GffObj* qi_t=loctrack[q]->Get(qi);
1805     CTData* qi_d=(CTData*)qi_t->uptr;
1806     if (qi_d->eqlist!=NULL && qi_t->exons.Count()>1) {
1807     continue; //this is part of an EQ chain already
1808     }
1809     for (int n=q+1;n<qcount;n++) { // for every successor dataset
1810     if (loctrack[n]==NULL) continue;
1811     for (int ni=0;ni<loctrack[n]->Count();ni++) {
1812     GffObj* ni_t=loctrack[n]->Get(ni);
1813     CTData* ni_d=(CTData*)ni_t->uptr;
1814     bool singleExon=(ni_t->exons.Count()==1 && qi_t->exons.Count()==1);
1815     if (ni_d->eqlist!=NULL &&
1816     (ni_d->eqlist==qi_d->eqlist || !singleExon)) continue;
1817     int ovlen=0;
1818     if ((ni_d->eqlist==qi_d->eqlist && qi_d->eqlist!=NULL) ||
1819     tMatch(*qi_t,*ni_t, ovlen, singleExon)) {
1820     qi_d->joinEqList(ni_t);
1821     }
1822     }
1823     } // for each successor dataset
1824     } //for each transcript in qry dataset
1825     } //for each qry dataset
1826     }
1827    
1828    
1829     int cmpTData_qset(const pointer* p1, const pointer* p2) {
1830     CTData* d1=(CTData*)(((GffObj*)p1)->uptr);
1831     CTData* d2=(CTData*)(((GffObj*)p2)->uptr);
1832     return (d1->qset - d2->qset);
1833     }
1834    
1835     void printITrack(FILE* ft, GList<GffObj>& mrnas, int qcount, int& cnum) {
1836     for (int i=0;i<mrnas.Count();i++) {
1837     GffObj& qt=*(mrnas[i]);
1838     CTData* qtdata=(CTData*)qt.uptr;
1839     int qfidx=qtdata->qset;
1840     char ovlcode=qtdata->classcode;
1841     //GList<GffObj> eqchain(false,false,false);
1842     CEqList* eqchain=qtdata->eqlist;
1843     GffObj* ref=NULL; //related ref -- it doesn't have to be fully matching
1844     GffObj* eqref=NULL; //fully ichain-matching ref
1845     GffObj* tcons=NULL; //"consensus" (largest) transcript for a clique
1846     int tmaxcov=0;
1847     //eqchain.Add(&qt);
1848     eqref=qtdata->eqref;
1849     if (qtdata->ovls.Count()>0 && qtdata->ovls[0]->mrna!=NULL) {
1850     //if it has ovlcode with a ref
1851     ref=qtdata->ovls[0]->mrna;
1852     //consistency check: qtdata->ovls[0]->code==ovlcode
1853     // -- let tcons be a transfrag, not a ref transcript
1854     //tcons=eqref;
1855     //if (tcons!=NULL) tmaxcov=tcons->covlen;
1856     }
1857     //chain pre-check
1858     if (tcons==NULL || mrnas[i]->covlen>tmaxcov) {
1859     tcons=mrnas[i];
1860     tmaxcov=tcons->covlen;
1861     }
1862     if (qtdata->isEqHead()) {//head of a equivalency chain
1863     //check if all transcripts in this chain have the same ovlcode
1864     for (int k=0;k<qtdata->eqlist->Count();k++) {
1865     GffObj* m=qtdata->eqlist->Get(k);
1866     if (m->covlen>tmaxcov) {
1867     tmaxcov=m->covlen;
1868     tcons=m;
1869     }
1870     if (ovlcode!='=' && ovlcode!='.' && ((CTData*)m->uptr)->getBestCode()!=ovlcode) {
1871     ovlcode='.'; //non-uniform ovlcode
1872     }
1873     }
1874     /*
1875     GffObj* m=mrnas[i];
1876     while (((CTData*)m->uptr)->eqnext!=NULL) {
1877     m=((CTData*)m->uptr)->eqnext;
1878     eqchain.Add(m);
1879     if (m->covlen>tmaxcov) {
1880     tmaxcov=m->covlen;
1881     tcons=m;
1882     }
1883     if (ovlcode!='=' && ovlcode!='.' && ((CTData*)m->uptr)->getBestCode()!=ovlcode) {
1884     ovlcode='.'; //non-uniform ovlcode
1885     //break;
1886     }
1887     } //while elements in chain
1888     */
1889    
1890     }//chain check
1891     //if (ovlcode=='p') ref=NULL; //ignore polymerase runs?
1892     if (ovlcode==0 || ovlcode=='-') ovlcode = (ref==NULL) ? 'u' : '.';
1893     //-- print columns 1 and 2 as LOCUS_ID and TCONS_ID
1894     //bool chainHead=(qtdata->eqnext!=NULL && ((qtdata->eqdata & EQHEAD_TAG)!=0));
1895     bool chainHead=qtdata->isEqHead();
1896     //bool noChain=((qtdata->eqdata & EQCHAIN_TAGMASK)==0);
1897     bool noChain=(eqchain==NULL);
1898     if (chainHead || noChain) {
1899     cnum++;
1900     if (ft!=NULL) fprintf(ft,"%s_%08d\t",cprefix,cnum);
1901     GXLocus* xloc=qtdata->locus->xlocus;
1902     if (xloc!=NULL) {
1903     if (ft!=NULL) fprintf(ft, "XLOC_%06d\t",xloc->id);
1904     if (tcons->exons.Count()>1) {
1905     //! only multi-exon mRNAs are counted for multi-transcript xloci !
1906     xloc->num_mtcons++;
1907     if (xloc->num_mtcons==2)
1908     total_xloci_alt++;
1909     }
1910     }
1911     else {
1912     //should NEVER happen!
1913     int fidx=qtdata->qset;
1914     GError("Error: no XLocus created for transcript %s (file %s) [%d, %d], on %s%c:%d-%d\n", qt.getID(),
1915     qryfiles[qtdata->locus->qfidx]->chars(), qtdata->locus->qfidx, fidx, qt.getGSeqName(), qt.strand, qt.start, qt.end);
1916     }
1917     addXCons(xloc, ref, ovlcode, tcons, eqchain);
1918     } // if chain head or uniq entry (not part of a chain)
1919     if (ft==NULL) continue;
1920     if (chainHead) {
1921     //this is the start of a equivalence class as a printing chain
1922     if (ref!=NULL) fprintf(ft,"%s|%s\t%c", getGeneID(ref),ref->getID(), ovlcode);
1923     else fprintf(ft,"-\t%c", ovlcode);
1924     GffObj* m=mrnas[i];
1925     CTData* mdata=(CTData*)m->uptr;
1926    
1927     int lastpq=-1;
1928     /*
1929     for (int ptab=mdata->qset-lastpq; ptab>0;ptab--)
1930     if (ptab>1) fprintf(ft,"\t-");
1931     else fprintf(ft,"\t");
1932     lastpq=mdata->qset;
1933     fprintf(ft,"q%d:%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|%d", lastpq+1, getGeneID(m), m->getID(),
1934     iround(m->gscore/10), mdata->FPKM, mdata->conf_lo, mdata->conf_hi, mdata->cov, m->covlen);
1935     //traverse linked list of matching transcripts
1936     while (mdata->eqnext!=NULL) {
1937     m=mdata->eqnext;
1938     mdata=(CTData*)m->uptr;
1939     for (int ptab=mdata->qset-lastpq;ptab>0;ptab--)
1940     if (ptab>1) fprintf(ft,"\t-");
1941     else fprintf(ft,"\t");
1942     lastpq = mdata->qset;
1943     fprintf(ft,"q%d:%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|%d", lastpq+1, getGeneID(m), m->getID(),
1944     iround(m->gscore/10), mdata->FPKM,mdata->conf_lo,mdata->conf_hi,mdata->cov, m->covlen);
1945     } //traverse and print row
1946     */
1947     eqchain->setUnique(false);
1948     eqchain->setSorted((GCompareProc*) cmpTData_qset);
1949    
1950     for (int k=0;k<eqchain->Count();k++) {
1951     m=eqchain->Get(k);
1952     mdata=(CTData*)m->uptr;
1953     if (mdata->qset==lastpq) {
1954     //shouldn't happen, unless this input set is messed up (has duplicates/redundant transfrags)
1955     fprintf(ft,",%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|%d", getGeneID(m), m->getID(),
1956     iround(m->gscore/10), mdata->FPKM,mdata->conf_lo,mdata->conf_hi,mdata->cov, m->covlen);
1957     continue;
1958     }
1959     for (int ptab=mdata->qset-lastpq;ptab>0;ptab--)
1960     if (ptab>1) fprintf(ft,"\t-");
1961     else fprintf(ft,"\t");
1962     lastpq = mdata->qset;
1963     fprintf(ft,"q%d:%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|%d", lastpq+1, getGeneID(m), m->getID(),
1964     iround(m->gscore/10), mdata->FPKM,mdata->conf_lo,mdata->conf_hi,mdata->cov, m->covlen);
1965     }
1966     for (int ptab=qcount-lastpq-1;ptab>0;ptab--)
1967     fprintf(ft,"\t-");
1968     fprintf(ft,"\n");
1969     continue;
1970     } //start of eq class (printing chain)
1971    
1972     if (eqchain!=NULL) continue; //part of a matching chain, dealt with previously
1973    
1974     //--------- not in an ichain-matching class, print as singleton
1975    
1976     if (ref!=NULL) fprintf(ft,"%s|%s\t%c",getGeneID(ref), ref->getID(), ovlcode);
1977     else fprintf(ft,"-\t%c",ovlcode);
1978     for (int ptab=qfidx;ptab>=0;ptab--)
1979     if (ptab>0) fprintf(ft,"\t-");
1980     else fprintf(ft,"\t");
1981     fprintf(ft,"q%d:%s|%s|%d|%8.6f|%8.6f|%8.6f|%8.6f|-",qfidx+1, getGeneID(qt), qt.getID(),iround(qt.gscore/10),
1982     qtdata->FPKM, qtdata->conf_lo,qtdata->conf_hi,qtdata->cov);
1983     for (int ptab=qcount-qfidx-1;ptab>0;ptab--)
1984     fprintf(ft,"\t-");
1985     fprintf(ft,"\n");
1986     } //for each transcript
1987     }
1988    
1989    
1990     void findTRMatch(GTrackLocus& loctrack, int qcount, GLocus& rloc) {
1991     //requires loctrack to be already populated with overlapping qloci by findTMatches()
1992     // which also found (and tagged) all matching qry transcripts
1993     for (int q=0;q<qcount;q++) { //for each qry dataset
1994     if (loctrack[q]==NULL) continue;
1995     for (int qi=0;qi<loctrack[q]->Count();qi++) { // for each transcript in q dataset
1996     //if (loctrack[q]->cl[qi]->exons.Count()<2) continue; //skip single-exon transcripts
1997     GffObj& qt=*(loctrack[q]->Get(qi));
1998     CTData* qtdata=(CTData*)qt.uptr;
1999     GffObj* rmatch=NULL; //== ref match for this row
2000     int rovlen=0;
2001     //if (qtdata->eqnext!=NULL && ((qtdata->eqdata & EQHEAD_TAG)!=0)) {
2002     if (qtdata->isEqHead()) {
2003     //EQ chain head -- transfrag equivalency list starts here
2004     if (qtdata->eqref==NULL) { //find rloc overlap
2005     if (qt.overlap(rloc.start, rloc.end)) {
2006     rmatch=findRefMatch(qt, rloc, rovlen);
2007     }
2008     } else rmatch=qtdata->eqref;
2009     if (rmatch!=NULL) {
2010     /*
2011     GffObj* m=loctrack[q]->Get(qi);
2012     //traverse linked list of matching transcripts
2013     while (((CTData*)m->uptr)->eqnext!=NULL) {
2014     m=((CTData*)m->uptr)->eqnext;
2015     if (rmatch!=NULL) {
2016     ((CTData*)m->uptr)->addOvl('=',rmatch,rovlen);
2017     }
2018     } //traverse qry data sets
2019     continue;
2020     }
2021     */
2022     for (int k=0;k<qtdata->eqlist->Count();k++) {
2023     GffObj* m=qtdata->eqlist->Get(k);
2024     ((CTData*)m->uptr)->addOvl('=',rmatch,rovlen);
2025     continue;
2026     }
2027     }
2028     //if (rmatch!=NULL) continue;
2029     } //equivalence class (chain of intron-matching)
2030     //if ((qtdata->eqdata & EQCHAIN_TAGMASK)!=0) continue; //part of a matching chain, dealt with previously
2031    
2032     //--------- qry mrna not in a '=' matching clique
2033     if (qtdata->eqref==NULL) { //find any rloc overlap
2034     if (qt.overlap(rloc.start, rloc.end)) {
2035     rmatch=findRefMatch(qt, rloc, rovlen);
2036     if (rmatch==NULL) {
2037     //not an ichain match, look for other codes
2038     GffObj* rovl=NULL;
2039     int rovlen=0;
2040     //char ovlcode=
2041     getRefOvl(qt, rloc,rovl,rovlen);
2042     }
2043     }
2044     }
2045     else rmatch=qtdata->eqref;
2046     } //for each qry transcript
2047     }//for each qry dataset
2048     }
2049    
2050    
2051     bool inPolyRun(char strand, GffObj& m, GList<GLocus>* rloci, int& rlocidx) {
2052     //we are only here if there is no actual overlap between m and any locus in rloci
2053     if (rloci==NULL || rloci->Count()==0) return false; // || m.exons.Count()>1
2054     if (strand=='-') {
2055     rlocidx=qsearch_loci(m.end, *rloci);
2056     //returns index of locus starting just ABOVE m.end
2057     // or -1 if last locus start <= m.end
2058     GLocus* rloc=NULL;
2059     if (rlocidx<0) return false;
2060     while (rlocidx<rloci->Count()) {
2061     rloc=rloci->Get(rlocidx);
2062     if (rloc->start>m.end+polyrun_range) break;
2063     if (rloc->start+6>m.end) return true;
2064     rlocidx++;
2065     }
2066     }
2067     else { // strand == '+' (or '.' ?)
2068     rlocidx=qsearch_loci(m.end, *rloci);
2069     GLocus* rloc=NULL;
2070     //returns index of closest locus starting ABOVE m.end
2071     // or -1 if last locus start <= m.end
2072     if (rlocidx<0) rlocidx=rloci->Count(); //this may actually start below m.end
2073     while ((--rlocidx)>=0) {
2074     rloc=rloci->Get(rlocidx);
2075     if (m.start>rloc->start+GFF_MAX_LOCUS) break;
2076     if (m.start+6>rloc->end && m.start<rloc->end+polyrun_range) return true;
2077     }
2078     }
2079     return false;
2080     }
2081    
2082     CTData* getBestOvl(GffObj& m) {
2083     //CTData* mdata=(CTData*)m.uptr;
2084     //return mdata->getBestCode();
2085     if ( ((CTData*)m.uptr)->ovls.Count()>0)
2086     return (CTData*)m.uptr;
2087     return NULL;
2088     }
2089    
2090     void reclass_XStrand(GList<GffObj>& mrnas, GList<GLocus>* rloci) {
2091     //checking for relationship with ref transcripts on opposite strand
2092     if (rloci==NULL || rloci->Count()<1) return;
2093     int j=0;//current rloci index
2094     for (int i=0;i<mrnas.Count();i++) {
2095     GffObj& m=*mrnas[i];
2096     char ovlcode=((CTData*)m.uptr)->getBestCode();
2097     if (ovlcode>47 && strchr("=cjeo",ovlcode)!=NULL) continue;
2098 gpertea 52 GLocus* rloc=rloci->Get(j);
2099     if (rloc->start>m.end) continue; //check next transfrag
2100     while (m.start>rloc->end && j+1<rloci->Count()) {
2101     j++;
2102     rloc=rloci->Get(j);
2103     }
2104     if (rloc->start>m.end) continue; //check next transfrag
2105     //m overlaps rloc:
2106     //check if m has a fuzzy intron overlap -> 's' (shadow, mapped on the wrong strand)
2107     // then if m is contained within an intron -> 'i'
2108     // otherwise it's just a plain cross-strand overlap: 'x'
2109     int jm=0;
2110     do { //while rloci overlap this transfrag (m)
2111     rloc=rloci->Get(j+jm);
2112     bool is_shadow=false;
2113     GffObj* sovl=NULL;
2114     bool is_intraintron=false;
2115     GffObj* iovl=NULL;
2116     if (rloc->introns.Count()>0) {
2117     for (int n=0;n<rloc->introns.Count();n++) {
2118     GISeg& rintron=rloc->introns[n];
2119     if (rintron.start>m.end) break;
2120     if (m.start>rintron.end) continue;
2121     //overlap between m and intron
2122     if (m.end<=rintron.end && m.start>=rintron.start) {
2123     is_intraintron=true;
2124     if (iovl==NULL || iovl->covlen<rintron.t->covlen) iovl=rintron.t;
2125     continue;
2126     }
2127     //check if any intron of m has a fuzz-match with rintron
2128     for (int e=1;e<m.exons.Count();e++) {
2129     GSeg mintron(m.exons[e-1]->end+1,m.exons[e]->start-1);
2130     if (rintron.coordMatch(&mintron,10)) {
2131     is_shadow=true;
2132     if (sovl==NULL || sovl->covlen<rintron.t->covlen) sovl=rintron.t;
2133     break;
2134     }
2135     } //for each m intron
2136     } //for each intron of rloc
2137     }//rloc has introns
2138     bool xcode=true;
2139     if (is_shadow) { ((CTData*)m.uptr)->addOvl('s', sovl); xcode=false; }
2140     // else
2141     if (ovlcode!='i' && is_intraintron) { ((CTData*)m.uptr)->addOvl('i', iovl); xcode=false; }
2142     if (xcode) {
2143     // just plain overlap, find the overlapping mrna in rloc
2144     GffObj* maxovl=NULL;
2145     int ovlen=0;
2146 gpertea 91 GffObj* max_lovl=NULL; //max len ref transcript
2147     // having no exon overlap but simply range overlap (interleaved exons)
2148 gpertea 52 for (int ri=0;ri<rloc->mrnas.Count();ri++) {
2149 gpertea 91 if (!m.overlap(*(rloc->mrnas[ri]))) continue;
2150 gpertea 52 int o=m.exonOverlapLen(*(rloc->mrnas[ri]));
2151 gpertea 91 if (o>0) {
2152     if (o>ovlen) {
2153     ovlen=o;
2154     maxovl=rloc->mrnas[ri];
2155     }
2156     }
2157     else { //no exon overlap, but still overlapping (interleaved exons)
2158     if (max_lovl==NULL || max_lovl->covlen<rloc->mrnas[ri]->covlen)
2159     max_lovl=rloc->mrnas[ri];
2160     }
2161 gpertea 52 }
2162 gpertea 91 if (maxovl) ((CTData*)m.uptr)->addOvl('x',maxovl);
2163     else if (max_lovl) ((CTData*)m.uptr)->addOvl('x',max_lovl);
2164 gpertea 52 } //'x'
2165     jm++;
2166     } while (j+jm<rloci->Count() && rloci->Get(j+jm)->overlap(m));
2167 gpertea 20 } //for each transfrag
2168     }
2169    
2170     void reclass_mRNAs(char strand, GList<GffObj>& mrnas, GList<GLocus>* rloci, GFaSeqGet *faseq) {
2171     int rlocidx=-1;
2172     for (int i=0;i<mrnas.Count();i++) {
2173     GffObj& m=*mrnas[i];
2174     char ovlcode=((CTData*)m.uptr)->getBestCode();
2175     //if (ovlcode=='u' || ovlcode=='i' || ovlcode==0) {
2176     if (ovlcode=='u' || ovlcode<47) {
2177     //check for overlaps with ref transcripts on the other strand
2178     if (m.exons.Count()==1 && inPolyRun(strand, m, rloci, rlocidx)) {
2179     ((CTData*)m.uptr)->addOvl('p',rloci->Get(rlocidx)->mrna_maxcov);
2180     }
2181     else { //check for repeat content
2182     if (faseq!=NULL) {
2183     int seqlen;
2184     char* seq=m.getSpliced(faseq, false, &seqlen);
2185     //get percentage of lowercase
2186     int numlc=0;
2187     for (int c=0;c<seqlen;c++) if (seq[c]>='a') numlc++;
2188     if (numlc > seqlen/2)
2189     ((CTData*)m.uptr)->addOvl('r');
2190     GFREE(seq);
2191     }
2192     }
2193     } //for unassigned class
2194     }//for each mrna
2195    
2196     }
2197    
2198     void reclassLoci(char strand, GList<GLocus>& qloci, GList<GLocus>* rloci, GFaSeqGet *faseq) {
2199     for (int ql=0;ql<qloci.Count();ql++) {
2200     reclass_mRNAs(strand, qloci[ql]->mrnas, rloci, faseq);
2201     //find closest upstream ref locus for this q locus
2202     } //for each locus
2203     }
2204    
2205     //for a single genomic sequence, all qry data and ref data is stored in gtrack
2206     //check for all 'u' transfrags if they are repeat ('r') or polymerase run 'p' or anything else
2207     void umrnaReclass(int qcount, GSeqTrack& gtrack, FILE** ftr, GFaSeqGet* faseq=NULL) {
2208     for (int q=0;q<qcount;q++) {
2209     if (gtrack.qdata[q]==NULL) continue; //no transcripts in this q dataset for this genomic seq
2210     reclassLoci('+', gtrack.qdata[q]->loci_f, gtrack.rloci_f, faseq);
2211     reclassLoci('-', gtrack.qdata[q]->loci_r, gtrack.rloci_r, faseq);
2212     reclass_mRNAs('+', gtrack.qdata[q]->umrnas, gtrack.rloci_f, faseq);
2213     reclass_mRNAs('-', gtrack.qdata[q]->umrnas, gtrack.rloci_r, faseq);
2214     //and also check for special cases with cross-strand overlaps:
2215     reclass_XStrand(gtrack.qdata[q]->mrnas_f, gtrack.rloci_r);
2216     reclass_XStrand(gtrack.qdata[q]->mrnas_r, gtrack.rloci_f);
2217     // print all tmap data here here:
2218     for (int i=0;i<gtrack.qdata[q]->tdata.Count();i++) {
2219     CTData* mdata=gtrack.qdata[q]->tdata[i];
2220     if (mdata->mrna==NULL) continue; //invalidated -- removed earlier
2221     //GLocus* rlocus=NULL;
2222     mdata->classcode='u';
2223     GffObj* ref=NULL;
2224     if (mdata->ovls.Count()>0) {
2225     mdata->classcode=mdata->ovls[0]->code;
2226     ref=mdata->ovls[0]->mrna;
2227     }
2228     //if (mdata->classcode<33) mdata->classcode='u';
2229     if (mdata->classcode<47) mdata->classcode='u'; // if 0, '-' or '.'
2230     if (tmapFiles) {
2231     char ref_match_len[2048];
2232     if (ref!=NULL) {
2233     sprintf(ref_match_len, "%d",ref->covlen);
2234     fprintf(ftr[q],"%s\t%s\t",getGeneID(ref),ref->getID());
2235     //rlocus=((CTData*)(ref->uptr))->locus;
2236     }
2237     else {
2238     fprintf(ftr[q],"-\t-\t");
2239     strcpy(ref_match_len, "-");
2240     }
2241     //fprintf(ftr[q],"%c\t%s\t%d\t%8.6f\t%8.6f\t%d\n", ovlcode, mdata->mrna->getID(),
2242     // iround(mdata->mrna->gscore/10), mdata->FPKM, mdata->cov, mdata->mrna->covlen);
2243     const char* mlocname = (mdata->locus!=NULL) ? mdata->locus->mrna_maxcov->getID() : mdata->mrna->getID();
2244     fprintf(ftr[q],"%c\t%s\t%s\t%d\t%8.6f\t%8.6f\t%8.6f\t%8.6f\t%d\t%s\t%s\n", mdata->classcode, getGeneID(mdata->mrna), mdata->mrna->getID(),
2245     iround(mdata->mrna->gscore/10), mdata->FPKM, mdata->conf_lo,mdata->conf_hi, mdata->cov, mdata->mrna->covlen, mlocname, ref_match_len);
2246     }
2247     } //for each tdata
2248     } //for each qdata
2249     }
2250    
2251     void buildXLoci(GTrackLocus& loctrack, int qcount, GSeqTrack& gtrack, char strand,
2252     GList<GXLocus>* retxloci=NULL) {
2253     GList<GXLocus>* dest_xloci=NULL;
2254     GList<GXLocus> tmpxloci(true,false,true); //local set of newly created xloci
2255     GList<GXLocus>* xloci=&tmpxloci;
2256     if (strand=='+') {
2257     dest_xloci=& gtrack.xloci_f;
2258     }
2259     else if (strand=='-') {
2260     dest_xloci = & gtrack.xloci_r;
2261     }
2262     else dest_xloci= & gtrack.xloci_u;
2263    
2264     if (retxloci==NULL) {
2265     //if no return set of build xloci was given
2266     //take it as a directive to work directly on the global xloci
2267     xloci=dest_xloci;
2268     dest_xloci=NULL;
2269     }
2270     for (int q=-1;q<qcount;q++) {
2271     GList<GLocus>* wrkloci=NULL;
2272     if (q<0) {
2273     if (loctrack.rloci.Count()==0) continue;
2274     //loci=new GList<GLocus>(true,false,false);
2275     //loci->Add(loctrack.rloc);
2276     wrkloci = &(loctrack.rloci);
2277     }
2278     else {
2279     if (loctrack[q]==NULL) continue;
2280     wrkloci = &(loctrack[q]->qloci);
2281     }
2282    
2283     for (int t=0;t<wrkloci->Count();t++) {
2284     GLocus* loc=wrkloci->Get(t);
2285     int xfound=0; //count of parent xloci
2286     if (loc->xlocus!=NULL) continue; //already assigned a superlocus
2287     GArray<int> mrgxloci(true);
2288     for (int xl=0;xl<xloci->Count();xl++) {
2289     GXLocus& xloc=*(xloci->Get(xl));
2290     if (xloc.start>loc->end) {
2291     if (xloc.start-loc->end > GFF_MAX_LOCUS) break;
2292     continue;
2293     }
2294     if (loc->start>xloc.end) continue;
2295     if (xloc.add_Locus(loc)) {
2296     xfound++;
2297     mrgxloci.Add(xl);
2298     }
2299     } //for each existing Xlocus
2300     if (xfound==0) {
2301     xloci->Add(new GXLocus(loc));
2302     }
2303 gpertea 64 else {
2304     int il=mrgxloci[0];
2305     GXLocus& xloc=*(xloci->Get(il));
2306     if (xfound>1) {
2307     for (int l=1;l<xfound;l++) {
2308     int mlidx=mrgxloci[l]-l+1;
2309     xloc.addMerge(*(xloci->Get(mlidx)));
2310     GXLocus* ldel=xloci->Get(mlidx);
2311     xloci->Delete(mlidx);
2312     if (retxloci!=NULL)
2313     delete ldel;
2314     }
2315     }
2316     //in case xloc.start was decreased, bubble-down until it's in the proper order
2317     while (il>0 && xloc<*(xloci->Get(il-1))) {
2318     il--;
2319     xloci->Swap(il,il+1);
2320     }
2321     } //at least one locus is being merged
2322 gpertea 20 }//for each locus
2323     }//for each set of loci in the region (refs and each qry set)
2324     //-- add xloci to the global set of xloci unless retxloci was given,
2325     if (retxloci!=NULL) retxloci->Add(*xloci);
2326     else dest_xloci->Add(*xloci);
2327     }
2328    
2329     void singleQData(GList<GLocus>& qloci, GList<GTrackLocus>& loctracks) {
2330     for (int i=0;i<qloci.Count();i++) {
2331     if (qloci[i]->t_ptr==NULL) {
2332     GTrackLocus* tloc=new GTrackLocus();
2333     tloc->addQLocus(qloci[i],0);
2334     loctracks.Add(tloc);
2335     }
2336     }
2337     }
2338    
2339     void recheckUmrnas(GSeqData* gseqdata, GList<GffObj>& mrnas,
2340     GList<GLocus>& loci, GList<GLocus>& nloci, GList<GLocus>& oloci) {
2341     GList<GLocus> reassignedLocs(false,false);
2342     for (int u=0;u<gseqdata->umrnas.Count();u++) {
2343     for (int l=0;l<oloci.Count();l++) {
2344     if (gseqdata->umrnas[u]==NULL) break;
2345     if (gseqdata->umrnas[u]->end<oloci[l]->start) break; //try next umrna
2346     if (oloci[l]->end<gseqdata->umrnas[u]->start) continue; //try next locus
2347     if (gseqdata->umrnas[u]->strand=='+' || gseqdata->umrnas[u]->strand=='-') {
2348     gseqdata->umrnas.Forget(u);
2349     continue; //already reassigned earlier
2350     }
2351     //umrna overlaps locus region
2352     GffObj* umrna=gseqdata->umrnas[u];
2353     for (int m=0;m<oloci[l]->mrnas.Count();m++) {
2354     if (oloci[l]->mrnas[m]->exonOverlap(umrna)) {
2355     gseqdata->umrnas.Forget(u);
2356     CTData* umdata=((CTData*)umrna->uptr);
2357     //must be in a Loci anyway
2358     if (umdata==NULL || umdata->locus==NULL)
2359     GError("Error: no locus pointer for umrna %s!\n",umrna->getID());
2360     for (int i=0;i<umdata->locus->mrnas.Count();i++) {
2361     GffObj* um=umdata->locus->mrnas[i];
2362     um->strand=oloci[l]->mrnas[m]->strand;
2363     }
2364     reassignedLocs.Add(umdata->locus);
2365     break;
2366     }
2367     } //for each mrna in locus
2368     } //for each locus
2369     } //for each umrna
2370     if (reassignedLocs.Count()>0) {
2371     gseqdata->umrnas.Pack();
2372     gseqdata->nloci_u.setFreeItem(false);
2373     for (int i=0;i<reassignedLocs.Count();i++) {
2374     GLocus* loc=reassignedLocs[i];
2375     for (int m=0;m<loc->mrnas.Count();m++) {
2376     mrnas.Add(loc->mrnas[m]);
2377     }
2378     loci.Add(loc);
2379     nloci.Add(loc);
2380     gseqdata->nloci_u.Remove(loc);
2381     }
2382     gseqdata->nloci_u.setFreeItem(true);
2383     }
2384     }
2385    
2386     void umrnasXStrand(GList<GXLocus>& xloci, GSeqTrack& gtrack) {
2387     //try to determine the strand of unoriented transfrags based on possible overlaps
2388     //with other, oriented transfrags
2389     for (int x=0;x<xloci.Count();x++) {
2390     if (xloci[x]->strand=='.') continue;
2391     if (xloci[x]->qloci.Count()==0) continue;
2392     //go through all qloci in this xlocus
2393     for (int l = 0; l < xloci[x]->qloci.Count(); l++) {
2394     char locstrand=xloci[x]->qloci[l]->mrna_maxcov->strand;
2395     if (locstrand=='.') {
2396     //this is a umrna cluster
2397     GLocus* qloc=xloci[x]->qloci[l];
2398     //we don't really need to update loci lists (loci_f, nloci_f etc.)
2399     /*
2400     if (xloci[x]->strand=='+') {
2401     }
2402     else { // - strand
2403     }
2404     */
2405     for (int i=0;i<qloc->mrnas.Count();i++) {
2406     qloc->mrnas[i]->strand=xloci[x]->strand;
2407     int uidx=gtrack.qdata[qloc->qfidx]->umrnas.IndexOf(qloc->mrnas[i]);
2408     if (uidx>=0) {
2409     gtrack.qdata[qloc->qfidx]->umrnas.Forget(uidx);
2410     gtrack.qdata[qloc->qfidx]->umrnas.Delete(uidx);
2411     if (xloci[x]->strand=='+')
2412     gtrack.qdata[qloc->qfidx]->mrnas_f.Add(qloc->mrnas[i]);
2413     else
2414     gtrack.qdata[qloc->qfidx]->mrnas_r.Add(qloc->mrnas[i]);
2415     }
2416     }
2417     } //unknown strand
2418     } //for each xloci[x].qloci (l)
2419    
2420     } //for each xloci (x)
2421     }
2422    
2423     //cluster loci across all datasets
2424     void xclusterLoci(int qcount, char strand, GSeqTrack& gtrack) {
2425     //gtrack holds data for all input qry datasets for a chromosome/contig
2426     //cluster QLoci
2427     GList<GTrackLocus> loctracks(true,true,false);
2428     //all vs all clustering across all qry data sets + ref
2429     //one-strand set of loci from all datasets + ref loci
2430     GList<GLocus>* wrkloci=NULL;
2431     //build xloci without references first
2432     //then add references only if they overlap an existing xloci
2433    
2434     int nq=0;
2435     for (int q=0;q<=qcount+1;q++) {
2436     bool refcheck=false;
2437     if (q==qcount) { // check the unoriented loci for each query file
2438     while (nq<qcount &&
2439     (gtrack.qdata[nq]==NULL || gtrack.qdata[nq]->nloci_u.Count()==0))
2440     nq++; //skip query files with no unoriented loci
2441     if (nq<qcount) {
2442     wrkloci=&(gtrack.qdata[nq]->nloci_u);
2443     nq++;
2444     if (nq<qcount) q--; //so we can fetch the next nq in the next q cycle
2445     }
2446     else continue; //no more q files with unoriented loci
2447     }
2448     else if (q==qcount+1) { // check the reference loci
2449     if (strand=='+') wrkloci=gtrack.rloci_f;
2450     else wrkloci=gtrack.rloci_r;
2451    
2452     if (wrkloci==NULL) break; //no ref loci here
2453     refcheck=true;
2454     }
2455     else {
2456     if (gtrack.qdata[q]==NULL) continue;
2457     if (strand=='+') wrkloci=&(gtrack.qdata[q]->loci_f);
2458     else wrkloci=&(gtrack.qdata[q]->loci_r);
2459     }
2460     // now do the all-vs-all clustering thing:
2461     for (int t=0;t<wrkloci->Count();t++) {
2462     GLocus* loc=wrkloci->Get(t);
2463     int xfound=0; //count of parent loctracks
2464     if (loc->t_ptr!=NULL) continue; //already assigned a loctrack
2465     GArray<int> mrgloctracks(true);
2466     for (int xl=0;xl<loctracks.Count();xl++) {
2467     GTrackLocus& trackloc=*loctracks[xl];
2468     if (trackloc.start>loc->end) break;
2469     if (loc->start>trackloc.end) continue;
2470     if (trackloc.add_Locus(loc)) {
2471     xfound++;
2472     mrgloctracks.Add(xl);
2473     }
2474     } //for each existing Xlocus
2475     if (xfound==0) {
2476     if (!refcheck) //we really don't care about ref-only clusters
2477     loctracks.Add(new GTrackLocus(loc));
2478     }
2479 gpertea 64 else {
2480     int il=mrgloctracks[0];
2481     GTrackLocus& tloc=*(loctracks.Get(il));
2482     if (xfound>1) {
2483     for (int l=1;l<xfound;l++) {
2484     int mlidx=mrgloctracks[l]-l+1;
2485     tloc.addMerge(loctracks[mlidx], qcount, loc);
2486     loctracks.Delete(mlidx);
2487     }
2488 gpertea 20 }
2489 gpertea 64 //in case tloc.start was decreased, bubble-down 'til it's in the proper place
2490     while (il>0 && tloc<*(loctracks[il-1])) {
2491     il--;
2492     loctracks.Swap(il,il+1);
2493     }
2494     } //at least one locus found
2495 gpertea 20 }//for each wrklocus
2496     } //for each set of loci (q)
2497     //loctracks is now set with all x-clusters on this strand
2498     for (int i=0;i<loctracks.Count();i++) {
2499     if (!loctracks[i]->hasQloci) continue; //we really don't care here about reference-only clusters
2500     GTrackLocus& loctrack=*loctracks[i];
2501     findTMatches(loctrack, qcount); //find matching transfrags in this xcluster
2502     for (int rl=0; rl < loctrack.rloci.Count(); rl++) {
2503     findTRMatch(loctrack, qcount, *(loctrack.rloci[rl]));
2504     //find matching reference annotation for this xcluster and assign class codes to transfrags
2505     }
2506     GList<GXLocus> xloci(false,false,false);
2507     buildXLoci(loctrack, qcount, gtrack, strand, &xloci);
2508     //the newly created xloci are in xloci
2509     umrnasXStrand(xloci, gtrack);
2510     //also merge these xloci into the global list of xloci
2511     for (int l=0; l < xloci.Count(); l++) {
2512     if (xloci[l]->strand=='+') {
2513     gtrack.xloci_f.Add(xloci[l]);
2514     }
2515     else if (xloci[l]->strand=='-') {
2516     gtrack.xloci_r.Add(xloci[l]);
2517     }
2518     else gtrack.xloci_u.Add(xloci[l]);
2519     }
2520     }//for each xcluster
2521     }
2522    
2523    
2524     void printRefMap(FILE** frs, int qcount, GList<GLocus>* rloci) {
2525     if (rloci==NULL) return;
2526    
2527     for (int l=0;l<rloci->Count(); l++) {
2528     for (int r=0;r<rloci->Get(l)->mrnas.Count(); r++) {
2529     GffObj& ref = *(rloci->Get(l)->mrnas[r]);
2530     CTData* refdata = ((CTData*)ref.uptr);
2531     GStr* clist = new GStr[qcount];
2532     GStr* eqlist = new GStr[qcount];
2533     for (int i = 0; i<refdata->ovls.Count(); i++) {
2534     GffObj* m=refdata->ovls[i]->mrna;
2535     char ovlcode=refdata->ovls[i]->code;
2536     if (m==NULL) {
2537     GMessage("Warning: NULL mRNA found for ref %s with ovlcode '%c'\n",
2538     ref.getID(), refdata->ovls[i]->code);
2539     continue;
2540     }
2541     int qfidx = ((CTData*)m->uptr)->qset;
2542     if (ovlcode == '=') {
2543     eqlist[qfidx].append(getGeneID(m));
2544     eqlist[qfidx].append('|');
2545     eqlist[qfidx].append(m->getID());
2546     eqlist[qfidx].append(',');
2547     }
2548     else if (ovlcode == 'c') {
2549     clist[qfidx].append(getGeneID(m));
2550     clist[qfidx].append('|');
2551     clist[qfidx].append(m->getID());
2552     clist[qfidx].append(',');
2553     }
2554     }//for each reference overlap
2555     for (int q=0;q<qcount;q++) {
2556     if (!eqlist[q].is_empty()) {
2557     eqlist[q].trimR(',');
2558     fprintf(frs[q],"%s\t%s\t=\t%s\n", getGeneID(ref), ref.getID(),eqlist[q].chars());
2559     }
2560     if (!clist[q].is_empty()) {
2561     clist[q].trimR(',');
2562     fprintf(frs[q],"%s\t%s\tc\t%s\n",getGeneID(ref), ref.getID(),clist[q].chars());
2563     }
2564     }
2565     delete[] clist;
2566     delete[] eqlist;
2567     }// ref loop
2568     }//ref locus loop
2569     }
2570    
2571    
2572    
2573     void trackGData(int qcount, GList<GSeqTrack>& gtracks, GStr& fbasename, FILE** ftr, FILE** frs) {
2574     FILE* f_ltrack=NULL;
2575     FILE* f_itrack=NULL;
2576     FILE* f_ctrack=NULL;
2577     FILE* f_xloci=NULL;
2578     int cnum=0; //consensus numbering for printITrack()
2579     GStr s=fbasename;
2580     //if (qcount>1 || generic_GFF) { //doesn't make much sense for only 1 query file
2581     s.append(".tracking");
2582     f_itrack=fopen(s.chars(),"w");
2583     if (f_itrack==NULL) GError("Error creating file %s !\n",s.chars());
2584     // }
2585     s=fbasename;
2586     s.append(".combined.gtf");
2587     f_ctrack=fopen(s.chars(),"w");
2588     if (f_ctrack==NULL) GError("Error creating file %s !\n",s.chars());
2589    
2590     s=fbasename;
2591     s.append(".loci");
2592     f_xloci=fopen(s.chars(),"w");
2593     if (f_xloci==NULL) GError("Error creating file %s !\n",s.chars());
2594     for (int g=0;g<gtracks.Count();g++) { //for each genomic sequence
2595     GSeqTrack& gseqtrack=*gtracks[g];
2596    
2597     xclusterLoci(qcount, '+', gseqtrack);
2598     xclusterLoci(qcount, '-', gseqtrack);
2599    
2600     //count XLoci, setting their id
2601     numXLoci(gseqtrack.xloci_f, xlocnum);
2602     numXLoci(gseqtrack.xloci_r, xlocnum);
2603     numXLoci(gseqtrack.xloci_u, xlocnum);
2604     //transcript accounting: for all those transcripts with 'u' or 0 class code
2605     // we have to check for polymerase runs 'p' or repeats 'r'
2606    
2607     GFaSeqGet *faseq=gfasta.fetch(gseqtrack.get_gseqid(), checkFasta);
2608    
2609     umrnaReclass(qcount, gseqtrack, ftr, faseq);
2610    
2611     // print transcript tracking (ichain_tracking)
2612     //if (qcount>1)
2613     for (int q=0;q<qcount;q++) {
2614     if (gseqtrack.qdata[q]==NULL) continue;
2615     printITrack(f_itrack, gseqtrack.qdata[q]->mrnas_f, qcount, cnum);
2616     printITrack(f_itrack, gseqtrack.qdata[q]->mrnas_r, qcount, cnum);
2617     //just for the sake of completion:
2618     printITrack(f_itrack, gseqtrack.qdata[q]->umrnas, qcount, cnum);
2619     }
2620     //print XLoci and XConsensi within each xlocus
2621     //also TSS clustering and protein ID assignment for XConsensi
2622     printXLoci(f_xloci, f_ctrack, qcount, gseqtrack.xloci_f, faseq);
2623     printXLoci(f_xloci, f_ctrack, qcount, gseqtrack.xloci_r, faseq);
2624     printXLoci(f_xloci, f_ctrack, qcount, gseqtrack.xloci_u, faseq);
2625     if (tmapFiles && haveRefs) {
2626     printRefMap(frs, qcount, gseqtrack.rloci_f);
2627     printRefMap(frs, qcount, gseqtrack.rloci_r);
2628     }
2629     delete faseq;
2630     }
2631     if (tmapFiles) {
2632     for (int q=0;q<qcount;q++) {
2633     fclose(ftr[q]);
2634     if (haveRefs) fclose(frs[q]);
2635     }
2636     }
2637     if (f_ltrack!=NULL) fclose(f_ltrack);
2638     if (f_itrack!=NULL) fclose(f_itrack);
2639     if (f_ctrack!=NULL) fclose(f_ctrack);
2640     if (f_xloci!=NULL) fclose(f_xloci);
2641    
2642     }

Properties

Name Value
svn:executable *