ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/map_report/map_report.cpp
Revision: 162
Committed: Tue Feb 7 19:27:55 2012 UTC (7 years, 7 months ago) by gpertea
File size: 34144 byte(s)
Log Message:
initial map_report commit

Line File contents
1 #include "gff_utils.h"
2 #include "GArgs.h"
3 #include <ctype.h>
4
5 #define USAGE "Usage:\n\
6 map_report [options] <transcripts.gff> <sample1_hits.sam> [<sample2_hits.sam> ...]\n\
7 \n\
8 Report various counts for read mappings on a given set of transcripts.\n\
9 \n\
10 Options:\n\
11 -g provide a full path to a multi-fasta file with the genomic sequences\n\
12 for all input mappings, OR a directory with single-fasta files\n\
13 (one per genomic sequence, with file names matching sequence names)\n\
14 -r only report counts for transcripts overlapping coordinate\n\
15 range <start>..<end> (on contig <chr>, strand <strand> if provided)\n\
16 -R for -r option, discard transcripts that are not fully contained\n\
17 within the given range\n\
18 -U discard single-exon transcripts\n\
19 -C coding only: discard mRNAs that have no CDS feature\n\
20 \n\
21 -O process also non-transcript GFF records (by default non-transcript\n\
22 records are ignored)\n\
23 \n\
24 -M/--merge : cluster the input transcripts into loci, collapsing matching\n\
25 transcripts (those with the same exact introns and fully contained)\n\
26 -d <dupinfo> : for -M option, write collapsing info to file <dupinfo>\n\
27 --cluster-only: same as --merge but without collapsing matching transcripts\n\
28 -K for -M option: also collapse shorter, fully contained transcripts\n\
29 with fewer introns than the container\n\
30 -Q for -M option, remove the containment restriction:\n\
31 (multi-exon transcripts will be collapsed if just their introns match,\n\
32 while single-exon transcripts can partially overlap (80%))\n\
33 \n\
34 -E expose (warn about) duplicate transcript IDs and other potential \n\
35 problems with the given GFF/GTF records\n\
36 -Z merge close exons into a single exon (for intron size<4)\n\
37 -w write a fasta file with spliced exons for each GFF transcript\n\
38 -x write a fasta file with spliced CDS for each GFF transcript\n\
39 -W for -w and -x options, also write for each fasta record the exon\n\
40 coordinates projected onto the spliced sequence\n\
41 -y write a protein fasta file with the translation of CDS for each record\n\
42 -L Ensembl GTF to GFF3 conversion (implies -F; should be used with -m)\n\
43 -m <chr_replace> is a reference (genomic) sequence replacement table with\n\
44 this format:\n\
45 <original_ref_ID> <new_ref_ID>\n\
46 GFF records on reference sequences that are not found among the\n\
47 <original_ref_ID> entries in this file will be filtered out\n\
48 -o the \"filtered\" GFF records will be written to <outfile.gff>\n\
49 (use -o- for printing to stdout)\n\
50 -t use <trackname> in the second column of each GFF output line\n\
51 -T -o option will output GTF format instead of GFF3\n\
52 "
53
54
55 class SeqInfo { //populated from the -s option of gffread
56 public:
57 int len;
58 char* descr;
59 SeqInfo( int l, char* s) {
60 len=l;
61 if (s==NULL) {
62 descr=NULL;
63 } else {
64 descr=Gstrdup(s);
65 }
66 }
67 ~SeqInfo() {
68 GFREE(descr);
69 }
70 };
71
72 class RefTran {
73 public:
74 char* new_name;
75 RefTran(char *ns) {
76 new_name=NULL;
77 if (ns!=NULL)
78 new_name=Gstrdup(ns);
79 }
80 ~RefTran() {
81 GFREE(new_name);
82 }
83 };
84
85 FILE* ffasta=NULL;
86 FILE* f_in=NULL;
87 FILE* f_out=NULL;
88 FILE* f_w=NULL; //fasta with spliced exons (transcripts)
89 FILE* f_x=NULL; //fasta with spliced CDS
90 FILE* f_y=NULL; //fasta with translated CDS
91 bool wCDSonly=false;
92
93 bool validCDSonly=false; // translation with no in-frame STOP
94 bool bothStrands=false; //for single-exon mRNA validation, check the other strand too
95 bool altPhases=false; //if original phase fails translation validation,
96 //try the other 2 phases until one makes it
97 bool mRNAOnly=true;
98 bool spliceCheck=false; //only known splice-sites
99
100 bool fullCDSonly=false; // starts with START, ends with STOP codon
101 bool fullattr=false;
102 //bool sortByLoc=false; // if the GFF output should be sorted by location
103 bool ensembl_convert=false; //-L, assist in converting Ensembl GTF to GFF3
104
105
106 //GStr gseqpath;
107 //GStr gcdbfa;
108 //bool multiGSeq=false; //if a directory or a .cidx file was given to -g option
109 //GFaSeqGet* faseq=NULL;
110 //GCdbYank* gcdb=NULL;
111 //int gseq_id=-1; //current genome sequence ID -- the current GffObj::gseq_id
112 bool fmtGTF=false;
113 bool addDescr=false;
114 //bool protmap=false;
115 bool multiExon=false;
116 bool writeExonSegs=false;
117 char* tracklabel=NULL;
118 int maxintron=999000000;
119 bool mergeCloseExons=false;
120 //range filter:
121 char* rfltGSeq=NULL;
122 char rfltStrand=0;
123 uint rfltStart=0;
124 uint rfltEnd=MAX_UINT;
125 bool rfltWithin=false; //check for full containment within given range
126 bool noExonAttr=false;
127
128 bool doCluster=false;
129 bool doCollapseRedundant=false;
130
131 GList<GenomicSeqData> g_data(true,true,true); //list of GFF records by genomic seq
132
133 //hash with sequence info
134 GHash<SeqInfo> seqinfo;
135 GHash<int> isoCounter; //counts the valid isoforms
136 GHash<RefTran> reftbl;
137 GHash<GeneInfo> gene_ids;
138 //min-max gene span associated to chr|gene_id (mostly for Ensembl conversion)
139
140 bool debugMode=false;
141 bool verbose=false;
142
143 void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) {
144 GLineReader fr(f);
145 while (!fr.isEof()) {
146 char* line=fr.getLine();
147 if (line==NULL) break;
148 char* id=line;
149 char* lenstr=NULL;
150 char* text=NULL;
151 char* p=line;
152 while (*p!=0 && !isspace(*p)) p++;
153 if (*p==0) continue;
154 *p=0;p++;
155 while (*p==' ' || *p=='\t') p++;
156 if (*p==0) continue;
157 lenstr=p;
158 while (*p!=0 && !isspace(*p)) p++;
159 if (*p!=0) { *p=0;p++; }
160 while (*p==' ' || *p=='\t') p++;
161 if (*p!=0) text=p; //else text remains NULL
162 int len=0;
163 if (!parseInt(lenstr,len)) {
164 GMessage("Warning: could not parse sequence length: %s %s\n",
165 id, lenstr);
166 continue;
167 }
168 // --- here we have finished parsing the line
169 si.Add(id, new SeqInfo(len,text));
170 } //while lines
171 }
172
173 void loadRefTable(FILE* f, GHash<RefTran>& rt) {
174 GLineReader fr(f);
175 char* line=NULL;
176 while ((line=fr.getLine())) {
177 char* orig_id=line;
178 char* p=line;
179 while (*p!=0 && !isspace(*p)) p++;
180 if (*p==0) continue;
181 *p=0;p++;//split the line here
182 while (*p==' ' || *p=='\t') p++;
183 if (*p==0) continue;
184 rt.Add(orig_id, new RefTran(p));
185 } //while lines
186 }
187
188 char* getSeqDescr(char* seqid) {
189 static char charbuf[128];
190 if (seqinfo.Count()==0) return NULL;
191 char* suf=rstrchr(seqid, '.');
192 if (suf!=NULL) *suf=0;
193 SeqInfo* seqd=seqinfo.Find(seqid);
194 if (suf!=NULL) *suf='.';
195 if (seqd!=NULL) {
196 GStr s(seqd->descr);
197 //cleanup some Uniref gunk
198 if (s[0]=='[') {
199 int r=s.index(']');
200 if (r>=0 && r<8 && isdigit(s[1]))
201 s.remove(0,r+1);
202 }
203 if (s.length()>80) {
204 int r=s.index(';');
205 if (r>5) s.cut(r);
206 }
207 if (s.length()>127) {
208 s.cut(127);
209 int r=s.rindex(' ');
210 if (r>0) s.cut(r);
211 }
212 strcpy(charbuf, s.chars());
213 return charbuf;
214 }
215 else return NULL;
216 }
217
218 char* getSeqName(char* seqid) {
219 static char charbuf[128];
220 char* suf=rstrchr(seqid, '.');
221 if (suf!=NULL) *suf=0;
222 strcpy(charbuf, seqid);
223 if (suf!=NULL) *suf='.';
224 return charbuf;
225 }
226
227 GFaSeqGet* fastaSeqGet(GFastaDb& gfasta, GffObj& gffrec) {
228 if (gfasta.fastaPath==NULL) return NULL;
229 return gfasta.fetch(gffrec.gseq_id);
230 }
231
232
233 int adjust_stopcodon(GffObj& gffrec, int adj, GList<GSeg>* seglst=NULL) {
234 //adj>0 => extedn CDS, adj<0 => shrink CDS
235 //when CDS is expanded, exons have to be checked too and
236 // expanded accordingly if they had the same boundary
237 int realadj=0;
238 if (gffrec.strand=='-') {
239 if ((int)gffrec.CDstart>adj) {
240
241 gffrec.CDstart-=adj;
242 realadj=adj;
243 if (adj<0) { //restore
244 if (gffrec.exons.First()->start==gffrec.CDstart+adj) {
245 gffrec.exons.First()->start-=adj;
246 gffrec.start=gffrec.exons.First()->start;
247 gffrec.covlen+=adj;
248 }
249 }
250 else if (gffrec.exons.First()->start>=gffrec.CDstart) {
251 gffrec.exons.First()->start-=adj;
252 gffrec.start=gffrec.exons.First()->start;
253 gffrec.covlen+=adj;
254 }
255 }
256 }
257 else {
258 realadj=adj;
259 gffrec.CDend+=adj;
260 if (adj<0) {//restore
261 if (gffrec.exons.Last()->end==gffrec.CDend-adj) {
262 gffrec.exons.Last()->end+=adj;
263 gffrec.end=gffrec.exons.Last()->end;
264 gffrec.covlen+=adj;
265 }
266 }
267 else if (gffrec.exons.Last()->end<=gffrec.CDend) {
268 gffrec.exons.Last()->end+=adj;
269 gffrec.end=gffrec.exons.Last()->end;
270 gffrec.covlen+=adj;
271 }
272 }
273 if (seglst!=NULL) seglst->Last()->end+=adj;
274 return realadj;
275 }
276
277 bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
278 //returns true if the transcript passed the filter
279 char* gname=gffrec.getGeneName();
280 if (gname==NULL) gname=gffrec.getGeneID();
281 GStr defline(gffrec.getID());
282 if (f_out && !fmtGTF) {
283 const char* tname=NULL;
284 if ((tname=gffrec.getAttr("transcript_name"))!=NULL) {
285 gffrec.addAttr("Name", tname);
286 gffrec.removeAttr("transcript_name");
287 }
288 }
289 if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) {
290 const char* biotype=gffrec.getAttr("gene_biotype");
291 if (biotype) {
292 gffrec.addAttr("type", biotype);
293 gffrec.removeAttr("gene_biotype");
294 }
295 else { //old Ensembl files lacking gene_biotype
296 gffrec.addAttr("type", gffrec.getTrackName());
297 }
298
299 //bool is_gene=false;
300 bool is_pseudo=false;
301 if (strcmp(biotype, "protein_coding")==0 || gffrec.hasCDS())
302 gffrec.setFeatureName("mRNA");
303 else {
304 if (strcmp(biotype, "processed_transcript")==0)
305 gffrec.setFeatureName("proc_RNA");
306 else {
307 //is_gene=endsWith(biotype, "gene");
308 is_pseudo=strifind(biotype, "pseudo");
309 if (is_pseudo) {
310 gffrec.setFeatureName("pseudo_RNA");
311 }
312 else if (endsWith(biotype, "RNA")) {
313 gffrec.setFeatureName(biotype);
314 } else gffrec.setFeatureName("misc_RNA");
315 }
316 }
317 }
318 if (gname && strcmp(gname, gffrec.getID())!=0) {
319 int* isonum=isoCounter.Find(gname);
320 if (isonum==NULL) {
321 isonum=new int(1);
322 isoCounter.Add(gname,isonum);
323 }
324 else (*isonum)++;
325 defline.appendfmt(" gene=%s", gname);
326 }
327 int seqlen=0;
328
329 const char* tlabel=tracklabel;
330 if (tlabel==NULL) tlabel=gffrec.getTrackName();
331 //defline.appendfmt(" track:%s",tlabel);
332 char* cdsnt = NULL;
333 char* cdsaa = NULL;
334 int aalen=0;
335 for (int i=1;i<gffrec.exons.Count();i++) {
336 int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1;
337 if (ilen>4000000)
338 GMessage("Warning: very large intron (%d) for transcript %s\n",
339 ilen, gffrec.getID());
340 if (ilen>maxintron) {
341 return false;
342 }
343 }
344 GList<GSeg> seglst(false,true);
345 GFaSeqGet* faseq=fastaSeqGet(gfasta, gffrec);
346 if (spliceCheck && gffrec.exons.Count()>1) {
347 //check introns for splice site consensi ( GT-AG, GC-AG or AT-AC )
348 if (faseq==NULL) GError("Error: no genomic sequence available!\n");
349 int glen=gffrec.end-gffrec.start+1;
350 const char* gseq=faseq->subseq(gffrec.start, glen);
351 bool revcompl=(gffrec.strand=='-');
352 bool ssValid=true;
353 for (int e=1;e<gffrec.exons.Count();e++) {
354 const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start;
355 int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1;
356 GSpliceSite acceptorSite(intron,intronlen,true, revcompl);
357 GSpliceSite donorSite(intron,intronlen, false, revcompl);
358 //GMessage("%c intron %d-%d : %s .. %s\n",
359 // gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt);
360 if (acceptorSite=="AG") { // GT-AG or GC-AG
361 if (!donorSite.canonicalDonor()) {
362 ssValid=false;break;
363 }
364 }
365 else if (acceptorSite=="AC") { //
366 if (donorSite!="AT") { ssValid=false; break; }
367 }
368 else { ssValid=false; break; }
369 }
370 //GFREE(gseq);
371 if (!ssValid) {
372 if (verbose)
373 GMessage("Invalid splice sites found for '%s'\n",gffrec.getID());
374 return false; //don't print this one!
375 }
376 }
377
378 bool trprint=true;
379 int stopCodonAdjust=0;
380 int mCDphase=0;
381 bool hasStop=false;
382 if (gffrec.CDphase=='1' || gffrec.CDphase=='2')
383 mCDphase = gffrec.CDphase-'0';
384 if (f_y!=NULL || f_x!=NULL || validCDSonly) {
385 if (faseq==NULL) GError("Error: no genomic sequence provided!\n");
386 //if (protmap && fullCDSonly) {
387 //if (protmap && (fullCDSonly || (gffrec.qlen>0 && gffrec.qend==gffrec.qlen))) {
388
389 if (validCDSonly) { //make sure the stop codon is always included
390 //adjust_stopcodon(gffrec,3);
391 stopCodonAdjust=adjust_stopcodon(gffrec,3);
392 }
393 int strandNum=0;
394 int phaseNum=0;
395 CDS_CHECK:
396 cdsnt=gffrec.getSpliced(faseq, true, &seqlen,NULL,NULL,&seglst);
397 if (cdsnt==NULL) trprint=false;
398 if (validCDSonly) {
399 cdsaa=translateDNA(cdsnt, aalen, seqlen);
400 char* p=strchr(cdsaa,'.');
401 hasStop=false;
402 if (p!=NULL) {
403 if (p-cdsaa>=aalen-2) { //stop found as the last codon
404 *p='0';//remove it
405 hasStop=true;
406 if (aalen-2==p-cdsaa) {
407 //previous to last codon is the stop codon
408 //so correct the CDS stop accordingly
409 adjust_stopcodon(gffrec,-3, &seglst);
410 stopCodonAdjust=0; //clear artificial stop adjustment
411 seqlen-=3;
412 cdsnt[seqlen]=0;
413 }
414 aalen=p-cdsaa;
415 }
416 else {//stop found before the last codon
417 trprint=false;
418 }
419 }//stop codon found
420 if (trprint==false) { //failed CDS validity check
421 //in-frame stop codon found
422 if (altPhases && phaseNum<3) {
423 phaseNum++;
424 gffrec.CDphase = '0'+((mCDphase+phaseNum)%3);
425 GFREE(cdsaa);
426 goto CDS_CHECK;
427 }
428 if (gffrec.exons.Count()==1 && bothStrands) {
429 strandNum++;
430 phaseNum=0;
431 if (strandNum<2) {
432 GFREE(cdsaa);
433 gffrec.strand = (gffrec.strand=='-') ? '+':'-';
434 goto CDS_CHECK; //repeat the CDS check for a different frame
435 }
436 }
437 if (verbose) GMessage("In-frame STOP found for '%s'\n",gffrec.getID());
438 } //has in-frame STOP
439 if (fullCDSonly) {
440 if (!hasStop || cdsaa[0]!='M') trprint=false;
441 }
442 } // CDS check requested
443 } //translation or codon check/output was requested
444 if (!trprint) {
445 GFREE(cdsnt);
446 GFREE(cdsaa);
447 return false;
448 }
449 if (stopCodonAdjust>0 && !hasStop) {
450 //restore stop codon location
451 adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst);
452 if (cdsnt!=NULL && seqlen>0) {
453 seqlen-=stopCodonAdjust;
454 cdsnt[seqlen]=0;
455 }
456 if (cdsaa!=NULL) aalen--;
457 }
458
459 if (f_y!=NULL) { //CDS translation fasta output requested
460 //char*
461 if (cdsaa==NULL) { //translate now if not done before
462 cdsaa=translateDNA(cdsnt, aalen, seqlen);
463 }
464 if (fullattr && gffrec.attrs!=NULL) {
465 //append all attributes found for each transcripts
466 for (int i=0;i<gffrec.attrs->Count();i++) {
467 defline.append(" ");
468 defline.append(gffrec.getAttrName(i));
469 defline.append("=");
470 defline.append(gffrec.getAttrValue(i));
471 }
472 }
473 printFasta(f_y, defline, cdsaa, aalen);
474 }
475 if (f_x!=NULL) { //CDS only
476 if (writeExonSegs) {
477 defline.append(" loc:");
478 defline.append(gffrec.getGSeqName());
479 defline.appendfmt("(%c)",gffrec.strand);
480 //warning: not CDS coordinates are written here, but the exon ones
481 defline+=(int)gffrec.start;
482 defline+=(char)'-';
483 defline+=(int)gffrec.end;
484 // -- here these are CDS substring coordinates on the spliced sequence:
485 defline.append(" segs:");
486 for (int i=0;i<seglst.Count();i++) {
487 if (i>0) defline.append(",");
488 defline+=(int)seglst[i]->start;
489 defline.append("-");
490 defline+=(int)seglst[i]->end;
491 }
492 }
493 if (fullattr && gffrec.attrs!=NULL) {
494 //append all attributes found for each transcript
495 for (int i=0;i<gffrec.attrs->Count();i++) {
496 defline.append(" ");
497 defline.append(gffrec.getAttrName(i));
498 defline.append("=");
499 defline.append(gffrec.getAttrValue(i));
500 }
501 }
502 printFasta(f_x, defline, cdsnt, seqlen);
503 }
504 GFREE(cdsnt);
505 GFREE(cdsaa);
506 if (f_w!=NULL) { //write spliced exons
507 uint cds_start=0;
508 uint cds_end=0;
509 seglst.Clear();
510 char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
511 if (exont!=NULL) {
512 if (gffrec.CDstart>0) {
513 defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
514 }
515 if (writeExonSegs) {
516 defline.append(" loc:");
517 defline.append(gffrec.getGSeqName());
518 defline+=(char)'|';
519 defline+=(int)gffrec.start;
520 defline+=(char)'-';
521 defline+=(int)gffrec.end;
522 defline+=(char)'|';
523 defline+=(char)gffrec.strand;
524 defline.append(" exons:");
525 for (int i=0;i<gffrec.exons.Count();i++) {
526 if (i>0) defline.append(",");
527 defline+=(int)gffrec.exons[i]->start;
528 defline.append("-");
529 defline+=(int)gffrec.exons[i]->end;
530 }
531 defline.append(" segs:");
532 for (int i=0;i<seglst.Count();i++) {
533 if (i>0) defline.append(",");
534 defline+=(int)seglst[i]->start;
535 defline.append("-");
536 defline+=(int)seglst[i]->end;
537 }
538 }
539 if (fullattr && gffrec.attrs!=NULL) {
540 //append all attributes found for each transcripts
541 for (int i=0;i<gffrec.attrs->Count();i++) {
542 defline.append(" ");
543 defline.append(gffrec.getAttrName(i));
544 defline.append("=");
545 defline.append(gffrec.getAttrValue(i));
546 }
547 }
548 printFasta(f_w, defline, exont, seqlen);
549 GFREE(exont);
550 }
551 } //writing f_w (spliced exons)
552 return true;
553 }
554
555 void openfw(FILE* &f, GArgs& args, char opt) {
556 GStr s=args.getOpt(opt);
557 if (!s.is_empty()) {
558 if (s=='-')
559 f=stdout;
560 else {
561 f=fopen(s,"w");
562 if (f==NULL) GError("Error creating file: %s\n", s.chars());
563 }
564 }
565 }
566
567 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
568 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
569
570 void printGff3Header(FILE* f, GArgs& args) {
571 fprintf(f, "# ");
572 args.printCmdLine(f);
573 fprintf(f, "##gff-version 3\n");
574 //for (int i=0;i<gseqdata.Count();i++) {
575 //
576 //}
577 }
578
579 bool validateGffRec(GffObj* gffrec, GList<GffObj>* gfnew) {
580 if (reftbl.Count()>0) {
581 GStr refname(gffrec->getRefName());
582 RefTran* rt=reftbl.Find(refname.chars());
583 if (rt==NULL && refname.length()>2 && refname[-2]=='.' && isdigit(refname[-1])) {
584 //try removing the version suffix
585 refname.cut(-2);
586 //GMessage("[DEBUG] Trying ref name '%s'...\n", refname.chars());
587 rt=reftbl.Find(refname.chars());
588 }
589 if (rt) {
590 gffrec->setRefName(rt->new_name);
591 }
592 else return false; //discard, ref seq not in the given translation table
593 }
594 if (mRNAOnly && gffrec->isDiscarded()) {
595 //discard generic "gene" or "locus" features with no other detailed subfeatures
596 //GMessage("Warning: discarding %s GFF generic gene/locus container %s\n",m->getID());
597 return false;
598 }
599 /*
600 if (gffrec->exons.Count()==0 && gffrec->children.Count()==0)) {
601 //a non-mRNA feature with no subfeatures
602 //just so we get some sequence functions working, add a dummy "exon"-like subfeature here
603 //--this could be a single "pseudogene" entry or another genomic region without exons
604 //
605 gffrec->addExon(gffrec->start,gffrec->end);
606 }
607 */
608 if (rfltGSeq!=NULL) { //filter by gseqName
609 if (strcmp(gffrec->getGSeqName(),rfltGSeq)!=0) {
610 return false;
611 }
612 }
613 if (rfltStrand>0 && gffrec->strand !=rfltStrand) {
614 return false;
615 }
616 //check coordinates
617 if (rfltStart!=0 || rfltEnd!=MAX_UINT) {
618 if (rfltWithin) {
619 if (gffrec->start<rfltStart || gffrec->end>rfltEnd) {
620 return false; //not within query range
621 }
622 }
623 else {
624 if (gffrec->start>rfltEnd || gffrec->end<rfltStart) {
625 return false;
626 }
627 }
628 }
629 if (multiExon && gffrec->exons.Count()<=1) {
630 return false;
631 }
632 if (wCDSonly && gffrec->CDstart==0) {
633 return false;
634 }
635 if (ensembl_convert && startsWith(gffrec->getID(), "ENS")) {
636 //keep track of chr|gene_id data -- coordinate range
637 char* geneid=gffrec->getGeneID();
638 if (geneid!=NULL) {
639 GeneInfo* ginfo=gene_ids.Find(geneid);
640 if (ginfo==NULL) {//first time seeing this gene ID
641 GeneInfo* geneinfo=new GeneInfo(gffrec, ensembl_convert);
642 gene_ids.Add(geneid, geneinfo);
643 if (gfnew!=NULL) gfnew->Add(geneinfo->gf);
644 }
645 else ginfo->update(gffrec);
646 }
647 }
648 return true;
649 }
650
651
652 int main(int argc, char * const argv[]) {
653 GArgs args(argc, argv,
654 "debug;merge;cluster-only;help;force-exons;MINCOV=MINPID=hvOUNHWCVJMKQNSXTDAPRZFGLEm:g:i:r:s:t:a:b:o:w:x:y:d:");
655 args.printError(USAGE, true);
656 if (args.getOpt('h') || args.getOpt("help")) {
657 GMessage("%s",USAGE);
658 exit(1);
659 }
660 debugMode=(args.getOpt("debug")!=NULL);
661 bool forceExons=(args.getOpt("force-exons")!=NULL);
662 mRNAOnly=(args.getOpt('O')==NULL);
663 //sortByLoc=(args.getOpt('S')!=NULL);
664 addDescr=(args.getOpt('A')!=NULL);
665 verbose=(args.getOpt('v')!=NULL);
666 wCDSonly=(args.getOpt('C')!=NULL);
667 validCDSonly=(args.getOpt('V')!=NULL);
668 altPhases=(args.getOpt('H')!=NULL);
669 fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF
670 bothStrands=(args.getOpt('B')!=NULL);
671 fullCDSonly=(args.getOpt('J')!=NULL);
672 spliceCheck=(args.getOpt('N')!=NULL);
673 bool matchAllIntrons=(args.getOpt('K')==NULL);
674 bool fuzzSpan=(args.getOpt('Q')!=NULL);
675 if (args.getOpt('M') || args.getOpt("merge")) {
676 doCluster=true;
677 doCollapseRedundant=true;
678 }
679 else {
680 if (!matchAllIntrons || fuzzSpan) {
681 GMessage("%s",USAGE);
682 GMessage("Error: -K or -Q options require -M/--merge option!\n");
683 exit(1);
684 }
685 }
686 if (args.getOpt("cluster-only")) {
687 doCluster=true;
688 doCollapseRedundant=false;
689 if (!matchAllIntrons || fuzzSpan) {
690 GMessage("%s",USAGE);
691 GMessage("Error: -K or -Q options have no effect with --cluster-only.\n");
692 exit(1);
693 }
694 }
695 if (fullCDSonly) validCDSonly=true;
696 if (verbose) {
697 fprintf(stderr, "Command line was:\n");
698 args.printCmdLine(stderr);
699 }
700
701 fullattr=(args.getOpt('F')!=NULL);
702 if (args.getOpt('G')==NULL)
703 noExonAttr=!fullattr;
704 else {
705 noExonAttr=true;
706 fullattr=true;
707 }
708 ensembl_convert=(args.getOpt('L')!=NULL);
709 if (ensembl_convert) {
710 fullattr=true;
711 noExonAttr=false;
712 //sortByLoc=true;
713 }
714
715 mergeCloseExons=(args.getOpt('Z')!=NULL);
716 multiExon=(args.getOpt('U')!=NULL);
717 writeExonSegs=(args.getOpt('W')!=NULL);
718 tracklabel=args.getOpt('t');
719 GFastaDb gfasta(args.getOpt('g'));
720 //if (gfasta.fastaPath!=NULL)
721 // sortByLoc=true; //enforce sorting by chromosome/contig
722 GStr s=args.getOpt('i');
723 if (!s.is_empty()) maxintron=s.asInt();
724
725 FILE* f_repl=NULL;
726 s=args.getOpt('d');
727 if (!s.is_empty()) {
728 if (s=="-") f_repl=stdout;
729 else {
730 f_repl=fopen(s.chars(), "w");
731 if (f_repl==NULL) GError("Error creating file %s\n", s.chars());
732 }
733 }
734
735 rfltWithin=(args.getOpt('R')!=NULL);
736 s=args.getOpt('r');
737 if (!s.is_empty()) {
738 s.trim();
739 if (s[0]=='+' || s[0]=='-') {
740 rfltStrand=s[0];
741 s.cut(0,1);
742 }
743 int isep=s.index(':');
744 if (isep>0) { //gseq name given
745 if (rfltStrand==0 && (s[isep-1]=='+' || s[isep-1]=='-')) {
746 isep--;
747 rfltStrand=s[isep];
748 s.cut(isep,1);
749 }
750 if (isep>0)
751 rfltGSeq=Gstrdup((s.substr(0,isep)).chars());
752 s.cut(0,isep+1);
753 }
754 GStr gsend;
755 char slast=s[s.length()-1];
756 if (rfltStrand==0 && (slast=='+' || slast=='-')) {
757 s.chomp(slast);
758 rfltStrand=slast;
759 }
760 if (s.index("..")>=0) gsend=s.split("..");
761 else gsend=s.split('-');
762 if (!s.is_empty()) rfltStart=(uint)s.asInt();
763 if (!gsend.is_empty()) {
764 rfltEnd=(uint)gsend.asInt();
765 if (rfltEnd==0) rfltEnd=MAX_UINT;
766 }
767 } //gseq/range filtering
768 else {
769 if (rfltWithin)
770 GError("Error: option -R requires -r!\n");
771 //if (rfltWholeTranscript)
772 // GError("Error: option -P requires -r!\n");
773 }
774 s=args.getOpt('m');
775 if (!s.is_empty()) {
776 FILE* ft=fopen(s,"r");
777 if (ft==NULL) GError("Error opening reference table: %s\n",s.chars());
778 loadRefTable(ft, reftbl);
779 fclose(ft);
780 }
781 s=args.getOpt('s');
782 if (!s.is_empty()) {
783 FILE* fsize=fopen(s,"r");
784 if (fsize==NULL) GError("Error opening info file: %s\n",s.chars());
785 loadSeqInfo(fsize, seqinfo);
786 fclose(fsize);
787 }
788
789 openfw(f_out, args, 'o');
790 //if (f_out==NULL) f_out=stdout;
791 if (gfasta.fastaPath==NULL && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL))
792 GError("Error: -g option is required for options -w, -x, -y, -V, -N, -M !\n");
793
794 openfw(f_w, args, 'w');
795 openfw(f_x, args, 'x');
796 openfw(f_y, args, 'y');
797 if (f_y!=NULL || f_x!=NULL) wCDSonly=true;
798 //useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL);
799
800 int numfiles = args.startNonOpt();
801 //GList<GffObj> gfkept(false,true); //unsorted, free items on delete
802 int out_counter=0; //number of records printed
803 while (true) {
804 GStr infile;
805 if (numfiles) {
806 infile=args.nextNonOpt();
807 if (infile.is_empty()) break;
808 if (infile=="-") { f_in=stdin; infile="stdin"; }
809 else
810 if ((f_in=fopen(infile, "r"))==NULL)
811 GError("Error: cannot open input file %s!\n",infile.chars());
812 }
813 else
814 infile="-";
815 GffLoader gffloader(infile.chars());
816 gffloader.transcriptsOnly=mRNAOnly;
817 gffloader.fullAttributes=fullattr;
818 gffloader.noExonAttrs=noExonAttr;
819 gffloader.mergeCloseExons=mergeCloseExons;
820 gffloader.showWarnings=(args.getOpt('E')!=NULL);
821 gffloader.load(g_data, &validateGffRec, doCluster, doCollapseRedundant,
822 matchAllIntrons, fuzzSpan, forceExons);
823 if (doCluster)
824 collectLocusData(g_data);
825 if (numfiles==0) break;
826 }
827
828 GStr loctrack("gffcl");
829 if (tracklabel) loctrack=tracklabel;
830 g_data.setSorted(&gseqCmpName);
831 if (doCluster) {
832 //grouped in loci
833 for (int g=0;g<g_data.Count();g++) {
834 GenomicSeqData* gdata=g_data[g];
835 for (int l=0;l<gdata->loci.Count();l++) {
836 GffLocus& loc=*(gdata->loci[l]);
837 //check all non-replaced transcripts in this locus:
838 int numvalid=0;
839 int idxfirstvalid=-1;
840 for (int i=0;i<loc.rnas.Count();i++) {
841 GffObj& t=*(loc.rnas[i]);
842 GTData* tdata=(GTData*)(t.uptr);
843 if (tdata->replaced_by!=NULL) {
844 if (f_repl && (t.udata & 8)==0) {
845 //t.udata|=8;
846 fprintf(f_repl, "%s", t.getID());
847 GTData* rby=tdata;
848 while (rby->replaced_by!=NULL) {
849 fprintf(f_repl," => %s", rby->replaced_by->getID());
850 rby->rna->udata|=8;
851 rby=(GTData*)(rby->replaced_by->uptr);
852 }
853 fprintf(f_repl, "\n");
854 }
855 continue;
856 }
857 if (process_transcript(gfasta, t)) {
858 t.udata|=4; //tag it as valid
859 numvalid++;
860 if (idxfirstvalid<0) idxfirstvalid=i;
861 }
862 }
863
864 if (f_out && numvalid>0) {
865 GStr locname("RLOC_");
866 locname.appendfmt("%08d",loc.locus_num);
867 if (!fmtGTF) {
868 if (out_counter==0)
869 printGff3Header(f_out, args);
870 fprintf(f_out,"%s\t%s\tlocus\t%d\t%d\t.\t%c\t.\tID=%s;locus=%s",
871 loc.rnas[0]->getGSeqName(), loctrack.chars(), loc.start, loc.end, loc.strand,
872 locname.chars(), locname.chars());
873 //const char* loc_gname=loc.getGeneName();
874 if (loc.gene_names.Count()>0) { //print all gene names associated to this locus
875 fprintf(f_out, ";genes=%s",loc.gene_names.First()->name.chars());
876 for (int i=1;i<loc.gene_names.Count();i++) {
877 fprintf(f_out, ",%s",loc.gene_names[i]->name.chars());
878 }
879 }
880 if (loc.gene_ids.Count()>0) { //print all GeneIDs names associated to this locus
881 fprintf(f_out, ";geneIDs=%s",loc.gene_ids.First()->name.chars());
882 for (int i=1;i<loc.gene_ids.Count();i++) {
883 fprintf(f_out, ",%s",loc.gene_ids[i]->name.chars());
884 }
885 }
886 fprintf(f_out, ";transcripts=%s",loc.rnas[idxfirstvalid]->getID());
887 for (int i=idxfirstvalid+1;i<loc.rnas.Count();i++) {
888 fprintf(f_out, ",%s",loc.rnas[i]->getID());
889 }
890 fprintf(f_out, "\n");
891 }
892 //now print all valid, non-replaced transcripts in this locus:
893 for (int i=0;i<loc.rnas.Count();i++) {
894 GffObj& t=*(loc.rnas[i]);
895 GTData* tdata=(GTData*)(t.uptr);
896 if (tdata->replaced_by!=NULL || ((t.udata & 4)==0)) continue;
897 t.addAttr("locus", locname.chars());
898 out_counter++;
899 if (fmtGTF) t.printGtf(f_out, tracklabel);
900 else {
901 //print the parent first, if any
902 if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
903 GTData* pdata=(GTData*)(t.parent->uptr);
904 if (pdata->geneinfo!=NULL)
905 pdata->geneinfo->finalize();
906 t.parent->addAttr("locus", locname.chars());
907 t.parent->printGff(f_out, tracklabel);
908 t.parent->udata|=4;
909 }
910 t.printGff(f_out, tracklabel);
911 }
912 }
913 } //have valid transcripts to print
914 }//for each locus
915 if (f_out && !mRNAOnly) {
916 //final pass through the non-transcripts, in case any of them were not printed
917 //TODO: order broken, these should be interspersed among the rnas in the correct order!
918 for (int m=0;m<gdata->gfs.Count();m++) {
919 GffObj& t=*(gdata->gfs[m]);
920 if ((t.udata&4)==0) { //never printed
921 t.udata|=4;
922 if (fmtGTF) t.printGtf(f_out, tracklabel);
923 else t.printGff(f_out, tracklabel);
924 }
925 } //for each non-transcript
926 }
927 } //for each genomic sequence
928 }
929 else {
930 //not grouped into loci, print the rnas with their parents, if any
931 int numvalid=0;
932 for (int g=0;g<g_data.Count();g++) {
933 GenomicSeqData* gdata=g_data[g];
934 for (int m=0;m<gdata->rnas.Count();m++) {
935 GffObj& t=*(gdata->rnas[m]);
936 GTData* tdata=(GTData*)(t.uptr);
937 if (tdata->replaced_by!=NULL) continue;
938 if (process_transcript(gfasta, t)) {
939 t.udata|=4; //tag it as valid
940 numvalid++;
941 if (f_out) {
942 if (tdata->geneinfo) tdata->geneinfo->finalize();
943 out_counter++;
944 if (fmtGTF) t.printGtf(f_out, tracklabel);
945 else {
946 if (out_counter==1)
947 printGff3Header(f_out, args);
948 //print the parent first, if any
949 if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
950 GTData* pdata=(GTData*)(t.parent->uptr);
951 if (pdata->geneinfo!=NULL)
952 pdata->geneinfo->finalize();
953 t.parent->printGff(f_out, tracklabel);
954 t.parent->udata|=4;
955 }
956 t.printGff(f_out, tracklabel);
957 }
958 }//GFF/GTF output requested
959 } //valid transcript
960 } //for each rna
961 if (f_out && !mRNAOnly) {
962 //final pass through the non-transcripts, in case any of them were not printed
963 //TODO: order broken, these should be interspersed among the rnas in the correct order!
964 for (int m=0;m<gdata->gfs.Count();m++) {
965 GffObj& t=*(gdata->gfs[m]);
966 if ((t.udata&4)==0) { //never printed
967 t.udata|=4;
968 if (fmtGTF) t.printGtf(f_out, tracklabel);
969 else t.printGff(f_out, tracklabel);
970 }
971 } //for each non-transcript
972 }
973 } //for each genomic seq
974 }
975 if (f_repl && f_repl!=stdout) fclose(f_repl);
976 seqinfo.Clear();
977 //if (faseq!=NULL) delete faseq;
978 //if (gcdb!=NULL) delete gcdb;
979 GFREE(rfltGSeq);
980 FRCLOSE(f_in);
981 FWCLOSE(f_out);
982 FWCLOSE(f_w);
983 FWCLOSE(f_x);
984 FWCLOSE(f_y);
985 }