ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gffread/gffread.cpp
Revision: 59
Committed: Fri Sep 9 18:00:23 2011 UTC (8 years ago) by gpertea
File size: 35425 byte(s)
Log Message:
finally fixed the placeGf clustering bug

Line File contents
1 #include "gff_utils.h"
2 #include "GArgs.h"
3 #include <ctype.h>
4 // don't care about cdb compression
5 //#ifdef ENABLE_COMPRESSION
6 //#undef ENABLE_COMPRESSION
7 //#endif
8 //#include "GCdbYank.h"
9
10 #define USAGE "Usage:\n\
11 gffread <input_gff> [-g <genomic_seqs_fasta> | <dir>][-s <seq_info.fsize>] \n\
12 [-o <outfile.gff>] [-t <tname>] [-r [[<strand>]<chr>:]<start>..<end> [-R]]\n\
13 [-CTVNJMKQAFGUBHZWTOLE] [-w <exons.fa>] [-x <cds.fa>] [-y <tr_cds.fa>]\n\
14 [-i <maxintron>] \n\
15 Filters and/or converts GFF3/GTF2 records.\n\
16 <input_gff> is a GFF file, use '-' if the GFF records will be given at stdin\n\
17 \n\
18 Options:\n\
19 -g full path to a multi-fasta file with the genomic sequences\n\
20 for all input mappings, OR a directory with single-fasta files\n\
21 (one per genomic sequence, with file names matching sequence names)\n\
22 -s <seq_info.fsize> is a tab-delimited file providing this info\n\
23 for each of the mapped sequences:\n\
24 <seq-name> <seq-length> <seq-description>\n\
25 (useful for -A option with mRNA/EST/protein mappings)\n\
26 -i discard transcripts having an intron larger than <maxintron>\n\
27 -r only show transcripts overlapping coordinate range <start>..<end>\n\
28 (on chromosome/contig <chr>, strand <strand> if provided)\n\
29 -R for -r option, discard all transcripts that are not fully \n\
30 contained within the given range\n\
31 -U discard single-exon transcripts\n\
32 -C coding only: discard mRNAs that have no CDS feature\n\
33 -F full GFF attribute preservation (all attributes are shown)\n\
34 -G only parse additional exon attributes from the first exon\n\
35 and move them to the mRNA level (useful for GTF input)\n\
36 -A use the description field from <seq_info.fsize> and add it\n\
37 as the value for a 'descr' attribute to the GFF record\n\
38 \n\
39 -O process also non-transcript GFF records (by default non-transcript\n\
40 records are ignored)\n\
41 -V discard any mRNAs with CDS having in-frame stop codons\n\
42 -H for -V option, check and adjust the starting CDS phase\n\
43 if the original phase leads to a translation with an \n\
44 in-frame stop codon\n\
45 -B for -V option, single-exon transcripts are also checked on the\n\
46 opposite strand\n\
47 -N discard multi-exon mRNAs that have any intron with a non-canonical\n\
48 splice site consensus (i.e. not GT-AG, GC-AG or AT-AC)\n\
49 -J discard any mRNAs that either lack initial START codon\n\
50 or the terminal STOP codon, or have an in-frame stop codon\n\
51 (only print mRNAs with a fulll, valid CDS)\n\
52 \n\
53 -M/--merge : cluster the input transcripts into loci, collapsing matching\n\
54 transcripts (those with the same exact introns and fully contained)\n\
55 -d <dupinfo> : for -M option, write collapsing info to file <dupinfo>\n\
56 --cluster-only: same as --merge but without collapsing matching transcripts\n\
57 -K for -M option: also collapse shorter, fully contained transcripts\n\
58 with fewer introns than the container\n\
59 -Q for -M option, remove the containment restriction:\n\
60 (multi-exon transcripts will be collapsed if just their introns match,\n\
61 while single-exon transcripts can partially overlap (80%))\n\
62 \n\
63 -E expose (warn about) duplicate transcript IDs and other potential \n\
64 problems with the given GFF/GTF records\n\
65 -Z merge close exons into a single exon (for intron size<4)\n\
66 -w write a fasta file with spliced exons for each GFF transcript\n\
67 -x write a fasta file with spliced CDS for each GFF transcript\n\
68 -W for -w and -x options, also write for each fasta record the exon\n\
69 coordinates projected onto the spliced sequence\n\
70 -y write a protein fasta file with the translation of CDS for each record\n\
71 -L Ensembl GTF to GFF3 conversion (implies -F; should be used with -m)\n\
72 -m <chr_replace> is a reference (genomic) sequence replacement table with\n\
73 this format:\n\
74 <original_ref_ID> <new_ref_ID>\n\
75 GFF records on reference sequences that are not found among the\n\
76 <original_ref_ID> entries in this file will be filtered out\n\
77 -o the \"filtered\" GFF records will be written to <outfile.gff>\n\
78 (use -o- for printing to stdout)\n\
79 -t use <trackname> in the second column of each GFF output line\n\
80 -T -o option will output GTF format instead of GFF3\n\
81 "
82
83
84 class SeqInfo { //populated from the -s option of gffread
85 public:
86 int len;
87 char* descr;
88 SeqInfo( int l, char* s) {
89 len=l;
90 if (s==NULL) {
91 descr=NULL;
92 } else {
93 descr=Gstrdup(s);
94 }
95 }
96 ~SeqInfo() {
97 GFREE(descr);
98 }
99 };
100
101 class RefTran {
102 public:
103 char* new_name;
104 RefTran(char *ns) {
105 new_name=NULL;
106 if (ns!=NULL)
107 new_name=Gstrdup(ns);
108 }
109 ~RefTran() {
110 GFREE(new_name);
111 }
112 };
113
114 FILE* ffasta=NULL;
115 FILE* f_in=NULL;
116 FILE* f_out=NULL;
117 FILE* f_w=NULL; //fasta with spliced exons (transcripts)
118 FILE* f_x=NULL; //fasta with spliced CDS
119 FILE* f_y=NULL; //fasta with translated CDS
120 bool wCDSonly=false;
121
122 bool validCDSonly=false; // translation with no in-frame STOP
123 bool bothStrands=false; //for single-exon mRNA validation, check the other strand too
124 bool altPhases=false; //if original phase fails translation validation,
125 //try the other 2 phases until one makes it
126 bool mRNAOnly=true;
127 bool spliceCheck=false; //only known splice-sites
128
129 bool fullCDSonly=false; // starts with START, ends with STOP codon
130 bool fullattr=false;
131 //bool sortByLoc=false; // if the GFF output should be sorted by location
132 bool ensembl_convert=false; //-L, assisst in converting Ensembl GTF to GFF3
133
134
135 //GStr gseqpath;
136 //GStr gcdbfa;
137 //bool multiGSeq=false; //if a directory or a .cidx file was given to -g option
138 //GFaSeqGet* faseq=NULL;
139 //GCdbYank* gcdb=NULL;
140 //int gseq_id=-1; //current genome sequence ID -- the current GffObj::gseq_id
141 bool fmtGTF=false;
142 bool addDescr=false;
143 //bool protmap=false;
144 bool multiExon=false;
145 bool writeExonSegs=false;
146 char* tracklabel=NULL;
147 int maxintron=999000000;
148 bool mergeCloseExons=false;
149 //range filter:
150 char* rfltGSeq=NULL;
151 char rfltStrand=0;
152 uint rfltStart=0;
153 uint rfltEnd=MAX_UINT;
154 bool rfltWithin=false; //check for full containment within given range
155 bool noExonAttr=false;
156
157 bool doCluster=false;
158 bool doCollapseRedundant=false;
159
160 GList<GenomicSeqData> g_data(true,true,true); //list of GFF records by genomic seq
161
162 //hash with sequence info
163 GHash<SeqInfo> seqinfo;
164 GHash<int> isoCounter; //counts the valid isoforms
165 GHash<RefTran> reftbl;
166 GHash<GeneInfo> gene_ids;
167 //min-max gene span associated to chr|gene_id (mostly for Ensembl conversion)
168
169 bool debugMode=false;
170 bool verbose=false;
171
172 void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) {
173 GLineReader fr(f);
174 while (!fr.isEof()) {
175 char* line=fr.getLine();
176 if (line==NULL) break;
177 char* id=line;
178 char* lenstr=NULL;
179 char* text=NULL;
180 char* p=line;
181 while (*p!=0 && !isspace(*p)) p++;
182 if (*p==0) continue;
183 *p=0;p++;
184 while (*p==' ' || *p=='\t') p++;
185 if (*p==0) continue;
186 lenstr=p;
187 while (*p!=0 && !isspace(*p)) p++;
188 if (*p!=0) { *p=0;p++; }
189 while (*p==' ' || *p=='\t') p++;
190 if (*p!=0) text=p; //else text remains NULL
191 int len=0;
192 if (!parseInt(lenstr,len)) {
193 GMessage("Warning: could not parse sequence length: %s %s\n",
194 id, lenstr);
195 continue;
196 }
197 // --- here we have finished parsing the line
198 si.Add(id, new SeqInfo(len,text));
199 } //while lines
200 }
201
202 void loadRefTable(FILE* f, GHash<RefTran>& rt) {
203 GLineReader fr(f);
204 char* line=NULL;
205 while ((line=fr.getLine())) {
206 char* orig_id=line;
207 char* p=line;
208 while (*p!=0 && !isspace(*p)) p++;
209 if (*p==0) continue;
210 *p=0;p++;//split the line here
211 while (*p==' ' || *p=='\t') p++;
212 if (*p==0) continue;
213 rt.Add(orig_id, new RefTran(p));
214 } //while lines
215 }
216
217 char* getSeqDescr(char* seqid) {
218 static char charbuf[128];
219 if (seqinfo.Count()==0) return NULL;
220 char* suf=rstrchr(seqid, '.');
221 if (suf!=NULL) *suf=0;
222 SeqInfo* seqd=seqinfo.Find(seqid);
223 if (suf!=NULL) *suf='.';
224 if (seqd!=NULL) {
225 GStr s(seqd->descr);
226 //cleanup some Uniref gunk
227 if (s[0]=='[') {
228 int r=s.index(']');
229 if (r>=0 && r<8 && isdigit(s[1]))
230 s.remove(0,r+1);
231 }
232 if (s.length()>80) {
233 int r=s.index(';');
234 if (r>5) s.cut(r);
235 }
236 if (s.length()>127) {
237 s.cut(127);
238 int r=s.rindex(' ');
239 if (r>0) s.cut(r);
240 }
241 strcpy(charbuf, s.chars());
242 return charbuf;
243 }
244 else return NULL;
245 }
246
247 char* getSeqName(char* seqid) {
248 static char charbuf[128];
249 char* suf=rstrchr(seqid, '.');
250 if (suf!=NULL) *suf=0;
251 strcpy(charbuf, seqid);
252 if (suf!=NULL) *suf='.';
253 return charbuf;
254 }
255
256 GFaSeqGet* fastaSeqGet(GFastaDb& gfasta, GffObj& gffrec) {
257 if (gfasta.fastaPath==NULL) return NULL;
258 return gfasta.fetch(gffrec.gseq_id);
259 }
260
261
262 int adjust_stopcodon(GffObj& gffrec, int adj, GList<GSeg>* seglst=NULL) {
263 //adj>0 => extedn CDS, adj<0 => shrink CDS
264 //when CDS is expanded, exons have to be checked too and
265 // expanded accordingly if they had the same boundary
266 int realadj=0;
267 if (gffrec.strand=='-') {
268 if ((int)gffrec.CDstart>adj) {
269
270 gffrec.CDstart-=adj;
271 realadj=adj;
272 if (adj<0) { //restore
273 if (gffrec.exons.First()->start==gffrec.CDstart+adj) {
274 gffrec.exons.First()->start-=adj;
275 gffrec.start=gffrec.exons.First()->start;
276 gffrec.covlen+=adj;
277 }
278 }
279 else if (gffrec.exons.First()->start>=gffrec.CDstart) {
280 gffrec.exons.First()->start-=adj;
281 gffrec.start=gffrec.exons.First()->start;
282 gffrec.covlen+=adj;
283 }
284 }
285 }
286 else {
287 realadj=adj;
288 gffrec.CDend+=adj;
289 if (adj<0) {//restore
290 if (gffrec.exons.Last()->end==gffrec.CDend-adj) {
291 gffrec.exons.Last()->end+=adj;
292 gffrec.end=gffrec.exons.Last()->end;
293 gffrec.covlen+=adj;
294 }
295 }
296 else if (gffrec.exons.Last()->end<=gffrec.CDend) {
297 gffrec.exons.Last()->end+=adj;
298 gffrec.end=gffrec.exons.Last()->end;
299 gffrec.covlen+=adj;
300 }
301 }
302 if (seglst!=NULL) seglst->Last()->end+=adj;
303 return realadj;
304 }
305
306 bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
307 //returns true if the transcript passed the filter
308 char* gname=gffrec.getGeneName();
309 if (gname==NULL) gname=gffrec.getGeneID();
310 GStr defline(gffrec.getID());
311 if (f_out && !fmtGTF) {
312 const char* tn=NULL;
313 if ((tn=gffrec.getAttr("transcript_name"))!=NULL) {
314 gffrec.addAttr("Name", tn);
315 gffrec.removeAttr("transcript_name");
316 }
317 }
318 if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) {
319 const char* tn=gffrec.getTrackName();
320 gffrec.addAttr("type", tn);
321 //bool is_gene=false;
322 bool is_pseudo=false;
323 if (strcmp(tn, "protein_coding")==0 || gffrec.hasCDS())
324 gffrec.setFeatureName("mRNA");
325 else {
326 if (strcmp(tn, "processed_transcript")==0)
327 gffrec.setFeatureName("proc_RNA");
328 else {
329 //is_gene=endsWith(tn, "gene");
330 is_pseudo=strifind(tn, "pseudo");
331 if (is_pseudo) {
332 gffrec.setFeatureName("pseudo_RNA");
333 }
334 else if (endsWith(tn, "RNA")) {
335 gffrec.setFeatureName(tn);
336 } else gffrec.setFeatureName("misc_RNA");
337 }
338 }
339 }
340 if (gname && strcmp(gname, gffrec.getID())!=0) {
341 int* isonum=isoCounter.Find(gname);
342 if (isonum==NULL) {
343 isonum=new int(1);
344 isoCounter.Add(gname,isonum);
345 }
346 else (*isonum)++;
347 defline.appendfmt(" gene=%s", gname);
348 }
349 int seqlen=0;
350
351 const char* tlabel=tracklabel;
352 if (tlabel==NULL) tlabel=gffrec.getTrackName();
353 //defline.appendfmt(" track:%s",tlabel);
354 char* cdsnt = NULL;
355 char* cdsaa = NULL;
356 int aalen=0;
357 for (int i=1;i<gffrec.exons.Count();i++) {
358 int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1;
359 if (ilen>4000000)
360 GMessage("Warning: very large intron (%d) for transcript %s\n",
361 ilen, gffrec.getID());
362 if (ilen>maxintron) {
363 return false;
364 }
365 }
366 GList<GSeg> seglst(false,true);
367 GFaSeqGet* faseq=fastaSeqGet(gfasta, gffrec);
368 if (spliceCheck && gffrec.exons.Count()>1) {
369 //check introns for splice site consensi ( GT-AG, GC-AG or AT-AC )
370 if (faseq==NULL) GError("Error: no genomic sequence available!\n");
371 int glen=gffrec.end-gffrec.start+1;
372 const char* gseq=faseq->subseq(gffrec.start, glen);
373 bool revcompl=(gffrec.strand=='-');
374 bool ssValid=true;
375 for (int e=1;e<gffrec.exons.Count();e++) {
376 const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start;
377 int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1;
378 GSpliceSite acceptorSite(intron,intronlen,true, revcompl);
379 GSpliceSite donorSite(intron,intronlen, false, revcompl);
380 //GMessage("%c intron %d-%d : %s .. %s\n",
381 // gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt);
382 if (acceptorSite=="AG") { // GT-AG or GC-AG
383 if (!donorSite.canonicalDonor()) {
384 ssValid=false;break;
385 }
386 }
387 else if (acceptorSite=="AC") { //
388 if (donorSite!="AT") { ssValid=false; break; }
389 }
390 else { ssValid=false; break; }
391 }
392 //GFREE(gseq);
393 if (!ssValid) {
394 if (verbose)
395 GMessage("Invalid splice sites found for '%s'\n",gffrec.getID());
396 return false; //don't print this one!
397 }
398 }
399
400 bool trprint=true;
401 int stopCodonAdjust=0;
402 int mCDphase=0;
403 bool hasStop=false;
404 if (gffrec.CDphase=='1' || gffrec.CDphase=='2')
405 mCDphase = gffrec.CDphase-'0';
406 if (f_y!=NULL || f_x!=NULL || validCDSonly) {
407 if (faseq==NULL) GError("Error: no genomic sequence provided!\n");
408 //if (protmap && fullCDSonly) {
409 //if (protmap && (fullCDSonly || (gffrec.qlen>0 && gffrec.qend==gffrec.qlen))) {
410
411 if (validCDSonly) { //make sure the stop codon is always included
412 //adjust_stopcodon(gffrec,3);
413 stopCodonAdjust=adjust_stopcodon(gffrec,3);
414 }
415 int strandNum=0;
416 int phaseNum=0;
417 CDS_CHECK:
418 cdsnt=gffrec.getSpliced(faseq, true, &seqlen,NULL,NULL,&seglst);
419 if (cdsnt==NULL) trprint=false;
420 if (validCDSonly) {
421 cdsaa=translateDNA(cdsnt, aalen, seqlen);
422 char* p=strchr(cdsaa,'.');
423 hasStop=false;
424 if (p!=NULL) {
425 if (p-cdsaa>=aalen-2) { //stop found as the last codon
426 *p='0';//remove it
427 hasStop=true;
428 if (aalen-2==p-cdsaa) {
429 //previous to last codon is the stop codon
430 //so correct the CDS stop accordingly
431 adjust_stopcodon(gffrec,-3, &seglst);
432 stopCodonAdjust=0; //clear artificial stop adjustment
433 seqlen-=3;
434 cdsnt[seqlen]=0;
435 }
436 aalen=p-cdsaa;
437 }
438 else {//stop found before the last codon
439 trprint=false;
440 }
441 }//stop codon found
442 if (trprint==false) { //failed CDS validity check
443 //in-frame stop codon found
444 if (altPhases && phaseNum<3) {
445 phaseNum++;
446 gffrec.CDphase = '0'+((mCDphase+phaseNum)%3);
447 GFREE(cdsaa);
448 goto CDS_CHECK;
449 }
450 if (gffrec.exons.Count()==1 && bothStrands) {
451 strandNum++;
452 phaseNum=0;
453 if (strandNum<2) {
454 GFREE(cdsaa);
455 gffrec.strand = (gffrec.strand=='-') ? '+':'-';
456 goto CDS_CHECK; //repeat the CDS check for a different frame
457 }
458 }
459 if (verbose) GMessage("In-frame STOP found for '%s'\n",gffrec.getID());
460 } //has in-frame STOP
461 if (fullCDSonly) {
462 if (!hasStop || cdsaa[0]!='M') trprint=false;
463 }
464 } // CDS check requested
465 } //translation or codon check/output was requested
466 if (!trprint) {
467 GFREE(cdsnt);
468 GFREE(cdsaa);
469 return false;
470 }
471 if (stopCodonAdjust>0 && !hasStop) {
472 //restore stop codon location
473 adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst);
474 if (cdsnt!=NULL && seqlen>0) {
475 seqlen-=stopCodonAdjust;
476 cdsnt[seqlen]=0;
477 }
478 if (cdsaa!=NULL) aalen--;
479 }
480
481 if (f_y!=NULL) { //CDS translation fasta output requested
482 //char*
483 if (cdsaa==NULL) { //translate now if not done before
484 cdsaa=translateDNA(cdsnt, aalen, seqlen);
485 }
486 if (fullattr && gffrec.attrs!=NULL) {
487 //append all attributes found for each transcripts
488 for (int i=0;i<gffrec.attrs->Count();i++) {
489 defline.append(" ");
490 defline.append(gffrec.getAttrName(i));
491 defline.append("=");
492 defline.append(gffrec.getAttrValue(i));
493 }
494 }
495 printFasta(f_y, defline, cdsaa, aalen);
496 }
497 if (f_x!=NULL) { //CDS only
498 if (writeExonSegs) {
499 defline.append(" loc:");
500 defline.append(gffrec.getGSeqName());
501 defline.appendfmt("(%c)",gffrec.strand);
502 //warning: not CDS coordinates are written here, but the exon ones
503 defline+=(int)gffrec.start;
504 defline+=(char)'-';
505 defline+=(int)gffrec.end;
506 // -- here these are CDS substring coordinates on the spliced sequence:
507 defline.append(" segs:");
508 for (int i=0;i<seglst.Count();i++) {
509 if (i>0) defline.append(",");
510 defline+=(int)seglst[i]->start;
511 defline.append("-");
512 defline+=(int)seglst[i]->end;
513 }
514 }
515 if (fullattr && gffrec.attrs!=NULL) {
516 //append all attributes found for each transcript
517 for (int i=0;i<gffrec.attrs->Count();i++) {
518 defline.append(" ");
519 defline.append(gffrec.getAttrName(i));
520 defline.append("=");
521 defline.append(gffrec.getAttrValue(i));
522 }
523 }
524 printFasta(f_x, defline, cdsnt, seqlen);
525 }
526 GFREE(cdsnt);
527 GFREE(cdsaa);
528 if (f_w!=NULL) { //write spliced exons
529 uint cds_start=0;
530 uint cds_end=0;
531 seglst.Clear();
532 char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
533 if (exont!=NULL) {
534 if (gffrec.CDstart>0) {
535 defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
536 }
537 if (writeExonSegs) {
538 defline.append(" loc:");
539 defline.append(gffrec.getGSeqName());
540 defline+=(char)'|';
541 defline+=(int)gffrec.start;
542 defline+=(char)'-';
543 defline+=(int)gffrec.end;
544 defline+=(char)'|';
545 defline+=(char)gffrec.strand;
546 defline.append(" exons:");
547 for (int i=0;i<gffrec.exons.Count();i++) {
548 if (i>0) defline.append(",");
549 defline+=(int)gffrec.exons[i]->start;
550 defline.append("-");
551 defline+=(int)gffrec.exons[i]->end;
552 }
553 defline.append(" segs:");
554 for (int i=0;i<seglst.Count();i++) {
555 if (i>0) defline.append(",");
556 defline+=(int)seglst[i]->start;
557 defline.append("-");
558 defline+=(int)seglst[i]->end;
559 }
560 }
561 if (fullattr && gffrec.attrs!=NULL) {
562 //append all attributes found for each transcripts
563 for (int i=0;i<gffrec.attrs->Count();i++) {
564 defline.append(" ");
565 defline.append(gffrec.getAttrName(i));
566 defline.append("=");
567 defline.append(gffrec.getAttrValue(i));
568 }
569 }
570 printFasta(f_w, defline, exont, seqlen);
571 GFREE(exont);
572 }
573 } //writing f_w (spliced exons)
574 return true;
575 }
576
577 void openfw(FILE* &f, GArgs& args, char opt) {
578 GStr s=args.getOpt(opt);
579 if (!s.is_empty()) {
580 if (s=='-')
581 f=stdout;
582 else {
583 f=fopen(s,"w");
584 if (f==NULL) GError("Error creating file: %s\n", s.chars());
585 }
586 }
587 }
588
589 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
590 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
591
592 void printGff3Header(FILE* f, GArgs& args) {
593 fprintf(f, "# ");
594 args.printCmdLine(f);
595 fprintf(f, "##gff-version 3\n");
596 //for (int i=0;i<gseqdata.Count();i++) {
597 //
598 //}
599 }
600
601 bool validateGffRec(GffObj* gffrec, GList<GffObj>* gfnew) {
602 if (reftbl.Count()>0) {
603 GStr refname(gffrec->getRefName());
604 RefTran* rt=reftbl.Find(refname.chars());
605 if (rt==NULL && refname.length()>2 && refname[-2]=='.' && isdigit(refname[-1])) {
606 //try removing the version suffix
607 refname.cut(-2);
608 //GMessage("[DEBUG] Trying ref name '%s'...\n", refname.chars());
609 rt=reftbl.Find(refname.chars());
610 }
611 if (rt) {
612 gffrec->setRefName(rt->new_name);
613 }
614 else return false; //discard, ref seq not in the given translation table
615 }
616 if (mRNAOnly && gffrec->isDiscarded()) {
617 //discard generic "gene" or "locus" features with no other detailed subfeatures
618 //GMessage("Warning: discarding %s GFF generic gene/locus container %s\n",m->getID());
619 return false;
620 }
621 /*
622 if (gffrec->exons.Count()==0 && gffrec->children.Count()==0)) {
623 //a non-mRNA feature with no subfeatures
624 //just so we get some sequence functions working, add a dummy "exon"-like subfeature here
625 //--this could be a single "pseudogene" entry or another genomic region without exons
626 //
627 gffrec->addExon(gffrec->start,gffrec->end);
628 }
629 */
630 if (rfltGSeq!=NULL) { //filter by gseqName
631 if (strcmp(gffrec->getGSeqName(),rfltGSeq)!=0) {
632 return false;
633 }
634 }
635 if (rfltStrand>0 && gffrec->strand !=rfltStrand) {
636 return false;
637 }
638 //check coordinates
639 if (rfltStart!=0 || rfltEnd!=MAX_UINT) {
640 if (rfltWithin) {
641 if (gffrec->start<rfltStart || gffrec->end>rfltEnd) {
642 return false; //not within query range
643 }
644 }
645 else {
646 if (gffrec->start>rfltEnd || gffrec->end<rfltStart) {
647 return false;
648 }
649 }
650 }
651 if (multiExon && gffrec->exons.Count()<=1) {
652 return false;
653 }
654 if (wCDSonly && gffrec->CDstart==0) {
655 return false;
656 }
657 if (ensembl_convert && startsWith(gffrec->getID(), "ENS")) {
658 //keep track of chr|gene_id data -- coordinate range
659 char* geneid=gffrec->getGeneID();
660 if (geneid!=NULL) {
661 GeneInfo* ginfo=gene_ids.Find(geneid);
662 if (ginfo==NULL) {//first time seeing this gene ID
663 GeneInfo* geneinfo=new GeneInfo(gffrec, ensembl_convert);
664 gene_ids.Add(geneid, geneinfo);
665 if (gfnew!=NULL) gfnew->Add(geneinfo->gf);
666 }
667 else ginfo->update(gffrec);
668 }
669 }
670 return true;
671 }
672
673
674 int main(int argc, char * const argv[]) {
675 GArgs args(argc, argv,
676 "debug;merge;cluster-only;help;force-exons;MINCOV=MINPID=hvOUNHWCVJMKQNSXTDAPRZFGLEm:g:i:r:s:t:a:b:o:w:x:y:d:");
677 args.printError(USAGE, true);
678 if (args.getOpt('h') || args.getOpt("help")) {
679 GMessage("%s",USAGE);
680 exit(1);
681 }
682 debugMode=(args.getOpt("debug")!=NULL);
683 mRNAOnly=(args.getOpt('O')==NULL);
684 //sortByLoc=(args.getOpt('S')!=NULL);
685 addDescr=(args.getOpt('A')!=NULL);
686 verbose=(args.getOpt('v')!=NULL);
687 wCDSonly=(args.getOpt('C')!=NULL);
688 validCDSonly=(args.getOpt('V')!=NULL);
689 altPhases=(args.getOpt('H')!=NULL);
690 fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF
691 bothStrands=(args.getOpt('B')!=NULL);
692 fullCDSonly=(args.getOpt('J')!=NULL);
693 spliceCheck=(args.getOpt('N')!=NULL);
694 bool matchAllIntrons=(args.getOpt('K')==NULL);
695 bool fuzzSpan=(args.getOpt('Q')!=NULL);
696 if (args.getOpt('M') || args.getOpt("merge")) {
697 doCluster=true;
698 doCollapseRedundant=true;
699 }
700 else {
701 if (!matchAllIntrons || fuzzSpan) {
702 GMessage("%s",USAGE);
703 GMessage("Error: -K or -Q options require -M/--merge option!\n");
704 exit(1);
705 }
706 }
707 if (args.getOpt("cluster-only")) {
708 doCluster=true;
709 doCollapseRedundant=false;
710 if (!matchAllIntrons || fuzzSpan) {
711 GMessage("%s",USAGE);
712 GMessage("Error: -K or -Q options have no effect with --cluster-only.\n");
713 exit(1);
714 }
715 }
716 if (fullCDSonly) validCDSonly=true;
717 if (verbose) {
718 fprintf(stderr, "Command line was:\n");
719 args.printCmdLine(stderr);
720 }
721
722 fullattr=(args.getOpt('F')!=NULL);
723 if (args.getOpt('G')==NULL)
724 noExonAttr=!fullattr;
725 else {
726 noExonAttr=true;
727 fullattr=true;
728 }
729 ensembl_convert=(args.getOpt('L')!=NULL);
730 if (ensembl_convert) {
731 fullattr=true;
732 noExonAttr=false;
733 //sortByLoc=true;
734 }
735
736 mergeCloseExons=(args.getOpt('Z')!=NULL);
737 multiExon=(args.getOpt('U')!=NULL);
738 writeExonSegs=(args.getOpt('W')!=NULL);
739 tracklabel=args.getOpt('t');
740 GFastaDb gfasta(args.getOpt('g'));
741 //if (gfasta.fastaPath!=NULL)
742 // sortByLoc=true; //enforce sorting by chromosome/contig
743 GStr s=args.getOpt('i');
744 if (!s.is_empty()) maxintron=s.asInt();
745
746 FILE* f_repl=NULL;
747 s=args.getOpt('d');
748 if (!s.is_empty()) {
749 if (s=="-") f_repl=stdout;
750 else {
751 f_repl=fopen(s.chars(), "w");
752 if (f_repl==NULL) GError("Error creating file %s\n", s.chars());
753 }
754 }
755
756 rfltWithin=(args.getOpt('R')!=NULL);
757 s=args.getOpt('r');
758 if (!s.is_empty()) {
759 s.trim();
760 if (s[0]=='+' || s[0]=='-') {
761 rfltStrand=s[0];
762 s.cut(0,1);
763 }
764 int isep=s.index(':');
765 if (isep>0) { //gseq name given
766 if (rfltStrand==0 && (s[isep-1]=='+' || s[isep-1]=='-')) {
767 isep--;
768 rfltStrand=s[isep];
769 s.cut(isep,1);
770 }
771 if (isep>0)
772 rfltGSeq=Gstrdup((s.substr(0,isep)).chars());
773 s.cut(0,isep+1);
774 }
775 GStr gsend;
776 char slast=s[s.length()-1];
777 if (rfltStrand==0 && (slast=='+' || slast=='-')) {
778 s.chomp(slast);
779 rfltStrand=slast;
780 }
781 if (s.index("..")>=0) gsend=s.split("..");
782 else gsend=s.split('-');
783 if (!s.is_empty()) rfltStart=(uint)s.asInt();
784 if (!gsend.is_empty()) {
785 rfltEnd=(uint)gsend.asInt();
786 if (rfltEnd==0) rfltEnd=MAX_UINT;
787 }
788 } //gseq/range filtering
789 else {
790 if (rfltWithin)
791 GError("Error: option -R requires -r!\n");
792 //if (rfltWholeTranscript)
793 // GError("Error: option -P requires -r!\n");
794 }
795 s=args.getOpt('m');
796 if (!s.is_empty()) {
797 FILE* ft=fopen(s,"r");
798 if (ft==NULL) GError("Error opening reference table: %s\n",s.chars());
799 loadRefTable(ft, reftbl);
800 fclose(ft);
801 }
802 s=args.getOpt('s');
803 if (!s.is_empty()) {
804 FILE* fsize=fopen(s,"r");
805 if (fsize==NULL) GError("Error opening info file: %s\n",s.chars());
806 loadSeqInfo(fsize, seqinfo);
807 fclose(fsize);
808 }
809
810 openfw(f_out, args, 'o');
811 //if (f_out==NULL) f_out=stdout;
812 if (gfasta.fastaPath==NULL && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL))
813 GError("Error: -g option is required for options -w, -x, -y, -V, -N, -M !\n");
814
815 openfw(f_w, args, 'w');
816 openfw(f_x, args, 'x');
817 openfw(f_y, args, 'y');
818 if (f_y!=NULL || f_x!=NULL) wCDSonly=true;
819 //useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL);
820
821 int numfiles = args.startNonOpt();
822 //GList<GffObj> gfkept(false,true); //unsorted, free items on delete
823 int out_counter=0; //number of records printed
824 while (true) {
825 GStr infile;
826 if (numfiles) {
827 infile=args.nextNonOpt();
828 if (infile.is_empty()) break;
829 if (infile=="-") { f_in=stdin; infile="stdin"; }
830 else
831 if ((f_in=fopen(infile, "r"))==NULL)
832 GError("Error: cannot open input file %s!\n",infile.chars());
833 }
834 else
835 infile="-";
836 GffLoader gffloader(infile.chars());
837 gffloader.transcriptsOnly=mRNAOnly;
838 gffloader.fullAttributes=fullattr;
839 gffloader.noExonAttrs=noExonAttr;
840 gffloader.mergeCloseExons=mergeCloseExons;
841 gffloader.showWarnings=(args.getOpt('E')!=NULL);
842 gffloader.load(g_data, &validateGffRec, doCluster, doCollapseRedundant,
843 matchAllIntrons, fuzzSpan);
844 if (doCluster)
845 collectLocusData(g_data);
846 if (numfiles==0) break;
847 }
848
849 GStr loctrack("gffcl");
850 if (tracklabel) loctrack=tracklabel;
851 g_data.setSorted(&gseqCmpName);
852 if (doCluster) {
853 //grouped in loci
854 for (int g=0;g<g_data.Count();g++) {
855 GenomicSeqData* gdata=g_data[g];
856 for (int l=0;l<gdata->loci.Count();l++) {
857 GffLocus& loc=*(gdata->loci[l]);
858 //check all non-replaced transcripts in this locus:
859 int numvalid=0;
860 int idxfirstvalid=-1;
861 for (int i=0;i<loc.rnas.Count();i++) {
862 GffObj& t=*(loc.rnas[i]);
863 GTData* tdata=(GTData*)(t.uptr);
864 if (tdata->replaced_by!=NULL) {
865 if (f_repl && (t.udata & 8)==0) {
866 //t.udata|=8;
867 fprintf(f_repl, "%s", t.getID());
868 GTData* rby=tdata;
869 while (rby->replaced_by!=NULL) {
870 fprintf(f_repl," => %s", rby->replaced_by->getID());
871 rby->rna->udata|=8;
872 rby=(GTData*)(rby->replaced_by->uptr);
873 }
874 fprintf(f_repl, "\n");
875 }
876 continue;
877 }
878 if (process_transcript(gfasta, t)) {
879 t.udata|=4; //tag it as valid
880 numvalid++;
881 if (idxfirstvalid<0) idxfirstvalid=i;
882 }
883 }
884
885 if (f_out && numvalid>0) {
886 GStr locname("RLOC_");
887 locname.appendfmt("%08d",loc.locus_num);
888 if (!fmtGTF) {
889 if (out_counter==0)
890 printGff3Header(f_out, args);
891 fprintf(f_out,"%s\t%s\tlocus\t%d\t%d\t.\t%c\t.\tID=%s;locus=%s",
892 loc.rnas[0]->getGSeqName(), loctrack.chars(), loc.start, loc.end, loc.strand,
893 locname.chars(), locname.chars());
894 //const char* loc_gname=loc.getGeneName();
895 if (loc.gene_names.Count()>0) { //print all gene names associated to this locus
896 fprintf(f_out, ";genes=%s",loc.gene_names.First()->name.chars());
897 for (int i=1;i<loc.gene_names.Count();i++) {
898 fprintf(f_out, ",%s",loc.gene_names[i]->name.chars());
899 }
900 }
901 if (loc.gene_ids.Count()>0) { //print all GeneIDs names associated to this locus
902 fprintf(f_out, ";geneIDs=%s",loc.gene_ids.First()->name.chars());
903 for (int i=1;i<loc.gene_ids.Count();i++) {
904 fprintf(f_out, ",%s",loc.gene_ids[i]->name.chars());
905 }
906 }
907 fprintf(f_out, ";transcripts=%s",loc.rnas[idxfirstvalid]->getID());
908 for (int i=idxfirstvalid+1;i<loc.rnas.Count();i++) {
909 fprintf(f_out, ",%s",loc.rnas[i]->getID());
910 }
911 fprintf(f_out, "\n");
912 }
913 //now print all valid, non-replaced transcripts in this locus:
914 for (int i=0;i<loc.rnas.Count();i++) {
915 GffObj& t=*(loc.rnas[i]);
916 GTData* tdata=(GTData*)(t.uptr);
917 if (tdata->replaced_by!=NULL || ((t.udata & 4)==0)) continue;
918 t.addAttr("locus", locname.chars());
919 out_counter++;
920 if (fmtGTF) t.printGtf(f_out, tracklabel);
921 else {
922 //print the parent first, if any
923 if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
924 GTData* pdata=(GTData*)(t.parent->uptr);
925 if (pdata->geneinfo!=NULL)
926 pdata->geneinfo->finalize();
927 t.parent->addAttr("locus", locname.chars());
928 t.parent->printGff(f_out, tracklabel);
929 t.parent->udata|=4;
930 }
931 t.printGff(f_out, tracklabel);
932 }
933 }
934 } //have valid transcripts to print
935 }//for each locus
936 if (f_out && !mRNAOnly) {
937 //final pass through the non-transcripts, in case any of them were not printed
938 //TODO: order broken, these should be interspersed among the rnas in the correct order!
939 for (int m=0;m<gdata->gfs.Count();m++) {
940 GffObj& t=*(gdata->gfs[m]);
941 if ((t.udata&4)==0) { //never printed
942 t.udata|=4;
943 if (fmtGTF) t.printGtf(f_out, tracklabel);
944 else t.printGff(f_out, tracklabel);
945 }
946 } //for each non-transcript
947 }
948 } //for each genomic sequence
949 }
950 else {
951 //not grouped into loci, print the rnas with their parents, if any
952 int numvalid=0;
953 for (int g=0;g<g_data.Count();g++) {
954 GenomicSeqData* gdata=g_data[g];
955 for (int m=0;m<gdata->rnas.Count();m++) {
956 GffObj& t=*(gdata->rnas[m]);
957 GTData* tdata=(GTData*)(t.uptr);
958 if (tdata->replaced_by!=NULL) continue;
959 if (process_transcript(gfasta, t)) {
960 t.udata|=4; //tag it as valid
961 numvalid++;
962 if (f_out) {
963 if (tdata->geneinfo) tdata->geneinfo->finalize();
964 out_counter++;
965 if (fmtGTF) t.printGtf(f_out, tracklabel);
966 else {
967 if (out_counter==1)
968 printGff3Header(f_out, args);
969 //print the parent first, if any
970 if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
971 GTData* pdata=(GTData*)(t.parent->uptr);
972 if (pdata->geneinfo!=NULL)
973 pdata->geneinfo->finalize();
974 t.parent->printGff(f_out, tracklabel);
975 t.parent->udata|=4;
976 }
977 t.printGff(f_out, tracklabel);
978 }
979 }//GFF/GTF output requested
980 } //valid transcript
981 } //for each rna
982 if (f_out && !mRNAOnly) {
983 //final pass through the non-transcripts, in case any of them were not printed
984 //TODO: order broken, these should be interspersed among the rnas in the correct order!
985 for (int m=0;m<gdata->gfs.Count();m++) {
986 GffObj& t=*(gdata->gfs[m]);
987 if ((t.udata&4)==0) { //never printed
988 t.udata|=4;
989 if (fmtGTF) t.printGtf(f_out, tracklabel);
990 else t.printGff(f_out, tracklabel);
991 }
992 } //for each non-transcript
993 }
994 } //for each genomic seq
995 }
996 if (f_repl && f_repl!=stdout) fclose(f_repl);
997 seqinfo.Clear();
998 //if (faseq!=NULL) delete faseq;
999 //if (gcdb!=NULL) delete gcdb;
1000 GFREE(rfltGSeq);
1001 FRCLOSE(f_in);
1002 FWCLOSE(f_out);
1003 FWCLOSE(f_w);
1004 FWCLOSE(f_x);
1005 FWCLOSE(f_y);
1006 }