ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gffread/gffread.cpp
Revision: 309
Committed: Fri Mar 22 19:47:48 2013 UTC (6 years, 1 month ago) by gpertea
File size: 35881 byte(s)
Log Message:
fixed -J crash

Line File contents
1 #include "GArgs.h"
2 #include "gff_utils.h"
3 #include <ctype.h>
4 // don't care about cdb compression
5 //#ifdef ENABLE_COMPRESSION
6 //#undef ENABLE_COMPRESSION
7 //#endif
8 //#include "GCdbYank.h"
9
10 #define USAGE "Usage:\n\
11 gffread <input_gff> [-g <genomic_seqs_fasta> | <dir>][-s <seq_info.fsize>] \n\
12 [-o <outfile.gff>] [-t <tname>] [-r [[<strand>]<chr>:]<start>..<end> [-R]]\n\
13 [-CTVNJMKQAFGUBHZWTOLE] [-w <exons.fa>] [-x <cds.fa>] [-y <tr_cds.fa>]\n\
14 [-i <maxintron>] \n\
15 Filters and/or converts GFF3/GTF2 records.\n\
16 <input_gff> is a GFF file, use '-' if the GFF records will be given at stdin\n\
17 \n\
18 Options:\n\
19 -g full path to a multi-fasta file with the genomic sequences\n\
20 for all input mappings, OR a directory with single-fasta files\n\
21 (one per genomic sequence, with file names matching sequence names)\n\
22 -s <seq_info.fsize> is a tab-delimited file providing this info\n\
23 for each of the mapped sequences:\n\
24 <seq-name> <seq-length> <seq-description>\n\
25 (useful for -A option with mRNA/EST/protein mappings)\n\
26 -i discard transcripts having an intron larger than <maxintron>\n\
27 -r only show transcripts overlapping coordinate range <start>..<end>\n\
28 (on chromosome/contig <chr>, strand <strand> if provided)\n\
29 -R for -r option, discard all transcripts that are not fully \n\
30 contained within the given range\n\
31 -U discard single-exon transcripts\n\
32 -C coding only: discard mRNAs that have no CDS feature\n\
33 -F full GFF attribute preservation (all attributes are shown)\n\
34 -G only parse additional exon attributes from the first exon\n\
35 and move them to the mRNA level (useful for GTF input)\n\
36 -A use the description field from <seq_info.fsize> and add it\n\
37 as the value for a 'descr' attribute to the GFF record\n\
38 \n\
39 -O process also non-transcript GFF records (by default non-transcript\n\
40 records are ignored)\n\
41 -V discard any mRNAs with CDS having in-frame stop codons\n\
42 -H for -V option, check and adjust the starting CDS phase\n\
43 if the original phase leads to a translation with an \n\
44 in-frame stop codon\n\
45 -B for -V option, single-exon transcripts are also checked on the\n\
46 opposite strand\n\
47 -N discard multi-exon mRNAs that have any intron with a non-canonical\n\
48 splice site consensus (i.e. not GT-AG, GC-AG or AT-AC)\n\
49 -J discard any mRNAs that either lack initial START codon\n\
50 or the terminal STOP codon, or have an in-frame stop codon\n\
51 (only print mRNAs with a fulll, valid CDS)\n\
52 \n\
53 -M/--merge : cluster the input transcripts into loci, collapsing matching\n\
54 transcripts (those with the same exact introns and fully contained)\n\
55 -d <dupinfo> : for -M option, write collapsing info to file <dupinfo>\n\
56 --cluster-only: same as --merge but without collapsing matching transcripts\n\
57 -K for -M option: also collapse shorter, fully contained transcripts\n\
58 with fewer introns than the container\n\
59 -Q for -M option, remove the containment restriction:\n\
60 (multi-exon transcripts will be collapsed if just their introns match,\n\
61 while single-exon transcripts can partially overlap (80%))\n\
62 \n\
63 -E expose (warn about) duplicate transcript IDs and other potential \n\
64 problems with the given GFF/GTF records\n\
65 -Z merge close exons into a single exon (for intron size<4)\n\
66 -w write a fasta file with spliced exons for each GFF transcript\n\
67 -x write a fasta file with spliced CDS for each GFF transcript\n\
68 -W for -w and -x options, also write for each fasta record the exon\n\
69 coordinates projected onto the spliced sequence\n\
70 -y write a protein fasta file with the translation of CDS for each record\n\
71 -L Ensembl GTF to GFF3 conversion (implies -F; should be used with -m)\n\
72 -m <chr_replace> is a reference (genomic) sequence replacement table with\n\
73 this format:\n\
74 <original_ref_ID> <new_ref_ID>\n\
75 GFF records on reference sequences that are not found among the\n\
76 <original_ref_ID> entries in this file will be filtered out\n\
77 -o the \"filtered\" GFF records will be written to <outfile.gff>\n\
78 (use -o- for printing to stdout)\n\
79 -t use <trackname> in the second column of each GFF output line\n\
80 -T -o option will output GTF format instead of GFF3\n\
81 "
82
83
84 class SeqInfo { //populated from the -s option of gffread
85 public:
86 int len;
87 char* descr;
88 SeqInfo( int l, char* s) {
89 len=l;
90 if (s==NULL) {
91 descr=NULL;
92 } else {
93 descr=Gstrdup(s);
94 }
95 }
96 ~SeqInfo() {
97 GFREE(descr);
98 }
99 };
100
101 class RefTran {
102 public:
103 char* new_name;
104 RefTran(char *ns) {
105 new_name=NULL;
106 if (ns!=NULL)
107 new_name=Gstrdup(ns);
108 }
109 ~RefTran() {
110 GFREE(new_name);
111 }
112 };
113
114 FILE* ffasta=NULL;
115 FILE* f_in=NULL;
116 FILE* f_out=NULL;
117 FILE* f_w=NULL; //fasta with spliced exons (transcripts)
118 FILE* f_x=NULL; //fasta with spliced CDS
119 FILE* f_y=NULL; //fasta with translated CDS
120 bool wCDSonly=false;
121
122 bool validCDSonly=false; // translation with no in-frame STOP
123 bool bothStrands=false; //for single-exon mRNA validation, check the other strand too
124 bool altPhases=false; //if original phase fails translation validation,
125 //try the other 2 phases until one makes it
126 bool mRNAOnly=true;
127 bool spliceCheck=false; //only known splice-sites
128
129 bool fullCDSonly=false; // starts with START, ends with STOP codon
130 bool fullattr=false;
131 //bool sortByLoc=false; // if the GFF output should be sorted by location
132 bool ensembl_convert=false; //-L, assist in converting Ensembl GTF to GFF3
133
134
135 //GStr gseqpath;
136 //GStr gcdbfa;
137 //bool multiGSeq=false; //if a directory or a .cidx file was given to -g option
138 //GFaSeqGet* faseq=NULL;
139 //GCdbYank* gcdb=NULL;
140 //int gseq_id=-1; //current genome sequence ID -- the current GffObj::gseq_id
141 bool fmtGTF=false;
142 bool addDescr=false;
143 //bool protmap=false;
144 bool multiExon=false;
145 bool writeExonSegs=false;
146 char* tracklabel=NULL;
147 int maxintron=999000000;
148 bool mergeCloseExons=false;
149 //range filter:
150 char* rfltGSeq=NULL;
151 char rfltStrand=0;
152 uint rfltStart=0;
153 uint rfltEnd=MAX_UINT;
154 bool rfltWithin=false; //check for full containment within given range
155 bool noExonAttr=false;
156
157 bool doCluster=false;
158 bool doCollapseRedundant=false;
159
160 GList<GenomicSeqData> g_data(true,true,true); //list of GFF records by genomic seq
161
162 //hash with sequence info
163 GHash<SeqInfo> seqinfo;
164 GHash<int> isoCounter; //counts the valid isoforms
165 GHash<RefTran> reftbl;
166 GHash<GeneInfo> gene_ids;
167 //min-max gene span associated to chr|gene_id (mostly for Ensembl conversion)
168
169 bool debugMode=false;
170 bool verbose=false;
171
172 void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) {
173 GLineReader fr(f);
174 while (!fr.isEof()) {
175 char* line=fr.getLine();
176 if (line==NULL) break;
177 char* id=line;
178 char* lenstr=NULL;
179 char* text=NULL;
180 char* p=line;
181 while (*p!=0 && !isspace(*p)) p++;
182 if (*p==0) continue;
183 *p=0;p++;
184 while (*p==' ' || *p=='\t') p++;
185 if (*p==0) continue;
186 lenstr=p;
187 while (*p!=0 && !isspace(*p)) p++;
188 if (*p!=0) { *p=0;p++; }
189 while (*p==' ' || *p=='\t') p++;
190 if (*p!=0) text=p; //else text remains NULL
191 int len=0;
192 if (!parseInt(lenstr,len)) {
193 GMessage("Warning: could not parse sequence length: %s %s\n",
194 id, lenstr);
195 continue;
196 }
197 // --- here we have finished parsing the line
198 si.Add(id, new SeqInfo(len,text));
199 } //while lines
200 }
201
202 void loadRefTable(FILE* f, GHash<RefTran>& rt) {
203 GLineReader fr(f);
204 char* line=NULL;
205 while ((line=fr.getLine())) {
206 char* orig_id=line;
207 char* p=line;
208 while (*p!=0 && !isspace(*p)) p++;
209 if (*p==0) continue;
210 *p=0;p++;//split the line here
211 while (*p==' ' || *p=='\t') p++;
212 if (*p==0) continue;
213 rt.Add(orig_id, new RefTran(p));
214 } //while lines
215 }
216
217 char* getSeqDescr(char* seqid) {
218 static char charbuf[128];
219 if (seqinfo.Count()==0) return NULL;
220 char* suf=rstrchr(seqid, '.');
221 if (suf!=NULL) *suf=0;
222 SeqInfo* seqd=seqinfo.Find(seqid);
223 if (suf!=NULL) *suf='.';
224 if (seqd!=NULL) {
225 GStr s(seqd->descr);
226 //cleanup some Uniref gunk
227 if (s[0]=='[') {
228 int r=s.index(']');
229 if (r>=0 && r<8 && isdigit(s[1]))
230 s.remove(0,r+1);
231 }
232 if (s.length()>80) {
233 int r=s.index(';');
234 if (r>5) s.cut(r);
235 }
236 if (s.length()>127) {
237 s.cut(127);
238 int r=s.rindex(' ');
239 if (r>0) s.cut(r);
240 }
241 strcpy(charbuf, s.chars());
242 return charbuf;
243 }
244 else return NULL;
245 }
246
247 char* getSeqName(char* seqid) {
248 static char charbuf[128];
249 char* suf=rstrchr(seqid, '.');
250 if (suf!=NULL) *suf=0;
251 strcpy(charbuf, seqid);
252 if (suf!=NULL) *suf='.';
253 return charbuf;
254 }
255
256 GFaSeqGet* fastaSeqGet(GFastaDb& gfasta, GffObj& gffrec) {
257 if (gfasta.fastaPath==NULL) return NULL;
258 return gfasta.fetch(gffrec.gseq_id);
259 }
260
261
262 int adjust_stopcodon(GffObj& gffrec, int adj, GList<GSeg>* seglst=NULL) {
263 //adj>0 => extedn CDS, adj<0 => shrink CDS
264 //when CDS is expanded, exons have to be checked too and
265 // expanded accordingly if they had the same boundary
266 int realadj=0;
267 if (gffrec.strand=='-') {
268 if ((int)gffrec.CDstart>adj) {
269
270 gffrec.CDstart-=adj;
271 realadj=adj;
272 if (adj<0) { //restore
273 if (gffrec.exons.First()->start==gffrec.CDstart+adj) {
274 gffrec.exons.First()->start-=adj;
275 gffrec.start=gffrec.exons.First()->start;
276 gffrec.covlen+=adj;
277 }
278 }
279 else if (gffrec.exons.First()->start>=gffrec.CDstart) {
280 gffrec.exons.First()->start-=adj;
281 gffrec.start=gffrec.exons.First()->start;
282 gffrec.covlen+=adj;
283 }
284 }
285 }
286 else {
287 realadj=adj;
288 gffrec.CDend+=adj;
289 if (adj<0) {//restore
290 if (gffrec.exons.Last()->end==gffrec.CDend-adj) {
291 gffrec.exons.Last()->end+=adj;
292 gffrec.end=gffrec.exons.Last()->end;
293 gffrec.covlen+=adj;
294 }
295 }
296 else if (gffrec.exons.Last()->end<=gffrec.CDend) {
297 gffrec.exons.Last()->end+=adj;
298 gffrec.end=gffrec.exons.Last()->end;
299 gffrec.covlen+=adj;
300 }
301 }
302 if (seglst!=NULL) seglst->Last()->end+=adj;
303 return realadj;
304 }
305
306 bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
307 //returns true if the transcript passed the filter
308 char* gname=gffrec.getGeneName();
309 if (gname==NULL) gname=gffrec.getGeneID();
310 GStr defline(gffrec.getID());
311 if (f_out && !fmtGTF) {
312 const char* tname=NULL;
313 if ((tname=gffrec.getAttr("transcript_name"))!=NULL) {
314 gffrec.addAttr("Name", tname);
315 gffrec.removeAttr("transcript_name");
316 }
317 }
318 if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) {
319 const char* biotype=gffrec.getAttr("gene_biotype");
320 if (biotype) {
321 gffrec.addAttr("type", biotype);
322 gffrec.removeAttr("gene_biotype");
323 }
324 else { //old Ensembl files lacking gene_biotype
325 gffrec.addAttr("type", gffrec.getTrackName());
326 }
327
328 //bool is_gene=false;
329 bool is_pseudo=false;
330 if (strcmp(biotype, "protein_coding")==0 || gffrec.hasCDS())
331 gffrec.setFeatureName("mRNA");
332 else {
333 if (strcmp(biotype, "processed_transcript")==0)
334 gffrec.setFeatureName("proc_RNA");
335 else {
336 //is_gene=endsWith(biotype, "gene");
337 is_pseudo=strifind(biotype, "pseudo");
338 if (is_pseudo) {
339 gffrec.setFeatureName("pseudo_RNA");
340 }
341 else if (endsWith(biotype, "RNA")) {
342 gffrec.setFeatureName(biotype);
343 } else gffrec.setFeatureName("misc_RNA");
344 }
345 }
346 }
347 if (gname && strcmp(gname, gffrec.getID())!=0) {
348 int* isonum=isoCounter.Find(gname);
349 if (isonum==NULL) {
350 isonum=new int(1);
351 isoCounter.Add(gname,isonum);
352 }
353 else (*isonum)++;
354 defline.appendfmt(" gene=%s", gname);
355 }
356 int seqlen=0;
357
358 const char* tlabel=tracklabel;
359 if (tlabel==NULL) tlabel=gffrec.getTrackName();
360 //defline.appendfmt(" track:%s",tlabel);
361 char* cdsnt = NULL;
362 char* cdsaa = NULL;
363 int aalen=0;
364 for (int i=1;i<gffrec.exons.Count();i++) {
365 int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1;
366 if (ilen>4000000)
367 GMessage("Warning: very large intron (%d) for transcript %s\n",
368 ilen, gffrec.getID());
369 if (ilen>maxintron) {
370 return false;
371 }
372 }
373 GList<GSeg> seglst(false,true);
374 GFaSeqGet* faseq=fastaSeqGet(gfasta, gffrec);
375 if (spliceCheck && gffrec.exons.Count()>1) {
376 //check introns for splice site consensi ( GT-AG, GC-AG or AT-AC )
377 if (faseq==NULL) GError("Error: no genomic sequence available!\n");
378 int glen=gffrec.end-gffrec.start+1;
379 const char* gseq=faseq->subseq(gffrec.start, glen);
380 bool revcompl=(gffrec.strand=='-');
381 bool ssValid=true;
382 for (int e=1;e<gffrec.exons.Count();e++) {
383 const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start;
384 int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1;
385 GSpliceSite acceptorSite(intron,intronlen,true, revcompl);
386 GSpliceSite donorSite(intron,intronlen, false, revcompl);
387 //GMessage("%c intron %d-%d : %s .. %s\n",
388 // gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt);
389 if (acceptorSite=="AG") { // GT-AG or GC-AG
390 if (!donorSite.canonicalDonor()) {
391 ssValid=false;break;
392 }
393 }
394 else if (acceptorSite=="AC") { //
395 if (donorSite!="AT") { ssValid=false; break; }
396 }
397 else { ssValid=false; break; }
398 }
399 //GFREE(gseq);
400 if (!ssValid) {
401 if (verbose)
402 GMessage("Invalid splice sites found for '%s'\n",gffrec.getID());
403 return false; //don't print this one!
404 }
405 }
406
407 bool trprint=true;
408 int stopCodonAdjust=0;
409 int mCDphase=0;
410 bool hasStop=false;
411 if (gffrec.CDphase=='1' || gffrec.CDphase=='2')
412 mCDphase = gffrec.CDphase-'0';
413 if (f_y!=NULL || f_x!=NULL || validCDSonly) {
414 if (faseq==NULL) GError("Error: no genomic sequence provided!\n");
415 //if (protmap && fullCDSonly) {
416 //if (protmap && (fullCDSonly || (gffrec.qlen>0 && gffrec.qend==gffrec.qlen))) {
417
418 if (validCDSonly) { //make sure the stop codon is always included
419 //adjust_stopcodon(gffrec,3);
420 stopCodonAdjust=adjust_stopcodon(gffrec,3);
421 }
422 int strandNum=0;
423 int phaseNum=0;
424 CDS_CHECK:
425 cdsnt=gffrec.getSpliced(faseq, true, &seqlen, NULL, NULL, &seglst);
426 if (cdsnt==NULL) trprint=false;
427 else { //has CDS
428 if (validCDSonly) {
429 cdsaa=translateDNA(cdsnt, aalen, seqlen);
430 char* p=strchr(cdsaa,'.');
431 hasStop=false;
432 if (p!=NULL) {
433 if (p-cdsaa>=aalen-2) { //stop found as the last codon
434 *p='0';//remove it
435 hasStop=true;
436 if (aalen-2==p-cdsaa) {
437 //previous to last codon is the stop codon
438 //so correct the CDS stop accordingly
439 adjust_stopcodon(gffrec,-3, &seglst);
440 stopCodonAdjust=0; //clear artificial stop adjustment
441 seqlen-=3;
442 cdsnt[seqlen]=0;
443 }
444 aalen=p-cdsaa;
445 }
446 else {//stop found before the last codon
447 trprint=false;
448 }
449 }//stop codon found
450 if (trprint==false) { //failed CDS validity check
451 //in-frame stop codon found
452 if (altPhases && phaseNum<3) {
453 phaseNum++;
454 gffrec.CDphase = '0'+((mCDphase+phaseNum)%3);
455 GFREE(cdsaa);
456 goto CDS_CHECK;
457 }
458 if (gffrec.exons.Count()==1 && bothStrands) {
459 strandNum++;
460 phaseNum=0;
461 if (strandNum<2) {
462 GFREE(cdsaa);
463 gffrec.strand = (gffrec.strand=='-') ? '+':'-';
464 goto CDS_CHECK; //repeat the CDS check for a different frame
465 }
466 }
467 if (verbose) GMessage("In-frame STOP found for '%s'\n",gffrec.getID());
468 } //has in-frame STOP
469 if (fullCDSonly) {
470 if (!hasStop || cdsaa[0]!='M') trprint=false;
471 }
472 } // CDS check requested
473 } //has CDS
474 } //translation or codon check/output was requested
475 if (!trprint) {
476 GFREE(cdsnt);
477 GFREE(cdsaa);
478 return false;
479 }
480 if (stopCodonAdjust>0 && !hasStop) {
481 //restore stop codon location
482 adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst);
483 if (cdsnt!=NULL && seqlen>0) {
484 seqlen-=stopCodonAdjust;
485 cdsnt[seqlen]=0;
486 }
487 if (cdsaa!=NULL) aalen--;
488 }
489
490 if (f_y!=NULL) { //CDS translation fasta output requested
491 //char*
492 if (cdsaa==NULL) { //translate now if not done before
493 cdsaa=translateDNA(cdsnt, aalen, seqlen);
494 }
495 if (fullattr && gffrec.attrs!=NULL) {
496 //append all attributes found for each transcripts
497 for (int i=0;i<gffrec.attrs->Count();i++) {
498 defline.append(" ");
499 defline.append(gffrec.getAttrName(i));
500 defline.append("=");
501 defline.append(gffrec.getAttrValue(i));
502 }
503 }
504 printFasta(f_y, defline, cdsaa, aalen);
505 }
506 if (f_x!=NULL) { //CDS only
507 if (writeExonSegs) {
508 defline.append(" loc:");
509 defline.append(gffrec.getGSeqName());
510 defline.appendfmt("(%c)",gffrec.strand);
511 //warning: not CDS coordinates are written here, but the exon ones
512 defline+=(int)gffrec.start;
513 defline+=(char)'-';
514 defline+=(int)gffrec.end;
515 // -- here these are CDS substring coordinates on the spliced sequence:
516 defline.append(" segs:");
517 for (int i=0;i<seglst.Count();i++) {
518 if (i>0) defline.append(",");
519 defline+=(int)seglst[i]->start;
520 defline.append("-");
521 defline+=(int)seglst[i]->end;
522 }
523 }
524 if (fullattr && gffrec.attrs!=NULL) {
525 //append all attributes found for each transcript
526 for (int i=0;i<gffrec.attrs->Count();i++) {
527 defline.append(" ");
528 defline.append(gffrec.getAttrName(i));
529 defline.append("=");
530 defline.append(gffrec.getAttrValue(i));
531 }
532 }
533 printFasta(f_x, defline, cdsnt, seqlen);
534 }
535 GFREE(cdsnt);
536 GFREE(cdsaa);
537 if (f_w!=NULL) { //write spliced exons
538 uint cds_start=0;
539 uint cds_end=0;
540 seglst.Clear();
541 char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
542 if (exont!=NULL) {
543 if (gffrec.CDstart>0) {
544 defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
545 }
546 if (writeExonSegs) {
547 defline.append(" loc:");
548 defline.append(gffrec.getGSeqName());
549 defline+=(char)'|';
550 defline+=(int)gffrec.start;
551 defline+=(char)'-';
552 defline+=(int)gffrec.end;
553 defline+=(char)'|';
554 defline+=(char)gffrec.strand;
555 defline.append(" exons:");
556 for (int i=0;i<gffrec.exons.Count();i++) {
557 if (i>0) defline.append(",");
558 defline+=(int)gffrec.exons[i]->start;
559 defline.append("-");
560 defline+=(int)gffrec.exons[i]->end;
561 }
562 defline.append(" segs:");
563 for (int i=0;i<seglst.Count();i++) {
564 if (i>0) defline.append(",");
565 defline+=(int)seglst[i]->start;
566 defline.append("-");
567 defline+=(int)seglst[i]->end;
568 }
569 }
570 if (fullattr && gffrec.attrs!=NULL) {
571 //append all attributes found for each transcripts
572 for (int i=0;i<gffrec.attrs->Count();i++) {
573 defline.append(" ");
574 defline.append(gffrec.getAttrName(i));
575 defline.append("=");
576 defline.append(gffrec.getAttrValue(i));
577 }
578 }
579 printFasta(f_w, defline, exont, seqlen);
580 GFREE(exont);
581 }
582 } //writing f_w (spliced exons)
583 return true;
584 }
585
586 void openfw(FILE* &f, GArgs& args, char opt) {
587 GStr s=args.getOpt(opt);
588 if (!s.is_empty()) {
589 if (s=='-')
590 f=stdout;
591 else {
592 f=fopen(s,"w");
593 if (f==NULL) GError("Error creating file: %s\n", s.chars());
594 }
595 }
596 }
597
598 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
599 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
600
601 void printGff3Header(FILE* f, GArgs& args) {
602 fprintf(f, "# ");
603 args.printCmdLine(f);
604 fprintf(f, "##gff-version 3\n");
605 //for (int i=0;i<gseqdata.Count();i++) {
606 //
607 //}
608 }
609
610 bool validateGffRec(GffObj* gffrec, GList<GffObj>* gfnew) {
611 if (reftbl.Count()>0) {
612 GStr refname(gffrec->getRefName());
613 RefTran* rt=reftbl.Find(refname.chars());
614 if (rt==NULL && refname.length()>2 && refname[-2]=='.' && isdigit(refname[-1])) {
615 //try removing the version suffix
616 refname.cut(-2);
617 //GMessage("[DEBUG] Trying ref name '%s'...\n", refname.chars());
618 rt=reftbl.Find(refname.chars());
619 }
620 if (rt) {
621 gffrec->setRefName(rt->new_name);
622 }
623 else return false; //discard, ref seq not in the given translation table
624 }
625 if (mRNAOnly && gffrec->isDiscarded()) {
626 //discard generic "gene" or "locus" features with no other detailed subfeatures
627 //GMessage("Warning: discarding %s GFF generic gene/locus container %s\n",m->getID());
628 return false;
629 }
630 /*
631 if (gffrec->exons.Count()==0 && gffrec->children.Count()==0)) {
632 //a non-mRNA feature with no subfeatures
633 //just so we get some sequence functions working, add a dummy "exon"-like subfeature here
634 //--this could be a single "pseudogene" entry or another genomic region without exons
635 //
636 gffrec->addExon(gffrec->start,gffrec->end);
637 }
638 */
639 if (rfltGSeq!=NULL) { //filter by gseqName
640 if (strcmp(gffrec->getGSeqName(),rfltGSeq)!=0) {
641 return false;
642 }
643 }
644 if (rfltStrand>0 && gffrec->strand !=rfltStrand) {
645 return false;
646 }
647 //check coordinates
648 if (rfltStart!=0 || rfltEnd!=MAX_UINT) {
649 if (rfltWithin) {
650 if (gffrec->start<rfltStart || gffrec->end>rfltEnd) {
651 return false; //not within query range
652 }
653 }
654 else {
655 if (gffrec->start>rfltEnd || gffrec->end<rfltStart) {
656 return false;
657 }
658 }
659 }
660 if (multiExon && gffrec->exons.Count()<=1) {
661 return false;
662 }
663 if (wCDSonly && gffrec->CDstart==0) {
664 return false;
665 }
666 if (ensembl_convert && startsWith(gffrec->getID(), "ENS")) {
667 //keep track of chr|gene_id data -- coordinate range
668 char* geneid=gffrec->getGeneID();
669 if (geneid!=NULL) {
670 GeneInfo* ginfo=gene_ids.Find(geneid);
671 if (ginfo==NULL) {//first time seeing this gene ID
672 GeneInfo* geneinfo=new GeneInfo(gffrec, ensembl_convert);
673 gene_ids.Add(geneid, geneinfo);
674 if (gfnew!=NULL) gfnew->Add(geneinfo->gf);
675 }
676 else ginfo->update(gffrec);
677 }
678 }
679 return true;
680 }
681
682
683 int main(int argc, char * const argv[]) {
684 GArgs args(argc, argv,
685 "debug;merge;cluster-only;help;force-exons;MINCOV=MINPID=hvOUNHWCVJMKQNSXTDAPRZFGLEm:g:i:r:s:t:a:b:o:w:x:y:d:");
686 args.printError(USAGE, true);
687 if (args.getOpt('h') || args.getOpt("help")) {
688 GMessage("%s",USAGE);
689 exit(1);
690 }
691 debugMode=(args.getOpt("debug")!=NULL);
692 bool forceExons=(args.getOpt("force-exons")!=NULL);
693 mRNAOnly=(args.getOpt('O')==NULL);
694 //sortByLoc=(args.getOpt('S')!=NULL);
695 addDescr=(args.getOpt('A')!=NULL);
696 verbose=(args.getOpt('v')!=NULL);
697 wCDSonly=(args.getOpt('C')!=NULL);
698 validCDSonly=(args.getOpt('V')!=NULL);
699 altPhases=(args.getOpt('H')!=NULL);
700 fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF
701 bothStrands=(args.getOpt('B')!=NULL);
702 fullCDSonly=(args.getOpt('J')!=NULL);
703 spliceCheck=(args.getOpt('N')!=NULL);
704 bool matchAllIntrons=(args.getOpt('K')==NULL);
705 bool fuzzSpan=(args.getOpt('Q')!=NULL);
706 if (args.getOpt('M') || args.getOpt("merge")) {
707 doCluster=true;
708 doCollapseRedundant=true;
709 }
710 else {
711 if (!matchAllIntrons || fuzzSpan) {
712 GMessage("%s",USAGE);
713 GMessage("Error: -K or -Q options require -M/--merge option!\n");
714 exit(1);
715 }
716 }
717 if (args.getOpt("cluster-only")) {
718 doCluster=true;
719 doCollapseRedundant=false;
720 if (!matchAllIntrons || fuzzSpan) {
721 GMessage("%s",USAGE);
722 GMessage("Error: -K or -Q options have no effect with --cluster-only.\n");
723 exit(1);
724 }
725 }
726 if (fullCDSonly) validCDSonly=true;
727 if (verbose) {
728 fprintf(stderr, "Command line was:\n");
729 args.printCmdLine(stderr);
730 }
731
732 fullattr=(args.getOpt('F')!=NULL);
733 if (args.getOpt('G')==NULL)
734 noExonAttr=!fullattr;
735 else {
736 noExonAttr=true;
737 fullattr=true;
738 }
739 ensembl_convert=(args.getOpt('L')!=NULL);
740 if (ensembl_convert) {
741 fullattr=true;
742 noExonAttr=false;
743 //sortByLoc=true;
744 }
745
746 mergeCloseExons=(args.getOpt('Z')!=NULL);
747 multiExon=(args.getOpt('U')!=NULL);
748 writeExonSegs=(args.getOpt('W')!=NULL);
749 tracklabel=args.getOpt('t');
750 GFastaDb gfasta(args.getOpt('g'));
751 //if (gfasta.fastaPath!=NULL)
752 // sortByLoc=true; //enforce sorting by chromosome/contig
753 GStr s=args.getOpt('i');
754 if (!s.is_empty()) maxintron=s.asInt();
755
756 FILE* f_repl=NULL;
757 s=args.getOpt('d');
758 if (!s.is_empty()) {
759 if (s=="-") f_repl=stdout;
760 else {
761 f_repl=fopen(s.chars(), "w");
762 if (f_repl==NULL) GError("Error creating file %s\n", s.chars());
763 }
764 }
765
766 rfltWithin=(args.getOpt('R')!=NULL);
767 s=args.getOpt('r');
768 if (!s.is_empty()) {
769 s.trim();
770 if (s[0]=='+' || s[0]=='-') {
771 rfltStrand=s[0];
772 s.cut(0,1);
773 }
774 int isep=s.index(':');
775 if (isep>0) { //gseq name given
776 if (rfltStrand==0 && (s[isep-1]=='+' || s[isep-1]=='-')) {
777 isep--;
778 rfltStrand=s[isep];
779 s.cut(isep,1);
780 }
781 if (isep>0)
782 rfltGSeq=Gstrdup((s.substr(0,isep)).chars());
783 s.cut(0,isep+1);
784 }
785 GStr gsend;
786 char slast=s[s.length()-1];
787 if (rfltStrand==0 && (slast=='+' || slast=='-')) {
788 s.chomp(slast);
789 rfltStrand=slast;
790 }
791 if (s.index("..")>=0) gsend=s.split("..");
792 else gsend=s.split('-');
793 if (!s.is_empty()) rfltStart=(uint)s.asInt();
794 if (!gsend.is_empty()) {
795 rfltEnd=(uint)gsend.asInt();
796 if (rfltEnd==0) rfltEnd=MAX_UINT;
797 }
798 } //gseq/range filtering
799 else {
800 if (rfltWithin)
801 GError("Error: option -R requires -r!\n");
802 //if (rfltWholeTranscript)
803 // GError("Error: option -P requires -r!\n");
804 }
805 s=args.getOpt('m');
806 if (!s.is_empty()) {
807 FILE* ft=fopen(s,"r");
808 if (ft==NULL) GError("Error opening reference table: %s\n",s.chars());
809 loadRefTable(ft, reftbl);
810 fclose(ft);
811 }
812 s=args.getOpt('s');
813 if (!s.is_empty()) {
814 FILE* fsize=fopen(s,"r");
815 if (fsize==NULL) GError("Error opening info file: %s\n",s.chars());
816 loadSeqInfo(fsize, seqinfo);
817 fclose(fsize);
818 }
819
820 openfw(f_out, args, 'o');
821 //if (f_out==NULL) f_out=stdout;
822 if (gfasta.fastaPath==NULL && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL))
823 GError("Error: -g option is required for options -w, -x, -y, -V, -N, -M !\n");
824
825 openfw(f_w, args, 'w');
826 openfw(f_x, args, 'x');
827 openfw(f_y, args, 'y');
828 if (f_y!=NULL || f_x!=NULL) wCDSonly=true;
829 //useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL);
830
831 int numfiles = args.startNonOpt();
832 //GList<GffObj> gfkept(false,true); //unsorted, free items on delete
833 int out_counter=0; //number of records printed
834 while (true) {
835 GStr infile;
836 if (numfiles) {
837 infile=args.nextNonOpt();
838 if (infile.is_empty()) break;
839 if (infile=="-") { f_in=stdin; infile="stdin"; }
840 else
841 if ((f_in=fopen(infile, "r"))==NULL)
842 GError("Error: cannot open input file %s!\n",infile.chars());
843 }
844 else
845 infile="-";
846 GffLoader gffloader(infile.chars());
847 gffloader.transcriptsOnly=mRNAOnly;
848 gffloader.fullAttributes=fullattr;
849 gffloader.noExonAttrs=noExonAttr;
850 gffloader.mergeCloseExons=mergeCloseExons;
851 gffloader.showWarnings=(args.getOpt('E')!=NULL);
852 gffloader.load(g_data, &validateGffRec, doCluster, doCollapseRedundant,
853 matchAllIntrons, fuzzSpan, forceExons);
854 if (doCluster)
855 collectLocusData(g_data);
856 if (numfiles==0) break;
857 }
858
859 GStr loctrack("gffcl");
860 if (tracklabel) loctrack=tracklabel;
861 g_data.setSorted(&gseqCmpName);
862 if (doCluster) {
863 //grouped in loci
864 for (int g=0;g<g_data.Count();g++) {
865 GenomicSeqData* gdata=g_data[g];
866 for (int l=0;l<gdata->loci.Count();l++) {
867 GffLocus& loc=*(gdata->loci[l]);
868 //check all non-replaced transcripts in this locus:
869 int numvalid=0;
870 int idxfirstvalid=-1;
871 for (int i=0;i<loc.rnas.Count();i++) {
872 GffObj& t=*(loc.rnas[i]);
873 GTData* tdata=(GTData*)(t.uptr);
874 if (tdata->replaced_by!=NULL) {
875 if (f_repl && (t.udata & 8)==0) {
876 //t.udata|=8;
877 fprintf(f_repl, "%s", t.getID());
878 GTData* rby=tdata;
879 while (rby->replaced_by!=NULL) {
880 fprintf(f_repl," => %s", rby->replaced_by->getID());
881 rby->rna->udata|=8;
882 rby=(GTData*)(rby->replaced_by->uptr);
883 }
884 fprintf(f_repl, "\n");
885 }
886 continue;
887 }
888 if (process_transcript(gfasta, t)) {
889 t.udata|=4; //tag it as valid
890 numvalid++;
891 if (idxfirstvalid<0) idxfirstvalid=i;
892 }
893 }
894
895 if (f_out && numvalid>0) {
896 GStr locname("RLOC_");
897 locname.appendfmt("%08d",loc.locus_num);
898 if (!fmtGTF) {
899 if (out_counter==0)
900 printGff3Header(f_out, args);
901 fprintf(f_out,"%s\t%s\tlocus\t%d\t%d\t.\t%c\t.\tID=%s;locus=%s",
902 loc.rnas[0]->getGSeqName(), loctrack.chars(), loc.start, loc.end, loc.strand,
903 locname.chars(), locname.chars());
904 //const char* loc_gname=loc.getGeneName();
905 if (loc.gene_names.Count()>0) { //print all gene names associated to this locus
906 fprintf(f_out, ";genes=%s",loc.gene_names.First()->name.chars());
907 for (int i=1;i<loc.gene_names.Count();i++) {
908 fprintf(f_out, ",%s",loc.gene_names[i]->name.chars());
909 }
910 }
911 if (loc.gene_ids.Count()>0) { //print all GeneIDs names associated to this locus
912 fprintf(f_out, ";geneIDs=%s",loc.gene_ids.First()->name.chars());
913 for (int i=1;i<loc.gene_ids.Count();i++) {
914 fprintf(f_out, ",%s",loc.gene_ids[i]->name.chars());
915 }
916 }
917 fprintf(f_out, ";transcripts=%s",loc.rnas[idxfirstvalid]->getID());
918 for (int i=idxfirstvalid+1;i<loc.rnas.Count();i++) {
919 fprintf(f_out, ",%s",loc.rnas[i]->getID());
920 }
921 fprintf(f_out, "\n");
922 }
923 //now print all valid, non-replaced transcripts in this locus:
924 for (int i=0;i<loc.rnas.Count();i++) {
925 GffObj& t=*(loc.rnas[i]);
926 GTData* tdata=(GTData*)(t.uptr);
927 if (tdata->replaced_by!=NULL || ((t.udata & 4)==0)) continue;
928 t.addAttr("locus", locname.chars());
929 out_counter++;
930 if (fmtGTF) t.printGtf(f_out, tracklabel);
931 else {
932 //print the parent first, if any
933 if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
934 GTData* pdata=(GTData*)(t.parent->uptr);
935 if (pdata->geneinfo!=NULL)
936 pdata->geneinfo->finalize();
937 t.parent->addAttr("locus", locname.chars());
938 t.parent->printGff(f_out, tracklabel);
939 t.parent->udata|=4;
940 }
941 t.printGff(f_out, tracklabel);
942 }
943 }
944 } //have valid transcripts to print
945 }//for each locus
946 if (f_out && !mRNAOnly) {
947 //final pass through the non-transcripts, in case any of them were not printed
948 //TODO: order broken, these should be interspersed among the rnas in the correct order!
949 for (int m=0;m<gdata->gfs.Count();m++) {
950 GffObj& t=*(gdata->gfs[m]);
951 if ((t.udata&4)==0) { //never printed
952 t.udata|=4;
953 if (fmtGTF) t.printGtf(f_out, tracklabel);
954 else t.printGff(f_out, tracklabel);
955 }
956 } //for each non-transcript
957 }
958 } //for each genomic sequence
959 }
960 else {
961 //not grouped into loci, print the rnas with their parents, if any
962 int numvalid=0;
963 for (int g=0;g<g_data.Count();g++) {
964 GenomicSeqData* gdata=g_data[g];
965 for (int m=0;m<gdata->rnas.Count();m++) {
966 GffObj& t=*(gdata->rnas[m]);
967 GTData* tdata=(GTData*)(t.uptr);
968 if (tdata->replaced_by!=NULL) continue;
969 if (process_transcript(gfasta, t)) {
970 t.udata|=4; //tag it as valid
971 numvalid++;
972 if (f_out) {
973 if (tdata->geneinfo) tdata->geneinfo->finalize();
974 out_counter++;
975 if (fmtGTF) t.printGtf(f_out, tracklabel);
976 else {
977 if (out_counter==1)
978 printGff3Header(f_out, args);
979 //print the parent first, if any
980 if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
981 GTData* pdata=(GTData*)(t.parent->uptr);
982 if (pdata->geneinfo!=NULL)
983 pdata->geneinfo->finalize();
984 t.parent->printGff(f_out, tracklabel);
985 t.parent->udata|=4;
986 }
987 t.printGff(f_out, tracklabel);
988 }
989 }//GFF/GTF output requested
990 } //valid transcript
991 } //for each rna
992 if (f_out && !mRNAOnly) {
993 //final pass through the non-transcripts, in case any of them were not printed
994 //TODO: order broken, these should be interspersed among the rnas in the correct order!
995 for (int m=0;m<gdata->gfs.Count();m++) {
996 GffObj& t=*(gdata->gfs[m]);
997 if ((t.udata&4)==0) { //never printed
998 t.udata|=4;
999 if (fmtGTF) t.printGtf(f_out, tracklabel);
1000 else t.printGff(f_out, tracklabel);
1001 }
1002 } //for each non-transcript
1003 }
1004 } //for each genomic seq
1005 }
1006 if (f_repl && f_repl!=stdout) fclose(f_repl);
1007 seqinfo.Clear();
1008 //if (faseq!=NULL) delete faseq;
1009 //if (gcdb!=NULL) delete gcdb;
1010 GFREE(rfltGSeq);
1011 FRCLOSE(f_in);
1012 FWCLOSE(f_out);
1013 FWCLOSE(f_w);
1014 FWCLOSE(f_x);
1015 FWCLOSE(f_y);
1016 }