ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gffread/gffread.cpp
Revision: 6
Committed: Mon Mar 22 22:07:27 2010 UTC (9 years, 7 months ago) by gpertea
File size: 25862 byte(s)
Log Message:
added gffread sources

Line User Rev File contents
1 gpertea 6 #include "GArgs.h"
2     #include "GStr.h"
3     #include "GHash.hh"
4     #include "GList.hh"
5     #include "GFaSeqGet.h"
6     #include "gff.h"
7     #include <ctype.h>
8     // do not care about cdb compression
9     #ifdef ENABLE_COMPRESSION
10     #undef ENABLE_COMPRESSION
11     #endif
12     #include "GCdbYank.h"
13    
14     #define USAGE "Usage:\n\
15     gffread <input_gff..> [-g <genomic_seq_fasta> | <dir>][-s <seq_info.fsize>]\n\
16     [-o <outfile.gff3>] [-t <tname>] [-r [<chr>:]<start>..<end>] [-i <maxintron>]\n\
17     [-CTVNMAFGRUVBHSZWTO] [-w <spl_exons.fa>] [-x <spl_cds.fa>] [-y <tran_cds.fa>]\n\
18     \n\
19     Filters and/or converts GFF/GTF records.\n\
20     \n\
21     Options:\n\
22     -g full path to one fasta file with the genomic sequence\n\
23     for all input mappings, OR a path to the directory where\n\
24     genomic sequences can be found as single-fasta files\n\
25     -s for mRNA/EST/protein mappings a tab-delimited file provides this info\n\
26     for each of the mapped sequences:\n\
27     <seq-name> <seq-length> <seq-description>\n\
28     -i discard transcripts having an intron larger than <maxintron>\n\
29     -r only show transcripts crossing coordinate range <start>..<end>\n\
30     (on chromosome/contig <chr> if provided)\n\
31     -R for -r option, discard all transcripts that are not fully \n\
32     contained within given range\n\
33     -U discard single-exon transcripts\n\
34     -C discard mRNAs that have no CDS feature\n\
35     -F keep all attributes from last column of GFF/GTF\n\
36     -G only parse additional exon attributes from the first exon\n\
37     and move them to the mRNA level (useful for GTF input)\n\
38     -A use the description field from <seq_info.fsize> and add it as\n\
39     a descr=.. attribute to the mRNA and/or Gene record\n\
40     \n\
41     -O process other (non-mRNA) GFF/GTF records (default is discard non-mRNA\n\
42     data). \n\
43     Note: non-mRNA records must have only one subfeature per parent feature\n\
44     -V discard any mRNAs with CDS having in-frame stop codons\n\
45     -H for -V option, check and adjust the starting CDS phase\n\
46     if the original phase leads to a translation with an \n\
47     in-frame stop codon\n\
48     -B for -V option, single-exon transcripts are also checked on the\n\
49     opposite strand\n\
50     -N only show multi-exon mRNAs if all their introns have the \n\
51     typical splice site consensus ( GT-AG, GC-AG or AT-AC )\n\
52     -M discard any mRNAs that either lack initial START codon\n\
53     or the terminal STOP codon, or have an in-frame stop codon\n\
54     (only print mRNAs with a fulll, valid CDS)\n\
55     \n\
56     -S sort output GFF records by genomic sequence and start coordinate\n\
57     note that this option is automatically enabled by -g option\n\
58     -Z merge close exons into a single exon (intron size<4)\n\
59     -t use <trackname> in the second column of each GFF/GTF output line\n\
60     -w write a fasta file with spliced exons for each mapping\n\
61     -x write a fasta file with spliced CDS for each mapping\n\
62     -W for -w and -x options, also write for each fasta record the exon \n\
63     coordinates projected onto the spliced sequence\n\
64     -y write a protein fasta file with the CDS translation for each mapping\n\
65     -o the output gff3 file with the 'filtered' entries\n\
66     -T output GTF format instead of GFF3\n\
67     \n\
68     "
69    
70     FILE* ffasta=NULL;
71     FILE* f_in=NULL;
72     FILE* f_out=NULL;
73     FILE* f_w=NULL; //fasta with spliced exons (transcripts)
74     FILE* f_x=NULL; //fasta with spliced CDS
75     FILE* f_y=NULL; //fasta with translated CDS
76     bool wCDSonly=false;
77    
78     bool validCDSonly=false; // translation with no in-frame STOP
79     bool bothStrands=false; //for single-exon mRNA validation, check the other strand too
80     bool altPhases=false; //if original phase fails translation validation,
81     //try the other 2 phases until one makes it
82     bool mRNAOnly=true;
83     bool spliceCheck=false; //only known splice-sites
84    
85     bool fullCDSonly=false; // starts with START, ends with STOP codon
86     bool fullattr=false;
87     bool sortByLoc=false; // if the GFF output should be sorted by location
88     GStr gseqpath;
89     bool multiGSeq=false; //if a directory or a .cidx file was given to -g option
90     bool fmtGTF=false;
91     int gseq_id=-1; //current genome sequence ID -- the current GffObj::gseq_id
92     GFaSeqGet* faseq=NULL;
93     GCdbYank* gcdb=NULL;
94     GStr gcdbfa;
95     bool addDescr=false;
96     //bool protmap=false;
97     bool multiExon=false;
98     bool writeExonSegs=false;
99     char* tracklabel=NULL;
100     int maxintron=999000000;
101     bool mergeCloseExons=false;
102     //range filter:
103     char* rfltGSeq=NULL;
104     uint rfltStart=0;
105     uint rfltEnd=MAX_UINT;
106     bool rfltWithin=false; //check for full containment within given range
107     bool noExonAttr=false;
108     class SeqInfo {
109     public:
110     int len;
111     char* descr;
112     SeqInfo( int l, char* s) {
113     len=l;
114     if (s==NULL) {
115     descr=NULL;
116     } else {
117     descr=Gstrdup(s);
118     }
119     }
120     ~SeqInfo() {
121     GFREE(descr);
122     }
123     };
124    
125    
126     class GSpliceSite {
127     public:
128     char nt[3];
129     GSpliceSite(const char* c, bool revc=false) {
130     nt[2]=0;
131     if (c==NULL) {
132     nt[0]=0;
133     nt[1]=0;
134     return;
135     }
136     if (revc) {
137     nt[0]=toupper(ntComplement(c[1]));
138     nt[1]=toupper(ntComplement(c[0]));
139     }
140     else {
141     nt[0]=toupper(c[0]);
142     nt[1]=toupper(c[1]);
143     }
144     }
145    
146     GSpliceSite(const char* intron, int intronlen, bool getAcceptor, bool revc=false) {
147     nt[2]=0;
148     if (intron==NULL || intronlen==0)
149     GError("Error: invalid intron or intron len for GSpliceSite()!\n");
150     const char* c=intron;
151     if (revc) {
152     if (!getAcceptor) c+=intronlen-2;
153     nt[0]=toupper(ntComplement(c[1]));
154     nt[1]=toupper(ntComplement(c[0]));
155     }
156     else { //on forward strand
157     if (getAcceptor) c+=intronlen-2;
158     nt[0]=toupper(c[0]);
159     nt[1]=toupper(c[1]);
160     }//forward strand
161     }
162    
163     GSpliceSite(const char n1, const char n2) {
164     nt[2]=0;
165     nt[0]=toupper(n1);
166     nt[1]=toupper(n2);
167     }
168     bool canonicalDonor() {
169     return (nt[0]=='G' && (nt[1]=='C' || nt[1]=='T'));
170     }
171     bool operator==(GSpliceSite& c) {
172     return (c.nt[0]==nt[0] && c.nt[1]==nt[1]);
173     }
174     bool operator==(GSpliceSite* c) {
175     return (c->nt[0]==nt[0] && c->nt[1]==nt[1]);
176     }
177     bool operator==(const char* c) {
178     //return (nt[0]==toupper(c[0]) && nt[1]==toupper(c[1]));
179     //assumes given const nucleotides are uppercase already!
180     return (nt[0]==c[0] && nt[1]==c[1]);
181     }
182     bool operator!=(const char* c) {
183     //assumes given const nucleotides are uppercase already!
184     return (nt[0]!=c[0] || nt[1]!=c[1]);
185     }
186     };
187    
188     //hash with sequence info
189     GHash<SeqInfo> seqinfo;
190     GHash<int> isoCounter; //counts the valid isoforms
191    
192     bool debugMode=false;
193     bool verbose=false;
194    
195     char* getSeqDescr(char* seqid) {
196     static char charbuf[128];
197     if (seqinfo.Count()==0) return NULL;
198     char* suf=rstrchr(seqid, '.');
199     if (suf!=NULL) *suf=0;
200     SeqInfo* seqd=seqinfo.Find(seqid);
201     if (suf!=NULL) *suf='.';
202     if (seqd!=NULL) {
203     GStr s(seqd->descr);
204     //cleanup some Uniref gunk
205     if (s[0]=='[') {
206     int r=s.index(']');
207     if (r>=0 && r<8 && isdigit(s[1]))
208     s.remove(0,r+1);
209     }
210     if (s.length()>80) {
211     int r=s.index(';');
212     if (r>5) s.cut(r);
213     }
214     if (s.length()>127) {
215     s.cut(127);
216     int r=s.rindex(' ');
217     if (r>0) s.cut(r);
218     }
219     strcpy(charbuf, s.chars());
220     return charbuf;
221     }
222     else return NULL;
223     }
224    
225     char* getSeqName(char* seqid) {
226     static char charbuf[128];
227     char* suf=rstrchr(seqid, '.');
228     if (suf!=NULL) *suf=0;
229     strcpy(charbuf, seqid);
230     if (suf!=NULL) *suf='.';
231     return charbuf;
232     }
233    
234     void printFasta(FILE* f, GStr& defline, char* seq, int seqlen=-1) {
235     if (seq==NULL) return;
236     int len=(seqlen>0)?seqlen:strlen(seq);
237     if (len<=0) return;
238     if (!defline.is_empty())
239     fprintf(f, ">%s\n",defline.chars());
240     int ilen=0;
241     for (int i=0; i < len; i++, ilen++) {
242     if (ilen == 70) {
243     fputc('\n', f);
244     ilen = 0;
245     }
246     putc(seq[i], f);
247     } //for
248     fputc('\n', f);
249     }
250    
251     void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) {
252     GLineReader fr(f);
253     while (!fr.isEof()) {
254     char* line=fr.getLine();
255     if (line==NULL) break;
256     char* id=line;
257     char* lenstr=NULL;
258     char* text=NULL;
259     char* p=line;
260     while (*p!=0 && !isspace(*p)) p++;
261     if (*p==0) continue;
262     *p=0;p++;
263     while (*p==' ' || *p=='\t') p++;
264     if (*p==0) continue;
265     lenstr=p;
266     while (*p!=0 && !isspace(*p)) p++;
267     if (*p!=0) { *p=0;p++; }
268     while (*p==' ' || *p=='\t') p++;
269     if (*p!=0) text=p; //else text remains NULL
270     int len=0;
271     if (!parseInt(lenstr,len)) {
272     GMessage("Warning: could not parse sequence length: %s %s\n",
273     id, lenstr);
274     continue;
275     }
276     // --- here we have finished parsing the line
277     si.Add(id, new SeqInfo(len,text));
278     } //while lines
279     }
280    
281    
282     GFaSeqGet* openFastaSeq(const char* gseqname) {
283     //this is only called when either we have a cidx or a directory in gseqpath
284     GFaSeqGet* r=NULL;
285     if (gcdb!=NULL) {
286     off_t rpos=gcdb->getRecordPos(gseqname);
287     if (rpos<0) // genomic sequence not found
288     GError("Error: cannot find genomic sequence '%s' in %s\n",gseqname, gseqpath.chars());
289     return new GFaSeqGet(gcdbfa.chars(),rpos, false);// WARNING: does not validate FASTA line-len uniformity!
290     }
291     else {
292     GStr s(gseqpath);
293     s.append('/');
294     s.append(gseqname);
295     if (debugMode) GMessage("[D] >> opening fasta file for genomic sequence %s..\n",gseqname);
296     if (fileExists(s.chars())==2) {
297     r=new GFaSeqGet(s.chars(),false); // WARNING: does not validate FASTA line-len uniformity!
298     r->loadall();
299     if (debugMode) GMessage("[D] << loaded.\n");
300     return r;
301     }
302     s.append(".fa");
303     if (fileExists(s.chars())==2) {
304     r=new GFaSeqGet(s.chars(), false); // WARNING: does not validate FASTA line-len uniformity!
305     r->loadall();
306     if (debugMode) GMessage("[D] << loaded.\n");
307     return r;
308     }
309     s.append("sta");
310     if (fileExists(s.chars())==2) {
311     return new GFaSeqGet(s.chars(), false); // WARNING: does not validate FASTA line-len uniformity!
312     r->loadall();
313     if (debugMode) GMessage("[D] << loaded.\n");
314     return r;
315     }
316     GError("Error: cannot load genomic sequence %s from directory %s !\n",
317     gseqname, gseqpath.chars());
318     }
319     return NULL;
320     }
321    
322    
323     GFaSeqGet* fastaSeqGet(GffObj& mrna) {
324     if (gseqpath.is_empty()) return NULL;
325     if (multiGSeq && mrna.gseq_id!=gseq_id) {
326     if (faseq!=NULL) {
327     delete faseq;
328     faseq=NULL;
329     }
330     faseq=openFastaSeq(mrna.getGSeqName());
331     gseq_id=mrna.gseq_id;
332     }
333     return faseq;
334     }
335    
336     void adjust_stopcodon(GffObj& mrna, int adj, GList<GSeg>* seglst=NULL) {
337     //adj>0 => extedn CDS, adj<0 => shrink CDS
338     //when CDS is expanded, exons have to be checked too and
339     // expanded accordingly if they had the same boundary
340     if (mrna.strand=='-') {
341     if ((int)mrna.CDstart>adj) {
342     mrna.CDstart-=adj;
343     if (adj>0 && mrna.exons.First()->start>=mrna.CDstart) {
344     mrna.exons.First()->start-=adj;
345     mrna.start=mrna.exons.First()->start;
346     mrna.covlen+=adj;
347     }
348     }
349     }
350     else {
351     mrna.CDend+=adj;
352     if (adj>0 && mrna.exons.Last()->end<=mrna.CDend) {
353     mrna.exons.Last()->end+=adj;
354     mrna.end=mrna.exons.Last()->end;
355     mrna.covlen+=adj;
356     }
357     }
358     if (seglst!=NULL) seglst->Last()->end+=adj;
359     }
360    
361     void process_mRNA(GffObj& mrna) {
362     GStr gene(mrna.getGene());
363     GStr defline(mrna.getID());
364     if (!gene.is_empty() && strcmp(gene.chars(),mrna.getID())!=0) {
365     int* isonum=isoCounter.Find(gene.chars());
366     if (isonum==NULL) {
367     isonum=new int(1);
368     isoCounter.Add(mrna.getGene(),isonum);
369     }
370     else (*isonum)++;
371     defline.appendfmt(" gene=%s", gene.chars());
372     }
373     int seqlen=0;
374     const char* tlabel=tracklabel;
375     if (tlabel==NULL) tlabel=mrna.getTrackName();
376     //defline.appendfmt(" track:%s",tlabel);
377     char* cdsnt = NULL;
378     char* cdsaa = NULL;
379     int aalen=0;
380     for (int i=1;i<mrna.exons.Count();i++) {
381     int ilen=mrna.exons[i]->start-mrna.exons[i-1]->end-1;
382     if (ilen>2000000)
383     GMessage("Warning: very large intron (%d) for transcript %s\n",
384     ilen, mrna.getID());
385     if (ilen>maxintron) {
386     return;
387     }
388     }
389     GList<GSeg> seglst(false,true);
390     fastaSeqGet(mrna);
391     if (spliceCheck && mrna.exons.Count()>1) {
392     //check introns for splice site consensi ( GT-AG, GC-AG or AT-AC )
393     if (faseq==NULL) GError("Error: no genomic sequence available!\n");
394     int glen=mrna.end-mrna.start+1;
395     const char* gseq=faseq->subseq(mrna.start, glen);
396     bool revcompl=(mrna.strand=='-');
397     bool ssValid=true;
398     for (int e=1;e<mrna.exons.Count();e++) {
399     const char* intron=gseq+mrna.exons[e-1]->end+1-mrna.start;
400     //debug
401     //uint istart=mrna.exons[e-1]->end+1;
402     //uint iend=mrna.exons[e]->start-1;
403     int intronlen=mrna.exons[e]->start-mrna.exons[e-1]->end-1;
404     GSpliceSite acceptorSite(intron,intronlen,true, revcompl);
405     GSpliceSite donorSite(intron,intronlen, false, revcompl);
406     //GMessage("%c intron %d-%d : %s .. %s\n",
407     // mrna.strand, istart, iend, donorSite.nt, acceptorSite.nt);
408     if (acceptorSite=="AG") { // GT-AG or GC-AG
409     if (!donorSite.canonicalDonor()) {
410     ssValid=false;break;
411     }
412     }
413     else if (acceptorSite=="AC") { //
414     if (donorSite!="AT") { ssValid=false; break; }
415     }
416     else { ssValid=false; break; }
417     }
418     //GFREE(gseq);
419     if (!ssValid) {
420     if (verbose)
421     GMessage("Invalid splice sites found for '%s'\n",mrna.getID());
422     return; //don't print this one!
423     }
424     }
425    
426     bool trprint=true;
427     int mCDphase=0;
428     if (mrna.CDphase=='1' || mrna.CDphase=='2')
429     mCDphase = mrna.CDphase-'0';
430     if (f_y!=NULL || f_x!=NULL || validCDSonly) {
431     if (faseq==NULL) GError("Error: no genomic sequence provided!\n");
432     //if (protmap && fullCDSonly) {
433     //if (protmap && (fullCDSonly || (mrna.qlen>0 && mrna.qend==mrna.qlen))) {
434     if (validCDSonly) { //make sure the stop codon is always included
435     adjust_stopcodon(mrna,3);
436     }
437     int strandNum=0;
438     int phaseNum=0;
439     CDS_CHECK:
440     cdsnt=mrna.getSpliced(faseq, true, &seqlen,NULL,NULL,&seglst);
441     if (cdsnt==NULL) trprint=false;
442     if (validCDSonly) {
443     cdsaa=translateDNA(cdsnt, aalen, seqlen);
444     char* p=strchr(cdsaa,'.');
445     bool hasStop=false;
446     if (p!=NULL) {
447     if (p-cdsaa>=aalen-2) { //stop found as the last codon
448     *p='0';//remove it
449     hasStop=true;
450     if (aalen-2==p-cdsaa) {
451     //previous to last codon is the stop codon
452     //so correct the CDS stop accordingly
453     adjust_stopcodon(mrna,-3, &seglst);
454     seqlen-=3;
455     cdsnt[seqlen]=0;
456     }
457     aalen=p-cdsaa;
458     }
459     else {//stop found before the last codon
460     trprint=false;
461     }
462     }//stop codon found
463     if (trprint==false) { //failed CDS validity check
464     //in-frame stop codon found
465     if (altPhases && phaseNum<3) {
466     phaseNum++;
467     mrna.CDphase = '0'+((mCDphase+phaseNum)%3);
468     GFREE(cdsaa);
469     goto CDS_CHECK;
470     }
471     if (mrna.exons.Count()==1 && bothStrands) {
472     strandNum++;
473     phaseNum=0;
474     if (strandNum<2) {
475     GFREE(cdsaa);
476     mrna.strand = (mrna.strand=='-') ? '+':'-';
477     goto CDS_CHECK; //repeat the CDS check for a different frame
478     }
479     }
480     if (verbose) GMessage("In-frame STOP found for '%s'\n",mrna.getID());
481     } //has in-frame STOP
482     if (fullCDSonly) {
483     if (!hasStop || cdsaa[0]!='M') trprint=false;
484     }
485     } // CDS check requested
486     } //translation or codon check/output was requested
487     if (!trprint) {
488     GFREE(cdsnt);
489     GFREE(cdsaa);
490     return;
491     }
492     if (f_out!=NULL) {
493     if (fmtGTF) mrna.printGtf(f_out,tracklabel);
494     else mrna.printGff(f_out, tracklabel);
495     }
496     if (f_y!=NULL) { //CDS translation fasta output requested
497     //char*
498     if (cdsaa==NULL) { //translate now if not done before
499     cdsaa=translateDNA(cdsnt, aalen, seqlen);
500     }
501     if (fullattr && mrna.attrs!=NULL) {
502     //append all attributes found for each transcripts
503     for (int i=0;i<mrna.attrs->Count();i++) {
504     defline.append(" ");
505     defline.append(mrna.getAttrName(i));
506     defline.append("=");
507     defline.append(mrna.getAttrValue(i));
508     }
509     }
510     printFasta(f_y, defline, cdsaa, aalen);
511     }
512     if (f_x!=NULL) { //CDS only
513     if (writeExonSegs) {
514     defline.append(" loc:");
515     defline.append(mrna.getGSeqName());
516     defline.appendfmt("(%c)",mrna.strand);
517     //warning: not CDS coordinates are written here, but the exon ones
518     defline+=(int)mrna.start;
519     defline+=(char)'-';
520     defline+=(int)mrna.end;
521     // -- here these are CDS substring coordinates on the spliced sequence:
522     defline.append(" segs:");
523     for (int i=0;i<seglst.Count();i++) {
524     if (i>0) defline.append(",");
525     defline+=(int)seglst[i]->start;
526     defline.append("-");
527     defline+=(int)seglst[i]->end;
528     }
529     }
530     if (fullattr && mrna.attrs!=NULL) {
531     //append all attributes found for each transcript
532     for (int i=0;i<mrna.attrs->Count();i++) {
533     defline.append(" ");
534     defline.append(mrna.getAttrName(i));
535     defline.append("=");
536     defline.append(mrna.getAttrValue(i));
537     }
538     }
539     printFasta(f_x, defline, cdsnt, seqlen);
540     }
541     GFREE(cdsnt);
542     GFREE(cdsaa);
543     if (f_w!=NULL) { //write spliced exons
544     uint cds_start=0;
545     uint cds_end=0;
546     seglst.Clear();
547     char* exont=mrna.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
548     if (exont!=NULL) {
549     if (mrna.CDstart>0) {
550     defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
551     }
552     if (writeExonSegs) {
553     defline.append(" loc:");
554     defline.append(mrna.getGSeqName());
555     defline+=(char)'|';
556     defline+=(int)mrna.start;
557     defline+=(char)'-';
558     defline+=(int)mrna.end;
559     defline+=(char)'|';
560     defline+=(char)mrna.strand;
561     defline.append(" exons:");
562     for (int i=0;i<mrna.exons.Count();i++) {
563     if (i>0) defline.append(",");
564     defline+=(int)mrna.exons[i]->start;
565     defline.append("-");
566     defline+=(int)mrna.exons[i]->end;
567     }
568     defline.append(" segs:");
569     for (int i=0;i<seglst.Count();i++) {
570     if (i>0) defline.append(",");
571     defline+=(int)seglst[i]->start;
572     defline.append("-");
573     defline+=(int)seglst[i]->end;
574     }
575     }
576     if (fullattr && mrna.attrs!=NULL) {
577     //append all attributes found for each transcripts
578     for (int i=0;i<mrna.attrs->Count();i++) {
579     defline.append(" ");
580     defline.append(mrna.getAttrName(i));
581     defline.append("=");
582     defline.append(mrna.getAttrValue(i));
583     }
584     }
585     printFasta(f_w, defline, exont, seqlen);
586     GFREE(exont);
587     }
588     } //writing f_w (spliced exons)
589     return;
590     }
591    
592     void openfw(FILE* &f, GArgs& args, char opt) {
593     GStr s=args.getOpt(opt);
594     if (!s.is_empty()) {
595     if (s=='-')
596     f=stdout;
597     else {
598     f=fopen(s,"w");
599     if (f==NULL) GError("Error creating file: %s\n", s.chars());
600     }
601     }
602     }
603    
604     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
605     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
606    
607     int main(int argc, char * const argv[]) {
608     GArgs args(argc, argv, "vOUNHWCVMNSXTDAPRZFGg:i:r:s:t:a:b:o:w:x:y:MINCOV=MINPID=");
609     int e;
610     if ((e=args.isError())>0)
611     GError("%s\nInvalid argument: %s\n", USAGE, argv[e]);
612     if (args.getOpt('h')!=NULL) GError("%s\n", USAGE);
613     debugMode=(args.getOpt('D')!=NULL);
614     mRNAOnly=(args.getOpt('O')==NULL);
615     sortByLoc=(args.getOpt('S')!=NULL);
616     addDescr=(args.getOpt('A')!=NULL);
617     verbose=(args.getOpt('v')!=NULL);
618     wCDSonly=(args.getOpt('C')!=NULL);
619     validCDSonly=(args.getOpt('V')!=NULL);
620     altPhases=(args.getOpt('H')!=NULL);
621     fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF
622     bothStrands=(args.getOpt('B')!=NULL);
623     fullCDSonly=(args.getOpt('M')!=NULL);
624     spliceCheck=(args.getOpt('N')!=NULL);
625     //protmap=(args.getOpt('P')!=NULL);
626     if (fullCDSonly) validCDSonly=true;
627     fullattr=(args.getOpt('F')!=NULL);
628     if (args.getOpt('G')==NULL)
629     noExonAttr=!fullattr;
630     else {
631     noExonAttr=true;
632     fullattr=true;
633     }
634     mergeCloseExons=(args.getOpt('Z')!=NULL);
635     multiExon=(args.getOpt('U')!=NULL);
636     writeExonSegs=(args.getOpt('W')!=NULL);
637     tracklabel=args.getOpt('t');
638     gseqpath=args.getOpt('g');
639     if (!gseqpath.is_empty()) {
640     sortByLoc=true;
641     int c=fileExists(gseqpath.chars());
642     if (c==1)
643     multiGSeq=true;//directory given
644     else if (c==2) { //single file given
645     if (gseqpath.rindex(".cidx")==gseqpath.length()-5) {
646     //cdbyank index given
647     gcdb=new GCdbYank(gseqpath.chars());
648     if (fileExists(gcdb->getDbName())) {
649     gcdbfa=gcdb->getDbName();
650     } else {
651     gcdbfa=gseqpath;
652     gcdbfa.chomp(".cidx");
653     if (!fileExists(gcdbfa.chars()))
654     GError("Error: cannot locate the fasta file for index %s !\n",gseqpath.chars());
655     }
656     multiGSeq=true;
657     }
658     else { //must be just the one single fasta file
659     multiGSeq=false;
660     faseq=new GFaSeqGet(gseqpath.chars(), false); // WARNING: does not validate FASTA line-len uniformity!
661     }
662     }//fasta file given
663     else GError("Error: cannot find genomic sequence path %s !\n",gseqpath.chars());
664     } // -g option
665     GStr s=args.getOpt('i');
666     if (!s.is_empty()) maxintron=s.asInt();
667     rfltWithin=(args.getOpt('R')!=NULL);
668     s=args.getOpt('r');
669     if (!s.is_empty()) {
670     s.trim();
671     int isep=s.index(':');
672     if (isep>=0) { //gseq name given
673     if (isep>0) rfltGSeq=Gstrdup((s.substr(0,isep)).chars());
674     s.cut(0,isep+1);
675     }
676     GStr gsend;
677     if (s.index("..")>=0) gsend=s.split("..");
678     else gsend=s.split('-');
679     if (!s.is_empty()) rfltStart=(uint)s.asInt();
680     if (!gsend.is_empty()) rfltEnd=(uint)gsend.asInt();
681     } //gseq/range filtering
682     else {
683     if (rfltWithin)
684     GError("Error: option -R doesn't make sense without -r!\n");
685     }
686     s=args.getOpt('s');
687     if (!s.is_empty()) {
688     FILE* fsize=fopen(s,"r");
689     if (fsize==NULL) GError("Error opening info file: %s\n",s.chars());
690     loadSeqInfo(fsize, seqinfo);
691     }
692     /*
693     openfw(fgtfok, args, 'a');
694     openfw(fgtfbad, args, 'b');
695     */
696     openfw(f_out, args, 'o');
697     //if (f_out==NULL) f_out=stdout;
698     if (gseqpath.is_empty() && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL))
699     GError("Error: -g option is required for options -w, -x, -y, -V, -N, -M !\n");
700    
701     openfw(f_w, args, 'w');
702     openfw(f_x, args, 'x');
703     openfw(f_y, args, 'y');
704     if (f_y!=NULL || f_x!=NULL) wCDSonly=true;
705     //useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL);
706     GStr infile;
707     if (args.startNonOpt()) {
708     infile=args.nextNonOpt();
709     //GMessage("Given file: %s\n",infile.chars());
710     }
711     if (!infile.is_empty()) {
712     f_in=fopen(infile, "r");
713     if (f_in==NULL)
714     GError("Cannot open input file %s!\n",infile.chars());
715     }
716     else
717     f_in=stdin;
718    
719     //GffReader* gfreader=new GffReader(f_in,true);
720     GffReader* gfreader=new GffReader(f_in, mRNAOnly, sortByLoc);
721     gfreader->readAll(fullattr, mergeCloseExons, noExonAttr);
722     if (debugMode) GMessage("[D] gff data loaded, now processing each entry\n");
723     //GList<GffObj> mrnas(true,true,false);
724     for (int m=0;m<gfreader->gflst.Count();m++) {
725     GffObj* mrna=gfreader->gflst[m];
726     if (mrna->exons.Count()==0) {
727     //a non-mRNA feature with no subfeatures
728     //yet just so we get some sequence functions working, add the dummy "exon" here
729     mrna->addExon(mrna->start,mrna->end);
730     }
731     if (rfltGSeq!=NULL) { //filter by gseqName
732     if (strcmp(mrna->getGSeqName(),rfltGSeq)!=0) {
733     delete mrna;
734     gfreader->gflst.Forget(m);
735     continue;
736     }
737     }
738     //check coordinates
739     if (rfltStart!=0 || rfltEnd!=MAX_UINT) {
740     if (rfltWithin) {
741     if (mrna->start<rfltStart || mrna->end>rfltEnd) {
742     delete mrna;
743     gfreader->gflst.Forget(m);
744     continue;
745     }
746     }
747     else {
748     if (mrna->start>rfltEnd || mrna->end<rfltStart) {
749     delete mrna;
750     gfreader->gflst.Forget(m);
751     continue;
752     }
753     }
754     }
755     if (multiExon && mrna->exons.Count()==1) {
756     delete mrna;
757     gfreader->gflst.Forget(m);
758     continue;
759     }
760     if (wCDSonly && mrna->CDstart==0) {
761     delete mrna;
762     gfreader->gflst.Forget(m);
763     continue;
764     }
765     /* if (validCDSonly && mrna->hasErrors) {
766     delete mrna;
767     gfreader->gflst.Forget(m);
768     continue;
769     }
770     */
771     process_mRNA(*mrna);
772     delete mrna;
773     gfreader->gflst.Forget(m);
774     }
775     // M_END:
776     delete gfreader;
777     seqinfo.Clear();
778     if (faseq!=NULL) delete faseq;
779     if (gcdb!=NULL) delete gcdb;
780     GFREE(rfltGSeq);
781     FRCLOSE(f_in);
782     FWCLOSE(f_out);
783     FWCLOSE(f_w);
784     FWCLOSE(f_x);
785     FWCLOSE(f_y);
786     }