ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gffread/gffread.cpp
Revision: 6
Committed: Mon Mar 22 22:07:27 2010 UTC (9 years, 7 months ago) by gpertea
File size: 25862 byte(s)
Log Message:
added gffread sources

Line File contents
1 #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5 #include "GFaSeqGet.h"
6 #include "gff.h"
7 #include <ctype.h>
8 // do not care about cdb compression
9 #ifdef ENABLE_COMPRESSION
10 #undef ENABLE_COMPRESSION
11 #endif
12 #include "GCdbYank.h"
13
14 #define USAGE "Usage:\n\
15 gffread <input_gff..> [-g <genomic_seq_fasta> | <dir>][-s <seq_info.fsize>]\n\
16 [-o <outfile.gff3>] [-t <tname>] [-r [<chr>:]<start>..<end>] [-i <maxintron>]\n\
17 [-CTVNMAFGRUVBHSZWTO] [-w <spl_exons.fa>] [-x <spl_cds.fa>] [-y <tran_cds.fa>]\n\
18 \n\
19 Filters and/or converts GFF/GTF records.\n\
20 \n\
21 Options:\n\
22 -g full path to one fasta file with the genomic sequence\n\
23 for all input mappings, OR a path to the directory where\n\
24 genomic sequences can be found as single-fasta files\n\
25 -s for mRNA/EST/protein mappings a tab-delimited file provides this info\n\
26 for each of the mapped sequences:\n\
27 <seq-name> <seq-length> <seq-description>\n\
28 -i discard transcripts having an intron larger than <maxintron>\n\
29 -r only show transcripts crossing coordinate range <start>..<end>\n\
30 (on chromosome/contig <chr> if provided)\n\
31 -R for -r option, discard all transcripts that are not fully \n\
32 contained within given range\n\
33 -U discard single-exon transcripts\n\
34 -C discard mRNAs that have no CDS feature\n\
35 -F keep all attributes from last column of GFF/GTF\n\
36 -G only parse additional exon attributes from the first exon\n\
37 and move them to the mRNA level (useful for GTF input)\n\
38 -A use the description field from <seq_info.fsize> and add it as\n\
39 a descr=.. attribute to the mRNA and/or Gene record\n\
40 \n\
41 -O process other (non-mRNA) GFF/GTF records (default is discard non-mRNA\n\
42 data). \n\
43 Note: non-mRNA records must have only one subfeature per parent feature\n\
44 -V discard any mRNAs with CDS having in-frame stop codons\n\
45 -H for -V option, check and adjust the starting CDS phase\n\
46 if the original phase leads to a translation with an \n\
47 in-frame stop codon\n\
48 -B for -V option, single-exon transcripts are also checked on the\n\
49 opposite strand\n\
50 -N only show multi-exon mRNAs if all their introns have the \n\
51 typical splice site consensus ( GT-AG, GC-AG or AT-AC )\n\
52 -M discard any mRNAs that either lack initial START codon\n\
53 or the terminal STOP codon, or have an in-frame stop codon\n\
54 (only print mRNAs with a fulll, valid CDS)\n\
55 \n\
56 -S sort output GFF records by genomic sequence and start coordinate\n\
57 note that this option is automatically enabled by -g option\n\
58 -Z merge close exons into a single exon (intron size<4)\n\
59 -t use <trackname> in the second column of each GFF/GTF output line\n\
60 -w write a fasta file with spliced exons for each mapping\n\
61 -x write a fasta file with spliced CDS for each mapping\n\
62 -W for -w and -x options, also write for each fasta record the exon \n\
63 coordinates projected onto the spliced sequence\n\
64 -y write a protein fasta file with the CDS translation for each mapping\n\
65 -o the output gff3 file with the 'filtered' entries\n\
66 -T output GTF format instead of GFF3\n\
67 \n\
68 "
69
70 FILE* ffasta=NULL;
71 FILE* f_in=NULL;
72 FILE* f_out=NULL;
73 FILE* f_w=NULL; //fasta with spliced exons (transcripts)
74 FILE* f_x=NULL; //fasta with spliced CDS
75 FILE* f_y=NULL; //fasta with translated CDS
76 bool wCDSonly=false;
77
78 bool validCDSonly=false; // translation with no in-frame STOP
79 bool bothStrands=false; //for single-exon mRNA validation, check the other strand too
80 bool altPhases=false; //if original phase fails translation validation,
81 //try the other 2 phases until one makes it
82 bool mRNAOnly=true;
83 bool spliceCheck=false; //only known splice-sites
84
85 bool fullCDSonly=false; // starts with START, ends with STOP codon
86 bool fullattr=false;
87 bool sortByLoc=false; // if the GFF output should be sorted by location
88 GStr gseqpath;
89 bool multiGSeq=false; //if a directory or a .cidx file was given to -g option
90 bool fmtGTF=false;
91 int gseq_id=-1; //current genome sequence ID -- the current GffObj::gseq_id
92 GFaSeqGet* faseq=NULL;
93 GCdbYank* gcdb=NULL;
94 GStr gcdbfa;
95 bool addDescr=false;
96 //bool protmap=false;
97 bool multiExon=false;
98 bool writeExonSegs=false;
99 char* tracklabel=NULL;
100 int maxintron=999000000;
101 bool mergeCloseExons=false;
102 //range filter:
103 char* rfltGSeq=NULL;
104 uint rfltStart=0;
105 uint rfltEnd=MAX_UINT;
106 bool rfltWithin=false; //check for full containment within given range
107 bool noExonAttr=false;
108 class SeqInfo {
109 public:
110 int len;
111 char* descr;
112 SeqInfo( int l, char* s) {
113 len=l;
114 if (s==NULL) {
115 descr=NULL;
116 } else {
117 descr=Gstrdup(s);
118 }
119 }
120 ~SeqInfo() {
121 GFREE(descr);
122 }
123 };
124
125
126 class GSpliceSite {
127 public:
128 char nt[3];
129 GSpliceSite(const char* c, bool revc=false) {
130 nt[2]=0;
131 if (c==NULL) {
132 nt[0]=0;
133 nt[1]=0;
134 return;
135 }
136 if (revc) {
137 nt[0]=toupper(ntComplement(c[1]));
138 nt[1]=toupper(ntComplement(c[0]));
139 }
140 else {
141 nt[0]=toupper(c[0]);
142 nt[1]=toupper(c[1]);
143 }
144 }
145
146 GSpliceSite(const char* intron, int intronlen, bool getAcceptor, bool revc=false) {
147 nt[2]=0;
148 if (intron==NULL || intronlen==0)
149 GError("Error: invalid intron or intron len for GSpliceSite()!\n");
150 const char* c=intron;
151 if (revc) {
152 if (!getAcceptor) c+=intronlen-2;
153 nt[0]=toupper(ntComplement(c[1]));
154 nt[1]=toupper(ntComplement(c[0]));
155 }
156 else { //on forward strand
157 if (getAcceptor) c+=intronlen-2;
158 nt[0]=toupper(c[0]);
159 nt[1]=toupper(c[1]);
160 }//forward strand
161 }
162
163 GSpliceSite(const char n1, const char n2) {
164 nt[2]=0;
165 nt[0]=toupper(n1);
166 nt[1]=toupper(n2);
167 }
168 bool canonicalDonor() {
169 return (nt[0]=='G' && (nt[1]=='C' || nt[1]=='T'));
170 }
171 bool operator==(GSpliceSite& c) {
172 return (c.nt[0]==nt[0] && c.nt[1]==nt[1]);
173 }
174 bool operator==(GSpliceSite* c) {
175 return (c->nt[0]==nt[0] && c->nt[1]==nt[1]);
176 }
177 bool operator==(const char* c) {
178 //return (nt[0]==toupper(c[0]) && nt[1]==toupper(c[1]));
179 //assumes given const nucleotides are uppercase already!
180 return (nt[0]==c[0] && nt[1]==c[1]);
181 }
182 bool operator!=(const char* c) {
183 //assumes given const nucleotides are uppercase already!
184 return (nt[0]!=c[0] || nt[1]!=c[1]);
185 }
186 };
187
188 //hash with sequence info
189 GHash<SeqInfo> seqinfo;
190 GHash<int> isoCounter; //counts the valid isoforms
191
192 bool debugMode=false;
193 bool verbose=false;
194
195 char* getSeqDescr(char* seqid) {
196 static char charbuf[128];
197 if (seqinfo.Count()==0) return NULL;
198 char* suf=rstrchr(seqid, '.');
199 if (suf!=NULL) *suf=0;
200 SeqInfo* seqd=seqinfo.Find(seqid);
201 if (suf!=NULL) *suf='.';
202 if (seqd!=NULL) {
203 GStr s(seqd->descr);
204 //cleanup some Uniref gunk
205 if (s[0]=='[') {
206 int r=s.index(']');
207 if (r>=0 && r<8 && isdigit(s[1]))
208 s.remove(0,r+1);
209 }
210 if (s.length()>80) {
211 int r=s.index(';');
212 if (r>5) s.cut(r);
213 }
214 if (s.length()>127) {
215 s.cut(127);
216 int r=s.rindex(' ');
217 if (r>0) s.cut(r);
218 }
219 strcpy(charbuf, s.chars());
220 return charbuf;
221 }
222 else return NULL;
223 }
224
225 char* getSeqName(char* seqid) {
226 static char charbuf[128];
227 char* suf=rstrchr(seqid, '.');
228 if (suf!=NULL) *suf=0;
229 strcpy(charbuf, seqid);
230 if (suf!=NULL) *suf='.';
231 return charbuf;
232 }
233
234 void printFasta(FILE* f, GStr& defline, char* seq, int seqlen=-1) {
235 if (seq==NULL) return;
236 int len=(seqlen>0)?seqlen:strlen(seq);
237 if (len<=0) return;
238 if (!defline.is_empty())
239 fprintf(f, ">%s\n",defline.chars());
240 int ilen=0;
241 for (int i=0; i < len; i++, ilen++) {
242 if (ilen == 70) {
243 fputc('\n', f);
244 ilen = 0;
245 }
246 putc(seq[i], f);
247 } //for
248 fputc('\n', f);
249 }
250
251 void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) {
252 GLineReader fr(f);
253 while (!fr.isEof()) {
254 char* line=fr.getLine();
255 if (line==NULL) break;
256 char* id=line;
257 char* lenstr=NULL;
258 char* text=NULL;
259 char* p=line;
260 while (*p!=0 && !isspace(*p)) p++;
261 if (*p==0) continue;
262 *p=0;p++;
263 while (*p==' ' || *p=='\t') p++;
264 if (*p==0) continue;
265 lenstr=p;
266 while (*p!=0 && !isspace(*p)) p++;
267 if (*p!=0) { *p=0;p++; }
268 while (*p==' ' || *p=='\t') p++;
269 if (*p!=0) text=p; //else text remains NULL
270 int len=0;
271 if (!parseInt(lenstr,len)) {
272 GMessage("Warning: could not parse sequence length: %s %s\n",
273 id, lenstr);
274 continue;
275 }
276 // --- here we have finished parsing the line
277 si.Add(id, new SeqInfo(len,text));
278 } //while lines
279 }
280
281
282 GFaSeqGet* openFastaSeq(const char* gseqname) {
283 //this is only called when either we have a cidx or a directory in gseqpath
284 GFaSeqGet* r=NULL;
285 if (gcdb!=NULL) {
286 off_t rpos=gcdb->getRecordPos(gseqname);
287 if (rpos<0) // genomic sequence not found
288 GError("Error: cannot find genomic sequence '%s' in %s\n",gseqname, gseqpath.chars());
289 return new GFaSeqGet(gcdbfa.chars(),rpos, false);// WARNING: does not validate FASTA line-len uniformity!
290 }
291 else {
292 GStr s(gseqpath);
293 s.append('/');
294 s.append(gseqname);
295 if (debugMode) GMessage("[D] >> opening fasta file for genomic sequence %s..\n",gseqname);
296 if (fileExists(s.chars())==2) {
297 r=new GFaSeqGet(s.chars(),false); // WARNING: does not validate FASTA line-len uniformity!
298 r->loadall();
299 if (debugMode) GMessage("[D] << loaded.\n");
300 return r;
301 }
302 s.append(".fa");
303 if (fileExists(s.chars())==2) {
304 r=new GFaSeqGet(s.chars(), false); // WARNING: does not validate FASTA line-len uniformity!
305 r->loadall();
306 if (debugMode) GMessage("[D] << loaded.\n");
307 return r;
308 }
309 s.append("sta");
310 if (fileExists(s.chars())==2) {
311 return new GFaSeqGet(s.chars(), false); // WARNING: does not validate FASTA line-len uniformity!
312 r->loadall();
313 if (debugMode) GMessage("[D] << loaded.\n");
314 return r;
315 }
316 GError("Error: cannot load genomic sequence %s from directory %s !\n",
317 gseqname, gseqpath.chars());
318 }
319 return NULL;
320 }
321
322
323 GFaSeqGet* fastaSeqGet(GffObj& mrna) {
324 if (gseqpath.is_empty()) return NULL;
325 if (multiGSeq && mrna.gseq_id!=gseq_id) {
326 if (faseq!=NULL) {
327 delete faseq;
328 faseq=NULL;
329 }
330 faseq=openFastaSeq(mrna.getGSeqName());
331 gseq_id=mrna.gseq_id;
332 }
333 return faseq;
334 }
335
336 void adjust_stopcodon(GffObj& mrna, int adj, GList<GSeg>* seglst=NULL) {
337 //adj>0 => extedn CDS, adj<0 => shrink CDS
338 //when CDS is expanded, exons have to be checked too and
339 // expanded accordingly if they had the same boundary
340 if (mrna.strand=='-') {
341 if ((int)mrna.CDstart>adj) {
342 mrna.CDstart-=adj;
343 if (adj>0 && mrna.exons.First()->start>=mrna.CDstart) {
344 mrna.exons.First()->start-=adj;
345 mrna.start=mrna.exons.First()->start;
346 mrna.covlen+=adj;
347 }
348 }
349 }
350 else {
351 mrna.CDend+=adj;
352 if (adj>0 && mrna.exons.Last()->end<=mrna.CDend) {
353 mrna.exons.Last()->end+=adj;
354 mrna.end=mrna.exons.Last()->end;
355 mrna.covlen+=adj;
356 }
357 }
358 if (seglst!=NULL) seglst->Last()->end+=adj;
359 }
360
361 void process_mRNA(GffObj& mrna) {
362 GStr gene(mrna.getGene());
363 GStr defline(mrna.getID());
364 if (!gene.is_empty() && strcmp(gene.chars(),mrna.getID())!=0) {
365 int* isonum=isoCounter.Find(gene.chars());
366 if (isonum==NULL) {
367 isonum=new int(1);
368 isoCounter.Add(mrna.getGene(),isonum);
369 }
370 else (*isonum)++;
371 defline.appendfmt(" gene=%s", gene.chars());
372 }
373 int seqlen=0;
374 const char* tlabel=tracklabel;
375 if (tlabel==NULL) tlabel=mrna.getTrackName();
376 //defline.appendfmt(" track:%s",tlabel);
377 char* cdsnt = NULL;
378 char* cdsaa = NULL;
379 int aalen=0;
380 for (int i=1;i<mrna.exons.Count();i++) {
381 int ilen=mrna.exons[i]->start-mrna.exons[i-1]->end-1;
382 if (ilen>2000000)
383 GMessage("Warning: very large intron (%d) for transcript %s\n",
384 ilen, mrna.getID());
385 if (ilen>maxintron) {
386 return;
387 }
388 }
389 GList<GSeg> seglst(false,true);
390 fastaSeqGet(mrna);
391 if (spliceCheck && mrna.exons.Count()>1) {
392 //check introns for splice site consensi ( GT-AG, GC-AG or AT-AC )
393 if (faseq==NULL) GError("Error: no genomic sequence available!\n");
394 int glen=mrna.end-mrna.start+1;
395 const char* gseq=faseq->subseq(mrna.start, glen);
396 bool revcompl=(mrna.strand=='-');
397 bool ssValid=true;
398 for (int e=1;e<mrna.exons.Count();e++) {
399 const char* intron=gseq+mrna.exons[e-1]->end+1-mrna.start;
400 //debug
401 //uint istart=mrna.exons[e-1]->end+1;
402 //uint iend=mrna.exons[e]->start-1;
403 int intronlen=mrna.exons[e]->start-mrna.exons[e-1]->end-1;
404 GSpliceSite acceptorSite(intron,intronlen,true, revcompl);
405 GSpliceSite donorSite(intron,intronlen, false, revcompl);
406 //GMessage("%c intron %d-%d : %s .. %s\n",
407 // mrna.strand, istart, iend, donorSite.nt, acceptorSite.nt);
408 if (acceptorSite=="AG") { // GT-AG or GC-AG
409 if (!donorSite.canonicalDonor()) {
410 ssValid=false;break;
411 }
412 }
413 else if (acceptorSite=="AC") { //
414 if (donorSite!="AT") { ssValid=false; break; }
415 }
416 else { ssValid=false; break; }
417 }
418 //GFREE(gseq);
419 if (!ssValid) {
420 if (verbose)
421 GMessage("Invalid splice sites found for '%s'\n",mrna.getID());
422 return; //don't print this one!
423 }
424 }
425
426 bool trprint=true;
427 int mCDphase=0;
428 if (mrna.CDphase=='1' || mrna.CDphase=='2')
429 mCDphase = mrna.CDphase-'0';
430 if (f_y!=NULL || f_x!=NULL || validCDSonly) {
431 if (faseq==NULL) GError("Error: no genomic sequence provided!\n");
432 //if (protmap && fullCDSonly) {
433 //if (protmap && (fullCDSonly || (mrna.qlen>0 && mrna.qend==mrna.qlen))) {
434 if (validCDSonly) { //make sure the stop codon is always included
435 adjust_stopcodon(mrna,3);
436 }
437 int strandNum=0;
438 int phaseNum=0;
439 CDS_CHECK:
440 cdsnt=mrna.getSpliced(faseq, true, &seqlen,NULL,NULL,&seglst);
441 if (cdsnt==NULL) trprint=false;
442 if (validCDSonly) {
443 cdsaa=translateDNA(cdsnt, aalen, seqlen);
444 char* p=strchr(cdsaa,'.');
445 bool hasStop=false;
446 if (p!=NULL) {
447 if (p-cdsaa>=aalen-2) { //stop found as the last codon
448 *p='0';//remove it
449 hasStop=true;
450 if (aalen-2==p-cdsaa) {
451 //previous to last codon is the stop codon
452 //so correct the CDS stop accordingly
453 adjust_stopcodon(mrna,-3, &seglst);
454 seqlen-=3;
455 cdsnt[seqlen]=0;
456 }
457 aalen=p-cdsaa;
458 }
459 else {//stop found before the last codon
460 trprint=false;
461 }
462 }//stop codon found
463 if (trprint==false) { //failed CDS validity check
464 //in-frame stop codon found
465 if (altPhases && phaseNum<3) {
466 phaseNum++;
467 mrna.CDphase = '0'+((mCDphase+phaseNum)%3);
468 GFREE(cdsaa);
469 goto CDS_CHECK;
470 }
471 if (mrna.exons.Count()==1 && bothStrands) {
472 strandNum++;
473 phaseNum=0;
474 if (strandNum<2) {
475 GFREE(cdsaa);
476 mrna.strand = (mrna.strand=='-') ? '+':'-';
477 goto CDS_CHECK; //repeat the CDS check for a different frame
478 }
479 }
480 if (verbose) GMessage("In-frame STOP found for '%s'\n",mrna.getID());
481 } //has in-frame STOP
482 if (fullCDSonly) {
483 if (!hasStop || cdsaa[0]!='M') trprint=false;
484 }
485 } // CDS check requested
486 } //translation or codon check/output was requested
487 if (!trprint) {
488 GFREE(cdsnt);
489 GFREE(cdsaa);
490 return;
491 }
492 if (f_out!=NULL) {
493 if (fmtGTF) mrna.printGtf(f_out,tracklabel);
494 else mrna.printGff(f_out, tracklabel);
495 }
496 if (f_y!=NULL) { //CDS translation fasta output requested
497 //char*
498 if (cdsaa==NULL) { //translate now if not done before
499 cdsaa=translateDNA(cdsnt, aalen, seqlen);
500 }
501 if (fullattr && mrna.attrs!=NULL) {
502 //append all attributes found for each transcripts
503 for (int i=0;i<mrna.attrs->Count();i++) {
504 defline.append(" ");
505 defline.append(mrna.getAttrName(i));
506 defline.append("=");
507 defline.append(mrna.getAttrValue(i));
508 }
509 }
510 printFasta(f_y, defline, cdsaa, aalen);
511 }
512 if (f_x!=NULL) { //CDS only
513 if (writeExonSegs) {
514 defline.append(" loc:");
515 defline.append(mrna.getGSeqName());
516 defline.appendfmt("(%c)",mrna.strand);
517 //warning: not CDS coordinates are written here, but the exon ones
518 defline+=(int)mrna.start;
519 defline+=(char)'-';
520 defline+=(int)mrna.end;
521 // -- here these are CDS substring coordinates on the spliced sequence:
522 defline.append(" segs:");
523 for (int i=0;i<seglst.Count();i++) {
524 if (i>0) defline.append(",");
525 defline+=(int)seglst[i]->start;
526 defline.append("-");
527 defline+=(int)seglst[i]->end;
528 }
529 }
530 if (fullattr && mrna.attrs!=NULL) {
531 //append all attributes found for each transcript
532 for (int i=0;i<mrna.attrs->Count();i++) {
533 defline.append(" ");
534 defline.append(mrna.getAttrName(i));
535 defline.append("=");
536 defline.append(mrna.getAttrValue(i));
537 }
538 }
539 printFasta(f_x, defline, cdsnt, seqlen);
540 }
541 GFREE(cdsnt);
542 GFREE(cdsaa);
543 if (f_w!=NULL) { //write spliced exons
544 uint cds_start=0;
545 uint cds_end=0;
546 seglst.Clear();
547 char* exont=mrna.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
548 if (exont!=NULL) {
549 if (mrna.CDstart>0) {
550 defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
551 }
552 if (writeExonSegs) {
553 defline.append(" loc:");
554 defline.append(mrna.getGSeqName());
555 defline+=(char)'|';
556 defline+=(int)mrna.start;
557 defline+=(char)'-';
558 defline+=(int)mrna.end;
559 defline+=(char)'|';
560 defline+=(char)mrna.strand;
561 defline.append(" exons:");
562 for (int i=0;i<mrna.exons.Count();i++) {
563 if (i>0) defline.append(",");
564 defline+=(int)mrna.exons[i]->start;
565 defline.append("-");
566 defline+=(int)mrna.exons[i]->end;
567 }
568 defline.append(" segs:");
569 for (int i=0;i<seglst.Count();i++) {
570 if (i>0) defline.append(",");
571 defline+=(int)seglst[i]->start;
572 defline.append("-");
573 defline+=(int)seglst[i]->end;
574 }
575 }
576 if (fullattr && mrna.attrs!=NULL) {
577 //append all attributes found for each transcripts
578 for (int i=0;i<mrna.attrs->Count();i++) {
579 defline.append(" ");
580 defline.append(mrna.getAttrName(i));
581 defline.append("=");
582 defline.append(mrna.getAttrValue(i));
583 }
584 }
585 printFasta(f_w, defline, exont, seqlen);
586 GFREE(exont);
587 }
588 } //writing f_w (spliced exons)
589 return;
590 }
591
592 void openfw(FILE* &f, GArgs& args, char opt) {
593 GStr s=args.getOpt(opt);
594 if (!s.is_empty()) {
595 if (s=='-')
596 f=stdout;
597 else {
598 f=fopen(s,"w");
599 if (f==NULL) GError("Error creating file: %s\n", s.chars());
600 }
601 }
602 }
603
604 #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
605 #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
606
607 int main(int argc, char * const argv[]) {
608 GArgs args(argc, argv, "vOUNHWCVMNSXTDAPRZFGg:i:r:s:t:a:b:o:w:x:y:MINCOV=MINPID=");
609 int e;
610 if ((e=args.isError())>0)
611 GError("%s\nInvalid argument: %s\n", USAGE, argv[e]);
612 if (args.getOpt('h')!=NULL) GError("%s\n", USAGE);
613 debugMode=(args.getOpt('D')!=NULL);
614 mRNAOnly=(args.getOpt('O')==NULL);
615 sortByLoc=(args.getOpt('S')!=NULL);
616 addDescr=(args.getOpt('A')!=NULL);
617 verbose=(args.getOpt('v')!=NULL);
618 wCDSonly=(args.getOpt('C')!=NULL);
619 validCDSonly=(args.getOpt('V')!=NULL);
620 altPhases=(args.getOpt('H')!=NULL);
621 fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF
622 bothStrands=(args.getOpt('B')!=NULL);
623 fullCDSonly=(args.getOpt('M')!=NULL);
624 spliceCheck=(args.getOpt('N')!=NULL);
625 //protmap=(args.getOpt('P')!=NULL);
626 if (fullCDSonly) validCDSonly=true;
627 fullattr=(args.getOpt('F')!=NULL);
628 if (args.getOpt('G')==NULL)
629 noExonAttr=!fullattr;
630 else {
631 noExonAttr=true;
632 fullattr=true;
633 }
634 mergeCloseExons=(args.getOpt('Z')!=NULL);
635 multiExon=(args.getOpt('U')!=NULL);
636 writeExonSegs=(args.getOpt('W')!=NULL);
637 tracklabel=args.getOpt('t');
638 gseqpath=args.getOpt('g');
639 if (!gseqpath.is_empty()) {
640 sortByLoc=true;
641 int c=fileExists(gseqpath.chars());
642 if (c==1)
643 multiGSeq=true;//directory given
644 else if (c==2) { //single file given
645 if (gseqpath.rindex(".cidx")==gseqpath.length()-5) {
646 //cdbyank index given
647 gcdb=new GCdbYank(gseqpath.chars());
648 if (fileExists(gcdb->getDbName())) {
649 gcdbfa=gcdb->getDbName();
650 } else {
651 gcdbfa=gseqpath;
652 gcdbfa.chomp(".cidx");
653 if (!fileExists(gcdbfa.chars()))
654 GError("Error: cannot locate the fasta file for index %s !\n",gseqpath.chars());
655 }
656 multiGSeq=true;
657 }
658 else { //must be just the one single fasta file
659 multiGSeq=false;
660 faseq=new GFaSeqGet(gseqpath.chars(), false); // WARNING: does not validate FASTA line-len uniformity!
661 }
662 }//fasta file given
663 else GError("Error: cannot find genomic sequence path %s !\n",gseqpath.chars());
664 } // -g option
665 GStr s=args.getOpt('i');
666 if (!s.is_empty()) maxintron=s.asInt();
667 rfltWithin=(args.getOpt('R')!=NULL);
668 s=args.getOpt('r');
669 if (!s.is_empty()) {
670 s.trim();
671 int isep=s.index(':');
672 if (isep>=0) { //gseq name given
673 if (isep>0) rfltGSeq=Gstrdup((s.substr(0,isep)).chars());
674 s.cut(0,isep+1);
675 }
676 GStr gsend;
677 if (s.index("..")>=0) gsend=s.split("..");
678 else gsend=s.split('-');
679 if (!s.is_empty()) rfltStart=(uint)s.asInt();
680 if (!gsend.is_empty()) rfltEnd=(uint)gsend.asInt();
681 } //gseq/range filtering
682 else {
683 if (rfltWithin)
684 GError("Error: option -R doesn't make sense without -r!\n");
685 }
686 s=args.getOpt('s');
687 if (!s.is_empty()) {
688 FILE* fsize=fopen(s,"r");
689 if (fsize==NULL) GError("Error opening info file: %s\n",s.chars());
690 loadSeqInfo(fsize, seqinfo);
691 }
692 /*
693 openfw(fgtfok, args, 'a');
694 openfw(fgtfbad, args, 'b');
695 */
696 openfw(f_out, args, 'o');
697 //if (f_out==NULL) f_out=stdout;
698 if (gseqpath.is_empty() && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL))
699 GError("Error: -g option is required for options -w, -x, -y, -V, -N, -M !\n");
700
701 openfw(f_w, args, 'w');
702 openfw(f_x, args, 'x');
703 openfw(f_y, args, 'y');
704 if (f_y!=NULL || f_x!=NULL) wCDSonly=true;
705 //useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL);
706 GStr infile;
707 if (args.startNonOpt()) {
708 infile=args.nextNonOpt();
709 //GMessage("Given file: %s\n",infile.chars());
710 }
711 if (!infile.is_empty()) {
712 f_in=fopen(infile, "r");
713 if (f_in==NULL)
714 GError("Cannot open input file %s!\n",infile.chars());
715 }
716 else
717 f_in=stdin;
718
719 //GffReader* gfreader=new GffReader(f_in,true);
720 GffReader* gfreader=new GffReader(f_in, mRNAOnly, sortByLoc);
721 gfreader->readAll(fullattr, mergeCloseExons, noExonAttr);
722 if (debugMode) GMessage("[D] gff data loaded, now processing each entry\n");
723 //GList<GffObj> mrnas(true,true,false);
724 for (int m=0;m<gfreader->gflst.Count();m++) {
725 GffObj* mrna=gfreader->gflst[m];
726 if (mrna->exons.Count()==0) {
727 //a non-mRNA feature with no subfeatures
728 //yet just so we get some sequence functions working, add the dummy "exon" here
729 mrna->addExon(mrna->start,mrna->end);
730 }
731 if (rfltGSeq!=NULL) { //filter by gseqName
732 if (strcmp(mrna->getGSeqName(),rfltGSeq)!=0) {
733 delete mrna;
734 gfreader->gflst.Forget(m);
735 continue;
736 }
737 }
738 //check coordinates
739 if (rfltStart!=0 || rfltEnd!=MAX_UINT) {
740 if (rfltWithin) {
741 if (mrna->start<rfltStart || mrna->end>rfltEnd) {
742 delete mrna;
743 gfreader->gflst.Forget(m);
744 continue;
745 }
746 }
747 else {
748 if (mrna->start>rfltEnd || mrna->end<rfltStart) {
749 delete mrna;
750 gfreader->gflst.Forget(m);
751 continue;
752 }
753 }
754 }
755 if (multiExon && mrna->exons.Count()==1) {
756 delete mrna;
757 gfreader->gflst.Forget(m);
758 continue;
759 }
760 if (wCDSonly && mrna->CDstart==0) {
761 delete mrna;
762 gfreader->gflst.Forget(m);
763 continue;
764 }
765 /* if (validCDSonly && mrna->hasErrors) {
766 delete mrna;
767 gfreader->gflst.Forget(m);
768 continue;
769 }
770 */
771 process_mRNA(*mrna);
772 delete mrna;
773 gfreader->gflst.Forget(m);
774 }
775 // M_END:
776 delete gfreader;
777 seqinfo.Clear();
778 if (faseq!=NULL) delete faseq;
779 if (gcdb!=NULL) delete gcdb;
780 GFREE(rfltGSeq);
781 FRCLOSE(f_in);
782 FWCLOSE(f_out);
783 FWCLOSE(f_w);
784 FWCLOSE(f_x);
785 FWCLOSE(f_y);
786 }