ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gffread/gffread.cpp
Revision: 38
Committed: Sat Aug 27 15:42:15 2011 UTC (8 years, 3 months ago) by gpertea
File size: 35337 byte(s)
Log Message:
Line User Rev File contents
1 gpertea 17 #include "gff_utils.h"
2 gpertea 6 #include "GArgs.h"
3     #include <ctype.h>
4 gpertea 17 // don't care about cdb compression
5     //#ifdef ENABLE_COMPRESSION
6     //#undef ENABLE_COMPRESSION
7     //#endif
8     //#include "GCdbYank.h"
9 gpertea 6
10     #define USAGE "Usage:\n\
11 gpertea 17 gffread <input_gff> [-g <genomic_seqs_fasta> | <dir>][-s <seq_info.fsize>] \n\
12     [-o <outfile.gff>] [-t <tname>] [-r [[<strand>]<chr>:]<start>..<end>] \n\
13     [-CTVNJMKQAFGRUVBHZWTOLE] [-w <spl_exons.fa>] [-x <spl_cds.fa>] [-y <tr_cds.fa>]\n\
14     [-i <maxintron>] \n\
15     Filters and/or converts GFF3/GTF2 records.\n\
16     <input_gff> is a GFF file, use '-' if the GFF records will be given at stdin\n\
17 gpertea 6 \n\
18     Options:\n\
19 gpertea 17 -g full path to a multi-fasta file with the genomic sequences\n\
20     for all input mappings, OR a directory with single-fasta files\n\
21     (one per genomic sequence, with file names matching sequence names)\n\
22     -s <seq_info.fsize> is a tab-delimited file providing this info\n\
23 gpertea 6 for each of the mapped sequences:\n\
24     <seq-name> <seq-length> <seq-description>\n\
25 gpertea 17 (useful for -A option with mRNA/EST/protein mappings)\n\
26 gpertea 6 -i discard transcripts having an intron larger than <maxintron>\n\
27     -r only show transcripts crossing coordinate range <start>..<end>\n\
28 gpertea 17 (on chromosome/contig <chr>, strand <strand> if provided)\n\
29 gpertea 6 -R for -r option, discard all transcripts that are not fully \n\
30     contained within given range\n\
31     -U discard single-exon transcripts\n\
32 gpertea 17 -C coding only: discard mRNAs that have no CDS feature\n\
33     -F full GFF attribute preservation (all attributes are shown)\n\
34 gpertea 6 -G only parse additional exon attributes from the first exon\n\
35     and move them to the mRNA level (useful for GTF input)\n\
36 gpertea 17 -A use the description field from <seq_info.fsize> and add it\n\
37     as the value for a 'descr' attribute to the GFF record\n\
38 gpertea 6 \n\
39 gpertea 17 -O process also non-transcript GFF records (by default non-transcript\n\
40     records are ignored)\n\
41 gpertea 6 -V discard any mRNAs with CDS having in-frame stop codons\n\
42     -H for -V option, check and adjust the starting CDS phase\n\
43     if the original phase leads to a translation with an \n\
44     in-frame stop codon\n\
45     -B for -V option, single-exon transcripts are also checked on the\n\
46     opposite strand\n\
47 gpertea 17 -N discard multi-exon mRNAs that have any intron with a non-canonical\n\
48     splice site consensus (i.e. not GT-AG, GC-AG or AT-AC)\n\
49     -J discard any mRNAs that either lack initial START codon\n\
50 gpertea 6 or the terminal STOP codon, or have an in-frame stop codon\n\
51     (only print mRNAs with a fulll, valid CDS)\n\
52     \n\
53 gpertea 17 -M/--merge : cluster the input transcripts into loci, collapsing matching\n\
54     transcripts (those with the same exact introns and fully contained)\n\
55 gpertea 38 -d <dupinfo> : for -M option, write collapsing info to file <dupinfo>\n\
56 gpertea 17 --cluster-only: same as --merge but without collapsing matching transcripts\n\
57     -K for -M option: also collapse shorter, fully contained transcripts\n\
58     with fewer introns than the container\n\
59     -Q for -M option, remove the containment restriction:\n\
60     (multi-exon transcripts will be collapsed if just their introns match,\n\
61     while single-exon transcripts can partially overlap (80%))\n\
62     \n\
63     -E expose (warn about) duplicate transcript IDs and other potential \n\
64     problems with the given GFF/GTF records\n\
65     -Z merge close exons into a single exon (for intron size<4)\n\
66     -w write a fasta file with spliced exons for each GFF transcript\n\
67     -x write a fasta file with spliced CDS for each GFF transcript\n\
68     -W for -w and -x options, also write for each fasta record the exon\n\
69 gpertea 6 coordinates projected onto the spliced sequence\n\
70 gpertea 17 -y write a protein fasta file with the translation of CDS for each record\n\
71     -L Ensembl GTF to GFF3 conversion (implies -F; should be used with -m)\n\
72     -m <chr_replace> is a reference (genomic) sequence replacement table with\n\
73     this format:\n\
74     <original_ref_ID> <new_ref_ID>\n\
75     GFF records on reference sequences that are not found among the\n\
76     <original_ref_ID> entries in this file will be filtered out\n\
77     -o the \"filtered\" GFF records will be written to <outfile.gff>\n\
78     (use -o- for printing to stdout)\n\
79     -t use <trackname> in the second column of each GFF output line\n\
80     -T -o option will output GTF format instead of GFF3\n\
81     "
82 gpertea 6
83 gpertea 17
84     class SeqInfo { //populated from the -s option of gffread
85     public:
86     int len;
87     char* descr;
88     SeqInfo( int l, char* s) {
89     len=l;
90     if (s==NULL) {
91     descr=NULL;
92     } else {
93     descr=Gstrdup(s);
94     }
95     }
96     ~SeqInfo() {
97     GFREE(descr);
98     }
99     };
100    
101     class RefTran {
102     public:
103     char* new_name;
104     RefTran(char *ns) {
105     new_name=NULL;
106     if (ns!=NULL)
107     new_name=Gstrdup(ns);
108     }
109     ~RefTran() {
110     GFREE(new_name);
111     }
112     };
113    
114 gpertea 6 FILE* ffasta=NULL;
115     FILE* f_in=NULL;
116     FILE* f_out=NULL;
117     FILE* f_w=NULL; //fasta with spliced exons (transcripts)
118     FILE* f_x=NULL; //fasta with spliced CDS
119     FILE* f_y=NULL; //fasta with translated CDS
120     bool wCDSonly=false;
121    
122     bool validCDSonly=false; // translation with no in-frame STOP
123     bool bothStrands=false; //for single-exon mRNA validation, check the other strand too
124     bool altPhases=false; //if original phase fails translation validation,
125     //try the other 2 phases until one makes it
126     bool mRNAOnly=true;
127     bool spliceCheck=false; //only known splice-sites
128    
129     bool fullCDSonly=false; // starts with START, ends with STOP codon
130     bool fullattr=false;
131 gpertea 17 //bool sortByLoc=false; // if the GFF output should be sorted by location
132     bool ensembl_convert=false; //-L, assisst in converting Ensembl GTF to GFF3
133    
134    
135     //GStr gseqpath;
136     //GStr gcdbfa;
137     //bool multiGSeq=false; //if a directory or a .cidx file was given to -g option
138     //GFaSeqGet* faseq=NULL;
139     //GCdbYank* gcdb=NULL;
140     //int gseq_id=-1; //current genome sequence ID -- the current GffObj::gseq_id
141 gpertea 6 bool fmtGTF=false;
142     bool addDescr=false;
143     //bool protmap=false;
144     bool multiExon=false;
145     bool writeExonSegs=false;
146     char* tracklabel=NULL;
147     int maxintron=999000000;
148     bool mergeCloseExons=false;
149     //range filter:
150     char* rfltGSeq=NULL;
151 gpertea 17 char rfltStrand=0;
152 gpertea 6 uint rfltStart=0;
153     uint rfltEnd=MAX_UINT;
154     bool rfltWithin=false; //check for full containment within given range
155     bool noExonAttr=false;
156    
157 gpertea 17 bool doCluster=false;
158     bool doCollapseRedundant=false;
159 gpertea 6
160 gpertea 17 GList<GenomicSeqData> g_data(true,true,true); //list of GFF records by genomic seq
161 gpertea 6
162     //hash with sequence info
163     GHash<SeqInfo> seqinfo;
164     GHash<int> isoCounter; //counts the valid isoforms
165 gpertea 17 GHash<RefTran> reftbl;
166     GHash<GeneInfo> gene_ids;
167     //min-max gene span associated to chr|gene_id (mostly for Ensembl conversion)
168 gpertea 6
169     bool debugMode=false;
170     bool verbose=false;
171    
172 gpertea 17 void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) {
173     GLineReader fr(f);
174     while (!fr.isEof()) {
175     char* line=fr.getLine();
176     if (line==NULL) break;
177     char* id=line;
178     char* lenstr=NULL;
179     char* text=NULL;
180     char* p=line;
181     while (*p!=0 && !isspace(*p)) p++;
182     if (*p==0) continue;
183     *p=0;p++;
184     while (*p==' ' || *p=='\t') p++;
185     if (*p==0) continue;
186     lenstr=p;
187     while (*p!=0 && !isspace(*p)) p++;
188     if (*p!=0) { *p=0;p++; }
189     while (*p==' ' || *p=='\t') p++;
190     if (*p!=0) text=p; //else text remains NULL
191     int len=0;
192     if (!parseInt(lenstr,len)) {
193     GMessage("Warning: could not parse sequence length: %s %s\n",
194     id, lenstr);
195     continue;
196     }
197     // --- here we have finished parsing the line
198     si.Add(id, new SeqInfo(len,text));
199     } //while lines
200     }
201    
202     void loadRefTable(FILE* f, GHash<RefTran>& rt) {
203     GLineReader fr(f);
204     char* line=NULL;
205     while ((line=fr.getLine())) {
206     char* orig_id=line;
207     char* p=line;
208     while (*p!=0 && !isspace(*p)) p++;
209     if (*p==0) continue;
210     *p=0;p++;//split the line here
211     while (*p==' ' || *p=='\t') p++;
212     if (*p==0) continue;
213     rt.Add(orig_id, new RefTran(p));
214     } //while lines
215     }
216    
217 gpertea 6 char* getSeqDescr(char* seqid) {
218     static char charbuf[128];
219     if (seqinfo.Count()==0) return NULL;
220     char* suf=rstrchr(seqid, '.');
221     if (suf!=NULL) *suf=0;
222     SeqInfo* seqd=seqinfo.Find(seqid);
223     if (suf!=NULL) *suf='.';
224     if (seqd!=NULL) {
225     GStr s(seqd->descr);
226     //cleanup some Uniref gunk
227     if (s[0]=='[') {
228     int r=s.index(']');
229     if (r>=0 && r<8 && isdigit(s[1]))
230     s.remove(0,r+1);
231     }
232     if (s.length()>80) {
233     int r=s.index(';');
234     if (r>5) s.cut(r);
235     }
236     if (s.length()>127) {
237     s.cut(127);
238     int r=s.rindex(' ');
239     if (r>0) s.cut(r);
240     }
241     strcpy(charbuf, s.chars());
242     return charbuf;
243     }
244     else return NULL;
245     }
246    
247     char* getSeqName(char* seqid) {
248     static char charbuf[128];
249     char* suf=rstrchr(seqid, '.');
250     if (suf!=NULL) *suf=0;
251     strcpy(charbuf, seqid);
252     if (suf!=NULL) *suf='.';
253     return charbuf;
254     }
255    
256 gpertea 17 GFaSeqGet* fastaSeqGet(GFastaDb& gfasta, GffObj& gffrec) {
257     if (gfasta.fastaPath==NULL) return NULL;
258     return gfasta.fetch(gffrec.gseq_id);
259 gpertea 6 }
260    
261    
262 gpertea 17 int adjust_stopcodon(GffObj& gffrec, int adj, GList<GSeg>* seglst=NULL) {
263 gpertea 6 //adj>0 => extedn CDS, adj<0 => shrink CDS
264     //when CDS is expanded, exons have to be checked too and
265     // expanded accordingly if they had the same boundary
266 gpertea 17 int realadj=0;
267     if (gffrec.strand=='-') {
268     if ((int)gffrec.CDstart>adj) {
269    
270     gffrec.CDstart-=adj;
271     realadj=adj;
272     if (adj<0) { //restore
273     if (gffrec.exons.First()->start==gffrec.CDstart+adj) {
274     gffrec.exons.First()->start-=adj;
275     gffrec.start=gffrec.exons.First()->start;
276     gffrec.covlen+=adj;
277 gpertea 6 }
278 gpertea 17 }
279     else if (gffrec.exons.First()->start>=gffrec.CDstart) {
280     gffrec.exons.First()->start-=adj;
281     gffrec.start=gffrec.exons.First()->start;
282     gffrec.covlen+=adj;
283     }
284 gpertea 6 }
285     }
286     else {
287 gpertea 17 realadj=adj;
288     gffrec.CDend+=adj;
289     if (adj<0) {//restore
290     if (gffrec.exons.Last()->end==gffrec.CDend-adj) {
291     gffrec.exons.Last()->end+=adj;
292     gffrec.end=gffrec.exons.Last()->end;
293     gffrec.covlen+=adj;
294     }
295     }
296     else if (gffrec.exons.Last()->end<=gffrec.CDend) {
297     gffrec.exons.Last()->end+=adj;
298     gffrec.end=gffrec.exons.Last()->end;
299     gffrec.covlen+=adj;
300 gpertea 6 }
301     }
302     if (seglst!=NULL) seglst->Last()->end+=adj;
303 gpertea 17 return realadj;
304 gpertea 6 }
305    
306 gpertea 17 bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
307     //returns true if the transcript passed the filter
308     char* gname=gffrec.getGeneName();
309     if (gname==NULL) gname=gffrec.getGeneID();
310     GStr defline(gffrec.getID());
311     if (f_out && !fmtGTF) {
312     const char* tn=NULL;
313     if ((tn=gffrec.getAttr("transcript_name"))!=NULL) {
314     gffrec.addAttr("Name", tn);
315     gffrec.removeAttr("transcript_name");
316     }
317     }
318     if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) {
319     const char* tn=gffrec.getTrackName();
320     gffrec.addAttr("type", tn);
321     //bool is_gene=false;
322     bool is_pseudo=false;
323     if (strcmp(tn, "protein_coding")==0 || gffrec.hasCDS())
324     gffrec.setFeatureName("mRNA");
325     else {
326     if (strcmp(tn, "processed_transcript")==0)
327     gffrec.setFeatureName("proc_RNA");
328     else {
329     //is_gene=endsWith(tn, "gene");
330     is_pseudo=strifind(tn, "pseudo");
331     if (is_pseudo) {
332     gffrec.setFeatureName("pseudo_RNA");
333     }
334     else if (endsWith(tn, "RNA")) {
335     gffrec.setFeatureName(tn);
336     } else gffrec.setFeatureName("misc_RNA");
337     }
338     }
339 gpertea 6 }
340 gpertea 17 if (gname && strcmp(gname, gffrec.getID())!=0) {
341     int* isonum=isoCounter.Find(gname);
342     if (isonum==NULL) {
343     isonum=new int(1);
344     isoCounter.Add(gname,isonum);
345     }
346     else (*isonum)++;
347     defline.appendfmt(" gene=%s", gname);
348     }
349 gpertea 6 int seqlen=0;
350 gpertea 17
351 gpertea 6 const char* tlabel=tracklabel;
352 gpertea 17 if (tlabel==NULL) tlabel=gffrec.getTrackName();
353 gpertea 6 //defline.appendfmt(" track:%s",tlabel);
354     char* cdsnt = NULL;
355     char* cdsaa = NULL;
356     int aalen=0;
357 gpertea 17 for (int i=1;i<gffrec.exons.Count();i++) {
358     int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1;
359     if (ilen>4000000)
360 gpertea 6 GMessage("Warning: very large intron (%d) for transcript %s\n",
361 gpertea 17 ilen, gffrec.getID());
362 gpertea 6 if (ilen>maxintron) {
363 gpertea 17 return false;
364 gpertea 6 }
365     }
366     GList<GSeg> seglst(false,true);
367 gpertea 17 GFaSeqGet* faseq=fastaSeqGet(gfasta, gffrec);
368     if (spliceCheck && gffrec.exons.Count()>1) {
369 gpertea 6 //check introns for splice site consensi ( GT-AG, GC-AG or AT-AC )
370     if (faseq==NULL) GError("Error: no genomic sequence available!\n");
371 gpertea 17 int glen=gffrec.end-gffrec.start+1;
372     const char* gseq=faseq->subseq(gffrec.start, glen);
373     bool revcompl=(gffrec.strand=='-');
374 gpertea 6 bool ssValid=true;
375 gpertea 17 for (int e=1;e<gffrec.exons.Count();e++) {
376     const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start;
377     int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1;
378 gpertea 6 GSpliceSite acceptorSite(intron,intronlen,true, revcompl);
379     GSpliceSite donorSite(intron,intronlen, false, revcompl);
380     //GMessage("%c intron %d-%d : %s .. %s\n",
381 gpertea 17 // gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt);
382 gpertea 6 if (acceptorSite=="AG") { // GT-AG or GC-AG
383     if (!donorSite.canonicalDonor()) {
384     ssValid=false;break;
385     }
386     }
387     else if (acceptorSite=="AC") { //
388     if (donorSite!="AT") { ssValid=false; break; }
389     }
390     else { ssValid=false; break; }
391     }
392     //GFREE(gseq);
393     if (!ssValid) {
394     if (verbose)
395 gpertea 17 GMessage("Invalid splice sites found for '%s'\n",gffrec.getID());
396     return false; //don't print this one!
397 gpertea 6 }
398     }
399    
400     bool trprint=true;
401 gpertea 17 int stopCodonAdjust=0;
402 gpertea 6 int mCDphase=0;
403 gpertea 17 bool hasStop=false;
404     if (gffrec.CDphase=='1' || gffrec.CDphase=='2')
405     mCDphase = gffrec.CDphase-'0';
406 gpertea 6 if (f_y!=NULL || f_x!=NULL || validCDSonly) {
407     if (faseq==NULL) GError("Error: no genomic sequence provided!\n");
408     //if (protmap && fullCDSonly) {
409 gpertea 17 //if (protmap && (fullCDSonly || (gffrec.qlen>0 && gffrec.qend==gffrec.qlen))) {
410    
411 gpertea 6 if (validCDSonly) { //make sure the stop codon is always included
412 gpertea 17 //adjust_stopcodon(gffrec,3);
413     stopCodonAdjust=adjust_stopcodon(gffrec,3);
414 gpertea 6 }
415     int strandNum=0;
416     int phaseNum=0;
417     CDS_CHECK:
418 gpertea 17 cdsnt=gffrec.getSpliced(faseq, true, &seqlen,NULL,NULL,&seglst);
419 gpertea 6 if (cdsnt==NULL) trprint=false;
420     if (validCDSonly) {
421     cdsaa=translateDNA(cdsnt, aalen, seqlen);
422     char* p=strchr(cdsaa,'.');
423 gpertea 17 hasStop=false;
424 gpertea 6 if (p!=NULL) {
425     if (p-cdsaa>=aalen-2) { //stop found as the last codon
426     *p='0';//remove it
427     hasStop=true;
428     if (aalen-2==p-cdsaa) {
429     //previous to last codon is the stop codon
430     //so correct the CDS stop accordingly
431 gpertea 17 adjust_stopcodon(gffrec,-3, &seglst);
432     stopCodonAdjust=0; //clear artificial stop adjustment
433 gpertea 6 seqlen-=3;
434     cdsnt[seqlen]=0;
435     }
436     aalen=p-cdsaa;
437     }
438     else {//stop found before the last codon
439     trprint=false;
440     }
441     }//stop codon found
442     if (trprint==false) { //failed CDS validity check
443     //in-frame stop codon found
444     if (altPhases && phaseNum<3) {
445     phaseNum++;
446 gpertea 17 gffrec.CDphase = '0'+((mCDphase+phaseNum)%3);
447 gpertea 6 GFREE(cdsaa);
448     goto CDS_CHECK;
449     }
450 gpertea 17 if (gffrec.exons.Count()==1 && bothStrands) {
451 gpertea 6 strandNum++;
452     phaseNum=0;
453     if (strandNum<2) {
454     GFREE(cdsaa);
455 gpertea 17 gffrec.strand = (gffrec.strand=='-') ? '+':'-';
456 gpertea 6 goto CDS_CHECK; //repeat the CDS check for a different frame
457     }
458     }
459 gpertea 17 if (verbose) GMessage("In-frame STOP found for '%s'\n",gffrec.getID());
460 gpertea 6 } //has in-frame STOP
461     if (fullCDSonly) {
462     if (!hasStop || cdsaa[0]!='M') trprint=false;
463     }
464     } // CDS check requested
465     } //translation or codon check/output was requested
466     if (!trprint) {
467     GFREE(cdsnt);
468     GFREE(cdsaa);
469 gpertea 17 return false;
470 gpertea 6 }
471 gpertea 17 if (stopCodonAdjust>0 && !hasStop) {
472     //restore stop codon location
473     adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst);
474     if (cdsnt!=NULL && seqlen>0) {
475     seqlen-=stopCodonAdjust;
476     cdsnt[seqlen]=0;
477     }
478     if (cdsaa!=NULL) aalen--;
479     }
480    
481 gpertea 6 if (f_y!=NULL) { //CDS translation fasta output requested
482     //char*
483     if (cdsaa==NULL) { //translate now if not done before
484     cdsaa=translateDNA(cdsnt, aalen, seqlen);
485     }
486 gpertea 17 if (fullattr && gffrec.attrs!=NULL) {
487 gpertea 6 //append all attributes found for each transcripts
488 gpertea 17 for (int i=0;i<gffrec.attrs->Count();i++) {
489 gpertea 6 defline.append(" ");
490 gpertea 17 defline.append(gffrec.getAttrName(i));
491 gpertea 6 defline.append("=");
492 gpertea 17 defline.append(gffrec.getAttrValue(i));
493 gpertea 6 }
494     }
495     printFasta(f_y, defline, cdsaa, aalen);
496     }
497     if (f_x!=NULL) { //CDS only
498     if (writeExonSegs) {
499     defline.append(" loc:");
500 gpertea 17 defline.append(gffrec.getGSeqName());
501     defline.appendfmt("(%c)",gffrec.strand);
502 gpertea 6 //warning: not CDS coordinates are written here, but the exon ones
503 gpertea 17 defline+=(int)gffrec.start;
504 gpertea 6 defline+=(char)'-';
505 gpertea 17 defline+=(int)gffrec.end;
506 gpertea 6 // -- here these are CDS substring coordinates on the spliced sequence:
507     defline.append(" segs:");
508     for (int i=0;i<seglst.Count();i++) {
509     if (i>0) defline.append(",");
510     defline+=(int)seglst[i]->start;
511     defline.append("-");
512     defline+=(int)seglst[i]->end;
513     }
514     }
515 gpertea 17 if (fullattr && gffrec.attrs!=NULL) {
516 gpertea 6 //append all attributes found for each transcript
517 gpertea 17 for (int i=0;i<gffrec.attrs->Count();i++) {
518 gpertea 6 defline.append(" ");
519 gpertea 17 defline.append(gffrec.getAttrName(i));
520 gpertea 6 defline.append("=");
521 gpertea 17 defline.append(gffrec.getAttrValue(i));
522 gpertea 6 }
523     }
524     printFasta(f_x, defline, cdsnt, seqlen);
525     }
526     GFREE(cdsnt);
527     GFREE(cdsaa);
528     if (f_w!=NULL) { //write spliced exons
529     uint cds_start=0;
530     uint cds_end=0;
531     seglst.Clear();
532 gpertea 17 char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
533 gpertea 6 if (exont!=NULL) {
534 gpertea 17 if (gffrec.CDstart>0) {
535 gpertea 6 defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
536     }
537     if (writeExonSegs) {
538     defline.append(" loc:");
539 gpertea 17 defline.append(gffrec.getGSeqName());
540 gpertea 6 defline+=(char)'|';
541 gpertea 17 defline+=(int)gffrec.start;
542 gpertea 6 defline+=(char)'-';
543 gpertea 17 defline+=(int)gffrec.end;
544 gpertea 6 defline+=(char)'|';
545 gpertea 17 defline+=(char)gffrec.strand;
546 gpertea 6 defline.append(" exons:");
547 gpertea 17 for (int i=0;i<gffrec.exons.Count();i++) {
548 gpertea 6 if (i>0) defline.append(",");
549 gpertea 17 defline+=(int)gffrec.exons[i]->start;
550 gpertea 6 defline.append("-");
551 gpertea 17 defline+=(int)gffrec.exons[i]->end;
552 gpertea 6 }
553     defline.append(" segs:");
554     for (int i=0;i<seglst.Count();i++) {
555     if (i>0) defline.append(",");
556     defline+=(int)seglst[i]->start;
557     defline.append("-");
558     defline+=(int)seglst[i]->end;
559     }
560     }
561 gpertea 17 if (fullattr && gffrec.attrs!=NULL) {
562 gpertea 6 //append all attributes found for each transcripts
563 gpertea 17 for (int i=0;i<gffrec.attrs->Count();i++) {
564 gpertea 6 defline.append(" ");
565 gpertea 17 defline.append(gffrec.getAttrName(i));
566 gpertea 6 defline.append("=");
567 gpertea 17 defline.append(gffrec.getAttrValue(i));
568 gpertea 6 }
569     }
570     printFasta(f_w, defline, exont, seqlen);
571     GFREE(exont);
572     }
573     } //writing f_w (spliced exons)
574 gpertea 17 return true;
575 gpertea 6 }
576    
577     void openfw(FILE* &f, GArgs& args, char opt) {
578     GStr s=args.getOpt(opt);
579     if (!s.is_empty()) {
580     if (s=='-')
581     f=stdout;
582     else {
583     f=fopen(s,"w");
584     if (f==NULL) GError("Error creating file: %s\n", s.chars());
585     }
586     }
587     }
588    
589     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
590     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
591    
592 gpertea 17 void printGff3Header(FILE* f, GArgs& args) {
593     fprintf(f, "# ");
594     args.printCmdLine(f);
595     fprintf(f, "##gff-version 3\n");
596     //for (int i=0;i<gseqdata.Count();i++) {
597     //
598     //}
599     }
600    
601     bool validateGffRec(GffObj* gffrec, GList<GffObj>* gfnew) {
602     if (reftbl.Count()>0) {
603     GStr refname(gffrec->getRefName());
604     RefTran* rt=reftbl.Find(refname.chars());
605     if (rt==NULL && refname[-2]=='.' && isdigit(refname[-1])) {
606     //try removing the version
607     refname.cut(-2);
608     //GMessage("[DEBUG] Trying ref name '%s'...\n", refname.chars());
609     rt=reftbl.Find(refname.chars());
610     }
611     if (rt) {
612     gffrec->setRefName(rt->new_name);
613     }
614     else return false; //discard, ref seq not in the given translation table
615     }
616     if (mRNAOnly && gffrec->isDiscarded()) {
617     //discard generic "gene" or "locus" features with no other detailed subfeatures
618     //GMessage("Warning: discarding %s GFF generic gene/locus container %s\n",m->getID());
619     return false;
620     }
621     /*
622     if (gffrec->exons.Count()==0 && gffrec->children.Count()==0)) {
623     //a non-mRNA feature with no subfeatures
624     //just so we get some sequence functions working, add a dummy "exon"-like subfeature here
625     //--this could be a single "pseudogene" entry or another genomic region without exons
626     //
627     gffrec->addExon(gffrec->start,gffrec->end);
628     }
629     */
630     if (rfltGSeq!=NULL) { //filter by gseqName
631     if (strcmp(gffrec->getGSeqName(),rfltGSeq)!=0) {
632     return false;
633     }
634     }
635     if (rfltStrand>0 && gffrec->strand !=rfltStrand) {
636     return false;
637     }
638     //check coordinates
639     if (rfltStart!=0 || rfltEnd!=MAX_UINT) {
640     if (rfltWithin) {
641     if (gffrec->start<rfltStart || gffrec->end>rfltEnd) {
642     return false;
643     }
644     }
645     else {
646     if (gffrec->start>rfltEnd || gffrec->end<rfltStart) {
647     return false;
648     }
649     }
650     }
651     if (multiExon && gffrec->exons.Count()<=1) {
652     return false;
653     }
654     if (wCDSonly && gffrec->CDstart==0) {
655     return false;
656     }
657     if (ensembl_convert && startsWith(gffrec->getID(), "ENS")) {
658     //keep track of chr|gene_id data -- coordinate range
659     char* geneid=gffrec->getGeneID();
660     if (geneid!=NULL) {
661     GeneInfo* ginfo=gene_ids.Find(geneid);
662     if (ginfo==NULL) {//first time seeing this gene ID
663     GeneInfo* geneinfo=new GeneInfo(gffrec, ensembl_convert);
664     gene_ids.Add(geneid, geneinfo);
665     if (gfnew!=NULL) gfnew->Add(geneinfo->gf);
666     }
667     else ginfo->update(gffrec);
668     }
669     }
670     return true;
671     }
672    
673    
674 gpertea 6 int main(int argc, char * const argv[]) {
675 gpertea 17 GArgs args(argc, argv,
676     "debug;merge;cluster-only;help;MINCOV=MINPID=hvOUNHWCVJMKQNSXTDAPRZFGLEm:g:i:r:s:t:a:b:o:w:x:y:d:");
677     args.printError(USAGE, true);
678     if (args.getOpt('h') || args.getOpt("help")) {
679     GMessage("%s",USAGE);
680     exit(1);
681     }
682     debugMode=(args.getOpt("debug")!=NULL);
683 gpertea 6 mRNAOnly=(args.getOpt('O')==NULL);
684 gpertea 17 //sortByLoc=(args.getOpt('S')!=NULL);
685 gpertea 6 addDescr=(args.getOpt('A')!=NULL);
686     verbose=(args.getOpt('v')!=NULL);
687     wCDSonly=(args.getOpt('C')!=NULL);
688     validCDSonly=(args.getOpt('V')!=NULL);
689     altPhases=(args.getOpt('H')!=NULL);
690     fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF
691     bothStrands=(args.getOpt('B')!=NULL);
692 gpertea 17 fullCDSonly=(args.getOpt('J')!=NULL);
693 gpertea 6 spliceCheck=(args.getOpt('N')!=NULL);
694 gpertea 17 bool matchAllIntrons=(args.getOpt('K')==NULL);
695     bool fuzzSpan=(args.getOpt('Q')!=NULL);
696     if (args.getOpt('M') || args.getOpt("merge")) {
697     doCluster=true;
698     doCollapseRedundant=true;
699     }
700     else {
701     if (!matchAllIntrons || fuzzSpan) {
702     GMessage("%s",USAGE);
703     GMessage("Error: -K or -Q options require -M/--merge option!\n");
704     exit(1);
705     }
706     }
707     if (args.getOpt("cluster-only")) {
708     doCluster=true;
709     doCollapseRedundant=false;
710     if (!matchAllIntrons || fuzzSpan) {
711     GMessage("%s",USAGE);
712     GMessage("Error: -K or -Q options have no effect with --cluster-only.\n");
713     exit(1);
714     }
715     }
716 gpertea 6 //protmap=(args.getOpt('P')!=NULL);
717     if (fullCDSonly) validCDSonly=true;
718 gpertea 17 if (verbose) {
719     fprintf(stderr, "Command line was:\n");
720     args.printCmdLine(stderr);
721     }
722    
723 gpertea 6 fullattr=(args.getOpt('F')!=NULL);
724     if (args.getOpt('G')==NULL)
725     noExonAttr=!fullattr;
726     else {
727     noExonAttr=true;
728     fullattr=true;
729     }
730 gpertea 17 ensembl_convert=(args.getOpt('L')!=NULL);
731     if (ensembl_convert) {
732     fullattr=true;
733     noExonAttr=false;
734     //sortByLoc=true;
735     }
736    
737 gpertea 6 mergeCloseExons=(args.getOpt('Z')!=NULL);
738     multiExon=(args.getOpt('U')!=NULL);
739     writeExonSegs=(args.getOpt('W')!=NULL);
740     tracklabel=args.getOpt('t');
741 gpertea 17 GFastaDb gfasta(args.getOpt('g'));
742     //if (gfasta.fastaPath!=NULL)
743     // sortByLoc=true; //enforce sorting by chromosome/contig
744 gpertea 6 GStr s=args.getOpt('i');
745     if (!s.is_empty()) maxintron=s.asInt();
746 gpertea 17
747     FILE* f_repl=NULL;
748     s=args.getOpt('d');
749     if (!s.is_empty()) {
750     if (s=="-") f_repl=stdout;
751     else {
752     f_repl=fopen(s.chars(), "w");
753     if (f_repl==NULL) GError("Error creating file %s\n", s.chars());
754     }
755     }
756    
757 gpertea 6 rfltWithin=(args.getOpt('R')!=NULL);
758     s=args.getOpt('r');
759     if (!s.is_empty()) {
760     s.trim();
761 gpertea 17 if (s[0]=='+' || s[0]=='-') {
762     rfltStrand=s[0];
763     s.cut(0,1);
764     }
765 gpertea 6 int isep=s.index(':');
766 gpertea 17 if (isep>0) { //gseq name given
767     if (rfltStrand==0 && (s[isep-1]=='+' || s[isep-1]=='-')) {
768     isep--;
769     rfltStrand=s[isep];
770     s.cut(isep,1);
771     }
772     if (isep>0)
773     rfltGSeq=Gstrdup((s.substr(0,isep)).chars());
774 gpertea 6 s.cut(0,isep+1);
775     }
776     GStr gsend;
777 gpertea 17 char slast=s[s.length()-1];
778     if (rfltStrand==0 && (slast=='+' || slast=='-')) {
779     s.chomp(slast);
780     rfltStrand=slast;
781     }
782 gpertea 6 if (s.index("..")>=0) gsend=s.split("..");
783 gpertea 17 else gsend=s.split('-');
784 gpertea 6 if (!s.is_empty()) rfltStart=(uint)s.asInt();
785 gpertea 17 if (!gsend.is_empty()) {
786     rfltEnd=(uint)gsend.asInt();
787     if (rfltEnd==0) rfltEnd=MAX_UINT;
788     }
789    
790 gpertea 6 } //gseq/range filtering
791     else {
792     if (rfltWithin)
793     GError("Error: option -R doesn't make sense without -r!\n");
794     }
795 gpertea 17 s=args.getOpt('m');
796     if (!s.is_empty()) {
797     FILE* ft=fopen(s,"r");
798     if (ft==NULL) GError("Error opening reference table: %s\n",s.chars());
799     loadRefTable(ft, reftbl);
800     fclose(ft);
801     }
802 gpertea 6 s=args.getOpt('s');
803     if (!s.is_empty()) {
804 gpertea 17 FILE* fsize=fopen(s,"r");
805     if (fsize==NULL) GError("Error opening info file: %s\n",s.chars());
806     loadSeqInfo(fsize, seqinfo);
807     fclose(fsize);
808     }
809    
810 gpertea 6 openfw(f_out, args, 'o');
811     //if (f_out==NULL) f_out=stdout;
812 gpertea 17 if (gfasta.fastaPath==NULL && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL))
813 gpertea 6 GError("Error: -g option is required for options -w, -x, -y, -V, -N, -M !\n");
814    
815     openfw(f_w, args, 'w');
816     openfw(f_x, args, 'x');
817     openfw(f_y, args, 'y');
818     if (f_y!=NULL || f_x!=NULL) wCDSonly=true;
819     //useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL);
820 gpertea 17
821     int numfiles = args.startNonOpt();
822     //GList<GffObj> gfkept(false,true); //unsorted, free items on delete
823     int out_counter=0; //number of records printed
824     while (true) {
825     GStr infile;
826     if (numfiles) {
827     infile=args.nextNonOpt();
828     if (infile.is_empty()) break;
829     if (infile=="-") { f_in=stdin; infile="stdin"; }
830     else
831     if ((f_in=fopen(infile, "r"))==NULL)
832     GError("Error: cannot open input file %s!\n",infile.chars());
833 gpertea 6 }
834 gpertea 17 else
835     infile="-";
836     GffLoader gffloader(infile.chars());
837     gffloader.transcriptsOnly=mRNAOnly;
838     gffloader.fullAttributes=fullattr;
839     gffloader.noExonAttrs=noExonAttr;
840     gffloader.mergeCloseExons=mergeCloseExons;
841     gffloader.showWarnings=(args.getOpt('E')!=NULL);
842     gffloader.load(g_data, &validateGffRec, doCluster, doCollapseRedundant,
843     matchAllIntrons, fuzzSpan);
844     if (doCluster)
845     collectLocusData(g_data);
846     if (numfiles==0) break;
847     }
848    
849     GStr loctrack("gffcl");
850     if (tracklabel) loctrack=tracklabel;
851     g_data.setSorted(&gseqCmpName);
852     if (doCluster) {
853     //grouped in loci
854     for (int g=0;g<g_data.Count();g++) {
855     GenomicSeqData* gdata=g_data[g];
856     for (int l=0;l<gdata->loci.Count();l++) {
857     GffLocus& loc=*(gdata->loci[l]);
858     //check all non-replaced transcripts in this locus:
859     int numvalid=0;
860     int idxfirstvalid=-1;
861     for (int i=0;i<loc.rnas.Count();i++) {
862     GffObj& t=*(loc.rnas[i]);
863     GTData* tdata=(GTData*)(t.uptr);
864     if (tdata->replaced_by!=NULL) {
865     if (f_repl && (t.udata & 8)==0) {
866     //t.udata|=8;
867     fprintf(f_repl, "%s", t.getID());
868     GTData* rby=tdata;
869     while (rby->replaced_by!=NULL) {
870     fprintf(f_repl," => %s", rby->replaced_by->getID());
871     rby->rna->udata|=8;
872     rby=(GTData*)(rby->replaced_by->uptr);
873     }
874     fprintf(f_repl, "\n");
875     }
876     continue;
877     }
878     if (process_transcript(gfasta, t)) {
879     t.udata|=4; //tag it as valid
880     numvalid++;
881     if (idxfirstvalid<0) idxfirstvalid=i;
882     }
883 gpertea 6 }
884 gpertea 17
885     if (f_out && numvalid>0) {
886     GStr locname("RLOC_");
887     locname.appendfmt("%08d",loc.locus_num);
888     if (!fmtGTF) {
889     if (out_counter==0)
890     printGff3Header(f_out, args);
891     fprintf(f_out,"%s\t%s\tlocus\t%d\t%d\t.\t%c\t.\tID=%s;locus=%s",
892     loc.rnas[0]->getGSeqName(), loctrack.chars(), loc.start, loc.end, loc.strand,
893     locname.chars(), locname.chars());
894     //const char* loc_gname=loc.getGeneName();
895     if (loc.gene_names.Count()>0) { //print all gene names associated to this locus
896     fprintf(f_out, ";genes=%s",loc.gene_names.First()->name.chars());
897     for (int i=1;i<loc.gene_names.Count();i++) {
898     fprintf(f_out, ",%s",loc.gene_names[i]->name.chars());
899     }
900     }
901     if (loc.gene_ids.Count()>0) { //print all GeneIDs names associated to this locus
902     fprintf(f_out, ";geneIDs=%s",loc.gene_ids.First()->name.chars());
903     for (int i=1;i<loc.gene_ids.Count();i++) {
904     fprintf(f_out, ",%s",loc.gene_ids[i]->name.chars());
905     }
906     }
907     fprintf(f_out, ";transcripts=%s",loc.rnas[idxfirstvalid]->getID());
908     for (int i=idxfirstvalid+1;i<loc.rnas.Count();i++) {
909     fprintf(f_out, ",%s",loc.rnas[i]->getID());
910     }
911     fprintf(f_out, "\n");
912     }
913     //now print all valid, non-replaced transcripts in this locus:
914     for (int i=0;i<loc.rnas.Count();i++) {
915     GffObj& t=*(loc.rnas[i]);
916     GTData* tdata=(GTData*)(t.uptr);
917     if (tdata->replaced_by!=NULL || ((t.udata & 4)==0)) continue;
918     t.addAttr("locus", locname.chars());
919     out_counter++;
920     if (fmtGTF) t.printGtf(f_out, tracklabel);
921     else {
922     //print the parent first, if any
923     if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
924     GTData* pdata=(GTData*)(t.parent->uptr);
925     if (pdata->geneinfo!=NULL)
926     pdata->geneinfo->finalize();
927     t.parent->addAttr("locus", locname.chars());
928     t.parent->printGff(f_out, tracklabel);
929     t.parent->udata|=4;
930     }
931     t.printGff(f_out, tracklabel);
932     }
933     }
934     } //have valid transcripts to print
935     }//for each locus
936     if (f_out && !mRNAOnly) {
937     //final pass through the non-transcripts, in case any of them were not printed
938     //TODO: order broken, these should be interspersed among the rnas in the correct order!
939     for (int m=0;m<gdata->gfs.Count();m++) {
940     GffObj& t=*(gdata->gfs[m]);
941     if ((t.udata&4)==0) { //never printed
942     t.udata|=4;
943     if (fmtGTF) t.printGtf(f_out, tracklabel);
944     else t.printGff(f_out, tracklabel);
945     }
946     } //for each non-transcript
947 gpertea 6 }
948 gpertea 17 } //for each genomic sequence
949     }
950     else {
951     //not grouped into loci, print the rnas with their parents, if any
952     int numvalid=0;
953     for (int g=0;g<g_data.Count();g++) {
954     GenomicSeqData* gdata=g_data[g];
955     for (int m=0;m<gdata->rnas.Count();m++) {
956     GffObj& t=*(gdata->rnas[m]);
957     GTData* tdata=(GTData*)(t.uptr);
958     if (tdata->replaced_by!=NULL) continue;
959     if (process_transcript(gfasta, t)) {
960     t.udata|=4; //tag it as valid
961     numvalid++;
962     if (f_out) {
963     if (tdata->geneinfo) tdata->geneinfo->finalize();
964     out_counter++;
965     if (fmtGTF) t.printGtf(f_out, tracklabel);
966     else {
967     if (out_counter==1)
968     printGff3Header(f_out, args);
969     //print the parent first, if any
970     if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
971     GTData* pdata=(GTData*)(t.parent->uptr);
972     if (pdata->geneinfo!=NULL)
973     pdata->geneinfo->finalize();
974     t.parent->printGff(f_out, tracklabel);
975     t.parent->udata|=4;
976     }
977     t.printGff(f_out, tracklabel);
978     }
979     }//GFF/GTF output requested
980     } //valid transcript
981     } //for each rna
982     if (f_out && !mRNAOnly) {
983     //final pass through the non-transcripts, in case any of them were not printed
984     //TODO: order broken, these should be interspersed among the rnas in the correct order!
985     for (int m=0;m<gdata->gfs.Count();m++) {
986     GffObj& t=*(gdata->gfs[m]);
987     if ((t.udata&4)==0) { //never printed
988     t.udata|=4;
989     if (fmtGTF) t.printGtf(f_out, tracklabel);
990     else t.printGff(f_out, tracklabel);
991     }
992     } //for each non-transcript
993 gpertea 6 }
994 gpertea 17 } //for each genomic seq
995 gpertea 6 }
996 gpertea 17 if (f_repl && f_repl!=stdout) fclose(f_repl);
997 gpertea 6 seqinfo.Clear();
998 gpertea 17 //if (faseq!=NULL) delete faseq;
999     //if (gcdb!=NULL) delete gcdb;
1000 gpertea 6 GFREE(rfltGSeq);
1001     FRCLOSE(f_in);
1002     FWCLOSE(f_out);
1003     FWCLOSE(f_w);
1004     FWCLOSE(f_x);
1005     FWCLOSE(f_y);
1006     }