ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gffread/gffread.cpp
Revision: 116
Committed: Mon Nov 7 21:25:56 2011 UTC (8 years, 1 month ago) by gpertea
File size: 35749 byte(s)
Log Message:
Ensembl conversion corrections

Line User Rev File contents
1 gpertea 17 #include "gff_utils.h"
2 gpertea 6 #include "GArgs.h"
3     #include <ctype.h>
4 gpertea 17 // don't care about cdb compression
5     //#ifdef ENABLE_COMPRESSION
6     //#undef ENABLE_COMPRESSION
7     //#endif
8     //#include "GCdbYank.h"
9 gpertea 6
10     #define USAGE "Usage:\n\
11 gpertea 17 gffread <input_gff> [-g <genomic_seqs_fasta> | <dir>][-s <seq_info.fsize>] \n\
12 gpertea 42 [-o <outfile.gff>] [-t <tname>] [-r [[<strand>]<chr>:]<start>..<end> [-R]]\n\
13 gpertea 43 [-CTVNJMKQAFGUBHZWTOLE] [-w <exons.fa>] [-x <cds.fa>] [-y <tr_cds.fa>]\n\
14 gpertea 17 [-i <maxintron>] \n\
15     Filters and/or converts GFF3/GTF2 records.\n\
16     <input_gff> is a GFF file, use '-' if the GFF records will be given at stdin\n\
17 gpertea 6 \n\
18     Options:\n\
19 gpertea 17 -g full path to a multi-fasta file with the genomic sequences\n\
20     for all input mappings, OR a directory with single-fasta files\n\
21     (one per genomic sequence, with file names matching sequence names)\n\
22     -s <seq_info.fsize> is a tab-delimited file providing this info\n\
23 gpertea 6 for each of the mapped sequences:\n\
24     <seq-name> <seq-length> <seq-description>\n\
25 gpertea 17 (useful for -A option with mRNA/EST/protein mappings)\n\
26 gpertea 6 -i discard transcripts having an intron larger than <maxintron>\n\
27 gpertea 42 -r only show transcripts overlapping coordinate range <start>..<end>\n\
28 gpertea 17 (on chromosome/contig <chr>, strand <strand> if provided)\n\
29 gpertea 6 -R for -r option, discard all transcripts that are not fully \n\
30 gpertea 42 contained within the given range\n\
31 gpertea 6 -U discard single-exon transcripts\n\
32 gpertea 17 -C coding only: discard mRNAs that have no CDS feature\n\
33     -F full GFF attribute preservation (all attributes are shown)\n\
34 gpertea 6 -G only parse additional exon attributes from the first exon\n\
35     and move them to the mRNA level (useful for GTF input)\n\
36 gpertea 17 -A use the description field from <seq_info.fsize> and add it\n\
37     as the value for a 'descr' attribute to the GFF record\n\
38 gpertea 6 \n\
39 gpertea 17 -O process also non-transcript GFF records (by default non-transcript\n\
40     records are ignored)\n\
41 gpertea 6 -V discard any mRNAs with CDS having in-frame stop codons\n\
42     -H for -V option, check and adjust the starting CDS phase\n\
43     if the original phase leads to a translation with an \n\
44     in-frame stop codon\n\
45     -B for -V option, single-exon transcripts are also checked on the\n\
46     opposite strand\n\
47 gpertea 17 -N discard multi-exon mRNAs that have any intron with a non-canonical\n\
48     splice site consensus (i.e. not GT-AG, GC-AG or AT-AC)\n\
49     -J discard any mRNAs that either lack initial START codon\n\
50 gpertea 6 or the terminal STOP codon, or have an in-frame stop codon\n\
51     (only print mRNAs with a fulll, valid CDS)\n\
52     \n\
53 gpertea 17 -M/--merge : cluster the input transcripts into loci, collapsing matching\n\
54     transcripts (those with the same exact introns and fully contained)\n\
55 gpertea 38 -d <dupinfo> : for -M option, write collapsing info to file <dupinfo>\n\
56 gpertea 17 --cluster-only: same as --merge but without collapsing matching transcripts\n\
57     -K for -M option: also collapse shorter, fully contained transcripts\n\
58     with fewer introns than the container\n\
59     -Q for -M option, remove the containment restriction:\n\
60     (multi-exon transcripts will be collapsed if just their introns match,\n\
61     while single-exon transcripts can partially overlap (80%))\n\
62     \n\
63     -E expose (warn about) duplicate transcript IDs and other potential \n\
64     problems with the given GFF/GTF records\n\
65     -Z merge close exons into a single exon (for intron size<4)\n\
66     -w write a fasta file with spliced exons for each GFF transcript\n\
67     -x write a fasta file with spliced CDS for each GFF transcript\n\
68     -W for -w and -x options, also write for each fasta record the exon\n\
69 gpertea 6 coordinates projected onto the spliced sequence\n\
70 gpertea 17 -y write a protein fasta file with the translation of CDS for each record\n\
71     -L Ensembl GTF to GFF3 conversion (implies -F; should be used with -m)\n\
72     -m <chr_replace> is a reference (genomic) sequence replacement table with\n\
73     this format:\n\
74     <original_ref_ID> <new_ref_ID>\n\
75     GFF records on reference sequences that are not found among the\n\
76     <original_ref_ID> entries in this file will be filtered out\n\
77     -o the \"filtered\" GFF records will be written to <outfile.gff>\n\
78     (use -o- for printing to stdout)\n\
79     -t use <trackname> in the second column of each GFF output line\n\
80     -T -o option will output GTF format instead of GFF3\n\
81     "
82 gpertea 6
83 gpertea 17
84     class SeqInfo { //populated from the -s option of gffread
85     public:
86     int len;
87     char* descr;
88     SeqInfo( int l, char* s) {
89     len=l;
90     if (s==NULL) {
91     descr=NULL;
92     } else {
93     descr=Gstrdup(s);
94     }
95     }
96     ~SeqInfo() {
97     GFREE(descr);
98     }
99     };
100    
101     class RefTran {
102     public:
103     char* new_name;
104     RefTran(char *ns) {
105     new_name=NULL;
106     if (ns!=NULL)
107     new_name=Gstrdup(ns);
108     }
109     ~RefTran() {
110     GFREE(new_name);
111     }
112     };
113    
114 gpertea 6 FILE* ffasta=NULL;
115     FILE* f_in=NULL;
116     FILE* f_out=NULL;
117     FILE* f_w=NULL; //fasta with spliced exons (transcripts)
118     FILE* f_x=NULL; //fasta with spliced CDS
119     FILE* f_y=NULL; //fasta with translated CDS
120     bool wCDSonly=false;
121    
122     bool validCDSonly=false; // translation with no in-frame STOP
123     bool bothStrands=false; //for single-exon mRNA validation, check the other strand too
124     bool altPhases=false; //if original phase fails translation validation,
125     //try the other 2 phases until one makes it
126     bool mRNAOnly=true;
127     bool spliceCheck=false; //only known splice-sites
128    
129     bool fullCDSonly=false; // starts with START, ends with STOP codon
130     bool fullattr=false;
131 gpertea 17 //bool sortByLoc=false; // if the GFF output should be sorted by location
132 gpertea 116 bool ensembl_convert=false; //-L, assist in converting Ensembl GTF to GFF3
133 gpertea 17
134    
135     //GStr gseqpath;
136     //GStr gcdbfa;
137     //bool multiGSeq=false; //if a directory or a .cidx file was given to -g option
138     //GFaSeqGet* faseq=NULL;
139     //GCdbYank* gcdb=NULL;
140     //int gseq_id=-1; //current genome sequence ID -- the current GffObj::gseq_id
141 gpertea 6 bool fmtGTF=false;
142     bool addDescr=false;
143     //bool protmap=false;
144     bool multiExon=false;
145     bool writeExonSegs=false;
146     char* tracklabel=NULL;
147     int maxintron=999000000;
148     bool mergeCloseExons=false;
149     //range filter:
150     char* rfltGSeq=NULL;
151 gpertea 17 char rfltStrand=0;
152 gpertea 6 uint rfltStart=0;
153     uint rfltEnd=MAX_UINT;
154     bool rfltWithin=false; //check for full containment within given range
155     bool noExonAttr=false;
156    
157 gpertea 17 bool doCluster=false;
158     bool doCollapseRedundant=false;
159 gpertea 6
160 gpertea 17 GList<GenomicSeqData> g_data(true,true,true); //list of GFF records by genomic seq
161 gpertea 6
162     //hash with sequence info
163     GHash<SeqInfo> seqinfo;
164     GHash<int> isoCounter; //counts the valid isoforms
165 gpertea 17 GHash<RefTran> reftbl;
166     GHash<GeneInfo> gene_ids;
167     //min-max gene span associated to chr|gene_id (mostly for Ensembl conversion)
168 gpertea 6
169     bool debugMode=false;
170     bool verbose=false;
171    
172 gpertea 17 void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) {
173     GLineReader fr(f);
174     while (!fr.isEof()) {
175     char* line=fr.getLine();
176     if (line==NULL) break;
177     char* id=line;
178     char* lenstr=NULL;
179     char* text=NULL;
180     char* p=line;
181     while (*p!=0 && !isspace(*p)) p++;
182     if (*p==0) continue;
183     *p=0;p++;
184     while (*p==' ' || *p=='\t') p++;
185     if (*p==0) continue;
186     lenstr=p;
187     while (*p!=0 && !isspace(*p)) p++;
188     if (*p!=0) { *p=0;p++; }
189     while (*p==' ' || *p=='\t') p++;
190     if (*p!=0) text=p; //else text remains NULL
191     int len=0;
192     if (!parseInt(lenstr,len)) {
193     GMessage("Warning: could not parse sequence length: %s %s\n",
194     id, lenstr);
195     continue;
196     }
197     // --- here we have finished parsing the line
198     si.Add(id, new SeqInfo(len,text));
199     } //while lines
200     }
201    
202     void loadRefTable(FILE* f, GHash<RefTran>& rt) {
203     GLineReader fr(f);
204     char* line=NULL;
205     while ((line=fr.getLine())) {
206     char* orig_id=line;
207     char* p=line;
208     while (*p!=0 && !isspace(*p)) p++;
209     if (*p==0) continue;
210     *p=0;p++;//split the line here
211     while (*p==' ' || *p=='\t') p++;
212     if (*p==0) continue;
213     rt.Add(orig_id, new RefTran(p));
214     } //while lines
215     }
216    
217 gpertea 6 char* getSeqDescr(char* seqid) {
218     static char charbuf[128];
219     if (seqinfo.Count()==0) return NULL;
220     char* suf=rstrchr(seqid, '.');
221     if (suf!=NULL) *suf=0;
222     SeqInfo* seqd=seqinfo.Find(seqid);
223     if (suf!=NULL) *suf='.';
224     if (seqd!=NULL) {
225     GStr s(seqd->descr);
226     //cleanup some Uniref gunk
227     if (s[0]=='[') {
228     int r=s.index(']');
229     if (r>=0 && r<8 && isdigit(s[1]))
230     s.remove(0,r+1);
231     }
232     if (s.length()>80) {
233     int r=s.index(';');
234     if (r>5) s.cut(r);
235     }
236     if (s.length()>127) {
237     s.cut(127);
238     int r=s.rindex(' ');
239     if (r>0) s.cut(r);
240     }
241     strcpy(charbuf, s.chars());
242     return charbuf;
243     }
244     else return NULL;
245     }
246    
247     char* getSeqName(char* seqid) {
248     static char charbuf[128];
249     char* suf=rstrchr(seqid, '.');
250     if (suf!=NULL) *suf=0;
251     strcpy(charbuf, seqid);
252     if (suf!=NULL) *suf='.';
253     return charbuf;
254     }
255    
256 gpertea 17 GFaSeqGet* fastaSeqGet(GFastaDb& gfasta, GffObj& gffrec) {
257     if (gfasta.fastaPath==NULL) return NULL;
258     return gfasta.fetch(gffrec.gseq_id);
259 gpertea 6 }
260    
261    
262 gpertea 17 int adjust_stopcodon(GffObj& gffrec, int adj, GList<GSeg>* seglst=NULL) {
263 gpertea 6 //adj>0 => extedn CDS, adj<0 => shrink CDS
264     //when CDS is expanded, exons have to be checked too and
265     // expanded accordingly if they had the same boundary
266 gpertea 17 int realadj=0;
267     if (gffrec.strand=='-') {
268     if ((int)gffrec.CDstart>adj) {
269    
270     gffrec.CDstart-=adj;
271     realadj=adj;
272     if (adj<0) { //restore
273     if (gffrec.exons.First()->start==gffrec.CDstart+adj) {
274     gffrec.exons.First()->start-=adj;
275     gffrec.start=gffrec.exons.First()->start;
276     gffrec.covlen+=adj;
277 gpertea 6 }
278 gpertea 17 }
279     else if (gffrec.exons.First()->start>=gffrec.CDstart) {
280     gffrec.exons.First()->start-=adj;
281     gffrec.start=gffrec.exons.First()->start;
282     gffrec.covlen+=adj;
283     }
284 gpertea 6 }
285     }
286     else {
287 gpertea 17 realadj=adj;
288     gffrec.CDend+=adj;
289     if (adj<0) {//restore
290     if (gffrec.exons.Last()->end==gffrec.CDend-adj) {
291     gffrec.exons.Last()->end+=adj;
292     gffrec.end=gffrec.exons.Last()->end;
293     gffrec.covlen+=adj;
294     }
295     }
296     else if (gffrec.exons.Last()->end<=gffrec.CDend) {
297     gffrec.exons.Last()->end+=adj;
298     gffrec.end=gffrec.exons.Last()->end;
299     gffrec.covlen+=adj;
300 gpertea 6 }
301     }
302     if (seglst!=NULL) seglst->Last()->end+=adj;
303 gpertea 17 return realadj;
304 gpertea 6 }
305    
306 gpertea 17 bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
307     //returns true if the transcript passed the filter
308     char* gname=gffrec.getGeneName();
309     if (gname==NULL) gname=gffrec.getGeneID();
310     GStr defline(gffrec.getID());
311     if (f_out && !fmtGTF) {
312 gpertea 116 const char* tname=NULL;
313     if ((tname=gffrec.getAttr("transcript_name"))!=NULL) {
314     gffrec.addAttr("Name", tname);
315 gpertea 17 gffrec.removeAttr("transcript_name");
316     }
317     }
318     if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) {
319 gpertea 116 const char* biotype=gffrec.getAttr("gene_biotype");
320     if (biotype) {
321     gffrec.addAttr("type", biotype);
322     gffrec.removeAttr("gene_biotype");
323     }
324     else { //old Ensembl files lacking gene_biotype
325     gffrec.addAttr("type", gffrec.getTrackName());
326     }
327    
328 gpertea 17 //bool is_gene=false;
329     bool is_pseudo=false;
330 gpertea 116 if (strcmp(biotype, "protein_coding")==0 || gffrec.hasCDS())
331 gpertea 17 gffrec.setFeatureName("mRNA");
332     else {
333 gpertea 116 if (strcmp(biotype, "processed_transcript")==0)
334 gpertea 17 gffrec.setFeatureName("proc_RNA");
335     else {
336 gpertea 116 //is_gene=endsWith(biotype, "gene");
337     is_pseudo=strifind(biotype, "pseudo");
338 gpertea 17 if (is_pseudo) {
339     gffrec.setFeatureName("pseudo_RNA");
340     }
341 gpertea 116 else if (endsWith(biotype, "RNA")) {
342     gffrec.setFeatureName(biotype);
343 gpertea 17 } else gffrec.setFeatureName("misc_RNA");
344     }
345     }
346 gpertea 6 }
347 gpertea 17 if (gname && strcmp(gname, gffrec.getID())!=0) {
348     int* isonum=isoCounter.Find(gname);
349     if (isonum==NULL) {
350     isonum=new int(1);
351     isoCounter.Add(gname,isonum);
352     }
353     else (*isonum)++;
354     defline.appendfmt(" gene=%s", gname);
355     }
356 gpertea 6 int seqlen=0;
357 gpertea 17
358 gpertea 6 const char* tlabel=tracklabel;
359 gpertea 17 if (tlabel==NULL) tlabel=gffrec.getTrackName();
360 gpertea 6 //defline.appendfmt(" track:%s",tlabel);
361     char* cdsnt = NULL;
362     char* cdsaa = NULL;
363     int aalen=0;
364 gpertea 17 for (int i=1;i<gffrec.exons.Count();i++) {
365     int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1;
366     if (ilen>4000000)
367 gpertea 6 GMessage("Warning: very large intron (%d) for transcript %s\n",
368 gpertea 17 ilen, gffrec.getID());
369 gpertea 6 if (ilen>maxintron) {
370 gpertea 17 return false;
371 gpertea 6 }
372     }
373     GList<GSeg> seglst(false,true);
374 gpertea 17 GFaSeqGet* faseq=fastaSeqGet(gfasta, gffrec);
375     if (spliceCheck && gffrec.exons.Count()>1) {
376 gpertea 6 //check introns for splice site consensi ( GT-AG, GC-AG or AT-AC )
377     if (faseq==NULL) GError("Error: no genomic sequence available!\n");
378 gpertea 17 int glen=gffrec.end-gffrec.start+1;
379     const char* gseq=faseq->subseq(gffrec.start, glen);
380     bool revcompl=(gffrec.strand=='-');
381 gpertea 6 bool ssValid=true;
382 gpertea 17 for (int e=1;e<gffrec.exons.Count();e++) {
383     const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start;
384     int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1;
385 gpertea 6 GSpliceSite acceptorSite(intron,intronlen,true, revcompl);
386     GSpliceSite donorSite(intron,intronlen, false, revcompl);
387     //GMessage("%c intron %d-%d : %s .. %s\n",
388 gpertea 17 // gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt);
389 gpertea 6 if (acceptorSite=="AG") { // GT-AG or GC-AG
390     if (!donorSite.canonicalDonor()) {
391     ssValid=false;break;
392     }
393     }
394     else if (acceptorSite=="AC") { //
395     if (donorSite!="AT") { ssValid=false; break; }
396     }
397     else { ssValid=false; break; }
398     }
399     //GFREE(gseq);
400     if (!ssValid) {
401     if (verbose)
402 gpertea 17 GMessage("Invalid splice sites found for '%s'\n",gffrec.getID());
403     return false; //don't print this one!
404 gpertea 6 }
405     }
406    
407     bool trprint=true;
408 gpertea 17 int stopCodonAdjust=0;
409 gpertea 6 int mCDphase=0;
410 gpertea 17 bool hasStop=false;
411     if (gffrec.CDphase=='1' || gffrec.CDphase=='2')
412     mCDphase = gffrec.CDphase-'0';
413 gpertea 6 if (f_y!=NULL || f_x!=NULL || validCDSonly) {
414     if (faseq==NULL) GError("Error: no genomic sequence provided!\n");
415     //if (protmap && fullCDSonly) {
416 gpertea 17 //if (protmap && (fullCDSonly || (gffrec.qlen>0 && gffrec.qend==gffrec.qlen))) {
417    
418 gpertea 6 if (validCDSonly) { //make sure the stop codon is always included
419 gpertea 17 //adjust_stopcodon(gffrec,3);
420     stopCodonAdjust=adjust_stopcodon(gffrec,3);
421 gpertea 6 }
422     int strandNum=0;
423     int phaseNum=0;
424     CDS_CHECK:
425 gpertea 17 cdsnt=gffrec.getSpliced(faseq, true, &seqlen,NULL,NULL,&seglst);
426 gpertea 6 if (cdsnt==NULL) trprint=false;
427     if (validCDSonly) {
428     cdsaa=translateDNA(cdsnt, aalen, seqlen);
429     char* p=strchr(cdsaa,'.');
430 gpertea 17 hasStop=false;
431 gpertea 6 if (p!=NULL) {
432     if (p-cdsaa>=aalen-2) { //stop found as the last codon
433     *p='0';//remove it
434     hasStop=true;
435     if (aalen-2==p-cdsaa) {
436     //previous to last codon is the stop codon
437     //so correct the CDS stop accordingly
438 gpertea 17 adjust_stopcodon(gffrec,-3, &seglst);
439     stopCodonAdjust=0; //clear artificial stop adjustment
440 gpertea 6 seqlen-=3;
441     cdsnt[seqlen]=0;
442     }
443     aalen=p-cdsaa;
444     }
445     else {//stop found before the last codon
446     trprint=false;
447     }
448     }//stop codon found
449     if (trprint==false) { //failed CDS validity check
450     //in-frame stop codon found
451     if (altPhases && phaseNum<3) {
452     phaseNum++;
453 gpertea 17 gffrec.CDphase = '0'+((mCDphase+phaseNum)%3);
454 gpertea 6 GFREE(cdsaa);
455     goto CDS_CHECK;
456     }
457 gpertea 17 if (gffrec.exons.Count()==1 && bothStrands) {
458 gpertea 6 strandNum++;
459     phaseNum=0;
460     if (strandNum<2) {
461     GFREE(cdsaa);
462 gpertea 17 gffrec.strand = (gffrec.strand=='-') ? '+':'-';
463 gpertea 6 goto CDS_CHECK; //repeat the CDS check for a different frame
464     }
465     }
466 gpertea 17 if (verbose) GMessage("In-frame STOP found for '%s'\n",gffrec.getID());
467 gpertea 6 } //has in-frame STOP
468     if (fullCDSonly) {
469     if (!hasStop || cdsaa[0]!='M') trprint=false;
470     }
471     } // CDS check requested
472     } //translation or codon check/output was requested
473     if (!trprint) {
474     GFREE(cdsnt);
475     GFREE(cdsaa);
476 gpertea 17 return false;
477 gpertea 6 }
478 gpertea 17 if (stopCodonAdjust>0 && !hasStop) {
479     //restore stop codon location
480     adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst);
481     if (cdsnt!=NULL && seqlen>0) {
482     seqlen-=stopCodonAdjust;
483     cdsnt[seqlen]=0;
484     }
485     if (cdsaa!=NULL) aalen--;
486     }
487    
488 gpertea 6 if (f_y!=NULL) { //CDS translation fasta output requested
489     //char*
490     if (cdsaa==NULL) { //translate now if not done before
491     cdsaa=translateDNA(cdsnt, aalen, seqlen);
492     }
493 gpertea 17 if (fullattr && gffrec.attrs!=NULL) {
494 gpertea 6 //append all attributes found for each transcripts
495 gpertea 17 for (int i=0;i<gffrec.attrs->Count();i++) {
496 gpertea 6 defline.append(" ");
497 gpertea 17 defline.append(gffrec.getAttrName(i));
498 gpertea 6 defline.append("=");
499 gpertea 17 defline.append(gffrec.getAttrValue(i));
500 gpertea 6 }
501     }
502     printFasta(f_y, defline, cdsaa, aalen);
503     }
504     if (f_x!=NULL) { //CDS only
505     if (writeExonSegs) {
506     defline.append(" loc:");
507 gpertea 17 defline.append(gffrec.getGSeqName());
508     defline.appendfmt("(%c)",gffrec.strand);
509 gpertea 6 //warning: not CDS coordinates are written here, but the exon ones
510 gpertea 17 defline+=(int)gffrec.start;
511 gpertea 6 defline+=(char)'-';
512 gpertea 17 defline+=(int)gffrec.end;
513 gpertea 6 // -- here these are CDS substring coordinates on the spliced sequence:
514     defline.append(" segs:");
515     for (int i=0;i<seglst.Count();i++) {
516     if (i>0) defline.append(",");
517     defline+=(int)seglst[i]->start;
518     defline.append("-");
519     defline+=(int)seglst[i]->end;
520     }
521     }
522 gpertea 17 if (fullattr && gffrec.attrs!=NULL) {
523 gpertea 6 //append all attributes found for each transcript
524 gpertea 17 for (int i=0;i<gffrec.attrs->Count();i++) {
525 gpertea 6 defline.append(" ");
526 gpertea 17 defline.append(gffrec.getAttrName(i));
527 gpertea 6 defline.append("=");
528 gpertea 17 defline.append(gffrec.getAttrValue(i));
529 gpertea 6 }
530     }
531     printFasta(f_x, defline, cdsnt, seqlen);
532     }
533     GFREE(cdsnt);
534     GFREE(cdsaa);
535     if (f_w!=NULL) { //write spliced exons
536     uint cds_start=0;
537     uint cds_end=0;
538     seglst.Clear();
539 gpertea 17 char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
540 gpertea 6 if (exont!=NULL) {
541 gpertea 17 if (gffrec.CDstart>0) {
542 gpertea 6 defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
543     }
544     if (writeExonSegs) {
545     defline.append(" loc:");
546 gpertea 17 defline.append(gffrec.getGSeqName());
547 gpertea 6 defline+=(char)'|';
548 gpertea 17 defline+=(int)gffrec.start;
549 gpertea 6 defline+=(char)'-';
550 gpertea 17 defline+=(int)gffrec.end;
551 gpertea 6 defline+=(char)'|';
552 gpertea 17 defline+=(char)gffrec.strand;
553 gpertea 6 defline.append(" exons:");
554 gpertea 17 for (int i=0;i<gffrec.exons.Count();i++) {
555 gpertea 6 if (i>0) defline.append(",");
556 gpertea 17 defline+=(int)gffrec.exons[i]->start;
557 gpertea 6 defline.append("-");
558 gpertea 17 defline+=(int)gffrec.exons[i]->end;
559 gpertea 6 }
560     defline.append(" segs:");
561     for (int i=0;i<seglst.Count();i++) {
562     if (i>0) defline.append(",");
563     defline+=(int)seglst[i]->start;
564     defline.append("-");
565     defline+=(int)seglst[i]->end;
566     }
567     }
568 gpertea 17 if (fullattr && gffrec.attrs!=NULL) {
569 gpertea 6 //append all attributes found for each transcripts
570 gpertea 17 for (int i=0;i<gffrec.attrs->Count();i++) {
571 gpertea 6 defline.append(" ");
572 gpertea 17 defline.append(gffrec.getAttrName(i));
573 gpertea 6 defline.append("=");
574 gpertea 17 defline.append(gffrec.getAttrValue(i));
575 gpertea 6 }
576     }
577     printFasta(f_w, defline, exont, seqlen);
578     GFREE(exont);
579     }
580     } //writing f_w (spliced exons)
581 gpertea 17 return true;
582 gpertea 6 }
583    
584     void openfw(FILE* &f, GArgs& args, char opt) {
585     GStr s=args.getOpt(opt);
586     if (!s.is_empty()) {
587     if (s=='-')
588     f=stdout;
589     else {
590     f=fopen(s,"w");
591     if (f==NULL) GError("Error creating file: %s\n", s.chars());
592     }
593     }
594     }
595    
596     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
597     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
598    
599 gpertea 17 void printGff3Header(FILE* f, GArgs& args) {
600     fprintf(f, "# ");
601     args.printCmdLine(f);
602     fprintf(f, "##gff-version 3\n");
603     //for (int i=0;i<gseqdata.Count();i++) {
604     //
605     //}
606     }
607    
608     bool validateGffRec(GffObj* gffrec, GList<GffObj>* gfnew) {
609     if (reftbl.Count()>0) {
610     GStr refname(gffrec->getRefName());
611     RefTran* rt=reftbl.Find(refname.chars());
612 gpertea 46 if (rt==NULL && refname.length()>2 && refname[-2]=='.' && isdigit(refname[-1])) {
613 gpertea 47 //try removing the version suffix
614 gpertea 17 refname.cut(-2);
615     //GMessage("[DEBUG] Trying ref name '%s'...\n", refname.chars());
616     rt=reftbl.Find(refname.chars());
617     }
618     if (rt) {
619     gffrec->setRefName(rt->new_name);
620     }
621     else return false; //discard, ref seq not in the given translation table
622     }
623     if (mRNAOnly && gffrec->isDiscarded()) {
624     //discard generic "gene" or "locus" features with no other detailed subfeatures
625     //GMessage("Warning: discarding %s GFF generic gene/locus container %s\n",m->getID());
626     return false;
627     }
628     /*
629     if (gffrec->exons.Count()==0 && gffrec->children.Count()==0)) {
630     //a non-mRNA feature with no subfeatures
631     //just so we get some sequence functions working, add a dummy "exon"-like subfeature here
632     //--this could be a single "pseudogene" entry or another genomic region without exons
633     //
634     gffrec->addExon(gffrec->start,gffrec->end);
635     }
636     */
637     if (rfltGSeq!=NULL) { //filter by gseqName
638     if (strcmp(gffrec->getGSeqName(),rfltGSeq)!=0) {
639     return false;
640     }
641     }
642     if (rfltStrand>0 && gffrec->strand !=rfltStrand) {
643     return false;
644     }
645     //check coordinates
646     if (rfltStart!=0 || rfltEnd!=MAX_UINT) {
647     if (rfltWithin) {
648     if (gffrec->start<rfltStart || gffrec->end>rfltEnd) {
649 gpertea 42 return false; //not within query range
650 gpertea 17 }
651     }
652     else {
653     if (gffrec->start>rfltEnd || gffrec->end<rfltStart) {
654     return false;
655     }
656     }
657     }
658     if (multiExon && gffrec->exons.Count()<=1) {
659     return false;
660     }
661     if (wCDSonly && gffrec->CDstart==0) {
662     return false;
663     }
664     if (ensembl_convert && startsWith(gffrec->getID(), "ENS")) {
665     //keep track of chr|gene_id data -- coordinate range
666     char* geneid=gffrec->getGeneID();
667     if (geneid!=NULL) {
668     GeneInfo* ginfo=gene_ids.Find(geneid);
669     if (ginfo==NULL) {//first time seeing this gene ID
670     GeneInfo* geneinfo=new GeneInfo(gffrec, ensembl_convert);
671     gene_ids.Add(geneid, geneinfo);
672     if (gfnew!=NULL) gfnew->Add(geneinfo->gf);
673     }
674     else ginfo->update(gffrec);
675     }
676     }
677     return true;
678     }
679    
680    
681 gpertea 6 int main(int argc, char * const argv[]) {
682 gpertea 17 GArgs args(argc, argv,
683 gpertea 59 "debug;merge;cluster-only;help;force-exons;MINCOV=MINPID=hvOUNHWCVJMKQNSXTDAPRZFGLEm:g:i:r:s:t:a:b:o:w:x:y:d:");
684 gpertea 17 args.printError(USAGE, true);
685     if (args.getOpt('h') || args.getOpt("help")) {
686     GMessage("%s",USAGE);
687     exit(1);
688     }
689     debugMode=(args.getOpt("debug")!=NULL);
690 gpertea 62 bool forceExons=(args.getOpt("force-exons")!=NULL);
691 gpertea 6 mRNAOnly=(args.getOpt('O')==NULL);
692 gpertea 17 //sortByLoc=(args.getOpt('S')!=NULL);
693 gpertea 6 addDescr=(args.getOpt('A')!=NULL);
694     verbose=(args.getOpt('v')!=NULL);
695     wCDSonly=(args.getOpt('C')!=NULL);
696     validCDSonly=(args.getOpt('V')!=NULL);
697     altPhases=(args.getOpt('H')!=NULL);
698     fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF
699     bothStrands=(args.getOpt('B')!=NULL);
700 gpertea 17 fullCDSonly=(args.getOpt('J')!=NULL);
701 gpertea 6 spliceCheck=(args.getOpt('N')!=NULL);
702 gpertea 17 bool matchAllIntrons=(args.getOpt('K')==NULL);
703     bool fuzzSpan=(args.getOpt('Q')!=NULL);
704     if (args.getOpt('M') || args.getOpt("merge")) {
705     doCluster=true;
706     doCollapseRedundant=true;
707     }
708     else {
709     if (!matchAllIntrons || fuzzSpan) {
710     GMessage("%s",USAGE);
711     GMessage("Error: -K or -Q options require -M/--merge option!\n");
712     exit(1);
713     }
714     }
715     if (args.getOpt("cluster-only")) {
716     doCluster=true;
717     doCollapseRedundant=false;
718     if (!matchAllIntrons || fuzzSpan) {
719     GMessage("%s",USAGE);
720     GMessage("Error: -K or -Q options have no effect with --cluster-only.\n");
721     exit(1);
722     }
723     }
724 gpertea 6 if (fullCDSonly) validCDSonly=true;
725 gpertea 17 if (verbose) {
726     fprintf(stderr, "Command line was:\n");
727     args.printCmdLine(stderr);
728     }
729    
730 gpertea 6 fullattr=(args.getOpt('F')!=NULL);
731     if (args.getOpt('G')==NULL)
732     noExonAttr=!fullattr;
733     else {
734     noExonAttr=true;
735     fullattr=true;
736     }
737 gpertea 17 ensembl_convert=(args.getOpt('L')!=NULL);
738     if (ensembl_convert) {
739     fullattr=true;
740     noExonAttr=false;
741     //sortByLoc=true;
742     }
743    
744 gpertea 6 mergeCloseExons=(args.getOpt('Z')!=NULL);
745     multiExon=(args.getOpt('U')!=NULL);
746     writeExonSegs=(args.getOpt('W')!=NULL);
747     tracklabel=args.getOpt('t');
748 gpertea 17 GFastaDb gfasta(args.getOpt('g'));
749     //if (gfasta.fastaPath!=NULL)
750     // sortByLoc=true; //enforce sorting by chromosome/contig
751 gpertea 6 GStr s=args.getOpt('i');
752     if (!s.is_empty()) maxintron=s.asInt();
753 gpertea 17
754     FILE* f_repl=NULL;
755     s=args.getOpt('d');
756     if (!s.is_empty()) {
757     if (s=="-") f_repl=stdout;
758     else {
759     f_repl=fopen(s.chars(), "w");
760     if (f_repl==NULL) GError("Error creating file %s\n", s.chars());
761     }
762     }
763    
764 gpertea 6 rfltWithin=(args.getOpt('R')!=NULL);
765     s=args.getOpt('r');
766     if (!s.is_empty()) {
767     s.trim();
768 gpertea 17 if (s[0]=='+' || s[0]=='-') {
769     rfltStrand=s[0];
770     s.cut(0,1);
771     }
772 gpertea 6 int isep=s.index(':');
773 gpertea 17 if (isep>0) { //gseq name given
774     if (rfltStrand==0 && (s[isep-1]=='+' || s[isep-1]=='-')) {
775     isep--;
776     rfltStrand=s[isep];
777     s.cut(isep,1);
778     }
779     if (isep>0)
780     rfltGSeq=Gstrdup((s.substr(0,isep)).chars());
781 gpertea 6 s.cut(0,isep+1);
782     }
783     GStr gsend;
784 gpertea 17 char slast=s[s.length()-1];
785     if (rfltStrand==0 && (slast=='+' || slast=='-')) {
786     s.chomp(slast);
787     rfltStrand=slast;
788     }
789 gpertea 6 if (s.index("..")>=0) gsend=s.split("..");
790 gpertea 17 else gsend=s.split('-');
791 gpertea 6 if (!s.is_empty()) rfltStart=(uint)s.asInt();
792 gpertea 17 if (!gsend.is_empty()) {
793     rfltEnd=(uint)gsend.asInt();
794     if (rfltEnd==0) rfltEnd=MAX_UINT;
795     }
796 gpertea 6 } //gseq/range filtering
797     else {
798     if (rfltWithin)
799 gpertea 42 GError("Error: option -R requires -r!\n");
800     //if (rfltWholeTranscript)
801     // GError("Error: option -P requires -r!\n");
802 gpertea 6 }
803 gpertea 17 s=args.getOpt('m');
804     if (!s.is_empty()) {
805     FILE* ft=fopen(s,"r");
806     if (ft==NULL) GError("Error opening reference table: %s\n",s.chars());
807     loadRefTable(ft, reftbl);
808     fclose(ft);
809     }
810 gpertea 6 s=args.getOpt('s');
811     if (!s.is_empty()) {
812 gpertea 17 FILE* fsize=fopen(s,"r");
813     if (fsize==NULL) GError("Error opening info file: %s\n",s.chars());
814     loadSeqInfo(fsize, seqinfo);
815     fclose(fsize);
816     }
817    
818 gpertea 6 openfw(f_out, args, 'o');
819     //if (f_out==NULL) f_out=stdout;
820 gpertea 17 if (gfasta.fastaPath==NULL && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL))
821 gpertea 6 GError("Error: -g option is required for options -w, -x, -y, -V, -N, -M !\n");
822    
823     openfw(f_w, args, 'w');
824     openfw(f_x, args, 'x');
825     openfw(f_y, args, 'y');
826     if (f_y!=NULL || f_x!=NULL) wCDSonly=true;
827     //useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL);
828 gpertea 17
829     int numfiles = args.startNonOpt();
830     //GList<GffObj> gfkept(false,true); //unsorted, free items on delete
831     int out_counter=0; //number of records printed
832     while (true) {
833     GStr infile;
834     if (numfiles) {
835     infile=args.nextNonOpt();
836     if (infile.is_empty()) break;
837     if (infile=="-") { f_in=stdin; infile="stdin"; }
838     else
839     if ((f_in=fopen(infile, "r"))==NULL)
840     GError("Error: cannot open input file %s!\n",infile.chars());
841 gpertea 6 }
842 gpertea 17 else
843     infile="-";
844     GffLoader gffloader(infile.chars());
845     gffloader.transcriptsOnly=mRNAOnly;
846     gffloader.fullAttributes=fullattr;
847     gffloader.noExonAttrs=noExonAttr;
848     gffloader.mergeCloseExons=mergeCloseExons;
849     gffloader.showWarnings=(args.getOpt('E')!=NULL);
850     gffloader.load(g_data, &validateGffRec, doCluster, doCollapseRedundant,
851 gpertea 62 matchAllIntrons, fuzzSpan, forceExons);
852 gpertea 17 if (doCluster)
853     collectLocusData(g_data);
854     if (numfiles==0) break;
855     }
856    
857     GStr loctrack("gffcl");
858     if (tracklabel) loctrack=tracklabel;
859     g_data.setSorted(&gseqCmpName);
860     if (doCluster) {
861     //grouped in loci
862     for (int g=0;g<g_data.Count();g++) {
863     GenomicSeqData* gdata=g_data[g];
864     for (int l=0;l<gdata->loci.Count();l++) {
865     GffLocus& loc=*(gdata->loci[l]);
866     //check all non-replaced transcripts in this locus:
867     int numvalid=0;
868     int idxfirstvalid=-1;
869     for (int i=0;i<loc.rnas.Count();i++) {
870     GffObj& t=*(loc.rnas[i]);
871     GTData* tdata=(GTData*)(t.uptr);
872     if (tdata->replaced_by!=NULL) {
873     if (f_repl && (t.udata & 8)==0) {
874     //t.udata|=8;
875     fprintf(f_repl, "%s", t.getID());
876     GTData* rby=tdata;
877     while (rby->replaced_by!=NULL) {
878     fprintf(f_repl," => %s", rby->replaced_by->getID());
879     rby->rna->udata|=8;
880     rby=(GTData*)(rby->replaced_by->uptr);
881     }
882     fprintf(f_repl, "\n");
883     }
884     continue;
885     }
886     if (process_transcript(gfasta, t)) {
887     t.udata|=4; //tag it as valid
888     numvalid++;
889     if (idxfirstvalid<0) idxfirstvalid=i;
890     }
891 gpertea 6 }
892 gpertea 17
893     if (f_out && numvalid>0) {
894     GStr locname("RLOC_");
895     locname.appendfmt("%08d",loc.locus_num);
896     if (!fmtGTF) {
897     if (out_counter==0)
898     printGff3Header(f_out, args);
899     fprintf(f_out,"%s\t%s\tlocus\t%d\t%d\t.\t%c\t.\tID=%s;locus=%s",
900     loc.rnas[0]->getGSeqName(), loctrack.chars(), loc.start, loc.end, loc.strand,
901     locname.chars(), locname.chars());
902     //const char* loc_gname=loc.getGeneName();
903     if (loc.gene_names.Count()>0) { //print all gene names associated to this locus
904     fprintf(f_out, ";genes=%s",loc.gene_names.First()->name.chars());
905     for (int i=1;i<loc.gene_names.Count();i++) {
906     fprintf(f_out, ",%s",loc.gene_names[i]->name.chars());
907     }
908     }
909     if (loc.gene_ids.Count()>0) { //print all GeneIDs names associated to this locus
910     fprintf(f_out, ";geneIDs=%s",loc.gene_ids.First()->name.chars());
911     for (int i=1;i<loc.gene_ids.Count();i++) {
912     fprintf(f_out, ",%s",loc.gene_ids[i]->name.chars());
913     }
914     }
915     fprintf(f_out, ";transcripts=%s",loc.rnas[idxfirstvalid]->getID());
916     for (int i=idxfirstvalid+1;i<loc.rnas.Count();i++) {
917     fprintf(f_out, ",%s",loc.rnas[i]->getID());
918     }
919     fprintf(f_out, "\n");
920     }
921     //now print all valid, non-replaced transcripts in this locus:
922     for (int i=0;i<loc.rnas.Count();i++) {
923     GffObj& t=*(loc.rnas[i]);
924     GTData* tdata=(GTData*)(t.uptr);
925     if (tdata->replaced_by!=NULL || ((t.udata & 4)==0)) continue;
926     t.addAttr("locus", locname.chars());
927     out_counter++;
928     if (fmtGTF) t.printGtf(f_out, tracklabel);
929     else {
930     //print the parent first, if any
931     if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
932     GTData* pdata=(GTData*)(t.parent->uptr);
933     if (pdata->geneinfo!=NULL)
934     pdata->geneinfo->finalize();
935     t.parent->addAttr("locus", locname.chars());
936     t.parent->printGff(f_out, tracklabel);
937     t.parent->udata|=4;
938     }
939     t.printGff(f_out, tracklabel);
940     }
941     }
942     } //have valid transcripts to print
943     }//for each locus
944     if (f_out && !mRNAOnly) {
945     //final pass through the non-transcripts, in case any of them were not printed
946     //TODO: order broken, these should be interspersed among the rnas in the correct order!
947     for (int m=0;m<gdata->gfs.Count();m++) {
948     GffObj& t=*(gdata->gfs[m]);
949     if ((t.udata&4)==0) { //never printed
950     t.udata|=4;
951     if (fmtGTF) t.printGtf(f_out, tracklabel);
952     else t.printGff(f_out, tracklabel);
953     }
954     } //for each non-transcript
955 gpertea 6 }
956 gpertea 17 } //for each genomic sequence
957     }
958     else {
959     //not grouped into loci, print the rnas with their parents, if any
960     int numvalid=0;
961     for (int g=0;g<g_data.Count();g++) {
962     GenomicSeqData* gdata=g_data[g];
963     for (int m=0;m<gdata->rnas.Count();m++) {
964     GffObj& t=*(gdata->rnas[m]);
965     GTData* tdata=(GTData*)(t.uptr);
966     if (tdata->replaced_by!=NULL) continue;
967     if (process_transcript(gfasta, t)) {
968     t.udata|=4; //tag it as valid
969     numvalid++;
970     if (f_out) {
971     if (tdata->geneinfo) tdata->geneinfo->finalize();
972     out_counter++;
973     if (fmtGTF) t.printGtf(f_out, tracklabel);
974     else {
975     if (out_counter==1)
976     printGff3Header(f_out, args);
977     //print the parent first, if any
978     if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
979     GTData* pdata=(GTData*)(t.parent->uptr);
980     if (pdata->geneinfo!=NULL)
981     pdata->geneinfo->finalize();
982     t.parent->printGff(f_out, tracklabel);
983     t.parent->udata|=4;
984     }
985     t.printGff(f_out, tracklabel);
986     }
987     }//GFF/GTF output requested
988     } //valid transcript
989     } //for each rna
990     if (f_out && !mRNAOnly) {
991     //final pass through the non-transcripts, in case any of them were not printed
992     //TODO: order broken, these should be interspersed among the rnas in the correct order!
993     for (int m=0;m<gdata->gfs.Count();m++) {
994     GffObj& t=*(gdata->gfs[m]);
995     if ((t.udata&4)==0) { //never printed
996     t.udata|=4;
997     if (fmtGTF) t.printGtf(f_out, tracklabel);
998     else t.printGff(f_out, tracklabel);
999     }
1000     } //for each non-transcript
1001 gpertea 6 }
1002 gpertea 17 } //for each genomic seq
1003 gpertea 6 }
1004 gpertea 17 if (f_repl && f_repl!=stdout) fclose(f_repl);
1005 gpertea 6 seqinfo.Clear();
1006 gpertea 17 //if (faseq!=NULL) delete faseq;
1007     //if (gcdb!=NULL) delete gcdb;
1008 gpertea 6 GFREE(rfltGSeq);
1009     FRCLOSE(f_in);
1010     FWCLOSE(f_out);
1011     FWCLOSE(f_w);
1012     FWCLOSE(f_x);
1013     FWCLOSE(f_y);
1014     }