ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gffread/gffread.cpp
Revision: 309
Committed: Fri Mar 22 19:47:48 2013 UTC (6 years, 6 months ago) by gpertea
File size: 35881 byte(s)
Log Message:
fixed -J crash

Line User Rev File contents
1 gpertea 265 #include "GArgs.h"
2 gpertea 17 #include "gff_utils.h"
3 gpertea 6 #include <ctype.h>
4 gpertea 17 // don't care about cdb compression
5     //#ifdef ENABLE_COMPRESSION
6     //#undef ENABLE_COMPRESSION
7     //#endif
8     //#include "GCdbYank.h"
9 gpertea 6
10     #define USAGE "Usage:\n\
11 gpertea 17 gffread <input_gff> [-g <genomic_seqs_fasta> | <dir>][-s <seq_info.fsize>] \n\
12 gpertea 42 [-o <outfile.gff>] [-t <tname>] [-r [[<strand>]<chr>:]<start>..<end> [-R]]\n\
13 gpertea 43 [-CTVNJMKQAFGUBHZWTOLE] [-w <exons.fa>] [-x <cds.fa>] [-y <tr_cds.fa>]\n\
14 gpertea 17 [-i <maxintron>] \n\
15     Filters and/or converts GFF3/GTF2 records.\n\
16     <input_gff> is a GFF file, use '-' if the GFF records will be given at stdin\n\
17 gpertea 6 \n\
18     Options:\n\
19 gpertea 17 -g full path to a multi-fasta file with the genomic sequences\n\
20     for all input mappings, OR a directory with single-fasta files\n\
21     (one per genomic sequence, with file names matching sequence names)\n\
22     -s <seq_info.fsize> is a tab-delimited file providing this info\n\
23 gpertea 6 for each of the mapped sequences:\n\
24     <seq-name> <seq-length> <seq-description>\n\
25 gpertea 17 (useful for -A option with mRNA/EST/protein mappings)\n\
26 gpertea 6 -i discard transcripts having an intron larger than <maxintron>\n\
27 gpertea 42 -r only show transcripts overlapping coordinate range <start>..<end>\n\
28 gpertea 17 (on chromosome/contig <chr>, strand <strand> if provided)\n\
29 gpertea 6 -R for -r option, discard all transcripts that are not fully \n\
30 gpertea 42 contained within the given range\n\
31 gpertea 6 -U discard single-exon transcripts\n\
32 gpertea 17 -C coding only: discard mRNAs that have no CDS feature\n\
33     -F full GFF attribute preservation (all attributes are shown)\n\
34 gpertea 6 -G only parse additional exon attributes from the first exon\n\
35     and move them to the mRNA level (useful for GTF input)\n\
36 gpertea 17 -A use the description field from <seq_info.fsize> and add it\n\
37     as the value for a 'descr' attribute to the GFF record\n\
38 gpertea 6 \n\
39 gpertea 17 -O process also non-transcript GFF records (by default non-transcript\n\
40     records are ignored)\n\
41 gpertea 6 -V discard any mRNAs with CDS having in-frame stop codons\n\
42     -H for -V option, check and adjust the starting CDS phase\n\
43     if the original phase leads to a translation with an \n\
44     in-frame stop codon\n\
45     -B for -V option, single-exon transcripts are also checked on the\n\
46     opposite strand\n\
47 gpertea 17 -N discard multi-exon mRNAs that have any intron with a non-canonical\n\
48     splice site consensus (i.e. not GT-AG, GC-AG or AT-AC)\n\
49     -J discard any mRNAs that either lack initial START codon\n\
50 gpertea 6 or the terminal STOP codon, or have an in-frame stop codon\n\
51     (only print mRNAs with a fulll, valid CDS)\n\
52     \n\
53 gpertea 17 -M/--merge : cluster the input transcripts into loci, collapsing matching\n\
54     transcripts (those with the same exact introns and fully contained)\n\
55 gpertea 38 -d <dupinfo> : for -M option, write collapsing info to file <dupinfo>\n\
56 gpertea 17 --cluster-only: same as --merge but without collapsing matching transcripts\n\
57     -K for -M option: also collapse shorter, fully contained transcripts\n\
58     with fewer introns than the container\n\
59     -Q for -M option, remove the containment restriction:\n\
60     (multi-exon transcripts will be collapsed if just their introns match,\n\
61     while single-exon transcripts can partially overlap (80%))\n\
62     \n\
63     -E expose (warn about) duplicate transcript IDs and other potential \n\
64     problems with the given GFF/GTF records\n\
65     -Z merge close exons into a single exon (for intron size<4)\n\
66     -w write a fasta file with spliced exons for each GFF transcript\n\
67     -x write a fasta file with spliced CDS for each GFF transcript\n\
68     -W for -w and -x options, also write for each fasta record the exon\n\
69 gpertea 6 coordinates projected onto the spliced sequence\n\
70 gpertea 17 -y write a protein fasta file with the translation of CDS for each record\n\
71     -L Ensembl GTF to GFF3 conversion (implies -F; should be used with -m)\n\
72     -m <chr_replace> is a reference (genomic) sequence replacement table with\n\
73     this format:\n\
74     <original_ref_ID> <new_ref_ID>\n\
75     GFF records on reference sequences that are not found among the\n\
76     <original_ref_ID> entries in this file will be filtered out\n\
77     -o the \"filtered\" GFF records will be written to <outfile.gff>\n\
78     (use -o- for printing to stdout)\n\
79     -t use <trackname> in the second column of each GFF output line\n\
80     -T -o option will output GTF format instead of GFF3\n\
81     "
82 gpertea 6
83 gpertea 17
84     class SeqInfo { //populated from the -s option of gffread
85     public:
86     int len;
87     char* descr;
88     SeqInfo( int l, char* s) {
89     len=l;
90     if (s==NULL) {
91     descr=NULL;
92     } else {
93     descr=Gstrdup(s);
94     }
95     }
96     ~SeqInfo() {
97     GFREE(descr);
98     }
99     };
100    
101     class RefTran {
102     public:
103     char* new_name;
104     RefTran(char *ns) {
105     new_name=NULL;
106     if (ns!=NULL)
107     new_name=Gstrdup(ns);
108     }
109     ~RefTran() {
110     GFREE(new_name);
111     }
112     };
113    
114 gpertea 6 FILE* ffasta=NULL;
115     FILE* f_in=NULL;
116     FILE* f_out=NULL;
117     FILE* f_w=NULL; //fasta with spliced exons (transcripts)
118     FILE* f_x=NULL; //fasta with spliced CDS
119     FILE* f_y=NULL; //fasta with translated CDS
120     bool wCDSonly=false;
121    
122     bool validCDSonly=false; // translation with no in-frame STOP
123     bool bothStrands=false; //for single-exon mRNA validation, check the other strand too
124     bool altPhases=false; //if original phase fails translation validation,
125     //try the other 2 phases until one makes it
126     bool mRNAOnly=true;
127     bool spliceCheck=false; //only known splice-sites
128    
129     bool fullCDSonly=false; // starts with START, ends with STOP codon
130     bool fullattr=false;
131 gpertea 17 //bool sortByLoc=false; // if the GFF output should be sorted by location
132 gpertea 116 bool ensembl_convert=false; //-L, assist in converting Ensembl GTF to GFF3
133 gpertea 17
134    
135     //GStr gseqpath;
136     //GStr gcdbfa;
137     //bool multiGSeq=false; //if a directory or a .cidx file was given to -g option
138     //GFaSeqGet* faseq=NULL;
139     //GCdbYank* gcdb=NULL;
140     //int gseq_id=-1; //current genome sequence ID -- the current GffObj::gseq_id
141 gpertea 6 bool fmtGTF=false;
142     bool addDescr=false;
143     //bool protmap=false;
144     bool multiExon=false;
145     bool writeExonSegs=false;
146     char* tracklabel=NULL;
147     int maxintron=999000000;
148     bool mergeCloseExons=false;
149     //range filter:
150     char* rfltGSeq=NULL;
151 gpertea 17 char rfltStrand=0;
152 gpertea 6 uint rfltStart=0;
153     uint rfltEnd=MAX_UINT;
154     bool rfltWithin=false; //check for full containment within given range
155     bool noExonAttr=false;
156    
157 gpertea 17 bool doCluster=false;
158     bool doCollapseRedundant=false;
159 gpertea 6
160 gpertea 17 GList<GenomicSeqData> g_data(true,true,true); //list of GFF records by genomic seq
161 gpertea 6
162     //hash with sequence info
163     GHash<SeqInfo> seqinfo;
164     GHash<int> isoCounter; //counts the valid isoforms
165 gpertea 17 GHash<RefTran> reftbl;
166     GHash<GeneInfo> gene_ids;
167     //min-max gene span associated to chr|gene_id (mostly for Ensembl conversion)
168 gpertea 6
169     bool debugMode=false;
170     bool verbose=false;
171    
172 gpertea 17 void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) {
173     GLineReader fr(f);
174     while (!fr.isEof()) {
175     char* line=fr.getLine();
176     if (line==NULL) break;
177     char* id=line;
178     char* lenstr=NULL;
179     char* text=NULL;
180     char* p=line;
181     while (*p!=0 && !isspace(*p)) p++;
182     if (*p==0) continue;
183     *p=0;p++;
184     while (*p==' ' || *p=='\t') p++;
185     if (*p==0) continue;
186     lenstr=p;
187     while (*p!=0 && !isspace(*p)) p++;
188     if (*p!=0) { *p=0;p++; }
189     while (*p==' ' || *p=='\t') p++;
190     if (*p!=0) text=p; //else text remains NULL
191     int len=0;
192     if (!parseInt(lenstr,len)) {
193     GMessage("Warning: could not parse sequence length: %s %s\n",
194     id, lenstr);
195     continue;
196     }
197     // --- here we have finished parsing the line
198     si.Add(id, new SeqInfo(len,text));
199     } //while lines
200     }
201    
202     void loadRefTable(FILE* f, GHash<RefTran>& rt) {
203     GLineReader fr(f);
204     char* line=NULL;
205     while ((line=fr.getLine())) {
206     char* orig_id=line;
207     char* p=line;
208     while (*p!=0 && !isspace(*p)) p++;
209     if (*p==0) continue;
210     *p=0;p++;//split the line here
211     while (*p==' ' || *p=='\t') p++;
212     if (*p==0) continue;
213     rt.Add(orig_id, new RefTran(p));
214     } //while lines
215     }
216    
217 gpertea 6 char* getSeqDescr(char* seqid) {
218     static char charbuf[128];
219     if (seqinfo.Count()==0) return NULL;
220     char* suf=rstrchr(seqid, '.');
221     if (suf!=NULL) *suf=0;
222     SeqInfo* seqd=seqinfo.Find(seqid);
223     if (suf!=NULL) *suf='.';
224     if (seqd!=NULL) {
225     GStr s(seqd->descr);
226     //cleanup some Uniref gunk
227     if (s[0]=='[') {
228     int r=s.index(']');
229     if (r>=0 && r<8 && isdigit(s[1]))
230     s.remove(0,r+1);
231     }
232     if (s.length()>80) {
233     int r=s.index(';');
234     if (r>5) s.cut(r);
235     }
236     if (s.length()>127) {
237     s.cut(127);
238     int r=s.rindex(' ');
239     if (r>0) s.cut(r);
240     }
241     strcpy(charbuf, s.chars());
242     return charbuf;
243     }
244     else return NULL;
245     }
246    
247     char* getSeqName(char* seqid) {
248     static char charbuf[128];
249     char* suf=rstrchr(seqid, '.');
250     if (suf!=NULL) *suf=0;
251     strcpy(charbuf, seqid);
252     if (suf!=NULL) *suf='.';
253     return charbuf;
254     }
255    
256 gpertea 17 GFaSeqGet* fastaSeqGet(GFastaDb& gfasta, GffObj& gffrec) {
257     if (gfasta.fastaPath==NULL) return NULL;
258     return gfasta.fetch(gffrec.gseq_id);
259 gpertea 6 }
260    
261    
262 gpertea 17 int adjust_stopcodon(GffObj& gffrec, int adj, GList<GSeg>* seglst=NULL) {
263 gpertea 6 //adj>0 => extedn CDS, adj<0 => shrink CDS
264     //when CDS is expanded, exons have to be checked too and
265     // expanded accordingly if they had the same boundary
266 gpertea 17 int realadj=0;
267     if (gffrec.strand=='-') {
268     if ((int)gffrec.CDstart>adj) {
269    
270     gffrec.CDstart-=adj;
271     realadj=adj;
272     if (adj<0) { //restore
273     if (gffrec.exons.First()->start==gffrec.CDstart+adj) {
274     gffrec.exons.First()->start-=adj;
275     gffrec.start=gffrec.exons.First()->start;
276     gffrec.covlen+=adj;
277 gpertea 6 }
278 gpertea 17 }
279     else if (gffrec.exons.First()->start>=gffrec.CDstart) {
280     gffrec.exons.First()->start-=adj;
281     gffrec.start=gffrec.exons.First()->start;
282     gffrec.covlen+=adj;
283     }
284 gpertea 6 }
285     }
286     else {
287 gpertea 17 realadj=adj;
288     gffrec.CDend+=adj;
289     if (adj<0) {//restore
290     if (gffrec.exons.Last()->end==gffrec.CDend-adj) {
291     gffrec.exons.Last()->end+=adj;
292     gffrec.end=gffrec.exons.Last()->end;
293     gffrec.covlen+=adj;
294     }
295     }
296     else if (gffrec.exons.Last()->end<=gffrec.CDend) {
297     gffrec.exons.Last()->end+=adj;
298     gffrec.end=gffrec.exons.Last()->end;
299     gffrec.covlen+=adj;
300 gpertea 6 }
301     }
302     if (seglst!=NULL) seglst->Last()->end+=adj;
303 gpertea 17 return realadj;
304 gpertea 6 }
305    
306 gpertea 17 bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
307     //returns true if the transcript passed the filter
308     char* gname=gffrec.getGeneName();
309     if (gname==NULL) gname=gffrec.getGeneID();
310     GStr defline(gffrec.getID());
311     if (f_out && !fmtGTF) {
312 gpertea 116 const char* tname=NULL;
313     if ((tname=gffrec.getAttr("transcript_name"))!=NULL) {
314     gffrec.addAttr("Name", tname);
315 gpertea 17 gffrec.removeAttr("transcript_name");
316     }
317     }
318     if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) {
319 gpertea 116 const char* biotype=gffrec.getAttr("gene_biotype");
320     if (biotype) {
321     gffrec.addAttr("type", biotype);
322     gffrec.removeAttr("gene_biotype");
323     }
324     else { //old Ensembl files lacking gene_biotype
325     gffrec.addAttr("type", gffrec.getTrackName());
326     }
327    
328 gpertea 17 //bool is_gene=false;
329     bool is_pseudo=false;
330 gpertea 116 if (strcmp(biotype, "protein_coding")==0 || gffrec.hasCDS())
331 gpertea 17 gffrec.setFeatureName("mRNA");
332     else {
333 gpertea 116 if (strcmp(biotype, "processed_transcript")==0)
334 gpertea 17 gffrec.setFeatureName("proc_RNA");
335     else {
336 gpertea 116 //is_gene=endsWith(biotype, "gene");
337     is_pseudo=strifind(biotype, "pseudo");
338 gpertea 17 if (is_pseudo) {
339     gffrec.setFeatureName("pseudo_RNA");
340     }
341 gpertea 116 else if (endsWith(biotype, "RNA")) {
342     gffrec.setFeatureName(biotype);
343 gpertea 17 } else gffrec.setFeatureName("misc_RNA");
344     }
345     }
346 gpertea 6 }
347 gpertea 17 if (gname && strcmp(gname, gffrec.getID())!=0) {
348     int* isonum=isoCounter.Find(gname);
349     if (isonum==NULL) {
350     isonum=new int(1);
351     isoCounter.Add(gname,isonum);
352     }
353     else (*isonum)++;
354     defline.appendfmt(" gene=%s", gname);
355     }
356 gpertea 6 int seqlen=0;
357 gpertea 17
358 gpertea 6 const char* tlabel=tracklabel;
359 gpertea 17 if (tlabel==NULL) tlabel=gffrec.getTrackName();
360 gpertea 6 //defline.appendfmt(" track:%s",tlabel);
361     char* cdsnt = NULL;
362     char* cdsaa = NULL;
363     int aalen=0;
364 gpertea 17 for (int i=1;i<gffrec.exons.Count();i++) {
365     int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1;
366     if (ilen>4000000)
367 gpertea 6 GMessage("Warning: very large intron (%d) for transcript %s\n",
368 gpertea 17 ilen, gffrec.getID());
369 gpertea 6 if (ilen>maxintron) {
370 gpertea 17 return false;
371 gpertea 6 }
372     }
373     GList<GSeg> seglst(false,true);
374 gpertea 17 GFaSeqGet* faseq=fastaSeqGet(gfasta, gffrec);
375     if (spliceCheck && gffrec.exons.Count()>1) {
376 gpertea 6 //check introns for splice site consensi ( GT-AG, GC-AG or AT-AC )
377     if (faseq==NULL) GError("Error: no genomic sequence available!\n");
378 gpertea 17 int glen=gffrec.end-gffrec.start+1;
379     const char* gseq=faseq->subseq(gffrec.start, glen);
380     bool revcompl=(gffrec.strand=='-');
381 gpertea 6 bool ssValid=true;
382 gpertea 17 for (int e=1;e<gffrec.exons.Count();e++) {
383     const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start;
384     int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1;
385 gpertea 6 GSpliceSite acceptorSite(intron,intronlen,true, revcompl);
386     GSpliceSite donorSite(intron,intronlen, false, revcompl);
387     //GMessage("%c intron %d-%d : %s .. %s\n",
388 gpertea 17 // gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt);
389 gpertea 6 if (acceptorSite=="AG") { // GT-AG or GC-AG
390     if (!donorSite.canonicalDonor()) {
391     ssValid=false;break;
392     }
393     }
394     else if (acceptorSite=="AC") { //
395     if (donorSite!="AT") { ssValid=false; break; }
396     }
397     else { ssValid=false; break; }
398     }
399     //GFREE(gseq);
400     if (!ssValid) {
401     if (verbose)
402 gpertea 17 GMessage("Invalid splice sites found for '%s'\n",gffrec.getID());
403     return false; //don't print this one!
404 gpertea 6 }
405     }
406    
407     bool trprint=true;
408 gpertea 17 int stopCodonAdjust=0;
409 gpertea 6 int mCDphase=0;
410 gpertea 17 bool hasStop=false;
411     if (gffrec.CDphase=='1' || gffrec.CDphase=='2')
412     mCDphase = gffrec.CDphase-'0';
413 gpertea 6 if (f_y!=NULL || f_x!=NULL || validCDSonly) {
414     if (faseq==NULL) GError("Error: no genomic sequence provided!\n");
415     //if (protmap && fullCDSonly) {
416 gpertea 17 //if (protmap && (fullCDSonly || (gffrec.qlen>0 && gffrec.qend==gffrec.qlen))) {
417    
418 gpertea 6 if (validCDSonly) { //make sure the stop codon is always included
419 gpertea 17 //adjust_stopcodon(gffrec,3);
420     stopCodonAdjust=adjust_stopcodon(gffrec,3);
421 gpertea 6 }
422     int strandNum=0;
423     int phaseNum=0;
424     CDS_CHECK:
425 gpertea 265 cdsnt=gffrec.getSpliced(faseq, true, &seqlen, NULL, NULL, &seglst);
426 gpertea 6 if (cdsnt==NULL) trprint=false;
427 gpertea 309 else { //has CDS
428     if (validCDSonly) {
429     cdsaa=translateDNA(cdsnt, aalen, seqlen);
430     char* p=strchr(cdsaa,'.');
431     hasStop=false;
432     if (p!=NULL) {
433     if (p-cdsaa>=aalen-2) { //stop found as the last codon
434     *p='0';//remove it
435     hasStop=true;
436     if (aalen-2==p-cdsaa) {
437     //previous to last codon is the stop codon
438     //so correct the CDS stop accordingly
439     adjust_stopcodon(gffrec,-3, &seglst);
440     stopCodonAdjust=0; //clear artificial stop adjustment
441     seqlen-=3;
442     cdsnt[seqlen]=0;
443     }
444     aalen=p-cdsaa;
445 gpertea 6 }
446 gpertea 309 else {//stop found before the last codon
447     trprint=false;
448     }
449     }//stop codon found
450     if (trprint==false) { //failed CDS validity check
451     //in-frame stop codon found
452     if (altPhases && phaseNum<3) {
453     phaseNum++;
454     gffrec.CDphase = '0'+((mCDphase+phaseNum)%3);
455     GFREE(cdsaa);
456     goto CDS_CHECK;
457     }
458     if (gffrec.exons.Count()==1 && bothStrands) {
459     strandNum++;
460     phaseNum=0;
461     if (strandNum<2) {
462     GFREE(cdsaa);
463     gffrec.strand = (gffrec.strand=='-') ? '+':'-';
464     goto CDS_CHECK; //repeat the CDS check for a different frame
465     }
466     }
467     if (verbose) GMessage("In-frame STOP found for '%s'\n",gffrec.getID());
468     } //has in-frame STOP
469     if (fullCDSonly) {
470     if (!hasStop || cdsaa[0]!='M') trprint=false;
471     }
472     } // CDS check requested
473     } //has CDS
474 gpertea 6 } //translation or codon check/output was requested
475     if (!trprint) {
476     GFREE(cdsnt);
477     GFREE(cdsaa);
478 gpertea 17 return false;
479 gpertea 6 }
480 gpertea 17 if (stopCodonAdjust>0 && !hasStop) {
481     //restore stop codon location
482     adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst);
483     if (cdsnt!=NULL && seqlen>0) {
484     seqlen-=stopCodonAdjust;
485     cdsnt[seqlen]=0;
486     }
487     if (cdsaa!=NULL) aalen--;
488     }
489    
490 gpertea 6 if (f_y!=NULL) { //CDS translation fasta output requested
491     //char*
492     if (cdsaa==NULL) { //translate now if not done before
493     cdsaa=translateDNA(cdsnt, aalen, seqlen);
494     }
495 gpertea 17 if (fullattr && gffrec.attrs!=NULL) {
496 gpertea 6 //append all attributes found for each transcripts
497 gpertea 17 for (int i=0;i<gffrec.attrs->Count();i++) {
498 gpertea 6 defline.append(" ");
499 gpertea 17 defline.append(gffrec.getAttrName(i));
500 gpertea 6 defline.append("=");
501 gpertea 17 defline.append(gffrec.getAttrValue(i));
502 gpertea 6 }
503     }
504     printFasta(f_y, defline, cdsaa, aalen);
505     }
506     if (f_x!=NULL) { //CDS only
507     if (writeExonSegs) {
508     defline.append(" loc:");
509 gpertea 17 defline.append(gffrec.getGSeqName());
510     defline.appendfmt("(%c)",gffrec.strand);
511 gpertea 6 //warning: not CDS coordinates are written here, but the exon ones
512 gpertea 17 defline+=(int)gffrec.start;
513 gpertea 6 defline+=(char)'-';
514 gpertea 17 defline+=(int)gffrec.end;
515 gpertea 6 // -- here these are CDS substring coordinates on the spliced sequence:
516     defline.append(" segs:");
517     for (int i=0;i<seglst.Count();i++) {
518     if (i>0) defline.append(",");
519     defline+=(int)seglst[i]->start;
520     defline.append("-");
521     defline+=(int)seglst[i]->end;
522     }
523     }
524 gpertea 17 if (fullattr && gffrec.attrs!=NULL) {
525 gpertea 6 //append all attributes found for each transcript
526 gpertea 17 for (int i=0;i<gffrec.attrs->Count();i++) {
527 gpertea 6 defline.append(" ");
528 gpertea 17 defline.append(gffrec.getAttrName(i));
529 gpertea 6 defline.append("=");
530 gpertea 17 defline.append(gffrec.getAttrValue(i));
531 gpertea 6 }
532     }
533     printFasta(f_x, defline, cdsnt, seqlen);
534     }
535     GFREE(cdsnt);
536     GFREE(cdsaa);
537     if (f_w!=NULL) { //write spliced exons
538     uint cds_start=0;
539     uint cds_end=0;
540     seglst.Clear();
541 gpertea 17 char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
542 gpertea 6 if (exont!=NULL) {
543 gpertea 17 if (gffrec.CDstart>0) {
544 gpertea 6 defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
545     }
546     if (writeExonSegs) {
547     defline.append(" loc:");
548 gpertea 17 defline.append(gffrec.getGSeqName());
549 gpertea 6 defline+=(char)'|';
550 gpertea 17 defline+=(int)gffrec.start;
551 gpertea 6 defline+=(char)'-';
552 gpertea 17 defline+=(int)gffrec.end;
553 gpertea 6 defline+=(char)'|';
554 gpertea 17 defline+=(char)gffrec.strand;
555 gpertea 6 defline.append(" exons:");
556 gpertea 17 for (int i=0;i<gffrec.exons.Count();i++) {
557 gpertea 6 if (i>0) defline.append(",");
558 gpertea 17 defline+=(int)gffrec.exons[i]->start;
559 gpertea 6 defline.append("-");
560 gpertea 17 defline+=(int)gffrec.exons[i]->end;
561 gpertea 6 }
562     defline.append(" segs:");
563     for (int i=0;i<seglst.Count();i++) {
564     if (i>0) defline.append(",");
565     defline+=(int)seglst[i]->start;
566     defline.append("-");
567     defline+=(int)seglst[i]->end;
568     }
569     }
570 gpertea 17 if (fullattr && gffrec.attrs!=NULL) {
571 gpertea 6 //append all attributes found for each transcripts
572 gpertea 17 for (int i=0;i<gffrec.attrs->Count();i++) {
573 gpertea 6 defline.append(" ");
574 gpertea 17 defline.append(gffrec.getAttrName(i));
575 gpertea 6 defline.append("=");
576 gpertea 17 defline.append(gffrec.getAttrValue(i));
577 gpertea 6 }
578     }
579     printFasta(f_w, defline, exont, seqlen);
580     GFREE(exont);
581     }
582     } //writing f_w (spliced exons)
583 gpertea 17 return true;
584 gpertea 6 }
585    
586     void openfw(FILE* &f, GArgs& args, char opt) {
587     GStr s=args.getOpt(opt);
588     if (!s.is_empty()) {
589     if (s=='-')
590     f=stdout;
591     else {
592     f=fopen(s,"w");
593     if (f==NULL) GError("Error creating file: %s\n", s.chars());
594     }
595     }
596     }
597    
598     #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
599     #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
600    
601 gpertea 17 void printGff3Header(FILE* f, GArgs& args) {
602     fprintf(f, "# ");
603     args.printCmdLine(f);
604     fprintf(f, "##gff-version 3\n");
605     //for (int i=0;i<gseqdata.Count();i++) {
606     //
607     //}
608     }
609    
610     bool validateGffRec(GffObj* gffrec, GList<GffObj>* gfnew) {
611     if (reftbl.Count()>0) {
612     GStr refname(gffrec->getRefName());
613     RefTran* rt=reftbl.Find(refname.chars());
614 gpertea 46 if (rt==NULL && refname.length()>2 && refname[-2]=='.' && isdigit(refname[-1])) {
615 gpertea 47 //try removing the version suffix
616 gpertea 17 refname.cut(-2);
617     //GMessage("[DEBUG] Trying ref name '%s'...\n", refname.chars());
618     rt=reftbl.Find(refname.chars());
619     }
620     if (rt) {
621     gffrec->setRefName(rt->new_name);
622     }
623     else return false; //discard, ref seq not in the given translation table
624     }
625     if (mRNAOnly && gffrec->isDiscarded()) {
626     //discard generic "gene" or "locus" features with no other detailed subfeatures
627     //GMessage("Warning: discarding %s GFF generic gene/locus container %s\n",m->getID());
628     return false;
629     }
630     /*
631     if (gffrec->exons.Count()==0 && gffrec->children.Count()==0)) {
632     //a non-mRNA feature with no subfeatures
633     //just so we get some sequence functions working, add a dummy "exon"-like subfeature here
634     //--this could be a single "pseudogene" entry or another genomic region without exons
635     //
636     gffrec->addExon(gffrec->start,gffrec->end);
637     }
638     */
639     if (rfltGSeq!=NULL) { //filter by gseqName
640     if (strcmp(gffrec->getGSeqName(),rfltGSeq)!=0) {
641     return false;
642     }
643     }
644     if (rfltStrand>0 && gffrec->strand !=rfltStrand) {
645     return false;
646     }
647     //check coordinates
648     if (rfltStart!=0 || rfltEnd!=MAX_UINT) {
649     if (rfltWithin) {
650     if (gffrec->start<rfltStart || gffrec->end>rfltEnd) {
651 gpertea 42 return false; //not within query range
652 gpertea 17 }
653     }
654     else {
655     if (gffrec->start>rfltEnd || gffrec->end<rfltStart) {
656     return false;
657     }
658     }
659     }
660     if (multiExon && gffrec->exons.Count()<=1) {
661     return false;
662     }
663     if (wCDSonly && gffrec->CDstart==0) {
664     return false;
665     }
666     if (ensembl_convert && startsWith(gffrec->getID(), "ENS")) {
667     //keep track of chr|gene_id data -- coordinate range
668     char* geneid=gffrec->getGeneID();
669     if (geneid!=NULL) {
670     GeneInfo* ginfo=gene_ids.Find(geneid);
671     if (ginfo==NULL) {//first time seeing this gene ID
672     GeneInfo* geneinfo=new GeneInfo(gffrec, ensembl_convert);
673     gene_ids.Add(geneid, geneinfo);
674     if (gfnew!=NULL) gfnew->Add(geneinfo->gf);
675     }
676     else ginfo->update(gffrec);
677     }
678     }
679     return true;
680     }
681    
682    
683 gpertea 6 int main(int argc, char * const argv[]) {
684 gpertea 17 GArgs args(argc, argv,
685 gpertea 59 "debug;merge;cluster-only;help;force-exons;MINCOV=MINPID=hvOUNHWCVJMKQNSXTDAPRZFGLEm:g:i:r:s:t:a:b:o:w:x:y:d:");
686 gpertea 17 args.printError(USAGE, true);
687     if (args.getOpt('h') || args.getOpt("help")) {
688     GMessage("%s",USAGE);
689     exit(1);
690     }
691     debugMode=(args.getOpt("debug")!=NULL);
692 gpertea 62 bool forceExons=(args.getOpt("force-exons")!=NULL);
693 gpertea 6 mRNAOnly=(args.getOpt('O')==NULL);
694 gpertea 17 //sortByLoc=(args.getOpt('S')!=NULL);
695 gpertea 6 addDescr=(args.getOpt('A')!=NULL);
696     verbose=(args.getOpt('v')!=NULL);
697     wCDSonly=(args.getOpt('C')!=NULL);
698     validCDSonly=(args.getOpt('V')!=NULL);
699     altPhases=(args.getOpt('H')!=NULL);
700     fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF
701     bothStrands=(args.getOpt('B')!=NULL);
702 gpertea 17 fullCDSonly=(args.getOpt('J')!=NULL);
703 gpertea 6 spliceCheck=(args.getOpt('N')!=NULL);
704 gpertea 17 bool matchAllIntrons=(args.getOpt('K')==NULL);
705     bool fuzzSpan=(args.getOpt('Q')!=NULL);
706     if (args.getOpt('M') || args.getOpt("merge")) {
707     doCluster=true;
708     doCollapseRedundant=true;
709     }
710     else {
711     if (!matchAllIntrons || fuzzSpan) {
712     GMessage("%s",USAGE);
713     GMessage("Error: -K or -Q options require -M/--merge option!\n");
714     exit(1);
715     }
716     }
717     if (args.getOpt("cluster-only")) {
718     doCluster=true;
719     doCollapseRedundant=false;
720     if (!matchAllIntrons || fuzzSpan) {
721     GMessage("%s",USAGE);
722     GMessage("Error: -K or -Q options have no effect with --cluster-only.\n");
723     exit(1);
724     }
725     }
726 gpertea 6 if (fullCDSonly) validCDSonly=true;
727 gpertea 17 if (verbose) {
728     fprintf(stderr, "Command line was:\n");
729     args.printCmdLine(stderr);
730     }
731    
732 gpertea 6 fullattr=(args.getOpt('F')!=NULL);
733     if (args.getOpt('G')==NULL)
734     noExonAttr=!fullattr;
735     else {
736     noExonAttr=true;
737     fullattr=true;
738     }
739 gpertea 17 ensembl_convert=(args.getOpt('L')!=NULL);
740     if (ensembl_convert) {
741     fullattr=true;
742     noExonAttr=false;
743     //sortByLoc=true;
744     }
745    
746 gpertea 6 mergeCloseExons=(args.getOpt('Z')!=NULL);
747     multiExon=(args.getOpt('U')!=NULL);
748     writeExonSegs=(args.getOpt('W')!=NULL);
749     tracklabel=args.getOpt('t');
750 gpertea 17 GFastaDb gfasta(args.getOpt('g'));
751     //if (gfasta.fastaPath!=NULL)
752     // sortByLoc=true; //enforce sorting by chromosome/contig
753 gpertea 6 GStr s=args.getOpt('i');
754     if (!s.is_empty()) maxintron=s.asInt();
755 gpertea 17
756     FILE* f_repl=NULL;
757     s=args.getOpt('d');
758     if (!s.is_empty()) {
759     if (s=="-") f_repl=stdout;
760     else {
761     f_repl=fopen(s.chars(), "w");
762     if (f_repl==NULL) GError("Error creating file %s\n", s.chars());
763     }
764     }
765    
766 gpertea 6 rfltWithin=(args.getOpt('R')!=NULL);
767     s=args.getOpt('r');
768     if (!s.is_empty()) {
769     s.trim();
770 gpertea 17 if (s[0]=='+' || s[0]=='-') {
771     rfltStrand=s[0];
772     s.cut(0,1);
773     }
774 gpertea 6 int isep=s.index(':');
775 gpertea 17 if (isep>0) { //gseq name given
776     if (rfltStrand==0 && (s[isep-1]=='+' || s[isep-1]=='-')) {
777     isep--;
778     rfltStrand=s[isep];
779     s.cut(isep,1);
780     }
781     if (isep>0)
782     rfltGSeq=Gstrdup((s.substr(0,isep)).chars());
783 gpertea 6 s.cut(0,isep+1);
784     }
785     GStr gsend;
786 gpertea 17 char slast=s[s.length()-1];
787     if (rfltStrand==0 && (slast=='+' || slast=='-')) {
788     s.chomp(slast);
789     rfltStrand=slast;
790     }
791 gpertea 6 if (s.index("..")>=0) gsend=s.split("..");
792 gpertea 17 else gsend=s.split('-');
793 gpertea 6 if (!s.is_empty()) rfltStart=(uint)s.asInt();
794 gpertea 17 if (!gsend.is_empty()) {
795     rfltEnd=(uint)gsend.asInt();
796     if (rfltEnd==0) rfltEnd=MAX_UINT;
797     }
798 gpertea 6 } //gseq/range filtering
799     else {
800     if (rfltWithin)
801 gpertea 42 GError("Error: option -R requires -r!\n");
802     //if (rfltWholeTranscript)
803     // GError("Error: option -P requires -r!\n");
804 gpertea 6 }
805 gpertea 17 s=args.getOpt('m');
806     if (!s.is_empty()) {
807     FILE* ft=fopen(s,"r");
808     if (ft==NULL) GError("Error opening reference table: %s\n",s.chars());
809     loadRefTable(ft, reftbl);
810     fclose(ft);
811     }
812 gpertea 6 s=args.getOpt('s');
813     if (!s.is_empty()) {
814 gpertea 17 FILE* fsize=fopen(s,"r");
815     if (fsize==NULL) GError("Error opening info file: %s\n",s.chars());
816     loadSeqInfo(fsize, seqinfo);
817     fclose(fsize);
818     }
819    
820 gpertea 6 openfw(f_out, args, 'o');
821     //if (f_out==NULL) f_out=stdout;
822 gpertea 17 if (gfasta.fastaPath==NULL && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL))
823 gpertea 6 GError("Error: -g option is required for options -w, -x, -y, -V, -N, -M !\n");
824    
825     openfw(f_w, args, 'w');
826     openfw(f_x, args, 'x');
827     openfw(f_y, args, 'y');
828     if (f_y!=NULL || f_x!=NULL) wCDSonly=true;
829     //useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL);
830 gpertea 17
831     int numfiles = args.startNonOpt();
832     //GList<GffObj> gfkept(false,true); //unsorted, free items on delete
833     int out_counter=0; //number of records printed
834     while (true) {
835     GStr infile;
836     if (numfiles) {
837     infile=args.nextNonOpt();
838     if (infile.is_empty()) break;
839     if (infile=="-") { f_in=stdin; infile="stdin"; }
840     else
841     if ((f_in=fopen(infile, "r"))==NULL)
842     GError("Error: cannot open input file %s!\n",infile.chars());
843 gpertea 6 }
844 gpertea 17 else
845     infile="-";
846     GffLoader gffloader(infile.chars());
847     gffloader.transcriptsOnly=mRNAOnly;
848     gffloader.fullAttributes=fullattr;
849     gffloader.noExonAttrs=noExonAttr;
850     gffloader.mergeCloseExons=mergeCloseExons;
851     gffloader.showWarnings=(args.getOpt('E')!=NULL);
852     gffloader.load(g_data, &validateGffRec, doCluster, doCollapseRedundant,
853 gpertea 62 matchAllIntrons, fuzzSpan, forceExons);
854 gpertea 17 if (doCluster)
855     collectLocusData(g_data);
856     if (numfiles==0) break;
857     }
858    
859     GStr loctrack("gffcl");
860     if (tracklabel) loctrack=tracklabel;
861     g_data.setSorted(&gseqCmpName);
862     if (doCluster) {
863     //grouped in loci
864     for (int g=0;g<g_data.Count();g++) {
865     GenomicSeqData* gdata=g_data[g];
866     for (int l=0;l<gdata->loci.Count();l++) {
867     GffLocus& loc=*(gdata->loci[l]);
868     //check all non-replaced transcripts in this locus:
869     int numvalid=0;
870     int idxfirstvalid=-1;
871     for (int i=0;i<loc.rnas.Count();i++) {
872     GffObj& t=*(loc.rnas[i]);
873     GTData* tdata=(GTData*)(t.uptr);
874     if (tdata->replaced_by!=NULL) {
875     if (f_repl && (t.udata & 8)==0) {
876     //t.udata|=8;
877     fprintf(f_repl, "%s", t.getID());
878     GTData* rby=tdata;
879     while (rby->replaced_by!=NULL) {
880     fprintf(f_repl," => %s", rby->replaced_by->getID());
881     rby->rna->udata|=8;
882     rby=(GTData*)(rby->replaced_by->uptr);
883     }
884     fprintf(f_repl, "\n");
885     }
886     continue;
887     }
888     if (process_transcript(gfasta, t)) {
889     t.udata|=4; //tag it as valid
890     numvalid++;
891     if (idxfirstvalid<0) idxfirstvalid=i;
892     }
893 gpertea 6 }
894 gpertea 17
895     if (f_out && numvalid>0) {
896     GStr locname("RLOC_");
897     locname.appendfmt("%08d",loc.locus_num);
898     if (!fmtGTF) {
899     if (out_counter==0)
900     printGff3Header(f_out, args);
901     fprintf(f_out,"%s\t%s\tlocus\t%d\t%d\t.\t%c\t.\tID=%s;locus=%s",
902     loc.rnas[0]->getGSeqName(), loctrack.chars(), loc.start, loc.end, loc.strand,
903     locname.chars(), locname.chars());
904     //const char* loc_gname=loc.getGeneName();
905     if (loc.gene_names.Count()>0) { //print all gene names associated to this locus
906     fprintf(f_out, ";genes=%s",loc.gene_names.First()->name.chars());
907     for (int i=1;i<loc.gene_names.Count();i++) {
908     fprintf(f_out, ",%s",loc.gene_names[i]->name.chars());
909     }
910     }
911     if (loc.gene_ids.Count()>0) { //print all GeneIDs names associated to this locus
912     fprintf(f_out, ";geneIDs=%s",loc.gene_ids.First()->name.chars());
913     for (int i=1;i<loc.gene_ids.Count();i++) {
914     fprintf(f_out, ",%s",loc.gene_ids[i]->name.chars());
915     }
916     }
917     fprintf(f_out, ";transcripts=%s",loc.rnas[idxfirstvalid]->getID());
918     for (int i=idxfirstvalid+1;i<loc.rnas.Count();i++) {
919     fprintf(f_out, ",%s",loc.rnas[i]->getID());
920     }
921     fprintf(f_out, "\n");
922     }
923     //now print all valid, non-replaced transcripts in this locus:
924     for (int i=0;i<loc.rnas.Count();i++) {
925     GffObj& t=*(loc.rnas[i]);
926     GTData* tdata=(GTData*)(t.uptr);
927     if (tdata->replaced_by!=NULL || ((t.udata & 4)==0)) continue;
928     t.addAttr("locus", locname.chars());
929     out_counter++;
930     if (fmtGTF) t.printGtf(f_out, tracklabel);
931     else {
932     //print the parent first, if any
933     if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
934     GTData* pdata=(GTData*)(t.parent->uptr);
935     if (pdata->geneinfo!=NULL)
936     pdata->geneinfo->finalize();
937     t.parent->addAttr("locus", locname.chars());
938     t.parent->printGff(f_out, tracklabel);
939     t.parent->udata|=4;
940     }
941     t.printGff(f_out, tracklabel);
942     }
943     }
944     } //have valid transcripts to print
945     }//for each locus
946     if (f_out && !mRNAOnly) {
947     //final pass through the non-transcripts, in case any of them were not printed
948     //TODO: order broken, these should be interspersed among the rnas in the correct order!
949     for (int m=0;m<gdata->gfs.Count();m++) {
950     GffObj& t=*(gdata->gfs[m]);
951     if ((t.udata&4)==0) { //never printed
952     t.udata|=4;
953     if (fmtGTF) t.printGtf(f_out, tracklabel);
954     else t.printGff(f_out, tracklabel);
955     }
956     } //for each non-transcript
957 gpertea 6 }
958 gpertea 17 } //for each genomic sequence
959     }
960     else {
961     //not grouped into loci, print the rnas with their parents, if any
962     int numvalid=0;
963     for (int g=0;g<g_data.Count();g++) {
964     GenomicSeqData* gdata=g_data[g];
965     for (int m=0;m<gdata->rnas.Count();m++) {
966     GffObj& t=*(gdata->rnas[m]);
967     GTData* tdata=(GTData*)(t.uptr);
968     if (tdata->replaced_by!=NULL) continue;
969     if (process_transcript(gfasta, t)) {
970     t.udata|=4; //tag it as valid
971     numvalid++;
972     if (f_out) {
973     if (tdata->geneinfo) tdata->geneinfo->finalize();
974     out_counter++;
975     if (fmtGTF) t.printGtf(f_out, tracklabel);
976     else {
977     if (out_counter==1)
978     printGff3Header(f_out, args);
979     //print the parent first, if any
980     if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
981     GTData* pdata=(GTData*)(t.parent->uptr);
982     if (pdata->geneinfo!=NULL)
983     pdata->geneinfo->finalize();
984     t.parent->printGff(f_out, tracklabel);
985     t.parent->udata|=4;
986     }
987     t.printGff(f_out, tracklabel);
988     }
989     }//GFF/GTF output requested
990     } //valid transcript
991     } //for each rna
992     if (f_out && !mRNAOnly) {
993     //final pass through the non-transcripts, in case any of them were not printed
994     //TODO: order broken, these should be interspersed among the rnas in the correct order!
995     for (int m=0;m<gdata->gfs.Count();m++) {
996     GffObj& t=*(gdata->gfs[m]);
997     if ((t.udata&4)==0) { //never printed
998     t.udata|=4;
999     if (fmtGTF) t.printGtf(f_out, tracklabel);
1000     else t.printGff(f_out, tracklabel);
1001     }
1002     } //for each non-transcript
1003 gpertea 6 }
1004 gpertea 17 } //for each genomic seq
1005 gpertea 6 }
1006 gpertea 17 if (f_repl && f_repl!=stdout) fclose(f_repl);
1007 gpertea 6 seqinfo.Clear();
1008 gpertea 17 //if (faseq!=NULL) delete faseq;
1009     //if (gcdb!=NULL) delete gcdb;
1010 gpertea 6 GFREE(rfltGSeq);
1011     FRCLOSE(f_in);
1012     FWCLOSE(f_out);
1013     FWCLOSE(f_w);
1014     FWCLOSE(f_x);
1015     FWCLOSE(f_y);
1016     }