ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gffread/gffread.cpp
(Generate patch)
# Line 1 | Line 1
1 + #include "gff_utils.h"
2   #include "GArgs.h"
2 #include "GStr.h"
3 #include "GHash.hh"
4 #include "GList.hh"
5 #include "GFaSeqGet.h"
6 #include "gff.h"
3   #include <ctype.h>
4 < // do not care about cdb compression
5 < #ifdef ENABLE_COMPRESSION
6 < #undef ENABLE_COMPRESSION
7 < #endif
8 < #include "GCdbYank.h"
4 > // don't care about cdb compression
5 > //#ifdef ENABLE_COMPRESSION
6 > //#undef ENABLE_COMPRESSION
7 > //#endif
8 > //#include "GCdbYank.h"
9  
10   #define USAGE "Usage:\n\
11 < gffread <input_gff..> [-g <genomic_seq_fasta> | <dir>][-s <seq_info.fsize>]\n\
12 < [-o <outfile.gff3>] [-t <tname>] [-r [<chr>:]<start>..<end>] [-i <maxintron>]\n\
13 < [-CTVNMAFGRUVBHSZWTO] [-w <spl_exons.fa>] [-x <spl_cds.fa>] [-y <tran_cds.fa>]\n\
14 < \n\
15 < Filters and/or converts GFF/GTF records.\n\
11 > gffread <input_gff> [-g <genomic_seqs_fasta> | <dir>][-s <seq_info.fsize>] \n\
12 > [-o <outfile.gff>] [-t <tname>] [-r [[<strand>]<chr>:]<start>..<end>] \n\
13 > [-CTVNJMKQAFGRUVBHZWTOLE] [-w <spl_exons.fa>] [-x <spl_cds.fa>] [-y <tr_cds.fa>]\n\
14 > [-i <maxintron>] \n\
15 > Filters and/or converts GFF3/GTF2 records.\n\
16 > <input_gff> is a GFF file, use '-' if the GFF records will be given at stdin\n\
17   \n\
18   Options:\n\
19 <  -g  full path to one fasta file with the genomic sequence\n\
20 <      for all input mappings, OR a path to the directory where\n\
21 <      genomic sequences can be found as single-fasta files\n\
22 <  -s  for mRNA/EST/protein mappings a tab-delimited file provides this info\n\
19 >  -g  full path to a multi-fasta file with the genomic sequences\n\
20 >      for all input mappings, OR a directory with single-fasta files\n\
21 >      (one per genomic sequence, with file names matching sequence names)\n\
22 >  -s  <seq_info.fsize> is a tab-delimited file providing this info\n\
23        for each of the mapped sequences:\n\
24        <seq-name> <seq-length> <seq-description>\n\
25 +      (useful for -A option with mRNA/EST/protein mappings)\n\
26    -i  discard transcripts having an intron larger than <maxintron>\n\
27    -r  only show transcripts crossing coordinate range <start>..<end>\n\
28 <      (on chromosome/contig <chr> if provided)\n\
28 >      (on chromosome/contig <chr>, strand <strand> if provided)\n\
29    -R  for -r option, discard all transcripts that are not fully \n\
30        contained within given range\n\
31    -U  discard single-exon transcripts\n\
32 <  -C  discard mRNAs that have no CDS feature\n\
33 <  -F  keep all attributes from last column of GFF/GTF\n\
32 >  -C  coding only: discard mRNAs that have no CDS feature\n\
33 >  -F  full GFF attribute preservation (all attributes are shown)\n\
34    -G  only parse additional exon attributes from the first exon\n\
35        and move them to the mRNA level (useful for GTF input)\n\
36 <  -A  use the description field from <seq_info.fsize> and add it as\n\
37 <      a descr=.. attribute to the mRNA and/or Gene record\n\
36 >  -A  use the description field from <seq_info.fsize> and add it\n\
37 >      as the value for a 'descr' attribute to the GFF record\n\
38    \n\
39 <  -O  process other (non-mRNA) GFF/GTF records (default is discard non-mRNA\n\
40 <      data). \n\
43 <      Note: non-mRNA records must have only one subfeature per parent feature\n\
39 >  -O  process also non-transcript GFF records (by default non-transcript\n\
40 >      records are ignored)\n\
41    -V  discard any mRNAs with CDS having in-frame stop codons\n\
42    -H  for -V option, check and adjust the starting CDS phase\n\
43        if the original phase leads to a translation with an \n\
44        in-frame stop codon\n\
45    -B  for -V option, single-exon transcripts are also checked on the\n\
46        opposite strand\n\
47 <  -N  only show multi-exon mRNAs if all their introns have the \n\
48 <      typical splice site consensus ( GT-AG, GC-AG or AT-AC )\n\
49 <  -M  discard any mRNAs that either lack initial START codon\n\
47 >  -N  discard multi-exon mRNAs that have any intron with a non-canonical\n\
48 >      splice site consensus (i.e. not GT-AG, GC-AG or AT-AC)\n\
49 >  -J  discard any mRNAs that either lack initial START codon\n\
50        or the terminal STOP codon, or have an in-frame stop codon\n\
51        (only print mRNAs with a fulll, valid CDS)\n\
52   \n\
53 <  -S  sort output GFF records by genomic sequence and start coordinate\n\
54 <      note that this option is automatically enabled by -g option\n\
55 <  -Z  merge close exons into a single exon (intron size<4)\n\
56 <  -t  use <trackname> in the second column of each GFF/GTF output line\n\
57 <  -w  write a fasta file with spliced exons for each mapping\n\
58 <  -x  write a fasta file with spliced CDS for each mapping\n\
59 <  -W  for -w and -x options, also write for each fasta record the exon \n\
53 >  -M/--merge : cluster the input transcripts into loci, collapsing matching\n\
54 >       transcripts (those with the same exact introns and fully contained)\n\
55 >  -d <dupinfo> : for -M option, write collapsing info to file <dupinfo>\n\
56 >  --cluster-only: same as --merge but without collapsing matching transcripts\n\
57 >  -K  for -M option: also collapse shorter, fully contained transcripts\n\
58 >      with fewer introns than the container\n\
59 >  -Q  for -M option, remove the containment restriction:\n\
60 >      (multi-exon transcripts will be collapsed if just their introns match,\n\
61 >      while single-exon transcripts can partially overlap (80%))\n\
62 > \n\
63 >  -E  expose (warn about) duplicate transcript IDs and other potential \n\
64 >      problems with the given GFF/GTF records\n\
65 >  -Z  merge close exons into a single exon (for intron size<4)\n\
66 >  -w  write a fasta file with spliced exons for each GFF transcript\n\
67 >  -x  write a fasta file with spliced CDS for each GFF transcript\n\
68 >  -W  for -w and -x options, also write for each fasta record the exon\n\
69        coordinates projected onto the spliced sequence\n\
70 <  -y  write a protein fasta file with the CDS translation for each mapping\n\
71 <  -o  the output gff3 file with the 'filtered' entries\n\
72 <  -T  output GTF format instead of GFF3\n\
73 <  \n\
74 < "
70 >  -y  write a protein fasta file with the translation of CDS for each record\n\
71 >  -L  Ensembl GTF to GFF3 conversion (implies -F; should be used with -m)\n\
72 >  -m  <chr_replace> is a reference (genomic) sequence replacement table with\n\
73 >      this format:\n\
74 >      <original_ref_ID> <new_ref_ID>\n\
75 >      GFF records on reference sequences that are not found among the\n\
76 >      <original_ref_ID> entries in this file will be filtered out\n\
77 >  -o  the \"filtered\" GFF records will be written to <outfile.gff>\n\
78 >      (use -o- for printing to stdout)\n\
79 >  -t  use <trackname> in the second column of each GFF output line\n\
80 >  -T  -o option will output GTF format instead of GFF3\n\
81 > "
82 >
83 >
84 > class SeqInfo { //populated from the -s option of gffread
85 > public:
86 >  int len;
87 >  char* descr;
88 >  SeqInfo( int l, char* s) {
89 >   len=l;
90 >   if (s==NULL) {
91 >     descr=NULL;
92 >     }   else {
93 >     descr=Gstrdup(s);
94 >     }
95 >   }
96 >  ~SeqInfo() {
97 >   GFREE(descr);
98 >   }
99 > };
100 >
101 > class RefTran {
102 > public:
103 >   char* new_name;
104 >   RefTran(char *ns) {
105 >      new_name=NULL;
106 >      if (ns!=NULL)
107 >         new_name=Gstrdup(ns);
108 >      }
109 >   ~RefTran() {
110 >      GFREE(new_name);
111 >      }
112 > };
113  
114   FILE* ffasta=NULL;
115   FILE* f_in=NULL;
# Line 84 | Line 128
128  
129   bool fullCDSonly=false; // starts with START, ends with STOP codon
130   bool fullattr=false;
131 < bool sortByLoc=false; // if the GFF output should be sorted by location
132 < GStr gseqpath;
133 < bool multiGSeq=false; //if a directory or a .cidx file was given to -g option
131 > //bool sortByLoc=false; // if the GFF output should be sorted by location
132 > bool ensembl_convert=false; //-L, assisst in converting Ensembl GTF to GFF3
133 >
134 >
135 > //GStr gseqpath;
136 > //GStr gcdbfa;
137 > //bool multiGSeq=false; //if a directory or a .cidx file was given to -g option
138 > //GFaSeqGet* faseq=NULL;
139 > //GCdbYank* gcdb=NULL;
140 > //int gseq_id=-1; //current genome sequence ID -- the current GffObj::gseq_id
141   bool fmtGTF=false;
91 int gseq_id=-1; //current genome sequence ID -- the current GffObj::gseq_id
92 GFaSeqGet* faseq=NULL;
93 GCdbYank* gcdb=NULL;
94 GStr gcdbfa;
142   bool addDescr=false;
143   //bool protmap=false;
144   bool multiExon=false;
# Line 101 | Line 148
148   bool mergeCloseExons=false;
149   //range filter:
150   char* rfltGSeq=NULL;
151 + char rfltStrand=0;
152   uint rfltStart=0;
153   uint rfltEnd=MAX_UINT;
154   bool rfltWithin=false; //check for full containment within given range
155   bool noExonAttr=false;
108 class SeqInfo {
109 public:
110  int len;
111  char* descr;
112  SeqInfo( int l, char* s) {
113   len=l;
114   if (s==NULL) {
115     descr=NULL;
116     }   else {
117     descr=Gstrdup(s);
118     }
119   }
120  ~SeqInfo() {
121   GFREE(descr);
122   }
123 };
156  
157 + bool doCluster=false;
158 + bool doCollapseRedundant=false;
159  
160 < class GSpliceSite {
127 < public:
128 <  char nt[3];
129 <  GSpliceSite(const char* c, bool revc=false) {
130 <    nt[2]=0;
131 <    if (c==NULL) {
132 <      nt[0]=0;
133 <      nt[1]=0;
134 <      return;
135 <      }
136 <    if (revc) {
137 <      nt[0]=toupper(ntComplement(c[1]));
138 <      nt[1]=toupper(ntComplement(c[0]));
139 <      }
140 <    else {
141 <      nt[0]=toupper(c[0]);
142 <      nt[1]=toupper(c[1]);
143 <      }
144 <    }
145 <
146 <  GSpliceSite(const char* intron, int intronlen, bool getAcceptor, bool revc=false) {
147 <    nt[2]=0;
148 <    if (intron==NULL || intronlen==0)
149 <       GError("Error: invalid intron or intron len for GSpliceSite()!\n");
150 <    const char* c=intron;
151 <    if (revc) {
152 <      if (!getAcceptor) c+=intronlen-2;
153 <      nt[0]=toupper(ntComplement(c[1]));
154 <      nt[1]=toupper(ntComplement(c[0]));
155 <      }
156 <    else { //on forward strand
157 <      if (getAcceptor) c+=intronlen-2;
158 <      nt[0]=toupper(c[0]);
159 <      nt[1]=toupper(c[1]);
160 <      }//forward strand
161 <    }
162 <
163 <  GSpliceSite(const char n1, const char n2) {
164 <    nt[2]=0;
165 <    nt[0]=toupper(n1);
166 <    nt[1]=toupper(n2);
167 <    }
168 <  bool canonicalDonor() {
169 <    return (nt[0]=='G' && (nt[1]=='C' || nt[1]=='T'));
170 <    }
171 <  bool operator==(GSpliceSite& c) {
172 <    return (c.nt[0]==nt[0] && c.nt[1]==nt[1]);
173 <    }
174 <  bool operator==(GSpliceSite* c) {
175 <    return (c->nt[0]==nt[0] && c->nt[1]==nt[1]);
176 <    }
177 <  bool operator==(const char* c) {
178 <    //return (nt[0]==toupper(c[0]) && nt[1]==toupper(c[1]));
179 <    //assumes given const nucleotides are uppercase already!
180 <    return (nt[0]==c[0] && nt[1]==c[1]);
181 <    }
182 <  bool operator!=(const char* c) {
183 <    //assumes given const nucleotides are uppercase already!
184 <    return (nt[0]!=c[0] || nt[1]!=c[1]);
185 <    }
186 < };
160 > GList<GenomicSeqData> g_data(true,true,true); //list of GFF records by genomic seq
161  
162   //hash with sequence info
163   GHash<SeqInfo> seqinfo;
164   GHash<int> isoCounter; //counts the valid isoforms
165 + GHash<RefTran> reftbl;
166 + GHash<GeneInfo> gene_ids;
167 +  //min-max gene span associated to chr|gene_id (mostly for Ensembl conversion)
168  
169   bool debugMode=false;
170   bool verbose=false;
171  
172 + void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) {
173 +  GLineReader fr(f);
174 +  while (!fr.isEof()) {
175 +      char* line=fr.getLine();
176 +      if (line==NULL) break;
177 +      char* id=line;
178 +      char* lenstr=NULL;
179 +      char* text=NULL;
180 +      char* p=line;
181 +      while (*p!=0 && !isspace(*p)) p++;
182 +      if (*p==0) continue;
183 +      *p=0;p++;
184 +      while (*p==' ' || *p=='\t') p++;
185 +      if (*p==0) continue;
186 +      lenstr=p;
187 +      while (*p!=0 && !isspace(*p)) p++;
188 +      if (*p!=0) { *p=0;p++; }
189 +      while (*p==' ' || *p=='\t') p++;
190 +      if (*p!=0) text=p; //else text remains NULL
191 +      int len=0;
192 +      if (!parseInt(lenstr,len)) {
193 +         GMessage("Warning: could not parse sequence length: %s %s\n",
194 +                  id, lenstr);
195 +         continue;
196 +         }
197 +      // --- here we have finished parsing the line
198 +      si.Add(id, new SeqInfo(len,text));
199 +      } //while lines
200 + }
201 +
202 + void loadRefTable(FILE* f, GHash<RefTran>& rt) {
203 +  GLineReader fr(f);
204 +  char* line=NULL;
205 +  while ((line=fr.getLine())) {
206 +      char* orig_id=line;
207 +      char* p=line;
208 +      while (*p!=0 && !isspace(*p)) p++;
209 +      if (*p==0) continue;
210 +      *p=0;p++;//split the line here
211 +      while (*p==' ' || *p=='\t') p++;
212 +      if (*p==0) continue;
213 +      rt.Add(orig_id, new RefTran(p));
214 +      } //while lines
215 + }
216 +
217   char* getSeqDescr(char* seqid) {
218   static char charbuf[128];
219   if (seqinfo.Count()==0) return NULL;
# Line 231 | Line 253
253    return charbuf;
254   }
255  
256 < void printFasta(FILE* f, GStr& defline, char* seq, int seqlen=-1) {
257 < if (seq==NULL) return;
258 < int len=(seqlen>0)?seqlen:strlen(seq);
237 < if (len<=0) return;
238 < if (!defline.is_empty())
239 <     fprintf(f, ">%s\n",defline.chars());
240 < int ilen=0;
241 < for (int i=0; i < len; i++, ilen++) {
242 <   if (ilen == 70) {
243 <     fputc('\n', f);
244 <     ilen = 0;
245 <     }
246 <   putc(seq[i], f);
247 <   } //for
248 < fputc('\n', f);
249 < }
250 <
251 < void loadSeqInfo(FILE* f, GHash<SeqInfo> &si) {
252 <  GLineReader fr(f);
253 <  while (!fr.isEof()) {
254 <      char* line=fr.getLine();
255 <      if (line==NULL) break;
256 <      char* id=line;
257 <      char* lenstr=NULL;
258 <      char* text=NULL;
259 <      char* p=line;
260 <      while (*p!=0 && !isspace(*p)) p++;
261 <      if (*p==0) continue;
262 <      *p=0;p++;
263 <      while (*p==' ' || *p=='\t') p++;
264 <      if (*p==0) continue;
265 <      lenstr=p;
266 <      while (*p!=0 && !isspace(*p)) p++;
267 <      if (*p!=0) { *p=0;p++; }
268 <      while (*p==' ' || *p=='\t') p++;
269 <      if (*p!=0) text=p; //else text remains NULL
270 <      int len=0;
271 <      if (!parseInt(lenstr,len)) {
272 <         GMessage("Warning: could not parse sequence length: %s %s\n",
273 <                  id, lenstr);
274 <         continue;
275 <         }
276 <      // --- here we have finished parsing the line
277 <      si.Add(id, new SeqInfo(len,text));
278 <      } //while lines
256 > GFaSeqGet* fastaSeqGet(GFastaDb& gfasta, GffObj& gffrec) {
257 >  if (gfasta.fastaPath==NULL) return NULL;
258 >  return gfasta.fetch(gffrec.gseq_id);
259   }
260  
261  
262 < GFaSeqGet* openFastaSeq(const char* gseqname) {
283 <  //this is only called when either we have a cidx or a directory in gseqpath
284 <  GFaSeqGet* r=NULL;
285 <  if (gcdb!=NULL) {
286 <     off_t rpos=gcdb->getRecordPos(gseqname);
287 <     if (rpos<0) // genomic sequence not found
288 <        GError("Error: cannot find genomic sequence '%s' in %s\n",gseqname, gseqpath.chars());
289 <     return new GFaSeqGet(gcdbfa.chars(),rpos, false);// WARNING: does not validate FASTA line-len uniformity!
290 <     }
291 <   else {
292 <     GStr s(gseqpath);
293 <     s.append('/');
294 <     s.append(gseqname);
295 <     if (debugMode) GMessage("[D] >> opening fasta file for genomic sequence %s..\n",gseqname);
296 <     if (fileExists(s.chars())==2) {
297 <         r=new GFaSeqGet(s.chars(),false); // WARNING: does not validate FASTA line-len uniformity!
298 <         r->loadall();
299 <         if (debugMode) GMessage("[D]    << loaded.\n");
300 <         return r;
301 <         }
302 <     s.append(".fa");
303 <     if (fileExists(s.chars())==2) {
304 <         r=new GFaSeqGet(s.chars(), false); // WARNING: does not validate FASTA line-len uniformity!
305 <         r->loadall();
306 <         if (debugMode) GMessage("[D]    << loaded.\n");
307 <         return r;
308 <         }
309 <     s.append("sta");
310 <     if (fileExists(s.chars())==2) {
311 <         return new GFaSeqGet(s.chars(), false); // WARNING: does not validate FASTA line-len uniformity!
312 <         r->loadall();
313 <         if (debugMode) GMessage("[D]    << loaded.\n");
314 <         return r;
315 <         }
316 <     GError("Error: cannot load genomic sequence %s from directory %s !\n",
317 <          gseqname, gseqpath.chars());
318 <     }
319 < return NULL;
320 < }
321 <
322 <
323 < GFaSeqGet* fastaSeqGet(GffObj& mrna) {
324 < if (gseqpath.is_empty()) return NULL;
325 < if (multiGSeq && mrna.gseq_id!=gseq_id) {
326 <     if (faseq!=NULL) {
327 <         delete faseq;
328 <         faseq=NULL;
329 <         }
330 <     faseq=openFastaSeq(mrna.getGSeqName());
331 <     gseq_id=mrna.gseq_id;
332 <     }
333 < return faseq;
334 < }
335 <
336 < void adjust_stopcodon(GffObj& mrna, int adj, GList<GSeg>* seglst=NULL) {
262 > int adjust_stopcodon(GffObj& gffrec, int adj, GList<GSeg>* seglst=NULL) {
263   //adj>0 => extedn CDS,  adj<0 => shrink CDS
264   //when CDS is expanded, exons have to be checked too and
265   // expanded accordingly if they had the same boundary
266 <  if (mrna.strand=='-') {
267 <       if ((int)mrna.CDstart>adj) {
268 <           mrna.CDstart-=adj;
269 <             if (adj>0 && mrna.exons.First()->start>=mrna.CDstart) {
270 <                 mrna.exons.First()->start-=adj;
271 <                 mrna.start=mrna.exons.First()->start;
272 <                 mrna.covlen+=adj;
266 >  int realadj=0;
267 >  if (gffrec.strand=='-') {
268 >       if ((int)gffrec.CDstart>adj) {
269 >
270 >           gffrec.CDstart-=adj;
271 >           realadj=adj;
272 >           if (adj<0) { //restore
273 >              if (gffrec.exons.First()->start==gffrec.CDstart+adj) {
274 >                 gffrec.exons.First()->start-=adj;
275 >                 gffrec.start=gffrec.exons.First()->start;
276 >                 gffrec.covlen+=adj;
277 >                 }
278 >              }
279 >           else if (gffrec.exons.First()->start>=gffrec.CDstart) {
280 >                 gffrec.exons.First()->start-=adj;
281 >                 gffrec.start=gffrec.exons.First()->start;
282 >                 gffrec.covlen+=adj;
283                   }
284               }
285            }
286          else {
287 <         mrna.CDend+=adj;
288 <         if (adj>0 && mrna.exons.Last()->end<=mrna.CDend) {
289 <             mrna.exons.Last()->end+=adj;
290 <             mrna.end=mrna.exons.Last()->end;
291 <             mrna.covlen+=adj;
287 >         realadj=adj;
288 >         gffrec.CDend+=adj;
289 >         if (adj<0) {//restore
290 >           if (gffrec.exons.Last()->end==gffrec.CDend-adj) {
291 >                        gffrec.exons.Last()->end+=adj;
292 >                        gffrec.end=gffrec.exons.Last()->end;
293 >                        gffrec.covlen+=adj;
294 >                        }
295 >          }
296 >         else if (gffrec.exons.Last()->end<=gffrec.CDend) {
297 >             gffrec.exons.Last()->end+=adj;
298 >             gffrec.end=gffrec.exons.Last()->end;
299 >             gffrec.covlen+=adj;
300               }
301           }
302    if (seglst!=NULL) seglst->Last()->end+=adj;
303 +  return realadj;
304   }
305  
306 < void process_mRNA(GffObj& mrna) {
307 < GStr gene(mrna.getGene());
308 < GStr defline(mrna.getID());
309 < if (!gene.is_empty() && strcmp(gene.chars(),mrna.getID())!=0) {
310 <   int* isonum=isoCounter.Find(gene.chars());
311 <   if (isonum==NULL) {
312 <      isonum=new int(1);
313 <      isoCounter.Add(mrna.getGene(),isonum);
306 > bool process_transcript(GFastaDb& gfasta, GffObj& gffrec) {
307 > //returns true if the transcript passed the filter
308 > char* gname=gffrec.getGeneName();
309 > if (gname==NULL) gname=gffrec.getGeneID();
310 > GStr defline(gffrec.getID());
311 > if (f_out && !fmtGTF) {
312 >     const char* tn=NULL;
313 >     if ((tn=gffrec.getAttr("transcript_name"))!=NULL) {
314 >        gffrec.addAttr("Name", tn);
315 >        gffrec.removeAttr("transcript_name");
316 >        }
317 >     }
318 > if (ensembl_convert && startsWith(gffrec.getID(), "ENS")) {
319 >      const char* tn=gffrec.getTrackName();
320 >      gffrec.addAttr("type", tn);
321 >      //bool is_gene=false;
322 >      bool is_pseudo=false;
323 >      if (strcmp(tn, "protein_coding")==0 || gffrec.hasCDS())
324 >                gffrec.setFeatureName("mRNA");
325 >       else {
326 >          if (strcmp(tn, "processed_transcript")==0)
327 >              gffrec.setFeatureName("proc_RNA");
328 >            else {
329 >              //is_gene=endsWith(tn, "gene");
330 >              is_pseudo=strifind(tn, "pseudo");
331 >              if (is_pseudo) {
332 >                   gffrec.setFeatureName("pseudo_RNA");
333 >                   }
334 >                else if (endsWith(tn, "RNA")) {
335 >                   gffrec.setFeatureName(tn);
336 >                   } else gffrec.setFeatureName("misc_RNA");
337 >              }
338 >          }
339        }
340 <     else (*isonum)++;
341 <  defline.appendfmt(" gene=%s", gene.chars());
342 <  }
340 > if (gname && strcmp(gname, gffrec.getID())!=0) {
341 >   int* isonum=isoCounter.Find(gname);
342 >   if  (isonum==NULL) {
343 >       isonum=new int(1);
344 >       isoCounter.Add(gname,isonum);
345 >       }
346 >      else (*isonum)++;
347 >   defline.appendfmt(" gene=%s", gname);
348 >   }
349    int seqlen=0;
350 +
351    const char* tlabel=tracklabel;
352 <  if (tlabel==NULL) tlabel=mrna.getTrackName();
352 >  if (tlabel==NULL) tlabel=gffrec.getTrackName();
353    //defline.appendfmt(" track:%s",tlabel);
354    char* cdsnt = NULL;
355    char* cdsaa = NULL;
356    int aalen=0;
357 <  for (int i=1;i<mrna.exons.Count();i++) {
358 <     int ilen=mrna.exons[i]->start-mrna.exons[i-1]->end-1;
359 <     if (ilen>2000000)
357 >  for (int i=1;i<gffrec.exons.Count();i++) {
358 >     int ilen=gffrec.exons[i]->start-gffrec.exons[i-1]->end-1;
359 >     if (ilen>4000000)
360              GMessage("Warning: very large intron (%d) for transcript %s\n",
361 <                           ilen, mrna.getID());
361 >                           ilen, gffrec.getID());
362       if (ilen>maxintron) {
363 <         return;
363 >         return false;
364           }
365       }
366    GList<GSeg> seglst(false,true);
367 <  fastaSeqGet(mrna);
368 <  if (spliceCheck && mrna.exons.Count()>1) {
367 >  GFaSeqGet* faseq=fastaSeqGet(gfasta, gffrec);
368 >  if (spliceCheck && gffrec.exons.Count()>1) {
369      //check introns for splice site consensi ( GT-AG, GC-AG or AT-AC )
370      if (faseq==NULL) GError("Error: no genomic sequence available!\n");
371 <    int glen=mrna.end-mrna.start+1;
372 <    const char* gseq=faseq->subseq(mrna.start, glen);
373 <    bool revcompl=(mrna.strand=='-');
371 >    int glen=gffrec.end-gffrec.start+1;
372 >    const char* gseq=faseq->subseq(gffrec.start, glen);
373 >    bool revcompl=(gffrec.strand=='-');
374      bool ssValid=true;
375 <    for (int e=1;e<mrna.exons.Count();e++) {
376 <      const char* intron=gseq+mrna.exons[e-1]->end+1-mrna.start;
377 <      //debug
401 <      //uint istart=mrna.exons[e-1]->end+1;
402 <      //uint iend=mrna.exons[e]->start-1;
403 <      int intronlen=mrna.exons[e]->start-mrna.exons[e-1]->end-1;
375 >    for (int e=1;e<gffrec.exons.Count();e++) {
376 >      const char* intron=gseq+gffrec.exons[e-1]->end+1-gffrec.start;
377 >      int intronlen=gffrec.exons[e]->start-gffrec.exons[e-1]->end-1;
378        GSpliceSite acceptorSite(intron,intronlen,true, revcompl);
379        GSpliceSite    donorSite(intron,intronlen, false, revcompl);
380        //GMessage("%c intron %d-%d : %s .. %s\n",
381 <      //           mrna.strand, istart, iend, donorSite.nt, acceptorSite.nt);
381 >      //           gffrec.strand, istart, iend, donorSite.nt, acceptorSite.nt);
382        if (acceptorSite=="AG") { // GT-AG or GC-AG
383           if (!donorSite.canonicalDonor()) {
384              ssValid=false;break;
# Line 418 | Line 392
392      //GFREE(gseq);
393      if (!ssValid) {
394        if (verbose)
395 <         GMessage("Invalid splice sites found for '%s'\n",mrna.getID());
396 <      return; //don't print this one!
395 >         GMessage("Invalid splice sites found for '%s'\n",gffrec.getID());
396 >      return false; //don't print this one!
397        }
398      }
399  
400    bool trprint=true;
401 +  int stopCodonAdjust=0;
402    int mCDphase=0;
403 <  if (mrna.CDphase=='1' || mrna.CDphase=='2')
404 <      mCDphase = mrna.CDphase-'0';
403 >  bool hasStop=false;
404 >  if (gffrec.CDphase=='1' || gffrec.CDphase=='2')
405 >      mCDphase = gffrec.CDphase-'0';
406    if (f_y!=NULL || f_x!=NULL || validCDSonly) {
407      if (faseq==NULL) GError("Error: no genomic sequence provided!\n");
408      //if (protmap && fullCDSonly) {
409 <    //if (protmap && (fullCDSonly ||  (mrna.qlen>0 && mrna.qend==mrna.qlen))) {
409 >    //if (protmap && (fullCDSonly ||  (gffrec.qlen>0 && gffrec.qend==gffrec.qlen))) {
410 >    
411      if (validCDSonly) { //make sure the stop codon is always included
412 <      adjust_stopcodon(mrna,3);
412 >      //adjust_stopcodon(gffrec,3);
413 >      stopCodonAdjust=adjust_stopcodon(gffrec,3);
414        }
415      int strandNum=0;
416      int phaseNum=0;
417    CDS_CHECK:
418 <    cdsnt=mrna.getSpliced(faseq, true, &seqlen,NULL,NULL,&seglst);
418 >    cdsnt=gffrec.getSpliced(faseq, true, &seqlen,NULL,NULL,&seglst);
419      if (cdsnt==NULL) trprint=false;
420      if (validCDSonly) {
421         cdsaa=translateDNA(cdsnt, aalen, seqlen);
422         char* p=strchr(cdsaa,'.');
423 <       bool hasStop=false;
423 >       hasStop=false;
424         if (p!=NULL) {
425              if (p-cdsaa>=aalen-2) { //stop found as the last codon
426                      *p='0';//remove it
# Line 450 | Line 428
428                      if (aalen-2==p-cdsaa) {
429                        //previous to last codon is the stop codon
430                        //so correct the CDS stop accordingly
431 <                      adjust_stopcodon(mrna,-3, &seglst);
431 >                      adjust_stopcodon(gffrec,-3, &seglst);
432 >                      stopCodonAdjust=0; //clear artificial stop adjustment
433                        seqlen-=3;
434                        cdsnt[seqlen]=0;
435                        }
# Line 464 | Line 443
443           //in-frame stop codon found
444           if (altPhases && phaseNum<3) {
445              phaseNum++;
446 <            mrna.CDphase = '0'+((mCDphase+phaseNum)%3);
446 >            gffrec.CDphase = '0'+((mCDphase+phaseNum)%3);
447              GFREE(cdsaa);
448              goto CDS_CHECK;
449              }
450 <         if (mrna.exons.Count()==1 && bothStrands) {
450 >         if (gffrec.exons.Count()==1 && bothStrands) {
451              strandNum++;
452              phaseNum=0;
453              if (strandNum<2) {
454                 GFREE(cdsaa);
455 <               mrna.strand = (mrna.strand=='-') ? '+':'-';
455 >               gffrec.strand = (gffrec.strand=='-') ? '+':'-';
456                 goto CDS_CHECK; //repeat the CDS check for a different frame
457                 }
458              }
459 <         if (verbose) GMessage("In-frame STOP found for '%s'\n",mrna.getID());
459 >         if (verbose) GMessage("In-frame STOP found for '%s'\n",gffrec.getID());
460           } //has in-frame STOP
461         if (fullCDSonly) {
462             if (!hasStop || cdsaa[0]!='M') trprint=false;
# Line 487 | Line 466
466    if (!trprint) {
467      GFREE(cdsnt);
468      GFREE(cdsaa);
469 <    return;
469 >    return false;
470      }
471 <  if (f_out!=NULL) {
472 <     if (fmtGTF) mrna.printGtf(f_out,tracklabel);
473 <            else mrna.printGff(f_out, tracklabel);
474 <     }
471 >  if (stopCodonAdjust>0 && !hasStop) {
472 >          //restore stop codon location
473 >          adjust_stopcodon(gffrec, -stopCodonAdjust, &seglst);
474 >          if (cdsnt!=NULL && seqlen>0) {
475 >             seqlen-=stopCodonAdjust;
476 >             cdsnt[seqlen]=0;
477 >             }
478 >          if (cdsaa!=NULL) aalen--;
479 >          }
480 >
481    if (f_y!=NULL) { //CDS translation fasta output requested
482           //char*
483           if (cdsaa==NULL) { //translate now if not done before
484             cdsaa=translateDNA(cdsnt, aalen, seqlen);
485             }
486 <         if (fullattr && mrna.attrs!=NULL) {
486 >         if (fullattr && gffrec.attrs!=NULL) {
487               //append all attributes found for each transcripts
488 <              for (int i=0;i<mrna.attrs->Count();i++) {
488 >              for (int i=0;i<gffrec.attrs->Count();i++) {
489                  defline.append(" ");
490 <                defline.append(mrna.getAttrName(i));
490 >                defline.append(gffrec.getAttrName(i));
491                  defline.append("=");
492 <                defline.append(mrna.getAttrValue(i));
492 >                defline.append(gffrec.getAttrValue(i));
493                  }
494                }
495           printFasta(f_y, defline, cdsaa, aalen);
# Line 512 | Line 497
497     if (f_x!=NULL) { //CDS only
498           if (writeExonSegs) {
499                defline.append(" loc:");
500 <              defline.append(mrna.getGSeqName());
501 <              defline.appendfmt("(%c)",mrna.strand);
500 >              defline.append(gffrec.getGSeqName());
501 >              defline.appendfmt("(%c)",gffrec.strand);
502                //warning: not CDS coordinates are written here, but the exon ones
503 <              defline+=(int)mrna.start;
503 >              defline+=(int)gffrec.start;
504                defline+=(char)'-';
505 <              defline+=(int)mrna.end;
505 >              defline+=(int)gffrec.end;
506                // -- here these are CDS substring coordinates on the spliced sequence:
507                defline.append(" segs:");
508                for (int i=0;i<seglst.Count();i++) {
# Line 527 | Line 512
512                    defline+=(int)seglst[i]->end;
513                    }
514                }
515 <         if (fullattr && mrna.attrs!=NULL) {
515 >         if (fullattr && gffrec.attrs!=NULL) {
516               //append all attributes found for each transcript
517 <              for (int i=0;i<mrna.attrs->Count();i++) {
517 >              for (int i=0;i<gffrec.attrs->Count();i++) {
518                  defline.append(" ");
519 <                defline.append(mrna.getAttrName(i));
519 >                defline.append(gffrec.getAttrName(i));
520                  defline.append("=");
521 <                defline.append(mrna.getAttrValue(i));
521 >                defline.append(gffrec.getAttrValue(i));
522                  }
523                }
524           printFasta(f_x, defline, cdsnt, seqlen);
# Line 544 | Line 529
529      uint cds_start=0;
530      uint cds_end=0;
531      seglst.Clear();
532 <    char* exont=mrna.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
532 >    char* exont=gffrec.getSpliced(faseq, false, &seqlen, &cds_start, &cds_end, &seglst);
533      if (exont!=NULL) {
534 <    if (mrna.CDstart>0) {
534 >    if (gffrec.CDstart>0) {
535          defline.appendfmt(" CDS=%d-%d", cds_start, cds_end);
536          }
537        if (writeExonSegs) {
538          defline.append(" loc:");
539 <        defline.append(mrna.getGSeqName());
539 >        defline.append(gffrec.getGSeqName());
540          defline+=(char)'|';
541 <        defline+=(int)mrna.start;
541 >        defline+=(int)gffrec.start;
542          defline+=(char)'-';
543 <        defline+=(int)mrna.end;
543 >        defline+=(int)gffrec.end;
544          defline+=(char)'|';
545 <        defline+=(char)mrna.strand;
545 >        defline+=(char)gffrec.strand;
546          defline.append(" exons:");
547 <        for (int i=0;i<mrna.exons.Count();i++) {
547 >        for (int i=0;i<gffrec.exons.Count();i++) {
548                  if (i>0) defline.append(",");
549 <                defline+=(int)mrna.exons[i]->start;
549 >                defline+=(int)gffrec.exons[i]->start;
550                  defline.append("-");
551 <                defline+=(int)mrna.exons[i]->end;
551 >                defline+=(int)gffrec.exons[i]->end;
552                  }
553          defline.append(" segs:");
554          for (int i=0;i<seglst.Count();i++) {
# Line 573 | Line 558
558              defline+=(int)seglst[i]->end;
559              }
560          }
561 <      if (fullattr && mrna.attrs!=NULL) {
561 >      if (fullattr && gffrec.attrs!=NULL) {
562         //append all attributes found for each transcripts
563 <        for (int i=0;i<mrna.attrs->Count();i++) {
563 >        for (int i=0;i<gffrec.attrs->Count();i++) {
564            defline.append(" ");
565 <          defline.append(mrna.getAttrName(i));
565 >          defline.append(gffrec.getAttrName(i));
566            defline.append("=");
567 <          defline.append(mrna.getAttrValue(i));
567 >          defline.append(gffrec.getAttrValue(i));
568            }
569          }
570        printFasta(f_w, defline, exont, seqlen);
571        GFREE(exont);
572        }
573      } //writing f_w (spliced exons)
574 < return;
574 > return true;
575   }
576  
577   void openfw(FILE* &f, GArgs& args, char opt) {
# Line 604 | Line 589
589   #define FWCLOSE(fh) if (fh!=NULL && fh!=stdout) fclose(fh)
590   #define FRCLOSE(fh) if (fh!=NULL && fh!=stdin) fclose(fh)
591  
592 + void printGff3Header(FILE* f, GArgs& args) {
593 +  fprintf(f, "# ");
594 +  args.printCmdLine(f);
595 +  fprintf(f, "##gff-version 3\n");
596 +  //for (int i=0;i<gseqdata.Count();i++) {
597 +  //
598 +  //}
599 +  }
600 +
601 + bool validateGffRec(GffObj* gffrec, GList<GffObj>* gfnew) {
602 +  if (reftbl.Count()>0) {
603 +        GStr refname(gffrec->getRefName());
604 +        RefTran* rt=reftbl.Find(refname.chars());
605 +        if (rt==NULL && refname[-2]=='.' && isdigit(refname[-1])) {
606 +           //try removing the version
607 +           refname.cut(-2);
608 +           //GMessage("[DEBUG] Trying ref name '%s'...\n", refname.chars());
609 +           rt=reftbl.Find(refname.chars());
610 +           }
611 +        if (rt) {
612 +          gffrec->setRefName(rt->new_name);
613 +          }
614 +         else return false; //discard, ref seq not in the given translation table
615 +        }
616 +      if (mRNAOnly && gffrec->isDiscarded()) {
617 +       //discard generic "gene" or "locus" features with no other detailed subfeatures
618 +       //GMessage("Warning: discarding %s GFF generic gene/locus container %s\n",m->getID());
619 +       return false;
620 +       }
621 +      /*
622 +      if (gffrec->exons.Count()==0  && gffrec->children.Count()==0)) {
623 +        //a non-mRNA feature with no subfeatures
624 +        //just so we get some sequence functions working, add a dummy "exon"-like subfeature here
625 +        //--this could be a single "pseudogene" entry or another genomic region without exons
626 +        //
627 +        gffrec->addExon(gffrec->start,gffrec->end);
628 +        }
629 +      */  
630 +     if (rfltGSeq!=NULL) { //filter by gseqName
631 +        if (strcmp(gffrec->getGSeqName(),rfltGSeq)!=0) {
632 +          return false;
633 +          }
634 +        }
635 +     if (rfltStrand>0 && gffrec->strand !=rfltStrand) {
636 +        return false;
637 +        }
638 +     //check coordinates
639 +     if (rfltStart!=0 || rfltEnd!=MAX_UINT) {
640 +       if (rfltWithin) {
641 +         if (gffrec->start<rfltStart || gffrec->end>rfltEnd) {
642 +            return false;
643 +            }
644 +         }
645 +       else {
646 +         if (gffrec->start>rfltEnd || gffrec->end<rfltStart) {
647 +           return false;
648 +           }
649 +         }
650 +       }
651 +     if (multiExon && gffrec->exons.Count()<=1) {
652 +         return false;
653 +         }
654 +   if (wCDSonly && gffrec->CDstart==0) {
655 +         return false;
656 +         }
657 +   if (ensembl_convert && startsWith(gffrec->getID(), "ENS")) {
658 +       //keep track of chr|gene_id data -- coordinate range
659 +       char* geneid=gffrec->getGeneID();
660 +       if (geneid!=NULL) {
661 +         GeneInfo* ginfo=gene_ids.Find(geneid);
662 +         if (ginfo==NULL) {//first time seeing this gene ID
663 +                   GeneInfo* geneinfo=new GeneInfo(gffrec, ensembl_convert);
664 +                   gene_ids.Add(geneid, geneinfo);
665 +                   if (gfnew!=NULL) gfnew->Add(geneinfo->gf);
666 +                   }
667 +             else ginfo->update(gffrec);
668 +         }
669 +       }
670 +  return true;
671 + }
672 +
673 +
674   int main(int argc, char * const argv[]) {
675 < GArgs args(argc, argv, "vOUNHWCVMNSXTDAPRZFGg:i:r:s:t:a:b:o:w:x:y:MINCOV=MINPID=");
676 < int e;
677 < if ((e=args.isError())>0)
678 <    GError("%s\nInvalid argument: %s\n", USAGE, argv[e]);
679 < if (args.getOpt('h')!=NULL) GError("%s\n", USAGE);
680 < debugMode=(args.getOpt('D')!=NULL);
675 > GArgs args(argc, argv,
676 >   "debug;merge;cluster-only;help;MINCOV=MINPID=hvOUNHWCVJMKQNSXTDAPRZFGLEm:g:i:r:s:t:a:b:o:w:x:y:d:");
677 > args.printError(USAGE, true);
678 > if (args.getOpt('h') || args.getOpt("help")) {
679 >    GMessage("%s",USAGE);
680 >    exit(1);
681 >    }
682 > debugMode=(args.getOpt("debug")!=NULL);
683   mRNAOnly=(args.getOpt('O')==NULL);
684 < sortByLoc=(args.getOpt('S')!=NULL);
684 > //sortByLoc=(args.getOpt('S')!=NULL);
685   addDescr=(args.getOpt('A')!=NULL);
686   verbose=(args.getOpt('v')!=NULL);
687   wCDSonly=(args.getOpt('C')!=NULL);
# Line 620 | Line 689
689   altPhases=(args.getOpt('H')!=NULL);
690   fmtGTF=(args.getOpt('T')!=NULL); //switch output format to GTF
691   bothStrands=(args.getOpt('B')!=NULL);
692 < fullCDSonly=(args.getOpt('M')!=NULL);
692 > fullCDSonly=(args.getOpt('J')!=NULL);
693   spliceCheck=(args.getOpt('N')!=NULL);
694 + bool matchAllIntrons=(args.getOpt('K')==NULL);
695 + bool fuzzSpan=(args.getOpt('Q')!=NULL);
696 + if (args.getOpt('M') || args.getOpt("merge")) {
697 +    doCluster=true;
698 +    doCollapseRedundant=true;
699 +    }
700 +   else {
701 +    if (!matchAllIntrons || fuzzSpan) {
702 +      GMessage("%s",USAGE);
703 +      GMessage("Error: -K or -Q options require -M/--merge option!\n");
704 +      exit(1);
705 +      }
706 +    }
707 + if (args.getOpt("cluster-only")) {
708 +    doCluster=true;
709 +    doCollapseRedundant=false;
710 +    if (!matchAllIntrons || fuzzSpan) {
711 +      GMessage("%s",USAGE);
712 +      GMessage("Error: -K or -Q options have no effect with --cluster-only.\n");
713 +      exit(1);
714 +      }
715 +    }
716   //protmap=(args.getOpt('P')!=NULL);
717   if (fullCDSonly) validCDSonly=true;
718 + if (verbose) {
719 +     fprintf(stderr, "Command line was:\n");
720 +     args.printCmdLine(stderr);
721 +     }
722 +
723   fullattr=(args.getOpt('F')!=NULL);
724   if (args.getOpt('G')==NULL)
725      noExonAttr=!fullattr;
# Line 631 | Line 727
727       noExonAttr=true;
728       fullattr=true;
729       }
730 + ensembl_convert=(args.getOpt('L')!=NULL);
731 + if (ensembl_convert) {
732 +    fullattr=true;
733 +    noExonAttr=false;
734 +    //sortByLoc=true;
735 +    }
736 +    
737   mergeCloseExons=(args.getOpt('Z')!=NULL);
738   multiExon=(args.getOpt('U')!=NULL);
739   writeExonSegs=(args.getOpt('W')!=NULL);
740   tracklabel=args.getOpt('t');
741 < gseqpath=args.getOpt('g');
742 < if (!gseqpath.is_empty()) {
743 <   sortByLoc=true;
641 <   int c=fileExists(gseqpath.chars());
642 <   if (c==1)
643 <       multiGSeq=true;//directory given
644 <   else if (c==2) { //single file given
645 <         if (gseqpath.rindex(".cidx")==gseqpath.length()-5) {
646 <            //cdbyank index given
647 <            gcdb=new GCdbYank(gseqpath.chars());
648 <            if (fileExists(gcdb->getDbName())) {
649 <               gcdbfa=gcdb->getDbName();
650 <               } else {
651 <               gcdbfa=gseqpath;
652 <               gcdbfa.chomp(".cidx");
653 <               if (!fileExists(gcdbfa.chars()))
654 <                   GError("Error: cannot locate the fasta file for index %s !\n",gseqpath.chars());
655 <               }
656 <            multiGSeq=true;
657 <            }
658 <         else { //must be just the one single fasta file
659 <           multiGSeq=false;
660 <           faseq=new GFaSeqGet(gseqpath.chars(), false); // WARNING: does not validate FASTA line-len uniformity!
661 <           }
662 <         }//fasta file given
663 <     else GError("Error: cannot find genomic sequence path %s !\n",gseqpath.chars());
664 <   } // -g option
741 > GFastaDb gfasta(args.getOpt('g'));
742 > //if (gfasta.fastaPath!=NULL)
743 > //    sortByLoc=true; //enforce sorting by chromosome/contig
744   GStr s=args.getOpt('i');
745   if (!s.is_empty()) maxintron=s.asInt();
746 +
747 + FILE* f_repl=NULL;
748 + s=args.getOpt('d');
749 + if (!s.is_empty()) {
750 +   if (s=="-") f_repl=stdout;
751 +     else {
752 +       f_repl=fopen(s.chars(), "w");
753 +       if (f_repl==NULL) GError("Error creating file %s\n", s.chars());
754 +       }
755 +   }
756 +
757   rfltWithin=(args.getOpt('R')!=NULL);
758   s=args.getOpt('r');
759   if (!s.is_empty()) {
760     s.trim();
761 +   if (s[0]=='+' || s[0]=='-') {
762 +     rfltStrand=s[0];
763 +     s.cut(0,1);
764 +     }
765     int isep=s.index(':');
766 <   if (isep>=0) { //gseq name given
767 <      if (isep>0) rfltGSeq=Gstrdup((s.substr(0,isep)).chars());
766 >   if (isep>0) { //gseq name given
767 >      if (rfltStrand==0 && (s[isep-1]=='+' || s[isep-1]=='-')) {
768 >        isep--;
769 >        rfltStrand=s[isep];
770 >        s.cut(isep,1);
771 >        }
772 >      if (isep>0)
773 >          rfltGSeq=Gstrdup((s.substr(0,isep)).chars());
774        s.cut(0,isep+1);
775        }
776     GStr gsend;
777 +   char slast=s[s.length()-1];
778 +   if (rfltStrand==0 && (slast=='+' || slast=='-')) {
779 +      s.chomp(slast);
780 +      rfltStrand=slast;
781 +      }
782     if (s.index("..")>=0) gsend=s.split("..");
783 <                   else gsend=s.split('-');
783 >                    else gsend=s.split('-');
784     if (!s.is_empty()) rfltStart=(uint)s.asInt();
785 <   if (!gsend.is_empty()) rfltEnd=(uint)gsend.asInt();
785 >   if (!gsend.is_empty()) {
786 >      rfltEnd=(uint)gsend.asInt();
787 >      if (rfltEnd==0) rfltEnd=MAX_UINT;
788 >      }
789 >  
790     } //gseq/range filtering
791   else {
792     if (rfltWithin)
793       GError("Error: option -R doesn't make sense without -r!\n");
794     }
795 + s=args.getOpt('m');
796 + if (!s.is_empty()) {
797 +   FILE* ft=fopen(s,"r");
798 +   if (ft==NULL) GError("Error opening reference table: %s\n",s.chars());
799 +   loadRefTable(ft, reftbl);
800 +   fclose(ft);
801 +   }
802   s=args.getOpt('s');
803   if (!s.is_empty()) {
804 <  FILE* fsize=fopen(s,"r");
805 <  if (fsize==NULL) GError("Error opening info file: %s\n",s.chars());
806 <  loadSeqInfo(fsize, seqinfo);
807 < }
808 < /*
809 < openfw(fgtfok, args, 'a');
694 < openfw(fgtfbad, args, 'b');
695 < */
804 >   FILE* fsize=fopen(s,"r");
805 >   if (fsize==NULL) GError("Error opening info file: %s\n",s.chars());
806 >   loadSeqInfo(fsize, seqinfo);
807 >   fclose(fsize);
808 >   }
809 >
810   openfw(f_out, args, 'o');
811   //if (f_out==NULL) f_out=stdout;
812 < if (gseqpath.is_empty() && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL))
812 > if (gfasta.fastaPath==NULL && (validCDSonly || spliceCheck || args.getOpt('w')!=NULL || args.getOpt('x')!=NULL || args.getOpt('y')!=NULL))
813    GError("Error: -g option is required for options -w, -x, -y, -V, -N, -M !\n");
814  
815   openfw(f_w, args, 'w');
# Line 703 | Line 817
817   openfw(f_y, args, 'y');
818   if (f_y!=NULL || f_x!=NULL) wCDSonly=true;
819   //useBadCDS=useBadCDS || (fgtfok==NULL && fgtfbad==NULL && f_y==NULL && f_x==NULL);
820 < GStr infile;
821 < if (args.startNonOpt()) {
822 <        infile=args.nextNonOpt();
823 <        //GMessage("Given file: %s\n",infile.chars());
824 <        }
825 < if (!infile.is_empty()) {
826 <    f_in=fopen(infile, "r");
827 <    if (f_in==NULL)
828 <       GError("Cannot open input file %s!\n",infile.chars());
829 <    }
830 <  else
831 <    f_in=stdin;
832 <
719 < //GffReader* gfreader=new GffReader(f_in,true);
720 < GffReader* gfreader=new GffReader(f_in, mRNAOnly, sortByLoc);
721 < gfreader->readAll(fullattr, mergeCloseExons, noExonAttr);
722 < if (debugMode) GMessage("[D] gff data loaded, now processing each entry\n");
723 < //GList<GffObj> mrnas(true,true,false);
724 < for (int m=0;m<gfreader->gflst.Count();m++) {
725 <   GffObj* mrna=gfreader->gflst[m];
726 <   if (mrna->exons.Count()==0) {
727 <      //a non-mRNA feature with no subfeatures
728 <      //yet just so we get some sequence functions working, add the dummy "exon" here
729 <      mrna->addExon(mrna->start,mrna->end);
730 <      }
731 <   if (rfltGSeq!=NULL) { //filter by gseqName
732 <      if (strcmp(mrna->getGSeqName(),rfltGSeq)!=0) {
733 <        delete mrna;
734 <        gfreader->gflst.Forget(m);
735 <        continue;
736 <        }
737 <      }
738 <   //check coordinates
739 <   if (rfltStart!=0 || rfltEnd!=MAX_UINT) {
740 <     if (rfltWithin) {
741 <       if (mrna->start<rfltStart || mrna->end>rfltEnd) {
742 <          delete mrna;
743 <          gfreader->gflst.Forget(m);
744 <          continue;
820 >
821 > int numfiles = args.startNonOpt();
822 > //GList<GffObj> gfkept(false,true); //unsorted, free items on delete
823 > int out_counter=0; //number of records printed
824 > while (true) {
825 >   GStr infile;
826 >   if (numfiles) {
827 >          infile=args.nextNonOpt();
828 >          if (infile.is_empty()) break;
829 >          if (infile=="-") { f_in=stdin; infile="stdin"; }
830 >               else
831 >                 if ((f_in=fopen(infile, "r"))==NULL)
832 >                    GError("Error: cannot open input file %s!\n",infile.chars());
833            }
834 <       }
835 <     else {
836 <       if (mrna->start>rfltEnd || mrna->end<rfltStart) {
837 <         delete mrna;
838 <         gfreader->gflst.Forget(m);
839 <         continue;
834 >        else
835 >          infile="-";
836 >   GffLoader gffloader(infile.chars());
837 >   gffloader.transcriptsOnly=mRNAOnly;
838 >   gffloader.fullAttributes=fullattr;
839 >   gffloader.noExonAttrs=noExonAttr;
840 >   gffloader.mergeCloseExons=mergeCloseExons;
841 >   gffloader.showWarnings=(args.getOpt('E')!=NULL);
842 >   gffloader.load(g_data, &validateGffRec, doCluster, doCollapseRedundant,
843 >                             matchAllIntrons, fuzzSpan);
844 >   if (doCluster)
845 >     collectLocusData(g_data);
846 >   if (numfiles==0) break;
847 >   }
848 >  
849 > GStr loctrack("gffcl");
850 > if (tracklabel) loctrack=tracklabel;
851 > g_data.setSorted(&gseqCmpName);
852 > if (doCluster) {
853 >   //grouped in loci
854 >   for (int g=0;g<g_data.Count();g++) {
855 >     GenomicSeqData* gdata=g_data[g];
856 >     for (int l=0;l<gdata->loci.Count();l++) {
857 >       GffLocus& loc=*(gdata->loci[l]);
858 >       //check all non-replaced transcripts in this locus:
859 >       int numvalid=0;
860 >       int idxfirstvalid=-1;
861 >       for (int i=0;i<loc.rnas.Count();i++) {
862 >         GffObj& t=*(loc.rnas[i]);
863 >         GTData* tdata=(GTData*)(t.uptr);
864 >         if (tdata->replaced_by!=NULL) {
865 >            if (f_repl && (t.udata & 8)==0) {
866 >               //t.udata|=8;
867 >               fprintf(f_repl, "%s", t.getID());
868 >               GTData* rby=tdata;
869 >               while (rby->replaced_by!=NULL) {
870 >                  fprintf(f_repl," => %s", rby->replaced_by->getID());
871 >                  rby->rna->udata|=8;
872 >                  rby=(GTData*)(rby->replaced_by->uptr);
873 >                  }
874 >               fprintf(f_repl, "\n");
875 >               }
876 >            continue;
877 >            }
878 >         if (process_transcript(gfasta, t)) {
879 >             t.udata|=4; //tag it as valid
880 >             numvalid++;
881 >             if (idxfirstvalid<0) idxfirstvalid=i;
882 >             }
883           }
884 +      
885 +       if (f_out && numvalid>0) {
886 +         GStr locname("RLOC_");
887 +         locname.appendfmt("%08d",loc.locus_num);
888 +         if (!fmtGTF) {
889 +           if (out_counter==0)
890 +              printGff3Header(f_out, args);
891 +           fprintf(f_out,"%s\t%s\tlocus\t%d\t%d\t.\t%c\t.\tID=%s;locus=%s",
892 +                    loc.rnas[0]->getGSeqName(), loctrack.chars(), loc.start, loc.end, loc.strand,
893 +                     locname.chars(), locname.chars());
894 +           //const char* loc_gname=loc.getGeneName();
895 +           if (loc.gene_names.Count()>0) { //print all gene names associated to this locus
896 +              fprintf(f_out, ";genes=%s",loc.gene_names.First()->name.chars());
897 +              for (int i=1;i<loc.gene_names.Count();i++) {
898 +                fprintf(f_out, ",%s",loc.gene_names[i]->name.chars());
899 +                }
900 +              }
901 +           if (loc.gene_ids.Count()>0) { //print all GeneIDs names associated to this locus
902 +              fprintf(f_out, ";geneIDs=%s",loc.gene_ids.First()->name.chars());
903 +              for (int i=1;i<loc.gene_ids.Count();i++) {
904 +                fprintf(f_out, ",%s",loc.gene_ids[i]->name.chars());
905 +                }
906 +              }
907 +           fprintf(f_out, ";transcripts=%s",loc.rnas[idxfirstvalid]->getID());
908 +           for (int i=idxfirstvalid+1;i<loc.rnas.Count();i++) {
909 +              fprintf(f_out, ",%s",loc.rnas[i]->getID());
910 +              }
911 +           fprintf(f_out, "\n");
912 +           }
913 +         //now print all valid, non-replaced transcripts in this locus:
914 +         for (int i=0;i<loc.rnas.Count();i++) {
915 +           GffObj& t=*(loc.rnas[i]);
916 +           GTData* tdata=(GTData*)(t.uptr);
917 +           if (tdata->replaced_by!=NULL || ((t.udata & 4)==0)) continue;
918 +           t.addAttr("locus", locname.chars());
919 +           out_counter++;
920 +           if (fmtGTF) t.printGtf(f_out, tracklabel);
921 +               else {
922 +                //print the parent first, if any
923 +                if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
924 +                    GTData* pdata=(GTData*)(t.parent->uptr);
925 +                    if (pdata->geneinfo!=NULL)
926 +                         pdata->geneinfo->finalize();
927 +                    t.parent->addAttr("locus", locname.chars());
928 +                    t.parent->printGff(f_out, tracklabel);
929 +                    t.parent->udata|=4;
930 +                    }
931 +                t.printGff(f_out, tracklabel);
932 +                }
933 +            }
934 +          } //have valid transcripts to print
935 +       }//for each locus
936 +     if (f_out && !mRNAOnly) {
937 +       //final pass through the non-transcripts, in case any of them were not printed
938 +       //TODO: order broken, these should be interspersed among the rnas in the correct order!
939 +       for (int m=0;m<gdata->gfs.Count();m++) {
940 +         GffObj& t=*(gdata->gfs[m]);
941 +         if ((t.udata&4)==0) { //never printed
942 +           t.udata|=4;
943 +           if (fmtGTF) t.printGtf(f_out, tracklabel);
944 +              else t.printGff(f_out, tracklabel);
945 +           }
946 +         } //for each non-transcript
947         }
948 <     }
949 <   if (multiExon && mrna->exons.Count()==1) {
950 <       delete mrna;
951 <       gfreader->gflst.Forget(m);
952 <       continue;
953 <       }
954 <   if (wCDSonly && mrna->CDstart==0) {
955 <       delete mrna;
956 <       gfreader->gflst.Forget(m);
957 <       continue;
958 <       }
959 <   /* if (validCDSonly && mrna->hasErrors) {
960 <       delete mrna;
961 <       gfreader->gflst.Forget(m);
962 <       continue;
948 >     } //for each genomic sequence
949 >   }
950 >  else {
951 >   //not grouped into loci, print the rnas with their parents, if any
952 >   int numvalid=0;
953 >   for (int g=0;g<g_data.Count();g++) {
954 >     GenomicSeqData* gdata=g_data[g];
955 >     for (int m=0;m<gdata->rnas.Count();m++) {
956 >        GffObj& t=*(gdata->rnas[m]);
957 >        GTData* tdata=(GTData*)(t.uptr);
958 >        if (tdata->replaced_by!=NULL) continue;
959 >        if (process_transcript(gfasta, t)) {
960 >           t.udata|=4; //tag it as valid
961 >           numvalid++;
962 >           if (f_out) {
963 >             if (tdata->geneinfo) tdata->geneinfo->finalize();
964 >             out_counter++;
965 >             if (fmtGTF) t.printGtf(f_out, tracklabel);
966 >               else {
967 >                if (out_counter==1)
968 >                  printGff3Header(f_out, args);
969 >                //print the parent first, if any
970 >                if (t.parent!=NULL && ((t.parent->udata & 4)==0)) {
971 >                    GTData* pdata=(GTData*)(t.parent->uptr);
972 >                    if (pdata->geneinfo!=NULL)
973 >                         pdata->geneinfo->finalize();
974 >                    t.parent->printGff(f_out, tracklabel);
975 >                    t.parent->udata|=4;
976 >                    }
977 >                t.printGff(f_out, tracklabel);
978 >                }
979 >             }//GFF/GTF output requested
980 >           } //valid transcript
981 >        } //for each rna
982 >     if (f_out && !mRNAOnly) {
983 >       //final pass through the non-transcripts, in case any of them were not printed
984 >       //TODO: order broken, these should be interspersed among the rnas in the correct order!
985 >       for (int m=0;m<gdata->gfs.Count();m++) {
986 >         GffObj& t=*(gdata->gfs[m]);
987 >         if ((t.udata&4)==0) { //never printed
988 >           t.udata|=4;
989 >           if (fmtGTF) t.printGtf(f_out, tracklabel);
990 >              else t.printGff(f_out, tracklabel);
991 >           }
992 >         } //for each non-transcript
993         }
994 <   */
771 <   process_mRNA(*mrna);
772 <   delete mrna;
773 <   gfreader->gflst.Forget(m);
994 >     } //for each genomic seq
995     }
996 < // M_END:
776 < delete gfreader;
996 > if (f_repl && f_repl!=stdout) fclose(f_repl);
997   seqinfo.Clear();
998 < if (faseq!=NULL) delete faseq;
999 < if (gcdb!=NULL) delete gcdb;
998 > //if (faseq!=NULL) delete faseq;
999 > //if (gcdb!=NULL) delete gcdb;
1000   GFREE(rfltGSeq);
1001   FRCLOSE(f_in);
1002   FWCLOSE(f_out);

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines