ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gffread/gff_utils.h
Revision: 26
Committed: Wed Jul 27 19:39:27 2011 UTC (8 years, 1 month ago) by gpertea
File size: 21622 byte(s)
Log Message:
added missing gff_utils source

Line User Rev File contents
1 gpertea 26 #ifndef GFF_UTILS_H
2     #define GFF_UTILS_H
3     #include "gff.h"
4     #include "GStr.h"
5     #include "GFastaIndex.h"
6     #include "GFaSeqGet.h"
7    
8    
9     typedef bool GFValidateFunc(GffObj* gf, GList<GffObj>* gfadd);
10    
11     class GeneInfo { //for Ensembl GTF conversion
12     public:
13     int flag;
14     GffObj* gf;
15     GList<GStr> gene_names;
16     GList<GStr> transcripts; //list of transcript IDs
17     GeneInfo():gene_names(true, true, true), transcripts(true,true,true) {
18     gf=NULL;
19     flag=0;
20     }
21     GeneInfo(GffObj* gfrec, bool ensembl_convert=false):gene_names(true, true, true),
22     transcripts(true,true,true) {
23     flag=0;
24     if (gfrec->getGeneName())
25     gene_names.Add(new GStr(gfrec->getGeneName()));
26     transcripts.Add(new GStr(gfrec->getID()));
27     create_gf(gfrec, ensembl_convert);
28     }
29    
30     void create_gf(GffObj* gfrec, bool ensembl_convert) {
31     gf=new GffObj(gfrec->getGeneID());
32     gf->gseq_id=gfrec->gseq_id;
33     gf->track_id=gfrec->track_id;
34     gf->start=gfrec->start;
35     gf->end=gfrec->end;
36     gf->strand=gfrec->strand;
37     gf->setFeatureName("gene");
38     gf->isGene(true);
39     gf->isUsed(true);
40     gf->uptr=this;
41     gfrec->incLevel();
42     gfrec->parent=gf;
43     gf->children.Add(gfrec);
44     if (ensembl_convert) {
45     gf->addAttr("type", gf->getTrackName());
46     }
47     //gf->children.Add(gfrec);
48     }
49     //~GeneInfo() {
50     // }
51     void update(GffObj* gfrec) {
52     if (transcripts.AddedIfNew(new GStr(gfrec->getID()))<0)
53     return;
54     gene_names.AddedIfNew(new GStr(gfrec->getGeneName()));
55     if (gf==NULL) {
56     GError("GeneInfo::update() called on uninitialized gf!\n");
57     //create_gf(gfrec);
58     //return;
59     }
60     gfrec->parent=gf;
61     gf->children.Add(gfrec);
62     gfrec->incLevel();
63     if (gf->start>gfrec->start)
64     gf->start=gfrec->start;
65     if (gf->end<gfrec->end)
66     gf->end=gfrec->end;
67     }
68     void finalize() {
69     //prepare attributes for printing
70     //must be called right before printing
71     if (gf==NULL || transcripts.Count()==0) return;
72     if (gene_names.Count()>0) {
73     gf->addAttr("Name", gene_names[0]->chars());
74     /*
75     GStr s(gene_names[0]->chars());
76     for (int i=1;i<gene_names.Count();i++) {
77     s.append(",");
78     s.append(gene_names[i]->chars());
79     }
80     gf->addAttr("genes", s.chars());
81     */
82     } //has gene names
83     GStr t(transcripts[0]->chars());
84     for (int i=1;i<transcripts.Count();i++) {
85     t.append(",");
86     t.append(transcripts[i]->chars());
87     }
88     gf->addAttr("transcripts", t.chars());
89     }
90     };
91    
92     //genomic fasta sequence handling
93     class GFastaDb {
94     public:
95     char* fastaPath;
96     GFastaIndex* faIdx; //could be a cdb .cidx file
97     int last_fetchid;
98     GFaSeqGet* faseq;
99     //GCdbYank* gcdb;
100     char* getFastaFile(int gseq_id) {
101     if (fastaPath==NULL) return NULL;
102     GStr s(fastaPath);
103     s.trimR('/');
104     s.appendfmt("/%s",GffObj::names->gseqs.getName(gseq_id));
105     GStr sbase(s);
106     if (!fileExists(s.chars())) s.append(".fa");
107     if (!fileExists(s.chars())) s.append("sta");
108     if (fileExists(s.chars())) return Gstrdup(s.chars());
109     else {
110     GMessage("Warning: cannot find genomic sequence file %s{.fa,.fasta}\n",sbase.chars());
111     return NULL;
112     }
113     }
114    
115     GFastaDb(const char* fpath=NULL) {
116     //gcdb=NULL;
117     fastaPath=NULL;
118     faseq=NULL;
119     faIdx=NULL;
120     init(fpath);
121     }
122    
123     void init(const char* fpath) {
124     if (fpath==NULL || fpath[0]==0) return;
125     last_fetchid=-1;
126     if (!fileExists(fpath))
127     GError("Error: file/directory %s does not exist!\n",fpath);
128     fastaPath=Gstrdup(fpath);
129     GStr gseqpath(fpath);
130     if (fileExists(fastaPath)>1) { //exists and it's not a directory
131     GStr fainame(fastaPath);
132     if (fainame.rindex(".fai")==fainame.length()-4) {
133     //.fai index file given directly
134     fastaPath[fainame.length()-4]=0;
135     if (!fileExists(fastaPath))
136     GError("Error: cannot find fasta file for index %s !\n", fastaPath);
137     }
138     else fainame.append(".fai");
139     //GMessage("creating GFastaIndex with fastaPath=%s, fainame=%s\n", fastaPath, fainame.chars());
140     faIdx=new GFastaIndex(fastaPath,fainame.chars());
141     GStr fainamecwd(fainame);
142     int ip=-1;
143     if ((ip=fainamecwd.rindex(CHPATHSEP))>=0)
144     fainamecwd.cut(0,ip+1);
145     if (!faIdx->hasIndex()) { //could not load index
146     //try current directory
147     if (fainame!=fainamecwd) {
148     if (fileExists(fainamecwd.chars())>1) {
149     faIdx->loadIndex(fainamecwd.chars());
150     }
151     }
152     } //tried to load index
153     if (!faIdx->hasIndex()) {
154     GMessage("No fasta index found for %s. Rebuilding, please wait..\n",fastaPath);
155     faIdx->buildIndex();
156     if (faIdx->getCount()==0) GError("Error: no fasta records found!\n");
157     GMessage("Fasta index rebuilt.\n");
158     FILE* fcreate=fopen(fainame.chars(), "w");
159     if (fcreate==NULL) {
160     GMessage("Warning: cannot create fasta index %s! (permissions?)\n", fainame.chars());
161     if (fainame!=fainamecwd) fcreate=fopen(fainamecwd.chars(), "w");
162     if (fcreate==NULL)
163     GError("Error: cannot create fasta index %s!\n", fainamecwd.chars());
164     }
165     if (faIdx->storeIndex(fcreate)<faIdx->getCount())
166     GMessage("Warning: error writing the index file!\n");
167     } //index created and attempted to store it
168     } //multi-fasta
169     }
170     GFaSeqGet* fetch(int gseq_id, bool checkFasta=false) {
171     if (fastaPath==NULL) return NULL;
172     if (gseq_id==last_fetchid && faseq!=NULL) return faseq;
173     delete faseq;
174     faseq=NULL;
175     last_fetchid=-1;
176     char* gseqname=GffObj::names->gseqs.getName(gseq_id);
177     // DEBUG:
178     //GMessage("..processing transcripts on: %s\n",gseqname);
179     //genomic sequence given
180     /*
181     if (gcdb!=NULL) {
182     uint32 reclen=0;
183     off_t rpos=gcdb->getRecordPos(gseqname, &reclen);
184     if (rpos<0) // genomic sequence not found
185     GError("Error: cannot find genomic sequence '%s' in %s\n",gseqname, fastaPath);
186     // WARNING: does not validate FASTA line-len uniformity!
187     faseq=new GFaSeqGet(fastaPath,rpos, false);
188     faseq->loadall(reclen); //load the whole sequence, it's faster
189     last_fetchid=gseq_id;
190     return faseq;
191     }
192     */
193     if (faIdx!=NULL) { //fastaPath was the multi-fasta file name
194     GFastaRec* farec=faIdx->getRecord(gseqname);
195     if (farec!=NULL) {
196     faseq=new GFaSeqGet(fastaPath,farec->seqlen, farec->fpos,
197     farec->line_len, farec->line_blen);
198     faseq->loadall(); //just cache the whole sequence, it's faster
199     last_fetchid=gseq_id;
200     }
201     else {
202     GMessage("Warning: couldn't find fasta record for '%s'!\n",gseqname);
203     return NULL;
204     }
205     }
206     else {
207     char* sfile=getFastaFile(gseq_id);
208     if (sfile!=NULL) {
209     faseq=new GFaSeqGet(sfile,checkFasta);
210     faseq->loadall();
211     last_fetchid=gseq_id;
212     GFREE(sfile);
213     }
214     } //one fasta file per contig
215     return faseq;
216     }
217    
218     ~GFastaDb() {
219     GFREE(fastaPath);
220     //delete gcdb;
221     delete faIdx;
222     delete faseq;
223     }
224     };
225    
226     class GffLocus;
227    
228     class GTData { //transcript associated data
229     public:
230     GffObj* rna;
231     GffLocus* locus;
232     GffObj* replaced_by;
233     GeneInfo* geneinfo;
234     int flag;
235     GTData(GffObj* t=NULL) {
236     rna=t;
237     flag=0;
238     locus=NULL;
239     replaced_by=NULL;
240     geneinfo=NULL;
241     if (rna!=NULL) {
242     geneinfo=(GeneInfo*)rna->uptr; //take over geneinfo, if there
243     rna->uptr=this;
244     }
245     }
246     bool operator>(GTData& b) { return (rna > b.rna); }
247     bool operator<(GTData& b) { return (rna < b.rna); }
248     bool operator==(GTData& b) { return (rna==b.rna); }
249     };
250    
251     class CGeneSym {
252     public:
253     GStr name;
254     int freq;
255     CGeneSym(const char* n=NULL, int f=0):name(n) {
256     freq=f;
257     }
258     bool operator>(CGeneSym& b) {
259     return (freq==b.freq) ? ( (name.length()==b.name.length()) ? (name>b.name) :
260     (name.length()>b.name.length()) ) : ( freq<b.freq );
261     }
262     bool operator<(CGeneSym& b) {
263     return (freq==b.freq)? ( (name.length()==b.name.length()) ? (name<b.name) :
264     (name.length()<b.name.length()) ) : ( freq>b.freq );
265     }
266     bool operator==(CGeneSym& b) { return name==b.name; }
267     };
268    
269     const char* getGeneDescr(const char* gsym);
270    
271     class GffLocus:public GSeg {
272     public:
273     int gseq_id; //id of underlying genomic sequence
274     int locus_num;
275     bool is_mrna;
276     char strand;
277     GffObj* t_maxcov; //transcript with maximum coverage (for main "ref" transcript)
278     GList<GffObj> rnas; //list of transcripts (isoforms) for this locus
279     GArray<GSeg> mexons; //list of merged exons in this region
280     GList<CGeneSym> gene_names;
281     GList<CGeneSym> gene_ids;
282     int v; //user flag/data
283     bool operator==(GffLocus& d){
284     return (gseq_id==d.gseq_id && strand==d.strand && start==d.start && end==d.end);
285     }
286     bool operator>(GffLocus& d){
287     if (gseq_id!=d.gseq_id) return (gseq_id>d.gseq_id);
288     if (start==d.start) {
289     if (end==d.end) return (strand>d.strand);
290     else return (end>d.end);
291     } else return (start>d.start);
292     }
293     bool operator<(GffLocus& d){
294     if (gseq_id!=d.gseq_id) return (gseq_id<d.gseq_id);
295     if (start==d.start) {
296     if (end==d.end) return strand<d.strand;
297     else return end<d.end;
298     } else return (start<d.start);
299     }
300    
301     const char* getGeneName() {
302     if (gene_names.Count()==0) return NULL;
303     return gene_names.First()->name.chars();
304     }
305     const char* get_tmax_id() {
306     return t_maxcov->getID();
307     }
308     const char* get_descr() {
309     if (gene_names.Count()>0) {
310     for (int i=0;i<gene_names.Count();i++) {
311     const char* gn=getGeneDescr(gene_names.First()->name.chars());
312     if (gn!=NULL) return gn;
313     }
314     }
315     char* s=t_maxcov->getAttr("product");
316     if (s!=NULL) return s;
317     s=t_maxcov->getAttr("descr");
318     if (s!=NULL) return s;
319     s=t_maxcov->getAttr("description");
320     if (s!=NULL) return s;
321     s=t_maxcov->getAttr("info");
322     if (s!=NULL) return s;
323     return NULL;
324     }
325    
326     GffLocus(GffObj* t=NULL):rnas(true,false,false),mexons(true,true),
327     gene_names(true,true,false), gene_ids(true,true,false) {
328     //this will NOT free rnas!
329     t_maxcov=NULL;
330     gseq_id=-1;
331     v=0;
332     locus_num=0;
333     start=0;
334     end=0;
335     strand=0;
336     is_mrna=false;
337     if (t!=NULL) {
338     start=t->exons.First()->start;
339     end=t->exons.Last()->end;;
340     gseq_id=t->gseq_id;
341     GSeg seg;
342     for (int i=0;i<t->exons.Count();i++) {
343     seg.start=t->exons[i]->start;
344     seg.end=t->exons[i]->end;
345     mexons.Add(seg);
346     }
347     rnas.Add(t);
348     ((GTData*)(t->uptr))->locus=this;
349     t_maxcov=t;
350     strand=t->strand;
351     if (t->ftype_id==gff_fid_mRNA) {
352     is_mrna=true;
353     }
354     }
355     }
356    
357     void addMerge(GffLocus& locus, GffObj* lnkrna) {
358     //add all the elements of the other locus (merging)
359     //-- merge mexons
360     GArray<int> ovlexons(true,true); //list of locus.mexons indexes overlapping existing mexons
361     int i=0; //index of first mexons with a merge
362     int j=0; //index current mrna exon
363     while (i<mexons.Count() && j<locus.mexons.Count()) {
364     uint istart=mexons[i].start;
365     uint iend=mexons[i].end;
366     uint jstart=locus.mexons[j].start;
367     uint jend=locus.mexons[j].end;
368     if (iend<jstart) { i++; continue; }
369     if (jend<istart) { j++; continue; }
370     //if (mexons[i].overlap(jstart, jend)) {
371     //exon overlap was found :
372     ovlexons.Add(j);
373     //extend mexons[i] as needed
374     if (jstart<istart) mexons[i].start=jstart;
375     if (jend>iend) { //mexons[i] end extend
376     mexons[i].end=jend;
377     //now this could overlap the next mexon(s), so we have to merge them all
378     while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
379     uint nextend=mexons[i+1].end;
380     mexons.Delete(i+1);
381     if (nextend>mexons[i].end) {
382     mexons[i].end=nextend;
383     break; //no need to check next mexons
384     }
385     } //while next mexons merge
386     } // mexons[i] end extend
387     // } //exon overlap
388     j++; //check the next locus.mexon
389     }
390     //-- add the rest of the non-overlapping mexons:
391     GSeg seg;
392     for (int i=0;i<locus.mexons.Count();i++) {
393     seg.start=locus.mexons[i].start;
394     seg.end=locus.mexons[i].end;
395     if (!ovlexons.Exists(i)) mexons.Add(seg);
396     }
397     // -- add locus.rnas
398     for (int i=0;i<locus.rnas.Count();i++) {
399     ((GTData*)(locus.rnas[i]->uptr))->locus=this;
400     if (locus.rnas[i]!=lnkrna) rnas.Add(locus.rnas[i]);
401     }
402     // -- adjust start/end as needed
403     if (start>locus.start) start=locus.start;
404     if (end<locus.end) end=locus.end;
405     if (locus.is_mrna) is_mrna=true;
406     if (t_maxcov->covlen<locus.t_maxcov->covlen)
407     t_maxcov=locus.t_maxcov;
408     }
409    
410    
411     bool exonOverlap(GffLocus& loc) {
412     //check if any mexons overlap!
413     if (strand!=loc.strand || loc.start>end || start>loc.end) return false;
414     int i=0;
415     int j=0;
416     while (i<mexons.Count() && j<loc.mexons.Count()) {
417     uint istart=mexons[i].start;
418     uint iend=mexons[i].end;
419     uint jstart=loc.mexons[j].start;
420     uint jend=loc.mexons[j].end;
421     if (iend<jstart) { i++; continue; }
422     if (jend<istart) { j++; continue; }
423     //exon overlap found if we're here:
424     return true;
425     }
426     return false;
427     }
428    
429     bool add_RNA(GffObj* t) {
430     //if (rnas.Count()==0) return true; //? should never be called on an empty locus
431     if (t->gseq_id!=gseq_id || t->strand!=strand || t->start>end || start>t->end)
432     return false; //rna must be on the same genomic seq
433     //check for exon overlap with existing mexons
434     //also update mexons accordingly if t is to be added
435     bool hasovl=false;
436     int i=0; //index of first mexons with a merge
437     int j=0; //index current t exon
438     GArray<int> ovlexons(true,true); //list of mrna exon indexes overlapping mexons
439     while (i<mexons.Count() && j<t->exons.Count()) {
440     uint istart=mexons[i].start;
441     uint iend=mexons[i].end;
442     uint jstart=t->exons[j]->start;
443     uint jend=t->exons[j]->end;
444     if (iend<jstart) { i++; continue; }
445     if (jend<istart) { j++; continue; }
446     //exon overlap found if we're here:
447     ovlexons.Add(j);
448     hasovl=true;
449     //extend mexons[i] as needed
450     if (jstart<istart) mexons[i].start=jstart;
451     if (jend>iend) { //mexon stretch up
452     mexons[i].end=jend;
453     //now this could overlap the next mexon(s), so we have to merge them all
454     while (i<mexons.Count()-1 && mexons[i].end>mexons[i+1].start) {
455     uint nextend=mexons[i+1].end;
456     mexons.Delete(i+1);
457     if (nextend>mexons[i].end) {
458     mexons[i].end=nextend;
459     break; //no need to check next mexons
460     }
461     } //while next mexons merge
462     } //possible mexons merge
463    
464     j++; //check the next t exon
465     }//all vs all exon check loop
466     if (hasovl) {
467     GSeg seg;
468     //add the rest of the non-overlapping exons
469     for (int i=0;i<t->exons.Count();i++) {
470     seg.start=t->exons[i]->start;
471     seg.end=t->exons[i]->end;
472     if (!ovlexons.Exists(i)) mexons.Add(seg);
473     }
474     rnas_add(t);
475     // add to rnas
476     ((GTData*)t->uptr)->locus=this;
477     gseq_id=t->gseq_id;
478     }
479     return hasovl;
480     }
481    
482     //simpler,basic adding of a mrna
483     void rnas_add(GffObj* t) {
484     rnas.Add(t);
485     // adjust start/end
486     if (start>t->start || start==0) start=t->start;
487     if (end<t->end) end=t->end;
488     if (t_maxcov->covlen<t->covlen) t_maxcov=t;
489     if (strand==0) strand=t->strand;
490     if (t->ftype_id==gff_fid_mRNA) is_mrna=true;
491     }
492     };
493    
494     class GenomicSeqData {
495     int gseq_id;
496     public:
497     const char* gseq_name;
498     GList<GffObj> gfs; //all non-transcript features -> usually gene features
499     GList<GffObj> rnas; //all transcripts on this genomic sequence
500     GList<GffLocus> loci; //all loci clusters
501     GList<GTData> tdata; //transcript data (uptr holder for all rnas loaded here)
502     //GenomicSeqData(int gid=-1):rnas(true,true,false),loci(true,true,true),
503     GenomicSeqData(int gid=-1):gfs(true, true, false),rnas((GCompareProc*)gfo_cmpByLoc),loci(true,true,false),
504     tdata(false,true,false) {
505     gseq_id=gid;
506     if (gseq_id>=0)
507     gseq_name=GffObj::names->gseqs.getName(gseq_id);
508    
509     }
510     bool operator==(GenomicSeqData& d){
511     return gseq_id==d.gseq_id;
512     }
513     bool operator>(GenomicSeqData& d){
514     return (gseq_id>d.gseq_id);
515     }
516     bool operator<(GenomicSeqData& d){
517     return (gseq_id<d.gseq_id);
518     }
519     };
520    
521     int gseqCmpName(const pointer p1, const pointer p2);
522    
523     class GSpliceSite {
524     public:
525     char nt[3];
526     GSpliceSite(const char* c, bool revc=false) {
527     nt[2]=0;
528     if (c==NULL) {
529     nt[0]=0;
530     nt[1]=0;
531     return;
532     }
533     if (revc) {
534     nt[0]=toupper(ntComplement(c[1]));
535     nt[1]=toupper(ntComplement(c[0]));
536     }
537     else {
538     nt[0]=toupper(c[0]);
539     nt[1]=toupper(c[1]);
540     }
541     }
542    
543     GSpliceSite(const char* intron, int intronlen, bool getAcceptor, bool revc=false) {
544     nt[2]=0;
545     if (intron==NULL || intronlen==0)
546     GError("Error: invalid intron or intron len for GSpliceSite()!\n");
547     const char* c=intron;
548     if (revc) {
549     if (!getAcceptor) c+=intronlen-2;
550     nt[0]=toupper(ntComplement(c[1]));
551     nt[1]=toupper(ntComplement(c[0]));
552     }
553     else { //on forward strand
554     if (getAcceptor) c+=intronlen-2;
555     nt[0]=toupper(c[0]);
556     nt[1]=toupper(c[1]);
557     }//forward strand
558     }
559    
560     GSpliceSite(const char n1, const char n2) {
561     nt[2]=0;
562     nt[0]=toupper(n1);
563     nt[1]=toupper(n2);
564     }
565     bool canonicalDonor() {
566     return (nt[0]=='G' && (nt[1]=='C' || nt[1]=='T'));
567     }
568     bool operator==(GSpliceSite& c) {
569     return (c.nt[0]==nt[0] && c.nt[1]==nt[1]);
570     }
571     bool operator==(GSpliceSite* c) {
572     return (c->nt[0]==nt[0] && c->nt[1]==nt[1]);
573     }
574     bool operator==(const char* c) {
575     //return (nt[0]==toupper(c[0]) && nt[1]==toupper(c[1]));
576     //assumes given const nucleotides are uppercase already!
577     return (nt[0]==c[0] && nt[1]==c[1]);
578     }
579     bool operator!=(const char* c) {
580     //assumes given const nucleotides are uppercase already!
581     return (nt[0]!=c[0] || nt[1]!=c[1]);
582     }
583     };
584    
585     struct GffLoader {
586     GStr fname;
587     FILE* f;
588     bool transcriptsOnly;
589     bool fullAttributes;
590     bool noExonAttrs;
591     bool mergeCloseExons;
592     bool showWarnings;
593     void load(GList<GenomicSeqData>&seqdata, GFValidateFunc* gf_validate=NULL,
594     bool doCluster=true, bool doCollapseRedundant=true, bool matchAllIntrons=true, bool fuzzSpan=false);
595     GffLoader(const char* filename):fname(filename) {
596     f=NULL;
597     transcriptsOnly=true;
598     fullAttributes=false;
599     noExonAttrs=false;
600     mergeCloseExons=false;
601     showWarnings=false;
602     if (fname=="-") {
603     f=stdin;
604     fname="stdin";
605     }
606     else {
607     if ((f=fopen(fname.chars(), "r"))==NULL) {
608     GError("Error: cannot open file %s!\n",fname.chars());
609     }
610     }
611     }
612     ~GffLoader() {
613     if (f!=NULL && f!=stdin) fclose(f);
614     }
615     };
616    
617     void printFasta(FILE* f, GStr& defline, char* seq, int seqlen=-1);
618    
619     //"position" a given coordinate x within a list of transcripts sorted by their start (lowest)
620     //coordinate, using quick-search; the returned int is the list index of the closest *higher*
621     //GffObj - i.e. starting right *ABOVE* the given coordinate
622     //Convention: returns -1 if there is no such GffObj (i.e. last GffObj starts below x)
623     int qsearch_rnas(uint x, GList<GffObj>& rnas);
624     int qsearch_gloci(uint x, GList<GffLocus>& loci);
625    
626     GffObj* redundantTranscripts(GffObj& ti, GffObj& tj, bool matchAllIntrons=true, bool fuzzSpan=false);
627    
628     void placeGf(GffObj* t, GenomicSeqData* gdata, bool doCluster=true, bool collapseRedundant=true,
629     bool matchAllIntrons=true, bool fuzzSpan=false);
630     //void loadGFF(FILE* f, GList<GenomicSeqData>& seqdata, const char* fname);
631    
632     void collectLocusData(GList<GenomicSeqData>& ref_data);
633    
634     #endif