ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GFastaIndex.cpp
Revision: 18
Committed: Mon Jul 18 21:00:21 2011 UTC (8 years ago) by gpertea
File size: 5477 byte(s)
Log Message:
added a few more source files, test code

Line User Rev File contents
1 gpertea 18 /*
2     * GFaIdx.cpp
3     *
4     * Created on: Aug 25, 2010
5     * Author: gpertea
6     */
7    
8     #include "GFastaIndex.h"
9     #define ERR_FAIDXLINE "Error parsing fasta index line: \n%s\n"
10     #define ERR_FALINELEN "Error: sequence lines in a FASTA record must have the same length!\n"
11     void GFastaIndex::addRecord(const char* seqname, uint seqlen, off_t foffs, int llen, int llen_full) {
12     GFastaRec* farec=records.Find(seqname);
13     if (farec!=NULL) {
14     GMessage("Warning: duplicate sequence ID (%s) added to the fasta index! Only last entry data will be kept.\n");
15     farec->seqlen=seqlen;
16     farec->fpos=foffs;
17     farec->line_len=llen;
18     farec->line_blen=llen_full;
19     }
20     else {
21     farec=new GFastaRec(seqlen,foffs,llen,llen_full);
22     records.Add(seqname,farec);
23     farec->seqname=records.getLastKey();
24     }
25     }
26    
27     int GFastaIndex::loadIndex(const char* finame) { //load record info from existing fasta index
28     if (finame==NULL) finame=fai_name;
29     if (finame!=fai_name) {
30     fai_name=Gstrdup(finame);
31     }
32     if (fai_name==NULL) GError("Error: GFastaIndex::loadIndex() called with no file name!\n");
33     records.Clear();
34     haveFai=false;
35     FILE* fi=fopen(fai_name,"rb");
36     if (fi==NULL) {
37     GMessage("Warning: cannot open fasta index file: %s!\n",fai_name);
38     return 0;
39     }
40     GLineReader fl(fi);
41     char* s=NULL;
42     while ((s=fl.nextLine())!=NULL) {
43     if (*s=='#') continue;
44     char* p=strchrs(s,"\t ");
45     if (p==NULL) GError(ERR_FAIDXLINE,s);
46     *p=0; //s now holds the genomic sequence name
47     p++;
48     uint len=0;
49     int line_len=0, line_blen=0;
50     #ifdef __WIN32__
51     long offset=-1;
52     sscanf(p, "%d%ld%d%d", &len, &offset, &line_len, &line_blen);
53     #else
54     long long offset=-1;
55     sscanf(p, "%d%lld%d%d", &len, &offset, &line_len, &line_blen);
56     #endif
57     if (len==0 || line_len==0 || line_blen==0 || line_blen<line_len)
58     GError(ERR_FAIDXLINE,p);
59     addRecord(s,len,offset,line_len, line_blen);
60     }
61     fclose(fi);
62     haveFai=(records.Count()>0);
63     return records.Count();
64     }
65    
66     int GFastaIndex::buildIndex() {
67     //this parses the whole fasta file, so it could be slow
68     if (fa_name==NULL)
69     GError("Error: GFastaIndex::buildIndex() called with no fasta file!\n");
70     FILE* fa=fopen(fa_name,"rb");
71     if (fa==NULL) {
72     GMessage("Warning: cannot open fasta index file: %s!\n",fa_name);
73     return 0;
74     }
75     records.Clear();
76     GLineReader fl(fa);
77     char* s=NULL;
78     uint seqlen=0;
79     int line_len=0,line_blen=0;
80     bool newSeq=false; //set to true after defline
81     off_t newSeqOffset=0;
82     int prevOffset=0;
83     char* seqname=NULL;
84     int last_len=0;
85     bool mustbeLastLine=false; //true if the line length decreases
86     while ((s=fl.nextLine())!=NULL) {
87     if (s[0]=='>') {
88     if (seqname!=NULL) {
89     if (seqlen==0)
90     GError("Warning: empty FASTA record skipped (%s)!\n",seqname);
91     else { //seqlen!=0
92     addRecord(seqname, seqlen,newSeqOffset, line_len, line_blen);
93     }
94     }
95     char *p=s;
96     while (*p > 32) p++;
97     *p=0;
98     GFREE(seqname);
99     seqname=Gstrdup(&s[1]);
100     newSeq=true;
101     newSeqOffset=fl.getfpos();
102     last_len=0;
103     line_len=0;
104     line_blen=0;
105     seqlen=0;
106     mustbeLastLine=false;
107     } //defline parsing
108     else { //sequence line
109     int llen=fl.length();
110     int lblen=fl.getFpos()-prevOffset;
111     if (newSeq) { //first sequence line after defline
112     line_len=llen;
113     line_blen=lblen;
114     }
115     else {//next seq lines after first
116     if (mustbeLastLine || llen>last_len)
117     GError(ERR_FALINELEN);
118     if (llen<last_len) mustbeLastLine=true;
119     }
120     seqlen+=llen;
121     last_len=llen;
122     newSeq=false;
123     } //sequence line
124     prevOffset=fl.getfpos();
125     }//for each line of the fasta file
126     if (seqlen>0)
127     addRecord(seqname, seqlen, newSeqOffset, line_len, line_blen);
128     GFREE(seqname);
129     fclose(fa);
130     return records.Count();
131     }
132    
133    
134     int GFastaIndex::storeIndex(const char* finame) { //write the hash to a file
135     if (records.Count()==0)
136     GError("Error at GFastaIndex:storeIndex(): no records found!\n");
137     FILE* fai=fopen(finame, "w");
138     if (fai==NULL) GError("Error creating fasta index file: %s\n",finame);
139     int rcount=storeIndex(fai);
140     GFREE(fai_name);
141     fai_name=Gstrdup(finame);
142     return rcount;
143     }
144    
145     int GFastaIndex::storeIndex(FILE* fai) {
146     int rcount=0;
147     GList<GFastaRec> reclist(true,false,true); //sorted, don't free members, unique
148     records.startIterate();
149     GFastaRec* rec=NULL;
150     while ((rec=records.NextData())!=NULL) {
151     reclist.Add(rec);
152     }
153     //reclist has records sorted by file offset
154     for (int i=0;i<reclist.Count();i++) {
155     #ifdef __WIN32__
156     int written=fprintf(fai, "%s\t%d\t%ld\t%d\t%d\n",
157     reclist[i]->seqname,reclist[i]->seqlen,(long)reclist[i]->fpos,
158     reclist[i]->line_len, reclist[i]->line_blen);
159     #else
160     int written=fprintf(fai, "%s\t%d\t%lld\t%d\t%d\n",
161     reclist[i]->seqname, reclist[i]->seqlen, (long long)(reclist[i]->fpos),
162     reclist[i]->line_len, reclist[i]->line_blen);
163     #endif
164     if (written>0) rcount++;
165     else break; //couldn't write anymore
166     }
167     fclose(fai);
168     haveFai=(rcount>0);
169     return rcount;
170     }