ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GFastaIndex.cpp
Revision: 18
Committed: Mon Jul 18 21:00:21 2011 UTC (8 years, 4 months ago) by gpertea
File size: 5477 byte(s)
Log Message:
added a few more source files, test code

Line File contents
1 /*
2 * GFaIdx.cpp
3 *
4 * Created on: Aug 25, 2010
5 * Author: gpertea
6 */
7
8 #include "GFastaIndex.h"
9 #define ERR_FAIDXLINE "Error parsing fasta index line: \n%s\n"
10 #define ERR_FALINELEN "Error: sequence lines in a FASTA record must have the same length!\n"
11 void GFastaIndex::addRecord(const char* seqname, uint seqlen, off_t foffs, int llen, int llen_full) {
12 GFastaRec* farec=records.Find(seqname);
13 if (farec!=NULL) {
14 GMessage("Warning: duplicate sequence ID (%s) added to the fasta index! Only last entry data will be kept.\n");
15 farec->seqlen=seqlen;
16 farec->fpos=foffs;
17 farec->line_len=llen;
18 farec->line_blen=llen_full;
19 }
20 else {
21 farec=new GFastaRec(seqlen,foffs,llen,llen_full);
22 records.Add(seqname,farec);
23 farec->seqname=records.getLastKey();
24 }
25 }
26
27 int GFastaIndex::loadIndex(const char* finame) { //load record info from existing fasta index
28 if (finame==NULL) finame=fai_name;
29 if (finame!=fai_name) {
30 fai_name=Gstrdup(finame);
31 }
32 if (fai_name==NULL) GError("Error: GFastaIndex::loadIndex() called with no file name!\n");
33 records.Clear();
34 haveFai=false;
35 FILE* fi=fopen(fai_name,"rb");
36 if (fi==NULL) {
37 GMessage("Warning: cannot open fasta index file: %s!\n",fai_name);
38 return 0;
39 }
40 GLineReader fl(fi);
41 char* s=NULL;
42 while ((s=fl.nextLine())!=NULL) {
43 if (*s=='#') continue;
44 char* p=strchrs(s,"\t ");
45 if (p==NULL) GError(ERR_FAIDXLINE,s);
46 *p=0; //s now holds the genomic sequence name
47 p++;
48 uint len=0;
49 int line_len=0, line_blen=0;
50 #ifdef __WIN32__
51 long offset=-1;
52 sscanf(p, "%d%ld%d%d", &len, &offset, &line_len, &line_blen);
53 #else
54 long long offset=-1;
55 sscanf(p, "%d%lld%d%d", &len, &offset, &line_len, &line_blen);
56 #endif
57 if (len==0 || line_len==0 || line_blen==0 || line_blen<line_len)
58 GError(ERR_FAIDXLINE,p);
59 addRecord(s,len,offset,line_len, line_blen);
60 }
61 fclose(fi);
62 haveFai=(records.Count()>0);
63 return records.Count();
64 }
65
66 int GFastaIndex::buildIndex() {
67 //this parses the whole fasta file, so it could be slow
68 if (fa_name==NULL)
69 GError("Error: GFastaIndex::buildIndex() called with no fasta file!\n");
70 FILE* fa=fopen(fa_name,"rb");
71 if (fa==NULL) {
72 GMessage("Warning: cannot open fasta index file: %s!\n",fa_name);
73 return 0;
74 }
75 records.Clear();
76 GLineReader fl(fa);
77 char* s=NULL;
78 uint seqlen=0;
79 int line_len=0,line_blen=0;
80 bool newSeq=false; //set to true after defline
81 off_t newSeqOffset=0;
82 int prevOffset=0;
83 char* seqname=NULL;
84 int last_len=0;
85 bool mustbeLastLine=false; //true if the line length decreases
86 while ((s=fl.nextLine())!=NULL) {
87 if (s[0]=='>') {
88 if (seqname!=NULL) {
89 if (seqlen==0)
90 GError("Warning: empty FASTA record skipped (%s)!\n",seqname);
91 else { //seqlen!=0
92 addRecord(seqname, seqlen,newSeqOffset, line_len, line_blen);
93 }
94 }
95 char *p=s;
96 while (*p > 32) p++;
97 *p=0;
98 GFREE(seqname);
99 seqname=Gstrdup(&s[1]);
100 newSeq=true;
101 newSeqOffset=fl.getfpos();
102 last_len=0;
103 line_len=0;
104 line_blen=0;
105 seqlen=0;
106 mustbeLastLine=false;
107 } //defline parsing
108 else { //sequence line
109 int llen=fl.length();
110 int lblen=fl.getFpos()-prevOffset;
111 if (newSeq) { //first sequence line after defline
112 line_len=llen;
113 line_blen=lblen;
114 }
115 else {//next seq lines after first
116 if (mustbeLastLine || llen>last_len)
117 GError(ERR_FALINELEN);
118 if (llen<last_len) mustbeLastLine=true;
119 }
120 seqlen+=llen;
121 last_len=llen;
122 newSeq=false;
123 } //sequence line
124 prevOffset=fl.getfpos();
125 }//for each line of the fasta file
126 if (seqlen>0)
127 addRecord(seqname, seqlen, newSeqOffset, line_len, line_blen);
128 GFREE(seqname);
129 fclose(fa);
130 return records.Count();
131 }
132
133
134 int GFastaIndex::storeIndex(const char* finame) { //write the hash to a file
135 if (records.Count()==0)
136 GError("Error at GFastaIndex:storeIndex(): no records found!\n");
137 FILE* fai=fopen(finame, "w");
138 if (fai==NULL) GError("Error creating fasta index file: %s\n",finame);
139 int rcount=storeIndex(fai);
140 GFREE(fai_name);
141 fai_name=Gstrdup(finame);
142 return rcount;
143 }
144
145 int GFastaIndex::storeIndex(FILE* fai) {
146 int rcount=0;
147 GList<GFastaRec> reclist(true,false,true); //sorted, don't free members, unique
148 records.startIterate();
149 GFastaRec* rec=NULL;
150 while ((rec=records.NextData())!=NULL) {
151 reclist.Add(rec);
152 }
153 //reclist has records sorted by file offset
154 for (int i=0;i<reclist.Count();i++) {
155 #ifdef __WIN32__
156 int written=fprintf(fai, "%s\t%d\t%ld\t%d\t%d\n",
157 reclist[i]->seqname,reclist[i]->seqlen,(long)reclist[i]->fpos,
158 reclist[i]->line_len, reclist[i]->line_blen);
159 #else
160 int written=fprintf(fai, "%s\t%d\t%lld\t%d\t%d\n",
161 reclist[i]->seqname, reclist[i]->seqlen, (long long)(reclist[i]->fpos),
162 reclist[i]->line_len, reclist[i]->line_blen);
163 #endif
164 if (written>0) rcount++;
165 else break; //couldn't write anymore
166 }
167 fclose(fai);
168 haveFai=(rcount>0);
169 return rcount;
170 }