ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GBase.h
Revision: 171
Committed: Tue Feb 14 22:36:26 2012 UTC (7 years, 4 months ago) by gpertea
File size: 15391 byte(s)
Log Message:
wip fqtrim

Line File contents
1 #ifndef G_BASE_DEFINED
2 #define G_BASE_DEFINED
3 #ifndef _POSIX_SOURCE
4 //mostly for MinGW
5 #define _POSIX_SOURCE
6 #endif
7 #ifdef HAVE_CONFIG_H
8 #include "config.h"
9 #endif
10 #include <string.h>
11 #include <stdlib.h>
12 #include <stdio.h>
13 #include <math.h>
14 #include <limits.h>
15 #include <sys/types.h>
16 #include <sys/stat.h>
17 #include <stdint.h>
18
19 #if defined __WIN32__ || defined WIN32 || defined _WIN32 || defined _WIN32_
20 #ifndef __WIN32__
21 #define __WIN32__
22 #endif
23 #include <windows.h>
24 #include <io.h>
25 #define CHPATHSEP '\\'
26 #undef off_t
27 #define off_t int64_t
28 #ifdef _fseeki64
29 #define fseeko(stream, offset, origin) _fseeki64(stream, offset, origin)
30 #else
31 /*
32 #define _DEFINE_WIN32_FSEEKO
33 int fseeko(FILE *stream, off_t offset, int whence);
34 */
35 #define fseeko fseek
36 #endif
37 #ifdef _ftelli64
38 #define ftello(stream) _ftelli64(stream)
39 #else
40 /*
41 #define _DEFINE_WIN32_FTELLO
42 off_t ftello(FILE *stream);
43 */
44 #define ftello ftell
45 #endif
46 #else
47 #define CHPATHSEP '/'
48 #include <unistd.h>
49 #endif
50
51 #ifndef fseeko
52 #define fseeko fseek
53 #endif
54 #ifndef ftello
55 #define ftello ftell
56 #endif
57
58 #ifdef DEBUG
59 #undef NDEBUG
60 #endif
61
62 typedef int32_t int32;
63 typedef uint32_t uint32;
64 typedef int16_t int16;
65 typedef uint16_t uint16;
66
67 typedef unsigned char uchar;
68 typedef unsigned char byte;
69
70 #ifndef MAXUINT
71 #define MAXUINT ((unsigned int)-1)
72 #endif
73
74 #ifndef MAXINT
75 #define MAXINT INT_MAX
76 #endif
77
78 #ifndef MAX_UINT
79 #define MAX_UINT ((unsigned int)-1)
80 #endif
81
82 #ifndef MAX_INT
83 #define MAX_INT INT_MAX
84 #endif
85
86 typedef int64_t int64;
87 typedef uint64_t uint64;
88
89 /****************************************************************************/
90
91 #ifndef EXIT_FAILURE
92 #define EXIT_FAILURE 1
93 #endif
94
95 #ifndef EXIT_SUCCESS
96 #define EXIT_SUCCESS 0
97 #endif
98
99 /****************************************************************************/
100 #define ERR_ALLOC "Error allocating memory.\n"
101
102 //-------------------
103
104 // Debug helpers
// GASSERT(exp): when NDEBUG is not defined, evaluates exp and, if it is false,
//   calls GAssert() with the stringized expression, __FILE__ and __LINE__.
//   When NDEBUG is defined it expands to a no-op and exp is NOT evaluated,
//   so exp must not carry side effects the program relies on.
// GTRACE(exp): expands to a GMessage call only when NDEBUG is undefined and
//   TRACE is defined; otherwise it is a no-op. exp is pasted directly after
//   GMessage, so it must be a parenthesized argument list,
//   e.g. GTRACE(("value=%d\n", v));
// GERROR(exp): always expands to a GError call; like GTRACE, exp must be a
//   parenthesized argument list.
105 #ifndef NDEBUG
106 #define GASSERT(exp) ((exp)?((void)0):(void)GAssert(#exp,__FILE__,__LINE__))
107 #ifdef TRACE
108 #define GTRACE(exp) (GMessage exp)
109 #else
110 #define GTRACE(exp) ((void)0)
111 #endif
112 #else
113 #define GASSERT(exp) ((void)0)
114 #define GTRACE(exp) ((void)0)
115 #endif
116
117 #define GERROR(exp) (GError exp)
/********************************** Macros ***********************************/
// NOTE: all of these are plain function-like macros, so every argument may be
// evaluated more than once -- do not pass expressions with side effects.

// Absolute value
#define GABS(v) (((v)>=0)?(v):-(v))

// Larger / smaller of two values (the second argument wins on ties for GMAX,
// the first one for GMIN)
#define GMAX(p,q) (((p)>(q))?(p):(q))
#define GMIN(p,q) (((p)>(q))?(q):(p))

// Smallest of three values
#define GMIN3(p,q,r) ((p)<(q)?(((p)>(r))?(r):(p)):(((q)>(r))?(r):(q)))

// Largest of three values
#define GMAX3(p,q,r) ((p)>(q)?(((p)>(r))?(p):(r)):(((q)>(r))?(q):(r)))

// Store the smaller of u, v into lo and the larger into hi
#define GMINMAX(lo,hi,u,v) ((u)<(v)?((lo)=(u),(hi)=(v)):((lo)=(v),(hi)=(u)))

// Clamp value v to the range [lo..hi]
#define GCLAMP(lo,v,hi) ((v)<(lo)?(lo):((v)>(hi)?(hi):(v)))
137
138 typedef void* pointer;
139 typedef unsigned int uint;
140
141 typedef int GCompareProc(const pointer item1, const pointer item2);
142 typedef void GFreeProc(pointer item); //usually just delete,
143 //but may also support structures with embedded dynamic members
144
// Allocation helper macros.
// GMALLOC/GCALLOC/GREALLOC call the matching GMalloc/GCalloc/GRealloc routine
// on the address of ptr and report ERR_ALLOC through GError() when the routine
// returns false. GFREE passes the address of ptr to GFree().
//
// The checking macros are wrapped in do { ... } while (0) so that each one is
// a single statement: with a bare "if" expansion, code such as
//     if (cond) GMALLOC(p, n); else other();
// would silently attach the "else" to the macro's hidden "if".
// ptr is parenthesized so that any lvalue expression can be passed.
#define GMALLOC(ptr,size) do { \
    if (!GMalloc((pointer*)(&(ptr)),(size))) GError(ERR_ALLOC); \
  } while (0)
#define GCALLOC(ptr,size) do { \
    if (!GCalloc((pointer*)(&(ptr)),(size))) GError(ERR_ALLOC); \
  } while (0)
#define GREALLOC(ptr,size) do { \
    if (!GRealloc((pointer*)(&(ptr)),(size))) GError(ERR_ALLOC); \
  } while (0)
#define GFREE(ptr) GFree((pointer*)(&(ptr)))
152
// Returns whichever argument sorts first according to strcmp();
// arg2 is returned when the two strings compare equal.
inline char* strMin(char *arg1, char *arg2) {
  if (strcmp(arg1, arg2) < 0) {
    return arg1;
  }
  return arg2;
}
156
// Returns whichever argument sorts last according to strcmp();
// arg2 is returned when the two strings compare equal.
inline char* strMax(char *arg1, char *arg2) {
  if (strcmp(arg2, arg1) < 0) {
    return arg1;
  }
  return arg2;
}
160
// Rounds x to the nearest integer by taking floor(x + 0.5), so exact halves
// go towards +infinity (2.5 -> 3, -2.5 -> -2).
inline int iround(double x) {
  const double shifted = x + 0.5;
  return (int)floor(shifted);
}
164
165 /****************************************************************************/
166
// Three-way comparison of two ints.
// Returns a negative value if a<b, 0 if a==b, a positive value if a>b
// (specifically -1, 0 or 1).
// The previous "a-b" form overflowed (undefined behaviour, and in practice
// the wrong sign) when the operands were far apart, e.g. INT_MIN versus 1;
// comparing instead of subtracting cannot overflow.
inline int Gintcmp(int a, int b) {
  return (a > b) - (a < b);
}
171
172 int Gstrcmp(const char* a, const char* b, int n=-1);
173 //same as strcmp but doesn't crash on NULL pointers
174
175 int Gstricmp(const char* a, const char* b, int n=-1);
176
// Generic swap: exchanges the values of lhs and rhs through a temporary.
// T must be copy-constructible and assignable.
template<class T> void Gswap(T& lhs, T& rhs) {
  T held = lhs;
  lhs = rhs;
  rhs = held;
}
184
185 /// bitCount_32 - this function counts the number of set bits in a value.
186 /// Ex. CountPopulation(0xF000F000) = 8
187 /// Returns 0 if the word is zero.
188 inline uint bitCount_32(uint32_t Value) {
189 #if __GNUC__ >= 4
190 return __builtin_popcount(Value);
191 #else
192 uint32_t v = Value - ((Value >> 1) & 0x55555555);
193 v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
194 return ((v + (v >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
195 #endif
196 }
197
198 /// bitCount_64 - this function counts the number of set bits in a value,
199 /// (64 bit edition.)
200 inline uint bitCount_64(uint64_t Value) {
201 #if __GNUC__ >= 4
202 return __builtin_popcountll(Value);
203 #else
204 uint64_t v = Value - ((Value >> 1) & 0x5555555555555555ULL);
205 v = (v & 0x3333333333333333ULL) + ((v >> 2) & 0x3333333333333333ULL);
206 v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
207 return uint((uint64_t)(v * 0x0101010101010101ULL) >> 56);
208 #endif
209 }
210
/// bitCountTrailingZeros_32 - counts the zero bits from the least significant
/// bit up to (not including) the first set bit.
/// Ex. bitCountTrailingZeros_32(0xFF00FF00) == 8.
/// Returns 32 if the word is zero.
inline unsigned bitCountTrailingZeros_32(uint32_t Value) {
#if __GNUC__ >= 4
  // the builtin's result is undefined for 0, so that case is handled here
  if (Value == 0) {
    return 32;
  }
  return __builtin_ctz(Value);
#else
  // (-Value & Value) isolates the lowest set bit; the table maps that value
  // modulo 37 to its bit index (slot 0 is reached only when Value is zero).
  static const unsigned kBitIndexMod37[] = {
    32, 0, 1, 26, 2, 23, 27, 0, 3, 16, 24, 30, 28, 11, 0, 13,
    4, 7, 17, 0, 25, 22, 31, 15, 29, 10, 12, 6, 0, 21, 14, 9,
    5, 20, 8, 19, 18
  };
  const uint32_t lowestBit = -Value & Value;
  return kBitIndexMod37[lowestBit % 37];
#endif
}
227
/// bitCountTrailingZeros_64 - counts the zero bits from the least significant
/// bit up to (not including) the first set bit (64 bit edition.)
/// Returns 64 if the word is zero.
inline unsigned bitCountTrailingZeros_64(uint64_t Value) {
#if __GNUC__ >= 4
  // the builtin's result is undefined for 0, so that case is handled here
  if (Value == 0) {
    return 64;
  }
  return __builtin_ctzll(Value);
#else
  // (-Value & Value) isolates the lowest set bit; the table maps that value
  // modulo 67 to its bit index (slot 0 is reached only when Value is zero).
  static const unsigned kBitIndexMod67[] = {
    64, 0, 1, 39, 2, 15, 40, 23, 3, 12, 16, 59, 41, 19, 24, 54,
    4, 64, 13, 10, 17, 62, 60, 28, 42, 30, 20, 51, 25, 44, 55,
    47, 5, 32, 65, 38, 14, 22, 11, 58, 18, 53, 63, 9, 61, 27,
    29, 50, 43, 46, 31, 37, 21, 57, 52, 8, 26, 49, 45, 36, 56,
    7, 48, 35, 6, 34, 33, 0
  };
  const uint64_t lowestBit = -Value & Value;
  return kBitIndexMod67[lowestBit % 67];
#endif
}
246
247 /**************** Memory management ***************************/
248
249 bool GMalloc(pointer* ptr, unsigned long size); // Allocate memory
250 bool GCalloc(pointer* ptr, unsigned long size); // Allocate and initialize memory
251 bool GRealloc(pointer* ptr,unsigned long size); // Resize memory
252 void GFree(pointer* ptr); // Free memory, resets ptr to NULL
253
254
255 int saprintf(char **retp, const char *fmt, ...);
256
257 void GError(const char* format,...); // Error routine (aborts program)
258 void GMessage(const char* format,...);// Log message to stderr
259 // Assert failed routine:- usually not called directly but through GASSERT
260 void GAssert(const char* expression, const char* filename, unsigned int lineno);
261
262 // ****************** string manipulation *************************
263 char *Gstrdup(const char* str);
264 //duplicate a string by allocating a copy for it and returning it
265 char* Gstrdup(const char* sfrom, const char* sto);
266 //same as Gstrdup, but with an early termination (e.g. on delimiter)
267
268 char* Gsubstr(const char* str, char* from, char* to=NULL);
269 //extracts a substring, allocating it, including boundaries (from/to)
270
271 int strsplit(char* str, char** fields, int maxfields, const char* delim);
272 int strsplit(char* str, char** fields, int maxfields, const char delim);
273 int strsplit(char* str, char** fields, int maxfields); //splits by tab or space
274
275 char* replaceStr(char* &str, char* newvalue);
276
277 //conversion: to Lower/Upper case
278 // creating a new string:
279 char* upCase(const char* str);
280 char* loCase(const char* str);
281 // changing string in place:
282 char* strlower(char * str);
283 char* strupper(char * str);
284
285 //strstr but for memory zones: scans a memory region
286 //for a substring:
287 void* Gmemscan(void *mem, unsigned int len,
288 void *part, unsigned int partlen);
289
290 // test if a char is in a string:
291 bool chrInStr(char c, const char* str);
292
293 char* rstrchr(char* str, char ch);
294 /* returns a pointer to the rightmost
295 occurrence of ch in str - like rindex for platforms missing it*/
296
297 char* strchrs(const char* s, const char* chrs);
298 //strchr but with a set of chars instead of only one
299
300 char* rstrfind(const char* str, const char *substr);
301 // like rindex() but for strings; right side version of strstr()
302
303 char* reverseChars(char* str, int slen=0); //in place reversal of string
304
305 char* rstrstr(const char* rstart, const char *lend, const char* substr);
306 /*the reversed, rightside equivalent of strstr: starts searching
307 from right end (rstart), going back to left end (lend) and returns
308 a pointer to the last (right) matching character in str */
309
310 char* strifind(const char* str, const char* substr);
311 // the case insensitive version of strstr -- finding a string within a string
312
313
314 //Determines if a string begins with a given prefix
315 //(returns false when any of the params is NULL,
316 // but true when prefix is '' (empty string)!)
317 bool startsWith(const char* s, const char* prefix);
318
319 bool endsWith(const char* s, const char* suffix);
320 //Note: returns true if suffix is empty string, but false if it's NULL
321
322
323 // ELF hash function for strings
324 int strhash(const char* str);
325
326
327
328 //---- generic base GSeg : genomic segment (interval) --
329 // coordinates are considered 1-based (so 0 is invalid)
330 class GSeg {
331 public:
332 uint start; //start<end always!
333 uint end;
334 GSeg(uint s=0,uint e=0) {
335 if (s>e) { start=e;end=s; }
336 else { start=s;end=e; }
337 }
338 //check for overlap with other segment
339 uint len() { return end-start+1; }
340 bool overlap(GSeg* d) {
341 //return start<d->start ? (d->start<=end) : (start<=d->end);
342 return (start<=d->end && end>=d->start);
343 }
344
345 bool overlap(GSeg& d) {
346 //return start<d.start ? (d.start<=end) : (start<=d.end);
347 return (start<=d.end && end>=d.start);
348 }
349
350 bool overlap(GSeg& d, int fuzz) {
351 //return start<d.start ? (d.start<=end+fuzz) : (start<=d.end+fuzz);
352 return (start<=d.end+fuzz && end+fuzz>=d.start);
353 }
354
355 bool overlap(uint s, uint e) {
356 if (s>e) { Gswap(s,e); }
357 //return start<s ? (s<=end) : (start<=e);
358 return (start<=e && end>=s);
359 }
360
361 //return the length of overlap between two segments
362 int overlapLen(GSeg* r) {
363 if (start<r->start) {
364 if (r->start>end) return 0;
365 return (r->end>end) ? end-r->start+1 : r->end-r->start+1;
366 }
367 else { //r->start<=start
368 if (start>r->end) return 0;
369 return (r->end<end)? r->end-start+1 : end-start+1;
370 }
371 }
372 int overlapLen(uint rstart, uint rend) {
373 if (rstart>rend) { Gswap(rstart,rend); }
374 if (start<rstart) {
375 if (rstart>end) return 0;
376 return (rend>end) ? end-rstart+1 : rend-rstart+1;
377 }
378 else { //rstart<=start
379 if (start>rend) return 0;
380 return (rend<end)? rend-start+1 : end-start+1;
381 }
382 }
383
384 //fuzzy coordinate matching:
385 bool coordMatch(GSeg* s, uint fuzz=0) {
386 if (fuzz==0) return (start==s->start && end==s->end);
387 uint sd = (start>s->start) ? start-s->start : s->start-start;
388 uint ed = (end>s->end) ? end-s->end : s->end-end;
389 return (sd<=fuzz && ed<=fuzz);
390 }
391 //comparison operators required for sorting
392 bool operator==(GSeg& d){
393 return (start==d.start && end==d.end);
394 }
395 bool operator<(GSeg& d){
396 return (start==d.start)?(end<d.end):(start<d.start);
397 }
398 };
399
400
401
402 //--------------------------------------------------------
403 // ************** simple line reading class for text files
404
405 //GLineReader -- text line reading/buffering class
406 class GLineReader {
407 bool closeFile;
408 int len;
409 int allocated;
410 char* buf;
411 bool isEOF;
412 FILE* file;
413 off_t filepos; //current position
414 bool pushed; //pushed back
415 int lcount; //line counter (read lines)
416 public:
417 char* chars() { return buf; }
418 char* line() { return buf; }
419 int readcount() { return lcount; } //number of lines read
420 void setFile(FILE* stream) { file=stream; }
421 int length() { return len; }
422 int size() { return len; } //same as size();
423 bool isEof() {return isEOF; }
424 bool eof() { return isEOF; }
425 off_t getfpos() { return filepos; }
426 off_t getFpos() { return filepos; }
427 char* nextLine() { return getLine(); }
428 char* getLine() { if (pushed) { pushed=false; return buf; }
429 else return getLine(file); }
430 char* getLine(FILE* stream) {
431 if (pushed) { pushed=false; return buf; }
432 else return getLine(stream, filepos); }
433 char* getLine(FILE* stream, off_t& f_pos); //read a line from a stream and update
434 // the given file position
435 void pushBack() { if (lcount>0) pushed=true; } // "undo" the last getLine request
436 // so the next call will in fact return the same line
437 GLineReader(const char* fname) {
438 FILE* f=fopen(fname, "rb");
439 if (f==NULL) GError("Error opening file '%s'!\n",fname);
440 closeFile=true;
441 init(f);
442 }
443 GLineReader(FILE* stream=NULL, off_t fpos=0) {
444 closeFile=false;
445 init(stream,fpos);
446 }
447 void init(FILE* stream, off_t fpos=0) {
448 len=0;
449 isEOF=false;
450 allocated=1024;
451 GMALLOC(buf,allocated);
452 lcount=0;
453 buf[0]=0;
454 file=stream;
455 filepos=fpos;
456 pushed=false;
457 }
458 ~GLineReader() {
459 GFREE(buf);
460 if (closeFile) fclose(file);
461 }
462 };
463
464
465 /* extended fgets() - to read one full line from a file and
466 update the file position correctly !
467 buf will be reallocated as necessary, to fit the whole line
468 */
469 char* fgetline(char* & buf, int& buflen, FILE* stream, off_t* f_pos=NULL, int* linelen=NULL);
470
471
472 //print int/values nicely formatted in 3-digit groups
473 char* commaprint(uint64 n);
474
475 /*********************** File management functions *********************/
476
477 // removes the last part (file or directory name) of a full path
478 // WARNING: this is a destructive operation for the given string!
479 void delFileName(char* filepath);
480
481 // returns a pointer to the last file or directory name in a full path
482 const char* getFileName(const char* filepath);
483 // returns a pointer to the file "extension" part in a filename
484 const char* getFileExt(const char* filepath);
485
486
487 int fileExists(const char* fname);
488 //returns 0 if file entry doesn't exist
489 // 1 if it's a directory
490 // 2 if it's a regular file
491 // 3 otherwise (?)
492
493 int64 fileSize(const char* fpath);
494
495 //write a FASTA-formatted sequence record
496 void writeFasta(FILE *fw, const char* seqid, const char* descr,
497 const char* seq, int linelen=60, int seqlen=0);
498
499 //parses the next number found in a string at the current position
500 //until a non-digit (and not a '.', 'e','E','-','+') is encountered;
501 //updates the char* pointer to be after the last digit parsed
502 bool parseNumber(char* &p, double& v);
503 bool parseDouble(char* &p, double& v); //just an alias for parseNumber
504
505 bool parseInt(char* &p, int& i);
506 bool parseUInt(char* &p, uint& i);
507 bool parseHex(char* &p, uint& i);
508
509 #endif /* G_BASE_DEFINED */