ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GBase.h
Revision: 173
Committed: Wed Feb 15 03:34:29 2012 UTC (7 years, 4 months ago) by gpertea
File size: 15440 byte(s)
Log Message:
wip fqtrim

Line User Rev File contents
1 gpertea 2 #ifndef G_BASE_DEFINED
2     #define G_BASE_DEFINED
3 gpertea 16 #ifndef _POSIX_SOURCE
4     //mostly for MinGW
5     #define _POSIX_SOURCE
6     #endif
7     #ifdef HAVE_CONFIG_H
8     #include "config.h"
9     #endif
10 gpertea 2 #include <string.h>
11     #include <stdlib.h>
12     #include <stdio.h>
13     #include <math.h>
14     #include <limits.h>
15     #include <sys/types.h>
16     #include <sys/stat.h>
17 gpertea 16 #include <stdint.h>
18    
19     #if defined __WIN32__ || defined WIN32 || defined _WIN32 || defined _WIN32_
20     #ifndef __WIN32__
21     #define __WIN32__
22     #endif
23 gpertea 2 #include <windows.h>
24 gpertea 16 #include <io.h>
25     #define CHPATHSEP '\\'
26     #undef off_t
27     #define off_t int64_t
28 gpertea 173 #ifndef popen
29     #define popen _popen
30     #endif
31 gpertea 16 #ifdef _fseeki64
32     #define fseeko(stream, offset, origin) _fseeki64(stream, offset, origin)
33     #else
34     /*
35     #define _DEFINE_WIN32_FSEEKO
36     int fseeko(FILE *stream, off_t offset, int whence);
37     */
38     #define fseeko fseek
39     #endif
40     #ifdef _ftelli64
41     #define ftello(stream) _ftelli64(stream)
42     #else
43     /*
44     #define _DEFINE_WIN32_FTELLO
45     off_t ftello(FILE *stream);
46     */
47     #define ftello ftell
48     #endif
49     #else
50     #define CHPATHSEP '/'
51     #include <unistd.h>
52 gpertea 2 #endif
53    
54 gpertea 36 #ifndef fseeko
55     #define fseeko fseek
56     #endif
57     #ifndef ftello
58     #define ftello ftell
59     #endif
60 gpertea 16
61 gpertea 2 #ifdef DEBUG
62     #undef NDEBUG
63     #endif
64    
65 gpertea 16 typedef int32_t int32;
66     typedef uint32_t uint32;
67 gpertea 171 typedef int16_t int16;
68     typedef uint16_t uint16;
69 gpertea 16
70 gpertea 2 typedef unsigned char uchar;
71     typedef unsigned char byte;
72    
// Legacy names for the limits of int / unsigned int.
// Each one is only defined when the platform headers have not already done so.
#if !defined(MAXUINT)
  #define MAXUINT UINT_MAX
#endif

#if !defined(MAXINT)
  #define MAXINT INT_MAX
#endif

#if !defined(MAX_UINT)
  #define MAX_UINT UINT_MAX
#endif

#if !defined(MAX_INT)
  #define MAX_INT INT_MAX
#endif
88    
89     typedef int64_t int64;
90     typedef uint64_t uint64;
91    
92 gpertea 2 /****************************************************************************/
93    
94     #ifndef EXIT_FAILURE
95     #define EXIT_FAILURE 1
96     #endif
97    
98     #ifndef EXIT_SUCCESS
99     #define EXIT_SUCCESS 0
100     #endif
101    
102     /****************************************************************************/
103     #define ERR_ALLOC "Error allocating memory.\n"
104    
105     //-------------------
106    
// Debug helpers

// GASSERT(exp): unless NDEBUG is defined, calls GAssert() with the text of
// the expression, the file name and the line number when exp is false.
// With NDEBUG defined it expands to a no-op and exp is not evaluated.
#if defined(NDEBUG)
  #define GASSERT(exp) ((void)0)
#else
  #define GASSERT(exp) ((exp)?((void)0):(void)GAssert(#exp,__FILE__,__LINE__))
#endif

// GTRACE(exp): exp is a parenthesized GMessage() argument list, e.g.
// GTRACE(("x=%d\n", x)). It is only forwarded to GMessage() when TRACE is
// defined and NDEBUG is not; otherwise it expands to a no-op.
#if !defined(NDEBUG) && defined(TRACE)
  #define GTRACE(exp) (GMessage exp)
#else
  #define GTRACE(exp) ((void)0)
#endif

// GERROR(exp): exp is a parenthesized GError() argument list.
#define GERROR(exp) (GError exp)
121     /********************************** Macros ***********************************/
// NOTE: all of the macros below may evaluate their arguments more than once,
// so arguments with side effects (i++, function calls) should be avoided.

// Absolute value
#define GABS(val) ((val)>=0 ? (val) : -(val))

// Maximum / minimum of two values (b is the result when they compare equal)
#define GMAX(a,b) ((a)>(b) ? (a) : (b))
#define GMIN(a,b) ((a)>(b) ? (b) : (a))

// Minimum of three values
#define GMIN3(x,y,z) ((x)<(y) ? ((x)>(z) ? (z) : (x)) : ((y)>(z) ? (z) : (y)))

// Maximum of three values
#define GMAX3(x,y,z) ((x)>(y) ? ((x)>(z) ? (x) : (z)) : ((y)>(z) ? (y) : (z)))

// Store the smaller of a, b in lo and the larger in hi
#define GMINMAX(lo,hi,a,b) ((a)<(b) ? ((lo)=(a), (hi)=(b)) : ((lo)=(b), (hi)=(a)))

// Clamp value x to the range [lo..hi]
#define GCLAMP(lo,x,hi) ((x)<(lo) ? (lo) : ((x)>(hi) ? (hi) : (x)))
140    
141     typedef void* pointer;
142     typedef unsigned int uint;
143    
144     typedef int GCompareProc(const pointer item1, const pointer item2);
145     typedef void GFreeProc(pointer item); //usually just delete,
146     //but may also support structures with embedded dynamic members
147    
// Allocation helpers. Each one passes the address of ptr (cast to pointer*)
// and size to the matching GMalloc()/GCalloc()/GRealloc() routine and calls
// GError(ERR_ALLOC) when that routine returns false.
// The bodies are wrapped in do { ... } while (0) so that every use is exactly
// one statement: a bare `if` here would silently capture an `else` that
// follows the macro call (e.g. `if (x) GMALLOC(p,n); else ...`).
// ptr is parenthesized so that any lvalue expression can be passed.
#define GMALLOC(ptr,size) do { \
    if (!GMalloc((pointer*)(&(ptr)),(size))) GError(ERR_ALLOC); \
  } while (0)
#define GCALLOC(ptr,size) do { \
    if (!GCalloc((pointer*)(&(ptr)),(size))) GError(ERR_ALLOC); \
  } while (0)
#define GREALLOC(ptr,size) do { \
    if (!GRealloc((pointer*)(&(ptr)),(size))) GError(ERR_ALLOC); \
  } while (0)
// Releases ptr through GFree(), which receives the address of ptr.
#define GFREE(ptr) GFree((pointer*)(&(ptr)))
155    
// Returns whichever of the two C strings orders first according to strcmp();
// arg2 is returned when the strings compare equal.
// Neither argument may be NULL (both are passed straight to strcmp).
inline char* strMin(char *arg1, char *arg2) {
  if (strcmp(arg1, arg2) < 0) {
    return arg1;
  }
  return arg2;
}
159    
// Returns whichever of the two C strings orders last according to strcmp();
// arg2 is returned when the strings compare equal.
// Neither argument may be NULL (both are passed straight to strcmp).
inline char* strMax(char *arg1, char *arg2) {
  if (strcmp(arg2, arg1) < 0) {
    return arg1;
  }
  return arg2;
}
163    
// Rounds x to the nearest integer by taking floor(x + 0.5), so exact halves
// go towards +infinity (2.5 -> 3, -2.5 -> -2). The result is cast to int.
inline int iround(double x) {
  const double shifted = x + 0.5;
  return (int)floor(shifted);
}
167    
168     /****************************************************************************/
169    
// Three-way comparison of two ints for use as a sort/compare callback.
// Returns 1 when a>b, 0 when a==b and -1 when a<b.
// The result is built from the two comparisons rather than from a-b,
// because the subtraction overflows (undefined behaviour, and in practice
// the wrong sign) when the operands are far apart, e.g. INT_MAX and -1.
inline int Gintcmp(int a, int b) {
  return (a>b) - (a<b);
}
174    
175 gpertea 150 int Gstrcmp(const char* a, const char* b, int n=-1);
176 gpertea 2 //same as strcmp but doesn't crash on NULL pointers
177    
178 gpertea 150 int Gstricmp(const char* a, const char* b, int n=-1);
179 gpertea 2
//generic swap: exchanges the values of lhs and rhs through one temporary
//(T must be copy-constructible and assignable)
template<class T> void Gswap(T& lhs, T& rhs) {
  T saved=lhs; //keep the old left value while it gets overwritten
  lhs=rhs;
  rhs=saved;
}
187 gpertea 2
/// bitCount_32 - returns the number of bits set to 1 in Value.
/// Ex. bitCount_32(0xF000F000) = 8
/// Returns 0 if the word is zero.
inline uint bitCount_32(uint32_t Value) {
#if __GNUC__ >= 4
  return __builtin_popcount(Value);
#else
  // Portable fallback: add up the bits in parallel, first within 2-bit
  // groups, then within 4-bit groups, then within each byte.
  uint32_t pairs = Value - ((Value >> 1) & 0x55555555);
  uint32_t nibbles = (pairs & 0x33333333) + ((pairs >> 2) & 0x33333333);
  uint32_t bytes = (nibbles + (nibbles >> 4)) & 0xF0F0F0F;
  // the multiplication accumulates the four byte counts into the top byte
  return (bytes * 0x1010101) >> 24;
#endif
}
200 gpertea 2
/// bitCount_64 - returns the number of bits set to 1 in Value
/// (64 bit edition.) Returns 0 if the word is zero.
inline uint bitCount_64(uint64_t Value) {
#if __GNUC__ >= 4
  return __builtin_popcountll(Value);
#else
  // Portable fallback: add up the bits in parallel, first within 2-bit
  // groups, then within 4-bit groups, then within each byte.
  uint64_t pairs = Value - ((Value >> 1) & 0x5555555555555555ULL);
  uint64_t nibbles = (pairs & 0x3333333333333333ULL)
                   + ((pairs >> 2) & 0x3333333333333333ULL);
  uint64_t bytes = (nibbles + (nibbles >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
  // the multiplication accumulates the eight byte counts into the top byte
  uint64_t total = (uint64_t)(bytes * 0x0101010101010101ULL) >> 56;
  return (uint)total;
#endif
}
213 gpertea 2
/// bitCountTrailingZeros_32 - counts the zero bits below the lowest set bit
/// of Value, i.e. the index of the least significant 1 bit.
/// Ex. bitCountTrailingZeros_32(0xFF00FF00) == 8.
/// Returns 32 if the word is zero.
inline unsigned bitCountTrailingZeros_32(uint32_t Value) {
#if __GNUC__ >= 4
  if (Value == 0) return 32; // the builtin is not defined for a zero argument
  return __builtin_ctz(Value);
#else
  // Portable fallback: isolate the lowest set bit and use its remainder
  // modulo 37 as an index into a lookup table; entry 0 handles Value == 0.
  static const unsigned Mod37BitPosition[] = {
    32, 0, 1, 26, 2, 23, 27, 0, 3, 16, 24, 30, 28, 11, 0, 13,
    4, 7, 17, 0, 25, 22, 31, 15, 29, 10, 12, 6, 0, 21, 14, 9,
    5, 20, 8, 19, 18
  };
  const uint32_t lowestBit = Value & (~Value + 1u);
  return Mod37BitPosition[lowestBit % 37];
#endif
}
230 gpertea 2
/// bitCountTrailingZeros_64 - counts the zero bits below the lowest set bit
/// of Value, i.e. the index of the least significant 1 bit (64 bit edition.)
/// Returns 64 if the word is zero.
inline unsigned bitCountTrailingZeros_64(uint64_t Value) {
#if __GNUC__ >= 4
  if (Value == 0) return 64; // the builtin is not defined for a zero argument
  return __builtin_ctzll(Value);
#else
  // Portable fallback: isolate the lowest set bit and use its remainder
  // modulo 67 as an index into a lookup table; entry 0 handles Value == 0.
  static const unsigned Mod67Position[] = {
    64, 0, 1, 39, 2, 15, 40, 23, 3, 12, 16, 59, 41, 19, 24, 54,
    4, 64, 13, 10, 17, 62, 60, 28, 42, 30, 20, 51, 25, 44, 55,
    47, 5, 32, 65, 38, 14, 22, 11, 58, 18, 53, 63, 9, 61, 27,
    29, 50, 43, 46, 31, 37, 21, 57, 52, 8, 26, 49, 45, 36, 56,
    7, 48, 35, 6, 34, 33, 0
  };
  const uint64_t lowestBit = Value & (~Value + 1ULL);
  return Mod67Position[lowestBit % 67];
#endif
}
249 gpertea 2
250     /**************** Memory management ***************************/
251    
252     bool GMalloc(pointer* ptr, unsigned long size); // Allocate memory
253     bool GCalloc(pointer* ptr, unsigned long size); // Allocate and initialize memory
254     bool GRealloc(pointer* ptr,unsigned long size); // Resize memory
255     void GFree(pointer* ptr); // Free memory, resets ptr to NULL
256    
257    
258 gpertea 16 int saprintf(char **retp, const char *fmt, ...);
259    
260 gpertea 2 void GError(const char* format,...); // Error routine (aborts program)
261     void GMessage(const char* format,...);// Log message to stderr
262     // Assert failed routine:- usually not called directly but through GASSERT
263     void GAssert(const char* expression, const char* filename, unsigned int lineno);
264    
265     // ****************** string manipulation *************************
266     char *Gstrdup(const char* str);
267     //duplicate a string by allocating a copy for it and returning it
268     char* Gstrdup(const char* sfrom, const char* sto);
269     //same as Gstrdup, but with an early termination (e.g. on delimiter)
270    
271     char* Gsubstr(const char* str, char* from, char* to=NULL);
272     //extracts a substring, allocating it, including boundaries (from/to)
273    
274     int strsplit(char* str, char** fields, int maxfields, const char* delim);
275     int strsplit(char* str, char** fields, int maxfields, const char delim);
276     int strsplit(char* str, char** fields, int maxfields); //splits by tab or space
277    
278     char* replaceStr(char* &str, char* newvalue);
279    
280     //conversion: to Lower/Upper case
281     // creating a new string:
282     char* upCase(const char* str);
283     char* loCase(const char* str);
284     // changing string in place:
285     char* strlower(char * str);
286     char* strupper(char * str);
287    
288     //strstr but for memory zones: scans a memory region
289     //for a substring:
290     void* Gmemscan(void *mem, unsigned int len,
291     void *part, unsigned int partlen);
292    
293     // test if a char is in a string:
294 gpertea 16 bool chrInStr(char c, const char* str);
295 gpertea 2
296     char* rstrchr(char* str, char ch);
297     /* returns a pointer to the rightmost
298     occurrence of ch in str - like rindex for platforms missing it*/
299    
300 gpertea 16 char* strchrs(const char* s, const char* chrs);
301 gpertea 2 //strchr but with a set of chars instead of only one
302    
303 gpertea 90 char* rstrfind(const char* str, const char *substr);
304     // like rindex() but for strings; right side version of strstr()
305 gpertea 2
306 gpertea 90 char* reverseChars(char* str, int slen=0); //in place reversal of string
307    
308 gpertea 16 char* rstrstr(const char* rstart, const char *lend, const char* substr);
309 gpertea 2 /*the reversed, rightside equivalent of strstr: starts searching
310     from right end (rstart), going back to left end (lend) and returns
311     a pointer to the last (right) matching character in str */
312    
313 gpertea 16 char* strifind(const char* str, const char* substr);
314 gpertea 2 // the case insensitive version of strstr -- finding a string within a string
315    
316    
317     //Determines if a string begins with a given prefix
318     //(returns false when any of the params is NULL,
319     // but true when prefix is '' (empty string)!)
320 gpertea 16 bool startsWith(const char* s, const char* prefix);
321 gpertea 2
322 gpertea 16 bool endsWith(const char* s, const char* suffix);
323     //Note: returns true if suffix is empty string, but false if it's NULL
324    
325    
326 gpertea 2 // ELF hash function for strings
327     int strhash(const char* str);
328    
329    
330    
331     //---- generic base GSeg : genomic segment (interval) --
332     // coordinates are considered 1-based (so 0 is invalid)
333     class GSeg {
334     public:
335     uint start; //start<end always!
336     uint end;
337     GSeg(uint s=0,uint e=0) {
338     if (s>e) { start=e;end=s; }
339     else { start=s;end=e; }
340     }
341     //check for overlap with other segment
342     uint len() { return end-start+1; }
343     bool overlap(GSeg* d) {
344 gpertea 16 //return start<d->start ? (d->start<=end) : (start<=d->end);
345     return (start<=d->end && end>=d->start);
346 gpertea 2 }
347    
348     bool overlap(GSeg& d) {
349 gpertea 16 //return start<d.start ? (d.start<=end) : (start<=d.end);
350     return (start<=d.end && end>=d.start);
351 gpertea 2 }
352    
353     bool overlap(GSeg& d, int fuzz) {
354 gpertea 16 //return start<d.start ? (d.start<=end+fuzz) : (start<=d.end+fuzz);
355     return (start<=d.end+fuzz && end+fuzz>=d.start);
356 gpertea 2 }
357    
358     bool overlap(uint s, uint e) {
359 gpertea 144 if (s>e) { Gswap(s,e); }
360 gpertea 16 //return start<s ? (s<=end) : (start<=e);
361     return (start<=e && end>=s);
362 gpertea 2 }
363    
364     //return the length of overlap between two segments
365     int overlapLen(GSeg* r) {
366     if (start<r->start) {
367     if (r->start>end) return 0;
368     return (r->end>end) ? end-r->start+1 : r->end-r->start+1;
369     }
370     else { //r->start<=start
371     if (start>r->end) return 0;
372     return (r->end<end)? r->end-start+1 : end-start+1;
373     }
374     }
375     int overlapLen(uint rstart, uint rend) {
376 gpertea 144 if (rstart>rend) { Gswap(rstart,rend); }
377 gpertea 2 if (start<rstart) {
378     if (rstart>end) return 0;
379     return (rend>end) ? end-rstart+1 : rend-rstart+1;
380     }
381     else { //rstart<=start
382     if (start>rend) return 0;
383     return (rend<end)? rend-start+1 : end-start+1;
384     }
385     }
386    
387     //fuzzy coordinate matching:
388     bool coordMatch(GSeg* s, uint fuzz=0) {
389     if (fuzz==0) return (start==s->start && end==s->end);
390     uint sd = (start>s->start) ? start-s->start : s->start-start;
391     uint ed = (end>s->end) ? end-s->end : s->end-end;
392     return (sd<=fuzz && ed<=fuzz);
393     }
394     //comparison operators required for sorting
395     bool operator==(GSeg& d){
396     return (start==d.start && end==d.end);
397     }
398     bool operator<(GSeg& d){
399     return (start==d.start)?(end<d.end):(start<d.start);
400     }
401     };
402    
403    
404    
405     //--------------------------------------------------------
406     // ************** simple line reading class for text files
407    
408     //GLineReader -- text line reading/buffering class
409     class GLineReader {
410 gpertea 16 bool closeFile;
411 gpertea 2 int len;
412     int allocated;
413     char* buf;
414     bool isEOF;
415     FILE* file;
416     off_t filepos; //current position
417     bool pushed; //pushed back
418     int lcount; //line counter (read lines)
419     public:
420     char* chars() { return buf; }
421     char* line() { return buf; }
422     int readcount() { return lcount; } //number of lines read
423 gpertea 16 void setFile(FILE* stream) { file=stream; }
424 gpertea 2 int length() { return len; }
425     int size() { return len; } //same as size();
426     bool isEof() {return isEOF; }
427     bool eof() { return isEOF; }
428     off_t getfpos() { return filepos; }
429     off_t getFpos() { return filepos; }
430     char* nextLine() { return getLine(); }
431     char* getLine() { if (pushed) { pushed=false; return buf; }
432     else return getLine(file); }
433     char* getLine(FILE* stream) {
434     if (pushed) { pushed=false; return buf; }
435     else return getLine(stream, filepos); }
436     char* getLine(FILE* stream, off_t& f_pos); //read a line from a stream and update
437     // the given file position
438     void pushBack() { if (lcount>0) pushed=true; } // "undo" the last getLine request
439     // so the next call will in fact return the same line
440 gpertea 16 GLineReader(const char* fname) {
441     FILE* f=fopen(fname, "rb");
442     if (f==NULL) GError("Error opening file '%s'!\n",fname);
443     closeFile=true;
444     init(f);
445     }
446 gpertea 2 GLineReader(FILE* stream=NULL, off_t fpos=0) {
447 gpertea 16 closeFile=false;
448     init(stream,fpos);
449     }
450     void init(FILE* stream, off_t fpos=0) {
451 gpertea 2 len=0;
452     isEOF=false;
453     allocated=1024;
454     GMALLOC(buf,allocated);
455     lcount=0;
456     buf[0]=0;
457     file=stream;
458     filepos=fpos;
459     pushed=false;
460     }
461     ~GLineReader() {
462     GFREE(buf);
463 gpertea 16 if (closeFile) fclose(file);
464 gpertea 2 }
465     };
466    
467    
468     /* extended fgets() - to read one full line from a file and
469     update the file position correctly !
470     buf will be reallocated as necessary, to fit the whole line
471     */
472     char* fgetline(char* & buf, int& buflen, FILE* stream, off_t* f_pos=NULL, int* linelen=NULL);
473    
474 gpertea 16
475     //print int/values nicely formatted in 3-digit groups
476     char* commaprint(uint64 n);
477    
478 gpertea 2 /*********************** File management functions *********************/
479    
480 gpertea 16 // removes the last part (file or directory name) of a full path
481     // WARNING: this is a destructive operation for the given string!
482 gpertea 2 void delFileName(char* filepath);
483    
484 gpertea 16 // returns a pointer to the last file or directory name in a full path
485     const char* getFileName(const char* filepath);
486     // returns a pointer to the file "extension" part in a filename
487     const char* getFileExt(const char* filepath);
488 gpertea 2
489 gpertea 16
490 gpertea 2 int fileExists(const char* fname);
491     //returns 0 if file entry doesn't exist
492     // 1 if it's a directory
493     // 2 if it's a regular file
494     // 3 otherwise (?)
495    
496 gpertea 16 int64 fileSize(const char* fpath);
497 gpertea 2
498 gpertea 16 //write a record in FASTA format
499     void writeFasta(FILE *fw, const char* seqid, const char* descr,
500     const char* seq, int linelen=60, int seqlen=0);
501    
502 gpertea 2 //parses the next number found in a string at the current position
503     //until a non-digit (and not a '.', 'e','E','-','+') is encountered;
504     //updates the char* pointer to be after the last digit parsed
505     bool parseNumber(char* &p, double& v);
506     bool parseDouble(char* &p, double& v); //just an alias for parseNumber
507    
508     bool parseInt(char* &p, int& i);
509     bool parseUInt(char* &p, uint& i);
510     bool parseHex(char* &p, uint& i);
511    
512     #endif /* G_BASE_DEFINED */