ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GBase.h
Revision: 171
Committed: Tue Feb 14 22:36:26 2012 UTC (7 years, 8 months ago) by gpertea
File size: 15391 byte(s)
Log Message:
wip fqtrim

Line User Rev File contents
1 gpertea 2 #ifndef G_BASE_DEFINED
2     #define G_BASE_DEFINED
3 gpertea 16 #ifndef _POSIX_SOURCE
4     //mostly for MinGW
5     #define _POSIX_SOURCE
6     #endif
7     #ifdef HAVE_CONFIG_H
8     #include "config.h"
9     #endif
10 gpertea 2 #include <string.h>
11     #include <stdlib.h>
12     #include <stdio.h>
13     #include <math.h>
14     #include <limits.h>
15     #include <sys/types.h>
16     #include <sys/stat.h>
17 gpertea 16 #include <stdint.h>
18    
19     #if defined __WIN32__ || defined WIN32 || defined _WIN32 || defined _WIN32_
20     #ifndef __WIN32__
21     #define __WIN32__
22     #endif
23 gpertea 2 #include <windows.h>
24 gpertea 16 #include <io.h>
25     #define CHPATHSEP '\\'
26     #undef off_t
27     #define off_t int64_t
28     #ifdef _fseeki64
29     #define fseeko(stream, offset, origin) _fseeki64(stream, offset, origin)
30     #else
31     /*
32     #define _DEFINE_WIN32_FSEEKO
33     int fseeko(FILE *stream, off_t offset, int whence);
34     */
35     #define fseeko fseek
36     #endif
37     #ifdef _ftelli64
38     #define ftello(stream) _ftelli64(stream)
39     #else
40     /*
41     #define _DEFINE_WIN32_FTELLO
42     off_t ftello(FILE *stream);
43     */
44     #define ftello ftell
45     #endif
46     #else
47     #define CHPATHSEP '/'
48     #include <unistd.h>
49 gpertea 2 #endif
50    
51 gpertea 36 #ifndef fseeko
52     #define fseeko fseek
53     #endif
54     #ifndef ftello
55     #define ftello ftell
56     #endif
57 gpertea 16
58 gpertea 2 #ifdef DEBUG
59     #undef NDEBUG
60     #endif
61    
62 gpertea 16 typedef int32_t int32;
63     typedef uint32_t uint32;
64 gpertea 171 typedef int16_t int16;
65     typedef uint16_t uint16;
66 gpertea 16
67 gpertea 2 typedef unsigned char uchar;
68     typedef unsigned char byte;
69    
70     #ifndef MAXUINT
71     #define MAXUINT ((unsigned int)-1)
72     #endif
73    
74 gpertea 16 #ifndef MAXINT
75     #define MAXINT INT_MAX
76 gpertea 2 #endif
77    
78 gpertea 16 #ifndef MAX_UINT
79     #define MAX_UINT ((unsigned int)-1)
80     #endif
81    
82     #ifndef MAX_INT
83     #define MAX_INT INT_MAX
84     #endif
85    
86     typedef int64_t int64;
87     typedef uint64_t uint64;
88    
89 gpertea 2 /****************************************************************************/
90    
91     #ifndef EXIT_FAILURE
92     #define EXIT_FAILURE 1
93     #endif
94    
95     #ifndef EXIT_SUCCESS
96     #define EXIT_SUCCESS 0
97     #endif
98    
99     /****************************************************************************/
100     #define ERR_ALLOC "Error allocating memory.\n"
101    
102     //-------------------
103    
104     // Debug helpers
105     #ifndef NDEBUG
106     #define GASSERT(exp) ((exp)?((void)0):(void)GAssert(#exp,__FILE__,__LINE__))
107     #ifdef TRACE
108     #define GTRACE(exp) (GMessage exp)
109     #else
110     #define GTRACE(exp) ((void)0)
111     #endif
112     #else
113     #define GASSERT(exp) ((void)0)
114     #define GTRACE(exp) ((void)0)
115     #endif
116    
117     #define GERROR(exp) (GError exp)
118     /********************************** Macros ***********************************/
119     // Absolute value
120     #define GABS(val) (((val)>=0)?(val):-(val))
121    
122     // Min and Max
123     #define GMAX(a,b) (((a)>(b))?(a):(b))
124     #define GMIN(a,b) (((a)>(b))?(b):(a))
125    
126     // Min of three
127     #define GMIN3(x,y,z) ((x)<(y)?GMIN(x,z):GMIN(y,z))
128    
129     // Max of three
130     #define GMAX3(x,y,z) ((x)>(y)?GMAX(x,z):GMAX(y,z))
131    
132     // Return minimum and maximum of a, b
133     #define GMINMAX(lo,hi,a,b) ((a)<(b)?((lo)=(a),(hi)=(b)):((lo)=(b),(hi)=(a)))
134    
135     // Clamp value x to range [lo..hi]
136     #define GCLAMP(lo,x,hi) ((x)<(lo)?(lo):((x)>(hi)?(hi):(x)))
137    
138     typedef void* pointer;
139     typedef unsigned int uint;
140    
141     typedef int GCompareProc(const pointer item1, const pointer item2);
142     typedef void GFreeProc(pointer item); //usually just delete,
143     //but may also support structures with embedded dynamic members
144    
// Checked allocation helpers built on GMalloc/GCalloc/GRealloc/GFree.
// Each one passes the address of `ptr` to the matching function and calls
// GError(ERR_ALLOC) when that function returns false.
// The do { } while (0) wrapper makes every macro behave as one statement,
// so `if (c) GMALLOC(p,n); else ...` pairs the else with the caller's if
// instead of with the macro's internal if.
// `ptr` must be an lvalue (its address is taken).
#define GMALLOC(ptr,size) do { \
    if (!GMalloc((pointer*)(&(ptr)),size)) GError(ERR_ALLOC); \
  } while (0)
#define GCALLOC(ptr,size) do { \
    if (!GCalloc((pointer*)(&(ptr)),size)) GError(ERR_ALLOC); \
  } while (0)
#define GREALLOC(ptr,size) do { \
    if (!GRealloc((pointer*)(&(ptr)),size)) GError(ERR_ALLOC); \
  } while (0)
#define GFREE(ptr) GFree((pointer*)(&(ptr)))
152    
// Returns whichever of the two C strings orders first under strcmp();
// when they compare equal, arg2 is the one returned.
// Both arguments are passed straight to strcmp(), so neither may be NULL.
inline char* strMin(char *arg1, char *arg2) {
  if (strcmp(arg1, arg2) < 0) return arg1;
  return arg2;
}
156    
// Returns whichever of the two C strings orders last under strcmp();
// when they compare equal, arg2 is the one returned.
// Both arguments are passed straight to strcmp(), so neither may be NULL.
inline char* strMax(char *arg1, char *arg2) {
  const int order = strcmp(arg2, arg1);
  if (order < 0) return arg1;
  return arg2;
}
160    
// Rounds x to the nearest integer by taking floor(x + 0.5), so exact
// halves go toward +infinity (2.5 -> 3, -2.5 -> -2).
// The result is converted to int without any range check.
inline int iround(double x) {
  const double shifted = x + 0.5;
  return (int)floor(shifted);
}
164    
165     /****************************************************************************/
166    
// Three-way comparison of two ints: returns 1 when a>b, 0 when a==b and
// -1 when a<b.
// The comparison is spelled out instead of returning a-b because the
// subtraction overflows (undefined behavior) for operands that are far
// apart, e.g. INT_MAX and -1, and can then report the wrong ordering.
inline int Gintcmp(int a, int b) {
  return (a>b) ? 1 : ((a==b) ? 0 : -1);
}
171    
172 gpertea 150 int Gstrcmp(const char* a, const char* b, int n=-1);
173 gpertea 2 //same as strcmp but doesn't crash on NULL pointers
174    
175 gpertea 150 int Gstricmp(const char* a, const char* b, int n=-1);
176 gpertea 2
// Minimal swap template: exchanges the values of lhs and rhs through one
// temporary. T must be copy-constructible and assignable.
template<class T> void Gswap(T& lhs, T& rhs) {
  T held = lhs; //copy of the left value, kept while lhs is overwritten
  lhs = rhs;
  rhs = held;
}
184 gpertea 2
/// bitCount_32 - counts the bits that are set in a 32-bit value.
/// Ex. bitCount_32(0xF000F000) = 8
/// Returns 0 if the word is zero.
inline uint bitCount_32(uint32_t Value) {
#if __GNUC__ >= 4
  return __builtin_popcount(Value);
#else
  // Parallel bit count: add up neighbouring 1-bit, 2-bit and 4-bit fields,
  // then let the multiplication sum the four byte counts into the top byte.
  uint32_t pairs = Value - ((Value >> 1) & 0x55555555);
  uint32_t nibbles = (pairs & 0x33333333) + ((pairs >> 2) & 0x33333333);
  uint32_t bytes = (nibbles + (nibbles >> 4)) & 0xF0F0F0F;
  return (bytes * 0x1010101) >> 24;
#endif
}
197 gpertea 2
/// bitCount_64 - counts the bits that are set in a 64-bit value.
/// Returns 0 if the word is zero.
inline uint bitCount_64(uint64_t Value) {
#if __GNUC__ >= 4
  return __builtin_popcountll(Value);
#else
  // Same parallel counting scheme as the 32-bit version, on 64-bit masks.
  uint64_t acc = Value - ((Value >> 1) & 0x5555555555555555ULL);
  const uint64_t low2 = acc & 0x3333333333333333ULL;
  const uint64_t high2 = (acc >> 2) & 0x3333333333333333ULL;
  acc = low2 + high2;
  acc = (acc + (acc >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
  // the multiplication accumulates all eight byte counts in the top byte
  return uint((uint64_t)(acc * 0x0101010101010101ULL) >> 56);
#endif
}
210 gpertea 2
/// bitCountTrailingZeros_32 - counts the zero bits below the lowest set bit
/// of a 32-bit value. Ex. bitCountTrailingZeros_32(0xFF00FF00) == 8.
/// Returns 32 if the word is zero.
inline unsigned bitCountTrailingZeros_32(uint32_t Value) {
#if __GNUC__ >= 4
  // the zero case is answered here and never reaches the builtin
  if (Value == 0) return 32;
  return __builtin_ctz(Value);
#else
  // Lookup table indexed by (lowest set bit) % 37; index 0 is reached only
  // when Value is zero and holds the answer 32.
  static const unsigned Mod37BitPosition[] = {
    32, 0, 1, 26, 2, 23, 27, 0, 3, 16, 24, 30, 28, 11, 0, 13,
    4, 7, 17, 0, 25, 22, 31, 15, 29, 10, 12, 6, 0, 21, 14, 9,
    5, 20, 8, 19, 18
  };
  const uint32_t lowestBit = Value & (0u - Value); //isolates the lowest set bit
  return Mod37BitPosition[lowestBit % 37];
#endif
}
227 gpertea 2
/// bitCountTrailingZeros_64 - counts the zero bits below the lowest set bit
/// of a 64-bit value.
/// Returns 64 if the word is zero.
inline unsigned bitCountTrailingZeros_64(uint64_t Value) {
#if __GNUC__ >= 4
  // the zero case is answered here and never reaches the builtin
  if (Value == 0) return 64;
  return __builtin_ctzll(Value);
#else
  // Lookup table indexed by (lowest set bit) % 67; index 0 is reached only
  // when Value is zero and holds the answer 64.
  static const unsigned Mod67Position[] = {
    64, 0, 1, 39, 2, 15, 40, 23, 3, 12, 16, 59, 41, 19, 24, 54,
    4, 64, 13, 10, 17, 62, 60, 28, 42, 30, 20, 51, 25, 44, 55,
    47, 5, 32, 65, 38, 14, 22, 11, 58, 18, 53, 63, 9, 61, 27,
    29, 50, 43, 46, 31, 37, 21, 57, 52, 8, 26, 49, 45, 36, 56,
    7, 48, 35, 6, 34, 33, 0
  };
  const uint64_t lowestBit = Value & ((uint64_t)0 - Value); //isolates the lowest set bit
  return Mod67Position[lowestBit % 67];
#endif
}
246 gpertea 2
247     /**************** Memory management ***************************/
248    
249     bool GMalloc(pointer* ptr, unsigned long size); // Allocate memory
250     bool GCalloc(pointer* ptr, unsigned long size); // Allocate and initialize memory
251     bool GRealloc(pointer* ptr,unsigned long size); // Resize memory
252     void GFree(pointer* ptr); // Free memory, resets ptr to NULL
253    
254    
255 gpertea 16 int saprintf(char **retp, const char *fmt, ...);
256    
257 gpertea 2 void GError(const char* format,...); // Error routine (aborts program)
258     void GMessage(const char* format,...);// Log message to stderr
259     // Assert failed routine:- usually not called directly but through GASSERT
260     void GAssert(const char* expression, const char* filename, unsigned int lineno);
261    
262     // ****************** string manipulation *************************
263     char *Gstrdup(const char* str);
264     //duplicate a string by allocating a copy for it and returning it
265     char* Gstrdup(const char* sfrom, const char* sto);
266     //same as Gstrdup, but with an early termination (e.g. on delimiter)
267    
268     char* Gsubstr(const char* str, char* from, char* to=NULL);
269     //extracts a substring, allocating it, including boundaries (from/to)
270    
271     int strsplit(char* str, char** fields, int maxfields, const char* delim);
272     int strsplit(char* str, char** fields, int maxfields, const char delim);
273     int strsplit(char* str, char** fields, int maxfields); //splits by tab or space
274    
275     char* replaceStr(char* &str, char* newvalue);
276    
277     //conversion: to Lower/Upper case
278     // creating a new string:
279     char* upCase(const char* str);
280     char* loCase(const char* str);
281     // changing string in place:
282     char* strlower(char * str);
283     char* strupper(char * str);
284    
285     //strstr but for memory zones: scans a memory region
286     //for a substring:
287     void* Gmemscan(void *mem, unsigned int len,
288     void *part, unsigned int partlen);
289    
290     // test if a char is in a string:
291 gpertea 16 bool chrInStr(char c, const char* str);
292 gpertea 2
293     char* rstrchr(char* str, char ch);
294     /* returns a pointer to the rightmost
295     occurrence of ch in str - like rindex for platforms missing it*/
296    
297 gpertea 16 char* strchrs(const char* s, const char* chrs);
298 gpertea 2 //strchr but with a set of chars instead of only one
299    
300 gpertea 90 char* rstrfind(const char* str, const char *substr);
301     // like rindex() but for strings; right side version of strstr()
302 gpertea 2
303 gpertea 90 char* reverseChars(char* str, int slen=0); //in place reversal of string
304    
305 gpertea 16 char* rstrstr(const char* rstart, const char *lend, const char* substr);
306 gpertea 2 /*the reversed, rightside equivalent of strstr: starts searching
307     from right end (rstart), going back to left end (lend) and returns
308     a pointer to the last (right) matching character in str */
309    
310 gpertea 16 char* strifind(const char* str, const char* substr);
311 gpertea 2 // the case insensitive version of strstr -- finding a string within a string
312    
313    
314     //Determines if a string begins with a given prefix
315     //(returns false when any of the params is NULL,
316     // but true when prefix is '' (empty string)!)
317 gpertea 16 bool startsWith(const char* s, const char* prefix);
318 gpertea 2
319 gpertea 16 bool endsWith(const char* s, const char* suffix);
320     //Note: returns true if suffix is empty string, but false if it's NULL
321    
322    
323 gpertea 2 // ELF hash function for strings
324     int strhash(const char* str);
325    
326    
327    
//---- generic base GSeg : genomic segment (interval) --
// coordinates are considered 1-based (so 0 is invalid)
//
// All query members are const so they can be used on const segments.
// Reference parameters are deliberately left as non-const GSeg&: GSeg has a
// converting constructor from uint, so a const GSeg& parameter would accept
// temporaries built from integers and make calls such as overlap(10,20)
// ambiguous with the (uint,uint) overload.
class GSeg {
 public:
  uint start; //left coordinate; the constructor guarantees start<=end
  uint end;   //right coordinate, part of the segment (inclusive)

  //the two coordinates may be given in either order
  GSeg(uint s=0,uint e=0) {
    if (s>e) { start=e;end=s; }
    else { start=s;end=e; }
  }

  //number of positions covered, both ends included
  uint len() const { return end-start+1; }

  //check for overlap with other segment (sharing one position is enough)
  bool overlap(const GSeg* d) const {
    return (start<=d->end && end>=d->start);
  }

  bool overlap(GSeg& d) const {
    return (start<=d.end && end>=d.start);
  }

  //overlap test with both segments' right ends extended by fuzz;
  //fuzz is added in unsigned arithmetic, exactly as written
  bool overlap(GSeg& d, int fuzz) const {
    return (start<=d.end+fuzz && end+fuzz>=d.start);
  }

  //overlap test against a coordinate pair given in either order
  bool overlap(uint s, uint e) const {
    if (s>e) { uint t=s; s=e; e=t; }
    return (start<=e && end>=s);
  }

  //return the length of overlap between two segments (0 when disjoint)
  int overlapLen(const GSeg* r) const {
    //same computation as the coordinate version; r's coordinates get
    //ordered there should they ever be stored reversed
    return overlapLen(r->start, r->end);
  }

  //length of the overlap with the interval rstart..rend (either order)
  int overlapLen(uint rstart, uint rend) const {
    if (rstart>rend) { uint t=rstart; rstart=rend; rend=t; }
    if (rstart>end || start>rend) return 0; //disjoint
    const uint lo = (start<rstart) ? rstart : start; //rightmost left end
    const uint hi = (rend<end) ? rend : end;         //leftmost right end
    return hi-lo+1;
  }

  //fuzzy coordinate matching: each end may differ by at most fuzz
  bool coordMatch(const GSeg* s, uint fuzz=0) const {
    if (fuzz==0) return (start==s->start && end==s->end);
    uint sd = (start>s->start) ? start-s->start : s->start-start;
    uint ed = (end>s->end) ? end-s->end : s->end-end;
    return (sd<=fuzz && ed<=fuzz);
  }

  //comparison operators required for sorting
  bool operator==(GSeg& d) const {
    return (start==d.start && end==d.end);
  }
  //orders by start, then by end
  bool operator<(GSeg& d) const {
    return (start==d.start)?(end<d.end):(start<d.start);
  }
};
399    
400    
401    
402     //--------------------------------------------------------
403     // ************** simple line reading class for text files
404    
405     //GLineReader -- text line reading/buffering class
406     class GLineReader {
407 gpertea 16 bool closeFile;
408 gpertea 2 int len;
409     int allocated;
410     char* buf;
411     bool isEOF;
412     FILE* file;
413     off_t filepos; //current position
414     bool pushed; //pushed back
415     int lcount; //line counter (read lines)
416     public:
417     char* chars() { return buf; }
418     char* line() { return buf; }
419     int readcount() { return lcount; } //number of lines read
420 gpertea 16 void setFile(FILE* stream) { file=stream; }
421 gpertea 2 int length() { return len; }
422     int size() { return len; } //same as size();
423     bool isEof() {return isEOF; }
424     bool eof() { return isEOF; }
425     off_t getfpos() { return filepos; }
426     off_t getFpos() { return filepos; }
427     char* nextLine() { return getLine(); }
428     char* getLine() { if (pushed) { pushed=false; return buf; }
429     else return getLine(file); }
430     char* getLine(FILE* stream) {
431     if (pushed) { pushed=false; return buf; }
432     else return getLine(stream, filepos); }
433     char* getLine(FILE* stream, off_t& f_pos); //read a line from a stream and update
434     // the given file position
435     void pushBack() { if (lcount>0) pushed=true; } // "undo" the last getLine request
436     // so the next call will in fact return the same line
437 gpertea 16 GLineReader(const char* fname) {
438     FILE* f=fopen(fname, "rb");
439     if (f==NULL) GError("Error opening file '%s'!\n",fname);
440     closeFile=true;
441     init(f);
442     }
443 gpertea 2 GLineReader(FILE* stream=NULL, off_t fpos=0) {
444 gpertea 16 closeFile=false;
445     init(stream,fpos);
446     }
447     void init(FILE* stream, off_t fpos=0) {
448 gpertea 2 len=0;
449     isEOF=false;
450     allocated=1024;
451     GMALLOC(buf,allocated);
452     lcount=0;
453     buf[0]=0;
454     file=stream;
455     filepos=fpos;
456     pushed=false;
457     }
458     ~GLineReader() {
459     GFREE(buf);
460 gpertea 16 if (closeFile) fclose(file);
461 gpertea 2 }
462     };
463    
464    
465     /* extended fgets() - to read one full line from a file and
466     update the file position correctly !
467     buf will be reallocated as necessary, to fit the whole line
468     */
469     char* fgetline(char* & buf, int& buflen, FILE* stream, off_t* f_pos=NULL, int* linelen=NULL);
470    
471 gpertea 16
472     //print int/values nicely formatted in 3-digit groups
473     char* commaprint(uint64 n);
474    
475 gpertea 2 /*********************** File management functions *********************/
476    
477 gpertea 16 // removes the last part (file or directory name) of a full path
478     // WARNING: this is a destructive operation for the given string!
479 gpertea 2 void delFileName(char* filepath);
480    
481 gpertea 16 // returns a pointer to the last file or directory name in a full path
482     const char* getFileName(const char* filepath);
483     // returns a pointer to the file "extension" part in a filename
484     const char* getFileExt(const char* filepath);
485 gpertea 2
486 gpertea 16
487 gpertea 2 int fileExists(const char* fname);
488     //returns 0 if file entry doesn't exist
489     // 1 if it's a directory
490     // 2 if it's a regular file
491     // 3 otherwise (?)
492    
493 gpertea 16 int64 fileSize(const char* fpath);
494 gpertea 2
495 gpertea 16 //write a FASTA-formatted sequence record
496     void writeFasta(FILE *fw, const char* seqid, const char* descr,
497     const char* seq, int linelen=60, int seqlen=0);
498    
499 gpertea 2 //parses the next number found in a string at the current position
500     //until a non-digit (and not a '.', 'e','E','-','+') is encountered;
501     //updates the char* pointer to be after the last digit parsed
502     bool parseNumber(char* &p, double& v);
503     bool parseDouble(char* &p, double& v); //just an alias for parseNumber
504    
505     bool parseInt(char* &p, int& i);
506     bool parseUInt(char* &p, uint& i);
507     bool parseHex(char* &p, uint& i);
508    
509     #endif /* G_BASE_DEFINED */