ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/gclib/gclib/GBase.h
Revision: 173
Committed: Wed Feb 15 03:34:29 2012 UTC (7 years, 5 months ago) by gpertea
File size: 15440 byte(s)
Log Message:
wip fqtrim

Line File contents
1 #ifndef G_BASE_DEFINED
2 #define G_BASE_DEFINED
3 #ifndef _POSIX_SOURCE
4 //mostly for MinGW
5 #define _POSIX_SOURCE
6 #endif
7 #ifdef HAVE_CONFIG_H
8 #include "config.h"
9 #endif
10 #include <string.h>
11 #include <stdlib.h>
12 #include <stdio.h>
13 #include <math.h>
14 #include <limits.h>
15 #include <sys/types.h>
16 #include <sys/stat.h>
17 #include <stdint.h>
18
// ---- platform specific settings ----
// Defines CHPATHSEP (the path separator character) and makes the names
// fseeko/ftello usable on every platform.
#if defined(__WIN32__) || defined(WIN32) || defined(_WIN32) || defined(_WIN32_)
  // Windows: normalize on __WIN32__ no matter which spelling was predefined
  #if !defined(__WIN32__)
    #define __WIN32__
  #endif
  #include <windows.h>
  #include <io.h>
  #define CHPATHSEP '\\'
  // off_t is forced to a 64 bit integer on Windows
  #undef off_t
  #define off_t int64_t
  #if !defined(popen)
    #define popen _popen
  #endif
  // NOTE(review): these tests only succeed when _fseeki64/_ftelli64 are
  // macros; if the toolchain provides them only as functions, the plain
  // fseek/ftell fallbacks below are selected -- confirm this is intended.
  #if defined(_fseeki64)
    #define fseeko(stream, offset, origin) _fseeki64(stream, offset, origin)
  #else
    #define fseeko fseek
  #endif
  #if defined(_ftelli64)
    #define ftello(stream) _ftelli64(stream)
  #else
    #define ftello ftell
  #endif
#else
  // any other platform
  #define CHPATHSEP '/'
  #include <unistd.h>
#endif

// whenever fseeko/ftello are not macros at this point, they are mapped
// to plain fseek/ftell
#if !defined(fseeko)
  #define fseeko fseek
#endif
#if !defined(ftello)
  #define ftello ftell
#endif
60
// a DEBUG build always has assertions enabled
#if defined(DEBUG)
  #undef NDEBUG
#endif

// short names for the fixed-width integer types
typedef int16_t  int16;
typedef uint16_t uint16;
typedef int32_t  int32;
typedef uint32_t uint32;
typedef int64_t  int64;
typedef uint64_t uint64;

// byte-sized unsigned types
typedef unsigned char uchar;
typedef unsigned char byte;

// largest unsigned int / int values, under both spellings
// (each one is only defined here if not already provided)
#if !defined(MAXUINT)
  #define MAXUINT ((unsigned int)-1)
#endif
#if !defined(MAX_UINT)
  #define MAX_UINT ((unsigned int)-1)
#endif
#if !defined(MAXINT)
  #define MAXINT INT_MAX
#endif
#if !defined(MAX_INT)
  #define MAX_INT INT_MAX
#endif
91
92 /****************************************************************************/
93
// process exit codes, defined here only when not already available
#if !defined(EXIT_FAILURE)
  #define EXIT_FAILURE 1
#endif
#if !defined(EXIT_SUCCESS)
  #define EXIT_SUCCESS 0
#endif

/****************************************************************************/
// message used when a memory allocation fails
#define ERR_ALLOC "Error allocating memory.\n"
104
105 //-------------------
106
// Debug helpers
// GASSERT(exp): when NDEBUG is not defined, calls GAssert() with the text
//   of the expression, the file name and the line number if exp is false;
//   expands to a no-op when NDEBUG is defined.
#if defined(NDEBUG)
  #define GASSERT(exp) ((void)0)
#else
  #define GASSERT(exp) ((exp)?((void)0):(void)GAssert(#exp,__FILE__,__LINE__))
#endif

// GTRACE((fmt, ...)): forwards its parenthesized argument list to GMessage,
//   but only when TRACE is defined and NDEBUG is not; otherwise a no-op.
#if !defined(NDEBUG) && defined(TRACE)
  #define GTRACE(exp) (GMessage exp)
#else
  #define GTRACE(exp) ((void)0)
#endif

// GERROR((fmt, ...)): forwards its parenthesized argument list to GError
#define GERROR(exp) (GError exp)
121 /********************************** Macros ***********************************/
// NOTE: these are plain macros, so an argument may be evaluated more than
// once -- do not pass expressions with side effects.

// Absolute value
#define GABS(val)          (((val)>=0)?(val):-(val))

// Larger / smaller of two values
#define GMAX(a,b)          (((a)>(b))?(a):(b))
#define GMIN(a,b)          (((a)>(b))?(b):(a))

// Smallest of three values
#define GMIN3(x,y,z)       ((x)<(y)?GMIN(x,z):GMIN(y,z))

// Largest of three values
#define GMAX3(x,y,z)       ((x)>(y)?GMAX(x,z):GMAX(y,z))

// Stores the smaller of a, b in lo and the larger in hi
#define GMINMAX(lo,hi,a,b) ((a)<(b)?((lo)=(a),(hi)=(b)):((lo)=(b),(hi)=(a)))

// Limits x to the range [lo..hi]
#define GCLAMP(lo,x,hi)    ((x)<(lo)?(lo):((x)>(hi)?(hi):(x)))
140
// shorthand for unsigned int
typedef unsigned int uint;
// generic untyped pointer
typedef void* pointer;

// comparison callback taking two items
typedef int GCompareProc(const pointer item1, const pointer item2);
// item disposal callback: usually just delete, but may also support
// structures with embedded dynamic members
typedef void GFreeProc(pointer item);
147
// Allocation helpers built on GMalloc/GCalloc/GRealloc/GFree.
//   ptr  : the pointer variable to (re)allocate; its address is taken here
//   size : passed on unchanged to the underlying function
// When the underlying function returns false, GError(ERR_ALLOC) is called.
//
// Each macro expands to a single void expression rather than to a bare
// "if" statement: with the "if" form, code such as
//     if (cond) GMALLOC(p,n); else other();
// silently attached the "else" to the hidden "if" inside the macro.
#define GMALLOC(ptr,size)  ((void)(GMalloc((pointer*)(&(ptr)),(size)) || \
                                   (GError(ERR_ALLOC),false)))
#define GCALLOC(ptr,size)  ((void)(GCalloc((pointer*)(&(ptr)),(size)) || \
                                   (GError(ERR_ALLOC),false)))
#define GREALLOC(ptr,size) ((void)(GRealloc((pointer*)(&(ptr)),(size)) || \
                                   (GError(ERR_ALLOC),false)))
// Passes the address of ptr to GFree
#define GFREE(ptr)         GFree((pointer*)(&(ptr)))
155
// Returns whichever argument sorts first according to strcmp();
// arg2 is returned when the two strings compare equal.
inline char* strMin(char *arg1, char *arg2) {
  if (strcmp(arg1, arg2) < 0) return arg1;
  return arg2;
}
159
// Returns whichever argument sorts last according to strcmp();
// arg2 is returned when the two strings compare equal.
inline char* strMax(char *arg1, char *arg2) {
  if (strcmp(arg2, arg1) < 0) return arg1;
  return arg2;
}
163
// Rounds x to the nearest integer by taking floor(x + 0.5),
// so exact halves go up: 2.5 -> 3 and -2.5 -> -2.
inline int iround(double x) {
  double shifted = x + 0.5;
  return (int)floor(shifted);
}
167
168 /****************************************************************************/
169
// Three-way comparison of two ints, e.g. for sorting:
// returns a negative value if a<b, 0 if a==b and a positive value if a>b.
// The result is built from comparisons instead of a-b, because that
// subtraction overflows (undefined behavior, possibly yielding the wrong
// sign) when the operands are far apart, e.g. Gintcmp(INT_MIN, 1).
inline int Gintcmp(int a, int b) {
  return (a>b) - (a<b);
}
174
/// Same as strcmp, but doesn't crash on NULL pointers.
/// NOTE(review): n presumably limits the comparison to the first n
/// characters, with -1 meaning the whole string -- confirm in the
/// implementation.
int Gstrcmp(const char *a, const char *b, int n=-1);

/// NOTE(review): presumably the case-insensitive counterpart of Gstrcmp,
/// with the same meaning for n -- confirm in the implementation.
int Gstricmp(const char *a, const char *b, int n=-1);
179
// Basic swap: exchanges the values of lhs and rhs through a temporary copy.
// T must be copy-constructible and assignable.
template<typename T> void Gswap(T& lhs, T& rhs) {
  T saved = lhs;
  lhs = rhs;
  rhs = saved;
}
187
/// bitCount_32 - returns the number of bits set to 1 in a 32 bit value.
/// Ex. bitCount_32(0xF000F000) == 8
/// Returns 0 if the word is zero.
inline uint bitCount_32(uint32_t Value) {
#if __GNUC__ >= 4
  // GCC 4 and later: use the population count builtin
  return __builtin_popcount(Value);
#else
  // portable fallback: add up neighbouring bit fields in parallel
  // (2 bit sums, then 4 bit sums, then per-byte sums) and finally
  // collect the four byte sums in the top byte with a multiplication
  uint32_t pairs = Value - ((Value >> 1) & 0x55555555);
  uint32_t nibbles = (pairs & 0x33333333) + ((pairs >> 2) & 0x33333333);
  uint32_t bytes = (nibbles + (nibbles >> 4)) & 0xF0F0F0F;
  return (bytes * 0x1010101) >> 24;
#endif
}
200
/// bitCount_64 - returns the number of bits set to 1 in a 64 bit value.
/// Returns 0 if the word is zero.
inline uint bitCount_64(uint64_t Value) {
#if __GNUC__ >= 4
  // GCC 4 and later: use the population count builtin
  return __builtin_popcountll(Value);
#else
  // portable fallback: same parallel summing scheme as the 32 bit version,
  // with the eight byte sums collected in the top byte at the end
  uint64_t pairs = Value - ((Value >> 1) & 0x5555555555555555ULL);
  uint64_t nibbles = (pairs & 0x3333333333333333ULL) +
                     ((pairs >> 2) & 0x3333333333333333ULL);
  uint64_t bytes = (nibbles + (nibbles >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
  uint64_t total = (uint64_t)(bytes * 0x0101010101010101ULL) >> 56;
  return (uint)total;
#endif
}
213
/// bitCountTrailingZeros_32 - counts the zero bits from the least
/// significant bit up to the first bit that is set.
/// Ex. bitCountTrailingZeros_32(0xFF00FF00) == 8.
/// Returns 32 if the word is zero.
inline unsigned bitCountTrailingZeros_32(uint32_t Value) {
#if __GNUC__ >= 4
  // zero is handled here, the builtin is only called for non-zero values
  if (Value == 0) return 32;
  return __builtin_ctz(Value);
#else
  // (Value & -Value) keeps only the lowest set bit (0 when Value is 0);
  // the table is indexed by that value modulo 37
  static const unsigned Mod37BitPosition[] = {
    32, 0, 1, 26, 2, 23, 27, 0, 3, 16, 24, 30, 28, 11, 0, 13,
    4, 7, 17, 0, 25, 22, 31, 15, 29, 10, 12, 6, 0, 21, 14, 9,
    5, 20, 8, 19, 18
  };
  uint32_t lowestBit = Value & -Value;
  return Mod37BitPosition[lowestBit % 37];
#endif
}
230
/// bitCountTrailingZeros_64 - counts the zero bits from the least
/// significant bit up to the first bit that is set (64 bit edition.)
/// Returns 64 if the word is zero.
inline unsigned bitCountTrailingZeros_64(uint64_t Value) {
#if __GNUC__ >= 4
  // zero is handled here, the builtin is only called for non-zero values
  if (Value == 0) return 64;
  return __builtin_ctzll(Value);
#else
  // (Value & -Value) keeps only the lowest set bit (0 when Value is 0);
  // the table is indexed by that value modulo 67
  static const unsigned Mod67Position[] = {
    64, 0, 1, 39, 2, 15, 40, 23, 3, 12, 16, 59, 41, 19, 24, 54,
    4, 64, 13, 10, 17, 62, 60, 28, 42, 30, 20, 51, 25, 44, 55,
    47, 5, 32, 65, 38, 14, 22, 11, 58, 18, 53, 63, 9, 61, 27,
    29, 50, 43, 46, 31, 37, 21, 57, 52, 8, 26, 49, 45, 36, 56,
    7, 48, 35, 6, 34, 33, 0
  };
  uint64_t lowestBit = Value & -Value;
  return Mod67Position[lowestBit % 67];
#endif
}
249
250 /**************** Memory management ***************************/
251
252 bool GMalloc(pointer* ptr, unsigned long size); // Allocate memory
253 bool GCalloc(pointer* ptr, unsigned long size); // Allocate and initialize memory
254 bool GRealloc(pointer* ptr,unsigned long size); // Resize memory
255 void GFree(pointer* ptr); // Free memory, resets ptr to NULL
256
257
/// NOTE(review): judging by the name and signature this is a printf-style
/// formatter that stores an allocated result in *retp -- confirm in the
/// implementation.
int saprintf(char **retp, const char *fmt, ...);

/// Error routine (aborts program); takes a printf-style format
void GError(const char *format, ...);
/// Log message to stderr; takes a printf-style format
void GMessage(const char *format, ...);
/// Assert failed routine: usually not called directly but through GASSERT
void GAssert(const char *expression, const char *filename, unsigned int lineno);
264
// ****************** string manipulation *************************

/// Duplicates a string by allocating a copy for it and returning it
char* Gstrdup(const char *str);
/// Same as Gstrdup(str), but with an early termination (e.g. on delimiter)
char* Gstrdup(const char *sfrom, const char *sto);

/// Extracts a substring, allocating it, including boundaries (from/to)
char* Gsubstr(const char *str, char *from, char *to=NULL);

/// Splits str into at most maxfields entries stored in fields.
/// The delimiter is a string, a single character, or (third form)
/// tab or space.
int strsplit(char *str, char **fields, int maxfields, const char *delim);
int strsplit(char *str, char **fields, int maxfields, const char delim);
int strsplit(char *str, char **fields, int maxfields);

/// NOTE(review): presumably replaces the content of str with newvalue and
/// returns the updated pointer -- confirm in the implementation.
char* replaceStr(char* &str, char *newvalue);

// conversion to lower/upper case
/// ... creating a new string:
char* upCase(const char *str);
char* loCase(const char *str);
/// ... changing the string in place:
char* strlower(char *str);
char* strupper(char *str);

/// strstr but for memory zones: scans the memory region mem (len bytes)
/// for the byte sequence part (partlen bytes)
void* Gmemscan(void *mem, unsigned int len,
               void *part, unsigned int partlen);
292
/// Tests if a char is in a string
bool chrInStr(char c, const char *str);

/// Returns a pointer to the rightmost occurrence of ch in str
/// (like rindex, for platforms missing it)
char* rstrchr(char *str, char ch);

/// strchr but with a set of chars instead of only one
char* strchrs(const char *s, const char *chrs);

/// Like rindex() but for strings; right side version of strstr()
char* rstrfind(const char *str, const char *substr);

/// In place reversal of string
char* reverseChars(char *str, int slen=0);

/// The reversed, right side equivalent of strstr: starts searching from
/// the right end (rstart), going back to the left end (lend), and returns
/// a pointer to the last (right) matching character in str
char* rstrstr(const char *rstart, const char *lend, const char *substr);

/// The case insensitive version of strstr -- finding a string within
/// a string
char* strifind(const char *str, const char *substr);

/// Determines if a string begins with a given prefix
/// (returns false when any of the params is NULL,
///  but true when prefix is '' (empty string)!)
bool startsWith(const char *s, const char *prefix);

/// Note: returns true if suffix is empty string, but false if it's NULL
bool endsWith(const char *s, const char *suffix);

/// ELF hash function for strings
int strhash(const char *str);
328
329
330
331 //---- generic base GSeg : genomic segment (interval) --
332 // coordinates are considered 1-based (so 0 is invalid)
333 class GSeg {
334 public:
335 uint start; //start<end always!
336 uint end;
337 GSeg(uint s=0,uint e=0) {
338 if (s>e) { start=e;end=s; }
339 else { start=s;end=e; }
340 }
341 //check for overlap with other segment
342 uint len() { return end-start+1; }
343 bool overlap(GSeg* d) {
344 //return start<d->start ? (d->start<=end) : (start<=d->end);
345 return (start<=d->end && end>=d->start);
346 }
347
348 bool overlap(GSeg& d) {
349 //return start<d.start ? (d.start<=end) : (start<=d.end);
350 return (start<=d.end && end>=d.start);
351 }
352
353 bool overlap(GSeg& d, int fuzz) {
354 //return start<d.start ? (d.start<=end+fuzz) : (start<=d.end+fuzz);
355 return (start<=d.end+fuzz && end+fuzz>=d.start);
356 }
357
358 bool overlap(uint s, uint e) {
359 if (s>e) { Gswap(s,e); }
360 //return start<s ? (s<=end) : (start<=e);
361 return (start<=e && end>=s);
362 }
363
364 //return the length of overlap between two segments
365 int overlapLen(GSeg* r) {
366 if (start<r->start) {
367 if (r->start>end) return 0;
368 return (r->end>end) ? end-r->start+1 : r->end-r->start+1;
369 }
370 else { //r->start<=start
371 if (start>r->end) return 0;
372 return (r->end<end)? r->end-start+1 : end-start+1;
373 }
374 }
375 int overlapLen(uint rstart, uint rend) {
376 if (rstart>rend) { Gswap(rstart,rend); }
377 if (start<rstart) {
378 if (rstart>end) return 0;
379 return (rend>end) ? end-rstart+1 : rend-rstart+1;
380 }
381 else { //rstart<=start
382 if (start>rend) return 0;
383 return (rend<end)? rend-start+1 : end-start+1;
384 }
385 }
386
387 //fuzzy coordinate matching:
388 bool coordMatch(GSeg* s, uint fuzz=0) {
389 if (fuzz==0) return (start==s->start && end==s->end);
390 uint sd = (start>s->start) ? start-s->start : s->start-start;
391 uint ed = (end>s->end) ? end-s->end : s->end-end;
392 return (sd<=fuzz && ed<=fuzz);
393 }
394 //comparison operators required for sorting
395 bool operator==(GSeg& d){
396 return (start==d.start && end==d.end);
397 }
398 bool operator<(GSeg& d){
399 return (start==d.start)?(end<d.end):(start<d.start);
400 }
401 };
402
403
404
405 //--------------------------------------------------------
406 // ************** simple line reading class for text files
407
408 //GLineReader -- text line reading/buffering class
409 class GLineReader {
410 bool closeFile;
411 int len;
412 int allocated;
413 char* buf;
414 bool isEOF;
415 FILE* file;
416 off_t filepos; //current position
417 bool pushed; //pushed back
418 int lcount; //line counter (read lines)
419 public:
420 char* chars() { return buf; }
421 char* line() { return buf; }
422 int readcount() { return lcount; } //number of lines read
423 void setFile(FILE* stream) { file=stream; }
424 int length() { return len; }
425 int size() { return len; } //same as size();
426 bool isEof() {return isEOF; }
427 bool eof() { return isEOF; }
428 off_t getfpos() { return filepos; }
429 off_t getFpos() { return filepos; }
430 char* nextLine() { return getLine(); }
431 char* getLine() { if (pushed) { pushed=false; return buf; }
432 else return getLine(file); }
433 char* getLine(FILE* stream) {
434 if (pushed) { pushed=false; return buf; }
435 else return getLine(stream, filepos); }
436 char* getLine(FILE* stream, off_t& f_pos); //read a line from a stream and update
437 // the given file position
438 void pushBack() { if (lcount>0) pushed=true; } // "undo" the last getLine request
439 // so the next call will in fact return the same line
440 GLineReader(const char* fname) {
441 FILE* f=fopen(fname, "rb");
442 if (f==NULL) GError("Error opening file '%s'!\n",fname);
443 closeFile=true;
444 init(f);
445 }
446 GLineReader(FILE* stream=NULL, off_t fpos=0) {
447 closeFile=false;
448 init(stream,fpos);
449 }
450 void init(FILE* stream, off_t fpos=0) {
451 len=0;
452 isEOF=false;
453 allocated=1024;
454 GMALLOC(buf,allocated);
455 lcount=0;
456 buf[0]=0;
457 file=stream;
458 filepos=fpos;
459 pushed=false;
460 }
461 ~GLineReader() {
462 GFREE(buf);
463 if (closeFile) fclose(file);
464 }
465 };
466
467
468 /* extended fgets() - to read one full line from a file and
469 update the file position correctly !
470 buf will be reallocated as necessary, to fit the whole line
471 */
472 char* fgetline(char* & buf, int& buflen, FILE* stream, off_t* f_pos=NULL, int* linelen=NULL);
473
474
475 //print int/values nicely formatted in 3-digit groups
476 char* commaprint(uint64 n);
477
478 /*********************** File management functions *********************/
479
480 // removes the last part (file or directory name) of a full path
481 // WARNING: this is a destructive operation for the given string!
482 void delFileName(char* filepath);
483
484 // returns a pointer to the last file or directory name in a full path
485 const char* getFileName(const char* filepath);
486 // returns a pointer to the file "extension" part in a filename
487 const char* getFileExt(const char* filepath);
488
489
490 int fileExists(const char* fname);
491 //returns 0 if file entry doesn't exist
492 // 1 if it's a directory
493 // 2 if it's a regular file
494 // 3 otherwise (?)
495
496 int64 fileSize(const char* fpath);
497
498 //write a formatted fasta record, fasta formatted
499 void writeFasta(FILE *fw, const char* seqid, const char* descr,
500 const char* seq, int linelen=60, int seqlen=0);
501
502 //parses the next number found in a string at the current position
503 //until a non-digit (and not a '.', 'e','E','-','+') is encountered;
504 //updates the char* pointer to be after the last digit parsed
505 bool parseNumber(char* &p, double& v);
506 bool parseDouble(char* &p, double& v); //just an alias for parseNumber
507
508 bool parseInt(char* &p, int& i);
509 bool parseUInt(char* &p, uint& i);
510 bool parseHex(char* &p, uint& i);
511
512 #endif /* G_BASE_DEFINED */