ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/cd-hit/cd-hi-class.c++
Revision: 1.2
Committed: Mon Mar 8 10:26:09 2004 UTC (17 years, 7 months ago) by dmb
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +33 -33 lines
Log Message:
Maintainance release by lducazu.

Line File contents
1 // =============================================================================
2 // CD-HI
3 //
4 // Cluster Database at High Identity
5 //
6 // CD-HI clusters protein sequence database at high sequence identity threshold.
7 // This program can remove the high sequence redundance efficiently.
8 //
9 // program written by
10 // Weizhong Li
11 // UCSD, San Diego Supercomputer Center
12 // La Jolla, CA, 92093
13 // Email liwz@sdsc.edu
14 //
15 // at
16 // Adam Godzik's lab
17 // The Burnham Institute
18 // La Jolla, CA, 92037
19 // Email adam@burnham-inst.org
20 // =============================================================================
21
22 #include "cd-hi.h"
23 int DIAG_score[MAX_DIAG];
24
25 void bomb_error(char *message) {
26 cerr << "\nFatal Error\n";
27 cerr << message << endl;
28 cerr << "\nProgram halted !! \n\n";
29 exit (1);
30 } // END void bomb_error
31
32
33 //quick_sort calling (a, 0, no-1)
34 int quick_sort (int *a, int lo0, int hi0 ) {
35 int lo = lo0;
36 int hi = hi0;
37 int mid;
38 int tmp;
39
40 if ( hi0 > lo0) {
41 mid = a[ ( lo0 + hi0 ) / 2 ];
42
43 while( lo <= hi ) {
44 while( ( lo < hi0 ) && ( a[lo] < mid ) ) lo++;
45 while( ( hi > lo0 ) && ( a[hi] > mid ) ) hi--;
46 if( lo <= hi ) {
47 tmp=a[lo]; a[lo]=a[hi]; a[hi]=tmp;
48 lo++; hi--;
49 }
50 } // while
51
52 if( lo0 < hi ) quick_sort(a, lo0, hi );
53 if( lo < hi0 ) quick_sort(a, lo, hi0 );
54 } // if ( hi0 > lo0)
55 return 0;
56 } // quick_sort
57
58
59 //quick_sort_idx calling (a, idx, 0, no-1)
60 //sort a with another array idx
61 //so that idx rearranged
62 int quick_sort_idx (int *a, int *idx, int lo0, int hi0 ) {
63 int lo = lo0;
64 int hi = hi0;
65 int mid;
66 int tmp;
67
68 if ( hi0 > lo0) {
69 mid = a[ ( lo0 + hi0 ) / 2 ];
70
71 while( lo <= hi ) {
72 while( ( lo < hi0 ) && ( a[lo] < mid ) ) lo++;
73 while( ( hi > lo0 ) && ( a[hi] > mid ) ) hi--;
74 if( lo <= hi ) {
75 tmp=a[lo]; a[lo]=a[hi]; a[hi]=tmp;
76 tmp=idx[lo]; idx[lo]=idx[hi]; idx[hi]=tmp;
77 lo++; hi--;
78 }
79 } // while
80
81 if( lo0 < hi ) quick_sort_idx(a, idx, lo0, hi );
82 if( lo < hi0 ) quick_sort_idx(a, idx, lo, hi0 );
83 } // if ( hi0 > lo0)
84 return 0;
85 } // quick_sort_idx
86
87
88 //quick_sort_idx calling (a, idx, 0, no-1)
89 //sort a with another array idx
90 //so that idx rearranged
91 int quick_sort_idx2 (int *a, int *b, int *idx, int lo0, int hi0 ) {
92 int lo = lo0;
93 int hi = hi0;
94 int mid;
95 int tmp;
96
97 if ( hi0 > lo0) {
98 mid = a[ ( lo0 + hi0 ) / 2 ];
99
100 while( lo <= hi ) {
101 while( ( lo < hi0 ) && ( a[lo] < mid ) ) lo++;
102 while( ( hi > lo0 ) && ( a[hi] > mid ) ) hi--;
103 if( lo <= hi ) {
104 tmp=a[lo]; a[lo]=a[hi]; a[hi]=tmp;
105 tmp=b[lo]; b[lo]=b[hi]; b[hi]=tmp;
106 tmp=idx[lo]; idx[lo]=idx[hi]; idx[hi]=tmp;
107 lo++; hi--;
108 }
109 } // while
110
111 if( lo0 < hi ) quick_sort_idx2(a, b, idx, lo0, hi );
112 if( lo < hi0 ) quick_sort_idx2(a, b, idx, lo, hi0 );
113 } // if ( hi0 > lo0)
114 return 0;
115 } // quick_sort_idx2
116
117
118 //quick_sort_a_b_idx
119 //sort a list by a first priority
120 // and b second priority
121 //another idx go with them
122 int quick_sort_a_b_idx (int *a, int *b, int *idx, int lo0, int hi0 ) {
123
124 //first sort list by a
125 quick_sort_idx2(a, b, idx, lo0, hi0);
126
127 //then sort segments where elements in a is same
128 int i;
129 int bb = lo0;
130
131 for (i=bb+1; i<=hi0; i++) {
132 if ( a[i] == a[i-1] ) {
133 ;
134 }
135 else {
136 if ( i-1 > bb ) quick_sort_idx(b, idx, bb, i-1);
137 bb = i;
138 }
139 }
140
141 // last segment
142 if ( hi0 > bb ) quick_sort_idx(b, idx, bb, hi0);
143
144 return 0;
145 } // quick_sort_a_b_idx
146
147
148 void format_seq(char *seq) {
149 int i, j;
150 char c1;
151 int len = strlen(seq);
152
153 for (i=0,j=0; i<len; i++) {
154 c1 = toupper(seq[i]);
155 if ( isalpha(c1) ) seq[j++] = c1;
156 }
157 seq[j] = 0;
158 } // END void format_seq
159
160
161 ////For smiple len1 <= len2
162 ////walk along all diag path of two sequences,
163 ////find the diags with most aap
164 ////return top n diags
165 ////for diag 0 means direction (0,0) -> (1,1)
166 //// 1 means direction (0,1) -> (1,2)
167 //// -1 means direction (1,0) -> (2,1)
168 int diag_test_aapn(char iseq2[], int len1, int len2, int *taap,
169 INTs *aap_begin, INTs *aap_list, int &best_sum,
170 int band_width, int &band_left, int &band_right, int required_aa1) {
171 int i, i1, j, k;
172 int *pp;
173 int nall = len1+len2-1;
174 static int diag_score[MAX_DIAG];
175
176 for (pp=diag_score, i=nall; i; i--, pp++) *pp=0;
177
178 int c22;
179 INTs *bip;
180 int len11 = len1-1;
181 int len22 = len2-1;
182 i1 = len11;
183 for (i=0; i<len22; i++,i1++) {
184 // c22 = iseq2[i]*NAA1 + iseq2[i+1];
185 c22 = iseq2[i]*MAX_UAA + iseq2[i+1];
186 if ( (j=taap[c22]) == 0) continue;
187 bip = aap_list+ aap_begin[c22]; // bi = aap_begin[c22];
188 for (; j; j--, bip++) { // for (j=0; j<taap[c22]; j++,bi++) {
189 diag_score[i1 - *bip]++;
190 }
191 }
192
193 //find the best band range
194 // int band_b = required_aa1;
195 int band_b = required_aa1-1 >= 0 ? required_aa1-1:0; // on dec 21 2001
196 int band_e = nall - required_aa1;
197 int band_m = ( band_b+band_width-1 < band_e ) ? band_b+band_width-1 : band_e;
198 int best_score=0;
199 for (i=band_b; i<=band_m; i++) best_score += diag_score[i];
200 int from=band_b;
201 int end =band_m;
202 int score = best_score;
203 for (k=from, j=band_m+1; j<band_e; j++) {
204 score -= diag_score[k++];
205 score += diag_score[j];
206 if ( score > best_score ) {
207 from = k;
208 end = j;
209 best_score = score;
210 }
211 }
212 for (j=from; j<=end; j++) { // if aap pairs fail to open gap
213 if ( diag_score[j] < 5 ) { best_score -= diag_score[j]; from++;}
214 else break;
215 }
216 for (j=end; j>=from; j--) { // if aap pairs fail to open gap
217 if ( diag_score[j] < 5 ) { best_score -= diag_score[j]; end--;}
218 else break;
219 }
220
221 // delete [] diag_score;
222 band_left = from-len1+1;
223 band_right= end-len1+1;
224 best_sum = best_score;
225 return OK_FUNC;
226 }
227 // END diag_test_aapn
228
229
230 ////local alignment of two sequence within a diag band
231 ////for band 0 means direction (0,0) -> (1,1)
232 //// 1 means direction (0,1) -> (1,2)
233 //// -1 means direction (1,0) -> (2,1)
234 ////iseq len are integer sequence and its length,
235 ////mat is matrix, return ALN_PAIR class
236 int local_band_align(char iseq1[], char iseq2[], int len1, int len2,
237 AA_MATRIX &mat, int &best_score, int &iden_no,
238 int band_left, int band_right) {
239 int i, j, k, j1;
240 int jj, kk;
241 int best_score1, iden_no1;
242 int *gap_array;
243 iden_no = 0;
244
245 if ( (band_right >= len2 ) ||
246 (band_left <= -len1) ||
247 (band_left > band_right) ) return FAILED_FUNC;
248
249 // allocate mem for score_mat[len1][len2] etc
250 int band_width = band_right - band_left + 1;
251 int *(*score_mat), *(*iden_mat);
252 if ((score_mat = new int * [len1]) == NULL) bomb_error("Memory");
253 if ((iden_mat = new int * [len1]) == NULL) bomb_error("Memory");
254 for (i=0; i<len1; i++) {
255 if ((score_mat[i] = new int [band_width]) == NULL) bomb_error("Memory");
256 if ((iden_mat[i] = new int [band_width]) == NULL) bomb_error("Memory");
257 }
258 //here index j1 refer to band column
259 for (i=0; i<len1; i++) for (j1=0; j1<band_width; j1++) score_mat[i][j1] = 0;
260
261 gap_array = mat.gap_array;
262 best_score = 0;
263
264 if (band_left < 0) { //set score to left border of the matrix within band
265 int tband = (band_right < 0) ? band_right : 0;
266 for (k=band_left; k<tband; k++) {
267 i = -k;
268 j1 = k-band_left;
269 if ( ( score_mat[i][j1] = mat.matrix[iseq1[i]][iseq2[0]] ) > best_score)
270 best_score = score_mat[i][j1];
271 iden_mat[i][j1] = (iseq1[i] == iseq2[0]) ? 1 : 0;
272 }
273 }
274
275 if (band_right >=0) { //set score to top border of the matrix within band
276 int tband = (band_left > 0) ? band_left : 0;
277 for (i=0,j=tband; j<=band_right; j++) {
278 j1 = j-band_left;
279 if ( ( score_mat[i][j1] = mat.matrix[iseq1[i]][iseq2[j]] ) > best_score)
280 best_score = score_mat[i][j1];
281 iden_mat[i][j1] = (iseq1[i] == iseq2[j]) ? 1 : 0;
282 }
283 }
284
285 for (i=1; i<len1; i++) {
286 for (j1=0; j1<band_width; j1++) {
287 j = j1+i+band_left;
288 if ( j<1 ) continue;
289 if ( j>=len2) continue;
290
291 int sij = mat.matrix[iseq1[i]][iseq2[j]];
292 int iden_ij = (iseq1[i] == iseq2[j] ) ? 1 : 0;
293 int s1, *mat_row;
294 int k0;
295
296 // from (i-1,j-1)
297 if ( (best_score1 = score_mat[i-1][j1] )> 0 ) {
298 iden_no1 = iden_mat[i-1][j1];
299 }
300 else {
301 best_score1 = 0;
302 iden_no1 = 0;
303 }
304
305 // from last row
306 mat_row = score_mat[i-1];
307 k0 = (-band_left+1-i > 0) ? -band_left+1-i : 0;
308 for (k=j1-1, kk=0; k>=k0; k--, kk++) {
309 if ( (s1 = mat_row[k] + gap_array[kk] ) > best_score1 ){
310 best_score1 = s1;
311 iden_no1 = iden_mat[i-1][k];
312 }
313 }
314
315 k0 = (j-band_right-1 > 0) ? j-band_right-1 : 0;
316 for(k=i-2, jj=j1+1,kk=0; k>=k0; k--,kk++,jj++) {
317 if ( (s1 = score_mat[k][jj] + gap_array[kk] ) > best_score1 ){
318 best_score1 = s1;
319 iden_no1 = iden_mat[k][jj];
320 }
321 }
322
323 best_score1 += sij;
324 iden_no1 += iden_ij;
325 score_mat[i][j1] = best_score1;
326 iden_mat[i][j1] = iden_no1;
327
328 if ( best_score1 > best_score ) {
329 best_score = best_score1;
330 iden_no = iden_no1;
331 }
332 } // END for (j=1; j<len2; j++)
333 } // END for (i=1; i<len1; i++)
334
335 for (i=0; i<len1; i++) {
336 delete [] score_mat[i];
337 delete [] iden_mat[i];
338 }
339 delete [] score_mat;
340 delete [] iden_mat;
341
342 return OK_FUNC;
343 } // END int local_band_align
344
345
346
347 // class function definition
348 char aa[] = {"ARNDCQEGHILKMFPSTWYVBZX"};
349 int aa_idx[MAX_AA] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,2,6,20};
350 int aa2idx[] = {0, 2, 4, 3, 6, 13,7, 8, 9,20,11,10,12, 2,20,14, 5, 1,15,16,20,19,17,20,18, 6};
351 // idx for A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
352 // so aa2idx[ X - 'A'] => idx_of_X, eg aa2idx['A' - 'A'] => 0, and aa2idx['M'-'A'] => 12
353
354 int BLOSUM62[] = {
355 4, // A
356 -1, 5, // R
357 -2, 0, 6, // N
358 -2,-2, 1, 6, // D
359 0,-3,-3,-3, 9, // C
360 -1, 1, 0, 0,-3, 5, // Q
361 -1, 0, 0, 2,-4, 2, 5, // E
362 0,-2, 0,-1,-3,-2,-2, 6, // G
363 -2, 0, 1,-1,-3, 0, 0,-2, 8, // H
364 -1,-3,-3,-3,-1,-3,-3,-4,-3, 4, // I
365 -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4, // L
366 -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5, // K
367 -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5, // M
368 -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6, // F
369 -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7, // P
370 1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4, // S
371 0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5, // T
372 -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11, // W
373 -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7, // Y
374 0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4, // V
375 -2,-1, 3, 4,-3, 0, 1,-1, 0,-3,-4, 0,-3,-3,-2, 0,-1,-4,-3,-3, 4, // B
376 -1, 0, 0, 1,-3, 3, 4,-2, 0,-3,-3, 1,-1,-3,-1, 0,-1,-3,-2,-2, 1, 4, // Z
377 0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-1,-1,-1 // X
378 //A R N D C Q E G H I L K M F P S T W Y V B Z X
379 //0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 2 6 20
380 };
381
382
383 int outiseq(char iseq[], int len) {
384 int i;
385 char seq[MAX_SEQ];
386 for (i=0; i<len; i++) seq[i] = aa[iseq[i]];
387 seq[len]=0;
388 cout << ">>" << seq << endl;
389 return 0;
390 }
391
392 int setiseq(char *seq, int len) {
393 for (int i=0; i<len; i++) {
394 seq[i] = aa2idx[seq[i] - 'A'];
395 }
396 return 0;
397 } // END void SEQ::seq2iseq()
398
399 /////////////////
400 AA_MATRIX::AA_MATRIX() {
401 int i, j, k;
402 gap = -11;
403 ext_gap = -1;
404 if ( (gap_array = new int[MAX_GAP]) == NULL ) bomb_error("memory");
405 for (i=0; i<MAX_GAP; i++) gap_array[i] = gap + i * ext_gap;
406 k = 0;
407 for ( i=0; i<MAX_AA; i++)
408 for ( j=0; j<=i; j++)
409 matrix[j][i] = matrix[i][j] = BLOSUM62[ k++ ];
410 } // END AA_MATRIX::AA_MATRIX()
411
412 void AA_MATRIX::init() {
413 int i, j, k;
414 gap = -11;
415 ext_gap = -1;
416 for (i=0; i<MAX_GAP; i++) gap_array[i] = gap + i * ext_gap;
417 k = 0;
418 for ( i=0; i<MAX_AA; i++)
419 for ( j=0; j<=i; j++)
420 matrix[j][i] = matrix[i][j] = BLOSUM62[ k++ ];
421 } // END void AA_MATRIX::init()
422
423 void AA_MATRIX::set_gap(int gap1, int ext_gap1) {
424 int i;
425 gap = gap1; ext_gap = ext_gap1;
426 for (i=0; i<MAX_GAP; i++) gap_array[i] = gap + i * ext_gap;
427 } // END void AA_MATRIX::set_gap
428
429 void AA_MATRIX::set_matrix(int *mat1) {
430 int i, j, k;
431 k = 0;
432 for ( i=0; i<MAX_AA; i++)
433 for ( j=0; j<=i; j++)
434 matrix[j][i] = matrix[i][j] = mat1[ k++ ];
435 } // END void AA_MATRIX::set_matrix
436
437
438
439 IDX_TBL::IDX_TBL(){
440 NAA = 0;
441 NAAN = 0;
442 mem_size = 1;
443 buffer_size = 100000;
444 } // END IDX_TBL::IDX_TBL
445
446
447 void IDX_TBL::init(int naa, int naan){
448 int i;
449 NAA = naa;
450 NAAN = naan;
451 buffer_size = 100000;
452
453 if ( NAA == 2 ) { mem_size = 25000; }
454 else if ( NAA == 3 ) { mem_size = 1200; }
455 else if ( NAA == 4 ) { mem_size = 60; }
456 else if ( NAA == 5 ) { mem_size = 3; }
457 else bomb_error("Something wrong!");
458
459 if ((size = new int[NAAN]) == NULL) bomb_error("Memory");
460 if ((capacity = new int[NAAN]) == NULL) bomb_error("Memory");
461 if ((seq_idx = new int*[NAAN]) == NULL) bomb_error("Memory");
462 if ((word_no = new INTs*[NAAN]) == NULL) bomb_error("Memory");
463 if ((buffer = new int[buffer_size]) == NULL) bomb_error("Memory");
464
465 for (i=0; i<NAAN; i++) {
466 size[i] = 0;
467 capacity[i] = 0;
468 }
469
470 } // END IDX_TBL::init
471
472
473 void IDX_TBL::clean() {
474 int i;
475 for (i=0; i<NAAN; i++) size[i]=0;
476 } // END IDX_TBL::clean
477
478
479 int IDX_TBL::read_tbl(char *filename) {
480 int i;
481
482 ifstream fswap(filename);
483 if (! fswap) bomb_error("Can not open file");
484
485 for (i=0; i<NAAN; i++) {
486 if ( size[i] > 0 ) {
487 delete [] seq_idx[i];
488 delete [] word_no[i];
489 }
490
491 fswap.read((char*) &size[i], sizeof(int));
492 capacity[i] = size[i];
493 if (size[i] == 0 ) continue;
494 if ((seq_idx[i] = new int[size[i]]) == NULL) bomb_error("Memory");
495 if ((word_no[i] = new INTs[size[i]]) == NULL) bomb_error("Memory");
496
497 fswap.read((char*) seq_idx[i], sizeof(int) * size[i]);
498 fswap.read((char*) word_no[i], sizeof(INTs) * size[i]);
499 }
500
501 fswap.close();
502 return OK_FUNC;
503
504 } // END int IDX_TBL::read_tbl
505
506
507 int IDX_TBL::write_tbl(char *filename) {
508 int i;
509 ofstream fswap(filename);
510
511 if (! fswap) bomb_error("Can not open file");
512
513 for (i=0; i<NAAN; i++) {
514 fswap.write((char*) &size[i], sizeof(int));
515 if (size[i] == 0 ) continue;
516 fswap.write((char*) seq_idx[i], sizeof(int) * size[i]);
517 fswap.write((char*) word_no[i], sizeof(INTs) * size[i]);
518 }
519 fswap.close();
520 return OK_FUNC;
521
522 } // END int IDX_TBL::write_tbl
523
524
525 int IDX_TBL::add_word_list(int aan_no, int *aan_list,
526 INTs *aan_list_no, int idx) {
527 int j, k, j1, j0;
528
529 for (j0=0; j0<aan_no; j0++) {
530 if ( (j1=aan_list_no[j0]) ) {
531 j = aan_list[j0];
532
533 if ( size[j] == capacity[j] ) { // resize array
534 if ( capacity[j] > buffer_size ) {
535 delete [] buffer;
536 buffer_size = capacity[j];
537 if ((buffer = new int[buffer_size]) == NULL) bomb_error("Memory");
538 }
539
540 for (k=0; k<size[j]; k++) buffer[k] = seq_idx[j][k];
541 if ( capacity[j] >0 ) delete [] seq_idx[j];
542 if ((seq_idx[j] = new int[mem_size+capacity[j]]) == NULL)
543 bomb_error("Memory");
544 for (k=0; k<size[j]; k++) seq_idx[j][k] = buffer[k];
545
546 for (k=0; k<size[j]; k++) buffer[k] = word_no[j][k];
547 if ( capacity[j] >0 ) delete [] word_no[j];
548 if ((word_no[j] = new INTs[mem_size+capacity[j]]) == NULL)
549 bomb_error("Memory");
550 for (k=0; k<size[j]; k++) word_no[j][k] = buffer[k];
551
552 capacity[j] += mem_size;
553 }
554 seq_idx[j][size[j]] = idx;
555 word_no[j][size[j]] = j1;
556 size[j]++;
557 }
558 } // for (j0=0; j0<aan_no; j0++) {
559
560 return OK_FUNC;
561 } // END int IDX_TBL::add_word_list
562
563
564 int IDX_TBL::count_word_no(int aan_no, int *aan_list,
565 INTs *aan_list_no, INTs *look_and_count) {
566 int j, k, j0, j1, k1;
567 int *ptr1;
568 INTs *ptr2;
569
570 for (j0=0; j0<aan_no; j0++) {
571 if ( (j1=aan_list_no[j0]) ) {
572 j = aan_list[j0];
573 k1 = size[j];
574 ptr1 = seq_idx[j];
575 ptr2 = word_no[j];
576 for (k=0; k<k1; k++)
577 look_and_count[ptr1[k]] += ( j1 < ptr2[k]) ? j1 : ptr2[k] ;
578 }
579 }
580
581 return OK_FUNC;
582 } // END int IDX_TBL::count_word_no
583
584
585
586
587
588 int print_usage (char *arg) {
589 cout << "Usage "<< arg << " [Options] " << endl;
590 cout << "Options " << endl;
591 cout << " -i in_dbname, required" << endl;
592 cout << " -o out_dbname, required" << endl;
593 cout << " -c threshold, default 0.9" << endl;
594 cout << " the sequence identity is calculated as :" << endl;
595 cout << " Number of identical amino acids in alignment divided by" << endl;
596 cout << " full length of short sequence" << endl;
597 cout << " -b band_width, default 20" << endl;
598 cout << " -M max_memory, default 400 (Mbyte) " << endl;
599 cout << " -n word_length, default 5" << endl;
600 cout << " -l length_of_throw_away_sequences, default 10" << endl;
601 cout << " -L coverage cutoff, default 0.0" << endl;
602 cout << " the coverage is for shorter sequence, calculated as :" << endl;
603 cout << " length of aligned part divided by full length" << endl;
604 cout << " -d length of description line in the .clstr file, default 20" << endl;
605 cout << " -t tolerance for redundance, default 2" << endl;
606 cout << " -u filename of an old dbname.clstr, for incremental update" << endl;
607 cout << " if an old NR clustered at 90% yields NR90 and NR90.clstr," << endl;
608 cout << " to cluster a larger NR at 90%, use -u NR90.clstr" << endl;
609 cout << " -h, print this help" << endl;
610 exit(1);
611 } // END print_usage
612
613
614 int db_seq_no_test(ifstream &in1) {
615 char c0, c1;
616 int no = 0;
617
618 c0 = '\n';
619 while(1) {
620 if ( in1.eof()) break;
621 in1.read(&c1, 1);
622 if ( c1 == '>' && c0 == '\n') no++;
623 c0 = c1;
624 }
625 in1.close();
626 return no;
627 }
628
629
630 int old_clstr_seq_no_test(ifstream &in1) {
631 char c0, c1;
632 int no = 0;
633
634 c0 = '\n';
635 while(1) {
636 if ( in1.eof()) break;
637 in1.read(&c1, 1);
638 if ( c1 != '>' && c0 == '\n') no++;
639 c0 = c1;
640 }
641 in1.close();
642 return no;
643 }
644
645
646 int db_read_in (ifstream &in1, int length_of_throw,
647 int & NR_no, char *NR_seq[], int *NR_len) {
648
649 char raw_seq[MAX_SEQ], raw_des[MAX_DES];
650 char buffer1[MAX_LINE_SIZE];
651 raw_seq[0] = raw_des[0] = buffer1[0] = 0;
652 int read_in = 0;
653
654 NR_no = 0;
655 while(1) {
656 if ( in1.eof()) break;
657 in1.getline(buffer1, MAX_LINE_SIZE-2, '\n');
658
659 if ( buffer1[0] == '>') {
660 if ( read_in ) { // write previous record
661 if ( strlen(raw_seq) >= MAX_SEQ-1)
662 bomb_error("Long sequence found, enlarge Macro MAX_SEQ");
663 format_seq(raw_seq);
664
665 if ( strlen(raw_seq) > (unsigned)length_of_throw ) {
666 // if (strlen(raw_seq) > 32766) raw_seq[32766]=0; // temp
667 if ( (NR_seq[NR_no] = new char[strlen(raw_seq)+2] ) == NULL )
668 bomb_error("memory");
669 strcpy( NR_seq[NR_no], raw_seq);
670 NR_len[NR_no] = strlen(raw_seq);
671 NR_no++;
672 }
673 }
674 strncpy(raw_des, buffer1, MAX_DES-2);
675 raw_seq[0] = 0;
676 }
677 else {
678 read_in = 1;
679 strcat(raw_seq, buffer1);
680 }
681 } // END while(1);
682
683 if (1) { // the last record
684 if ( strlen(raw_seq) >= MAX_SEQ-1)
685 bomb_error("Long sequence found, enlarge Macro MAX_SEQ");
686 format_seq(raw_seq);
687
688 if ( strlen(raw_seq) > (unsigned)length_of_throw ) {
689 if ( (NR_seq[NR_no] = new char[strlen(raw_seq)+2] ) == NULL )
690 bomb_error("memory");
691 strcpy( NR_seq[NR_no], raw_seq);
692 NR_len[NR_no] = strlen(raw_seq);
693 NR_no++;
694 }
695 }
696 in1.close();
697 return 0;
698 } // END db_read_in
699
700 int db_read_in2(ifstream &in1, int length_of_throw,
701 int & NR_no, char *NR_seq[], int *NR_len,
702 int NRo_no, int * NRo_idx,
703 int * NRo_id1, int * NRo_id2, int * NRo_NR_idx) {
704
705 int i;
706 char raw_seq[MAX_SEQ], raw_des[MAX_DES];
707 char buffer1[MAX_LINE_SIZE];
708 raw_seq[0] = raw_des[0] = buffer1[0] = 0;
709 int read_in = 0;
710 int id1, id2;
711
712 NR_no = 0;
713 while(1) {
714 if ( in1.eof()) break;
715 in1.getline(buffer1, MAX_LINE_SIZE-2, '\n');
716
717 if ( buffer1[0] == '>' || buffer1[0] == ';') {
718 if ( read_in ) { // write last record
719 if ( strlen(raw_seq) >= MAX_SEQ-1)
720 bomb_error("Long sequence found, enlarge Macro MAX_SEQ");
721 format_seq(raw_seq);
722
723 if ( strlen(raw_seq) > (unsigned)length_of_throw ) {
724 if ( (NR_seq[NR_no] = new char[strlen(raw_seq)+2] ) == NULL )
725 bomb_error("memory");
726 strcpy( NR_seq[NR_no], raw_seq);
727 NR_len[NR_no] = strlen(raw_seq);
728 des_to_idx(id1, id2, raw_des);
729 i = get_index_of_2_sorted_list(NRo_id1, NRo_id2, 0, NRo_no-1, id1, id2);
730 if ( i != -1 ) NRo_NR_idx[ NRo_idx[i] ] = NR_no;
731 // else { cout << id1 << " " << id2 << raw_des << endl; }
732 NR_no++;
733 }
734 }
735 strncpy(raw_des, buffer1, MAX_DES-2);
736 raw_seq[0] = 0;
737 }
738 else {
739 read_in = 1;
740 strcat(raw_seq, buffer1);
741 }
742 } // END while(1);
743
744 if (1) { // the last record
745 if ( strlen(raw_seq) >= MAX_SEQ-1)
746 bomb_error("Long sequence found, enlarge Macro MAX_SEQ");
747 format_seq(raw_seq);
748
749 if ( strlen(raw_seq) > (unsigned)length_of_throw ) {
750 if ( (NR_seq[NR_no] = new char[strlen(raw_seq)+2] ) == NULL )
751 bomb_error("memory");
752 strcpy( NR_seq[NR_no], raw_seq);
753 NR_len[NR_no] = strlen(raw_seq);
754 des_to_idx(id1, id2, raw_des);
755 i = get_index_of_2_sorted_list(NRo_id1, NRo_id2, 0, NRo_no-1, id1, id2);
756 if ( i != -1 ) NRo_NR_idx[ NRo_idx[i] ] = NR_no;
757 NR_no++;
758 }
759 }
760 in1.close();
761 return 0;
762 } // END db_read_in2
763
764
765 int sort_seqs_divide_segs (int NR_no, int *NR_len, int *NR_idx, char *NR_seq[],
766 int mem_limit, int NAAN,
767 int &SEG_no, int *SEG_b, int *SEG_e,
768 char db_swap[MAX_SEG][MAX_FILE_NAME]) {
769 int i, j, k, i1;
770
771 // ************************************* change all the NR_seq to iseq
772 int len, total_letter=0;
773 int max_len = 0, min_len = 99999;
774 for (i=0; i<NR_no; i++) {
775 len = NR_len[i];
776 total_letter += len;
777 if (len > max_len) max_len = len;
778 if (len < min_len) min_len = len;
779 setiseq(NR_seq[i], len);
780 }
781
782 cout << "longest and shortest : " << max_len << " and " << min_len << endl;
783 cout << "Total letters: " << total_letter << endl;
784 // END change all the NR_seq to iseq
785
786 // **************************** Form NR_idx[], Sort them from Long to short
787 int *size_no;
788 int *size_begin;
789 if ((size_no = new int[max_len-min_len+1]) == NULL ) bomb_error("Memory");
790 if ((size_begin = new int[max_len-min_len+1]) == NULL ) bomb_error("Memory");
791
792 for (i=max_len; i>=min_len; i--) {
793 size_no[max_len - i] = 0;
794 size_begin[max_len - i] = 0;
795 }
796 for (i=0; i<NR_no; i++) size_no[max_len - NR_len[i]]++;
797 for (i=max_len; i>=min_len; i--)
798 for (j=max_len; j>i; j--)
799 size_begin[max_len-i] += size_no[max_len-j];
800 for (i=max_len; i>=min_len; i--) size_no[max_len - i] = 0;
801 for (i=0; i<NR_no; i++) {
802 j = max_len-NR_len[i];
803 NR_idx[ size_begin[j] + size_no[j]] = i;
804 size_no[j]++;
805 }
806 delete []size_no; delete []size_begin;
807 cout << "Sequences have been sorted" << endl;
808 // END sort them from long to short
809
810 mem_limit -= total_letter + 24*NR_no + 16 * NAAN;
811 if ( mem_limit <= 1000000 ) bomb_error("No memory, change -M option");
812 mem_limit /= sizeof (int) + sizeof (INTs);
813 SEG_no=0; j=0; k=0;
814 for (i1=0; i1<NR_no; i1++) {
815 i = NR_idx[i1];
816 len = NR_len[i];
817 j += len;
818 if ( j>mem_limit ) {
819 SEG_b[SEG_no] = k;
820 SEG_e[SEG_no] = i1;
821 sprintf(db_swap[SEG_no], "SWAP.%d",SEG_no);
822 j=0; k=i1+1;
823 SEG_no++;
824 if ( SEG_no >= MAX_SEG )
825 bomb_error("Too many segments, enlarge Macro MAX_SEG or change -M option");
826 }
827 }
828 if ( SEG_no == 0 ) {
829 SEG_b[SEG_no] = 0;
830 SEG_e[SEG_no] = NR_no-1;
831 sprintf(db_swap[SEG_no], "SWAP.%d",SEG_no);
832 SEG_no++;
833 }
834 else if ( SEG_e[SEG_no-1] != NR_no-1 ) { // last Segment
835 SEG_b[SEG_no] = k;
836 SEG_e[SEG_no] = NR_no-1;
837 sprintf(db_swap[SEG_no], "SWAP.%d",SEG_no);
838 SEG_no++;
839 }
840 if (SEG_no > 1) cout << "Sequences divided into " << SEG_no << " parts\n";
841
842 return 0;
843 }// END sort_seqs_divide_segs
844
845
846 int db_read_and_write (ifstream &in1, ofstream &out1,
847 int length_of_throw, int des_len,
848 char *NR_seq[], int *NR_clstr_no) {
849
850 char raw_seq[MAX_SEQ], raw_des[MAX_DES], raw_seq1[MAX_SEQ];
851 char buffer1[MAX_LINE_SIZE];
852 raw_seq[0] = raw_des[0] = buffer1[0] = 0;
853 int read_in = 0;
854 int NR_no1 = 0;
855
856 while(1) {
857 if ( in1.eof()) break;
858 in1.getline(buffer1, MAX_LINE_SIZE-2, '\n');
859 if ( buffer1[0] == '>' || buffer1[0] == ';') {
860 if ( read_in ) { // write last record
861 strcpy(raw_seq1, raw_seq);
862 format_seq(raw_seq1);
863
864 if ( strlen(raw_seq1) > (unsigned)length_of_throw ) {
865 if (NR_clstr_no[NR_no1] >= 0 ) out1 << raw_des << "\n" << raw_seq;
866 if ((NR_seq[NR_no1] = new char[des_len] ) == NULL )
867 bomb_error("memory");
868 strncpy(NR_seq[NR_no1], raw_des, des_len-2);
869 NR_seq[NR_no1][des_len-2]=0;
870 NR_no1++;
871 }
872 }
873 strncpy(raw_des, buffer1, MAX_DES-2);
874 raw_seq[0] = 0;
875 }
876 else {
877 read_in = 1;
878 strcat(raw_seq, buffer1); strcat(raw_seq,"\n");
879 }
880 } // END while(1);
881
882 if (1) { // the last record
883 strcpy(raw_seq1, raw_seq);
884 format_seq(raw_seq1);
885
886 if ( strlen(raw_seq1) > (unsigned)length_of_throw ) {
887 if (NR_clstr_no[NR_no1] >= 0 ) out1 << raw_des << "\n" << raw_seq;
888 if ((NR_seq[NR_no1] = new char[des_len] ) == NULL )
889 bomb_error("memory");
890 strncpy(NR_seq[NR_no1], raw_des, des_len-2);
891 NR_seq[NR_no1][des_len-2]=0;
892 NR_no1++;
893 }
894 }
895
896 return 0;
897 } // END db_read_and_write
898
899
900 int old_clstr_read_in(ifstream &in_clstr, int &NRo_no, int &NRo90_no,
901 int *NRo_idx, int *NRo_id1, int *NRo_id2,
902 char *NRo_iden, int *NRo_clstr_no, int *NRo_NR_idx) {
903
904 int i, j, k, i1;
905 char buffer1[MAX_LINE_SIZE];
906 int is_rep, len;
907 char iden, iden1[4];
908
909 NRo_no = 0;
910 NRo90_no = 0;
911
912 while(1) {
913 if (in_clstr.eof()) break;
914 in_clstr.getline(buffer1, MAX_LINE_SIZE-2, '\n');
915 if ( buffer1[0] == '>') {
916 //read in line like >Cluster 10
917 NRo90_no ++;
918 }
919 else {
920 //read in line like
921 //1 225aa, >gi|4099051|gb|AAD... at 80%
922 //2 9448aa, >gi|8249467|emb|CA... *
923 len = strlen(buffer1); if (len<12) continue;
924 iden = 0;
925 for (i=len-1; i>=0; i--) {
926 if (buffer1[i] == '*') {
927 is_rep = 1; break;
928 }
929 else if (buffer1[i] == '%') {
930 is_rep = 0;
931 for (j=i-1; isdigit(buffer1[j]); j--) ;
932 strncpy(iden1, buffer1+j, i-j); iden1[i-j]=0; iden = atoi(iden1);
933 break;
934 }
935 }
936
937 for (i=0; i<len; i++)
938 if (buffer1[i] == '>' ) break;
939
940 NRo_idx[NRo_no] = NRo_no;
941 des_to_idx(NRo_id1[NRo_no], NRo_id2[NRo_no], buffer1+i);
942 NRo_clstr_no[NRo_no] = is_rep ? NRo90_no -1 : -1 - (NRo90_no-1);
943 NRo_NR_idx[NRo_no] = -1;
944 NRo_iden[NRo_no] = iden;
945 NRo_no ++;
946 }
947 }
948
949 for (i=0; i<NRo_no; i++) {
950 if ( (i1=NRo_clstr_no[i]) < 0) continue;
951
952 NRo_clstr_no[i] = i;
953 for (j=i-1; j>=0; j--) {
954 if ( i1 == -1-NRo_clstr_no[j] ) NRo_clstr_no[j] = i;
955 else break;
956 }
957
958 k=i;
959 for (j=i+1; j<NRo_no; j++) {
960 if ( i1 == -1-NRo_clstr_no[j] ) {NRo_clstr_no[j] = k; i++;}
961 else break;
962 }
963 }
964
965 quick_sort_a_b_idx(NRo_id1, NRo_id2, NRo_idx, 0, NRo_no-1);
966 // now NRo_id1 and NRo_id2 are sorted,
967 // and NRo_idx restore the sorted index of NRo_clstr_no, NRo_NR_idx
968
969 return 0;
970 } // END old_clstr_read_in
971
972
973 // this is the index the identifier of each sequence in FASTA format
974 // I use the first 12 alphabet or digit letters to identify each sequence
975 // the first 6-letter convert to a unique integer,
976 // and 2nd letter convert to another integer, all alphabet in lower case
977 // if the string is shorter than 12, others are filled with ''
978 // idx is
979 // 0 1 2 3 4 5 6 7 8 9 a b c d e f g h i j k l m n o p q r s t u v w x y z ''
980 // 0 1 2 3
981 // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6
982 // so total 37 for each bit
983 // 37 1369 50653 1874161 69343957
984 int des_to_idx(int &id1, int &id2, char *str1) {
985 int i, j;
986 char idstr[13];
987 int len = strlen(str1);
988 char c1;
989
990 for (i=0; i<12; i++) idstr[i] = 36;
991
992 j = 0;
993 for (i=0; i<len; i++) {
994 c1 = str1[i];
995 if ( isalpha(c1) ) {
996 idstr[j] = tolower(c1) - 'a' + 10;
997 j++;
998 }
999 else if ( isdigit(c1) ) {
1000 idstr[j] = tolower(c1) - '0';
1001 j++;
1002 }
1003 if (j == 12) break;
1004 }
1005
1006 id1 = idstr[0]*69343957 + idstr[1]*1874161 + idstr[2]*50653 +
1007 idstr[3]*1369 + idstr[4]*37 + idstr[5];
1008 id2 = idstr[6]*69343957 + idstr[7]*1874161 + idstr[8]*50653 +
1009 idstr[9]*1369 + idstr[10]*37 + idstr[11];
1010 return 0;
1011 } // END des_to_idx
1012
1013
1014
1015 // get index of a element of a sorted list using 2-div method
1016 // calling get_index_of_sorted_list (list, begin_no, end_no, element)
1017 // list is a sorted list in order of increasing
1018 int get_index_of_sorted_list (int *list, int b, int e, int element) {
1019
1020 int mid = (b+e) / 2;
1021 int mid_v = list[mid];
1022
1023 while( e > b+1 ) {
1024 mid = (b+e) / 2;
1025 mid_v = list[mid];
1026
1027 if (element > mid_v) { b = mid; }
1028 else if (element < mid_v) { e = mid; }
1029 else { break; }
1030 }
1031
1032 if (element == mid_v ) { return mid; }
1033 else if (element == list[e] ) { return e; }
1034 else if (element == list[b] ) { return b; }
1035 else { return -1; }
1036 } // END get_index_of_sorted_list
1037
1038
1039 // get index of a element of a sorted list using 2-div method
1040 // calling get_index_of_2_sorted_list (list, list2, begin_no, end_no, element)
1041 // list is a sorted list in order of increasing
1042 // if index of list is same, check list2
1043 int get_index_of_2_sorted_list (int *list, int *list2, int b, int e,
1044 int element, int element2) {
1045 int i = get_index_of_sorted_list(list, b, e, element);
1046 if ( i == -1 ) { return -1; }
1047
1048 int bb = i;
1049 int ee = i;
1050
1051 while (1) {
1052 if (bb == b) break;
1053 if (list[bb] == list[bb-1] ) {bb--;}
1054 else {break;}
1055 }
1056
1057 while(1) {
1058 if (ee == e) break;
1059 if (list[ee] == list[ee+1] ) {ee++;}
1060 else {break;}
1061 }
1062
1063 i = get_index_of_sorted_list(list2, bb, ee, element2);
1064 return i;
1065 } // END get_index_of_2_sorted_list
1066
1067
1068 /////////////////////////// END ALL ////////////////////////