ViewVC Help
View File | Revision Log | Show Annotations | Root Listing
root/cd-hit/cd-hit.c++
Revision: 1.2
Committed: Mon Mar 8 10:26:09 2004 UTC (17 years, 7 months ago) by dmb
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +9 -6 lines
Log Message:
Maintainance release by lducazu.

Line File contents
1 // =============================================================================
2 // CD-HIT
3 // http://bioinformatics.burnham-inst.org/cd-hi
4 //
5 // program written by
6 // Weizhong Li
7 // UCSD, San Diego Supercomputer Center
8 // La Jolla, CA, 92093
9 // Email liwz@sdsc.edu
10 // at
11 // Adam Godzik's lab
12 // The Burnham Institute
13 // La Jolla, CA, 92037
14 // Email adam@burnham-inst.org
15 // =============================================================================
16
17 #include "cd-hi.h"
18 #include "cd-hi-init.h"
19
20 //////////////////////////////////// MAIN /////////////////////////////////////
21 int main(int argc, char **argv) {
22 int i, j, k, i1, j1, sggi, sgj;
23 char db_in[MAX_FILE_NAME];
24 char db_out[MAX_FILE_NAME];
25 char db_clstr[MAX_FILE_NAME];
26 char db_clstr_bak[MAX_FILE_NAME];
27 char db_clstr_old[MAX_FILE_NAME];
28
29 // *********************************** parse command line and open file
30 if (argc < 5) print_usage(argv[0]);
31
32 for (i=1; i<argc; i++) {
33 if (strcmp(argv[i], "-i") == 0)
34 strncpy(db_in, argv[++i], MAX_FILE_NAME-1);
35 else if (strcmp(argv[i], "-o") == 0)
36 strncpy(db_out, argv[++i], MAX_FILE_NAME-1);
37 else if (strcmp(argv[i], "-u") == 0) {
38 strncpy(db_clstr_old, argv[++i], MAX_FILE_NAME-1);
39 old_clstr_file = 1;
40 }
41 else if (strcmp(argv[i], "-M") == 0)
42 mem_limit = 1000000 * atoi(argv[++i]);
43 else if (strcmp(argv[i], "-l") == 0)
44 length_of_throw = atoi(argv[++i]);
45 else if (strcmp(argv[i], "-c") == 0) {
46 NR_clstr = atof(argv[++i]);
47 if ((NR_clstr > 1.0) || (NR_clstr < 0.4)) bomb_error("invalid clstr");
48 NR_clstr100 = (int) (NR_clstr * 100 );
49 }
50 else if (strcmp(argv[i], "-L") == 0) {
51 NR_cov = atof(argv[++i]);
52 if ((NR_cov > 1.0) || (NR_cov < 0.0)) bomb_error("invalid coverage cutoff");
53 }
54 else if (strcmp(argv[i], "-b") == 0) {
55 BAND_width = atoi(argv[++i]);
56 if (BAND_width < 0 ) bomb_error("invalid band width");
57 }
58 else if (strcmp(argv[i], "-n") == 0) {
59 NAA = atoi(argv[++i]);
60 if ( NAA < 2 || NAA > 5 ) bomb_error("invalid word length");
61 }
62 else if (strcmp(argv[i], "-d") == 0) {
63 des_len = atoi(argv[++i]);
64 if ( des_len < 15 )
65 bomb_error("too short description, not enough to identify sequences");
66 }
67 else if (strcmp(argv[i], "-t") == 0) {
68 tolerance = atoi(argv[++i]);
69 if ( tolerance < 0 || tolerance > 5 ) bomb_error("invalid tolerance");
70 }
71 else
72 print_usage(argv[0]);
73 }
74 db_clstr[0]=0; strcat(db_clstr,db_out); strcat(db_clstr,".clstr");
75 db_clstr_bak[0]=0; strcat(db_clstr_bak,db_out); strcat(db_clstr_bak,".bak.clstr");
76
77 if ( NAA == 2 ) { NAAN = NAA2; }
78 else if ( NAA == 3 ) { NAAN = NAA3; }
79 else if ( NAA == 4 ) { NAAN = NAA4; }
80 else if ( NAA == 5 ) { NAAN = NAA5; }
81 else bomb_error("invalid -n parameter!");
82
83 word_table.init(NAA, NAAN);
84
85 if ( tolerance ) {
86 int clstr_idx = (int) (NR_clstr * 100) - naa_stat_start_percent;
87 int tcutoff = naa_stat[tolerance-1][clstr_idx][5-NAA];
88
89 if (tcutoff < 5 )
90 bomb_error("Too short word length, increase it or the tolerance");
91 for ( i=5; i>NAA; i--) {
92 if ( naa_stat[tolerance-1][clstr_idx][5-i] > 10 ) {
93 cout << "Your word length is " << NAA << ", using "
94 << i << " may be faster!" <<endl;
95 break;
96 }
97 }
98 }
99 else {
100 if ( NR_clstr > 0.85 && NAA < 5)
101 cout << "Your word length is " << NAA
102 << ", using 5 may be faster!" <<endl;
103 else if ( NR_clstr > 0.80 && NAA < 4 )
104 cout << "Your word length is " << NAA
105 << ", using 4 may be faster!" <<endl;
106 else if ( NR_clstr > 0.75 && NAA < 3 )
107 cout << "Your word length is " << NAA
108 << ", using 3 may be faster!" <<endl;
109 }
110
111 if ( length_of_throw <= NAA ) bomb_error("Too short -l, redefine it");
112
113 ifstream in1(db_in);
114 if ( ! in1 ) { cout << "Can not open file" << db_in << endl; exit(1); }
115 ofstream out1(db_out);
116 if ( ! out1) { cout << "Can not open file" << db_out << endl; exit(1); }
117 ofstream out2(db_clstr);
118 if ( ! out2) { cout << "Can not open file" << db_clstr << endl; exit(1); }
119 ofstream out2_bak(db_clstr_bak);
120 if ( ! out2_bak) { cout << "Can not open file" << db_clstr_bak << endl; exit(1); }
121
122 DB_no = db_seq_no_test(in1);
123 if ((NR_len = new int [DB_no]) == NULL) bomb_error("Memory");
124 if ((NR_idx = new int [DB_no]) == NULL) bomb_error("Memory");
125 if ((NR90_idx = new int [DB_no]) == NULL) bomb_error("Memory");
126 if ((NR_clstr_no = new int [DB_no]) == NULL) bomb_error("Memory");
127 if ((NR_iden = new char [DB_no]) == NULL) bomb_error("Memory");
128 if ((NR_coverage = new char [DB_no]) == NULL) bomb_error("Memory");
129 if ((NR_flag = new char [DB_no]) == NULL) bomb_error("Memory");
130 if ((NR_seq = new char *[DB_no]) == NULL) bomb_error("Memory");
131 int *Clstr_no, *(*Clstr_list);
132 if ((Clstr_no = new int [DB_no]) == NULL) bomb_error("Memory");
133 if ((Clstr_list = new int *[DB_no]) == NULL) bomb_error("Memory");
134
135 if ( old_clstr_file ) {
136 ifstream in_clstr(db_clstr_old);
137 if ( ! in_clstr) {
138 cout << "Can not open file" << db_clstr_old << endl;
139 exit(1);
140 }
141
142 //number of seq in old clstr file
143 int clstr_seq_no = old_clstr_seq_no_test(in_clstr);
144 in_clstr.clear();
145 in_clstr.open(db_clstr_old);
146
147 if ((NRo_idx = new int [clstr_seq_no]) == NULL) bomb_error("Memory");
148 if ((NRo_id1 = new int [clstr_seq_no]) == NULL) bomb_error("Memory");
149 if ((NRo_id2 = new int [clstr_seq_no]) == NULL) bomb_error("Memory");
150 if ((NRo_clstr_no = new int [clstr_seq_no]) == NULL) bomb_error("Memory");
151 if ((NRo_NR_idx = new int [clstr_seq_no]) == NULL) bomb_error("Memory");
152 if ((NRo_iden = new char[clstr_seq_no]) == NULL) bomb_error("Memory");
153 old_clstr_read_in(in_clstr, NRo_no, NRo90_no, NRo_idx, NRo_id1, NRo_id2,
154 NRo_iden, NRo_clstr_no, NRo_NR_idx);
155 in_clstr.close();
156 }
157
158 in1.clear();
159 in1.open(db_in);
160 NRo_no > 0 ?
161 db_read_in2(in1, length_of_throw, NR_no, NR_seq, NR_len,
162 NRo_no, NRo_idx, NRo_id1, NRo_id2, NRo_NR_idx):
163 db_read_in(in1, length_of_throw, NR_no, NR_seq, NR_len);
164 in1.close();
165 cout << "total seq: " << NR_no << endl;
166
167 // ********************************************* init NR_flag
168 for (i=0; i<NR_no; i++) NR_flag[i] = 0;
169 if ( old_clstr_file ) {
170 for (i=0; i<NRo_no; i++) {
171 if ( (j = NRo_NR_idx[i]) == -1 ) continue ;
172 if ( NRo_clstr_no[i] == i ) NR_flag[j] |= IS_OLD_REP;
173
174 if ( (k = NRo_NR_idx[ NRo_clstr_no[i] ]) == -1 ) continue ;
175 if ( NRo_clstr_no[i] != i ) NR_flag[j] |= IS_OLD_REDUNDANT;
176 NR_iden[j] = NRo_iden[i];
177 NR_clstr_no[j] = k; // note, later it need be changed to NR90_no
178 }
179 delete [] NRo_idx;
180 delete [] NRo_id1;
181 delete [] NRo_id2;
182 delete [] NRo_iden;
183 delete [] NRo_clstr_no;
184 delete [] NRo_NR_idx;
185 }
186
187 sort_seqs_divide_segs(NR_no, NR_len, NR_idx, NR_seq, mem_limit, NAAN,
188 SEG_no, SEG_b, SEG_e, db_swap);
189
190 // ********************************************* Main loop
191 char *seqi;
192 double aa1_cutoff = NR_clstr;
193 double aa2_cutoff = 1 - (1-NR_clstr)*2;
194 double aan_cutoff = 1 - (1-NR_clstr)*NAA;
195 int len, hit_no, has_aa2, iden_no, aan_no, segb;
196 int aan_list[MAX_SEQ];
197 INTs aan_list_no[MAX_SEQ];
198 INTs *look_and_count;
199 if ((look_and_count= new INTs[NR_no]) == NULL) bomb_error("Memory");
200
201 if ( tolerance ) {
202 int clstr_idx = (int) (NR_clstr * 100) - naa_stat_start_percent;
203 double d2 = ((double) (naa_stat[tolerance-1][clstr_idx][3] )) / 100;
204 double dn = ((double) (naa_stat[tolerance-1][clstr_idx][5-NAA] )) / 100;
205 aa2_cutoff = d2 > aa2_cutoff ? d2 : aa2_cutoff;
206 aan_cutoff = dn > aan_cutoff ? dn : aan_cutoff;
207 }
208
209 NR90_no = 0;
210 for (sggi=0; sggi<SEG_no; sggi++) {
211 if (SEG_no >1)
212 cout << "SEG " << sggi << " " << SEG_b[sggi] << " " << SEG_e[sggi] <<endl;
213
214 for (sgj=sggi-1; sgj>=0; sgj--) {
215 cout << "Reading swap" << endl;
216 if ( sgj != sggi-1) word_table.read_tbl(db_swap[sgj]); // reading old segment
217 cout << "Comparing with SEG " << sgj << endl;
218 for (i1=SEG_b[sggi]; i1<=SEG_e[sggi]; i1++) {
219 i = NR_idx[i1];
220 if (NR_flag[i] & IS_REDUNDANT ) continue;
221
222 if ( (NR_flag[i] & IS_OLD_REDUNDANT) &&
223 (NR_flag[ NR_clstr_no[i] ] & IS_REP) ) {
224 NR_clstr_no[i] = - (NR_clstr_no[ NR_clstr_no[i] ]) - 1;
225 NR_flag[i] |= IS_REDUNDANT ;
226 delete [] NR_seq[i];
227 continue;
228 }
229
230 len = NR_len[i]; seqi = NR_seq[i];
231 has_aa2 = 0;
232
233 int flag = check_this(len, seqi, has_aa2,
234 NAA, aan_no, aan_list, aan_list_no, look_and_count,
235 hit_no, SEG90_b[sgj], SEG90_e[sgj], iden_no,
236 aa1_cutoff, aa2_cutoff, aan_cutoff,
237 NR_flag[i], NR_flag);
238
239 if ( flag == 1) { // if similar to old one delete it
240 delete [] NR_seq[i];
241 NR_clstr_no[i] = -hit_no-1; // (-hit_no-1) for non representatives
242 NR_iden[i] = iden_no * 100 / len;
243 NR_flag[i] |= IS_REDUNDANT ;
244 }
245 } //for (i1=SEG_b[sggi]; i1<=SEG_e[sggi]; i1++)
246 } // for (sgj=0; sgj<sggi; sgj++)
247
248 if (SEG_no >1) cout << "Refresh Memory" << endl;
249 word_table.clean();
250
251 if (SEG_no >1) cout << "Self comparing" << endl;
252 segb = NR90_no;
253 for (i1=SEG_b[sggi]; i1<=SEG_e[sggi]; i1++) {
254 i = NR_idx[i1];
255
256 if ( ! (NR_flag[i] & IS_REDUNDANT) ) {
257 if ( (NR_flag[i] & IS_OLD_REDUNDANT) &&
258 (NR_flag[ NR_clstr_no[i] ] & IS_REP) ) {
259 NR_clstr_no[i] = - (NR_clstr_no[ NR_clstr_no[i] ]) - 1;
260 NR_flag[i] |= IS_REDUNDANT ;
261 delete [] NR_seq[i];
262 }
263 else {
264 len = NR_len[i]; seqi = NR_seq[i];
265 has_aa2 = 0;
266
267 int flag = check_this(len, seqi, has_aa2,
268 NAA, aan_no, aan_list, aan_list_no, look_and_count,
269 hit_no, segb, NR90_no-1, iden_no,
270 aa1_cutoff, aa2_cutoff, aan_cutoff,
271 NR_flag[i], NR_flag);
272
273 if ( flag == 1) { // if similar to old one delete it
274 delete [] NR_seq[i];
275 NR_clstr_no[i] = -hit_no-1; // (-hit_no-1) for non representatives
276 NR_iden[i] = iden_no * 100 / len;
277 NR_flag[i] |= IS_REDUNDANT ;
278 }
279 else { // else add to NR90 db
280 NR90_idx[NR90_no] = i;
281 NR_clstr_no[i] = NR90_no; // positive value for representatives
282 NR_iden[i] = 0;
283 NR_flag[i] |= IS_REP;
284 word_table.add_word_list(aan_no, aan_list, aan_list_no, NR90_no);
285 NR90_no++;
286 } // else
287 } // else
288 } // if ( ! (NR_flag[i] & IS_REDUNDANT) )
289
290 if ( (i1+1) % 100 == 0 ) {
291 cerr << ".";
292 if ( (i1+1) % 1000 == 0 )
293 cout << i1+1 << " finished\t" << NR90_no << " clusters" << endl;
294 }
295 } // for (i1=SEG_b[sggi]; i1<=SEG_e[sggi]; i1++) {
296
297 SEG90_b[sggi] = segb; SEG90_e[sggi] = NR90_no-1;
298 if ( sggi < SEG_no-2 ) word_table.write_tbl( db_swap[sggi] ); // if not last segment
299 } // for (sggi=0; sggi<SEG_no; sggi++) {
300 cout << NR_no << " finished\t" << NR90_no << " clusters" << endl;
301
302 for (i=0; i<NR90_no; i++) delete [] NR_seq[ NR90_idx[i] ];
303
304 cout << "writing new database" << endl;
305 in1.clear();
306 in1.open(db_in);
307 db_read_and_write(in1, out1, length_of_throw, des_len, NR_seq, NR_clstr_no);
308 in1.close();
309 out1.close();
310
311 // write a backup clstr file in case next step crashes
312 for (i=0; i<NR_no; i++) {
313 j1 = NR_clstr_no[i];
314 if ( j1 < 0 ) j1 =-j1-1;
315 out2_bak << j1 << "\t" << NR_len[i] << "aa, "<< NR_seq[i] << "...";
316 if ( NR_iden[i]>0 ) out2_bak << " at " << int(NR_iden[i]) << "%" << endl;
317 else out2_bak << " *" << endl;
318 }
319 out2_bak.close();
320
321 cout << "writing clustering information" << endl;
322 // write clstr information
323 // I mask following 3 lines, because it crash when clusters NR
324 // I thought maybe there is not a big block memory now, so
325 // move the new statement to the begining of program, but because I
326 // don't know the NR90_no, I just use DB_no instead
327 // int *Clstr_no, *(*Clstr_list);
328 // if ((Clstr_no = new int[NR90_no]) == NULL) bomb_error("Memory");
329 // if ((Clstr_list = new int*[NR90_no]) == NULL) bomb_error("Memory");
330
331
332 for (i=0; i<NR90_no; i++) Clstr_no[i]=0;
333 for (i=0; i<NR_no; i++) {
334 j1 = NR_clstr_no[i];
335 if ( j1 < 0 ) j1 =-j1-1;
336 Clstr_no[j1]++;
337 }
338 for (i=0; i<NR90_no; i++) {
339 if((Clstr_list[i] = new int[ Clstr_no[i] ]) == NULL) bomb_error("Memory");
340 Clstr_no[i]=0;
341 }
342
343 for (i=0; i<NR_no; i++) {
344 j1 = NR_clstr_no[i];
345 if ( j1 < 0 ) j1 =-j1-1;
346 Clstr_list[j1][ Clstr_no[j1]++ ] = i;
347 }
348
349 for (i=0; i<NR90_no; i++) {
350 out2 << ">Cluster " << i << endl;
351 for (k=0; k<Clstr_no[i]; k++) {
352 j = Clstr_list[i][k];
353 out2 << k << "\t" << NR_len[j] << "aa, "<< NR_seq[j] << "...";
354 if ( NR_iden[j]>0 ) out2 << " at " << int(NR_iden[j]) << "%" << endl;
355 else out2 << " *" << endl;
356 }
357 }
358 out2.close();
359 cout << "program completed !" << endl << endl;
360
361 } // END int main
362
363 ///////////////////////FUNCTION of common tools////////////////////////////
364
365
366 int check_this(int len, char *seqi, int &has_aa2,
367 int NAA, int& aan_no, int *aan_list, INTs *aan_list_no,
368 INTs *look_and_count,
369 int &hit_no, int libb, int libe, int &iden_no,
370 double aa1_cutoff, double aa2_cutoff, double aan_cutoff,
371 char this_flag, char *NR_flag) {
372
373 static int taap[MAX_UAA*MAX_UAA];
374 static INTs aap_list[MAX_SEQ];
375 static INTs aap_begin[MAX_UAA*MAX_UAA];
376
377 int j, j1, c22, sk, mm;
378 int required_aa1 = int (aa1_cutoff* (double) len);
379 int required_aa2 = (aa1_cutoff > 0.95) ?
380 len-2 +1-(len-required_aa1)*2 : int (aa2_cutoff* (double) len);
381 int required_aan = (aa1_cutoff > 0.95) ?
382 len-NAA+1-(len-required_aa1)*NAA : int (aan_cutoff* (double) len);
383
384 // check_aan_list
385 aan_no = len - NAA + 1;
386 if ( NAA == 2)
387 for (j=0; j<aan_no; j++)
388 aan_list[j] = seqi[j]*NAA1 + seqi[j+1];
389 else if ( NAA == 3)
390 for (j=0; j<aan_no; j++)
391 aan_list[j] = seqi[j]*NAA2 + seqi[j+1]*NAA1 + seqi[j+2];
392 else if ( NAA == 4)
393 for (j=0; j<aan_no; j++)
394 aan_list[j] =
395 seqi[j]*NAA3+seqi[j+1]*NAA2 + seqi[j+2]*NAA1 + seqi[j+3];
396 else if ( NAA == 5)
397 for (j=0; j<aan_no; j++)
398 aan_list[j] =
399 seqi[j]*NAA4+seqi[j+1]*NAA3+seqi[j+2]*NAA2+seqi[j+3]*NAA1+seqi[j+4];
400 else return FAILED_FUNC;
401
402 quick_sort(aan_list,0,aan_no-1);
403 for(j=0; j<aan_no; j++) aan_list_no[j]=1;
404 for(j=aan_no-1; j; j--) {
405 if (aan_list[j] == aan_list[j-1]) {
406 aan_list_no[j-1] += aan_list_no[j];
407 aan_list_no[j]=0;
408 }
409 }
410 // END check_aan_list
411
412
413 // lookup_aan
414 for (j=libe; j>=libb; j--) look_and_count[j]=0;
415 word_table.count_word_no(aan_no, aan_list, aan_list_no, look_and_count);
416
417
418 // contained_in_old_lib()
419 int band_left, band_right, best_score, band_width1, best_sum, len2;
420 int len1 = len - 1;
421 char *seqj;
422 int flag = 0; // compare to old lib
423 for (j=libe; j>=libb; j--) {
424 if ( look_and_count[j] < required_aan ) continue;
425 if ( (this_flag & IS_OLD_REP ) &&
426 (NR_flag[NR90_idx[j]] & IS_OLD_REP) ) continue;
427 len2 = NR_len[NR90_idx[j]];
428 seqj = NR_seq[NR90_idx[j]];
429
430 if ( has_aa2 == 0 ) { // calculate AAP array
431 for (sk=0; sk<NAA2; sk++) taap[sk] = 0;
432 for (j1=0; j1<len1; j1++) {
433 c22= seqi[j1]*NAA1 + seqi[j1+1];
434 taap[c22]++;
435 }
436 for (sk=0,mm=0; sk<NAA2; sk++) {
437 aap_begin[sk] = mm; mm+=taap[sk]; taap[sk] = 0;
438 }
439 for (j1=0; j1<len1; j1++) {
440 c22= seqi[j1]*NAA1 + seqi[j1+1];
441 aap_list[aap_begin[c22]+taap[c22]++] =j1;
442 }
443 has_aa2 = 1;
444 }
445
446 band_width1 = (BAND_width < len+len2-2 ) ? BAND_width : len+len2-2;
447 diag_test_aapn(seqj, len, len2, taap, aap_begin,
448 aap_list, best_sum,
449 band_width1, band_left, band_right, required_aa1);
450 if ( best_sum < required_aa2 ) continue;
451
452 local_band_align(seqi, seqj, len, len2, mat,
453 best_score, iden_no, band_left, band_right);
454 if ( iden_no < required_aa1 ) continue;
455 if ( (iden_no * 100 / len ) < NR_clstr100 ) continue;
456
457 flag = 1; break; // else flag = 1, and break loop
458 }
459 hit_no = j;
460 return flag;
461 // END contained_in_old_lib()
462 } // END check_this
463
464 /////////////////////////// END ALL ////////////////////////