// =============================================================================
// CD-HI/CD-HIT
//
// Cluster Database at High Identity
//
// CD-HIT clusters protein sequences at high identity threshold.
// This program can remove the high sequence redundance efficiently.
//
// program written by
//                                      Weizhong Li
//                                      UCSD, San Diego Supercomputer Center
//                                      La Jolla, CA, 92093
//                                      Email liwz@sdsc.edu
//
//                 at
//                                      Adam Godzik's lab
//                                      The Burnham Institute
//                                      La Jolla, CA, 92037
//                                      Email adam@burnham-inst.org
// =============================================================================

#include "cd-hi.h"

// Citation strings printed at the end of the usage/help screens.
// The first literal previously ran words together ("thesize",
// "GodzikBioinformatics"); the missing spaces are restored here.
char cd_hit_ref1[] = "\"Clustering of highly homologous sequences to reduce the size of large protein database\", Weizhong Li, Lukasz Jaroszewski & Adam Godzik Bioinformatics, (2001) 17:282-283";
char cd_hit_ref2[] = "\"Tolerating some redundancy significantly speeds up clustering of large protein databases\", Weizhong Li, Lukasz Jaroszewski & Adam Godzik Bioinformatics, (2002) 18:77-82";

// Global integer buffer with MAX_DIAG entries (MAX_DIAG comes from cd-hi.h).
// NOTE(review): nothing in the visible part of this file reads or writes it;
// diag_test_aapn keeps its own function-local static diag_score instead.
// Presumably it is used by another translation unit -- confirm before removing.
int DIAG_score[MAX_DIAG];

// Report an unrecoverable error on stderr and stop the program.
//   message : text printed on its own line below the "Fatal Error" banner.
// Does not return: the process exits with status 1.
void bomb_error(char *message) {
  cerr << "\nFatal Error\n"
       << message << endl
       << "\nProgram halted !! \n\n";
  exit(1);
} // END void bomb_error

// Two-part variant of bomb_error: prints both strings on one line, separated
// by a single space, between the "Fatal Error" banner and the halt notice.
//   message  : first part of the error text (e.g. what failed).
//   message2 : second part of the error text (e.g. the object involved).
// Does not return: the process exits with status 1.
void bomb_error(char *message, char *message2) {
  cerr << "\nFatal Error\n"
       << message << " " << message2 << endl
       << "\nProgram halted !! \n\n";
  exit(1);
} // END void bomb_error



// In-place ascending quicksort of a[lo0..hi0] (both bounds inclusive).
// Typical call for an array of `no` elements: quick_sort(a, 0, no-1).
//   a   : array to sort
//   lo0 : index of the first element of the range
//   hi0 : index of the last element of the range
// The pivot is the value stored at the middle index of the range.
// Always returns 0.
int quick_sort (int *a, int lo0, int hi0 ) {
  if ( hi0 <= lo0 ) return 0;   // fewer than two elements: nothing to do

  const int pivot = a[ ( lo0 + hi0 ) / 2 ];
  int left  = lo0;
  int right = hi0;

  while ( left <= right ) {
    // advance each cursor to the next element sitting on the wrong side
    while ( left  < hi0 && a[left]  < pivot ) ++left;
    while ( right > lo0 && a[right] > pivot ) --right;
    if ( left > right ) break;

    const int held = a[left];
    a[left]  = a[right];
    a[right] = held;
    ++left;
    --right;
  }

  // recurse into whichever partitions still hold more than one element
  if ( lo0  < right ) quick_sort(a, lo0,  right);
  if ( left < hi0   ) quick_sort(a, left, hi0);
  return 0;
} // quick_sort


// In-place ascending quicksort of a[lo0..hi0] (inclusive) that applies every
// swap to the companion array idx as well, so idx[k] keeps travelling with
// a[k]. Typical call: quick_sort_idx(a, idx, 0, no-1).
//   a   : keys to sort
//   idx : companion array, permuted exactly like a
//   lo0 : index of the first element of the range
//   hi0 : index of the last element of the range
// Always returns 0.
int quick_sort_idx (int *a, int *idx, int lo0, int hi0 ) {
  if ( hi0 <= lo0 ) return 0;   // fewer than two elements: nothing to do

  const int pivot = a[ ( lo0 + hi0 ) / 2 ];
  int left  = lo0;
  int right = hi0;

  while ( left <= right ) {
    while ( left  < hi0 && a[left]  < pivot ) ++left;
    while ( right > lo0 && a[right] > pivot ) --right;
    if ( left > right ) break;

    int held;
    held = a[left];   a[left]   = a[right];   a[right]   = held;
    held = idx[left]; idx[left] = idx[right]; idx[right] = held;
    ++left;
    --right;
  }

  if ( lo0  < right ) quick_sort_idx(a, idx, lo0,  right);
  if ( left < hi0   ) quick_sort_idx(a, idx, left, hi0);
  return 0;
} // quick_sort_idx


// In-place ascending quicksort of a[lo0..hi0] (inclusive) that carries two
// companion arrays along: every swap done on a is repeated on b and on idx.
// Typical call: quick_sort_idx2(a, b, idx, 0, no-1).
//   a   : keys to sort
//   b   : first companion array, permuted exactly like a
//   idx : second companion array, permuted exactly like a
//   lo0 : index of the first element of the range
//   hi0 : index of the last element of the range
// Always returns 0.
int quick_sort_idx2 (int *a, int *b, int *idx, int lo0, int hi0 ) {
  if ( hi0 <= lo0 ) return 0;   // fewer than two elements: nothing to do

  const int pivot = a[ ( lo0 + hi0 ) / 2 ];
  int left  = lo0;
  int right = hi0;

  while ( left <= right ) {
    while ( left  < hi0 && a[left]  < pivot ) ++left;
    while ( right > lo0 && a[right] > pivot ) --right;
    if ( left > right ) break;

    int held;
    held = a[left];   a[left]   = a[right];   a[right]   = held;
    held = b[left];   b[left]   = b[right];   b[right]   = held;
    held = idx[left]; idx[left] = idx[right]; idx[right] = held;
    ++left;
    --right;
  }

  if ( lo0  < right ) quick_sort_idx2(a, b, idx, lo0,  right);
  if ( left < hi0   ) quick_sort_idx2(a, b, idx, left, hi0);
  return 0;
} // quick_sort_idx2


// Two-key ascending sort of positions lo0..hi0 (inclusive): a is the primary
// key, b the secondary key, and idx is carried along with both.
//   a, b : key arrays (a has priority over b)
//   idx  : companion array, permuted together with the keys
// Always returns 0.
int quick_sort_a_b_idx (int *a, int *b, int *idx, int lo0, int hi0 ) {

  // pass 1: order everything by the primary key
  quick_sort_idx2(a, b, idx, lo0, hi0);

  // pass 2: inside every run of equal primary keys, order by the secondary
  // key; a itself needs no further swaps because it is constant in a run
  int run_start = lo0;
  for (int pos = lo0 + 1; pos <= hi0; ++pos) {
    if ( a[pos] == a[pos-1] ) continue;      // still inside the same run

    const int run_end = pos - 1;
    if ( run_end > run_start ) quick_sort_idx(b, idx, run_start, run_end);
    run_start = pos;
  }

  // the final run is closed by the end of the range, not by a key change
  if ( run_start < hi0 ) quick_sort_idx(b, idx, run_start, hi0);

  return 0;
} // quick_sort_a_b_idx


// Normalise a sequence string in place: every alphabetic character is
// converted to upper case and kept, every other character (digits, blanks,
// gaps, line breaks, ...) is dropped. The result is NUL-terminated and never
// longer than the input.
//   seq : NUL-terminated, writable string; a NULL pointer is ignored.
// Characters are converted to unsigned char before being handed to
// toupper()/isalpha(): passing a negative plain char to the <ctype.h>
// functions is undefined behaviour.
void format_seq(char *seq) {
  if ( seq == NULL ) return;

  char *out = seq;
  for (const char *in = seq; *in != '\0'; ++in) {
    const int upper = toupper( (unsigned char) *in );
    if ( isalpha(upper) ) *out++ = (char) upper;
  }
  *out = '\0';
} // END void format_seq


////Diagonal pre-filter: counts shared 2-letter words on every diagonal of the
////len1 x len2 comparison and picks the window of adjacent diagonals
////(at most band_width wide) with the highest total count.
////The original note says this is meant for the simple case len1 <= len2.
////for diag 0 means direction (0,0) -> (1,1)
////         1 means direction (0,1) -> (1,2)
////        -1 means direction (1,0) -> (2,1)
////
////Inputs
////  NAA1         multiplier that combines two adjacent codes of iseq2 into
////               one word code (first*NAA1 + second)
////  iseq2        integer-coded sequence 2, len2 codes
////  len1, len2   lengths of the two sequences
////  taap         per word code: number of entries to read from aap_list
////  aap_begin    per word code: offset of its first entry in aap_list
////  aap_list     entries subtracted from the sequence-2 position to obtain
////               the diagonal (presumably word positions in sequence 1 --
////               confirm against the caller)
////  band_width   maximum number of diagonals in the reported band
////  required_aa1 trims required_aa1-1 diagonals at the low end and
////               required_aa1 at the high end of the search range
////Outputs
////  best_sum     summed count inside the chosen band after edge trimming
////  band_left    lowest diagonal of the band (0 = main diagonal)
////  band_right   highest diagonal of the band
////Returns OK_FUNC.
////NOTE(review): diag_score is a static array of MAX_DIAG ints and no check
////that len1+len2-1 <= MAX_DIAG is visible here -- confirm the caller
////guarantees it. Being static, the buffer is shared between calls.
int diag_test_aapn(int NAA1, char iseq2[], int len1, int len2, int *taap,
        INTs *aap_begin, INTs *aap_list, int &best_sum,
        int band_width, int &band_left, int &band_right, int required_aa1) {
  int i, i1, j, k, l, m, n;
  int *pp;
  int nall = len1+len2-1;             // number of diagonals
  static int diag_score[MAX_DIAG]; 

  // clear the nall counters that this call will use
  for (pp=diag_score, i=nall; i; i--, pp++) *pp=0;

  int bi, bj, c22;
  INTs *bip;
  int len11 = len1-1;
  int len22 = len2-1;
  // i1 = i + len1-1, so that (i1 - entry) is a non-negative array index;
  // array index len1-1 corresponds to diagonal 0
  i1 = len11;
  for (i=0; i<len22; i++,i1++) {
//    c22 = iseq2[i]*NAA1 + iseq2[i+1];
    c22 = iseq2[i]*NAA1+ iseq2[i+1];  // code of the word starting at i
    if ( (j=taap[c22]) == 0) continue; // word has no entries in the table
    bip = aap_list+ aap_begin[c22];     //    bi = aap_begin[c22];
    for (; j; j--, bip++) {  //  for (j=0; j<taap[c22]; j++,bi++) {
      diag_score[i1 - *bip]++;
    }
  }

  //find the best band range
//  int band_b = required_aa1;
  int band_b = required_aa1-1 >= 0 ? required_aa1-1:0;  // on dec 21 2001
  int band_e = nall - required_aa1;
  // last index of the first window, clipped to band_e
  int band_m = ( band_b+band_width-1 < band_e ) ? band_b+band_width-1 : band_e;
  int best_score=0;
  for (i=band_b; i<=band_m; i++) best_score += diag_score[i];
  int from=band_b;
  int end =band_m;
  int score = best_score;  
  // slide the window one diagonal at a time: drop the leftmost counter, add
  // the next one on the right, and remember the best position seen.
  // NOTE(review): the first window may include index band_e (i<=band_m) while
  // this loop stops at j<band_e -- confirm whether band_e should be reachable.
  for (k=from, j=band_m+1; j<band_e; j++) {
    score -= diag_score[k++]; 
    score += diag_score[j]; 
    if ( score > best_score ) {
      from = k;
      end  = j;
      best_score = score;
    }
  }
  // shrink the band from both sides while the edge diagonal has fewer than
  // 5 hits, removing those hits from the reported sum
  for (j=from; j<=end; j++) { // if aap pairs fail to open gap
    if ( diag_score[j] < 5 ) { best_score -= diag_score[j]; from++;}
    else break;
  }
  for (j=end; j>=from; j--) { // if aap pairs fail to open gap
    if ( diag_score[j] < 5 ) { best_score -= diag_score[j]; end--;}
    else break;
  }

//  delete [] diag_score;
  // convert array indices back to diagonal numbers
  band_left = from-len1+1; 
  band_right= end-len1+1;
  best_sum = best_score;
  return OK_FUNC;
}
// END diag_test_aapn
 

////Local alignment of two sequences restricted to a band of diagonals.
////for band 0 means direction (0,0) -> (1,1)
////         1 means direction (0,1) -> (1,2)
////        -1 means direction (1,0) -> (2,1)
////
////Inputs
////  iseq1, iseq2  integer-coded sequences (values index mat.matrix)
////  len1, len2    their lengths
////  mat           substitution scores (mat.matrix) and gap penalties
////                (mat.gap_array, indexed by gap length minus one)
////  band_left     lowest diagonal of the band
////  band_right    highest diagonal of the band
////Outputs
////  best_score    highest cell score found (0 if nothing scores above 0)
////  iden_no       number of identical positions on the path to that cell
////Returns FAILED_FUNC when the band is empty or does not intersect the
////len1 x len2 matrix (nothing is allocated and best_score is left untouched
////in that case), otherwise OK_FUNC.
////
////Storage: only the band is kept. Cell (i, j) lives at [i][j1] with
////j1 = j - i - band_left, so the same j1 in consecutive rows is a diagonal
////step.
////NOTE(review): the border initialisation updates best_score but not
////iden_no, so when the best cell is a border cell iden_no stays 0 --
////confirm this is intended.
////NOTE(review): gap_array is indexed by kk without a visible bound check
////against its allocated length -- confirm band widths stay below it.
int local_band_align(char iseq1[], char iseq2[], int len1, int len2,
                     AA_MATRIX &mat, int &best_score, int &iden_no,
                     int band_left, int band_right) {
  int i, j, k, l, m, n, j1;
  int ii, jj, kk;
  int best_score1, iden_no1, best_i, best_j, best_j1;
  int *gap_array;
  iden_no = 0;

  // reject bands lying completely outside the matrix, or inverted bands
  if ( (band_right >= len2 ) ||
       (band_left  <= -len1) ||
       (band_left  > band_right) ) return FAILED_FUNC;

  // allocate mem for score_mat[len1][band_width] etc
  int band_width = band_right - band_left + 1;
  int *(*score_mat), *(*iden_mat);
  if ((score_mat = new int * [len1]) == NULL) bomb_error("Memory");
  if ((iden_mat  = new int * [len1]) == NULL) bomb_error("Memory");
  for (i=0; i<len1; i++) {
    if ((score_mat[i] = new int [band_width]) == NULL) bomb_error("Memory");
    if ((iden_mat[i]  = new int [band_width]) == NULL) bomb_error("Memory");
  }
  //here index j1 refer to band column
  // only score_mat is cleared; iden_mat cells are written before being read
  // for any cell whose score is positive
  for (i=0; i<len1; i++) for (j1=0; j1<band_width; j1++) score_mat[i][j1] =  0;

  gap_array  = mat.gap_array;
  best_score = 0;

  if (band_left < 0) {  //set score to left border of the matrix within band
    int tband = (band_right < 0) ? band_right : 0;
    for (k=band_left; k<tband; k++) {
      i = -k;               // diagonal k meets column 0 at row -k
      j1 = k-band_left;
      if ( ( score_mat[i][j1] = mat.matrix[iseq1[i]][iseq2[0]] ) > best_score) 
        best_score = score_mat[i][j1];
      iden_mat[i][j1] = (iseq1[i] == iseq2[0]) ? 1 : 0;
    }
  }

  if (band_right >=0) { //set score to top border of the matrix within band
    int tband = (band_left > 0) ? band_left : 0;
    for (i=0,j=tband; j<=band_right; j++) {
      j1 = j-band_left;
      if ( ( score_mat[i][j1] = mat.matrix[iseq1[i]][iseq2[j]] ) > best_score)
        best_score = score_mat[i][j1];
      iden_mat[i][j1] = (iseq1[i] == iseq2[j]) ? 1 : 0;
    }
  }

  for (i=1; i<len1; i++) {
    for (j1=0; j1<band_width; j1++) {
      j = j1+i+band_left;   // real column of this band cell
      if ( j<1 ) continue;      // column 0 was handled by the border setup
      if ( j>=len2) continue;   // band cell lies outside the matrix

      int sij = mat.matrix[iseq1[i]][iseq2[j]];
      int iden_ij = (iseq1[i] == iseq2[j] ) ? 1 : 0;
      int s1, *mat_row;
      int k0, k_idx;

      // from (i-1,j-1)
      // a non-positive predecessor is replaced by a fresh start (score 0)
      if ( (best_score1 = score_mat[i-1][j1] )> 0 ) {
        iden_no1 = iden_mat[i-1][j1];
      }
      else {
        best_score1 = 0;
        iden_no1 = 0;
      }

      // from last row
      // predecessors (i-1, j-2), (i-1, j-3), ...: kk+1 columns are skipped;
      // k0 keeps the scan inside the band and at columns >= 0
      mat_row = score_mat[i-1];
      k0 = (-band_left+1-i > 0) ? -band_left+1-i : 0;
      for (k=j1-1, kk=0; k>=k0; k--, kk++) {
        if ( (s1 = mat_row[k] + gap_array[kk] ) > best_score1 ){
           best_score1 = s1;
           iden_no1 = iden_mat[i-1][k];
        }
      }

      // predecessors (i-2, j-1), (i-3, j-1), ...: kk+1 rows are skipped;
      // moving up one row in the same real column shifts the band column by
      // +1, and k0 stops before that column leaves the band
      k0 = (j-band_right-1 > 0) ? j-band_right-1 : 0;
      for(k=i-2, jj=j1+1,kk=0; k>=k0; k--,kk++,jj++) {
        if ( (s1 = score_mat[k][jj] + gap_array[kk] ) > best_score1 ){
           best_score1 = s1;
           iden_no1 = iden_mat[k][jj];
        }
      }

      best_score1 += sij;
      iden_no1    += iden_ij;
      score_mat[i][j1] = best_score1;
      iden_mat[i][j1]  = iden_no1;

      if ( best_score1 > best_score ) {
        best_score = best_score1;
        iden_no = iden_no1;
      }
    } // END for (j1=0; j1<band_width; j1++)
  } // END for (i=1; i<len1; i++)

  for (i=0; i<len1; i++) {
    delete [] score_mat[i]; 
    delete [] iden_mat[i];
  }
  delete [] score_mat;
  delete [] iden_mat;

  return OK_FUNC;
} // END int local_band_align


////Local alignment of two sequences restricted to a band of diagonals; same
////recurrence as local_band_align, but additionally reports where the best
////alignment starts and ends.
////for band 0 means direction (0,0) -> (1,1)
////         1 means direction (0,1) -> (1,2)
////        -1 means direction (1,0) -> (2,1)
////
////Inputs
////  iseq1, iseq2  integer-coded sequences (values index mat.matrix)
////  len1, len2    their lengths
////  mat           substitution scores (mat.matrix) and gap penalties
////                (mat.gap_array, indexed by gap length minus one)
////  band_left     lowest diagonal of the band
////  band_right    highest diagonal of the band
////Outputs
////  best_score    highest cell score found (0 if nothing scores above 0)
////  iden_no       number of identical positions on the path to that cell
////  from1, end1   first/last position of the best alignment in sequence 1
////  from2, end2   first/last position of the best alignment in sequence 2
////Returns FAILED_FUNC when the band is empty or does not intersect the
////len1 x len2 matrix (best_score is left untouched in that case), otherwise
////OK_FUNC.
////
////All four position outputs are reset to 0 on entry. Previously only from1
////and from2 were, so end1/end2 kept whatever the caller's variables held
////when the band check failed or when no cell scored above 0.
////
////Storage: only the band is kept. Cell (i, j) lives at [i][j1] with
////j1 = j - i - band_left, so the same j1 in consecutive rows is a diagonal
////step.
////NOTE(review): the border initialisation updates best_score and the
////positions but not iden_no -- confirm this is intended.
int local_band_align2(char iseq1[], char iseq2[], int len1, int len2,
                     AA_MATRIX &mat, int &best_score, int &iden_no,
                     int band_left, int band_right, 
                     int &from1, int &end1, int &from2, int &end2) {
  int i, j, k, j1;
  int jj, kk;
  int best_score1, iden_no1;
  int best_from1, best_from2;
  int *gap_array;
  iden_no = 0; from1=0; from2=0;
  end1=0; end2=0;   // give every output a defined value on every path

  // reject bands lying completely outside the matrix, or inverted bands
  if ( (band_right >= len2 ) ||
       (band_left  <= -len1) ||
       (band_left  > band_right) ) return FAILED_FUNC;

  // allocate mem for score_mat[len1][band_width] etc
  int band_width = band_right - band_left + 1;
  int *(*score_mat), *(*iden_mat);
  int *(*from1_mat), *(*from2_mat);
  if ((score_mat = new int * [len1]) == NULL) bomb_error("Memory");
  if ((iden_mat  = new int * [len1]) == NULL) bomb_error("Memory");
  if ((from1_mat = new int * [len1]) == NULL) bomb_error("Memory");
  if ((from2_mat = new int * [len1]) == NULL) bomb_error("Memory");
  for (i=0; i<len1; i++) {
    if ((score_mat[i] = new int [band_width]) == NULL) bomb_error("Memory");
    if ((iden_mat[i]  = new int [band_width]) == NULL) bomb_error("Memory");
    if ((from1_mat[i] = new int [band_width]) == NULL) bomb_error("Memory");
    if ((from2_mat[i] = new int [band_width]) == NULL) bomb_error("Memory");
  }
  //here index j1 refer to band column
  // only score_mat is cleared; the other matrices are written before being
  // read for any cell whose score is positive
  for (i=0; i<len1; i++) for (j1=0; j1<band_width; j1++) score_mat[i][j1] =  0;

  gap_array  = mat.gap_array;
  best_score = 0;

  if (band_left < 0) {  //set score to left border of the matrix within band
    int tband = (band_right < 0) ? band_right : 0;
    for (k=band_left; k<tband; k++) {
      i = -k;               // diagonal k meets column 0 at row -k
      j1 = k-band_left;
      if ( ( score_mat[i][j1] = mat.matrix[iseq1[i]][iseq2[0]] ) > best_score) {
        best_score = score_mat[i][j1];
        from1 = i; from2 = 0; end1 = i; end2 = 0;
      }
      iden_mat[i][j1] = (iseq1[i] == iseq2[0]) ? 1 : 0;
      from1_mat[i][j1] = i;
      from2_mat[i][j1] = 0;
    }
  }

  if (band_right >=0) { //set score to top border of the matrix within band
    int tband = (band_left > 0) ? band_left : 0;
    for (i=0,j=tband; j<=band_right; j++) {
      j1 = j-band_left;
      if ( ( score_mat[i][j1] = mat.matrix[iseq1[i]][iseq2[j]] ) > best_score) {
        best_score = score_mat[i][j1];
        from1 = i; from2 = j; end1 = i; end2 = j;
      }
      iden_mat[i][j1] = (iseq1[i] == iseq2[j]) ? 1 : 0;
      from1_mat[i][j1] = i;
      from2_mat[i][j1] = j;
    }
  }

  for (i=1; i<len1; i++) {
    for (j1=0; j1<band_width; j1++) {
      j = j1+i+band_left;   // real column of this band cell
      if ( j<1 ) continue;      // column 0 was handled by the border setup
      if ( j>=len2) continue;   // band cell lies outside the matrix

      int sij = mat.matrix[iseq1[i]][iseq2[j]];
      int iden_ij = (iseq1[i] == iseq2[j] ) ? 1 : 0;
      int s1, *mat_row;
      int k0;

      // from (i-1,j-1)
      // a non-positive predecessor is replaced by a fresh start at (i, j)
      if ( (best_score1 = score_mat[i-1][j1] )> 0 ) {
        iden_no1 = iden_mat[i-1][j1];
        best_from1 = from1_mat[i-1][j1];
        best_from2 = from2_mat[i-1][j1];
      }
      else {
        best_score1 = 0;
        iden_no1 = 0;
        best_from1 = i;
        best_from2 = j;
      }

      // from last row
      // predecessors (i-1, j-2), (i-1, j-3), ...: kk+1 columns are skipped;
      // k0 keeps the scan inside the band and at columns >= 0
      mat_row = score_mat[i-1];
      k0 = (-band_left+1-i > 0) ? -band_left+1-i : 0;
      for (k=j1-1, kk=0; k>=k0; k--, kk++) {
        if ( (s1 = mat_row[k] + gap_array[kk] ) > best_score1 ){
           best_score1 = s1;
           iden_no1 = iden_mat[i-1][k];
           best_from1 = from1_mat[i-1][k];
           best_from2 = from2_mat[i-1][k];
        }
      }

      // predecessors (i-2, j-1), (i-3, j-1), ...: kk+1 rows are skipped;
      // moving up one row in the same real column shifts the band column by
      // +1, and k0 stops before that column leaves the band
      k0 = (j-band_right-1 > 0) ? j-band_right-1 : 0;
      for(k=i-2, jj=j1+1,kk=0; k>=k0; k--,kk++,jj++) {
        if ( (s1 = score_mat[k][jj] + gap_array[kk] ) > best_score1 ){
           best_score1 = s1;
           iden_no1 = iden_mat[k][jj];
           best_from1 = from1_mat[k][jj];
           best_from2 = from2_mat[k][jj];
        }
      }

      best_score1 += sij;
      iden_no1    += iden_ij;
      score_mat[i][j1] = best_score1;
      iden_mat[i][j1]  = iden_no1;
      from1_mat[i][j1] = best_from1;
      from2_mat[i][j1] = best_from2;
      if ( best_score1 > best_score ) {
        best_score = best_score1;
        iden_no = iden_no1;
        end1 = i; end2 = j;
        from1 = best_from1; from2 = best_from2;
      }
    } // END for (j1=0; j1<band_width; j1++)
  } // END for (i=1; i<len1; i++)

  for (i=0; i<len1; i++) {
    delete [] score_mat[i]; 
    delete [] iden_mat[i];
    delete [] from1_mat[i];
    delete [] from2_mat[i];
  }
  delete [] score_mat;
  delete [] iden_mat;
  delete [] from1_mat;
  delete [] from2_mat;

  return OK_FUNC;
} // END int local_band_align2


//class function definition
// Letter -> residue index lookup, used as aa2idx[letter - 'A'] for an upper
// case letter. Index order follows the residue alphabet below; B shares the
// index of N (2), Z shares the index of E (6), and J, O, U, X all map to 20.
// setaa_to_na() overwrites this table with na2idx for nucleotide input.
//char aa[] = {"ARNDCQEGHILKMFPSTWYVBZX"};
//{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,2,6,20};
int aa2idx[] = {0, 2, 4, 3, 6, 13,7, 8, 9,20,11,10,12, 2,20,14,
                5, 1,15,16,20,19,17,20,18, 6};
    // idx for  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  P
    //          Q  R  S  T  U  V  W  X  Y  Z
    // so  aa2idx[ X - 'A'] => idx_of_X, eg aa2idx['A' - 'A'] => 0,
    // and aa2idx['M'-'A'] => 12

// BLOSUM62 substitution scores stored as the lower triangle of a symmetric
// matrix, row by row: row r holds r+1 values (columns 0..r). The AA_MATRIX
// loaders read it sequentially and mirror each value into matrix[i][j] and
// matrix[j][i]. Rows are in the order A R N D C Q E G H I L K M F P S T W Y V
// B Z X (23 rows, 276 values).
// NOTE(review): as laid out, row 20 is the B row and X is row 22, while
// aa2idx maps X (and J, O, U) to index 20 and B/Z to 2/6 -- confirm which
// row is meant to score unknown residues.
int BLOSUM62[] = {
  4,                                                                  // A
 -1, 5,                                                               // R
 -2, 0, 6,                                                            // N
 -2,-2, 1, 6,                                                         // D
  0,-3,-3,-3, 9,                                                      // C
 -1, 1, 0, 0,-3, 5,                                                   // Q
 -1, 0, 0, 2,-4, 2, 5,                                                // E
  0,-2, 0,-1,-3,-2,-2, 6,                                             // G
 -2, 0, 1,-1,-3, 0, 0,-2, 8,                                          // H
 -1,-3,-3,-3,-1,-3,-3,-4,-3, 4,                                       // I
 -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,                                    // L
 -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,                                 // K
 -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5,                              // M
 -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,                           // F
 -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,                        // P
  1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4,                     // S
  0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,                  // T
 -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11,               // W
 -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,            // Y
  0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,         // V
 -2,-1, 3, 4,-3, 0, 1,-1, 0,-3,-4, 0,-3,-3,-2, 0,-1,-4,-3,-3, 4,      // B
 -1, 0, 0, 1,-3, 3, 4,-2, 0,-3,-3, 1,-1,-3,-1, 0,-1,-3,-2,-2, 1, 4,   // Z
  0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-1,-1,-1 // X
//A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X
//0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19  2  6 20
};


// Letter -> nucleotide index lookup, used as na2idx[letter - 'A'] for an
// upper case letter: A=0, C=1, G=2, T and U both 3, every other letter 4.
// setaa_to_na() copies this table over aa2idx.
int na2idx[] = {0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4,
                4, 4, 4, 3, 3, 4, 4, 4, 4, 4};
    // idx for  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  P
    //          Q  R  S  T  U  V  W  X  Y  Z
    // so na2idx[ X - 'A'] => idx_of_X, eg na2idx['A' - 'A'] => 0,
    // and na2idx['M'-'A'] => 4
// Nucleotide scoring table in the same lower-triangle, row-by-row layout as
// BLOSUM62; AA_MATRIX::set_to_na() loads it. Rows are labelled A C G T U N
// (6 rows, 21 values): 1 on the diagonal and for T/U, -2 elsewhere.
// NOTE(review): as laid out, row 4 is the U row and N is row 5, while
// na2idx maps U to 3 and N (and every other letter) to 4 -- confirm which
// row is meant to score N.
int BLOSUM62_na[] = {
  1,               // A
 -2, 1,            // C
 -2,-2, 1,         // G
 -2,-2,-2, 1,      // T
 -2,-2,-2, 1, 1,   // U
 -2,-2,-2,-2,-2, 1 // N
//A  C  G  T  U  N
//0  1  2  3  3  4
};

// Switch the global letter lookup to nucleotide mode: all 26 entries of
// aa2idx are overwritten with the corresponding entries of na2idx.
void setaa_to_na() {
  int letter = 0;
  while (letter < 26) {
    aa2idx[letter] = na2idx[letter];
    ++letter;
  }
} // END void setaa_to_na


// Convert a sequence of letters to integer codes in place: each of the first
// len characters is replaced by aa2idx[character - 'A'].
//   seq : buffer holding the letters; overwritten with the codes
//   len : number of characters to convert
// No range check is made on the characters, so they are expected to be
// upper case letters 'A'..'Z' (format_seq produces such input).
// Always returns 0.
int setiseq(char *seq, int len) {
  int pos = 0;
  while (pos < len) {
    const int letter = seq[pos] - 'A';
    seq[pos] = aa2idx[letter];
    ++pos;
  }
  return 0;
} // END int setiseq


/////////////////
// Default construction: protein scoring.
// Sets gap open to -11 and gap extension to -1, allocates gap_array with
// MAX_GAP entries where entry n is gap + n*ext_gap, and fills the symmetric
// score matrix from the lower-triangle BLOSUM62 table.
AA_MATRIX::AA_MATRIX() {
  gap     = -11;
  ext_gap = -1;

  gap_array = new int[MAX_GAP];
  if ( gap_array == NULL ) bomb_error("memory");
  for (int n = 0; n < MAX_GAP; ++n) gap_array[n] = gap + n * ext_gap;

  // the table is stored row by row, row r having r+1 values
  const int *cell = BLOSUM62;
  for (int row = 0; row < MAX_AA; ++row) {
    for (int col = 0; col <= row; ++col, ++cell) {
      matrix[row][col] = *cell;
      matrix[col][row] = matrix[row][col];
    }
  }
} // END AA_MATRIX::AA_MATRIX()


// Reset to the protein defaults without reallocating: gap open -11, gap
// extension -1, gap_array[n] = gap + n*ext_gap for n < MAX_GAP, and the
// score matrix reloaded from the lower-triangle BLOSUM62 table.
void AA_MATRIX::init() {
  gap     = -11;
  ext_gap = -1;
  for (int n = 0; n < MAX_GAP; ++n) gap_array[n] = gap + n * ext_gap;

  // the table is stored row by row, row r having r+1 values
  const int *cell = BLOSUM62;
  for (int row = 0; row < MAX_AA; ++row) {
    for (int col = 0; col <= row; ++col, ++cell) {
      matrix[row][col] = *cell;
      matrix[col][row] = matrix[row][col];
    }
  }
} // END void AA_MATRIX::init()


// Install new gap penalties and rebuild the penalty table.
//   gap1     : gap opening value stored in gap
//   ext_gap1 : gap extension value stored in ext_gap
// Afterwards gap_array[n] = gap + n*ext_gap for every n < MAX_GAP.
void AA_MATRIX::set_gap(int gap1, int ext_gap1) {
  gap     = gap1;
  ext_gap = ext_gap1;

  int n = 0;
  while (n < MAX_GAP) {
    gap_array[n] = gap + n * ext_gap;
    ++n;
  }
} // END void AA_MATRIX::set_gap


// Load a caller-supplied score table into the symmetric matrix.
//   mat1 : lower triangle stored row by row (row r has r+1 values), covering
//          MAX_AA rows; the layout matches the BLOSUM62 table in this file.
void AA_MATRIX::set_matrix(int *mat1) {
  const int *cell = mat1;
  for (int row = 0; row < MAX_AA; ++row) {
    for (int col = 0; col <= row; ++col, ++cell) {
      matrix[row][col] = *cell;
      matrix[col][row] = matrix[row][col];
    }
  }
} // END void AA_MATRIX::set_matrix


// Switch to nucleotide scoring: gap open -6, gap extension -1,
// gap_array[n] = gap + n*ext_gap for n < MAX_GAP, and the first MAX_NA rows
// and columns of the score matrix loaded from the lower-triangle
// BLOSUM62_na table.
void AA_MATRIX::set_to_na() {
  gap     = -6;
  ext_gap = -1;
  for (int n = 0; n < MAX_GAP; ++n) gap_array[n] = gap + n * ext_gap;

  // the table is stored row by row, row r having r+1 values
  const int *cell = BLOSUM62_na;
  for (int row = 0; row < MAX_NA; ++row) {
    for (int col = 0; col <= row; ++col, ++cell) {
      matrix[row][col] = *cell;
      matrix[col][row] = matrix[row][col];
    }
  }
} // END void AA_MATRIX::set_to_na


// Construct an empty, not yet usable table in protein mode (is_aa = 1).
// Only the scalar settings are given values here; the per-word arrays are
// allocated later by init().
IDX_TBL::IDX_TBL(){
  is_aa       = 1;
  buffer_size = 100000;
  mem_size    = 1;
  NAAN        = 0;
  NAA         = 0;
} // END IDX_TBL::IDX_TBL


// Mark the table as nucleotide mode by clearing is_aa. init() consults this
// flag when choosing the per-word growth step, so call this before init().
void IDX_TBL::set_dna() {
  is_aa = 0;
} // END IDX_TBL::set_dna

void IDX_TBL::init(int naa, int naan){
  int i, j, k;
  NAA  = naa;
  NAAN = naan;
  buffer_size = 100000;

  if (is_aa) {
    if      ( NAA == 2 ) { mem_size = 25000; }
    else if ( NAA == 3 ) { mem_size = 1200; }
    else if ( NAA == 4 ) { mem_size = 60; }
    else if ( NAA == 5 ) { mem_size = 3; }
    else bomb_error("Something wrong!");
  }
  else {
    if      ( NAA == 2 ) { mem_size = 250000; }
    else if ( NAA == 3 ) { mem_size = 50000; }
    else if ( NAA == 4 ) { mem_size = 10000; }
    else if ( NAA == 5 ) { mem_size = 2000; }
    else if ( NAA == 6 ) { mem_size = 350; }
    else if ( NAA == 7 ) { mem_size = 75; }
    else if ( NAA == 8 ) { mem_size = 15; }
    else if ( NAA == 9 ) { mem_size = 3; }
    else if ( NAA ==10 ) { mem_size = 2; }
    else bomb_error("Something wrong!");
  }

  if ((size     = new int[NAAN])        == NULL) bomb_error("Memory");
  if ((capacity = new int[NAAN])        == NULL) bomb_error("Memory");
  if ((seq_idx  = new int*[NAAN])       == NULL) bomb_error("Memory");
  if ((word_no  = new INTs*[NAAN])      == NULL) bomb_error("Memory");
  if ((buffer   = new int[buffer_size]) == NULL) bomb_error("Memory");

  for (i=0; i<NAAN; i++) {
    size[i]     = 0;
    capacity[i] = 0;
  }

} // END IDX_TBL::init


// Empty every per-word list by resetting its size to 0. The arrays and their
// capacity counters are left as they are, so the storage is reused by later
// insertions.
void IDX_TBL::clean() {
  int word = 0;
  while (word < NAAN) {
    size[word] = 0;
    ++word;
  }
} // END IDX_TBL::clean


// Replace the table contents with the data stored in a swap file written by
// write_tbl().
//   filename : path of the swap file
// File layout, repeated for each of the NAAN word codes: one int holding the
// list length, followed (when the length is non-zero) by that many ints of
// seq_idx and that many INTs of word_no.
// Aborts through bomb_error when the file cannot be opened, when a read
// fails or comes up short, or when a stored length is negative.
// Returns OK_FUNC.
//
// Existing per-word arrays are released whenever capacity[i] > 0. Testing
// size[i] instead would leak them after clean() or pop_long_seqs() has
// reduced the size to 0 while the arrays are still allocated; the rest of
// this class (add_word_list) already treats capacity > 0 as "allocated".
// NOTE(review): the stream is opened in text mode although the content is
// binary; switching to binary mode would have to be done together with
// write_tbl().
int IDX_TBL::read_tbl(char *filename) {
  int i;

  ifstream fswap(filename);
  if (! fswap) bomb_error("Can not open ", filename);

  for (i=0; i<NAAN; i++) {
    if ( capacity[i] > 0 ) {
      delete [] seq_idx[i];
      delete [] word_no[i];
    }
    // keep the counters consistent with the freed arrays until new data is in
    size[i]     = 0;
    capacity[i] = 0;

    int count = 0;
    fswap.read((char *) &count, sizeof(int));
    if ( !fswap || count < 0 ) bomb_error("Can not read ", filename);

    size[i]     = count;
    capacity[i] = count;
    if (count == 0 ) continue;
    if ((seq_idx[i] = new int[count])  == NULL) bomb_error("Memory");
    if ((word_no[i] = new INTs[count]) == NULL) bomb_error("Memory");

    fswap.read((char *) seq_idx[i], sizeof(int) * count);
    fswap.read((char *) word_no[i], sizeof(INTs) * count);
    if ( !fswap ) bomb_error("Can not read ", filename);
  }

  fswap.close();
  return OK_FUNC;

} // END int IDX_TBL::read_tbl


// Dump the table to a swap file that read_tbl() can load again.
//   filename : path of the file to create or overwrite
// File layout, repeated for each of the NAAN word codes: one int holding the
// list length, followed (when the length is non-zero) by that many ints of
// seq_idx and that many INTs of word_no.
// Aborts through bomb_error when the file cannot be opened or when any write
// (or the final flush on close) fails, instead of silently leaving a
// truncated file behind.
// Returns OK_FUNC.
// NOTE(review): the stream is opened in text mode although the content is
// binary; switching to binary mode would have to be done together with
// read_tbl().
int IDX_TBL::write_tbl(char *filename) {
  int i;
  ofstream fswap(filename);

  if (! fswap) bomb_error("Can not open ", filename);

  for (i=0; i<NAAN; i++) {
    fswap.write ((char *) &size[i], sizeof(int));
    if (size[i] == 0 ) continue;
    fswap.write((char *) seq_idx[i], sizeof(int)  * size[i]);
    fswap.write((char *) word_no[i], sizeof(INTs) * size[i]);
  }
  fswap.close();
  // a failed write or flush leaves an error flag set on the stream
  if (! fswap) bomb_error("Can not write ", filename);
  return OK_FUNC;

} // END int IDX_TBL::write_tbl


int IDX_TBL::add_word_list(int aan_no, int *aan_list, 
                           INTs *aan_list_no, int idx) {
  int i, j, k, i1, j1, k1, i0, j0, k0;

  for (j0=0; j0<aan_no; j0++) {
    if ( j1=aan_list_no[j0] ) {
      j = aan_list[j0];

      if ( size[j] == capacity[j] ) { // resize array
        if ( capacity[j] > buffer_size ) {
           delete [] buffer;
           buffer_size = capacity[j];
           if ((buffer = new int[buffer_size]) == NULL) bomb_error("Memory");
        }

        for (k=0; k<size[j]; k++) buffer[k] = seq_idx[j][k];
        if ( capacity[j] >0 ) delete [] seq_idx[j];
        if ((seq_idx[j] = new int[mem_size+capacity[j]]) == NULL)
          bomb_error("Memory");
        for (k=0; k<size[j]; k++) seq_idx[j][k] = buffer[k];

        for (k=0; k<size[j]; k++) buffer[k] = word_no[j][k];
        if ( capacity[j] >0 ) delete [] word_no[j];
        if ((word_no[j] = new INTs[mem_size+capacity[j]]) == NULL)
          bomb_error("Memory");
        for (k=0; k<size[j]; k++) word_no[j][k] = buffer[k];

        capacity[j] += mem_size;
      }
      seq_idx[j][size[j]] = idx;
      word_no[j][size[j]] = j1;
      size[j]++;
    }
  } //  for (j0=0; j0<aan_no; j0++) {

  return OK_FUNC;
} // END int IDX_TBL::add_word_list


// copied from above with only diff if j < 0 ...
int IDX_TBL::add_word_list2(int aan_no, int *aan_list, 
                           INTs *aan_list_no, int idx) {
  int i, j, k, i1, j1, k1, i0, j0, k0;

  for (j0=0; j0<aan_no; j0++) {
    if ( j1=aan_list_no[j0] ) {
      j = aan_list[j0];
      if (j<0) continue; // for those has 'N'
      if ( size[j] == capacity[j] ) { // resize array
        if ( capacity[j] > buffer_size ) {
           delete [] buffer;
           buffer_size = capacity[j];
           if ((buffer = new int[buffer_size]) == NULL) bomb_error("Memory");
        }

        for (k=0; k<size[j]; k++) buffer[k] = seq_idx[j][k];
        if ( capacity[j] >0 ) delete [] seq_idx[j];
        if ((seq_idx[j] = new int[mem_size+capacity[j]]) == NULL)
          bomb_error("Memory");
        for (k=0; k<size[j]; k++) seq_idx[j][k] = buffer[k];

        for (k=0; k<size[j]; k++) buffer[k] = word_no[j][k];
        if ( capacity[j] >0 ) delete [] word_no[j];
        if ((word_no[j] = new INTs[mem_size+capacity[j]]) == NULL)
          bomb_error("Memory");
        for (k=0; k<size[j]; k++) word_no[j][k] = buffer[k];

        capacity[j] += mem_size;
      }
      seq_idx[j][size[j]] = idx;
      word_no[j][size[j]] = j1;
      size[j]++;
    }
  } //  for (j0=0; j0<aan_no; j0++) {

  return OK_FUNC;
} // END int IDX_TBL::add_word_list2


int IDX_TBL::count_word_no(int aan_no, int *aan_list,
                           INTs *aan_list_no, INTs *look_and_count) {
  int  i, j, k, j0, j1, k1;
  int  *ptr1;
  INTs *ptr2;

  for (j0=0; j0<aan_no; j0++) {
    if ( j1=aan_list_no[j0] ) {
      j = aan_list[j0];
      k1 = size[j];
      ptr1 = seq_idx[j];
      ptr2 = word_no[j];
      for (k=0; k<k1; k++)
        look_and_count[ptr1[k]] += ( j1 < ptr2[k]) ? j1 : ptr2[k] ;
    }
  }

  return OK_FUNC;
} // END int IDX_TBL::count_word_no


// copied from above with only diff if j < 0 ...
int IDX_TBL::count_word_no2(int aan_no, int *aan_list,
                           INTs *aan_list_no, INTs *look_and_count) {
  int  i, j, k, j0, j1, k1;
  int  *ptr1;
  INTs *ptr2;

  for (j0=0; j0<aan_no; j0++) {
    if ( j1=aan_list_no[j0] ) {
      j = aan_list[j0];
      if (j<0) continue; // if met short word has 'N'
      k1 = size[j];
      ptr1 = seq_idx[j];
      ptr2 = word_no[j];
      for (k=0; k<k1; k++)
        look_and_count[ptr1[k]] += ( j1 < ptr2[k]) ? j1 : ptr2[k] ;
    }
  }
                                                                                
  return OK_FUNC;
} // END int IDX_TBL::count_word_no2


// remove seqs whose index is before upper_bound
// those seqs are longer than the seq at upper_bound
// 
int IDX_TBL::pop_long_seqs(int upper_bound) {
  int i, j, k, i1, j1, k1, i0, j0, k0;

  for (i=0; i<NAAN; i++) {
    if ( size[i] == 0 ) continue;

    k = 0;
    for (j=0; j<size[i]; j++) {
      if (seq_idx[i][j] < upper_bound) continue;
      seq_idx[i][k] = seq_idx[i][j];
      word_no[i][k] = word_no[i][j];
      k++;
    }
    size[i] = k;
    //capacity[i] remain unchanged, 
  }
  return OK_FUNC;
} // END int IDX_TBL::add_word_list2


// Print the command-line help for the single-database clustering program to
// stdout, followed by both citations, then terminate with exit status 1.
//   arg : program name shown on the "Usage" line
// Never returns to the caller; the trailing return statement only exists
// because the function is declared to return int.
// The help text previously misspelled "drive" as "deive".
int print_usage (char *arg) {
  cout << "Usage "<< arg << " [Options] \n\nOptions\n\n";
  cout << "    -i input filename in fasta format, required\n";
  cout << "    -o output filename, required\n";
  cout << "    -c sequence identity threshold, default 0.9\n";
  cout << "       the sequence identity is calculated as :\n";
  cout << "       Number of identical amino acids in alignment\n";
  cout << "       divided by full length of shorter sequence\n";
  cout << "    -b band_width of alignment, default 20\n";
  cout << "    -M max available memory, default 400 (Mbyte) \n";
  cout << "    -n word_length, default 5\n";
  cout << "    -l length of throw_away_sequences, default 10\n";
  cout << "    -d length of description in .clstr file, default 20\n";
  cout << "    -t tolerance for redundance, default 2\n";
  cout << "    -u filename of an old dbname.clstr\n";
  cout << "       for incremental update, if an old NR clustered\n";
  cout << "       at 90% and yielded NR90 and NR90.clstr,\n";
  cout << "       to cluster a new NR at 90%, use -u NR90.clstr\n";
  cout << "    -s length difference cutoff, default 0.0\n";
  cout << "       if set to 0.9, the shorter sequences need to be\n";
  cout << "       at least 90% long of longest one in same cluster\n";
  cout << "    -S length difference cutoff in amino acid, default 999999\n";
  cout << "       if set to 60, the length difference\n";
  cout << "       within a cluster will < 60 amino acid\n";
  cout << "    -B 1 or 0, default 0, by default, sequences are stored in RAM\n";
  cout << "       if set to 1, sequence are stored on hard drive\n";
  cout << "    -p 1 or 0, default 0\n";
  cout << "       if set to 1, print alignment overlap in .clstr file\n";
  cout << "    -h print this help\n\n";
  cout << "    Questions, bugs, contact Weizhong Li at liwz@sdsc.edu\n\n";
  cout << "    If you find cd-hit useful, please kindly cite:\n\n";
  cout << "    " << cd_hit_ref1 << "\n";
  cout << "    " << cd_hit_ref2 << "\n\n\n";
  exit(1);
  return 0;   // not reached
} // END print_usage



// Print the command-line help for the two-database program (inputs -i and
// -i2, plus the -s2/-S2 cutoffs) to stdout, followed by both citations, then
// terminate with exit status 1.
//   arg : program name shown on the "Usage" line
// Never returns to the caller; the trailing return statement only exists
// because the function is declared to return int.
// The help text previously misspelled "drive" as "deive".
int print_usage_2d (char *arg) {
  cout << "Usage "<< arg << " [Options] \n\nOptions\n\n";
  cout << "    -i input filename for db1 in fasta format, required\n";
  cout << "    -i2 input filename for db2 in fasta format, required\n";
  cout << "    -o output filename, required\n";
  cout << "    -c sequence identity threshold, default 0.9\n";
  cout << "       the sequence identity is calculated as :\n";
  cout << "       Number of identical amino acids in alignment\n";
  cout << "       divided by full length of shorter sequence\n";
  cout << "    -b band_width of alignment, default 20\n";
  cout << "    -M max available memory, default 400 (Mbyte) \n";
  cout << "    -n word_length, default 5\n";
  cout << "    -l length of throw_away_sequences, default 10\n";
  cout << "    -d length of description in .clstr file, default 20\n";
  cout << "    -t tolerance for redundance, default 2\n";
  cout << "    -u filename of an old dbname.clstr\n";
  cout << "       for incremental update, if an old NR clustered\n";
  cout << "       at 90% and yielded NR90 and NR90.clstr,\n";
  cout << "       to cluster a new NR at 90%, use -u NR90.clstr\n";
  cout << "    -s length difference cutoff, default 0.0\n";
  cout << "       if set to 0.9, the shorter sequences need to be\n";
  cout << "       at least 90% long of longest one in same cluster\n";
  cout << "    -S length difference cutoff in amino acid, default 999999\n";
  cout << "       if set to 60, the length difference\n";
  cout << "       within a cluster will < 60 amino acid\n";
  cout << "    -s2 length difference cutoff for db1, default 1.0\n";
  cout << "       by default, seqs in db1 >= seqs in db2 in a same cluster\n";
  cout << "       if set to 0.9, seqs in db1 may just >= 90% seqs in db2\n";
  cout << "    -S2 length difference cutoff, default 0\n";
  cout << "       by default, seqs in db1 >= seqs in db2 in a same cluster\n";
  cout << "       if set to 60, seqs in db2 may 60aa longer than seqs in db1\n";
  cout << "    -B 1 or 0, default 0, by default, sequences are stored in RAM\n";
  cout << "       if set to 1, sequence are stored on hard drive\n";
  cout << "    -p 1 or 0, default 0\n";
  cout << "       if set to 1, print alignment overlap in .clstr file\n";
  cout << "    -h print this help\n\n";
  cout << "    Questions, bugs, contact Weizhong Li at liwz@sdsc.edu\n\n";
  cout << "    If you find cd-hit useful, please kindly cite:\n\n";
  cout << "    " << cd_hit_ref1 << "\n";
  cout << "    " << cd_hit_ref2 << "\n\n\n";
  exit(1);
  return 0;   // not reached
} // END print_usage_2d


// Print the option summary of the EST variant (the one with the -r strand
// option) to stdout and terminate.
//   arg : program name shown on the "Usage" line
// Does not return to the caller: the function always ends through exit(1).
int print_usage_est (char *arg) {
  cout << "Usage "<< arg << " [Options] \n\nOptions\n\n";
  cout << "    -i input filename in fasta format, required\n";
  cout << "    -o output filename, required\n";
  cout << "    -c sequence identity threshold, default 0.9\n";
  cout << "       the sequence identity is calculated as :\n";
  cout << "       Number of identical amino acids in alignment\n";
  cout << "       divided by full length of shorter sequence\n";
  cout << "    -b band_width of alignment, default 20\n";
  cout << "    -M max available memory, default 400 (Mbyte) \n";
  cout << "    -n word_length, default 8\n";
  cout << "    -l length of throw_away_sequences, default 10\n";
  cout << "    -d length of description in .clstr file, default 20\n";
  cout << "    -s length difference cutoff, default 0.0\n";
  cout << "       if set to 0.9, the shorter sequences need to be\n";
  cout << "       at least 90% long of longest one in same cluster\n";
  cout << "    -S length difference cutoff in amino acid, default 999999\n";
  cout << "       if set to 60, the length difference\n";
  cout << "       within a cluster will < 60 amino acid\n";
  cout << "    -B 1 or 0, default 0, by default, sequences are stored in RAM\n";
  // fixed typo in the help text: "deive" -> "drive"
  cout << "       if set to 1, sequence are stored on hard drive\n";
  cout << "    -p 1 or 0, default 0\n";
  cout << "       if set to 1, print alignment overlap in .clstr file\n";
  cout << "    -r 1 or 0, default 0, by default only +/+ strand alignment\n";
  cout << "       if set to 1, do both +/+ & +/- alignments" << endl;
  cout << "    -h print this help\n\n";
  cout << "    Questions, bugs, contact Weizhong Li at liwz@sdsc.edu\n\n";
  cout << "    If you find cd-hit useful, please kindly cite:\n\n";
  cout << "    " << cd_hit_ref1 << "\n";
  exit(1);
} // END print_usage_est


// Print the option summary of the two-database EST variant (takes -i and
// -i2, plus the -r strand option) to stdout and terminate.
//   arg : program name shown on the "Usage" line
// Does not return to the caller: the function always ends through exit(1).
int print_usage_est_2d (char *arg) {
  cout << "Usage "<< arg << " [Options] \n\nOptions\n\n";
  cout << "    -i input filename for db1 in fasta format, required\n";
  cout << "    -i2 input filename for db2 in fasta format, required\n";
  cout << "    -o output filename, required\n";
  cout << "    -c sequence identity threshold, default 0.9\n";
  cout << "       the sequence identity is calculated as :\n";
  cout << "       Number of identical amino acids in alignment\n";
  cout << "       divided by full length of shorter sequence\n";
  cout << "    -b band_width of alignment, default 20\n";
  cout << "    -M max available memory, default 400 (Mbyte) \n";
  cout << "    -n word_length, default 8\n";
  cout << "    -l length of throw_away_sequences, default 10\n";
  cout << "    -d length of description in .clstr file, default 20\n";
  cout << "    -s length difference cutoff, default 0.0\n";
  cout << "       if set to 0.9, the shorter sequences need to be\n";
  cout << "       at least 90% long of longest one in same cluster\n";
  cout << "    -S length difference cutoff in amino acid, default 999999\n";
  cout << "       if set to 60, the length difference\n";
  cout << "       within a cluster will < 60 amino acid\n";
  cout << "    -s2 length difference cutoff for db1, default 1.0\n";
  cout << "       by default, seqs in db1 >= seqs in db2 in a same cluster\n";
  cout << "       if set to 0.9, seqs in db1 may just >= 90% seqs in db2\n";
  cout << "    -S2 length difference cutoff, default 0\n";
  cout << "       by default, seqs in db1 >= seqs in db2 in a same cluster\n";
  cout << "       if set to 60, seqs in db2 may 60aa longer than seqs in db1\n";
  cout << "    -B 1 or 0, default 0, by default, sequences are stored in RAM\n";
  // fixed typo in the help text: "deive" -> "drive"
  cout << "       if set to 1, sequence are stored on hard drive\n";
  cout << "    -p 1 or 0, default 0\n";
  cout << "       if set to 1, print alignment overlap in .clstr file\n";
  cout << "    -r 1 or 0, default 0, by default only +/+ strand alignment\n";
  cout << "       if set to 1, do both +/+ & +/- alignments" << endl;
  cout << "    -h print this help\n\n";
  cout << "    Questions, bugs, contact Weizhong Li at liwz@sdsc.edu\n\n";
  cout << "    If you find cd-hit useful, please kindly cite:\n\n";
  cout << "    " << cd_hit_ref1 << "\n";
  exit(1);
} // END print_usage_est_2d


// Print the option summary of the database-divide tool to stdout and
// terminate.
//   arg : program name shown on the "Usage" line
// Does not return to the caller: the function always ends through exit(1).
int print_usage_div (char *arg) {
  // The usage line already ends with the "Options" heading; the heading
  // used to be printed a second time right after it.
  cout << "Usage "<< arg << " [Options] \n\nOptions\n\n";
  cout << "    -i in_dbname, required" << endl;
  cout << "    -o out_dbname, required" << endl;
  cout << "    -div number of divide, required " << endl;
  cout << "    -f, fast divide " << endl;
  cout << "    -dbmax max size of your db\n\n\n";
  exit(1);
} // END print_usage_div



// Count the records of a FASTA stream: every '>' that is the first
// character of the stream or directly follows a newline counts as one.
//   in1 : input stream, read from its current position to the end;
//         it is not closed and is left in its end-of-file state
// Returns the number of records found (0 for an empty or unreadable stream).
int db_seq_no_test(ifstream &in1) {
  char c0 = '\n';   // previous character; a leading '>' counts as line start
  char c1;
  int no = 0;

  // Loop on the result of the read itself: testing eof() before reading
  // never terminates on a stream that failed without reaching end of file,
  // and it inspects c1 after a read that stored nothing.
  while ( in1.get(c1) ) {
    if ( c1 == '>' && c0 == '\n') no++;
    c0 = c1;
  }
  return no;
}


// Count the lines of a .clstr stream that do not start with '>'.
//   in1 : input stream, read from its current position to the end and
//         closed before returning
// Returns the number of such lines; empty lines are counted too.
int old_clstr_seq_no_test(ifstream &in1) {
  char c0 = '\n';   // previous character; the first character starts a line
  char c1;
  int no = 0;

  // Loop on the result of the read itself. The former eof()-first loop
  // re-tested the stale last character after the final failed read, which
  // added one to the count whenever the file ended with a newline, and it
  // never terminated on a stream that failed without reaching end of file.
  while ( in1.get(c1) ) {
    if ( c1 != '>' && c0 == '\n') no++;
    c0 = c1;
  }
  in1.close();
  return no;
}



int db_read_in_old (ifstream &in1, int length_of_throw, 
                int & NR_no, char *NR_seq[], int *NR_len) {

  char raw_seq[MAX_SEQ], raw_des[MAX_DES], raw_seq1[MAX_SEQ];
  char buffer1[MAX_LINE_SIZE];
  raw_seq[0] = raw_des[0] = buffer1[0] = 0;
  int read_in = 0;

  NR_no = 0;
  while(1) {
    if ( in1.eof()) break;
    in1.getline(buffer1, MAX_LINE_SIZE-2, '\n');

    if ( buffer1[0] == '>') {
      if ( read_in ) { // write previous record
         format_seq(raw_seq);

         if ( strlen(raw_seq) > length_of_throw ) {
//           if (strlen(raw_seq) > 32766) raw_seq[32766]=0; // temp
           if ( (NR_seq[NR_no] = new char[strlen(raw_seq)+2] ) == NULL )
             bomb_error("memory");
           strcpy( NR_seq[NR_no], raw_seq);
           NR_len[NR_no] = strlen(raw_seq);
           NR_no++;
         }
      }
      strncpy(raw_des, buffer1, MAX_DES-2);
      raw_seq[0] = 0;
    }
    else {
      read_in = 1;
      if ( strlen(raw_seq)+strlen(buffer1) >= MAX_SEQ-1 )
        bomb_error("Long sequence found, enlarge Macro MAX_SEQ");
      strcat(raw_seq, buffer1);
    }
  } // END while(1);

  if (1) { // the last record
    format_seq(raw_seq);

    if ( strlen(raw_seq) > length_of_throw ) {
      if ( (NR_seq[NR_no] = new char[strlen(raw_seq)+2] ) == NULL )
        bomb_error("memory");
      strcpy( NR_seq[NR_no], raw_seq);
      NR_len[NR_no] = strlen(raw_seq);
      NR_no++;
    }
  }
  in1.close();
  return 0;
} // END db_read_in_old


int db_read_in (ifstream &in1, char *db_bin_swap, 
                int seq_swap, int length_of_throw, 
                int & NR_no, char *NR_seq[], int *NR_len) {

  char raw_seq[MAX_SEQ], raw_des[MAX_DES], raw_seq1[MAX_SEQ];
  char buffer1[MAX_LINE_SIZE];
  raw_seq[0] = raw_des[0] = buffer1[0] = 0;
  int read_in = 0;

  ofstream bindb[16];
  int bin_no = 0;
  int total_letter_bin = 0;
  char db_bin_swap_over[MAX_FILE_NAME];
  if (seq_swap) {
    bindb[bin_no].open(db_bin_swap);
    if (! bindb[bin_no]) bomb_error("Can not open", db_bin_swap);
  }
  int jj = -1;

  NR_no = 0;
  while(1) {
    if ( in1.eof()) break;
    in1.getline(buffer1, MAX_LINE_SIZE-2, '\n');

    if ( buffer1[0] == '>') {
      if ( read_in ) { // write previous record
         format_seq(raw_seq);

         if ( strlen(raw_seq) > length_of_throw ) {
           NR_len[NR_no] = strlen(raw_seq);
           if (seq_swap) {
            setiseq(raw_seq, NR_len[NR_no]);
            total_letter_bin += sizeof(int) + NR_len[NR_no];
            // so that size of file < MAX_BIN_SWAP about 2GB
            if ( total_letter_bin >= MAX_BIN_SWAP) {
              bindb[bin_no].write((char *) &jj, sizeof(int)); // signal
              bindb[bin_no].close();
              sprintf(db_bin_swap_over, "%s.%d",db_bin_swap,++bin_no);
              bindb[bin_no].open(db_bin_swap_over);
              if (! bindb[bin_no]) bomb_error("Can not open", db_bin_swap_over);
              total_letter_bin = 0;
            }
            bindb[bin_no].write((char *) &NR_len[NR_no], sizeof(int));
            bindb[bin_no].write(raw_seq, NR_len[NR_no]);
           }
           else {
             if ( (NR_seq[NR_no] = new char[strlen(raw_seq)+2] ) == NULL )
               bomb_error("memory");
             strcpy( NR_seq[NR_no], raw_seq);
           }
           NR_no++;
         }
      }
      strncpy(raw_des, buffer1, MAX_DES-2);
      raw_seq[0] = 0;
    }
    else {
      read_in = 1;
      if ( strlen(raw_seq)+strlen(buffer1) >= MAX_SEQ-1 )
        bomb_error("Long sequence found, enlarge Macro MAX_SEQ");
      strcat(raw_seq, buffer1);
    }
  } // END while(1);

  if (1) { // the last record
    format_seq(raw_seq);

    if ( strlen(raw_seq) > length_of_throw ) {

      NR_len[NR_no] = strlen(raw_seq);
      if (seq_swap) {
       setiseq(raw_seq, NR_len[NR_no]);
       bindb[bin_no].write((char *) &NR_len[NR_no], sizeof(int));
       bindb[bin_no].write(raw_seq, NR_len[NR_no]);

      }
      else {
        if ( (NR_seq[NR_no] = new char[strlen(raw_seq)+2] ) == NULL )
          bomb_error("memory");
        strcpy( NR_seq[NR_no], raw_seq);
      }
      NR_no++;
    }
  }
  in1.close();
  if (seq_swap) bindb[bin_no].close();
  
  return 0;
} // END db_read_in


// modified from above, but only readin length
int db_read_in_len (ifstream &in1, int length_of_throw, 
                int & NR_no, int *NR_len) {

  char raw_seq[MAX_SEQ], raw_des[MAX_DES], raw_seq1[MAX_SEQ];
  char buffer1[MAX_LINE_SIZE];
  raw_seq[0] = raw_des[0] = buffer1[0] = 0;
  int read_in = 0;

  NR_no = 0;
  while(1) {
    if ( in1.eof()) break;
    in1.getline(buffer1, MAX_LINE_SIZE-2, '\n');

    if ( buffer1[0] == '>') {
      if ( read_in ) { // write previous record
         format_seq(raw_seq);
         if ( strlen(raw_seq) > length_of_throw ) {
           NR_len[NR_no] = strlen(raw_seq);
           NR_no++;
         }
      }
      strncpy(raw_des, buffer1, MAX_DES-2);
      raw_seq[0] = 0;
    }
    else {
      read_in = 1;
      if ( strlen(raw_seq)+strlen(buffer1) >= MAX_SEQ-1 )
        bomb_error("Long sequence found, enlarge Macro MAX_SEQ");
      strcat(raw_seq, buffer1);
    }
  } // END while(1);

  if (1) { // the last record
    format_seq(raw_seq);
    if ( strlen(raw_seq) > length_of_throw ) {
      NR_len[NR_no] = strlen(raw_seq);
      NR_no++;
    }
  }
  in1.close();
  
  return 0;
} // END db_read_in_len


// modified from above, but skip length_of_throw and format_seq
int db_read_in_lenf (ifstream &in1, int & NR_no, int *NR_len) {

  char raw_seq[MAX_SEQ], raw_des[MAX_DES], raw_seq1[MAX_SEQ];
  char buffer1[MAX_LINE_SIZE];
  raw_seq[0] = raw_des[0] = buffer1[0] = 0;
  int read_in = 0;

  NR_no = 0;
  while(1) {
    if ( in1.eof()) break;
    in1.getline(buffer1, MAX_LINE_SIZE-2, '\n');

    if ( buffer1[0] == '>') {
      if ( read_in ) { // write previous record
         NR_len[NR_no] = strlen(raw_seq);
         NR_no++;
      }
      strncpy(raw_des, buffer1, MAX_DES-2);
      raw_seq[0] = 0;
    }
    else {
      read_in = 1;
      if ( strlen(raw_seq)+strlen(buffer1) >= MAX_SEQ-1 )
        bomb_error("Long sequence found, enlarge Macro MAX_SEQ");
      strcat(raw_seq, buffer1);
    }
  } // END while(1);

  if (1) { // the last record
    NR_len[NR_no] = strlen(raw_seq);
    NR_no++;
  }
  in1.close();
  
  return 0;
} // END db_read_in_len

int db_read_in2_old(ifstream &in1, int length_of_throw, 
                int & NR_no, char *NR_seq[], int *NR_len,
                int NRo_no, int * NRo_idx,
                int * NRo_id1, int * NRo_id2, int * NRo_NR_idx) {

  int i, j, k;
  char raw_seq[MAX_SEQ], raw_des[MAX_DES], raw_seq1[MAX_SEQ];
  char buffer1[MAX_LINE_SIZE];
  raw_seq[0] = raw_des[0] = buffer1[0] = 0;
  int read_in = 0;
  int id1, id2;

  NR_no = 0;
  while(1) {
    if ( in1.eof()) break;
    in1.getline(buffer1, MAX_LINE_SIZE-2, '\n');

    if ( buffer1[0] == '>' || buffer1[0] == ';') {
      if ( read_in ) { // write last record
         format_seq(raw_seq);

         if ( strlen(raw_seq) > length_of_throw ) {
           if ( (NR_seq[NR_no] = new char[strlen(raw_seq)+2] ) == NULL )
             bomb_error("memory");
           strcpy( NR_seq[NR_no], raw_seq);
           NR_len[NR_no] = strlen(raw_seq);
           des_to_idx(id1, id2, raw_des);
           i = get_index_of_2_sorted_list(NRo_id1, NRo_id2, 0, NRo_no-1, id1, id2);
           if ( i != -1 ) NRo_NR_idx[ NRo_idx[i] ] = NR_no;
           NR_no++;
         }
      }
      strncpy(raw_des, buffer1, MAX_DES-2);
      raw_seq[0] = 0;
    }
    else {
      read_in = 1;
      if ( strlen(raw_seq)+strlen(buffer1) >= MAX_SEQ-1 )
        bomb_error("Long sequence found, enlarge Macro MAX_SEQ");
      strcat(raw_seq, buffer1);
    }
  } // END while(1);

  if (1) { // the last record
    format_seq(raw_seq);

    if ( strlen(raw_seq) > length_of_throw ) {
      if ( (NR_seq[NR_no] = new char[strlen(raw_seq)+2] ) == NULL )
        bomb_error("memory");
      strcpy( NR_seq[NR_no], raw_seq);
      NR_len[NR_no] = strlen(raw_seq);
      des_to_idx(id1, id2, raw_des);
      i = get_index_of_2_sorted_list(NRo_id1, NRo_id2, 0, NRo_no-1, id1, id2);
      if ( i != -1 ) NRo_NR_idx[ NRo_idx[i] ] = NR_no;
      NR_no++;
    }
  }
  in1.close();
  return 0;
} // END db_read_in2_old


int db_read_in2(ifstream &in1, char *db_bin_swap, int seq_swap,
                int length_of_throw, 
                int & NR_no, char *NR_seq[], int *NR_len,
                int NRo_no, int * NRo_idx,
                int * NRo_id1, int * NRo_id2, int * NRo_NR_idx) {

  int i, j, k;
  char raw_seq[MAX_SEQ], raw_des[MAX_DES], raw_seq1[MAX_SEQ];
  char buffer1[MAX_LINE_SIZE];
  raw_seq[0] = raw_des[0] = buffer1[0] = 0;
  int read_in = 0;
  int id1, id2;

  ofstream bindb[16];
  int bin_no = 0;
  int total_letter_bin = 0;
  char db_bin_swap_over[MAX_FILE_NAME];
  if (seq_swap) {
    bindb[bin_no].open(db_bin_swap);
    if (! bindb[bin_no]) bomb_error("Can not open", db_bin_swap);
  }
  int jj = -1;

  NR_no = 0;
  while(1) {
    if ( in1.eof()) break;
    in1.getline(buffer1, MAX_LINE_SIZE-2, '\n');

    if ( buffer1[0] == '>' || buffer1[0] == ';') {
      if ( read_in ) { // write last record
         format_seq(raw_seq);

         if ( strlen(raw_seq) > length_of_throw ) {
           NR_len[NR_no] = strlen(raw_seq);
           if (seq_swap) {
            setiseq(raw_seq, NR_len[NR_no]);
            total_letter_bin += sizeof(int) + NR_len[NR_no];
            // so that size of file < MAX_BIN_SWAP about 2GB
            if ( total_letter_bin >= MAX_BIN_SWAP) {
              bindb[bin_no].write((char *) &jj, sizeof(int)); // signal
              bindb[bin_no].close();
              sprintf(db_bin_swap_over, "%s.%d",db_bin_swap,++bin_no);
              bindb[bin_no].open(db_bin_swap_over);
              if (! bindb[bin_no]) bomb_error("Can not open", db_bin_swap_over);
              total_letter_bin = 0;
            }
            bindb[bin_no].write((char *) &NR_len[NR_no], sizeof(int));
            bindb[bin_no].write(raw_seq, NR_len[NR_no]);
           }
           else {
             if ( (NR_seq[NR_no] = new char[strlen(raw_seq)+2] ) == NULL )
               bomb_error("memory");
             strcpy( NR_seq[NR_no], raw_seq);
           }
           des_to_idx(id1, id2, raw_des);
           i = get_index_of_2_sorted_list(NRo_id1, NRo_id2, 0, 
                                          NRo_no-1, id1, id2);
           if ( i != -1 ) NRo_NR_idx[ NRo_idx[i] ] = NR_no;
           NR_no++;
         }
      }
      strncpy(raw_des, buffer1, MAX_DES-2);
      raw_seq[0] = 0;
    }
    else {
      read_in = 1;
      if ( strlen(raw_seq)+strlen(buffer1) >= MAX_SEQ-1 )
        bomb_error("Long sequence found, enlarge Macro MAX_SEQ");
      strcat(raw_seq, buffer1);
    }
  } // END while(1);

  if (1) { // the last record
    format_seq(raw_seq);

    if ( strlen(raw_seq) > length_of_throw ) {
      NR_len[NR_no] = strlen(raw_seq);
      if (seq_swap) {
       setiseq(raw_seq, NR_len[NR_no]);
       bindb[bin_no].write((char *) &NR_len[NR_no], sizeof(int));
       bindb[bin_no].write(raw_seq, NR_len[NR_no]);
      }
      else {
        if ( (NR_seq[NR_no] = new char[strlen(raw_seq)+2] ) == NULL )
          bomb_error("memory");
        strcpy( NR_seq[NR_no], raw_seq);
      }
      des_to_idx(id1, id2, raw_des);
      i = get_index_of_2_sorted_list(NRo_id1, NRo_id2, 0, 
                                     NRo_no-1, id1, id2);
      if ( i != -1 ) NRo_NR_idx[ NRo_idx[i] ] = NR_no;
      NR_no++;
    }
  }
  in1.close();
  if (seq_swap) bindb[bin_no].close();
  return 0;
} // END db_read_in2


// Order the sequences from long to short and cut the ordered list into
// segments that fit the memory budget.
//   seq_swap : non-zero when sequences live in swap files; when 0 every
//              NR_seq[i] is passed through setiseq() here
//   NR_no, NR_len, NR_seq : the sequences and their lengths
//   NR_idx   : out, NR_idx[rank] = index of the sequence at that rank;
//              longer sequences first, equal lengths keep input order
//   mem_limit: memory budget in bytes
//   NAAN     : enters the fixed overhead estimate as 16 bytes per unit
//   SEG_no   : out, number of segments
//   SEG_b, SEG_e : out, first and last rank (inclusive) of each segment
//   db_swap  : out, db_swap[s] = "<db_out>.SWAP.<s>" for each segment
//   db_out   : output name used as prefix of the swap names
// Returns 0; fatal problems end in bomb_error().
int sort_seqs_divide_segs (int seq_swap,
                           int NR_no, int *NR_len, int *NR_idx, char *NR_seq[],
                           int mem_limit, int NAAN,
                           int &SEG_no, int *SEG_b, int *SEG_e, 
                           char db_swap[MAX_SEG][MAX_FILE_NAME],
                           char db_out[]) {
  int i, j, k, i1;

  // *************************************     change all the NR_seq to iseq
  int len;
  long long total_letter=0;
  int max_len = 0;
  // seed with a real length (0 for an empty set) so that the reported
  // minimum is exact and max_len-min_len+1 below can never be negative
  int min_len = (NR_no > 0) ? NR_len[0] : 0;
  for (i=0; i<NR_no; i++) {
    len = NR_len[i];
    total_letter += len;
    if (len > max_len) max_len = len;
    if (len < min_len) min_len = len;
    if (! seq_swap) setiseq(NR_seq[i], len);
  }

  cout << "longest and shortest : " << max_len << " and " << min_len << endl;
  cout << "Total letters: " << total_letter << endl;
  // END change all the NR_seq to iseq

  // **************************** Form NR_idx[], Sort them from Long to short
  // counting sort: bin b holds the sequences of length max_len-b
  int no_bins = max_len - min_len + 1;
  int *size_no;
  int *size_begin;
  if ((size_no = new int[no_bins]) == NULL ) bomb_error("Memory");
  if ((size_begin = new int[no_bins]) == NULL ) bomb_error("Memory");

  for (i=0; i<no_bins; i++) size_no[i] = 0;
  for (i=0; i<NR_no; i++)  size_no[max_len - NR_len[i]]++;
  // running sum: first rank of each bin = number of longer sequences
  size_begin[0] = 0;
  for (i=1; i<no_bins; i++) size_begin[i] = size_begin[i-1] + size_no[i-1];
  // size_begin[b] now doubles as the next free rank inside bin b
  for (i=0; i<NR_no; i++) {
    j = max_len-NR_len[i];
    NR_idx[ size_begin[j]++ ] = i;
  }
  delete []size_no; delete []size_begin;
  cout << "Sequences have been sorted" << endl;
  // END sort them from long to short

  //RAM that can be allocated
  // computed in 64 bits: narrowing a large deficit to int could wrap to a
  // positive number and slip through the check below
  long long avail = mem_limit;
  avail -= 29LL*NR_no + 16LL*NAAN;
  if (! seq_swap) avail -= total_letter;
  
  if ( avail <= 1000000 ) bomb_error("not enough memory, change -M option");

  //RAM can hold how many letters
  if (seq_swap) avail /= (long long) (sizeof (int) + sizeof (INTs) + 2*sizeof(char));
  else          avail /= (long long) (sizeof (int) + sizeof (INTs));
  mem_limit = (int) avail;   // fits: it is at most the incoming mem_limit

  SEG_no=0; j=0; k=0;
  for (i1=0; i1<NR_no; i1++) {
    i = NR_idx[i1];
    len = NR_len[i];
    j += len;
    if ( j>mem_limit ) {
      // the sequence that crosses the limit still ends this segment
      SEG_b[SEG_no] = k;
      SEG_e[SEG_no] = i1;
      sprintf(db_swap[SEG_no], "%s.SWAP.%d",db_out,SEG_no);
      j=0; k=i1+1;
      SEG_no++;
      if ( SEG_no >= MAX_SEG ) 
        bomb_error("Too many segments, enlarge Macro MAX_SEG or -M option");
    }
  }
  if ( SEG_no == 0 ) {
    SEG_b[SEG_no] = 0;
    SEG_e[SEG_no] = NR_no-1;
    sprintf(db_swap[SEG_no], "%s.SWAP.%d",db_out,SEG_no);
    SEG_no++;
  }
  else if ( SEG_e[SEG_no-1] != NR_no-1 ) { // last Segment
    SEG_b[SEG_no] = k;
    SEG_e[SEG_no] = NR_no-1;
    sprintf(db_swap[SEG_no], "%s.SWAP.%d",db_out,SEG_no);
    SEG_no++;
  }
  if (SEG_no > 1) cout << "Sequences divided into " << SEG_no << " parts\n";

   return 0;
}// END sort_seqs_divide_segs


// Cut the sequences, in their stored order, into segments that fit the
// memory budget.
//   seq_swap : non-zero when sequences live in swap files; when 0 every
//              NR_seq[i] is passed through setiseq() here
//   NR_no, NR_len, NR_seq : the sequences and their lengths
//   mem_limit: memory budget in bytes
//   NAAN     : enters the fixed overhead estimate as 16 bytes per unit
//   SEG_no   : out, number of segments
//   SEG_b, SEG_e : out, first and last index (inclusive) of each segment
// Returns 0; fatal problems end in bomb_error().
int db2_seqs_divide_segs (int seq_swap,
                           int NR_no, int *NR_len, char *NR_seq[],
                           int mem_limit, int NAAN,
                           int &SEG_no, int *SEG_b, int *SEG_e) {
  int i, j, k;

  // *************************************     change all the NR_seq to iseq
  int len;
  long long total_letter=0;
  int max_len = 0;
  // seed with a real length (0 for an empty set) so that the reported
  // minimum is exact
  int min_len = (NR_no > 0) ? NR_len[0] : 0;
  for (i=0; i<NR_no; i++) {
    len = NR_len[i];
    total_letter += len;
    if (len > max_len) max_len = len;
    if (len < min_len) min_len = len;
    if (! seq_swap) setiseq(NR_seq[i], len);
  }

  cout << "longest and shortest : " << max_len << " and " << min_len << endl;
  cout << "Total letters: " << total_letter << endl;
  // END change all the NR_seq to iseq


  //RAM that can be allocated
  // computed in 64 bits: narrowing a large deficit to int could wrap to a
  // positive number and slip through the check below
  long long avail = mem_limit;
  avail -= 29LL*NR_no + 16LL*NAAN;
  if (! seq_swap) avail -= total_letter;
  
  if ( avail <= 1000000 ) bomb_error("not enough memory, change -M option");

  //RAM can hold how many letters
  if (seq_swap) avail /= (long long) (sizeof (int) + sizeof (INTs) + 2*sizeof(char));
  else          avail /= (long long) (sizeof (int) + sizeof (INTs));
  mem_limit = (int) avail;   // fits: it is at most the incoming mem_limit

  SEG_no=0; j=0; k=0;
  for (i=0; i<NR_no; i++) {
    j += NR_len[i];
    if ( j>mem_limit ) {
      // the sequence that crosses the limit still ends this segment
      SEG_b[SEG_no] = k;
      SEG_e[SEG_no] = i;
      j=0; k=i+1;
      SEG_no++;
      if ( SEG_no >= MAX_SEG ) 
        bomb_error("Too many segments, enlarge Macro MAX_SEG or -M option");
    }
  }

  if ( SEG_no == 0 ) {
    SEG_b[SEG_no] = 0;
    SEG_e[SEG_no] = NR_no-1;
    SEG_no++;
  }
  else if ( SEG_e[SEG_no-1] != NR_no-1 ) { // last Segment
    SEG_b[SEG_no] = k;
    SEG_e[SEG_no] = NR_no-1;
    SEG_no++;
  }
  if (SEG_no > 1) cout << "Sequences divided into " << SEG_no << " parts\n";

  return 0;
}// END db2_seqs_divide_segs




int db_read_and_write (ifstream &in1, ofstream &out1, 
                       int length_of_throw, int des_len,
                       char *NR_seq[], int *NR_clstr_no) {

  char raw_seq[MAX_SEQ], raw_des[MAX_DES], raw_seq1[MAX_SEQ];
  char buffer1[MAX_LINE_SIZE];
  raw_seq[0] = raw_des[0] = buffer1[0] = 0;
  int read_in = 0;
  int NR_no1 = 0;

  while(1) {
    if ( in1.eof()) break;
    in1.getline(buffer1, MAX_LINE_SIZE-2, '\n');
    if ( buffer1[0] == '>' || buffer1[0] == ';') {
      if ( read_in ) { // write last record
         strcpy(raw_seq1, raw_seq);
         format_seq(raw_seq1);

         if ( strlen(raw_seq1) > length_of_throw ) {
           if (NR_clstr_no[NR_no1] >= 0 ) out1 << raw_des << "\n" << raw_seq;
           if ((NR_seq[NR_no1] = new char[des_len] ) == NULL )
             bomb_error("memory");
           strncpy(NR_seq[NR_no1], raw_des, des_len-2);
           NR_seq[NR_no1][des_len-2]=0;
           NR_no1++;
         }
      }
      strncpy(raw_des, buffer1, MAX_DES-2);
      raw_seq[0] = 0;
    }
    else {
      read_in = 1;
      strcat(raw_seq, buffer1); strcat(raw_seq,"\n");
    }
  } // END while(1);

  if (1) { // the last record
    strcpy(raw_seq1, raw_seq);
    format_seq(raw_seq1);

    if ( strlen(raw_seq1) > length_of_throw ) {
      if (NR_clstr_no[NR_no1] >= 0 ) out1 << raw_des << "\n" << raw_seq;
      if ((NR_seq[NR_no1] = new char[des_len] ) == NULL )
        bomb_error("memory");
      strncpy(NR_seq[NR_no1], raw_des, des_len-2);
      NR_seq[NR_no1][des_len-2]=0;
      NR_no1++;
    }
  }

  return 0;
} // END db_read_and_write



int db_read_des(ifstream &in1, 
                int length_of_throw, int des_len, char *NR_seq[]) {

  char raw_seq[MAX_SEQ], raw_des[MAX_DES], raw_seq1[MAX_SEQ];
  char buffer1[MAX_LINE_SIZE];
  raw_seq[0] = raw_des[0] = buffer1[0] = 0;
  int read_in = 0;
  int NR_no1 = 0;

  while(1) {
    if ( in1.eof()) break;
    in1.getline(buffer1, MAX_LINE_SIZE-2, '\n');
    if ( buffer1[0] == '>' || buffer1[0] == ';') {
      if ( read_in ) { // write last record
         strcpy(raw_seq1, raw_seq);
         format_seq(raw_seq1);

         if ( strlen(raw_seq1) > length_of_throw ) {
           if ((NR_seq[NR_no1] = new char[des_len] ) == NULL )
             bomb_error("memory");
           strncpy(NR_seq[NR_no1], raw_des, des_len-2);
           NR_seq[NR_no1][des_len-2]=0;
           NR_no1++;
         }
      }
      strncpy(raw_des, buffer1, MAX_DES-2);
      raw_seq[0] = 0;
    }
    else {
      read_in = 1;
      strcat(raw_seq, buffer1); strcat(raw_seq,"\n");
    }
  } // END while(1);

  if (1) { // the last record
    strcpy(raw_seq1, raw_seq);
    format_seq(raw_seq1);

    if ( strlen(raw_seq1) > length_of_throw ) {
      if ((NR_seq[NR_no1] = new char[des_len] ) == NULL )
        bomb_error("memory");
      strncpy(NR_seq[NR_no1], raw_des, des_len-2);
      NR_seq[NR_no1][des_len-2]=0;
      NR_no1++;
    }
  }

  return 0;
} // END db_read_des


int old_clstr_read_in(ifstream &in_clstr, int &NRo_no, int &NRo90_no,
                      int *NRo_idx, int *NRo_id1, int *NRo_id2,
                      char *NRo_iden, int *NRo_clstr_no, int *NRo_NR_idx) {

  int i, j, k, i1;
  char buffer1[MAX_LINE_SIZE];
  int is_rep, len;
  char iden, iden1[4];

  NRo_no = 0;
  NRo90_no = 0;

  while(1) {
    if (in_clstr.eof()) break;
    in_clstr.getline(buffer1, MAX_LINE_SIZE-2, '\n');
    if ( buffer1[0] == '>') {
      //read in line like >Cluster 10
      NRo90_no ++;
    }
    else {
      //read in line like 
      //1	225aa, >gi|4099051|gb|AAD... at 80%
      //2	9448aa, >gi|8249467|emb|CA... *
      len = strlen(buffer1); if (len<12) continue;
      iden = 0;
      for (i=len-1; i>=0; i--) { 
        if (buffer1[i] == '*') {
          is_rep = 1; break;
        }
        else if (buffer1[i] == '%') {
          is_rep = 0; 
          for (j=i-1; isdigit(buffer1[j]); j--) ;
          strncpy(iden1, buffer1+j, i-j); iden1[i-j]=0; iden = atoi(iden1);
          break;
        }
      }

      for (i=0; i<len; i++) 
        if (buffer1[i] == '>' ) break;

      NRo_idx[NRo_no]      = NRo_no;
      des_to_idx(NRo_id1[NRo_no], NRo_id2[NRo_no], buffer1+i);
      NRo_clstr_no[NRo_no] = is_rep ? NRo90_no -1 : -1 - (NRo90_no-1);
      NRo_NR_idx[NRo_no]   = -1;
      NRo_iden[NRo_no] = iden;
      NRo_no ++;
    }
  }

  for (i=0; i<NRo_no; i++) {
    if ( (i1=NRo_clstr_no[i]) < 0) continue;

    NRo_clstr_no[i] = i;
    for (j=i-1; j>=0; j--) {
      if ( i1 == -1-NRo_clstr_no[j] ) NRo_clstr_no[j] = i;
      else break;
    }

    k=i;
    for (j=i+1; j<NRo_no; j++) {
      if ( i1 == -1-NRo_clstr_no[j] ) {NRo_clstr_no[j] = k; i++;}
      else break;
    }
  }

  quick_sort_a_b_idx(NRo_id1, NRo_id2, NRo_idx, 0, NRo_no-1);
  // now NRo_id1 and NRo_id2 are sorted, 
  // and NRo_idx restore the sorted index of NRo_clstr_no, NRo_NR_idx

  return 0;
} // END old_clstr_read_in


// This builds the index key for the identifier of each sequence in FASTA format.
// The first 12 alphabetic or digit characters are used to identify each sequence:
// the first 6 characters are converted to one integer,
// and the next 6 characters to another integer, all letters in lower case.
// If the string has fewer than 12 such characters, the rest are filled with ''
// idx is
// 0 1 2 3 4 5 6 7 8 9 a b c d e f g h i j k l m n o p q r s t u v w x y z '' 
// 0                   1                   2                   3
// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 
// so total 37 for each bit
// 37 1369 50653 1874161 69343957
// Turn a description string into two integer keys.
//   id1  : out, key built from the 1st..6th letter/digit of str1
//   id2  : out, key built from the 7th..12th letter/digit of str1
//   str1 : NUL-terminated description; characters that are neither
//          letters nor digits are skipped
// Each character maps to a base-37 digit: '0'-'9' -> 0-9, letters
// (case-insensitive) -> 10-35, missing positions -> 36.
// Returns 0.
int des_to_idx(int &id1, int &id2, char *str1) {
  // 37^5 .. 37^0
  static const unsigned int weight[6] =
    { 69343957u, 1874161u, 50653u, 1369u, 37u, 1u };
  unsigned int code[12];
  unsigned int key1 = 0, key2 = 0;
  int i, used = 0;

  for (i=0; i<12; i++) code[i] = 36;

  for (i=0; str1[i] != 0 && used < 12; i++) {
    // <ctype.h> functions need a value in the unsigned char range
    unsigned char c1 = (unsigned char) str1[i];
    if      ( isalpha(c1) ) code[used++] = tolower(c1) - 'a' + 10;
    else if ( isdigit(c1) ) code[used++] = c1 - '0';
  }

  // Sum in unsigned arithmetic: a key can reach 36*37^5 + ..., which is
  // above INT_MAX, and overflowing a signed int is undefined. The
  // conversion back to int keeps the bit pattern on the usual
  // two's-complement targets, so large keys come out negative.
  for (i=0; i<6; i++) {
    key1 += code[i]   * weight[i];
    key2 += code[i+6] * weight[i];
  }
  id1 = (int) key1;
  id2 = (int) key2;
  return 0;
} // END des_to_idx



// get index of a element of a sorted list using 2-div method
// calling get_index_of_sorted_list (list, begin_no, end_no, element)
// list is a sorted list in order of increasing
// Binary search in list[b..e] (both ends inclusive, increasing order).
//   list    : sorted values
//   b, e    : first and last index of the range to search
//   element : value to look for
// Returns an index i in [b, e] with list[i] == element, or -1 when the
// value is absent or the range is empty (e < b). With duplicates any
// matching index may be returned.
int get_index_of_sorted_list (int *list, int b, int e, int element) {
  int mid, mid_v;

  // An empty range has nothing to read; without this guard list[0] and
  // list[-1] were inspected when called with e = count-1 and count == 0.
  if ( e < b ) return -1;

  mid = (b+e) / 2;
  mid_v = list[mid];

  // shrink [b, e] until the value is hit or only two candidates remain
  while( e > b+1 ) {
    mid = (b+e) / 2;
    mid_v = list[mid];

    if      (element > mid_v) { b = mid; }
    else if (element < mid_v) { e = mid; }
    else                      { break; }
  }

  if      (element == mid_v   ) { return mid; }
  else if (element == list[e] ) { return e;   }
  else if (element == list[b] ) { return b;   }
  else                          { return -1;  }
} // END get_index_of_sorted_list


// get index of a element of a sorted list using 2-div method
// calling get_index_of_2_sorted_list (list, list2, begin_no, end_no, element)
// list is a sorted list in order of increasing
// if index of list is same, check list2
// Two-key lookup: find the position where list[] holds element and
// list2[] holds element2.
//   list, list2 : parallel arrays; list is increasing, and list2 is
//                 searched with the same binary search inside each run of
//                 equal list values
//   b, e        : first and last index (inclusive) of the range
// Returns the matching index, or -1 when no such position is found.
int get_index_of_2_sorted_list (int *list, int *list2, int b, int e,
                                int element, int element2) {
  int hit = get_index_of_sorted_list(list, b, e, element);
  if ( hit == -1 ) return -1;

  // grow the run of equal primary keys downwards, stopping at b
  int first = hit;
  while ( first != b && list[first] == list[first-1] ) first--;

  // and upwards, stopping at e
  int last = hit;
  while ( last != e && list[last] == list[last+1] ) last++;

  // the secondary key is looked up inside that run only
  return get_index_of_sorted_list(list2, first, last, element2);
} // END get_index_of_2_sorted_list


// read in a segment of sequence
// Load the sequences of one segment from a binary swap file.
//   NR_no     : number of records expected in the file
//   NR_seq    : out, NR_seq[i] gets a new[]-allocated buffer holding
//               record i when NR_seg[i] == sgj; other entries are untouched
//   NR_seg    : segment number of each record
//   sgj       : segment to load
//   bindbname : swap file, a series of <int length><length bytes> records
// Returns OK_FUNC. bomb_error() is called when the file cannot be opened
// or a record of the requested segment cannot be read.
int read_swap_iseq1(int NR_no, char *NR_seq[], char *NR_seg, 
                   int sgj, char *bindbname) {
  int i;
  int len1 = 0;
  int readable = 1;   // cleared once the file stops yielding valid records

  ifstream fswap(bindbname);
  if (! fswap) bomb_error("Can not open file", bindbname);
  for (i=0; i<NR_no; i++) {
    if (readable) {
      fswap.read((char *) &len1, sizeof(int));
      // a failed read or a negative length (such as the -1 end signal of
      // a full swap file) means the rest of this file cannot be used
      if ( !fswap || len1 < 0 ) readable = 0;
    }
    if (NR_seg[i] == sgj) {
      if (! readable) bomb_error("Can not read sequence from", bindbname);
      if ( (NR_seq[i] = new char[len1+2] ) == NULL ) bomb_error("memory");
      fswap.read(NR_seq[i], len1);
      if (! fswap) bomb_error("Can not read sequence from", bindbname);
    }
    else if (readable) {
      // skip the record without copying it into a fixed-size buffer
      fswap.ignore(len1);
    }
  }
  fswap.close();
  return OK_FUNC;
} // END read_swap_iseq


// Release the sequence buffers of one segment.
//   NR_no  : number of entries in NR_seq / NR_seg
//   NR_seq : NR_seq[n] is delete[]d and set to NULL when NR_seg[n] == sgj
//   NR_seg : segment number of each sequence
//   sgj    : segment whose buffers are released
// Returns OK_FUNC.
int free_swap_iseq1(int NR_no, char *NR_seq[], char *NR_seg, int sgj) {
  for (int n = 0; n < NR_no; n++) {
    if (NR_seg[n] != sgj) continue;   // belongs to another segment
    delete [] NR_seq[n];
    NR_seq[n] = NULL;
  }
  return OK_FUNC;
} // END free_swap_iseq

// Delete the temporary files of a run.
//   SEG_no      : number of segments
//   db_swap     : per-segment swap file names
//   seq_swap    : non-zero when a sequence swap file was used
//   db_bin_swap : name of that sequence swap file
// Files that do not exist are silently skipped, as with "rm -f".
// Returns 0.
int remove_tmp_files(int SEG_no, char db_swap[MAX_SEG][MAX_FILE_NAME], 
                     int seq_swap, char db_bin_swap[]) {
  int i;

  // remove() replaces the former system("rm -f <name>"): no fixed-size
  // command buffer to overflow and no shell interpreting the file name
  if (seq_swap) (void) remove(db_bin_swap);

  // NOTE(review): only the first SEG_no-2 names are removed, as before;
  // presumably the last two segments never get a swap file - confirm
  for (i=0; i<SEG_no-2; i++) (void) remove(db_swap[i]);

  return 0;
} // END remove_tmp_files


// Delete the sequence swap file of the second database.
//   seq_swap    : non-zero when a sequence swap file was used
//   db_bin_swap : name of that file
// A file that does not exist is silently skipped, as with "rm -f".
// Returns 0.
int remove_tmp_files_db2(int seq_swap, char db_bin_swap[]) {
  // remove() replaces the former system("rm -f <name>"): no fixed-size
  // command buffer to overflow and no shell interpreting the file name
  if (seq_swap) (void) remove(db_bin_swap);

  return 0;
} // END remove_tmp_files_db2

// Print the user CPU time spent between two tms snapshots to stdout.
//   CPU_begin : snapshot taken at the start
//   CPU_end   : snapshot taken at the end
// Only the tms_utime fields are compared. The tick rate is fixed at 100
// per second instead of being queried with sysconf(_SC_CLK_TCK).
void show_cpu_time(tms &CPU_begin, tms &CPU_end) {
  const int ticks_per_second = 100;

  int elapsed_seconds =
      (CPU_end.tms_utime - CPU_begin.tms_utime) / ticks_per_second;

  cout << "Total CPU time " << elapsed_seconds << endl;
} // END  show_current_cpu_time

/////////////////////////// END ALL ////////////////////////

