// =============================================================================
// This is a modified version of CD-HI; it works better with large databases
// and shorter words of 2 or 3.
//
// CD-HI
//
// Cluster Database at High Identity
// 
// CD-HI clusters protein sequence database at high sequence identity threshold.
// This program can remove the high sequence redundancy efficiently.
// Note, here, high identity means 70% and up.
// 
// program written by 
//                                      Weizhong Li
//                                      UCSD, San Diego Supercomputer Center
//                                      La Jolla, CA, 92093
//                                      Email liwz@sdsc.edu
//
//                 at
//                                      Adam Godzik's lab
//                                      The Burnham Institute
//                                      La Jolla, CA, 92037
//                                      Email adam@burnham-inst.org
// =============================================================================
// Usage:
// 	cd-hi [options]
//
// Options:
//	-i filename of input database in fasta format, required!
//
//	-o filename of output database, required!
//
//	-c cluster identity threshold, default => 0.9
//
//	-b max allowed gap length for alignment, default => 20
//
//	-M The available memory of your computer, default => 400 (M)
//
//      -l length_of_throw_away_sequences, default 10, sequences shorter
//         than it will be thrown away
//
//      -d length of description line in the .clstr file, default 20
//
//	-n word size, default => 4,
//		The longer the word size is, the faster the program is
//		but, the word size is restricted by the cluster threshold
//		threshold	allowed word size	good word size
//		>=85%		3,4,5			5
//		>=80%		3,4			4
//		>=75%		3			3
//		>=70%		2,3			3
//		>=60%		2			2
//		if you are clustering nr at 90% use -n 5
//
// Example:
//	cd-hi -i nr -o nr90 -M 480 -n 5
//		cluster nr at 90% threshold supposing the computer has 480M
//		memory.
//
//	cd-hi -i pdbaa -o pdbaa80
//
// Compile:
//	g++ -o cd-hi -O cd-hi.c++
//	Note the -O option (or -O2 -O3) make the program several times faster!
// =============================================================================

#include<iostream.h>
#include<fstream.h>
#include<iomanip.h>
#include<stdlib.h>
#include<stdio.h>
#include<string.h>
#include<ctype.h>

// ---- Compile-time capacity limits -----------------------------------------
// All large tables in this file are fixed-size arrays dimensioned by these
// macros; raising a limit means recompiling.
#define MAX_AA 23               // dimension of the AA_MATRIX scoring matrix
#define MAX_UAA 21              // alphabet size used to encode words (see NAA1)
#define MAX_SEQ 30000           // size of the per-sequence buffers (raw_seq, aap_list, ...)
#define MAX_DIAG 60000          // size of DIAG_score[]
#define MAX_DES 60000           // size of the description-line buffer
#define MAX_GAP 3000            // NOTE(review): not referenced in the visible part of the file
#define MAX_LINE_SIZE 60000     // size of the input line buffer
#define MAX_FILE_NAME 1280      // size of every file-name buffer
#define SEQ_NO 600000           // capacity of the per-sequence tables (NR_len, NR_seq, ...)
#define FRAG_NO 800000          // capacity of look_and_count[] and NR90f_idx[]
#define MAX_SEG 50              // capacity of the segment tables (SEG_b, db_swap, ...)
#define BUFFER 100000           // size of the scratch array Buffer[]

// Status codes returned by helper functions.
#define FAILED_FUNC 1
#define OK_FUNC 0

// Function-like macros: each argument may be evaluated more than once, so do
// not pass expressions with side effects.
#define max(a,b) (((a)>(b))?(a):(b))
#define min(a,b) (((a)<(b))?(a):(b))


// Integer aliases used throughout the file.
typedef int   INT4;
typedef short INT2;
typedef unsigned int UINT4;
typedef unsigned short UINT2;

// INTs is the integer type used for per-sequence positions and counts.
// If the longest sequence is longer than 65535, build with -DLONG_SEQ so that
// the 32-bit UINT4 is used instead of the 16-bit UINT2.
#ifdef LONG_SEQ
#define INTs UINT4
#else
#define INTs UINT2
#endif

////////// Class definition //////////
// Scoring data handed to local_band_align(): a MAX_AA x MAX_AA integer matrix
// plus gap settings. Only the declaration is visible here; the member
// functions are defined elsewhere in the file.
class AA_MATRIX { //Matrix
  private:

  public:
    int gap, ext_gap;            // presumably gap-open and gap-extension penalties -- confirm in set_gap()
    int *gap_array;              // NOTE(review): allocation and ownership are not visible in this part of the file
    int matrix[MAX_AA][MAX_AA];  // pairwise scores, indexed by two residue codes

    AA_MATRIX();
    void init();
    void set_gap(int gap1, int ext_gap1);
    void set_matrix(int *mat1);
}; // END class AA_MATRIX


// Powers of the alphabet size: NAAk is the number of distinct words of
// length k, and also the place value used when encoding a word as an int.
int NAA1 = MAX_UAA;
int NAA2 = NAA1 * NAA1;
int NAA3 = NAA1 * NAA1 * NAA1;
int NAA4 = NAA1 * NAA1 * NAA1 * NAA1;
int NAA5 = NAA1 * NAA1 * NAA1 * NAA1 * NAA1;


// Sequence tables. NR_* are indexed by input order of the kept sequences;
// NR90_idx/NR90f_idx are indexed by representative (cluster) number.
int NR_no, NR90_no, NR90f_no;  // kept sequences, representatives, fragments registered so far
int NR_len[SEQ_NO];            // length of each kept sequence
int NR_idx[SEQ_NO];            // idx table -> unsorted no
int NR90_idx[SEQ_NO];          // idx table -> old no
int NR90_clstr[SEQ_NO];           // cluster no.
char NR90_iden[SEQ_NO];        // percent identity to the representative, 0 for a representative
char NR90_flag[SEQ_NO];        // 1 while a sequence is still unassigned, 0 once clustered across segments
char *NR_seq[SEQ_NO];          // encoded sequence; reused at output time for the description text
double NR_clstr = 0.9;         // -c : clustering identity threshold
int BAND_width = 20;           // -b : band width for alignment
int NAA = 4;                   // -n : word length
int NAAN = NAA4;               // number of distinct words of length NAA
int DIAG_score[MAX_DIAG];      // scratch; not referenced in the visible part of the file
int mem_limit = 400000000;     // -M : memory budget in bytes
int mem_size=60;               // growth step, in entries, for lookup buckets and cluster lists
int length_of_throw = 10;      // -l : sequences not longer than this are discarded
int des_len = 20;              // -d : buffer size for the description kept for the .clstr file
int Frag_size = 400;           // words per fragment in short-word mode
int Short_word = 0;            // set to 1 when NAA <= 3

// Word lookup table: for each encoded word, the ids that contain it and how
// many times. The ids are representative numbers in normal mode and fragment
// numbers in short-word mode.
int Buffer[BUFFER];            // scratch used while growing lookup buckets
int *AAP_lookup_no;            // entries in use per word
int *AAP_lookup_size;          // entries allocated per word
int *(*AAP_lookup1);           // per word: list of ids
INTs *(*AAP_lookup2);          // per word: occurrence count matching each id
INTs look_and_count[FRAG_NO];  // per id: shared-word count against the current query
int NR90f_idx[FRAG_NO];        // first fragment number of each representative

// Segment tables: the sorted input is cut into SEG_no segments that are
// processed one at a time, with lookup tables swapped to the db_swap files.
int SEG_no;
int SEG_b[MAX_SEG], SEG_e[MAX_SEG], SEG90_b[MAX_SEG], SEG90_e[MAX_SEG];
int SEG90f_b[MAX_SEG], SEG90f_e[MAX_SEG];
char db_swap[MAX_SEG][MAX_FILE_NAME];

// Profiling counters incremented at successive filter stages.
int NN1=0, NN2=0, NN21=0, NN22=0, NN3=0, NN4=0, NN5=0, NN6=0;

AA_MATRIX mat;                 // scoring data passed to local_band_align()
// ---- Forward declarations --------------------------------------------------
// Swap-file I/O for the word lookup table of one segment.
int read_swap (int sgj);
int write_swap (int sgj);
// Test one query sequence against the representatives libb..libe
// (normal-word and short-word variants).
int check_this(int len, char *seqi, 
               int *taap, INTs *aap_begin, INTs *aap_list, int &has_aa2,
               int NAA, int& aan_no, int *aan_list, INTs *aan_list_no,
               int &hit_no, int libb, int libe, int &iden_no,
               int required_aan, int required_aa1, int required_aa2);
int check_this_short(int len, char *seqi, 
               int *taap, INTs *aap_begin, INTs *aap_list, int &has_aa2,
               int NAA, int& aan_no, int *aan_list, INTs *aan_list_no,
                                     int *aan_list_backup,
               int &hit_no, int libb, int libe, 
               int frg2, int libfb, int libfe, int &iden_no,
               int required_aan, int required_aa1, int required_aa2);
// Register the words of a new representative in the lookup table.
int add_in_lookup_table(int aan_no, int *aan_list, INTs *aan_list_no);
int add_in_lookup_table_short(int aan_no, int frg1,
                              int *aan_list, INTs *aan_list_no);
// Both of the following terminate the program with exit(1).
int print_usage (char *arg);
void bomb_error(char *message);
// Upper-case a sequence in place and drop every non-letter.
void format_seq(char *seq);
// Diagonal filter and banded alignment (defined later in the file).
int diag_test_aapn(char iseq2[], int len1, int len2, int *taap,
        INTs *aap_begin, INTs *aap_list, int &best_sum,
        int band_width, int &band_left, int &band_right, int required_aa1);
int local_band_align(char iseq1[], char iseq2[], int len1, int len2,
                     AA_MATRIX &mat, int &best_score, int &iden_no,
                     int band_left, int band_right);
// Sequence encoding helpers and the sort used on word lists (defined later).
int outiseq(char iseq[], int len);
int setiseq(char *seq, int len);
int quick_sort (int *a, int lo0, int hi0 );


////////// MAIN //////////
int main(int argc, char **argv) {
  int i, j, k, i1, j1, k1, i0, j0, k0, sggi, sgj;
  int si, sj, sk;
  char db_in[MAX_FILE_NAME];
  char db_out[MAX_FILE_NAME];
  char db_clstr[MAX_FILE_NAME];
  char raw_seq[MAX_SEQ], raw_des[MAX_DES], raw_seq1[MAX_SEQ]; 
  char buffer1[MAX_LINE_SIZE];
  raw_seq[0] = raw_des[0] = buffer1[0] = 0;

  // ***********************************    parse command line and open file
  if (argc < 5) print_usage(argv[0]);

  for (i=1; i<argc; i++) {
    if      (strcmp(argv[i], "-i") == 0)
      strncpy(db_in, argv[++i], MAX_FILE_NAME-1);
    else if (strcmp(argv[i], "-o") == 0)
      strncpy(db_out, argv[++i], MAX_FILE_NAME-1);
    else if (strcmp(argv[i], "-c") == 0) {
      NR_clstr = atof(argv[++i]);
      if ((NR_clstr >= 1.0) || (NR_clstr <= 0.6)) bomb_error("invalid clstr");
    }
    else if (strcmp(argv[i], "-b") == 0) {
      BAND_width = atoi(argv[++i]);
      if (BAND_width < 0 ) bomb_error("invalid band width");
    }
    else if (strcmp(argv[i], "-M") == 0) 
      mem_limit = 1000000 * atoi(argv[++i]);
    else if (strcmp(argv[i], "-n") == 0) 
      NAA = atoi(argv[++i]);
    else if (strcmp(argv[i], "-l") == 0)
      length_of_throw = atoi(argv[++i]);
    else if (strcmp(argv[i], "-d") == 0)
      des_len = atoi(argv[++i]);
    else 
      print_usage(argv[0]);
  }
  db_clstr[0]=0; strcat(db_clstr,db_out); strcat(db_clstr,".clstr");

  NAAN = NAA4;
  if      ( NAA == 2 ) { NAAN = NAA2; mem_size = 25000; }
  else if ( NAA == 3 ) { NAAN = NAA3; mem_size = 1200; }
  else if ( NAA == 4 ) { NAAN = NAA4; mem_size = 60; }
  else if ( NAA == 5 ) { NAAN = NAA5; mem_size = 3; }
  else bomb_error("invalid -n parameter!");

  if ((AAP_lookup_no   = new int[NAAN]) == NULL) bomb_error("Memory");
  if ((AAP_lookup_size = new int[NAAN]) == NULL) bomb_error("Memory"); 
  if ((AAP_lookup1     = new int*[NAAN]) == NULL) bomb_error("Memory");
  if ((AAP_lookup2     = new INTs*[NAAN]) == NULL) bomb_error("Memory");
  if ( NAA <= 3 ) { Short_word = 1; }

  for (i=0; i<NAAN; i++) {
    AAP_lookup_no[i]=0;
    AAP_lookup_size[i] = 0;
  }

  if      ( NR_clstr > 0.87 && NAA < 5) 
    cout << "Your word length is " << NAA << ", using 5 may be faster!" <<endl;
  else if ( NR_clstr > 0.80 && NAA < 4 )
    cout << "Your word length is " << NAA << ", using 4 may be faster!" <<endl;
  else if ( NR_clstr > 0.75 && NAA < 3 )
    cout << "Your word length is " << NAA << ", using 3 may be faster!" <<endl;
     
  if ( length_of_throw <= NAA ) bomb_error("Too short -l, redefine it");

  ifstream in1(db_in);
  if ( ! in1 ) { cout << "Can not open file" << db_in << endl; exit(1); }
  ofstream out1(db_out);
  if ( ! out1) { cout << "Can not open file" << db_out << endl; exit(1); }
  ofstream out2(db_clstr);;
  if ( ! out2) { cout << "Can not open file" << db_clstr << endl; exit(1); }
  // END parse command line and open file
  
  // ***************************************************    read in sequence
  int read_in = 0;
  NR_no = 0;
  while(1) {
    if ( in1.eof()) break;
    in1.getline(buffer1, MAX_LINE_SIZE-2, '\n');

    if ( buffer1[0] == '>' || buffer1[0] == ';') {
      if ( read_in ) { // write last record
         if ( strlen(raw_seq) >= MAX_SEQ-1) 
           bomb_error("Long sequence found, please redefine MAX_SEQ");
         format_seq(raw_seq); 

         if ( strlen(raw_seq) > length_of_throw ) {
           if ( (NR_seq[NR_no] = new char[strlen(raw_seq)+2] ) == NULL ) 
             bomb_error("memory");
           strcpy( NR_seq[NR_no], raw_seq);
           NR_len[NR_no] = strlen(raw_seq);
           NR_no++;
         }
      }
      strncpy(raw_des, buffer1, MAX_DES-1);
      raw_seq[0] = 0;
    }
    else {
      read_in = 1;
      strcat(raw_seq, buffer1);
    }
  } // END while(1);

  if (1) { // the last record
    if ( strlen(raw_seq) >= MAX_SEQ-1)
       bomb_error("Long sequence found, please redefine MAX_SEQ");
    format_seq(raw_seq); 

    if ( strlen(raw_seq) > length_of_throw ) {
      if ( (NR_seq[NR_no] = new char[strlen(raw_seq)+2] ) == NULL )  
        bomb_error("memory");
      strcpy( NR_seq[NR_no], raw_seq);
      NR_len[NR_no] = strlen(raw_seq);
      NR_no++;
    }
  }
  in1.close(); 
  cout << "total seq: " << NR_no << endl;
  // END read in sequence

  // *************************************     change all the NR_seq to iseq
  int len, len1, len2, len22, total_letter;
  int max_len = 0, min_len = 99999;
  for (i=0; i<NR_no; i++) {
    len = NR_len[i];
    total_letter += len;
    if (len > max_len) max_len = len;
    if (len < min_len) min_len = len;
    setiseq(NR_seq[i], len);
  }
  cout << "longest and shortest : " << max_len << " and " << min_len << endl;
  cout << "Total letters: " << total_letter << endl;
  // END change all the NR_seq to iseq


  // **************************** Form NR_idx[], Sort them from Long to short
  int *size_no;
  int *size_begin;
  if ((size_no = new int[max_len-min_len+1]) == NULL ) bomb_error("Memory");
  if ((size_begin = new int[max_len-min_len+1]) == NULL ) bomb_error("Memory");

  for (i=max_len; i>=min_len; i--) {
    size_no[max_len - i] = 0;
    size_begin[max_len - i] = 0;
  }
  for (i=0; i<NR_no; i++)  size_no[max_len - NR_len[i]]++; 
  for (i=max_len; i>=min_len; i--)
    for (j=max_len; j>i; j--)
      size_begin[max_len-i] += size_no[max_len-j];
  for (i=max_len; i>=min_len; i--) size_no[max_len - i] = 0;
  for (i=0; i<NR_no; i++) {
    j = max_len-NR_len[i];
    NR_idx[ size_begin[j] + size_no[j]] = i;
    size_no[j]++;
  }
  delete []size_no; delete []size_begin;
  cout << "Sequences have been sorted" << endl;
  // END sort them from long to short

  // ******************* divide the sequences to segments to fit in memory
  mem_limit -= total_letter + 24*SEQ_NO + 16 * NAAN;
  if ( mem_limit <= 1000000 ) bomb_error("No memory, change -M option");
  mem_limit /= sizeof (int) + sizeof (INTs);
  SEG_no=0; j=0; k=0;
  for (i1=0; i1<NR_no; i1++) {
    i = NR_idx[i1];
    NR90_flag[i] = 1;
    len = NR_len[i]; 
    j += len;
    if ( j>mem_limit ) {
      SEG_b[SEG_no] = k;
      SEG_e[SEG_no] = i1;
      sprintf(db_swap[SEG_no], "SWAP.%d",SEG_no);
      j=0; k=i1+1;
      SEG_no++;
    }
  }
  if ( SEG_no == 0 ) {
    SEG_b[SEG_no] = 0;
    SEG_e[SEG_no] = NR_no-1;
    sprintf(db_swap[SEG_no], "SWAP.%d",SEG_no);
    SEG_no++;
    if ( SEG_no >= MAX_SEG ) bomb_error("Too many segments, change -M option");
  }
  else if ( SEG_e[SEG_no-1] != NR_no-1 ) { // last Segment
    SEG_b[SEG_no] = k;
    SEG_e[SEG_no] = NR_no-1;
    sprintf(db_swap[SEG_no], "SWAP.%d",SEG_no);
    SEG_no++;
  }
  if (SEG_no > 1) cout << "Sequences divided into " << SEG_no << " parts\n";

  // *********************************************                Main loop
  int band_left, band_right, hit_no, has_aa2;
  int best_score, iden_no, band_width1;
  int required_aa1, required_aa2, required_aan;
  int taap[NAA2];
  INTs aap_list[MAX_SEQ], aap_begin[NAA2];
  double aa1_cutoff = NR_clstr;
  double aa2_cutoff = 1 - (1-NR_clstr)*2;
  double aan_cutoff = 1 - (1-NR_clstr)*NAA;
  char *seqi, *seqj;
  char c1, c2;
  int c22, mm, aan_no, segb, frg1, frg2, segfb;
  int aan_list[MAX_SEQ], aan_list_backup[MAX_SEQ];
  INTs aan_list_no[MAX_SEQ];


  NR90_no = 0; NR90f_no = 0;
  for (sggi=0; sggi<SEG_no; sggi++) {
    if (SEG_no >1)
      cout << "SEG " << sggi << " " << SEG_b[sggi] << " " << SEG_e[sggi] <<endl;

    for (sgj=sggi-1; sgj>=0; sgj--) {
      cout << "Reading swap" << endl;
      if ( sgj != sggi-1) read_swap(sgj);    // reading old segment
      cout << "Comparing with SEG " << sgj << endl;
      for (i1=SEG_b[sggi]; i1<=SEG_e[sggi]; i1++) {
        i = NR_idx[i1];
        if (NR90_flag[i] == 0 ) continue;
        len = NR_len[i]; seqi = NR_seq[i];
        frg1 = (len - NAA ) / Frag_size + 1;
        frg2 = (len - NAA + BAND_width ) / Frag_size + 1;
        required_aa1 = int (aa1_cutoff* (double) len);
        required_aa2 = int (aa2_cutoff* (double) len);
        required_aan = int (aan_cutoff* (double) len);
        has_aa2 = 0;

        int flag = Short_word ? 
          check_this_short(len, seqi,
               taap, aap_begin, aap_list, has_aa2,
               NAA, aan_no, aan_list, aan_list_no,
                            aan_list_backup,
               hit_no, SEG90_b[sgj], SEG90_e[sgj],
               frg2, SEG90f_b[sgj], SEG90f_e[sgj], iden_no,
               required_aan, required_aa1, required_aa2) :
          check_this(len, seqi, 
               taap, aap_begin, aap_list, has_aa2,
               NAA, aan_no, aan_list, aan_list_no,
               hit_no, SEG90_b[sgj], SEG90_e[sgj], iden_no,
               required_aan, required_aa1, required_aa2);

        if ( flag == 1) {       // if similar to old one delete it
          delete [] NR_seq[i];
          NR90_clstr[i] = -hit_no-1;  // (-hit_no-1) for non representatives
          NR90_iden[i] = iden_no * 100 / len;
          NR90_flag[i] = 0;
        }
      } //for (i1=SEG_b[sggi]; i1<=SEG_e[sggi]; i1++)
    } // for (sgj=0; sgj<sggi; sgj++)

    if (SEG_no >1) cout << "Refresh Memory" << endl;
    for (i=0; i<NAAN; i++) {
      AAP_lookup_no[i]=0;
//    if (AAP_lookup_size[i] > 0 ) {
//      delete [] AAP_lookup1[i];
//      delete [] AAP_lookup2[i];
//      AAP_lookup_size[i] = 0;
//    }
    }

    if (SEG_no >1) cout << "Self comparing" << endl;
    segb = NR90_no;
    segfb = NR90f_no;
    for (i1=SEG_b[sggi]; i1<=SEG_e[sggi]; i1++) {
      i = NR_idx[i1];
      if (NR90_flag[i] == 0 ) continue;
      len = NR_len[i]; seqi = NR_seq[i];
      frg1 = (len - NAA ) / Frag_size + 1;
      frg2 = (len - NAA + BAND_width ) / Frag_size + 1;
      required_aa1 = int (aa1_cutoff* (double) len);
      required_aa2 = int (aa2_cutoff* (double) len);
      required_aan = int (aan_cutoff* (double) len);
      has_aa2 = 0;

      int flag = Short_word ?
        check_this_short(len, seqi,
             taap, aap_begin, aap_list, has_aa2,
             NAA, aan_no, aan_list, aan_list_no,
                          aan_list_backup,
             hit_no, segb, NR90_no-1, frg2, segfb, NR90f_no-1, iden_no,
             required_aan, required_aa1, required_aa2) :
        check_this(len, seqi, 
             taap, aap_begin, aap_list, has_aa2,
             NAA, aan_no, aan_list, aan_list_no,
             hit_no, segb, NR90_no-1, iden_no,
             required_aan, required_aa1, required_aa2);

      if ( flag == 1) {       // if similar to old one delete it
        delete [] NR_seq[i];
        NR90_clstr[i] = -hit_no-1;  // (-hit_no-1) for non representatives
        NR90_iden[i] = iden_no * 100 / len;
      }
      else {                  // else add to NR90 db
        NR90_idx[NR90_no] = i;
        NR90_clstr[i] = NR90_no; // positive value for representatives
        NR90_iden[i] = 0;
        Short_word ? 
          add_in_lookup_table_short(aan_no, frg1, aan_list_backup, aan_list_no):
          add_in_lookup_table(aan_no, aan_list, aan_list_no);
        NR90f_idx[NR90_no] = NR90f_no;
        NR90f_no += frg1;
        NR90_no++;
      } // else
  
      if ( (i1+1) % 100 == 0 ) { 
        cerr << ".";
        if ( (i1+1) % 1000 == 0 )
          cout << i1+1 << " finished\t" << NR90_no << " clusters" << endl;
//      cout << i1 << "\t" << NR90_no << "\t" << len;
//      cout << "\t" << NN2 << "\t" << NN3 << "\t" << NN4 <<endl;
      }
    } // for (i1=SEG_b[sggi]; i1<=SEG_e[sggi]; i1++) {

    SEG90_b[sggi] = segb;  SEG90_e[sggi] = NR90_no-1;
    SEG90f_b[sggi] = segfb; SEG90f_e[sggi] = NR90f_no-1;
    if ( sggi < SEG_no-2 ) write_swap(sggi);   // if not last segment
  } // for (sggi=0; sggi<SEG_no; sggi++) {

//cout << NN2 << " " << NN3 << " " << NN4 << " " << NN5 << endl;
  for (i=0; i<NR90_no; i++)  delete [] NR_seq[ NR90_idx[i] ]; 

//  for (i=0; i<NAAN; i++)  
//    if ( AAP_lookup_size[i] > 0 ) {
//      delete [] AAP_lookup1[i];
//      delete [] AAP_lookup2[i];
//    }
//  delete [] AAP_lookup_no; delete [] AAP_lookup_size; 

  cout << "writing new database" << endl;
  // reopen db_in, and writing output:
  int NR_no1 = 0;
  in1.open(db_in); read_in = 0;
  while(1) {
    if ( in1.eof()) break;
    in1.getline(buffer1, MAX_LINE_SIZE-2, '\n');
    if ( buffer1[0] == '>' || buffer1[0] == ';') {
      if ( read_in ) { // write last record
         strcpy(raw_seq1, raw_seq);
         format_seq(raw_seq1);
         
         if ( strlen(raw_seq1) > length_of_throw ) {
           if (NR90_clstr[NR_no1] >= 0 ) out1 << raw_des << "\n" << raw_seq;
           if ((NR_seq[NR_no1] = new char[des_len] ) == NULL )
             bomb_error("memory");
           strncpy(NR_seq[NR_no1], raw_des, des_len-2);
           NR_seq[NR_no1][des_len-2]=0;
           NR_no1++;
         }
      }
      strncpy(raw_des, buffer1, MAX_DES-1);
      raw_seq[0] = 0;
    }
    else {
      read_in = 1;
      strcat(raw_seq, buffer1); strcat(raw_seq,"\n");
    }
  } // END while(1);
  if (1) { // the last record
    strcpy(raw_seq1, raw_seq);
    format_seq(raw_seq1);

    if ( strlen(raw_seq1) > length_of_throw ) {
      if (NR90_clstr[NR_no1] >= 0 ) out1 << raw_des << "\n" << raw_seq;
      if ((NR_seq[NR_no1] = new char[des_len] ) == NULL )
        bomb_error("memory");
      strncpy(NR_seq[NR_no1], raw_des, des_len-2);
      NR_seq[NR_no1][des_len-2]=0;
      NR_no1++;
    }
  }
  in1.close(); out1.close(); 
  // END reopen db_in, and writing output:

  cout << "writing clustering information" << endl;
  // write clstr information
  int *Clstr_no, *Clstr_size, *(*Clstr_list);
  if ((Clstr_no   = new int[NR90_no]) == NULL) bomb_error("Memory"); 
  if ((Clstr_size = new int[NR90_no]) == NULL) bomb_error("Memory"); 
  if ((Clstr_list = new int*[NR90_no]) == NULL) bomb_error("Memory");
  mem_size=5;
  for (i=0; i<NR90_no; i++) {
    Clstr_no[i]=0; Clstr_size[i]=mem_size;
    if((Clstr_list[i] = new int[mem_size]) == NULL) bomb_error("Memory");
  }

  for (i=0; i<NR_no; i++) {
    j1 = NR90_clstr[i];
    if ( j1 < 0 ) j1 =-j1-1;
    if ( Clstr_no[j1] == Clstr_size[j1] ) {
      int *buff1;
      if ((buff1=new int[Clstr_size[j1]]) == NULL) bomb_error("Memory");
      for (k=0; k<Clstr_no[j1]; k++) buff1[k] = Clstr_list[j1][k];
      if ( Clstr_size[j1] >0 ) delete [] Clstr_list[j1];
      if ((Clstr_list[j1] = new int[mem_size+Clstr_size[j1]]) == NULL)
        bomb_error("Memory");
      for (k=0; k<Clstr_no[j1]; k++) Clstr_list[j1][k] = buff1[k];
      Clstr_size[j1]+=mem_size;
    }
    Clstr_list[j1][ Clstr_no[j1]++ ] = i;
  }

  for (i=0; i<NR90_no; i++) {
    out2 << ">Cluster " << i << endl;
    for (k=0; k<Clstr_no[i]; k++) {
      j = Clstr_list[i][k];
      out2 << k << "\t" << NR_len[j] << "aa, "<< NR_seq[j] << "...";
      if ( NR90_iden[j]>0 ) out2 << " at " << int(NR90_iden[j]) << "%" << endl;
      else                  out2 << " *" << endl;
    }
  }
  out2.close();
  cout << "program completed !" << endl << endl;

} // END int main

///////////////////////FUNCTION of common tools////////////////////////////

int check_this(int len, char *seqi, 
               int *taap, INTs *aap_begin, INTs *aap_list, int &has_aa2,
               int NAA, int& aan_no, int *aan_list, INTs *aan_list_no,
               int &hit_no, int libb, int libe, int &iden_no,
               int required_aan, int required_aa1, int required_aa2) {

  int i, j, k, i1, j1, k1, i0, j0, k0, c22, sk, mm;

  // check_aan_list 
  aan_no = len - NAA + 1;
  if      ( NAA == 2) 
    for (j=0; j<aan_no; j++) 
      aan_list[j] = seqi[j]*NAA1 + seqi[j+1];
  else if ( NAA == 3)  
    for (j=0; j<aan_no; j++) 
      aan_list[j] = seqi[j]*NAA2 + seqi[j+1]*NAA1 + seqi[j+2];
  else if ( NAA == 4)  
    for (j=0; j<aan_no; j++) 
      aan_list[j] = 
        seqi[j]*NAA3+seqi[j+1]*NAA2 + seqi[j+2]*NAA1 + seqi[j+3];
  else if ( NAA == 5)  
    for (j=0; j<aan_no; j++)
      aan_list[j] = 
        seqi[j]*NAA4+seqi[j+1]*NAA3+seqi[j+2]*NAA2+seqi[j+3]*NAA1+seqi[j+4];
  else return FAILED_FUNC;
  
  quick_sort(aan_list,0,aan_no-1);
  for(j=0; j<aan_no; j++) aan_list_no[j]=1;
  for(j=aan_no-1; j; j--) {
    if (aan_list[j] == aan_list[j-1]) {
      aan_list_no[j-1] += aan_list_no[j];
      aan_list_no[j]=0;
    }
  }
  // END check_aan_list


  // lookup_aan
  for (j=libe; j>=libb; j--) look_and_count[j]=0;
  for (j0=0; j0<aan_no; j0++) {
    if ( j1=aan_list_no[j0] ) { 
      j = aan_list[j0];
      k1 = AAP_lookup_no[j];
//    int *ptr1 = AAP_lookup1[j];
//    int *ptr2 = AAP_lookup2[j];
//    for (k=0; k<k1; k++)
//      look_and_count[ptr1[k]] += ( j1 < ptr2[k]) ? j1 : ptr2[k] ;

                                      //for pointer's sake
      int *ptr1 = AAP_lookup1[j]-1; //note I shift the pointer left 1
      INTs *ptr2 = AAP_lookup2[j]-1; //so no ptr1[0] is out of range
      for (k=k1; k; k--)            //and prt1[lastindex] is the last element
        look_and_count[ptr1[k]] += ( j1 < ptr2[k]) ? j1 : ptr2[k] ;
    }
  }
  // END lookup_aan

  // contained_in_old_lib()
  int band_left, band_right, best_score, band_width1, best_sum, len2;
  int len1 = len - 1;
  char *seqj;
  int flag = 0;      // compare to old lib
  for (j=libe; j>=libb; j--) {
    NN2++;
    if ( look_and_count[j] < required_aan ) continue;
    len2 = NR_len[NR90_idx[j]];
    seqj = NR_seq[NR90_idx[j]];
    
    if ( has_aa2 == 0 )  { // calculate AAP array
      for (sk=0; sk<NAA2; sk++) taap[sk] = 0;
      for (j1=0; j1<len1; j1++) {
        c22= seqi[j1]*NAA1 + seqi[j1+1]; 
        taap[c22]++;
      }
      for (sk=0,mm=0; sk<NAA2; sk++) { 
        aap_begin[sk] = mm; mm+=taap[sk]; taap[sk] = 0;
      }
      for (j1=0; j1<len1; j1++) {
        c22= seqi[j1]*NAA1 + seqi[j1+1]; 
        aap_list[aap_begin[c22]+taap[c22]++] =j1;
      }
      has_aa2 = 1;
    }

    NN3++;
    band_width1 = (BAND_width < len+len2-2 ) ? BAND_width : len+len2-2;
    diag_test_aapn(seqj, len, len2, taap, aap_begin, 
                           aap_list, best_sum,
                           band_width1, band_left, band_right, required_aa1);
    if ( best_sum < required_aa2 ) continue;
    
    NN4++;
    local_band_align(seqi, seqj, len, len2, mat,
                             best_score, iden_no, band_left, band_right);
    if ( iden_no < required_aa1 ) continue;
    
    NN5++; flag = 1; break; // else flag = 1, and break loop
  }
  hit_no = j;
  return flag;
  // END contained_in_old_lib()
} // END check_this


int check_this_short(int len, char *seqi, 
               int *taap, INTs *aap_begin, INTs *aap_list, int &has_aa2,
               int NAA, int& aan_no, int *aan_list, INTs *aan_list_no,
                                     int *aan_list_backup,
               int &hit_no, int libb, int libe, 
               int frg2, int libfb, int libfe, int &iden_no,
               int required_aan, int required_aa1, int required_aa2) {

  int i, j, k, i1, j1, k1, i0, j0, k0, c22, sk, mm, fn;

  aan_no = len - NAA + 1;
  if      ( NAA == 2)
    for (j=0; j<aan_no; j++)
      aan_list_backup[j] = aan_list[j] = seqi[j]*NAA1 + seqi[j+1];
  else if ( NAA == 3)
    for (j=0; j<aan_no; j++)
      aan_list_backup[j] = aan_list[j] = 
        seqi[j]*NAA2 + seqi[j+1]*NAA1 + seqi[j+2];
  else return FAILED_FUNC;

  
  quick_sort(aan_list,0,aan_no-1);
  for(j=0; j<aan_no; j++) aan_list_no[j]=1;
  for(j=aan_no-1; j; j--) {
    if (aan_list[j] == aan_list[j-1]) {
      aan_list_no[j-1] += aan_list_no[j];
      aan_list_no[j]=0;
    }
  }
  // END check_aan_list


  // lookup_aan
  for (j=libfe; j>=libfb; j--) look_and_count[j]=0;
  for (j0=0; j0<aan_no; j0++) {
    if ( j1=aan_list_no[j0] ) { 
      j = aan_list[j0];
      k1 = AAP_lookup_no[j];
                                      //for pointer's sake
      int *ptr1 = AAP_lookup1[j]-1;   //note I shift the pointer left 1
      INTs *ptr2 = AAP_lookup2[j]-1;  //so no ptr1[0] is out of range
      for (k=k1; k; k--)              //and prt1[lastindex] is the last element
        look_and_count[ptr1[k]] += ( j1 < ptr2[k]) ? j1 : ptr2[k] ;
    }
  }
  // END lookup_aan

  // contained_in_old_lib()
  int band_left, band_right, best_score, band_width1, best_sum, len2, best1,sum;
  int len1 = len - 1;
  INTs *lookptr;

  char *seqj;
  int flag = 0;      // compare to old lib
  for (j=libe; j>=libb; j--) {
    NN2++;
    len2 = NR_len[NR90_idx[j]];

    k = (len2 - NAA) / Frag_size + 1;
    lookptr = &look_and_count[ NR90f_idx[j] ];

    if ( frg2 >= k ) {
      best1=0;
      for (j1=0; j1<k; j1++) best1 += lookptr[j1];
    }
    else {
      sum = 0;
      for (j1=0; j1<frg2; j1++) sum += lookptr[j1];
      best1 = sum;
      for (j1=frg2; j1<k; j1++) {
        sum += lookptr[j1] - lookptr[j1-frg2];
        if (sum > best1) best1 = sum;
      }
    }

    if ( best1 < required_aan ) continue;

    seqj = NR_seq[NR90_idx[j]];
    
    if ( has_aa2 == 0 )  { // calculate AAP array
      for (sk=0; sk<NAA2; sk++) taap[sk] = 0;
      for (j1=0; j1<len1; j1++) {
        c22= seqi[j1]*NAA1 + seqi[j1+1]; 
        taap[c22]++;
      }
      for (sk=0,mm=0; sk<NAA2; sk++) { 
        aap_begin[sk] = mm; mm+=taap[sk]; taap[sk] = 0;
      }
      for (j1=0; j1<len1; j1++) {
        c22= seqi[j1]*NAA1 + seqi[j1+1]; 
        aap_list[aap_begin[c22]+taap[c22]++] =j1;
      }
      has_aa2 = 1;
    }

    NN3++;
    band_width1 = (BAND_width < len+len2-2 ) ? BAND_width : len+len2-2;
    diag_test_aapn(seqj, len, len2, taap, aap_begin, 
                           aap_list, best_sum,
                           band_width1, band_left, band_right, required_aa1);
    if ( best_sum < required_aa2 ) continue;
    
    NN4++;
    local_band_align(seqi, seqj, len, len2, mat,
                             best_score, iden_no, band_left, band_right);
    if ( iden_no < required_aa1 ) continue;
    
    NN5++; flag = 1; break; // else flag = 1, and break loop
  }
  hit_no = j;
  return flag;
  // END contained_in_old_lib()
} // END check_this


int add_in_lookup_table(int aan_no, int *aan_list, INTs *aan_list_no) { 
  int i, j, k, i1, j1, k1, i0, j0, k0;

  for (j0=0; j0<aan_no; j0++) {
    if ( j1=aan_list_no[j0] ) { 
      j = aan_list[j0];
  
      if ( AAP_lookup_no[j] == AAP_lookup_size[j] ) { // resize array
        int *buff1;
        k0 = 1;
        if ( AAP_lookup_size[j] < BUFFER ) {buff1 = Buffer; k0=0;}
        else if ((buff1=new int[AAP_lookup_size[j]]) == NULL)
           bomb_error("Memory");
        for (k=0; k<AAP_lookup_no[j]; k++) buff1[k] = AAP_lookup1[j][k];
        if ( AAP_lookup_size[j] >0 ) delete [] AAP_lookup1[j];
        if ((AAP_lookup1[j] = new int[mem_size+AAP_lookup_size[j]]) == NULL)
          bomb_error("Memory");
        for (k=0; k<AAP_lookup_no[j]; k++) AAP_lookup1[j][k] = buff1[k];
    
        for (k=0; k<AAP_lookup_no[j]; k++) buff1[k] = AAP_lookup2[j][k];
        if ( AAP_lookup_size[j] >0 ) delete [] AAP_lookup2[j];
        if ((AAP_lookup2[j] = new INTs[mem_size+AAP_lookup_size[j]]) == NULL)
          bomb_error("Memory");
        for (k=0; k<AAP_lookup_no[j]; k++) AAP_lookup2[j][k] = buff1[k];
    
        AAP_lookup_size[j] += mem_size;
        if (k0 == 1) delete [] buff1;
      }
      AAP_lookup1[j][AAP_lookup_no[j]] = NR90_no;
      AAP_lookup2[j][AAP_lookup_no[j]] = j1;
      AAP_lookup_no[j]++;
    }
  } //  for (j0=0; j0<aan_no; j0++) {
  return 0;
}  // END add_in_lookup_table


int add_in_lookup_table_short(int aan_no, int frg1,
                              int *aan_list, INTs *aan_list_no) {
  int i, j, k, i1, j1, k1, i0, j0, k0, fra;

  for (i=0; i<frg1; i++) {
    k = (i+1)*Frag_size < aan_no ? (i+1)*Frag_size-1: aan_no-1;
    quick_sort(aan_list, i*Frag_size, k);
  }
  for(j=aan_no-1; j; j--) {
    if (aan_list[j] == aan_list[j-1]) {
      aan_list_no[j-1] += aan_list_no[j];
      aan_list_no[j]=0;
    }
  }
  // END check_aan_list

  for (j0=0; j0<aan_no; j0++) {
    if ( j1=aan_list_no[j0] ) {
      j = aan_list[j0];
      fra=j0/Frag_size;

      if ( AAP_lookup_no[j] == AAP_lookup_size[j] ) { // resize array
        int *buff1;
        k0 = 1;
        if ( AAP_lookup_size[j] < BUFFER ) {buff1 = Buffer; k0=0;}
        else if ((buff1=new int[AAP_lookup_size[j]]) == NULL)
           bomb_error("Memory");

        for (k=0; k<AAP_lookup_no[j]; k++) buff1[k] = AAP_lookup1[j][k];
        if ( AAP_lookup_size[j] >0 ) delete [] AAP_lookup1[j];
        if ((AAP_lookup1[j] = new int[mem_size+AAP_lookup_size[j]]) == NULL)
          bomb_error("Memory");
        for (k=0; k<AAP_lookup_no[j]; k++) AAP_lookup1[j][k] = buff1[k];

        for (k=0; k<AAP_lookup_no[j]; k++) buff1[k] = AAP_lookup2[j][k];
        if ( AAP_lookup_size[j] >0 ) delete [] AAP_lookup2[j];
        if ((AAP_lookup2[j] = new INTs[mem_size+AAP_lookup_size[j]]) == NULL)
          bomb_error("Memory");
        for (k=0; k<AAP_lookup_no[j]; k++) AAP_lookup2[j][k] = buff1[k];
   
        AAP_lookup_size[j] += mem_size;
        if (k0 == 1) delete [] buff1;
      }
      AAP_lookup1[j][AAP_lookup_no[j]] = NR90f_no + fra;
      AAP_lookup2[j][AAP_lookup_no[j]] = j1;
      AAP_lookup_no[j]++;
    }
  } //  for (j0=0; j0<aan_no; j0++) {
  return 0;
}  // END add_in_lookup_table

// Replace the in-memory word lookup table with the one stored in the swap
// file of segment sgj (written earlier by write_swap).
//
// File layout, repeated for every word 0..NAAN-1: the entry count as a raw
// int, followed -- when the count is not zero -- by that many ints (ids) and
// that many INTs (occurrence counts).
//
// Returns OK_FUNC; an unreadable or truncated file ends the program through
// bomb_error().
//
// NOTE(review): the stream is opened in the default mode, which is fine where
// text and binary modes are the same; confirm before using on a platform
// that translates line endings.
int read_swap (int sgj) {
  int i;
  ifstream fswap(db_swap[sgj]);
  if (! fswap) bomb_error("Can not open file");

  for (i=0; i<NAAN; i++) {
    // A bucket is allocated exactly when its size is non-zero. Its count may
    // already have been reset to 0 while the arrays are still allocated, so
    // the size -- not the count -- decides what has to be freed.
    if ( AAP_lookup_size[i] > 0 ) {
      delete [] AAP_lookup1[i];
      delete [] AAP_lookup2[i];
      AAP_lookup_size[i] = 0;
    }

    fswap.read( reinterpret_cast<char *>(&AAP_lookup_no[i]), sizeof(int));
    if (! fswap) bomb_error("Can not read swap file");
    AAP_lookup_size[i] = AAP_lookup_no[i];
    if (AAP_lookup_no[i] == 0 ) continue;
    if ((AAP_lookup1[i] = new int[AAP_lookup_no[i]]) == NULL) 
      bomb_error("Memory");
    if ((AAP_lookup2[i] = new INTs[AAP_lookup_no[i]]) == NULL) 
      bomb_error("Memory");

    fswap.read(reinterpret_cast<char *>(AAP_lookup1[i]),
               sizeof(int) * AAP_lookup_no[i]);
    fswap.read(reinterpret_cast<char *>(AAP_lookup2[i]),
               sizeof(INTs) * AAP_lookup_no[i]);
    if (! fswap) bomb_error("Can not read swap file");
  }

  fswap.close();
  return OK_FUNC;
} // END read_swap


// Dump the in-memory lookup table to the swap file db_swap[sgj] so that
// read_swap() can restore it later.
//
// File layout, repeated NAAN times (raw host byte order):
//   int  count = AAP_lookup_no[i]
//   int  AAP_lookup1[i][count] (omitted when count == 0)
//   INTs AAP_lookup2[i][count] (omitted when count == 0)
//
// The stream is opened in binary mode because the payload is raw memory.
// Halts through bomb_error() if the file cannot be opened or any write
// fails (for example a full disk).  Returns OK_FUNC otherwise.
int write_swap (int sgj) {
  ofstream fswap(db_swap[sgj], ios::out | ios::binary);
  if (! fswap) bomb_error("Can not open file");

  for (int i=0; i<NAAN; i++) {
    const int count = AAP_lookup_no[i];
    fswap.write(reinterpret_cast<const char *>(&count), sizeof(int));
    if (count == 0) continue;
    fswap.write(reinterpret_cast<const char *>(AAP_lookup1[i]), sizeof(int) * count);
    fswap.write(reinterpret_cast<const char *>(AAP_lookup2[i]), sizeof(INTs) * count);
  }
  fswap.close();
  // any failed write or flush leaves the stream in a failed state
  if (! fswap) bomb_error("Can not write file");
  return OK_FUNC;
} // END write_swap


// Write the command-line summary to stdout and terminate the process with
// exit status 1.  `arg` is the program name shown on the first line.
// The function never returns to its caller.
int print_usage (char *arg) {
  cout << "Usage "<< arg << " [Options] " << endl
       << "Options " << endl
       << "         -i in_dbname, required" << endl
       << "         -o out_dbname, required" << endl
       << "         -c threshold, default 0.9" << endl
       << "         -b band_width, default " << BAND_width << endl
       << "         -M max_memory, default 400 (Mbyte) " << endl
       << "         -n word_length, default 4" << endl
       << "         -l length_of_throw_away_sequences, default 10" << endl
       << "         -d length of description line in the .clstr file, default 20" << endl
       << "         -h, print this help" << endl;
  exit(1);
} // END print_usage


// Report a fatal error on stderr, framed by a banner, and terminate the
// process with exit status 1.  Never returns.
void bomb_error(char *message) {
  cerr << "\nFatal Error\n"
       << message << endl
       << "\nProgram halted !! \n\n";
  exit (1);
} // END void bomb_error


// Normalise a NUL-terminated sequence string in place: every character is
// upper-cased, characters that are not alphabetic are dropped, and the
// result is re-terminated (it is never longer than the input).
//
// Each byte is converted to unsigned char before it reaches toupper() /
// isalpha(): passing a negative plain char to those functions is undefined
// behaviour, which bytes >= 0x80 would otherwise trigger.
void format_seq(char *seq) {
  char *out = seq;   // next write position, never ahead of the read position

  for (const char *in = seq; *in != 0; in++) {
    const int upper = toupper(static_cast<unsigned char>(*in));
    if ( isalpha(upper) ) *out++ = static_cast<char>(upper);
  }
  *out = 0;
}


////For smiple len1 <= len2
////walk along all diag path of two sequences,
////find the diags with most aap
////return top n diags
////for diag 0 means direction (0,0) -> (1,1)
////         1 means direction (0,1) -> (1,2)
////        -1 means direction (1,0) -> (2,1)
int diag_test_aapn(char iseq2[], int len1, int len2, int *taap,
        INTs *aap_begin, INTs *aap_list, int &best_sum,
        int band_width, int &band_left, int &band_right, int required_aa1) {
  int i, i1, j, k, l, m, n;
  int nall = len1+len2-1;

  int *diag_score, *pp;
//  if ( (diag_score = new int[nall] ) == NULL ) bomb_error("Memory");
  diag_score = DIAG_score;
  for (pp=diag_score, i=nall; i; i--, pp++) *pp=0;

  int bi, bj, c22;
  INTs *bip;
  int len11 = len1-1;
  int len22 = len2-1;
  i1 = len11;
  for (i=0; i<len22; i++,i1++) {
    c22 = iseq2[i]*NAA1 + iseq2[i+1];
    if ( (j=taap[c22]) == 0) continue;
    bip = aap_list+ aap_begin[c22];     //    bi = aap_begin[c22];
    for (; j; j--, bip++) {  //  for (j=0; j<taap[c22]; j++,bi++) {
      diag_score[i1 - *bip]++;
    }
  }

  //find the best band range
  int band_b = required_aa1;
  int band_e = nall - required_aa1;
  int band_m = ( band_b+band_width-1 < band_e ) ? band_b+band_width-1 : band_e;
  int best_score=0;
  for (i=band_b; i<=band_m; i++) best_score += diag_score[i];
  int from=band_b;
  int end =band_m;
  int score = best_score;  
  for (k=from, j=band_m+1; j<band_e; j++) {
    score -= diag_score[k++]; 
    score += diag_score[j]; 
    if ( score > best_score ) {
      from = k;
      end  = j;
      best_score = score;
    }
  }
  for (j=from; j<=end; j++) { // if aap pairs fail to open gap
    if ( diag_score[j] < 5 ) { best_score -= diag_score[j]; from++;}
    else break;
  }
  for (j=end; j>=from; j--) { // if aap pairs fail to open gap
    if ( diag_score[j] < 5 ) { best_score -= diag_score[j]; end--;}
    else break;
  }

//  delete [] diag_score;
  band_left = from-len1+1; 
  band_right= end-len1+1;
  best_sum = best_score;
  return OK_FUNC;
}
// END diag_test_aapn
 

////Local alignment of two sequences restricted to a band of diagonals.
////Diagonal number = column (position in iseq2) - row (position in iseq1):
////for band 0 means direction (0,0) -> (1,1)
////         1 means direction (0,1) -> (1,2)
////        -1 means direction (1,0) -> (2,1)
////iseq1/iseq2 are integer-coded sequences, len1/len2 their lengths,
////mat supplies the substitution matrix and the gap penalty table.
////band_left/band_right are the first and last diagonal of the band.
////Outputs: best_score = highest cell score found;
////         iden_no    = identity count of the cell that last raised
////                      best_score inside the main loop (the border
////                      initialisation raises best_score but not iden_no).
////Returns FAILED_FUNC when the band is empty or reaches outside the
////matrix (checked before anything is allocated), OK_FUNC otherwise.
int local_band_align(char iseq1[], char iseq2[], int len1, int len2,
                     AA_MATRIX &mat, int &best_score, int &iden_no,
                     int band_left, int band_right) {
  int i, j, k, l, m, n, j1;
  int ii, jj, kk;
  int best_score1, iden_no1, best_i, best_j, best_j1;
  int *gap_array;
  iden_no = 0;

  // reject bands that are empty or not fully inside the len1 x len2 matrix
  if ( (band_right >= len2 ) ||
       (band_left  <= -len1) ||
       (band_left  > band_right) ) return FAILED_FUNC;

  // allocate mem for score_mat[len1][band_width] etc:
  // one row per residue of iseq1, one column per diagonal of the band
  int band_width = band_right - band_left + 1;
  int *(*score_mat), *(*iden_mat);
  if ((score_mat = new int * [len1]) == NULL) bomb_error("Memory");
  if ((iden_mat  = new int * [len1]) == NULL) bomb_error("Memory");
  for (i=0; i<len1; i++) {
    if ((score_mat[i] = new int [band_width]) == NULL) bomb_error("Memory");
    if ((iden_mat[i]  = new int [band_width]) == NULL) bomb_error("Memory");
  }
  //here index j1 refer to band column: j1 = j - i - band_left
  //only score_mat is cleared; iden_mat cells are written together with
  //their score cell before they are used
  for (i=0; i<len1; i++) for (j1=0; j1<band_width; j1++) score_mat[i][j1] =  0;

  gap_array  = mat.gap_array;
  best_score = 0;

  if (band_left < 0) {  //set score to left border of the matrix within band
    // cells (i, 0) for the negative diagonals k = -i of the band
    int tband = (band_right < 0) ? band_right : 0;
    for (k=band_left; k<tband; k++) {
      i = -k;
      j1 = k-band_left;
      if ( ( score_mat[i][j1] = mat.matrix[iseq1[i]][iseq2[0]] ) > best_score) 
        best_score = score_mat[i][j1];
      iden_mat[i][j1] = (iseq1[i] == iseq2[0]) ? 1 : 0;
    }
  }

  if (band_right >=0) { //set score to top border of the matrix within band
    // cells (0, j) for the non-negative diagonals j of the band
    int tband = (band_left > 0) ? band_left : 0;
    for (i=0,j=tband; j<=band_right; j++) {
      j1 = j-band_left;
      if ( ( score_mat[i][j1] = mat.matrix[iseq1[i]][iseq2[j]] ) > best_score)
        best_score = score_mat[i][j1];
      iden_mat[i][j1] = (iseq1[i] == iseq2[j]) ? 1 : 0;
    }
  }

  // fill the remaining cells row by row
  for (i=1; i<len1; i++) {
    for (j1=0; j1<band_width; j1++) {
      j = j1+i+band_left;      // real column of band cell (i, j1)
      if ( j<1 ) continue;     // column 0 was handled by the border setup
      if ( j>=len2) continue;  // band sticks out past the end of iseq2

      int sij = mat.matrix[iseq1[i]][iseq2[j]];
      int iden_ij = (iseq1[i] == iseq2[j] ) ? 1 : 0;
      int s1, *mat_row;
      int k0, k_idx;

      // from (i-1,j-1): same band column one row up; a non-positive
      // predecessor is dropped so the alignment restarts here (local)
      if ( (best_score1 = score_mat[i-1][j1] )> 0 ) {
        iden_no1 = iden_mat[i-1][j1];
      }
      else {
        best_score1 = 0;
        iden_no1 = 0;
      }

      // from last row: cells (i-1, j-2), (i-1, j-3), ... i.e. a gap that
      // skips kk+1 residues of iseq2; k0 keeps the real column >= 0
      mat_row = score_mat[i-1];
      k0 = (-band_left+1-i > 0) ? -band_left+1-i : 0;
      for (k=j1-1, kk=0; k>=k0; k--, kk++) {
        if ( (s1 = mat_row[k] + gap_array[kk] ) > best_score1 ){
           best_score1 = s1;
           iden_no1 = iden_mat[i-1][k];
        }
      }

      // from column j-1: cells (i-2, j-1), (i-3, j-1), ... i.e. a gap that
      // skips kk+1 residues of iseq1; k0 keeps the band column jj inside
      // the band (row >= j-1-band_right) and the row >= 0
      k0 = (j-band_right-1 > 0) ? j-band_right-1 : 0;
      for(k=i-2, jj=j1+1,kk=0; k>=k0; k--,kk++,jj++) {
        if ( (s1 = score_mat[k][jj] + gap_array[kk] ) > best_score1 ){
           best_score1 = s1;
           iden_no1 = iden_mat[k][jj];
        }
      }

      // extend the best predecessor with the current residue pair
      best_score1 += sij;
      iden_no1    += iden_ij;
      score_mat[i][j1] = best_score1;
      iden_mat[i][j1]  = iden_no1;

      if ( best_score1 > best_score ) {
        best_score = best_score1;
        iden_no = iden_no1;
      }
    } // END for (j1=0; j1<band_width; j1++)
  } // END for (i=1; i<len1; i++)

  // release the per-call matrices
  for (i=0; i<len1; i++) {
    delete [] score_mat[i]; 
    delete [] iden_mat[i];
  }
  delete [] score_mat;
  delete [] iden_mat;

  return OK_FUNC;
} // END int local_band_align


// class function definition
// Residue letters in the row order of the BLOSUM62 table: the 20 standard
// amino acids followed by the ambiguity codes B, Z and X.
char aa[] = {"ARNDCQEGHILKMFPSTWYVBZX"};
// Integer code for each position of aa[]: B shares N's code (2),
// Z shares E's code (6) and X gets 20.
int aa_idx[MAX_AA] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,2,6,20};
// Letter -> integer code, indexed by (letter - 'A').  Letters without a
// residue of their own (J, O, U) share code 20 with X.
int aa2idx[] = {0, 2, 4, 3, 6, 13,7, 8, 9,20,11,10,12, 2,20,14, 5, 1,15,16,20,19,17,20,18, 6};
    // idx for  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  P  Q  R  S  T  U  V  W  X  Y  Z
    // so aa2idx[ X - 'A'] => idx_of_X, eg aa2idx['A' - 'A'] => 0, and aa2idx['M'-'A'] => 12

// BLOSUM62 substitution scores stored as a lower triangle, row by row:
// row r holds the r+1 scores of residue r against residues 0..r, in the
// order of the letters listed under the table.  AA_MATRIX copies these
// values into its symmetric matrix in exactly this order.
// NOTE(review): setiseq() encodes 'X' as 20, and row/column 20 of this
// table is the B row, not the X row -- confirm that this is intended.
int BLOSUM62[] = {
  4,                                                                  // A
 -1, 5,                                                               // R
 -2, 0, 6,                                                            // N
 -2,-2, 1, 6,                                                         // D
  0,-3,-3,-3, 9,                                                      // C
 -1, 1, 0, 0,-3, 5,                                                   // Q
 -1, 0, 0, 2,-4, 2, 5,                                                // E
  0,-2, 0,-1,-3,-2,-2, 6,                                             // G
 -2, 0, 1,-1,-3, 0, 0,-2, 8,                                          // H
 -1,-3,-3,-3,-1,-3,-3,-4,-3, 4,                                       // I
 -1,-2,-3,-4,-1,-2,-3,-4,-3, 2, 4,                                    // L
 -1, 2, 0,-1,-3, 1, 1,-2,-1,-3,-2, 5,                                 // K
 -1,-1,-2,-3,-1, 0,-2,-3,-2, 1, 2,-1, 5,                              // M
 -2,-3,-3,-3,-2,-3,-3,-3,-1, 0, 0,-3, 0, 6,                           // F
 -1,-2,-2,-1,-3,-1,-1,-2,-2,-3,-3,-1,-2,-4, 7,                        // P
  1,-1, 1, 0,-1, 0, 0, 0,-1,-2,-2, 0,-1,-2,-1, 4,                     // S
  0,-1, 0,-1,-1,-1,-1,-2,-2,-1,-1,-1,-1,-2,-1, 1, 5,                  // T
 -3,-3,-4,-4,-2,-2,-3,-2,-2,-3,-2,-3,-1, 1,-4,-3,-2,11,               // W
 -2,-2,-2,-3,-2,-1,-2,-3, 2,-1,-1,-2,-1, 3,-3,-2,-2, 2, 7,            // Y
  0,-3,-3,-3,-1,-2,-2,-3,-3, 3, 1,-2, 1,-1,-2,-2, 0,-3,-1, 4,         // V
 -2,-1, 3, 4,-3, 0, 1,-1, 0,-3,-4, 0,-3,-3,-2, 0,-1,-4,-3,-3, 4,      // B
 -1, 0, 0, 1,-3, 3, 4,-2, 0,-3,-3, 1,-1,-3,-1, 0,-1,-3,-2,-2, 1, 4,   // Z
  0,-1,-1,-1,-2,-1,-1,-1,-1,-1,-1,-1,-1,-1,-2, 0, 0,-2,-1,-1,-1,-1,-1 // X
//A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X
//0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19  2  6 20
};


// Print an integer-coded sequence on stdout as ">>" followed by one letter
// per residue (looked up in aa[]) and a newline.  Always returns 0.
//
// The letters are streamed one at a time instead of being assembled in a
// fixed char[MAX_SEQ] stack buffer, so a sequence with len >= MAX_SEQ can
// no longer write past the end of that buffer.
int outiseq(char iseq[], int len) {
  cout << ">>";
  for (int i=0; i<len; i++) cout << aa[iseq[i]];
  cout << endl;
  return 0;
}

// Convert the first `len` characters of `seq` in place from residue letters
// to the integer codes of aa2idx[].  Always returns 0.
//
// aa2idx[] only has entries for 'A'..'Z'; any other character is given the
// code of 'X' instead of indexing outside the table.
int setiseq(char *seq, int len) {
  const int unknown_code = aa2idx['X' - 'A'];
  for (int i=0; i<len; i++) {
    const char letter = seq[i];
    if (letter >= 'A' && letter <= 'Z') seq[i] = aa2idx[letter - 'A'];
    else                                seq[i] = unknown_code;
  }
  return 0;
} // END setiseq

/////////////////
// Construct the scoring object with its defaults: gap open -11, gap
// extension -1, a freshly allocated penalty table of MAX_GAP entries and
// the symmetric substitution matrix expanded from BLOSUM62.
AA_MATRIX::AA_MATRIX() {
  gap = -11;
  ext_gap = -1;

  gap_array = new int[MAX_GAP];
  if ( gap_array == NULL ) bomb_error("memory");
  // entry n = opening penalty plus n extension penalties
  for (int n = 0; n < MAX_GAP; n++) gap_array[n] = gap + n * ext_gap;

  // BLOSUM62 is a lower triangle; mirror every value across the diagonal
  int next = 0;
  for (int row = 0; row < MAX_AA; row++) {
    for (int col = 0; col <= row; col++) {
      matrix[row][col] = BLOSUM62[next];
      matrix[col][row] = matrix[row][col];
      next++;
    }
  }
} // END AA_MATRIX::AA_MATRIX()

// Restore the defaults on an existing object: gap open -11, gap extension
// -1, the penalty table refilled and the matrix reloaded from BLOSUM62.
// gap_array is reused as is; nothing is allocated here.
void AA_MATRIX::init() {
  gap = -11;
  ext_gap = -1;

  // entry n = opening penalty plus n extension penalties
  int n = 0;
  while (n < MAX_GAP) {
    gap_array[n] = gap + n * ext_gap;
    n++;
  }

  // walk the lower-triangular table once, filling both mirror cells
  const int *src = BLOSUM62;
  for (int row = 0; row < MAX_AA; row++) {
    for (int col = 0; col <= row; col++, src++) {
      matrix[row][col] = *src;
      matrix[col][row] = matrix[row][col];
    }
  }
} // END void AA_MATRIX::init()

// Install new gap penalties (gap1 = opening, ext_gap1 = extension) and
// rebuild the penalty table: entry n = gap + n * ext_gap for n < MAX_GAP.
void AA_MATRIX::set_gap(int gap1, int ext_gap1) {
  gap = gap1;
  ext_gap = ext_gap1;

  int n = 0;
  while (n < MAX_GAP) {
    gap_array[n] = gap + n * ext_gap;
    n++;
  }
} // END void AA_MATRIX::set_gap

// Load a substitution matrix from mat1, which must hold the lower triangle
// row by row (row r contributes r+1 values, MAX_AA rows in total).  Every
// value is stored in both mirror cells so the matrix stays symmetric.
void AA_MATRIX::set_matrix(int *mat1) {
  const int *src = mat1;
  for (int row = 0; row < MAX_AA; row++) {
    for (int col = 0; col <= row; col++, src++) {
      matrix[row][col] = *src;
      matrix[col][row] = matrix[row][col];
    }
  }
} // END void AA_MATRIX::set_matrix



// Sort a[lo0..hi0] (both bounds inclusive) into ascending order, in place.
// To sort a whole array of `no` elements call quick_sort(a, 0, no-1).
// Always returns 0.
int quick_sort (int *a, int lo0, int hi0 ) {
  if ( hi0 <= lo0 ) return 0;          // zero or one element: nothing to do

  const int pivot = a[ ( lo0 + hi0 ) / 2 ];
  int left  = lo0;
  int right = hi0;

  // partition: move both cursors toward each other, swapping misplaced pairs
  do {
    while ( left  < hi0 && a[left]  < pivot ) ++left;
    while ( right > lo0 && a[right] > pivot ) --right;
    if ( left > right ) break;

    const int held = a[left];
    a[left]  = a[right];
    a[right] = held;
    ++left;
    --right;
  } while ( left <= right );

  // recurse into whichever sides still hold more than one element
  if ( lo0 < right ) quick_sort(a, lo0, right);
  if ( left < hi0 )  quick_sort(a, left, hi0);
  return 0;
} // quick_sort

/////////////////////////// END ALL ////////////////////////
