#!/usr/bin/perl -w

# PartiGene.pl version 2.2
# A script to take a set of sequences, cluster them using CLOBB
# assemble the clusters into contigs, allow for BLASTing
# and insert into the database.

# Requirements :
# A config file - will be installed when running the program for the first time
# Perl Modules - DBD:Pg and BioPerl
# DBD::Pg is part of the DBI is available from cpan - http://www.cpan.org/
# Bioperl is available from their site - http://www.bioperl.org/
# Phrap and Cross_match - http://www.phrap.org
# CLOBB2.pl - distributed with PartiGene
# Standalone BLAST - http://www.ncbi.nlm.nih.gov/
# wget utility - included with most linux distributions

# Written by John Parkinson (john.parkinson@ed.ac.uk)
# Maintained and further developed by Ralf Schmid (R.Schmid@ed.ac.uk)
# Last updated 14/12/2004 by Ralf Schmid
# Copyright (C) 2003 John Parkinson

# Thanks to Tim Booth, EGTDC Oxford for comments and some help with the
# implementation of DBD::Pg

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.


use strict;
use File::stat;
use Tie::File;
use Term::ANSIColor;
use Bio::SearchIO;
use Bio::SeqIO;
use DBI;
  use DBD::Pg;  
use Term::ReadLine;
use Time::localtime;


my @PATH=split(":","$ENV{'PATH'}");  ### Get users Path
my $home= $ENV{HOME};
my ($vector, $blast, $QUAL_SCORE, $database, $answer, $seqrep);
my ($seqlink);
my ($genus, $species, $genusspecies, $cluster_id, $blastdb, @prot_db, @nuc_db);
my ($desc, $seq, $seq_flag, $ac, $i, $j, $l, $n, $file, $flag, $end, $length);
my ($line, $qual, $qflag, $line1, $qualseq, $beg_low, $end_low, $beg_low1, $end_low1);
my (@inseq, @inqual, $project, $title, $clone_id, @blast_DB, @blast_EXE, $dbid, $input);
my (@prog, @dir, @dbs, $conn, $yflag, $size);
my ($found_a_quality_file);
my $trace_problem_flag=0;


########################################################################
################### Readline stuff #####################################
########################################################################
my $read_gnu;
my $term = new Term::ReadLine 'sample';
$term ->ornaments(0);	#stops prompt getting underlined
my $attribs = $term->Attribs;	
$attribs->{completion_entry_function} = $attribs->{filename_completion_function};

if ($term ->ReadLine() =~ /Gnu$/) {	#returns the actual package that executes the commands - we need gnu
	$read_gnu = 1;
}
else {
  print "The readline package Term::ReadLine:Gnu was not found.\nInstalling this package will make interaction with PartiGene more friendly.\nContinuing...\n";
  $read_gnu = 0;
}


########################################################################
################### First read in the config file ######################
########################################################################

my $filename = "~/.partigene.conf";

$filename =~ s{ ^ ~ ( [^/]* ) }
              { $1
                    ? (getpwnam($1))[7]
                    : ( $ENV{HOME} || $ENV{LOGDIR}
                         || (getpwuid($>))[7]
                       )
}ex;


#### create standard config file if not found
unless (-e "$filename") {&create_conf();}

open (CONFILE,"$filename") ||  die "Can't find configuration file\n";
while (my $line=<CONFILE>) {
  if ($line=~/^VECTOR\=(.+)/i) { $vector=$1; }
  if ($line=~/^DATABASE\=(.+)/i) { $database=$1; } 
  if ($line=~/^SEQREP\=(.+)/i) { $seqrep=$1; } 
  if ($line=~/^QUALSCORE\=(\d+)/i) { $QUAL_SCORE=$1; } 
  if ($line=~/^BLASTDB\=(.+)/i) { $blastdb=$1; } 
  if ($line=~/^SEQLINK\=(.+)/i) { $seqlink=$1; } 
}
close (CONFILE);


#############################################################################
############################ test & expert mode #############################
#############################################################################

my $expert =0;
#### think about check module in future
if (@ARGV) {
  my $test = shift (@ARGV); my $tmp;
  if ($test eq "test") { 
    print "\nWelcome to the PartiGene test mode.\n\n";
    if ($tmp=&find_program("wget"))        {print "searching for:      wget............OK\n";}
    if ($tmp=&find_program("cross_match")) {print "searching for:      cross_match.....OK\n";}
    if ($tmp=&find_program("blastall"))    {print "searching for:      blastall........OK\n";}
    if ($tmp=&find_program("megablast"))   {print "searching for:      megablast.......OK\n";}  
    if ($tmp=&find_program("formatdb"))    {print "searching for:      formatdb........OK\n";}
    if ($tmp=&find_program("CLOBB2.pl"))   {print "searching for:      CLOBB2.pl.......OK\n";}
    if ($tmp=&find_program("phrap"))       {print "searching for:      phrap...........OK\n";}    
#### postgresql checks (postmaster demon & user connection)      
    &postmaster_check();
    &query_for_exit();
#### get the expert flag as well
  }
  elsif ($test eq "expert") {$expert = 1;} 
#### and exit nicely if argument doesn't make sense
  else { 
    print colored("\nPartiGene does not understand $test, exiting now.\n","white bold");
    exit;
  }
}


#############################################################################
############################ start main menu ################################
#############################################################################

&options();


##############################################################################
######## SECTION 1 - Download Sequences  #####################################
##############################################################################

sub get_sequences()  {
  print colored("\n\t##### DOWNLOADING SEQUENCES #####\n","white bold", "on_black");
  my $flag=0;
  unless ($expert == 1) {
    print "\nThis facility offers you the ability to download entire\n";
    print "datasets of EST-sequences based on species identifiers\n";
    print "from the SRS service at EBI (http://srs.ebi.ac.uk).\n";
    print "The sequences are placed in a directory called 'sequences'.\n";
    print "These can then be used to either build a new cluster\n";
    print "database or append to an existing database\n";
  }

  my $wget_exe=&find_program("wget");
  &check_directory("sequences");

  
#### New option downloading TaxID based   
  my $tax_id; my $tax_flag=0; my $srs;
  while ($tax_flag==0)  {
    print "\nEnter the name (eg \"";
    print colored("zeldia punctata","green"); print"\"; capitalisation is not important)\n";
    print  "or alternatively the NCBI taxonomy ID (eg \"";
    print colored("49351","green"); print"\"; see\n";
    print "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Root)\n";
    print "of the species whose sequences you wish to download.\n";
    my $input = &get_input("\n");
    if ($input=~/(\w+)\s+(\w+)/) { $genus=$1; $species=$2; $tax_flag=1;  $genusspecies="$genus" . " " . "$species" . "]"; }
    elsif ($input=~/\s*(\d+)\s*/) { $tax_id=$1;  $tax_flag=2; $tax_id .= "]"; }
    else { print "\n$input not valid, please try again.\n" }
  }

 
#### New option to download all sequences MB 08/2004
  my $srsstart="$wget_exe http://srs.ebi.ac.uk/srs7bin/cgi-bin/wgetz'?-e+-lv+500000+-f+seq+-sf+fasta+";
  my $srsend="+-view+Fasta2Seqs+-ascii' -O download.fsa";
  my $srsncbitax="[embl-NCBI_TaxId:";
  my $srsspecies="[embl-Organism:";
  my $srsest="[embl-Division:EST*]";
  my $srsmrna="[embl-Molecule:mRNA*]";
  my $setsrs=0;
  my $datafile = "download.fsa";

  print "\nDo you want ALL DNA sequences from your chosen species\n";
  print "or just mRNAs and ESTs, or just the ESTs?\n";
  print "The default is just ESTs.\n";
  unless ($expert == 1) {
    print "To download all sequences, enter \"";
    print colored("all","green");
    print "\"\n";
    print "To download mRNAs and ESTs, enter \"";
    print colored("mRNA","green");
    print "\"\n";
    print "\n... but do take care, because if you select \"all\" for a species\n";
    print "for which whole genome sequencing is underway, you will be asking\n";
    print "to download that too, and PartiGene cannot process huge datasets.\n";
    print "The presence of genomic features such as introns may also spoil\n";
    print "sequence clustering later. We recommend ESTs or (mRNA plus ESTs).\n\n";
  }
  my $inputall=&get_input("Type \"all\", \"mRNA\" or press ENTER for just ESTs)?\n");
  my $inputdes = "EST"; 

#### setting up srs query
  if($inputall=~/all/ && $tax_flag==1) {
     $inputdes = "all"; 
     $srs="$srsspecies" . "$genusspecies" . "$srsend";
     $setsrs=1;
  }
  elsif($inputall=~/all/ && $tax_flag==2) {
     $inputdes = "all";
     $srs="$srsncbitax" . "$tax_id" . "$srsend";
     $setsrs=1;
  }
  elsif($inputall=~/mRNA/ && $tax_flag==1) {
     $inputdes = "mRNA and EST";
     $srs="$srsest" . "&" . "$srsspecies" . "$genusspecies" . "|" . "$srsmrna" . "&" . "$srsspecies" . "$genusspecies" . "$srsend";
     $setsrs=1;
  }
  elsif($inputall=~/mRNA/ && $tax_flag==2) {
     $inputdes = "mRNA and EST";
     $srs="$srsest" . "&" . "$srsncbitax" . "$tax_id" . "|" . "$srsmrna" . "&" . "$srsncbitax" . "$tax_id" . "$srsend";
     $setsrs=1;
  }
  elsif($tax_flag==1 && $setsrs==0) {
     $srs="$srsest" . "&" . "$srsspecies" . "$genusspecies" . "$srsend";
     $setsrs=1;
  }
  elsif($tax_flag==2 && $setsrs==0) {
     $srs="$srsest" . "&" . "$srsncbitax" . "$tax_id" . "$srsend";
     $setsrs=1;
  }

  else {print "I didnt understand that...\n"; exit;}

#### Now downloading 
  my $longsrs=join("",$srsstart,$srs);
  if($tax_flag==1) {
    print "\nStarting download process for $genus $species $inputdes sequences.\n";
    print "Please wait\n\n";
  }
  if($tax_flag==2) {
    print "\nStarting download process for NCBI taxonomy ID $tax_id $inputdes sequences.\n";
    print "Please wait ...\n\n";
  }

#### srs needed for a dummyfile just with ids - get number of entries 
  my $numbersrs = $longsrs;
  $numbersrs =~ s/-f\+seq\+-sf\+fasta\+//;
  $numbersrs =~ s/view/f/;
  $numbersrs =~ s/Fasta2Seqs/acc/;
  $numbersrs =~ s/download\.fsa/number/;

#### delete old dummyfile if necessary
  if (-e "number") {system ("rm number")}

#### get ACs in file, count them
  $SIG{CHLD}='IGNORE';  ### Avoids zombie processes
  $flag=0;
  my $seq_number;
  my $kpid=fork();
  if($kpid==0) {
     exec("$numbersrs >& /dev/null");
  }
  else    {
    while($flag==0)      {      
      my $wc=`ps -e` ; 				### See if the process is still running
      unless ($wc =~ /\b$kpid\b/){ $flag=1; }     ### No it's not - quit out of loop
      if(-f "number") {
        $seq_number=`grep -c '^AC' number`;
        $seq_number=~/(\d+)/;
        $seq_number=$1;
        printf("\r%9d sequences found so far",$seq_number);
        $|=1;
        sleep(3); 
      }
    }
  } 

#### give some info and choice
  chomp $seq_number;
  unless ((-e "number") && ($seq_number > 0)) {print "\nDid not find any sequences for $genus $species - please check the spelling - exiting\n"; exit}
  system ("rm number");     
  if ($tax_flag==1) {print "\n\n$seq_number sequences found for $genus $species. Do you want to download them? ";}
  if ($tax_flag==2) {print "\n\n$seq_number sequences found for $tax_id. Do you want to download them? ";}
  $answer = &yes_no;
  unless ($answer == 1) {print "\nExiting now!\n"; exit}
  if (-e "download.fsa")  {system ("mv download.fsa download.fsa.old")} 
  if (-e "dummyfile")  {system ("rm dummyfile")} 
  my $start = 1;
  my $number_srs = int ($seq_number / 1000 + 1); 

#### download via srs has sometimes difficulties to cope with more than 50000 entries, so split them   
  if ($number_srs > 0) {
    for (my $ii = 1; $ii <= $number_srs; $ii++) { 
      my $srs_fix = $longsrs;
      $srs_fix =~ s/lv\+500000/bv+$start+-lv+1000/;
      system ("$srs_fix >& /dev/null");
      printf("\r%9d sequences downloaded so far",$start - 1);
      sleep (3);   ### don't change this, it slows down things, but we have to make sure that we don't hammer the ebi/srs to much           
      $start +=1000; 

#### get rid of <PRE> tags in first and last line of file    
      my @lines;
      tie (@lines, 'Tie::File',$datafile) or die "can't update $datafile: $!";
      if ($lines[0] =~ /^<PRE/) {delete $lines[0];}
      if ($lines[-1] =~ /^<\/PRE/)  {delete $lines[-1];}

      system ("cat download.fsa >> dummyfile")
    }
  system ("mv dummyfile download.fsa")
  }
  else {print "\nERROR\n" ; exit}
    
  
#### no counter anymore, download is fast searching takes longish
  my $progress=`grep -c '^>' download.fsa`;
  $progress=~/(\d+)/;
  $progress=$1;
  if($progress==0)  {
    if ($tax_flag == 1)  {      
      print "\nDid not download any sequences for $genus $species - please check the spelling - exiting\n";
    }
    if ($tax_flag == 2)  {      
      print "\nDid not download any sequences for NCBI taxonomy ID $tax_id - please check ID - exiting\n";
    }  
    exit();
  }
  printf ("\n%9d sequences downloaded - Download complete\n", $progress);

#### splitting multi fasta file
  print "\nMaking individual sequence files sequences - Please wait\n";
  $n=0;
  my ($id,$sequence);
  my $inseq  = Bio::SeqIO->new('-file' => "$datafile",
                         '-format' => 'Fasta');
  while (my $seq = $inseq->next_seq)  {
    $id=$seq->display_id();
    $id =~ s/^embl\|//;
    $id =~ s/\|.+$//;
    $sequence=$seq->seq();
    my $description=$seq->desc();
    my $filename = "sequences/$id";
    open(FILE,">$filename") || die "can't open $filename: $!";
    print FILE ">$id $description\n$sequence\n";
    close FILE;
    $n++;
    my $z = $progress - $n;
    printf("\r%9d sequences remaining",$z);
  }
  print "\n";
  query_for_exit();
}


##############################################################################
######## SECTION 2 - Quality check sequences##################################
##############################################################################

sub trim() {
#### Basically redoing some trace2dbest stuff to make sure everything that goes 
#### into CLOBB has decent quality
  print colored("\n\t##### PREPROCESSING SEQUENCES #####\n","white bold", "on_black");
  unless ($expert == 1) {
    print "\nThe software will now screen the sequences in directory 'sequences' for\n"; 
    print "contaminating vector sequence and trim poly(A) and poly(T) tails.\n";
    print "If you have processed all your sequences with trace2dbest you can omit\n"; 
    print "this step. Do you want to pre-process your sequences now?";
    &query_for_continue();
  }
  
  
  my @seq_files = glob("sequences/*");
  unless (@seq_files > 0) {print "\nDirectory 'sequences' appears to be empty - exiting now\n"; exit;}
#### crossmatch first
  my $crossexec=&find_program("cross_match");
  my $num = @seq_files; $n=0; my $m=0;
  if (-f "$vector")  {
    print "\nNow screening for vector sequence ...\n";
    foreach my $seqfile (@seq_files) {
      system("$crossexec $seqfile $vector -screen >& /dev/null");
      if (-e "$seqfile.screen") {
        system("mv $seqfile.screen $seqfile");
      }	
      if (-e "$seqfile.log")  {
        system("rm $seqfile.log");
      }
      $n++; 
      $m = $num - $n;
      printf("\r%9d sequences remaining",$m);
    }    
  }
  else  {
    print colored("\nI can't find the vector file so I won't be able to screen for\n","magenta bold");
    print colored("contaminating vector sequence - continue ? ","magenta bold");
    &query_for_continue();
  }


#### trimming
  print "\nNow trimming sequences ...\n";
  $n=0; $m=0;
  foreach my $seqfile (@seq_files) {
    system ("mv $seqfile $seqfile.old");
    my ($id,$sequence);
    my $inseq  = Bio::SeqIO->new('-file' => "$seqfile.old",
                         '-format' => 'Fasta');
    while (my $seq = $inseq->next_seq)  {
      $id=$seq->display_id();
      $sequence=$seq->seq();
      my $description=$seq->desc();
      my $filename = "$seqfile";
      $id =~ s/\r//g; #get rid of carriage returns which cause clobb problems
      $description=~ s/\r//g;
      $sequence=~ s/\r//g;
      $sequence=&trim_sequence($sequence);
      $sequence=~s/^.{0,50}X+(.+)/$1/;
      $sequence=~s/(\w{100,}?)XXX.+$/$1/;
      my $seq_length=length($sequence); 
      $n++;
      if($seq_length > 100)   { ### Only include sequences > 100 bp in length
        open(FILE,">$filename") || die "can't open $filename: $!";
        print FILE ">$id $description\n$sequence\n";
        close FILE;
      }
    }  
    my $m = $num - $n;
    printf("\r%9d sequences remaining",$m);
  }  
#  system ("rm sequences/*.old"); 
  my @delete = glob ("sequences/*old");
  while (@delete) {
    my $delete = shift @delete;
    unlink $delete or warn "Can't delete $delete:$!\n";
  }
    
### Print number of removed sequences
  my $seq_short; 
  $seq_short=`ls -1 sequences |wc -l`;
  $seq_short= ($num - $seq_short); 
  print "\n$seq_short sequences <= 100 bp in length have been removed.\n\n";
  query_for_exit();
}
  
  
################################################################# 
################ SECTION 3 - Perform Clustering #################
#################################################################

sub cluster() {
  print colored("\n\t##### SEQUENCE CLUSTERING #####\n","white bold", "on_black");
  unless ($expert == 1) {
    print "\nThis software clusters datasets of EST and other sequences\n";
    print "such that each cluster represents one putative gene.\n";
    print "The sequences need to be available as individual fasta files\n";
    print "in a directory called 'sequences'.\n";
    print "The PartiGene sequence download process does this automatically.\n\n";
    print "The cluster process uses a program called CLOBB to do the\n";
    print "clustering. CLOBB allows new sequences to be added to old\n";
    print "clusters while maintaining previous cluster identities\n";
    print "For more information on CLOBB see :\n";
    print "\tParkinson J, Guiliano DB, Blaxter M.  \n";
    print "\tMaking sense of EST sequences by CLOBBing them.\n";
    print "\tBMC Bioinformatics 2002 3(1):31.\n";
  }  
  my $tmp=&find_program("megablast");  #### CLOBB won't work without these programs
  my $tmp1=&find_program("formatdb");
  my $clobb_exe=&find_program("CLOBB2.pl");
 
  $cluster_id='';
  my $clus_flag=0;
  my $input;
  my $cluster_file='';
  my (%cluster_content);
  
  my @list = glob("sequences/*");
  my $size=@list;
  if(@list < 1)   {
    print colored("\nI didn't find any sequences to add to the cluster, make sure you are\n","magenta bold");
    print colored("running this script from the right place\n\n","magenta bold");
    &query_for_exit();
  }

   
  print "\nEnter the three letter cluster ID you would like to use\n";
  print "(typically this is the first letters of the genus and species\n";
  print "followed by 'C' eg. for Zeldia punctata you might use ZPC).\n\n";
  my $cluster_file_old; 
  while($clus_flag==0)   {
    $input =<>;
    chomp $input;
    if($input=~/^(\w\w\w)$/)       { 
      $cluster_id=$1; 
      $cluster_file=$cluster_id."EST"; $cluster_file_old = "dummy_CLOBB";
      if (-f "$cluster_file")        {
        print colored("### Warning ! ### $cluster_id has already been used.\n","magenta bold");
        print "Do you want to add the new sequences to these existing clusters?\n";
        print "(PartiGene will create a backup '$cluster_file.old')\n";
	my $answer=&yes_no();
        if($answer==1)         {#### clobb deletes temporary files - therefore using dummy_CLOBB
          system("cp -f $cluster_file $cluster_file_old");
          print "\nA copy of the old cluster file has been created as $cluster_file.old\n";         
         
##### Check to see if these sequences have already been clustered 
	  print "Checking sequence redundancy with current cluster file $cluster_file\n";
          my $removed_files=0;
          my $file;
          open(FH,"<$cluster_file")  || die "can't open $cluster_file: $!"; 
          while(my $line=<FH>)          {
            if($line=~/^>(\w+)\s/)            {
              $file=$1;
              if(-f "sequences/$file")      {
                system("rm sequences/$file");
                $removed_files++;
              }
            }
          }
          close(FH);
	  
          
	  $size-=$removed_files;
          print "There are $size new sequences to be added to $cluster_file\n"; 
	  if ($size==0)  {
	    print "No new sequences for $cluster_file found - back to main menu\n"; 
	    &options();
	  }
	  $clus_flag=1;
        }     
        else { print "Please re-enter a new cluster ID : "; } 
      }
      else {$clus_flag=1;}
    }     
    else {print "$input not valid, please try again";}
  }    
   
  print "\n"; 
  $SIG{CHLD}='IGNORE';  ### Avoids zombie processes
  $flag=0;
  my $kpid=fork();
  if($kpid==0)   {
    exec("$clobb_exe $input >& /dev/null");
  }
  else {
    while($flag==0)  {
      my $wc=`ps -e` ; 				### See if the process is still running
      unless ($wc =~ /\b$kpid\b/){ $flag=1; }     ### No it's not - quit out of loop
      my @progress=glob("sequences/*");
      my $size=@progress;
      printf("\r%5d sequences remaining",$size);
      $|=1;
      sleep(5); ### avoid permanent grepping and globing - try to save processing power 
     }
  }
#### sorting backupfile
  if (-e "$cluster_file_old") {system("mv $cluster_file_old $cluster_file.old");}

#### post processing 
  if(-s $cluster_file < 10)   { 
    print colored("There was an error in the clustering process, please check that the\n","red bold");
    print colored("paths for CLOBB2.pl and megablast defined in the config file are correct,\n","red bold");
    &options();
  }
  print "\n\nClustering done - now splitting cluster file into individual\n";
  print "cluster files. This will create files in directory 'Clus'\n";
  system("rm -rf sequences_done"); ### Remove the processed files 
  print "\nCreating individual cluster files, please wait...\n\n";
  &check_directory("Clus");
  &check_directory("tmp");
  open(INPUT,"<$cluster_file") || die "can't open $cluster_file: $!";
  while(<INPUT>)   {
    if(/^>/)   {
      chop;
      $_ =~ /($input\d\d\d\d\d)\s*$/;
      my $filename = $1;
      open(OUTFILE, ">>tmp/$filename")  || die "can't open tmp/$filename: $!";
      print OUTFILE "$_\n";
      next;
    }
    print OUTFILE $_ unless(/^>/);
    if (/^>/)    {  close(OUTFILE); }
  }
  close(OUTFILE);
  close(INPUT);

  opendir(DIR, "tmp");
  while (defined($file=readdir(DIR)))  {
    my $no_seqs=0;
    if($file =~ /^$cluster_id/)   {
      open(INPUT, "<tmp/$file")  || die "can't open tmp/$file: $!";
      while(<INPUT>)    {  
        if (/^>/) { $no_seqs++ }  
      }
      $cluster_content{$file} = $no_seqs; 
      close INPUT;
      if($no_seqs==1)  {
        system("cat tmp/$file >> Clus/singletons.fasta");
        system("rm tmp/$file");
      }
      else    { system("mv tmp/$file Clus/$file"); }
    }
  }
  system ("rmdir tmp");
  my $sing_num=`grep \'^>\' Clus/singletons.fasta\|wc -l`; chomp $sing_num;
  my $clus_num=`ls Clus/$cluster_id*\|wc -l`; chomp $clus_num;
  my $clus_ests=`grep \'^>\' Clus/$cluster_id*\|wc -l`; chomp $clus_ests;
  my $seqs=$sing_num+$clus_ests;
  my $clus_sum = $sing_num + $clus_num;

  print "\n\nSUMMARY OF CLUSTERING FOR $cluster_id\n";
  print "=============================================================\n";
  printf ("Number of sequences\t\t\t= %9d\n",$seqs);
  printf ("Total number of clusters\t\t= %9d\n",$clus_sum);
  printf ("Number of clusters with 1 member\t= %9d\n",$sing_num);
  print "Number of clusters with >1 member\n";
  printf ("  (derived from $clus_ests sequences)\t= %9d\n",$clus_num);
  print "=============================================================\n";

#### create file with some info
  my $key;
  my $timestamp = `date +%D+%H:%M`; 
  chomp ($timestamp);
  $timestamp =~ s/\//-/g;
  my $file_name = "CLOBB_$cluster_id" . "_$timestamp.txt"; 
  open (OUTFILE, ">OUT/$file_name") || die "can't open OUT/$file_name: $!";
  print OUTFILE "SUMMARY OF CLUSTERING FOR $cluster_id\n";
  print OUTFILE "=======================================================\n";
  print OUTFILE "Number of sequences \t\t\t=\t$seqs\n";
  print OUTFILE "Total number of clusters\t\t=\t$clus_sum\n";
  print OUTFILE "Number of clusters with 1 member\t= $sing_num\n";  
  print OUTFILE "Number of clusters with > 1 member\n";
  print OUTFILE "derived from $clus_ests sequences\t\t= $clus_num\n";
  print OUTFILE "=======================================================\n";

  foreach $key (sort keys (%cluster_content)) {
    print OUTFILE "$key $cluster_content{$key}\n";
  }
  close OUTFILE;
  print "\nThis data and additional information on each cluster,\n";
  print "has been saved in the file:\n\n\t OUT/$file_name\n";
  &query_for_exit();
}


############################################################################
################ SECTION 4 - Assemble Clusters using phrap #################
############################################################################

sub assemble_clusters()  {
  print colored("\n\t##### CLUSTER ASSEMBLY #####\n","white bold", "on_black");
  unless ($expert == 1) {
    print "\nThe sequences grouped in clusters can be 'assembled' to yield a\n";
    print "consensus sequence. PartiGene uses a program called 'phrap' for this\n";
    print "(written by Phil Green and colleagues; see http://www.phrap.org/).\n\n";
  }
  
  my $phrapexec=&find_program("phrap");
  my $trace_dir='';   ## For non-standard naming schemes
  my $clone_pattern='';
  my $clone_name_flag=0;
  my @pseudo_contig =''; ## For contig update  
  $found_a_quality_file=0;
  $cluster_id=&get_clusterid();


#####  First check for new sequences in clusters or new clusters
##### Get number of sequences in each Cluster from clus directory
  my %cluster_content;
  my $no_seqs;
  opendir (DIR, "Clus") || die "can't open Clus: $!";
  while (defined($file=readdir(DIR)))  {    
    if ($file =~ /^$cluster_id/)  {
      $no_seqs = 0;
      open (INPUT, "<Clus/$file") || die "can't open Clus/$file: $!";   
      while (<INPUT>) {
        if  (/^>/) {      
          $no_seqs++; 
	  $cluster_content{$file} = $no_seqs;     
	}
      }    
      close INPUT;
    }
  } 
  closedir (DIR);    
      
      
##### Compare number of sequences in each Clobb Cluster (ie new) with number of sequences in each phrap (ie old), keep the ones with different numbers for update      
  my $key; 
  my %cluster_upgrade = %cluster_content;
  foreach $key (keys (%cluster_upgrade))  {
    if (-e "phrap/$key")  { 
      $no_seqs = `grep -c "^>" phrap/$key`;
      chomp ($no_seqs);           
      if ($cluster_upgrade{$key} == $no_seqs)  { 
         delete $cluster_upgrade{$key};
      }
    }  	      
  }


#### create @process_list for clusters to be processed
  my @process_list;
  foreach $key (keys (%cluster_upgrade))  {
    push (@process_list,$key);
  }
  my $size=@process_list;
  if ($size == 0) {print "Could not find any clusters with new sequences\n";  query_for_exit();} 
  else {print "\n $size Clusters will be assembled or updated\n"};
  
  
#### select quality mode    
  unless ($expert == 1) {
    print "\nBefore assembling the clusters the sequences need to pre-processed\n";
    print "phrap uses quality information from the sequencing chromatographs,\n";
    print "if this is available.\n";
  }
  print "You have three options : \n";
  print "1) Attempt to use original quality files for all clusters\n";
  print "2) Attempt to use original quality files only for clusters containing\n";
  print "   2 sequences\n";
  print "3) Skip preprocessing (if quality files are unavailable).\n\n";
  print "NB. phrap can generate multiple consensuses from large clusters.\n";
  print "We have found that option 2 reduces this less-than useful feature\n\n";
  my $skip='';
  while($skip!~/1|2|3/i)   {
    print "Please select 1,2 or 3 :  ";
    $skip=<STDIN>;
    chomp $skip;
  }
  print "\n";
  
#### prepare phrap directory for update - try to catch user having messed up phrap directory
  unless (-d "phrap") {system ("mkdir phrap");}  
  my @control = glob("phrap/*");
  my $error_flag = "0";
  foreach my $control (@control) {
    unless ($control =~ /$cluster_id/) {
      print colored("$control shouldn't be in phrap directory\n","red bold");  
      $error_flag = "1";
    }  
  }   
  if ($error_flag == 1) {print "An error has occured, exiting now.\n"; exit;}

#### get rid of old ".old" files first - could cause confusion for pseudocontig->contig and vice versa updates
  system ("rm phrap/*old  >& /dev/null");

#### save old files for comparison later
  foreach $file(@process_list)  {  
    if (-e "phrap/$file")                 	{system("mv phrap/$file phrap/$file.old");}  
    if (-e "phrap/$file.ace")                	{system("mv phrap/$file.ace phrap/$file.ace.old");}
    if (-e "phrap/$file.contigs")	        {system("mv phrap/$file.contigs phrap/$file.contigs.old");}
    if (-e "phrap/$file.contigs.qual")    	{system("mv phrap/$file.contigs.qual phrap/$file.contigs.qual.old");}
    if (-e "phrap/$file.log")             	{system("mv phrap/$file.log phrap/$file.log.old");}
    if (-e "phrap/$file.problems")              {system("mv phrap/$file.problems phrap/$file.problems.old");}
    if (-e "phrap/$file.problems.qual")	        {system("mv phrap/$file.problems.qual phrap/$file.problems.qual.old");}
    if (-e "phrap/$file.singlets")        	{system("mv phrap/$file.singlets phrap/$file.singlets.old");}
    #### .old files only used for contig comparison, mv pseudocontigs to contigs.old makes life easier
    if (-e "phrap/$file.pseudocontig")    	{system("mv phrap/$file.pseudocontig phrap/$file.contigs.old");}
    if (-e "phrap/$file.pseudocontig.qual")     {system("mv phrap/$file.pseudocontig.qual phrap/$file.contigs.qual.old");}
  }

  
##### setup for EGTDC directory structure
  my $dir_sel;
  my $dir_option1 = "/home/db/est_solutions";
  my $dir_option2 = "$home/est_solutions"; 
  
##### Locate source of original sequence and quality files    
  if($skip ne "3")  {
       
####First get info whether EGTDC naming scheme is used    
    unless ($expert == 1) {
      print "You have opted to use the original phred called sequence and quality files.\n";    
    }
    print "\nHave you been using the recommended EGTDC naming scheme for your clones?\n";
    print  "For example: \"Lr_adE_06E01\"";   
    my $answer=yes_no();
    if ($answer == 1) {$clone_name_flag = 1;}


    print "\n\nSearching for files ...\n";   

#### First check whether some of the standard directories are used ...
    my $clus_flag=0;
    if ((-d "$dir_option1") && (-d "$dir_option2"))    {
      print "PartiGene has found two directories potentially holding your quality files\n";
      print "in some sub-directories\n";
      while ($clus_flag == 0) {
         print "Type \"1\" for selecting $dir_option1\n";
         print "Type \"2\" for selecting $dir_option2\n";
	 print "Type \"3\" for selecting none of these \n";
	 $dir_sel = <STDIN>;
         chomp $dir_sel;
	 if ($dir_sel == 1) {$clus_flag = 1;}
	 elsif ($dir_sel == 2) {$clus_flag = 1;}
	 elsif ($dir_sel == 3) {$clus_flag = 1;} 
	 else {print "$dir_sel has unexpected value. Please try again.\n";}
      }
    }  	 
    elsif (-d "$dir_option1") {
      print "\nPartiGene has found $dir_option1 potentially holding your quality files\n";
      print "in some sub-directories\n";
      print "Do you want to use it\n";      
      my $answer=yes_no();
      if ($answer == 1)    {$dir_sel = 1;}
      else {$dir_sel = 3}
    }  
    elsif (-d "$dir_option2") {
      print "\nPartiGene has found $dir_option2 potentially holding your quality files\n";
      print "in some sub-directories\n";
      print "Do you want to use it\n";      
      $answer=yes_no();
      if ($answer == 1)    {$dir_sel = 2;}
      else {$dir_sel = 3}
    }              
    else {$dir_sel = 3}

#### Now we have sorted whether&what standard directory to use, do the same for projects ...     
    if ($dir_sel == 1) {
      my @list = glob("$dir_option1/*");
      if  (scalar @list == 1) {  
        my $file = $list[0];
	     $file =~ /\/([\w\.\-]+)$/;
	     $project = $1;
      }
      if (scalar @list > 1) {
        foreach my $file (@list) {
	     $file =~ /\/([\w\.\-]+)$/;
	     print "$1  ";
	     }                   
        $clus_flag=0;
        while($clus_flag==0)     {
          $project = &get_input("\nWhich project contains the relevant library directories?\n");
	  $project =~ s/\s//g;
          if(-d "$dir_option1/$project") { $clus_flag=1; }         
	  else       { 
            print "$project is not valid\n\n";
            query_for_exit(); 
          }      
        }             
      }
    }    
   
    if ($dir_sel == 2) {
      my @list = glob("$dir_option2/*");
      if  (scalar @list == 1) {  
        my $file = $list[0];
	$file =~ /\/([\w\.\-]+)$/;
        $project = $1;
      }
      if (scalar @list > 1) {
        foreach my $file (@list) {
          $file =~ /\/([\w\.\-]+)$/;
          print "$1  ";
        }                     
        $clus_flag=0;
        while($clus_flag==0)     {
          $project = &get_input("\nWhich project contains the relevant library directories?\n");
	  $project =~ s/\s//g;
          if(-d "$dir_option2/$project") { $clus_flag=1; }         
	  else       { 
            print "$project is not valid\n\n";
            query_for_exit(); 
          }      
        }             
      }
    }    
           
    if (($dir_sel == 3) && ($clone_name_flag == 1))    {   
      if($seqrep && -d "$seqrep")    {          
        print "\nYou have defined $seqrep as your base sequence repository directory.\n";
        print "It contains the following projects :\n\n";
        my @dir_content = glob ("$seqrep/*");
        foreach my $dir (@dir_content)  {
	  if (-d $dir) {
	    $dir =~ /([\w\.\-]+)$/;    
	    print "$1  ";
	  }
        }
        $project='';
        $clus_flag=0;
        while($clus_flag==0)     {
          $project = &get_input("\nWhich project contains the relevant library directories?\n");
	  $project =~ s/\s//g;
          if(-d "$seqrep/$project") { $clus_flag=1; }
          else       { 
            print "$project is not valid\n\n";
            query_for_exit();
          }
        }
      }

      else      { 
        print "\nYou have not defined the SEQREP variable or it is not a valid\n";
        print "directory I will therefore be unable to find the original files\n";
        print "based on this naming convention";
        query_for_exit();
      }
    }
      
#####Back to all 3 directory structure options

##### Now find the clone ID for pattern matching purposes #####
    if ($clone_name_flag == 1) {
      unless ($expert == 1) {
        print "\nIn order to recognise a sequence for one potentially stored\n";
        print "on this machine I need the sequence clone ID. This is the first few\n";
        print "letters used to name each trace file e.g. Po, Lr, Eg etc. \n";
        print "For example: \"Lr\" for files like \"Lr_adE_06E01\"";
        print "(see the help file for information on naming policy).\n";
      }
      $clone_id='';
      $clus_flag=0;
      while($clus_flag==0)     {
        $clone_id = &get_input("\nEnter trace identifier:\n");
        if ($expert == 1) {$clus_flag=1; }
	print "$clone_id - is this correct ? [y/n] : ";
        my $input1='';
        while($input1!~/y|n/i)       {
          print "\b";
          $input1=<STDIN>;
          chomp $input1;
        }
        if($input1=~/y/i) { $clus_flag=1; }
      }
    }
##### if not egtdc ...    
    if ($clone_name_flag == 0) {	   
	
#### Try to deal with non-recommended naming schemes, first get directory     
      $clone_id='';
      $project='';
      print "\nYou have decided to use an alternative naming scheme for your clones\n";
      print "Please note that you need to place all the .qual and .seq files\n";
      print "created by PHRED into a single directory ";
      my $dir_flag=0;
      while($dir_flag==0)      {
        $trace_dir=&get_input ("\nPlease enter the full path of the directory where the files are stored\n");
        if(-d "$trace_dir") { $dir_flag=1; }
        else { 
          print "$trace_dir is not valid";
          query_for_exit();
        }    
      }
      
#### and then a regex      
      print "\nNow I need you to specify the unique clone naming scheme you have implemented.\n";
      print "This is a unique ID in the header line of the fasta file which you previously used\n";
      print "to name your clones. For this step you will need to enter a regular expression (regex)\n";
      print "that can be used by PartiGene to identify the clones. Examples might be :\n";
      print "\nMyClone\\d\\d\\d\\d\\d     - would match MyClone00001 - MyClone99999\n";
      print "MyClone_\\d\\d\\d\\d\\d    - would match MyClone_00001 - MyClone_99999\n";
      print "MyClone_\\w+_\\d+	      - would match MyClone_library1_1 - MyClone_libraryX_199921\n";
      print "MyClone_\\w\\w\\w\\w_\\d+  - would match MyClone_lib1_1 NOT MyClone_library1_1\n";
      print "MyClone_\\w\\w\\w\\w_\\d\\d - would match MyClone_lib1_1 NOT MyClone_lib1_11\n";	 
      print "\nFor help on regex I recommend you look at James Wasmuths regex notes\n";
      print "http://envgen.nox.ac.uk/courses/perl_bioperl/regex.pdf\n";
      my $pattern_flag=0;
      while($pattern_flag==0)     {
        $clone_pattern= &get_input ("\nEnter a regex to uniquely identify individual clones from their fasta headers\n");
        print "$clone_pattern - is this correct ? [y/n] : ";
        my $input1='';
        while($input1!~/y|n/i)  {
          print "\b";
          $input1=<STDIN>;
          chomp $input1;
        }
        if($input1=~/y/i) { $pattern_flag=1; }
      }            
    }

    print "\nProcessing files, please wait.\n";

    foreach  $file(@process_list)  {
      my $seq='';
      my $flag=0;
      my $header='';
      my $title='';
      my $two_seq_flag=$skip;
    
      ###### Find out number of sequences in the cluster    ######
      ###### If no. sequences = 2 then use original quality ######
      ###### file if possible, if > 2 then use original     ######
      ###### quality files only if user specified above.    ######

      my $wc=`grep -c '>' Clus/$file`;    
      chomp $wc; 
      if($wc==2 && $skip==2) { $two_seq_flag=1; }
    
      open(CLUSFILE, "<Clus/$file");
      my $seq_count=0; #print "numberfordir:$dir_sel\n";
CLUSGEN: while ($line = <CLUSFILE>)     {
        if(($line=~/^>/) || eof(CLUSFILE))      {
          if($seq) {
            if ($dir_sel == 1) {
              &prepare_qual("phrap/$file",$header,$seq,$two_seq_flag,$clone_id,"$dir_option1/$project/trace2dbest",$clone_pattern,$trace_dir);
            }
            elsif ($dir_sel == 2) {	     
              &prepare_qual("phrap/$file",$header,$seq,$two_seq_flag,$clone_id,"$dir_option2/$project/trace2dbest",$clone_pattern,$trace_dir);
            }
            else {# print "now going to prepare phrap for $file\n";
              &prepare_qual("phrap/$file",$header,$seq,$two_seq_flag,$clone_id,"$seqrep/$project",$clone_pattern,$trace_dir);	      
            }
	  } 
          $seq='';
        }
        if($line=~/^>(.+)/)      {
          $seq_count++;
          if($seq_count > 400) { last CLUSGEN; }  ### More than 400 sequences in a cluster can lead
                                                  ### to memory problems, why need more than 400
			      			  ### anyway ? (A workaround)
          $header=$1;
        }
        if($line!~/^>/) { chomp $line; $seq.=$line; }
      }      
      close(CLUSFILE);
    }

 
##### Now do phrapping #####
    chdir("phrap");
    opendir(DIR, "./");
    $size=@process_list; 
    foreach $file(@process_list)  {
      $size--;
      printf ("\r%5d clusters remaining",$size);
      system ("$phrapexec $file -new_ace >& /dev/null");  
      my $contfile=$file; 
      $contfile.=".contigs";
##### It didn't assemble so use the largest est as the contig         
      unless((-e $contfile) && (-s $contfile > 0))   {  
        push (@pseudo_contig, $file); 
        open(FP,"$file");
        my $length=0;
        my $clus_seq='';
        $seq='';
        while($line=<FP>)    {
          if($line=~/^>/ || eof(FP)) { 
            if(length($seq) > length($clus_seq)) { $clus_seq=$seq; } 
            $seq=''; 
          }
          else  { $seq.=$line; }
        }
        close(FP);
	 	 
        open(OUT,">$file.pseudocontig");
        print OUT ">$file\n$clus_seq";
        close(OUT);
##### create dummy quality file 
        open(OUT,">$file.pseudocontig.qual");
        print OUT ">$file\n";
        for (my $i=1; $i <= length($clus_seq); $i++)  {
	       print OUT "$QUAL_SCORE ";
	       if($i % 30 == 0) {  print OUT "\n"; }
        }
        print OUT "\n"; 
        close(OUT);		  		  
      }   
    }
    closedir(DIR);
    chdir("../");  
  }  
 
 
 ##### now options 1,2 done; else loop for option 3
  else    {  ##### Just copy cluster files across #####  
    system("cp Clus/$cluster_id* phrap");
    chdir("phrap");
    opendir(DIR, "./");
     
    $size=@process_list; 
    foreach $file(@process_list)  {
      $size--;
      printf ("\r%5d clusters remaining",$size);
      system ("$phrapexec $file -new_ace >& /dev/null");     
      my $contfile=$file;
      $contfile.=".contigs"; 
##### It didn't assemble so use the largest est as the contig
      unless((-e $contfile) && (-s $contfile > 0))   { 
        push (@pseudo_contig, $file);          
        open(FP,"$file");
        my $length=0;
        my $clus_seq='';
        $seq='';
	while($line=<FP>)    {
          if($line=~/^>/ || eof(FP)) { 
            if(length($seq) > length($clus_seq)) { $clus_seq=$seq; } 
            $seq=''; 
	       }
          else  { $seq.=$line; }
        }
        close(FP);	 
        open(OUT,">$file.pseudocontig");
        print OUT ">$file\n$clus_seq";
        close(OUT);
##### create dummy quality file
        open(OUT,">$file.pseudocontig.qual");
        print OUT ">$file\n";
	for (my $i=1; $i <= length($clus_seq); $i++)  {	      
	  print OUT "$QUAL_SCORE ";
	  if($i % 30 == 0) {  print OUT "\n"; }
	}
	print OUT "\n"; 
        close(OUT);
      }   
    }
    closedir(DIR);
    chdir("../");
  }
  
##### back to common stuff for all options now  
##### select entries for contig upgrade
  foreach $file(@pseudo_contig) { ## a work around
    if ($file) {  
      system ("cp phrap/$file.pseudocontig phrap/$file.contigs");
      system ("cp phrap/$file.pseudocontig.qual phrap/$file.contigs.qual");
    }  
  }
  my @retired_singletons = '';
  my @retired_contigs = '';
  my $sing_nr; 
  foreach $file(@process_list)  {  
    unless (-e "phrap/$file.old") {  #catch former singletons now in cluster       
      my $dummyfile = $file . "_1.seq";
      if (-e "protein/$dummyfile") { #avoid completely new clusters
        push (@retired_singletons, $file);
        &contig_check("phrap/$file.contigs","1", "$cluster_id");
      } 
    }  
#### now for already existing contigs, 2nd -e to catch contigs "upgraded" to pseudocontigs, which broke pg later on ... 	 
    if ((-e "phrap/$file.contigs.old") && (-e "phrap/$file.contigs" ))     { 
      if ((-s "phrap/$file.contigs") != (-s "phrap/$file.contigs.old")) {
        push (@retired_contigs, $file);     
        &contig_check("phrap/$file.contigs","phrap/$file.contigs.old","$cluster_id");
      }  
      else {  
        #### if sth changes in contig file we assume it is the sequence  
        if ((`diff phrap/$file.contigs phrap/$file.contigs.old`) ne '') {
          push (@retired_contigs, $file);
          &contig_check("phrap/$file.contigs","phrap/$file.contigs.old", "$cluster_id");
        }
      }
    }   
  }
  foreach $file(@pseudo_contig) { ## and back again
    if ($file) {
      system ("rm phrap/$file.contigs");
		system ("rm phrap/$file.contigs.qual");
    }
  }


##### warnings
  if($found_a_quality_file==0 && $skip!=3) {
    print colored("\n### Warning ! ### No quality files were used during assembly\n","magenta bold");
    print colored("You may want to check your paths and rerun this step\n","magenta bold");
  }
  if($trace_problem_flag==1) {#### Maybe should count number of quality files used just to give some idea ...
    print colored("\n### Warning ! ### There was a problem with using the original traces data\n","magenta bold");
    print colored("which may have caused problems with the assembly. Please ensure\n","magenta bold");
    print colored("that the traces were processed correctly\n","magenta bold");
  }
  print "\n\nAssembly process finished.\n";
  print "Now creating input files for the protein prediction pipeline prot4EST.\n";
 
##### Populate&update retired_contigs directory         
  if (-d "protein") {
    unless (-d "retired_contigs") {
      system ("mkdir retired_contigs");
    }
    foreach $file(@retired_singletons)  {
      if ($file) {system ("cp protein/$file* retired_contigs/");} 
    }
    foreach $file(@retired_contigs)  {
      if ($file) {system ("cp protein/$file* retired_contigs/");} 
    }
  }  
  &check_directory("protein");

           
##### Process clusters and their quality files (split into individual contigs) #####
  &process_clusters(".contigs",".seq");
  &process_clusters(".contigs.qual",".qlt");

##### Now process the singletons - not trivial ! #####
  if ($skip==3)  {&process_singletons(" "," ",$cluster_id);}
  else {
##### deal with different directory schemes  
    if ($dir_sel == 1) {	
      &process_singletons ("$dir_option1/$project/trace2dbest",$clone_id,$cluster_id,$clone_pattern,$trace_dir);
    }
    elsif ($dir_sel == 2) {	     
      &process_singletons ("$dir_option2/$project/trace2dbest",$clone_id,$cluster_id,$clone_pattern,$trace_dir);	 
    }
    else {
      &process_singletons("$seqrep/$project",$clone_id,$cluster_id,$clone_pattern,$trace_dir);        
    }
  }


##### copy dummy files for all pseudocontigs across; deal with numbering by finding all retired contigs for respective clusters  
  my @all_pseudos = glob ("phrap/*pseudocontig");
  foreach $file(@all_pseudos)  {  
    $file =~ s/phrap\///; $file =~ s/\.pseudocontig//;
    if (-e "phrap/$file.pseudocontig") {
      my $retcheck = "retired_contigs/$file" . "_1.seq";
      if (-e $retcheck) {
        my @retlist = glob "retired_contigs/$file*.seq";
        my $new_pseudo = ((length @retlist) + 1);
        my $dummyfile = $file . "_$new_pseudo.seq";
        system ("cp phrap/$file.pseudocontig protein/$dummyfile");    
        }  
      else {
        my $dummyfile = $file . "_1.seq";
        system ("cp phrap/$file.pseudocontig protein/$dummyfile");      
      }  
    } 
    
    if (-e "phrap/$file.pseudocontig.qual") {      
      my $retcheck = "retired_contigs/$file" . "_1.qlt";
      if (-e $retcheck) {
        my @retlist = glob "retired_contigs/$file*.qlt";
        my $new_pseudo = ((length @retlist) + 1);
        my $dummyfile = $file . "_$new_pseudo.qlt";
        system ("cp phrap/$file.pseudocontig protein/$dummyfile");    
      }  
      else {
        my $dummyfile = $file . "_1.qlt";
        system ("cp phrap/$file.pseudocontig.qual protein/$dummyfile");
      }
    }  
  }
   
  #### prepare input file for prot4EST  
  if (-e "prot4EST_input.fsa") {system ("mv prot4EST_input.fsa prot4EST_input.fsa.old")}
  my @prot_files = glob ("protein/*.seq");
  foreach my $protfile (@prot_files) {
    system ("cat $protfile >> prot4EST_input.fsa");
  }   



#### create file with some info
  my $timestamp = `date +%D+%H:%M`; 
  chomp ($timestamp);
  $timestamp =~ s/\//-/g;
  my $file_name = "phrap_$cluster_id" . "_$timestamp.txt"; 
  my @list = glob ("phrap/*.ace"); #modify here 
  open (OUTFILE, ">OUT/$file_name") || die "can't open OUT/$file_name: $!";
  print OUTFILE "### Cluster\tcontig\tnumber of sequences\n";
  foreach my $file (@list) {
    open (ACEFILE, $file) || die "Can't find $file\n";
    $file =~ s/\.ace//g;
    $file =~ s/phrap\///g;
    while (my $line = <ACEFILE>)  {      
      if ($line =~ /^CO\s+(Contig\d+)\s+\d+\s+(\d+)/) {   
        print OUTFILE "$file\t$1\t$2\n";    
      }
    }
  } 
  print "\nA report on the phrap based assembly process has been saved in the file:\n";
  print "\n   OUT/$file_name\n";   
  close (OUTFILE);
  close (ACEFILE);

  query_for_exit();
}


############################################################################
################ SECTION 5 - Allow user to perform BLASTs ##################
############################################################################

sub blast_annotation() {
  system("clear");
  print colored("\n\t##### BLAST ANNOTATION #####\n","white bold", "on_black");
  unless ($expert==1) {
    print "\nThis facility prepares the sequences for blasting to provide\n";
    print "some primary annotation on the sequences. Normally we recommend\n";
    print "that you perform the following BLASTS\n\n";
    print "BLASTX vrs non-redundant protein database (SWALL)\n";
    print "BLASTN vrs non-redundant nucleotide database (Genbank)\n";
    print "BLASTN vrs the EST database (dbEST)\n\n";
    print "You may alternatively have your own set of BLASTs that you\n";
    print "want to perform. If you have BLAST locally installed and\n";
    print "have databases that you wish to blast against, you can set up,\n";
    print "and run the BLASTs here. Alternatively, I can prepare the files\n";
    print "for BLASTing at your leisure. If you do want to BLAST the files\n"; 
    print "outside of PartiGene and re-import them into the process later\n";
    print "remember to use non-html formatted BLAST output (-T F) and place\n";
    print "the results in an appropriately named subdirectory of the blast\n";
    print "directory created by PartiGene\n\n";
  }
  my $blast_exec=find_program("blastall");

  print "First we need to prepare the files for blasting\n\n";
  

  &check_directory("blast");
  $cluster_id=&get_clusterid();
 
 ######### Move contig and pseudocontig files to BLAST directory ###########
  my $outfile; my $condorfile = "blast_input_" . "$cluster_id" .".txt";
  if (-e "$condorfile") {
    system ("mv $condorfile $condorfile.old");
  }
  opendir(DIR,"phrap") or die "Can't find cluster assemblies\n";
  while(defined($file=readdir(DIR)))  {
    if($file=~/contigs$/ && (-s "phrap/$file") > 0)   { 
      $outfile=$file;
      $outfile=~s/contigs/txt/; 
      system("cp phrap/$file blast/$outfile");
      system ("cat phrap/$file >> $condorfile");
    }
    elsif($file=~/pseudocontig$/)   { 
      $outfile=$file;
      $outfile=~s/pseudocontig/txt/; 
      system("cp phrap/$file blast/$outfile");
      system("cat phrap/$file >> $condorfile");
    }
  }
 
 ######### Now process singletons ###########

  my $old_file='';
#  system("cat Clus/singletons.fasta >> $condorfile");
  open(FH,"<Clus/singletons.fasta");
  while($line=<FH>)   {
    if($line=~/($cluster_id\d+)/ || eof(FH))    {
      $file=$1;
      chomp $line;
      if($old_file)     {
        open(OUTFILE, ">blast/$old_file.txt");
        print OUTFILE ">$old_file\n$seq";
	close(OUTFILE);
        system("cat blast/$old_file.txt >> $condorfile");
      }
      $old_file=$file;
      $seq='';
    }
    else { $seq.=$line; }
  }

  print "Consensus sequences needed for BLAST searches within PartiGene\n"; 
  print "have been processed and stored in the directory 'blast'.\n"; 
  print "A concatenated file which can be used as input file for BLAST\n";
  print "searches outside of PartiGene has been saved as '$condorfile'\n";
  print "in the main directory.\n\n"; 
  
  print "Would you like to continue by BLASTing against your custom databases ? ";

  &query_for_continue();
  my $blast_flag=0;

  if(!-d "$blastdb") {
    print colored("I can't find your blast database directory specified as $blastdb.\n","red bold");
    print colored("Please update your .partigene.conf file to give the correct path.\n","red bold");
    exit();
  }

  $dbs[0]='';  ### Initialise list of blastable dbs to prevent warning when
             ### No databases are selected

  while($blast_flag==0) {
    $i=&specify_blasts();
    print "\nYou have selected the following blasts - \n";
    for($j=0;$j<$i;$j++)  {
      print "$blast_exec -p $blast_EXE[$j] -d $blast_DB[$j] ....\n";
    }
    print "\nIs this correct ? ";
    $blast_flag=yes_no();
  }

  &check_directory("blast/passed");  ##### Get names of directories / automate ask ?
  for($j=0;$j<$i;$j++)  {
    my $dir_name=$blast_DB[$j];
    $dir_name=~s/\..+//;
    $dir_name=~s/^.+\///;
    if(length($dir_name) > 10)   {
      print "$dir_name is not an acceptable name for databasing purposes\n";
      print "please enter an alternative name to describe this/these databases.\n";
      my $clus_flag=0;
      my $input;
      while($clus_flag==0)    {
        $input = &get_input("\n");
        if(length($input) < 10)     {
          print "You have selected $input to represent $dir_name is this ok ? ";
          $clus_flag=&yes_no();
        }
        else      {  print "$input is too long, please select a shorter name (< 10 characters)\n"; }
      }  
      $dir[$j]=$input;
    }
    else { $dir[$j]=$dir_name; }
  
    &check_directory("blast/$dir[$j]");
  }

#### blasting
  opendir(DIR,"blast");
  my $failedflag;
  my $blast_num=`ls blast/*.txt\|wc -l`;
  my $blast_count=0;
  while(defined($file=readdir(DIR))) {
    if($file=~/txt/)  {
      $failedflag=0;
      $flag=0;   ### Also define here if no blasts defined
      for($j=0;$j<$i;$j++)   {
        $flag=0;
        my $outfile=$file;
        $outfile=~s/txt/out/;
        while($flag<5)  { #### Try 5 times    
          if(-f "blast/$dir[$j]/$outfile")     {
            if(-s "blast/$dir[$j]/$outfile" > 500) { $flag=10; next; }
          }
          system("$blast_exec -p $blast_EXE[$j] -d $blast_DB[$j] -i blast/$file -o blast/$dir[$j]/$outfile -e 0.00001 -b 25 -v 25 >& /dev/null");
	  wait();
          $flag++;
        }        
        if($flag < 10)    { 
          open(FH,">>blast/failed");
          print FH "$file failed $blast_DB[$j]\n";
	  close(FH);      
          $failedflag=1;
        } 
      }
      
      $blast_count++;
      my $blast_percent=(100*$blast_count)/$blast_num;
      printf("\r%3d%% Completed",$blast_percent);
      if ($failedflag==0)    {  
        system("mv blast/$file blast/passed");  
      }
      else {print "\n There has been a problem with blast.\n Please check blast/failed for problem files\n";}
    }
  }
  closedir(DIR);  
  query_for_exit();
}


###############################################################
################ SECTION 6 - Create HTML tables ###############
###############################################################

sub html_tables() {
  system("clear");
  print colored("\n\t##### Creating HTML Tables #####\n","white bold", "on_black");
  unless ($expert==1) {
    print "\nThis facility creates a series of HTML format results files.\n";
    print "This is recommended only for smaller datasets (<1000 sequences).\n";
    print "\nDo you want to continue?\n";
    &query_for_continue();
  }
  &check_directory("html");

 #### HTML files are of the format :
 #### Cluster ID, Num ESTs, ESTs, Blast1, Blast2....BlastN
 
 #### First read in what BLASTs are available
 
  @dir=`ls -d blast/*`;
  my @blast_dir;

  print "\nFirst select the BLASTs that you would like to include from the following list\n";

  $i=0;
  foreach $file(@dir)  {
    chomp $file;
    $file=~s/blast\///;
    if($file ne "passed" && -d "blast/$file")     { 
      $i++; print "$i $file\n"; $blast_dir[$i]=$file; 
    }
  }
  $j=0;
  $input='';
  $flag=0;
  my ($db, $bldb_flag,@list);
  while($flag==0) {
    $input= &get_input("\nPlease enter a comma separated list of numbers from the list above: ");
    $blast_DB[1]='';
    @list=split(",",$input);
    foreach $db(@list)  {
      if($db=~/\d+/)  {
        $flag=1;
        if($db < $i+1)     {
          $bldb_flag=0; 
          for($n=0; $n<$j; $n++)   {
            if($blast_DB[$n] eq $blast_dir[$db]) { $bldb_flag=1; }   ### Don't use the same blast more than once
          }
          if($bldb_flag==0) { $blast_DB[$j]=$blast_dir[$db]; $j++; }
        }
        else { $flag=0; print "\n$input is not valid\n"; next; } 
      }
    }
  }

  print "\nYou have selected the following BLASTs : \n";
  for($i=0;$i<$j;$i++)  {
    print "$blast_DB[$i]\n";
  }

  print "\nPlease wait while the tables are being generated\n";

 #### Next read in the Clusters, Sequences and associated BLAST information
 
  my @cluster_info;
  my $num;
  my $blast_file;
  my $blast_miss=0;
  my $blast_num=`ls blast/passed/ | wc -l`;
  opendir(DIR,"Clus");

  $i=0; 
  while(defined($file=readdir(DIR)))  {
    if($file=~/(\w\w\w\d+)/)  { 
    ### Cluster file   
      $cluster_info[$i][0]=$1;
      open(FH,"<Clus/$file");
      $num=0;
      while($line=<FH>)   {
        if($line=~/^>(\w+)\s/)    {
          my $seq_id=$1;
          $cluster_info[$i][1].=$seqlink;     
          $cluster_info[$i][1]=~s/PARTISEQ/$seq_id/;     
          $cluster_info[$i][1].=$seq_id." </\a>";     
          $num++;
        }
      }
      $cluster_info[$i][2]=$num;
      close(FH);

   #### Now read in associated BLAST information
      my $score=0;
      my $sig=0;
      
      for($n=0;$n<$j;$n++)   {
        $blast_file="blast/$blast_DB[$n]/$file.out";
	if (-e $blast_file)   {
          $cluster_info[$i][3+$n]=&read_blast($blast_file);
          $cluster_info[$i][3+$n]=~s/\|\|(\w+$)//;
          $prog[$n]=$1;
	}
	else {
	  print "\n$blast_file does not exist.\n";
	  sleep(1);
	  $blast_miss=1;
	}    
      }    
      $i++;
      my $blast_percent=(100*$i)/$blast_num;
      printf("\r%3d%% Completed",$blast_percent);
      $|=1;   ### Flushes print buffer
    } 
   
  }
  
  closedir(DIR);
  
  if(-f "Clus/singletons.fasta")  { ### Now do singletons   
    open(FH,"<Clus/singletons.fasta");
    while($line=<FH>)    {
    ### first pattern match to process not yet submitted sequences
      if(($line=~/^>(\w+)\s+(\w\w\w\d\d\d\d\d)$/) or ($line=~/^>(\w+)\s.+(\w\w\w\d\d\d\d\d)/))     {
        $cluster_info[$i][0]=$2;
        my $seq_id=$1;
        $cluster_info[$i][1]=$seqlink;     
        $cluster_info[$i][1]=~s/PARTISEQ/$seq_id/;     
        $cluster_info[$i][1].=$seq_id."</\a>";     
        $cluster_info[$i][2]=1;
        for($n=0;$n<$j;$n++)   {
          my $file1="blast/$blast_DB[$n]/$cluster_info[$i][0].out";
	  if (-e $file1)   {
            $cluster_info[$i][3+$n]=&read_blast($file1);
            $cluster_info[$i][3+$n]=~s/\|\|(\w+$)//;
            $prog[$n]=$1;
	  }
	  else {
	    print "\n$file1 does not exist.\n";
	    sleep(1);
	    $blast_miss=1;
          }
	}
        $i++;
        my $blast_percent=(100*$i)/$blast_num;
        printf("\r%3d%% Completed",$blast_percent);
        $|=1;
      }
    }
    close(FH);
  }
  if ($blast_miss == 1)   {
    sleep(1);
    print colored ("\n\nSome BLAST results are missing.\nWe recommend an update of the respective BLAST\n","magenta bold");
    print "\nDo you want to continue?\n";
    &query_for_continue();  
  }
 #### Now print out in tables of 50 clusters
 
 my $num_tables=1+$i/50;
 my $table_index; 
 
 for($n=0;$n<$i;$n++)
  {
  $table_index=int(1+$n/50);
  if(!(-f "html/table$table_index.html"))
   { 
   open(FH,">>html/table$table_index.html");
   print FH "<HTML>\n<head><title>Table $table_index</title>";
   print FH "<base target=\"_top\"></head><BODY BGCOLOR=\"#FFFFDD\">\n";
   print FH "<TABLE CELLSPACING=1 CELLPADDING=1 border=1 NOSAVE WIDTH=100%>";
   print FH "<tr><td width=9%></td><td width=6%></td><td width=15%></td>"; 
   my $frac=70/$j;
   for($l=0;$l<$j;$l++)
    {  print FH "<td width=$frac%></td>"; }
   print FH "</tr>";
   close(FH);
   }
  open(FH,">>html/table$table_index.html");
  if($cluster_info[$n][2] > 1)
   {
   print FH "<tr><td><a href=\"../phrap/$cluster_info[$n][0].contigs\">$cluster_info[$n][0]</a></td>";
   }
  else
   {
   print FH "<tr><td><a href=\"../blast/passed/$cluster_info[$n][0].txt\">$cluster_info[$n][0]</a></td>";   
   }
  print FH "<td align=center> $cluster_info[$n][2] </td>";
  print FH "<td><font size=1>$cluster_info[$n][1]</td>\n";
  for($l=0;$l<$j;$l++)
   {
   print FH "<td align=center><a href=\"../blast/$blast_DB[$l]/$cluster_info[$n][0].out\">$cluster_info[$n][3+$l]</a></td>\n";
   }
  print FH "</tr>";
  if(int(1+(1+$n)/50) > $table_index) { print FH "</table>"; }
  elsif($n == ($i-1)) { print FH "</table>"; }  ### for last table
  close(FH);
  }

 #### And create master files
open(FH,">html/Results.html");
print FH "<html><head><title>Results Page</title></head>";
print FH "<frameset rows=\"100,*\" framespan=0 frameborder=0 border=0 marginwidth=0>";
print FH "<frame name=\"top-frame\" scrolling=\"vertical\" noresize src=\"table_top.html\">";
print FH "<frame name=\"bottom-frame\" scrolling=\"auto\" noresize src=\"table1.html\">";
print FH "<noframes><body><p>This page uses frames, but your browser doesn't support them.</p>";
print FH "</body></noframes></frameset></html>";
close(FH);

open(FH,">html/table_top.html");
print FH "<HEAD><TITLE>Results Page</TITLE></HEAD><CENTER><B>";
print FH "<FONT SIZE=5 COLOR=\"#111111\">Results Page</FONT></B></CENTER>";
print FH "<CENTER><TABLE CELLSPACING=0 NOSAVE CELLPADDING=0 WIDTH=50%><tr>";
 for($l=1;$l<$num_tables;$l++)
  {
  print FH "<TD align=center><FONT SIZE=3 COLOR=\"#FF6666\"><A HREF=\"table$l.html\"";
  print FH "TARGET=\"bottom-frame\">Page $l</A></FONT></TD>";
  }

print FH "</TABLE><br><TABLE CELLSPACING=1 NOSAVE border=1 CELLPADDING=1 WIDTH=100%>\n";
print FH "<FONT SIZE=4 COLOR=\"#1100FF\"><TR><TD WIDTH=9%>Cluster ID</TD>\n";
print FH "<TD WIDTH=6% align=center><font size=2>No.<br>seqs</TD>\n";
print FH "<TD WIDTH=15% align=center><font size=2>List of sequences</TD>\n";
my $frac=70/$j;
for($l=0;$l<$j;$l++)
 {
 print FH "<TD WIDTH=$frac% align=center>$prog[$l] vrx $blast_DB[$l]</TD>\n";
 }
 print FH "</TR></TABLE></BODY></HTML>";
 close(FH);

 print "\nTables have been generated - if you are running this program remotely,\n";
 print "you will need to copy the following directories and contents into\n";
 print "a web accessible directory (typically \"public_html\" in your home directory - \n";
 print "ask your system administrator for further details). Directories to copy are : \n";
 print colored("html, blast and phrap\n","yellow bold");
 
 print "If you are running a local copy of this program you could view the results now ? ";
 my $view_flag=&yes_no();
 my $my_path=`pwd`;
 chomp $my_path;
 if($view_flag==0)
  {
  print "To view the results open up a web browser and open the file :\n\n";
  print colored("$my_path/html/Results.html\n","yellow bold");
  }
 else  #### Launch a suitable web browser
  {
  my @browsers=('opera','netscape','mozilla','galeon');
  my ($finalpath, $browser, $path, $pathflag);
  $pathflag=0;
  foreach $browser (@browsers)   {
    foreach $path (@PATH)    {
      if (-f "$path/$browser")     {
        print "\nPartiGene has found the web browser \"$browser\" on your system, do you want to use it?";
        $pathflag = &yes_no();        	
	if ($pathflag==1) {$finalpath="$path/$browser";}
	last;
      }    
    }
    if ($pathflag==1) { last;}
  }
  if($pathflag==0) { print colored("Couldn't find a suitable browser - please check your paths\n","red bold"); }
  else 
   { 
   $my_path.="/html/Results.html";
   system("$finalpath file://$my_path >& /dev/null");
   }
  }
 query_for_exit();
 }


##################################################################################
################ SECTION 7 - Import data into database ###########################
##################################################################################

sub do_database()  {
  system("clear");
  print colored("\n\t##### Databasing #####\n","white bold", "on_black");
  unless ($expert==1) {
    print "\nThis facility offers the ability to hold your data in a\n";
    print "SQL database using the public domain databasing software\n";
    print "postgreSQL. PostgreSQL is typically packaged with many Linux\n";
    print "distributions and is also freely available from :\n";
    print "http://www.postgresql.org/\n";
    print "In order to use this databasing facility, you will need to\n";
    print "ensure that postgres is running and that you have permissions\n";
    print "to create new databases - see the website above for more details.\n";
  } 
### Check to see if postmaster is up and running
### and whether user exists as postgresql user
  &postmaster_check();

#### get db, create it or missing tables when necessary
  my ($result,@row);
  $flag=0;
  if($database)  {
    print "\nYou have already defined a database - ";   
    print colored("$database","yellow bold");
    print " would you like to use it ?";
    $flag=&yes_no();
    if($flag==1)    { ##### is $database available ?  
      $conn=DBI->connect("dbi:Pg:dbname=$database", "", "", {PrintError => 0}); #Last two values would be user/pass.
      if (! $conn)   { ### Couldn't connect to the database  
        print "\nCouldn't connect to the database, would you like to create it ? ";
        &query_for_continue();
        &create_db($database,1,0,0,0,0,0,0,0,0); 
      }
      else   { ### Check tables are present - create them if they are not
      ## my $output=`echo "\\d"|psql $database`;
        my @table = $conn->tables('','',undef,'TABLE');
        ##print "@table"; in some DBD:Pg versions the output is not what we want, see work around below
	my $clus_table_flag=0;
        my $est_table_flag=0;
        my $estseq_table_flag=0;
        my $blast_table_flag=0;
        my $protein_table_flag=0;
	my $clonename_table_flag=0;
	my $p4eind_table_flag=0;
	my $p4eloc_table_flag=0;
	my $p4ehsp_table_flag=0;
        $i=0;
        for($n=0; $n < @table; $n++)    {
	  $table[$n] =~ s/public\.//; #get rid of "public." which is present in some versions of DBD.Pg 
          if($table[$n] eq "cluster") { $clus_table_flag=1; }
          if($table[$n] eq "est") { $est_table_flag=1; }
          if($table[$n] eq "est_seq") { $estseq_table_flag=1; }
          if($table[$n] eq "blast") { $blast_table_flag=1; }
          if($table[$n] eq "clone_name") { $clonename_table_flag=1; }
          if($table[$n] eq "p4e_ind") { $p4eind_table_flag=1; }
	  if($table[$n] eq "p4e_loc") { $p4eloc_table_flag=1; }
	  if($table[$n] eq "p4e_hsp") { $p4ehsp_table_flag=1; }
        }
#       print "\n $clus_table_flag $est_table_flag $estseq_table_flag $blast_table_flag $protein_table_flag\n";
        if(($clus_table_flag|$est_table_flag|$estseq_table_flag|$blast_table_flag|$clonename_table_flag|$p4eind_table_flag|$p4eloc_table_flag|$p4ehsp_table_flag)==0)     { 
          &create_db($database,0,$clus_table_flag,$est_table_flag,$estseq_table_flag,$blast_table_flag,$clonename_table_flag,$p4eind_table_flag,$p4eloc_table_flag,$p4ehsp_table_flag);
        }  
      }  
    
##### here updating existing db for Pg2.1 
##### First check whether column retired exists

##### This block had to be removed in 2.2 - column_info method has changed (arguments mandatory now) and there is no way to have one piece of code 
##### which serves both, new and previous DBI 
##### to avoid breaking Pg for some users just call alter table, error message not visible for the user anyway
#      my $sth = $conn->column_info();
#      my $retire_flag = 2; my $est_flag = 2;
#      while (my(@tab) = $sth->fetchrow_array()) {
#        if ($tab[2] eq "cluster" && $tab[3] eq "retired") {       
#          $retire_flag = 1;              
#        }
#	if ($tab[2] eq "est" && $tab[3] eq "est_id") {       
#          $est_flag = 1;              
#        }
#      }
##### create column retired if necessary
#      if ($retire_flag == 2) {
        my $result = $conn->do("ALTER TABLE cluster ADD COLUMN retired int null;");   
#      }
##### renaming est_name if necesary
#      if ($est_flag == 2) { 
        $result = $conn->do("ALTER TABLE est RENAME COLUMN est_name to est_id;");    
#      }  
    }
  }
 
  while($flag==0) { ### Request a new name for a database and check it doesn't already exist  
    my $db_flag=0;
    $database= &get_input("\nPlease enter the name of the database you would like to create\n");
    $conn=DBI->connect("dbi:Pg:dbname=$database", "", "", {PrintError => 0});
    if($conn) {
      $conn->disconnect or warn "Disconnection failed: $DBI::errstr\n";
      print "$database already exists - please specify an alternative name\n";
      next;
    }
    else   {
      print "Use ";
      print colored("$database","yellow bold");
      print " ? : ";
      $flag=&yes_no();
      if($flag==1)     { 
        &create_db($database,1,0,0,0,0,0,0,0,0); 
        print "\nUse this as your default database in future ? ";
        my $use_flag=&yes_no();
        if($use_flag)     {  ## Update the config file
          open(CONFILE,"<$filename") ||  die "Can't find configuration file\n";
          open(TMP, "> tmp.config")         or die "can't open temporary file: $!";
          while (<CONFILE>) {
            $_=~s/^DATABASE\=.+/DATABASE\=$database/;
            print TMP $_;
          }
          close(TMP);
          close(CONFILE);
          rename("tmp.config", $filename);
        }
      }
    }   
  }
  
#### Ready to import data

  $cluster_id=&get_clusterid();
  $conn->disconnect or warn "Disconnection failed: $DBI::errstr\n";
  $conn = DBI->connect("dbi:Pg:dbname=$database", "", ""); ##### Connect to db
  
#### start with clusters  
  $result = $conn->selectall_arrayref("select * from cluster where clus_id~'$cluster_id';");
  my $ntuples=@$result;  # Ie size of array referenced by result
  if($ntuples == 0)  { 
    print "Inserting cluster entries\n";
    &insert_cluster_info();
  }
  else  {
    print "\nCluster entries already exist for this cluster ID - Update the db ? ";
    my $answer=yes_no();
    if($answer==1)   {
##### first updating retired info
      if (-e "retired_contigs")  {
        print "Retiring redundant clusters\n";	  
	my @retired_list = glob ("retired_contigs/*.seq");
        foreach my $retired (@retired_list)  {
          my $contig; my $cluster; 
          if ($retired =~ m/(\w\w\w\d\d\d\d\d)_(\d+)/) { #real clusters
            $cluster = $1;  $contig = $2;
            my $result = $conn->do("UPDATE cluster SET retired = 1 WHERE clus_id = '$cluster' AND contig = '$contig';", {PrintError => 0}); 
          } 
          elsif ($retired =~ m/(\w\w\w\d\d\d\d\d)/) { # singletons, Pg2.0 style
            $cluster = $1;  $contig = 1; 
            my $result = $conn->do("UPDATE cluster SET retired = 1 WHERE clus_id = '$cluster' AND num_ests = '1';", {PrintError => 0});
          }
          else {
	    print colored("### Warning ! ### $retired couldn't be retired,\n","magenta bold");
	  }
	} 
      }      
      print "Deleting old cluster entries\n";   
      $result = $conn->do("DELETE from cluster where clus_id~'$cluster_id' AND retired = '0';");
      $result = $conn->do("DELETE from cluster where clus_id~'$cluster_id' AND retired is NULL;");
      print "Updating cluster entries\n";
      &insert_cluster_info();
    }
  }

#### now est and est_seq
  $result = $conn->selectall_arrayref("select * from est where clus_id~'$cluster_id';");
  $ntuples=@$result;
  if($ntuples == 0)  { 
    print "Inserting sequence entries\n";
    &insert_est_info();
  }
  else  {
    print "\nSequence entries already exist for this cluster ID - Update the db ? ";
    $answer=yes_no();
    if($answer==1)   {
      print "Deleting sequence entries\n";
      $result = $conn->do("DELETE from est_seq where est_seq.est_id=est.est_id and est.clus_id~'$cluster_id';");
      $result = $conn->do("DELETE from est where clus_id~'$cluster_id';");
      print "Updating sequence entries\n";
      &insert_est_info();
    }
  }


#### now clone_id 

  print "\nDo you want to add or update clone name entries ?\n";
  print "(Please note, this feature works only for ESTs downloaded from EBI in option 1\n";
  print "which use the EGTDC naming scheme)\n";
  $answer=yes_no();
  if($answer==1)   {
    my $datafile = "$cluster_id" . "EST";
    open (DATA, "<$datafile") || die "can't open $datafile";
    while (my $line=<DATA>) {
    #if ($line=~/^>(\S+)\s+(\S+)/) { #would be the absolutely take everything version
      if ($line=~/^>/) {
        my $est_id =''; my $clone_id ='';
        if ($line=~/^>(\S+)\s+(\w+_\w+_\w+)/) {
          $est_id = $1; 
          $clone_id = $2;
          if ($clone_id =~ /(\w+_\w+_\w+)_/) { #get rid of _text which is sometimes there
            $clone_id = $1;
          }          
        } 
#### accomodate for alternative downloadstyle    
        if ($line=~/^>(\S+)\s+\|\S+\|\s+(\w+_\w+_\w+)/) {
          $est_id = $1; 
          $clone_id = $2;
          if ($clone_id =~ /(\w+_\w+_\w+)_/) { #get rid of _text which is sometimes there
            $clone_id = $1;
          }          
        } 
        if ($est_id && $clone_id) {        
	  $result = $conn->selectall_arrayref("select est_id from clone_name where est_id~'$est_id';");
          $ntuples=@$result;
          if($ntuples == 0)  { 
            my $insert = $conn->do("INSERT INTO clone_name (est_id, clone_id) values ('$est_id','$clone_id');", {PrintError => 0});
          }      
          else  {
            my $update = $conn->do("UPDATE clone_name SET clone_id = '$clone_id' WHERE est_id = '$est_id';", {PrintError => 0});           
          } 
        }	
      }	
    }
  } 


#### now blast  
  $result = $conn->selectall_arrayref("select distinct(db) from blast where clus_id~'$cluster_id';");
  $ntuples=@$result;
  my @table_blasts;
  if($ntuples > 0)  {
    print "\nBLAST entries already exist for this cluster ID -\n\n";
    $result = $conn->prepare("select distinct(db) from blast where clus_id~'$cluster_id';");
    $result->execute() or die;
    for($i=0; $i<$ntuples; $i++)   {
      $table_blasts[$i] = $result->fetchrow_array();       
      print colored("$table_blasts[$i]\n","yellow bold");   
    }
    print "\nWould you like to remove some or all of them ? ";
    my $answer=&yes_no();
    print "\n";
    if($answer==1)   {
      for($i=0;$i<$ntuples;$i++)    {
      print "Delete $table_blasts[$i] ? ";
      my $delete_blast=&yes_no();
      if($delete_blast==1) {
        $result = $conn->do("DELETE from blast where clus_id~'$cluster_id' and db='$table_blasts[$i]';"); }
      }
    }
  }
  print "\nInserting BLAST info\n";
  &insert_blast_info();
  
#### and finally prot4EST results  
  print "\nDo you want to insert results from prot4EST into $database? ";
  my $p4e= &yes_no();
  if ($p4e==1) {&insert_p4e_info}
    
    
  query_for_exit();

}


###################################################################################
###################################################################################
###                                                                             ###
###                               all the subs                                  ###
###                                                                             ###
###################################################################################
###################################################################################


###################################################################################
###                            general subs first                               ###
###################################################################################


############################################################################################################################
sub create_conf() {   #### creates configfile if not found in home directory
    my $config_file = `echo \$HOME`; chomp $config_file;
    $config_file = "$config_file" . "/.partigene.conf";
    open (CONF, ">$config_file") || die "Sorry, could not open/.partigene.conf $!";
    print CONF "### location of blastable database, set to biolinux standard, modify if necessary ###\n";
    print CONF "BLASTDB=/home/db/blastdb\n";
    print CONF "### location of vector.seq file for vector screening, set to biolinux standard, modify if necessary ###\n";
    print CONF "VECTOR=/usr/software/phrap/phrap/vector.seq\n";
    print CONF "### name of your PartiGene database, can be modified from within the script ###\n";
    print CONF "DATABASE=newdb\n";
    print CONF "### directory where PartiGene starts to look for phred quality files (see documentation for details) ###\n";
    print CONF "SEQREP=/home/user1/project\n";
    print CONF "### quality score used in case no phred quality files are present ###\n";
    print CONF "QUALSCORE=15\n";
    print CONF "### url for sequence download, only needs be changed to be changed when ebi changes their set up ###\n";
    print CONF "SEQLINK=<a href=http://srs6.ebi.ac.uk/srs7bin/cgi-bin/wgetz?-e+[embl-acc:PARTISEQ]>\n"; 
    close (CONF) || die "Error - cannot close: $!\n";
}    
#######################################################################################################             


#########################################################################################################################
sub options () {   #### options menu
   my $answer=0;
   while($answer!=8) {
      $answer=&title_page();
      if($answer==1)  { &get_sequences(); } # Download sequences
      if($answer==2)  { &trim(); } #trimming
      if($answer==3)  { &cluster(); } #Cluster using CLOBB
      if($answer==4)  { &assemble_clusters(); } #Assemble clusters
      if($answer==5)  { &blast_annotation(); } #Do some BLASTing
      if($answer==6)  { &html_tables(); } #Create HTML tables
      if($answer==7)  { &do_database(); } #Create/update database        
   }
   system("clear");
   exit();   #exit program
}
############################################################################################################################

   
###################################################################################
sub title_page() {  #### title comments ans option selection
    print_title();
    print "\n\t  Enter the number corresponding to the part of the PartiGene\n";
    print "\t  process you want to perform:\n\n";
    print "\t\t1. Download sequences from EBI for analysis.\n";
    print "\t\t2. Pre-process sequences.\n";
    print "\t\t3. Cluster sequences.\n";
    print "\t\t4. Assemble clusters.\n";
    print "\t\t5. Perform BLASTs.\n";
    print "\t\t6. Create HTML tables of results.\n";
    print "\t\t7. Construct relational database of results.\n";
    print "\t\t8. Quit.\n\n\t";

    my $flag=0;
    while($flag==0) {
	$answer=<>;
	if($answer=~/^[1|2|3|4|5|6|7|8]$/) { $flag=1; next; }
        else {print " You have entered $answer This is not an option. Please try again\n";}
    }
    return $answer;
}
####################################################################################


############################################################################################################################
sub print_title() {    ##### title scheme
#displays title
    print "\n\n"; 
    print colored("\t################################################################\n","white bold", "on_black");
    print colored("\t###                                                          ###\n","white bold", "on_black");
    print colored("\t###   PartiGene - a script to convert individual sequences   ###\n","white bold", "on_black");
    print colored("\t###   (typically ESTs) into a Partial Genome. Vs 2.2.0       ###\n","white bold", "on_black");
    print colored("\t###                                                          ###\n","white bold", "on_black");
    print colored("\t###   Ralf Schmid and colleagues for the EGTDC 2004          ###\n","white bold", "on_black");
    print colored("\t###                                                          ###\n","white bold", "on_black");
    print colored("\t###   News, upgrades and help:   nematode.bioinf\@ed.ac.uk    ###\n","white bold", "on_black");
    print colored("\t###   Help for EG-Awardees:      helpdesk\@envgen.nox.ac.uk   ###\n","white bold", "on_black");
    print colored("\t###                                                          ###\n","white bold", "on_black");
    print colored("\t################################################################\n","white bold", "on_black");
}
###########################################################################################################################


#########################################################################################################################
sub postmaster_check() {
#### check for postmaster/postgresql process
  my $postmaster=`ps -e|grep postmaster`; ### See if the process is running
  my $username = `whoami`; chomp $username;
  if(!$postmaster)  {
    print colored("\n#### Postmaster is not running ####\n","red bold");
    print colored("Please ensure that postgreSQL is correctly installed and running\n","red bold");
    exit();
  }

#### check whether user does exist   
  my $user_status = system ("psql -l > /dev/null"); ### command will fail unless (postgresql) user exists      
  unless ($user_status == 0) {
    my $username = `whoami`; chomp $username;
    print colored("\n\t#### CONNECTION TO POSTGRESQL FAILED ####\n","red bold");
    print colored("Most likely you have forgotten to run \"createuser $username\"\n","red bold");
    print colored("during the postgreSQL setup\n","red bold");
    exit();
  }
}
#########################################################################################################################


#########################################################################################################################
sub find_program() { #### find executables
  my $prog=$_[0];
  my $pathflag=0;
  my $path;
  my $finalpath;
  foreach $path (@PATH) {
    if (-f "$path/$prog") {
      $pathflag=1; $finalpath=$path; last;
    }
  }
  if($pathflag==0)   { 
    print colored("\nCan't find the $prog utility on your system\n","red bold");
    print colored("Please ensure it is installed and in your path\n\n","red bold");
    exit();
  }
  else  {  return "$finalpath/$prog";  }
}
#########################################################################################################################


###########################################################################################################################
sub query_for_exit() {   #### exits program if 'n' entered back to main for y
   print "\nWould you like to continue with the PartiGene process? ";
   print colored("\n[y/n] : ","green bold");
   my $input='';
   while($input!~/y|n/i)   {
     print "\b";
     $input=<STDIN>;
     chomp $input;
  }
  if ($input=~/^y/i) { print "\nOK: Back to main menu\n"; &options;}
  if ($input=~/^n/i) { print "\nExiting the program\n"; exit(); }
}
####################################################################################


###########################################################################################################################
sub query_for_continue() { #### exits program if 'n' entered carries on for'y'
   print colored("\n[y/n] : ","green bold");
   my $input='';
   while($input!~/y|n/i)   {
     print "\b";
     $input=<STDIN>;
     chomp $input;
  }
  if ($input=~/^n/i) { print "\nExiting the program\n"; exit(); }
}
####################################################################################


####################################################################################
sub yes_no() {   #### returns '1' for yes 
  my $yflag=0;
  print colored("\n[y/n] : ","green bold");
  my $input='';
  while($input!~/y|n/i)   {
    print "\b";
    $input=<STDIN>;
    chomp $input;
  }

  if($input=~/^y/i) { $yflag=1; }
  return $yflag;
}
####################################################################################


####################################################################################
sub get_input() {#### uses either readline module if available
  my $input;
  my $question = shift (@_);
  if ($read_gnu) {	#true if gnu readline module installed
    $|=1;
    $input = $term->readline("$question");	#print question and get user input
  } 
  else {	#do it the old way, without readline
    print "$question";
    $input =<>;
  }  
  chomp $input;	
  $input =~ s/\s*$//;	
  return $input;
}
######################################################################################


######################################################################################
sub check_directory() { #### removes everything from dir, but keeps some stuff in blast dir
  my $dir=$_[0]; 
  if(-d "$dir")   { 
    my @list = glob("$dir/*");
    my $size=@list;
    if ($size > 0)     {
      if($dir ne "blast")    {
        print colored("\nDirectory '$dir' already contains some files which may\n","magenta bold");
        print colored("interfere with this program, shall we remove them ?","magenta bold");
        query_for_continue();
        system("rm -rf $dir");
        system("mkdir $dir");
      }
      else    {
        my @bl_dir=`ls -d blast/*`;
        my $file;
        print colored("\nYou may already have performed the following blasts for this set\n","magenta bold");
        print colored("of clusters :\n","magenta bold");
        my $i=0;
        foreach $file (@bl_dir)     {
          chomp $file;
          $file=~s/blast\///;
          if ($file ne "passed" && -d "blast/$file")     { $i++; print "$file\n"; }
        }
        print "\n";
    }
   }
  }
  elsif(-f "$dir")  {
    print colored("\nA file with the name '$dir' already exists and may\n","magenta bold");
    print colored("interfere with this program, shall we remove it ?","magenta bold");
    query_for_continue();
    system("rm -rf $dir");
    system("mkdir $dir");  
  }
  else { system("mkdir $dir"); }
}
###################################################################################


###################################################################################
sub get_clusterid {  #### gets cluster_id from CLOBB blasts, exits if that is assigned as wrong, returns id
  my $flag=0;
  if($cluster_id)  {
    my $cluster_file=$cluster_id."EST";
    if(-s $cluster_file)  { 
      print "You have previously used $cluster_id as the cluster identifier\n";
      print "Is this correct ? ";  
      query_for_continue();  
      $flag=1;
    }
  }
  if($flag==0)  { 
    my $clus_flag=0;
    my $input;
    while($clus_flag==0)   {
      $input = &get_input ("Enter three letter cluster ID you have previously defined : ");
      my $clus_file=$input."EST";
      if(-f $clus_file) { $clus_flag=1; $cluster_id=$input;}
      else     { 
        print colored("$input","red bold");
        print colored(" is not valid","red");
        query_for_exit();
      }
    }
  }
  return $cluster_id;
}
###################################################################################


###################################################################################
###                            processing subs 2nd                              ###
###################################################################################

###############################################################################
sub trim_sequence() {   ##### polyA, polyT treatment
  my $in_seq=$_[0];
  my ($trim,$i,$l,$sub);
 
  if($in_seq=~/.{150,}?(aaaaaaaaaaa.+)/i)  {
    $trim=$1;
    $l=length($trim);
    $sub='';
    for($i=0;$i<$l;$i++) { $sub.='X'; }
    $in_seq=~s/(.{150,}?)aaaaaaaaaaa.+/$1$sub/i;
  }
  if($in_seq=~/^(.{0,100}ttttttttttt)/i) {
    $trim=$1;
    $l=length($trim);
    $sub='';
    for($i=0;$i<$l;$i++) { $sub.='X'; }
    $in_seq=~s/(.{0,100})ttttttttttt/$1$sub/i;
  }
  if($in_seq=~/^(.{0,100}nnn)/i) { ### String of n's at beginning of sequence  
    $trim=$1;
    $l=length($trim);
    $sub='';
    for($i=0;$i<$l;$i++) { $sub.='X'; }
    $in_seq=~s/.{0,100}nnn/$sub/i;
  }   
  if($in_seq=~/(nnn.{0,100}$)/i)  {### String of n's at end of sequence
    $trim=$1;
    $l=length($trim);
    $sub='';
    for($i=0;$i<$l;$i++) { $sub.='X'; }
    $in_seq=~s/nnn.{0,100}$/$sub/i;
  }

  return $in_seq;
}
#########################################################################################


##########################################################################################
sub specify_blasts  {  #### create list for available blasts
  my @prot_blsdbs=`ls $blastdb/*.pin`;
  my @nuc_blsdbs=`ls $blastdb/*.nin`;  
  my ($pbdb, $nbdb);
  
  print colored("### Available protein databases ###\n","green bold");
  $i=1;
  foreach $pbdb(@prot_blsdbs)  {
    chomp $pbdb;
    $pbdb=~s/\.pin//;
    $prot_db[$i]=$pbdb;
    $pbdb=~s/$blastdb\///;
    printf("%2d %-25s ",$i,$pbdb);
    if($i % 3==0) { print "\n"; }
    $i++;
  }

  print colored("\n### Available nucleotide databases ###\n","cyan bold");
  $i=1;
  foreach $nbdb(@nuc_blsdbs)  {
    chomp $nbdb;
    $nbdb=~s/\.nin//;
    $nuc_db[$i]=$nbdb;
    $nbdb=~s/$blastdb\///;
    printf("%2d %-25s ",$i,$nbdb);
    if($i % 3==0) { print "\n"; }
    $i++;
  }
 
  print "\nPlease enter the number of the database you would like to blast and\n";
  print "the type of blast you would like to perform separated by a comma.\n"; 
  print "For example '1, blastn' followed by 'q' for finish would perform a\n";
  print " single BLASTN against nucleotide database 1.\n\n";
  print "If you would like to specify several databases, separate the numbers\n";
  print "by using '+'. E.g. entering 1+2+4, blastx, would perform a single BLASTX\n";
  print "search for each sequence against the combined protein databases 1,2 and 4\n";
  print "Upto 5 different BLASTs are allowed - enter 'q' to finish\n\n";
  print "If you would like to run more than one BLAST enter e.g '1 , blastx' RETURNKEY\n";
  print "'2 , blastx' RETURNKEY '4, blastx' RETURNKEY followed by 'q' to finish.\n";
  print "This would run three BLASTs (one against each of the protein databases 1,2 and 4)\n"; 

  $i=0;
  $input='';
  while($input!~/q/ && $i<5) {
    $input= &get_input("\n");    
    my $prog='';
    my $nums='';
    if ($input=~/(.+),.*?(t*blast[x|n])/i)  {
      $nums=$1;
      $prog=$2;
    }
    if($prog=~/^blastx$/i)  {
      if($nums!~/\+/)   { ### Just one db specified
        $nums=~/(\d+)/;
        $dbid=$1;
        if($prot_db[$dbid])   {
          $blast_DB[$i]=$prot_db[$dbid];
          $blast_EXE[$i]="blastx";
          $i++;
        }
        else { print "BLASTX cannot be performed with $dbid\n"; }
      } 
      else {  #### There are more than one databases to blast against
        my @db_list=split(/\+/,$nums);
        my $blast_txt="\"";
        $flag=0;
        foreach $dbid(@db_list)    {
          if($prot_db[$dbid])     { $blast_txt.=$prot_db[$dbid]." "; }
          else { print "BLASTX cannot be performed with $dbid\n"; last; $flag=1; }
        }
        if($flag==0)    {
          $blast_DB[$i].=$blast_txt."\"";
          $blast_EXE[$i]="blastx";
          $i++;
        }
      }
    }  
    elsif($prog=~/^blastn$/i || $prog=~/^tblastx$/)  {
      if($nums!~/\+/)   { ### Just one db specified   
        $nums=~/(\d+)/;
        $dbid=$1;
        if($nuc_db[$dbid])    {
          $blast_DB[$i]=$nuc_db[$dbid];
          $blast_EXE[$i]=$prog;
          $i++;
        }
        else { print "$prog cannot be performed with $dbid\n"; }
      }
      else  { #### There are more than one databases to blast against   
        my @db_list=split(/\+/,$nums);
        my $blast_txt="\"";
        $flag=0;
        foreach $dbid(@db_list)    {
          if($nuc_db[$dbid])     { $blast_txt.=$nuc_db[$dbid]." "; }
          else { print "$prog cannot be performed with $dbid\n"; $i--; }
        }
        if($flag==0)    {
          $blast_DB[$i].=$blast_txt."\"";
          $blast_EXE[$i]="$prog";
          $i++;
        }
      }
    }
    elsif($input!~/q/)  { print "No valid input given - try again\n"; }
  }
return $i;
}
#######################################################################################


#######################################################################################
sub read_blast() {  #### blast parsing
  use POSIX qw(strtod); 
  my $blastfile=shift;
  my $in = new Bio::SearchIO( -format => 'blast', -file   => $blastfile);
  my $score='';
  my $sig='';
  my $text='';
  my $desc='';
  my $name='';
  my $prog='';
  while( my $result = $in->next_result )  {
    $prog=$result->algorithm;
    if (my $hit = $result->next_hit)   {
      $score=$hit->raw_score;
      $desc=$hit->description;
      $sig=$hit->significance();
      $name=$hit->name;
      last;
    }
  }
  if(strtod($score) < 50) { $text="No significant hit found"; }
  else { $text=$name."  ".$desc."<br><font color=\"#FF0000\"> $score"." $sig</font>"; }
  $text.="||".$prog;
  return $text;
}
####################################################################################


####################################################################################################
sub contig_check ()  { ### This sub updates list of retired contigs and updates contig numbering from phrap output
  my $new_file = $_[0];
  my $old_file = $_[1];
  my $cluster_id = $_[2];
  my $out = '';
  my $file_out = "temporary_out_file";
  my $contig_number_old = '';
  my $contig_number_new;
  my $retired_contig_list = '';

#### get number of last contig which should have the highest number
  if ($old_file eq "1") { #previous singletons
    $contig_number_old = 1;
  }
  else {  
    open (OLDFILE,"<$old_file")||  die "Can't find $old_file\n";;    
    while (my $line=<OLDFILE>)      {
      if($line=~/^>\w+\.[a-zA-Z_]+(\d+)/)      {
        $contig_number_old = $1;      
      }     
    }    
    close OLDFILE; 
  }
#### update contig numbers in contigfiles  
  $contig_number_new = $contig_number_old;
  open(FILE,"<$new_file") ||  die "Can't find $new_file\n"; 
  while(my $line=<FILE>)          {
    if($line=~/^>(\w+\.\w+)/)            {
      $retired_contig_list = "$retired_contig_list" . "$1\n"; #update retired contigs
      $contig_number_new++;
      $line =~ s/Contig\d+/Contig$contig_number_new/; #update contig numbering     
      $out = "$out" . "$line";
    } 
    else {$out = "$out" . "$line";}
  }      
  close FILE;      	             

  open (OUT, ">$file_out") ||  die "Can't find $file_out\n";;
  print OUT $out;
  close OUT;
  system ("mv $file_out $new_file");
 
#### and now update quality files  
  $contig_number_new = $contig_number_old; $out = '';
  open(FILE,"<$new_file.qual") ||  die "Can't find $new_file.qual\n"; 
  while(my $line=<FILE>)          {
    if($line=~/^>(\w+\.\w+)/)            {
      $contig_number_new++;
      $line =~ s/Contig\d+/Contig$contig_number_new/; #update contig numbering     
      $out = "$out" . "$line";
    } 
    else {$out = "$out" . "$line";}
  }      
  close FILE;      	             

  open (OUT, ">$file_out") ||  die "Can't find $file_out\n";;
  print OUT $out;
  close OUT;

  system ("mv $file_out $new_file.qual");
  
#### and finally update ace files  
  $contig_number_new = $contig_number_old; $out = '';
  $new_file =~ s/\.contigs//;
  if (-e "$new_file.ace") { #if not then pseudocontig has replaced contig
    open(FILE,"<$new_file.ace") ||  die "Can't find $new_file.ace\n"; 
    while(my $line=<FILE>)          {
      if($line=~/^CO\s/)            {
        $contig_number_new++;
        $line =~ s/Contig\d+/Contig$contig_number_new/; #update contig numbering     
        $out = "$out" . "$line";
      } 
      else {$out = "$out" . "$line";}
    }      
    close FILE;      	             
  
    open (OUT, ">$file_out") ||  die "Can't find $file_out\n";;
    print OUT $out;
    close OUT;
    system ("mv $file_out $new_file.ace");
  }
}
#######################################################################################################


#######################################################################################################
sub process_clusters()  { #### splits the contig and qual files into individual files for decoder
  my $suffix=$_[0];
  my $newsuffix=$_[1];
  opendir(DIR, "./phrap");
  my ($file, $n, $line, $flag, $in_seq, $newfile, $header);
  while(defined($file= readdir(DIR))) {
    if($file=~/$suffix$/ && (-s "./phrap/$file" > 10))  {
      open(FSAFILE, "<phrap/$file");
      while($line=<FSAFILE>)   {
		  if($line=~/^>(.+)/)     {
          $header=$1;
          $newfile=$file;
          $newfile=~s/$suffix//;    
          if($header=~/Contig(\d+)/)      {$newfile .= "_".$1}
          $newfile.=$newsuffix;
          open(OUTFILE, ">protein/$newfile");
          $newfile=~s/$newsuffix//g;
          print OUTFILE ">$newfile\n";
        }
        else    {print OUTFILE "$line";}
      }
      close(FSAFILE);
    }
  }
}
###############################################################################


###############################################################################
sub process_singletons() {#### prepare singleton files for prot4EST
  my $basedir=$_[0];   ### Location of sequence quality files
  my $clone_id=$_[1];  ### For EGTDC style traces
  my $cluster_id=$_[2];
  my $clone_pattern=$_[3]; ### For non-standard traces
  my $trace_dir=$_[4];

#### Read in sequences one at a time
  my $line;
  open(SINGFILE, "<Clus/singletons.fasta");
  my $flag=0;
  my $header='';
  my $seq='';
  while ($line = <SINGFILE>) {
    if($line!~/^>/) { chomp $line; $seq.=$line; next; }
    if($line=~/^>/ && $seq)  {
      &prepare_qual("protein/$file",$header,$seq,1,$clone_id,$basedir,$clone_pattern,$trace_dir);
      $seq='';
    }
    if($line=~/^>(.+)/)   {
      $header=$1 . "_1";
      $header=~/($cluster_id\d\d\d\d\d)/;   ### Take cluster ID as the file name
      $file=$1 . "_1";
      next;
    }
    if($line!~/^>/) { chomp $line; $seq.=$line; next; }
  }
  close(SINGFILE);
  &prepare_qual("protein/$file",$header,$seq,1,$clone_id,$basedir,$clone_pattern,$trace_dir); #### do the last sequence
}
###############################################################################


###############################################################################
sub prepare_qual() {#### prepare quality files for phrap and preprocessing step to prot4EST
  my $outfile=$_[0];
  my $header=$_[1];
  my $seq=$_[2];
  my $two_seq_flag=$_[3];
  my $clone_id=$_[4];
  my $basedir=$_[5];
  my $clone_pattern=$_[6];
  my $trace_dir=$_[7];
  my $file_location='';
  my $seqfile='';
  my $lib='';
  my ($in_seq, $line, $n, $seq_begin, $seq_end, $qualseq, @qual_scores, @readseq);
  my ($end, $start, $startl, $endl);
  my ($seq_length, $beg_low, $end_low);
  my $special = 0;
  my @location_list;
  
## Line for EGTDC style traces ##########################
  if($clone_id) {
    if($header=~/($clone_id\_\w{2,5}\_[a-z0-9]+)/i)   {
      $seqfile=$1;
      $seqfile=~/$clone_id\_([A-Za-z0-9]{2,5})\_[a-z0-9]+/i;
      $lib=$1;
      $file_location="$basedir/$lib/";
##### catch here for EGTDC datastructure  
      if ($basedir =~ /trace2dbest$/i) {   
        $special =1;
        @location_list = glob ("$basedir/*");         
      }
    }
  }

## Line for WashU traces ##########################
   
  if($header=~/(\w\w\d\d\w\d\d\.y\d)/)  {
    $seqfile=$1;
    $lib="washu";
    $file_location="$basedir/$lib/";
  }

## Line for Other traces here ##########################

  if($clone_pattern) {
    if($header=~/($clone_pattern)/i)   {
      $seqfile=$1;
      $file_location="$trace_dir/";
    }
  }

########################################################
  
  $outfile=~/\/(.+)$/;	### Take cluster ID as the file name
  my $title=$1;
  my $orig_file_flag=0;

  if($outfile=~/protein/)  {
    open(QUALFILE,">>$outfile.qlt");
    open(SEQFILE,">>$outfile.seq");
	 ### I think prot4est likes just cluster_name/contig as identifier!? check
    my $new_header = $outfile;
    $new_header =~ s/protein\///;
    print SEQFILE ">$new_header\n";
    print QUALFILE ">$new_header\n";
  }
  else  {
    open(QUALFILE,">>$outfile.qual");
    open(SEQFILE,">>$outfile");  
	 print SEQFILE ">$header\n";
    print QUALFILE ">$header\n";
  }

  unless (@location_list) {
    push (@location_list, $file_location);
  }  

  foreach $file_location (@location_list) {
    if ($special == 1)  {
      $file_location =~ /\/([\w\-:]+)$/;
      $file_location = "$basedir/" . "$1" . "/partigene"; 
      ###
    } 
    if(-f "$file_location/$seqfile.seq") {
      open(FH,"<$file_location/$seqfile.seq");
      $in_seq='';
      while($line=<FH>)  { 
        if($line!~/^>/) { chomp $line; $in_seq.=$line; }  
      }
      close(FH);
      $seq_begin=$seq;  ### Find start and end of used sequence
      $seq_begin=~s/^(.{10}).+/$1/;
      $seq_end=$seq;
      $seq_end=~s/.+(.{10})$/$1/;
      my $statfile='';
      if (-f "$file_location/$seqfile.exp")  { 
        $statfile="$file_location/$seqfile.exp"; 
      }
      elsif (-f "$file_location/$seqfile.qual") { 
        $statfile="$file_location/$seqfile.qual"; 
      }
      if ($statfile && $two_seq_flag==1) { ### Have quality file   
        $found_a_quality_file=1;
        $orig_file_flag=1;
        open(FHA,"<$statfile");
        while($line=<FHA>)   {
          $line=~s/^AV\s//;
          if($line!~/^>/) {  $qualseq.=" ".$line; }
        }
        close(FHA);
        if($in_seq=~/(.+)$seq_begin.+$seq_end(.+)/)   {
          $start=$1; $end=$2;
          $startl=length($start);
          $endl=length($end);
        }
        else { $startl=0; $endl=0; }

 # Find beginning and end low qual regions which may have been
 # submitted by accident (basically looking for strings of 0's
 # in the quality file at beginning and end of sequence

        $beg_low1=$end_low=$beg_low=$end_low1='';
        @qual_scores=split(' ',$qualseq);
        $qualseq=~s/\d0\s/39/g;
        $qualseq=~s/\s//g;
        if($qualseq=~/^(0+)/) {  $beg_low=$1; }
        if($qualseq=~/(0+)$/) {  $end_low=$1; }
        $beg_low=length($beg_low);
        $end_low=length($end_low);
        $seq_length=length($in_seq);

        if($startl < $beg_low) { $startl=$beg_low; }
        if($endl < $end_low) { $endl=$end_low; }

        $n=0;
        @readseq=split('',"$in_seq");
        for ($i=$startl;$i<$seq_length-$endl;$i++)   {
          print SEQFILE "$readseq[$i]";
          print QUALFILE "$qual_scores[$i] ";
          $n++;
          if($n % 30 == 0) {  print QUALFILE "\n"; }	 
          if($n % 60 == 0) {  print SEQFILE "\n"; }	
        }
        if($n % 30 != 0) {  print QUALFILE "\n"; }	 
        if($n % 60 != 0) {  print SEQFILE "\n"; }	
        if($seq_length-$endl-$startl < 1)    {
          $orig_file_flag=0;
          $trace_problem_flag=1;
        }
      }
    }
  }
  if($orig_file_flag==0)   { #### Couldn't find original files on system ####
    $n=0;
    while ($seq=~/(.)/g)    {
      print SEQFILE $1;
      print QUALFILE "$QUAL_SCORE ";
      $n++;
      if($n % 30 == 0) {  print QUALFILE "\n"; }	 
      if($n % 60 == 0) {  print SEQFILE "\n"; }	
    }
    if($n % 30 != 0) {  print QUALFILE "\n"; }	 
    if($n % 60 != 0) {  print SEQFILE "\n"; }	
    close(QUALFILE);
    close(SEQFILE);
  }
}
###################################################################################


####################################################################################
###                           3rd database subs                                  ###
####################################################################################
sub create_db()  {   ####  tables to for partigene and prot4est output to start with
  my $database=shift;
  my $new_db=shift;
  my @table_flag;
  $table_flag[0]=shift;
  $table_flag[1]=shift;
  $table_flag[2]=shift;
  $table_flag[3]=shift;
  $table_flag[4]=shift;
  $table_flag[5]=shift;
  $table_flag[6]=shift;
  $table_flag[7]=shift;

  my $createdb_exe=&find_program("createdb");
  if($new_db) { system("$createdb_exe $database >& /dev/null"); }
 
  $conn=DBI->connect("dbi:Pg:dbname=$database", "", "");
  if($table_flag[0]==0) {
    my $result=$conn->do("create table cluster (clus_id varchar(10) not null, num_ests
      int null, contig int null, consensus text null, retired int null, primary key (clus_id, contig));"); 
  }
  if($table_flag[1]==0) {
    my $result=$conn->do("create table est (est_id varchar(15) not null primary key, clus_id 
       varchar(10) not null, contig int null, type smallint null, library int null, a_start
       int null, a_end int null, q_start int null, q_end int null);"); 
  }
  if($table_flag[2]==0) {
    my $result=$conn->do("create table est_seq (est_id varchar(15) not null, 
       sequence text null, constraint est_id_fk foreign key (est_id) references est (est_id));"); 
  }
  if($table_flag[3]==0) {
    my $result=$conn->do("create table blast (clus_id varchar(10) not null, prog
      varchar(10) null, db varchar(30) null, date char(11) null, score float null,
      id text null, description text null, frame smallint null, b_start int null,
      b_end int null, contig smallint null);"); 
  }
  if($table_flag[4]==0) {
    my $result=$conn->do ("create table clone_name (est_id varchar(15) not null primary key, clone_id varchar(15));");  
  }
  if($table_flag[5] == 0) {
    my $result=$conn->do ("create table p4e_ind (pept_ref int not null primary key, pept_id varchar(10) not null, clus_id varchar(10) not null,
    contig int, date varchar (12), method varchar(25), gen_code varchar(25), active bool, seq text);");  
  }
  if($table_flag[6] == 0) {
    my $result=$conn->do ("create table p4e_loc (pept_ref int not null, xtn_s int, conf_s int, frame_s int, conf_e int, xtn_e int, frame_e int,
    constraint pept_ref_fk foreign key (pept_ref) references p4e_ind (pept_ref));");  
  }
  if($table_flag[7] == 0) {
    my $result=$conn->do ("create table p4e_hsp (pept_ref int not null, hsp_num int not null, p_start int, p_end int, frame int, evalue float, 
    bit_score float, db_xref_id varchar(20), constraint pept_ref_fk foreign key (pept_ref) references p4e_ind (pept_ref), primary key (pept_ref, hsp_num));");  
  }
  my $errorMessage = $conn->errstr;
  if ($errorMessage) {print "$errorMessage\n";}
}
############################################################################################## 
 

############################################################################################## 
sub insert_cluster_info { #### processing cluster and singleton information for databasing
  my $line;
  my $sequence='';
  my (@length,@seq);
  my $num_contigs=0;
  my $est_num=0;
         
  opendir(DIR, "Clus") || die "Can't find the Clus directory";
  while (defined($file= readdir(DIR)))  {
    my $pseudoflag = 0; 
    if($file=~/$cluster_id/)    {# This is a cluster - treat as such   
      open(FH,"<Clus/$file");
      $est_num=0;
      while($line=<FH>)  {
        chomp $line;
        if($line=~/^>/)   { $est_num++; }
      }
      close(FH);
      
      if ((-e "phrap/$file.contigs") && (-s "phrap/$file.contigs" > 0))  {
        open(FH,"<phrap/$file.contigs");
	$num_contigs=0; @seq = '';
	while($line=<FH>)   {
	  if($line=~/^>\w+\.Contig(\d+)/)    {
	    $num_contigs = $1;
	    $sequence=''; 
	  }
          else { 
	    chomp $line; 
	    $sequence.=$line; 
	    $seq[$num_contigs]=$sequence; 
            $length[$num_contigs]=length($sequence);             
	  }	       
        }
        close(FH);
      }
         		
      else  { # This cluster didn't assemble
              # Make the contig - the longest sequence
        $pseudoflag = 1; $sequence='';
	print("phrap did not assemble $file - therefore using the pseudocontig file\n");
        open(FH,"<phrap/$file.pseudocontig") || die "No pseudocontig file either !"; 
        while($line=<FH>)  {
          if($line!~/^>/) { chomp $line; $sequence.=$line; next; }
        }
        close(FH);
        $seq[1]=$sequence; 
        $length[1]=length($sequence); 
        my  $pseudoname = "$file" . "_";
	my @pseudo = glob ("protein/$pseudoname*seq");
        if ($pseudo[0] =~ /_(\d+)\.seq/) {$num_contigs = $1;}
	else {$num_contigs=1;}
        $sequence='';
      }
      
      if ($pseudoflag == 1) { #pseudocontig, keep contig id number
	my $result = $conn->do("INSERT INTO cluster (clus_id,num_ests,contig,consensus,retired) values ('$file',$est_num, $num_contigs,'$seq[1]',0);",{PrintError => 0});
      }  
      else {
        for($i=1;$i<=$num_contigs;$i++)   {
          if (defined ($seq[$i])) {
            my $result = $conn->do("INSERT INTO cluster (clus_id,num_ests,contig,consensus,retired) values ('$file',$est_num, $i,'$seq[$i]',0);",{PrintError => 0});
          }
        }
      }
    }
  } 
  closedir(DIR);

###### Now process the singletons (clusters with only one sequence)
  my $inseq  = Bio::SeqIO->new('-file' => "Clus/singletons.fasta",
                         '-format' => 'Fasta');
  while (my $seq = $inseq->next_seq)  {
    my $sequence=$seq->seq();
    my $description=$seq->desc();
    if ($description =~ /($cluster_id\d+)$/) {
      my $descript = $1; 
      my $result = $conn->do("INSERT INTO cluster (clus_id,num_ests,contig,consensus,retired) values ('$descript','1','1','$sequence',0);", {PrintError => 0});
    }
    else {print "\nWARNING: Couldn't find $cluster_id in $description\n"} 
  }
}
############################################################################################## 


##############################################################################################
sub insert_est_info {
  my $cluster_file=$cluster_id."EST";
  my ($line,$name,$sequence,$description,$cluster_num,@row);
  my $inseq  = Bio::SeqIO->new('-file' => "$cluster_file",
                         '-format' => 'Fasta');

  while (my $seq = $inseq->next_seq)  {
    $name=$seq->display_id();
    $sequence=$seq->seq();
    $description=$seq->desc();
    $description=~/($cluster_id\d+)$/;
    $cluster_num=$1;
 
  ######## First step - find out which cluster the sequence is in
  ######## if its unique within a cluster give it the max indexes
  ######## if it isn't, find out the indexes from the phrap ace file
  ######## AF <sequence name> U/C X gives the start
  ######## RD <sequence name> Y gives the end

    my $result = $conn->prepare("Select num_ests from cluster where clus_id='$cluster_num' and retired='0';");
    $result->execute() or die;
    my @row =$result->fetchrow_array();

    if($row[0] == 1)    { # it is a singleton
      $end=length($sequence);
      my  $identifier = 1; # contig = 1, unless other evidence 
      my @identifiers = glob ("protein/$cluster_num*");
      if (@identifiers == 2) { 
	if ($identifiers[0] =~ /\w\w\w\d\d\d\d\d_(\d+)/) {$identifier = $1;}
      }
      else {print "\nWARNING $cluster_num doesn't seem to have entry in protein directory";}
      my $result = $conn->do("INSERT INTO est (est_id,clus_id,contig,type,library,a_start,a_end,q_start,q_end) values ('$name','$cluster_num','$identifier','1','1','1','$end','1','$end');");
      $result = $conn->do("INSERT INTO est_seq values ('$name', '$sequence');");
    }
    else  { # now Clusters and pseudocontigs
      if((-f "./phrap/$cluster_num.ace") && (-e "./phrap/$cluster_num.contigs"))    { # Cluster has contigs not pseudo contigs
        open(FH,"./phrap/$cluster_num.ace") || print("Error clus but no ./phrap/$name.ace\n");
        my $contig=1;
        my $end=0;
        my $start=1;
        my $flag=0;
        my $q_start=0;
        my $q_end=0;
        my $a_start=0;
        my $a_end=0;
        my $contigseq='';
        my $estseq='';
        my $direction='';
        my ($tmp,$val,$nstart);

        while($line=<FH>)     {
          if($line=~/Contig(\d+)/) { $contig=$1; $contigseq=''; $flag=1; next; }
          if($line=~/^BQ/) { $flag=0; next; }
          if($line=~/^.+/ && $flag==1) { chomp $line; $contigseq.=$line; next; }
          if($line=~/^AF\s.*$name\s(\w)\s(.+)/) { $estseq=''; $direction=$1; $start=$2; }
          if($line=~/^RD\s.*$name\s(\w+)/) {  $estseq=''; $end=$1; $flag=2; next; }
          if($line=~/^QA\s(\w+)\s(\w+)\s(\w+)\s(\w+)/ && $flag==2)       { 
            $q_start=$1;     ### Begin of high quality part of sequence
            $q_end=$2;       ### End of high quality part of sequence
            $a_start=$3;     ### Begin of alignment to contig
            $a_end=$4;       ### End of alignment to contig
	  
	    last;
          }
          if($line=~/^.+/ && $flag==2) { chomp $line; $estseq.=$line; next; }
        }
        close(FH);

  ## This section finds the start and ends of the sequence relative
  ## to the consensus and also finds the high quality regions
  ## This is all obtained from the phrap .ace file
  ## One problem is the insertions (*) which offset these values
  ## So we must find how many *'s there are and use this number
  ## To calculate the true alignment of the sequences
        if($flag==2)  {
          my $num_stars;  
          if($start > 0)   {
            $contigseq=~/^(.{$start})/;
            my $sub_contigseq=$1;
            $sub_contigseq=~s/a|c|t|g//gi; 
            $num_stars=length($sub_contigseq);
            $start-=$num_stars;
          }
  ## Need to find number of *'s between start and the four variables
  ## But its also relative to the read
          $nstart=$start;
          if($nstart<1) { $nstart=1; }

          $val=$q_start+$start-$nstart;
          my $len=length($contigseq);
          if(($nstart+$val) > $len) { $val=$len-$nstart; }
          if($val > 0)   {
            $contigseq=~/^.{$nstart}(.{$val})/g;
            $tmp=$1;
            $tmp=~s/a|c|t|g//gi;
            $num_stars=length($tmp);
            $q_start-=$num_stars;
            $q_start+=$start-1;
          }
   
          $val=$q_end+$start-$nstart; 
          if(($nstart+$val) > $len) { $val=$len-$nstart; }
          if($val > 0)   {
            $contigseq=~/^.{$nstart}(.{$val})/;
            $tmp=$1;
            $tmp=~s/a|c|t|g//gi;
            $num_stars=length($tmp);
            $q_end-=$num_stars;
            $q_end+=$start-1;
          }
   
          $val=$a_start+$start-$nstart; 
          if(($nstart+$val) > $len) { $val=$len-$nstart; }
          if($val > 0)        {
            $contigseq=~/^.{$nstart}(.{$val})/;
            $tmp=$1;
            $tmp=~s/a|c|t|g//gi;
            $num_stars=length($tmp);
            $a_start-=$num_stars;
            $a_start+=$start-1;
          }

          $val=$a_end+$start-$nstart-1; 
          if(($nstart+$val) > $len) { $val=$len-$nstart; }
          if($val > 0)  {
            $contigseq=~/^.{$nstart}(.{$val})/;
            $tmp=$1;
            $tmp=~s/a|c|t|g//gi;
            $num_stars=length($tmp);
            $a_end-=$num_stars;
            $a_end+=$start-1;
          }
        }

#  my $forward=1;
#  if($direction eq 'C')
#   { $forward=0; }

        my $result = $conn->do("INSERT INTO est (est_id,clus_id,contig,type,library,a_start,a_end,q_start,q_end) values ('$name', '$cluster_num','$contig','1','1','$a_start','$a_end','$q_start','$q_end');");
        $result = $conn->do("INSERT INTO est_seq values ('$name', '$sequence');");
      } 
      else  {#### Sequence hasn't been used in assembly, it is a pseudcontig 
        $end=length($sequence);
        my  $identifier = 1; # contig = 1, unless other evidence 
        my @identifiers = glob ("protein/$cluster_num*");
        if (@identifiers == 2) { 
	  if ($identifiers[0] =~ /\w\w\w\d\d\d\d\d_(\d+)/) {$identifier = $1;}
        }
        else {print "\nWARNING $cluster_num doesn't seem to have entry in protein directory";}
        my $result = $conn->do("INSERT INTO est (est_id,clus_id,contig,type,library,a_start,a_end,q_start,q_end) values ('$name','$cluster_num','$identifier','1','1','1','$end','1','$end');");
        $result = $conn->do("INSERT INTO est_seq values ('$name', '$sequence');");
      }  
    }  
  }
}
################################################################################################################


################################################################################################################
sub insert_blast_info()   {
 ##### Best approach - treat everything as non-html formatted
 ##### Read in every file in every directory in blasts
 ##### Use BioPerl to extract top hit, program, database etc etc
 ##### That way users can drop in blast output from several sources
 ##### and use concatenated blasts
 
  my ($blast_dir,$cluster,$contig,$count,$prog,$datab,$hit,$result,$name,$frame);
  my @blast_list;

  @dir=`ls -d blast/*`;
  $i=0;

  print "Please choose the BLAST results to include in the database :\n\n";
  foreach $blast_dir(@dir)  {
    chomp $blast_dir;
    $blast_dir=~s/blast\///;
    if($blast_dir!~"passed" && -d "blast/$blast_dir")  { 
      print "Include $blast_dir ? ";
      my $delete_blast=&yes_no();
      if($delete_blast==1)  {   $blast_list[$i]=$blast_dir; $i++;  }
    }
  }

  foreach $blast_dir(@blast_list)  {
    if($blast_dir ne "passed" && -d "blast/$blast_dir")  { #### Add an option asking user if they want these blasts added    
      opendir(DIR, "./blast/$blast_dir");
      while (defined($file= readdir(DIR)))    {
        my $in = new Bio::SearchIO( -format => 'blast', -file   => "./blast/$blast_dir/$file");
        my $date = ctime(stat("./blast/$blast_dir/$file")->mtime);
        $date=~s/\d\d\:\d\d\:.+//; 
        my $flag_new=0;
        while( $result = $in->next_result )     {
          $cluster=$result->query_name;
          $prog=$result->algorithm;
          $prog=lc($prog);
          $datab=$blast_dir;
          $contig=1;
          $count=0;
          if($cluster=~/\..+(\d+)/) { $contig=$1; $cluster=~s/\..+\d+//; }
          while( $hit = $result->next_hit )       {
            my $score=$hit->raw_score;
            my $desc=$hit->description;
            $name=$hit->name;
            my $sig=$hit->significance;
            my $hsp = $hit->next_hsp;
            my $start=$hsp->query->start;  
            my $end=$hsp->query->end;
            $frame=$hsp->query->frame;

            $name=~s/.+\|(\w+)$/$1/;  #####  To remove tedious ncbi gi/gb id's
            $name=~s/.+\|(.+)\|$/$1/;  
            $name=~s/\..+//;

            if($sig=~/e-(\d+)/ && $1 > 300) { $sig='0.0'; }
            if(($sig=~/e-(\d+)/ && $1 > 4) || $sig=~/^0\.0$/)   {
              $desc=~s/'/`/g;
              my $pg_result = $conn->do("INSERT INTO blast 
                  values ('$cluster','$prog','$datab','$date','$sig'
                  ,'$name','$desc','$frame','$start','$end','$contig')");
              $flag_new=2;
              $count++;
            }
            if($count>9) { last; }  # only take top 10 hits (alter for 2nd/3rd etc.)
          }
          if($flag_new!= 2) { #### There was no significant hit for this cluster 
            my $pg_result = $conn->do("INSERT INTO blast 
              values ('$cluster','$prog','$datab','$date','1'
               ,'','No Significant Hit','0','0','0','$contig')"); 
          }
        }
      }
    }
  }
}   
#######################################################################################################


################################################################################################################
sub insert_p4e_info() { #### file check (existence, maybe sanity later)
  my $dir_flag = 0; my $dir;
  while ($dir_flag == 0) {
    $dir = &get_input("\nPlease enter the prot4EST output directory (full path)\n");
    my $byebye = 0;
    unless (-e  "$dir/translations_xtn.fsa") {
    print "Couldn't find file: $dir/translations_xtn.fsa\n";
    $byebye = 1; 
    }  
    unless (-e  "$dir/prot_main.psql")  {
      print "Couldn't find file: $dir/prot_main.psql\n";
      $byebye = 1; 
    }  
    unless (-e  "$dir/prot_hsps.psql"){
      print "Couldn't find file: $dir/prot_hsps.psql\n";
      $byebye = 1; 
    }  
    if ($byebye == 1) {
      print "Couldn't find all relevant files in $dir.\n";
      print "Do you want to try again?\n";
      my $exit = &yes_no();
      unless ($exit == 1) {
        print "Exiting now - Good bye.\n"; 
        exit;
      }      
    }  
    else {$dir_flag =1}
  }

##### get maximum pept_ref $m is pept_ref  
  my $n=0; my $m; my $updateflag =1;
  $conn=DBI->connect("dbi:Pg:dbname=$database", "", "", {PrintError => 0}); #Last two values would be user/pass.
  if ($updateflag == 1) {
    my $max_m=$conn->prepare("SELECT MAX(pept_ref) FROM p4e_ind") ;
    $max_m->execute();
    my @ary = $max_m->fetchrow_array;  
    my $m_old = $ary[0]; $m=$m_old;
    unless (defined $m) {print "\nWARNING: your database for update seems to be empty\n"; $m =0;} 
  }

#### index and location table first  
  print "\nNow processing prot_main.psql\n";
  my $datafile = "$dir/prot_main.psql"; 
  my $date = `date +%D`;
  my $clus_id; my $contig;
  open (INDEX, "<$datafile") || die "can't open $datafile";
  while (my $line=<INDEX>) {
    $m++;
    my @entry = split /,/,$line;
    if ($entry[1] =~ /(\w{8})_(\d+)/) {   
      $clus_id=$1; $contig=$2;
    }  
    else {
      $clus_id=$entry[1]; $contig=1;
    } 
    if ($entry[4] eq "") {$entry[4] = "null"} 
    if ($entry[5] eq "") {$entry[5] = "null"}
    if ($entry[6] eq "") {$entry[6] = "null"}
    if ($entry[7] eq "") {$entry[7] = "null"}
    if ($entry[8] eq "") {$entry[8] = "null"}
    if ($entry[9] eq "") {$entry[9] = "null"}    

#### first retire old peptides then insert new stuff
    my $result = $conn->do("UPDATE p4e_ind SET active = '0' WHERE pept_id ~ '$entry[0]';", {PrintError => 0}); 

    $result = $conn->do("INSERT INTO p4e_ind (pept_ref,pept_id, clus_id,contig, date, method, gen_code, active) 
    values ('$m','$entry[0]', '$clus_id', '$contig', '$date', '$entry[2]', '$entry[3]','1');");
           
    $result = $conn->do("INSERT INTO p4e_loc (pept_ref, xtn_s, conf_s, frame_s, conf_e, xtn_e, frame_e) 
    values ('$m',$entry[4], $entry[5], $entry[6],$entry[7], $entry[8], $entry[9]);");
    $n++; 
    printf("\r%9d entries inserted into p4e_index and p4e_loc",$n);
  }   
  close (INDEX);

#### hsp table
  print "\nNow processing prot_hsps.psql\n";
  $datafile = "$dir/prot_hsps.psql";
  open (HSP, "<$datafile") || die "can't open $datafile";
  $n=0;
  while (my $line=<HSP>) {
    my @entry = split /,/,$line;
    my $getref=$conn->prepare("SELECT pept_ref FROM p4e_ind where pept_id = '$entry[1]' and active = '1'") ;
    $getref->execute();
    my @ary = $getref->fetchrow_array;  
    my $pept_ref = $ary[0];
    unless (defined $pept_ref) {
      print "\nProblem: $entry[1] not found in db\n"; 
      print "This entry will be ignored\n";
    }
    my $number;
    if (defined $pept_ref && ($pept_ref > 0)) {
      my $gethspnr =  $conn->prepare("SELECT hsp_num FROM p4e_hsp where pept_ref = '$pept_ref'") ; 
      @ary = $gethspnr->fetchrow_array;
      $number = (scalar @ary + 1); 
    } 
    else {$number=1}
    chomp ($entry[5]);
    if (defined $pept_ref) {
      my $result = $conn->do("INSERT INTO p4e_hsp (pept_ref, hsp_num, p_start, p_end, frame, evalue) 
      values ('$pept_ref','$number', '$entry[2]', '$entry[3]', '$entry[4]', '$entry[5]');");
      $n++;
    }
    printf("\r%9d entries inserted into p4e_hsp",$n);
  }
  close (HSP);

####  insert sequences
  print "\nNow processing translations_xtn.fsa\n";
  $datafile = "$dir/translations_xtn.fsa";
  $n=0;
  my ($id,$sequence);
  my $inseq  = Bio::SeqIO->new('-file' => "$datafile",
                         '-format' => 'Fasta');
  while (my $seq = $inseq->next_seq)  {
    $id=$seq->display_id();
    $sequence=$seq->seq();
    my $result = $conn->do("UPDATE p4e_ind SET seq = '$sequence' WHERE pept_id = '$id' and active = '1';", {PrintError => 0}); 
    $n++;
    printf("\r%9d sequences inserted into p4e_ind",$n);
  }
  $conn->disconnect or warn "Disconnection failed: $DBI::errstr\n"; 
}

###############
##### FIN #####
###############
