#!/usr/bin/perl -w

# annot8r_blast2GO.pl version 0.3
# annotates GO terms to peptide sequences based on the top hit from a blast search vs
# uniprot (or, if desired, any other available source)
# 
# Requirements:
# uniprot GO annotation flat file (or other) "gene_association.goa_sptr.gz"
# available from ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/SPTR
# and the map file for go_slim annotations "goaslim.map"
# available from ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/goslim/ 
#
#
# Perl Modules - DBD::Pg and BioPerl
# DBD::Pg (the PostgreSQL driver for DBI) is available from cpan - http://www.cpan.org/
# Bioperl is available from its site - http://www.bioperl.org/
#
# Last updated 16/12/2004 by Ralf Schmid
# Copyright (C) 2004 Ralf Schmid

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.


use strict;
use File::stat;
use Term::ANSIColor;
use Bio::Tools::Run::StandAloneBlast;
use Bio::SearchIO;
use Bio::SeqIO;
use DBI;
  use DBD::Pg;  
use Time::localtime;

my $version_number = '0.3 ';
my @PATH = split(/:/, $ENV{'PATH'});


#############################################################################
############################ start main menu ################################
#############################################################################

#### File-level settings shared by the menu subroutines below.
#### TODO: think about a configuration file for these generally used variables.

# GO-database created from the flatfile, and the PartiGene (or other)
# database to be decorated.
my ($database, $pg_database);

# Input files for the GO-database: flatfile containing GO terms associated
# entries, GO_slim file, GO_definitions file.
my ($flatfile, $map_file, $def_file);

# Blast database files to be processed to $GO_blastfile.
my ($sprotfile, $tremblfile);

# Modified blastfile; sequences without GO-terms are removed.
my $GO_blastfile = 'GO_blast.fsa';

# Sequences to blast and to decorate with GO-terms, the blast output to be
# parsed, and the e-value cut-off for blast.
my ($seqs2blast, $blast_output, $e_value);

# Hand control to the main menu.
&options();


##################################################################################
#################  1-Create GO database from flatfiles  ###########################
##################################################################################

sub do_database()  {
#### Menu option 1: import data from flatfiles into the postgreSQL GO database.
####
#### Interactive; takes no arguments. Steps:
####   1. obtain the GO association file, the GO-slim map and the GO term
####      definitions (download with wget, or ask for local copies),
####   2. ask for the database name and let create_db() create the database
####      and/or any missing tables,
####   3. (re)load the tables go, go_slim and go_def from the three files.
#### Sets the file-level variables $flatfile, $map_file, $def_file, $database.
#### Dies when a file cannot be opened or the database cannot be queried;
#### exits when the user declines to create or use the database.
  system("clear");
  print colored("\n\t##### CREATE GO-DATABASE #####\n","white bold" , "on_black");
  print "\nThis facility downloads automatically the relevant files\n";
  print "and creates or updates a postgreSQL database holding\n";
  print "sequence identifiers from UNIPROT and their associated GO-terms\n";
  print "\nThis database is required for parsing blast results and extracting\n"; 
  print "the associated GO-terms in the later steps.\n\n";

#### postgres sanity check
  &postmaster_check();
  
#### get annotation file  
  print "\n\nCreating the GO-database requires three data files:\n";    
  print "gene_association.goa_uniprot - links uniprot ids to GO-terms\n";
  print "goaslim.map - links GO-terms to GO-slim terms\n";
  print "- These two files are available from: ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/\n";
  print "GO.terms_and_ids - holds information on GO-terms\n";
  print "- This file can be downloaded from: http://www.geneontology.org/doc/\n";
  print "\nWe recommend to download them automatically to make sure";  
  print "\nyou are using the most recent versions.";  
  print "\nPlease type \"y\" to do so.";
  print "\nType \"n\" if you have downloaded them already -";
  print "\nor if want to use different files (not recommended)";
  my $download = &yes_no();

#### automated download here 
  if ($download == 1) {
    $flatfile="gene_association.goa_uniprot.gz";
    my $getdata="wget --passive-ftp --quiet --output-document=gene_association.goa_uniprot.gz ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/gene_association.goa_uniprot.gz";
    print "Please wait, downloading $flatfile now ...\n";
    system($getdata);   
    print "done\n";
    print "Please wait, unpacking $flatfile now ...\n";
    system ("gunzip $flatfile");    
    $flatfile="gene_association.goa_uniprot";
    &filecheck($flatfile);
    print "done\n";
    
    $map_file = "goaslim.map";
    print "Please wait, downloading $map_file now ...\n";
    $getdata="wget --passive-ftp --quiet --output-document=goaslim.map ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/goslim/goaslim.map";
    system($getdata);   
    &filecheck($map_file);
    print "done\n";
  
    $def_file = "GO.terms_and_ids";
    print "Please wait, downloading $def_file now ...\n";
    $getdata="wget --passive-ftp --quiet --output-document=GO.terms_and_ids http://www.geneontology.org/doc/GO.terms_and_ids";
    system($getdata);   
    &filecheck($def_file);
    print "done\n";  
  }
  
#### or look for local files  
  else {
    if (-e "gene_association.goa_uniprot") {
      $flatfile = "gene_association.goa_uniprot";
    } 
    else {
      print "\nPlease enter the filename of the flatfile containing the GO-associations.";
      print "\nPlease give either the full path or the relative location of the file.\n";  
      print "\nFor example ";
      print colored ("gene_association.goa_uniprot","green bold");
      print "\nThis file can be downloaded from:";
      print "\nftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/";
      $flatfile = &get_file();
      print "... $flatfile found\n";
    }
  
    if (-e "goaslim.map")  {
      $map_file = "goaslim.map";
    }
    else { 
      print "\n\nNow you have to define the file linking GO-terms to GO-slim terms";    
      print "\nFor example ";
      print colored ("goaslim.map","green bold");
      print "\nThis file can be downloaded from:";
      print "\nftp://ftp.ebi.ac.uk/pub/databases/GO/goa/goslim/";
      print "\nPlease enter the filename of the mapfile.";
      print "\nPlease give either the full path or the relative location of the file.\n";  
      $map_file = &get_file();
    }
    print "... $map_file found\n";

    if (-e "GO.terms_and_ids")  {
      $def_file = "GO.terms_and_ids";
    }
    else { 
      print "\n\nNow you have to define the file holding the definitons of the GO-terms";    
      print "\nFor example ";
      print colored ("GO.terms_and_ids","green bold");
      print "\nThis file can be downloaded from:";
      print "\nhttp://www.geneontology.org/doc/GO.terms_and_ids";
      print "\nPlease enter the filename of the definition file.";
      print "\nPlease give either the full path or the relative location of the file.\n";  
      $def_file = &get_file();
    }
    print "... $def_file found\n";
  }


#### get GO-database    
  my $get_flag = 0;
  if ($database) {
    print "\nYou have already defined $database as your GO-database. Do you want to use it?";  
    my $answer = &yes_no();
    unless ($answer == 1)   {$get_flag=1};
  }  
  else {$get_flag=1};
  
  if ($get_flag==1) {
    print "\nPlease give the name of the database holding the content of the GO-annotation file\n";
    $database = <STDIN>;    
    chomp $database;
  }
  
#### is $database available?  
  print "\nLooking up $database  Please wait ...\n";
  my $conn=DBI->connect("dbi:Pg:dbname=$database", "", "", {PrintError => 0}); #Last two values would be user/pass.
  if (! $conn)   { # Couldn't connect to the database  
    print "\nCouldn't connect to the database $database, would you like to create it?";
    my $answer = &yes_no();
    # There is no handle in this branch, so there is nothing to disconnect
    # before leaving.
    exit() unless ($answer == 1);
    &create_db($database,1,1,1,1);
  }
  
#### Check tables are present - create them if they are not, allow user to define table name in next vs (for multiple tables)  
  else     { 
    print "\nDatabase $database does already exist, do you want to use it? ";
    my $answer = &yes_no();
    unless ($answer == 1) {
      $conn->disconnect or warn "Disconnection failed: $DBI::errstr\n";
      exit();
    }
    my ($GO_table_flag, $GOslim_flag, $GOdef_flag) = (0, 0, 0);
    foreach my $table ($conn->tables('','',undef,'TABLE')) {
      $table =~ s/public\.//; #get rid of "public." which is present in some versions of DBD.Pg 
      $GO_table_flag = 1 if ($table eq "go");
      $GOslim_flag   = 1 if ($table eq "go_slim");
      $GOdef_flag    = 1 if ($table eq "go_def");
    }
    &create_db($database,0,$GO_table_flag,$GOslim_flag,$GOdef_flag);  ### arguments: <db, does db exist, does table exist>     
    $conn->disconnect or warn "Disconnection failed: $DBI::errstr\n";
  }

#### One fresh connection serves all three imports below.
  $conn = DBI->connect("dbi:Pg:dbname=$database", "", "")
    or die "Can't connect to $database: $DBI::errstr\n";

     
#### Ready to import data, delete old entries if any there, mere upgrading doesn't make sense
  open(my $go_fh, '<', $flatfile) or die "Can't open $flatfile\n";  
  # "limit 1" is enough to learn whether the table already holds rows.
  my $existing = $conn->selectall_arrayref("select dbo_id from go limit 1;")
    or die "Can't query table go in $database: $DBI::errstr\n";
  if (@$existing == 0)  { 
    print "Inserting GO entries\n";   
  }
  else  {
    print "\nGO entries already exist for $database - Update the db? ";
    print "\nNote: this will delete old entries (recommended)";
    my $answer = &yes_no();
    if ($answer == 1)   {
      print "Deleting old GO entries. Please wait ...\n";
      $conn->do("DELETE from go;");
      print "Old entries deleted.\n\n";
      print "Now updating GO entries.\n";
    }
    else {print "Now adding GO entries.\n";}
  }    
  print "Note: depending on the size of $flatfile this step may take some time\n"; 
  print "(up to some hours) Please wait ...\n"; 
 
#### Processing file line by line  
##  columns are go standard, even if we only need a few of them putting all in db makes it more flexible and transferable
##  and potentially useful for other stuff - column names more or less reminescent of respetive go stuff  
## 0->db_text 1->dbo_id 2->dbo_syn 3->_not 4->go_id 5->dbref 6->evid 7->w_f 8->asp 9->dbo_name 10->dbo_syn 11->dbo_typ 12->taxon 12-> 13->date 14->as_by
  # Placeholders let the driver do the quoting, so a quote character in any
  # column can no longer break the statement.
  my $go_insert = $conn->prepare("INSERT INTO go values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)")
    or die "Can't prepare insert into go: $DBI::errstr\n";
  $go_insert->{PrintError} = 0;  # failed rows are counted and reported once
  my $dummy_1 = "zero"; my $dummy_4 = "GO:zero";
  my $n = 0;       # rows actually inserted
  my $failed = 0;  # rows the database rejected
  while (my $line = <$go_fh>) {
    next if ($line =~ /^!/);        # header/comment line of the association file
    next unless ($line =~ /\S/);    # blank line
    chomp $line;
    my @entry = split (/\t/, $line); 
    $#entry = 14;                   # exactly the 15 columns listed above
    @entry = map { defined $_ ? $_ : '' } @entry;
    # Kept although placeholders make it unnecessary: the stored text stays
    # identical to what earlier imports wrote.
    $entry[9] =~ s/'/\{prime\}/g;        
    
#### Now filter for redundancy: Since we annotate using blast we have to trust the annotations that are already in the flat file 
#### and don't care whether they are coming from interpro or sptr or whereever. We are taking advantage of the order of the flat
#### file. ie we only have to compare with the previous entry to identify redundancy. Note this does not work for upgrades, but 
#### it is strongly recommended to delete the old stuff anyway.
    next if (($dummy_1 eq $entry[1]) && ($dummy_4 eq $entry[4]));
    $dummy_1 = $entry[1];
    $dummy_4 = $entry[4];

    if ($go_insert->execute(@entry)) {
      $n++;
      printf("\r%9d entries inserted",$n);
    }
    else { $failed++; }
  }
  close($go_fh);
  print "\n$n GO-entries added to $database.\n\n";
  print "WARNING: $failed lines of $flatfile could not be inserted.\n\n" if ($failed);
  
  
 #### now same story for GO_slim
  open(my $slim_fh, '<', $map_file) or die "Can't open $map_file\n";  
  $existing = $conn->selectall_arrayref("select go_id from go_slim limit 1;")
    or die "Can't query table go_slim in $database: $DBI::errstr\n";
  if (@$existing == 0)  { 
    print "Inserting GO_slim entries ...\n";   
  }
  else  {
    print "\nGO_slim entries already exist for $database - Update the db? ";
    print "\nNote: this will delete old entries (recommended)";
    my $answer = &yes_no();
    if ($answer == 1)   {
      print "Deleting old GO_slim entries. Please wait ...\n";
      $conn->do("DELETE from go_slim;");
      print "Old entries deleted.\n\n";
      print "Now updating GO_slim entries ...\n";
    }
    else {print "Now adding GO_slim entries ...\n";}
  }    
 
#### Processing file line by line  
## 0->go_id 1->go_slim_id 
  my $slim_insert = $conn->prepare("INSERT INTO go_slim values (?,?)")
    or die "Can't prepare insert into go_slim: $DBI::errstr\n";
  $slim_insert->{PrintError} = 0;
  $n = 0; $failed = 0;
  while (my $line = <$slim_fh>) {
    next if ($line =~ /^!/);
    chomp $line;
    my @entry = split (/\t/, $line); 
    $#entry = 1;
    @entry = map { defined $_ ? $_ : '' } @entry;
    if ($slim_insert->execute(@entry)) {
      $n++; 
      printf("\r%9d entries inserted",$n);
    }
    else { $failed++; }
  }
  close($slim_fh);
  print "\n$n GO_slim entries added to $database.\n\n";
  print "WARNING: $failed lines of $map_file could not be inserted.\n\n" if ($failed);
 
 #### and for GO_def
  open(my $def_fh, '<', $def_file) or die "Can't open $def_file\n";  
  $existing = $conn->selectall_arrayref("select go_id from go_def limit 1;")
    or die "Can't query table go_def in $database: $DBI::errstr\n";
  if (@$existing == 0)  { 
    print "Inserting GO_def entries ...\n";   
  }
  else  {
    print "\nGO_def entries already exist for $database - Update the db? ";
    print "\nNote: this will delete old entries (recommended)";
    my $answer = &yes_no();
    if ($answer == 1)   {
      print "Deleting old GO_def entries. Please wait ...\n";
      $conn->do("DELETE from go_def;");
      print "Old entries deleted.\n\n";
      print "Now updating GO_def entries ...\n";
    }
    else {print "Now adding GO_def entries ...\n";}
  }    
 
#### Processing file line by line  
## 0->go_id 1->description 2->pcf  
  my $def_insert = $conn->prepare("INSERT INTO go_def values (?,?,?)")
    or die "Can't prepare insert into go_def: $DBI::errstr\n";
  $def_insert->{PrintError} = 0;
  $n = 0; $failed = 0;
  while (my $line = <$def_fh>) {
    next if ($line =~ /^!/);
    chomp $line;
    my @entry = split (/\t/, $line); 
    $#entry = 2;
    @entry = map { defined $_ ? $_ : '' } @entry;
    $entry[1] =~ s/'/\{prime\}/g;   # same stored form as earlier imports
    if ($def_insert->execute(@entry)) {
      $n++; 
      printf("\r%9d entries inserted",$n);
    }
    else { $failed++; }
  }
  close($def_fh);
  print "\n$n GO_def entries added to $database.\n\n";
  print "WARNING: $failed lines of $def_file could not be inserted.\n\n" if ($failed);
  
  
  $conn->disconnect or warn "Disconnection failed: $DBI::errstr\n"; 
  
  query_for_exit();    
}


##################################################################################
#################  2-BLAST preparations ##########################################
##################################################################################

sub prepare_blast () {
#### Menu option 2: tidy up and re-format the blast database.
####
#### Interactive; takes no arguments. Obtains the Swiss-Prot and TrEMBL fasta
#### files (download with wget, or local copies), reads the distinct sequence
#### identifiers from the go table of $database, copies every fasta record
#### whose header identifier is in that set into $GO_blastfile and runs
#### formatdb on the result.
#### Sets the file-level variables $sprotfile, $tremblfile and $database.
#### Returns early, leaving any existing $GO_blastfile untouched, when the
#### identifiers cannot be read from the database. Exits when an input file is
#### missing; dies when a file cannot be opened or written.
  system("clear");
  print colored("\n\t##### GO-BLAST PREPARATIONS  #####\n","white bold" , "on_black");
  print "\nBefore running Blast we have to make sure that the Blast database contains\n";
  print "solely entries that have associated GO-terms. This facility downloads the latest\n";
  print "version of uniprot, compares it with your GO-terms postgresql database and removes\n";
  print "all sequences that do not have any associated GO-terms. A blastable data-\n";
  print "base will be created in your working directory to be used in the Blast search.\n";
  print "Either keep the files there or move them to the directory defined in your BLASTDB\n";
  print "environmental variable\n\n";  

  &postmaster_check();
#### get fasta files for blast
#### get annotation file  
  print "\n\nPreparing the BLAST database requires the presence two files:";    
  print "\nWe recommend to download them now (automatically) to make sure";  
  print "\nyou are using the most recent versions.";  
  print "\nPlease type \"y\" to do so.";
  print "\nType \"n\" if you don't want to use the automated download feature or";
  print "\nwant to use different files (not recommended)";
  my $download = &yes_no();

#### automated download here 
  if ($download == 1) {
    $sprotfile="uniprot_sprot.fasta.gz";
    my $getdata="wget --passive-ftp --quiet --output-document=uniprot_sprot.fasta.gz ftp://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/uniprot_sprot.fasta.gz";
    print "Please wait, downloading $sprotfile now ...\n";
    system($getdata);   
    print "done\n";
    print "Please wait, unpacking $sprotfile now ...\n";
    system ("gunzip $sprotfile");    
    $sprotfile="uniprot_sprot.fasta";
    &filecheck($sprotfile);
    print "done\n";
    
    $tremblfile="uniprot_trembl.fasta.gz";
    $getdata="wget --passive-ftp --quiet --output-document=uniprot_trembl.fasta.gz ftp://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/uniprot_trembl.fasta.gz";
    print "Please wait, downloading $tremblfile now ...\n";
    system($getdata);   
    print "done\n";
    print "Please wait, unpacking $tremblfile now ...\n";
    system ("gunzip $tremblfile");    
    $tremblfile="uniprot_trembl.fasta";
    &filecheck($tremblfile);
    print "done\n";
  }
  
#### or look for local files  
  else {
    if (-e "uniprot_sprot.fasta") {
      $sprotfile = "uniprot_sprot.fasta";
    } 
    else {
      # The prompt used to ask for the GO-association flatfile here, which is
      # not the file this step needs.
      print "\nPlease enter the filename of the fasta file containing the Swiss-Prot sequences.";
      print "\nPlease give either the full path or the relative location of the file.\n";  
      print "\nFor example ";
      print colored ("uniprot_sprot.fasta","green bold");
      print "\nThis file can be downloaded from:";
      print "\nftp://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/";
      $sprotfile = &get_file();      
    }
    print "... $sprotfile found\n";
    
    if (-e "uniprot_trembl.fasta") {
      $tremblfile = "uniprot_trembl.fasta";
    } 
    else {
      print "\nPlease enter the filename of the fasta file containing the TrEMBL sequences.";
      print "\nPlease give either the full path or the relative location of the file.\n";  
      print "\nFor example ";
      print colored ("uniprot_trembl.fasta","green bold");
      print "\nThis file can be downloaded from:";
      print "\nftp://ftp.ebi.ac.uk/pub/databases/uniprot/knowledgebase/";
      $tremblfile = &get_file();      
    }
    print "... $tremblfile found\n";
  }


#### get GO-database    
  my $get_flag = 0;
  if ($database) {
    print "\nYou have already defined $database as your GO-database. Do you want to use it?";  
    my $answer = &yes_no();
    unless ($answer == 1)   {$get_flag=1};
  }  
  else {$get_flag=1};
  
  if ($get_flag==1) {
    print "\nPlease give the name of the database holding the content of the GO-annotation file\n";
    $database = <STDIN>;    
    chomp $database;
  }
  
  my $conn=DBI->connect("dbi:Pg:dbname=$database", "", "", {PrintError => 0}); #Last two values would be user/pass.
  if (! $conn)   { #### Couldn't connect to the database  
    print "\nCouldn't connect to the database $database\n";
    &query_for_exit();
    # Without identifiers every sequence would be filtered out; stop here
    # instead of overwriting $GO_blastfile with an empty file.
    return;
  }

  print "... loading distinct sequences from $database\n";
  print "Please wait, may take some time\n";
  #### could add option for more tables here
  my $result = $conn->prepare("select distinct dbo_sym from go;");
  unless ($result && $result->execute()) {
    my $reason = defined $DBI::errstr ? $DBI::errstr : "unknown error";
    print colored ("ERROR: could not read identifiers from $database: $reason\n", "red bold");
    $conn->disconnect or warn "Disconnection failed: $DBI::errstr\n";
    &query_for_exit();
    return;
  }

####  identifiers go straight into a hash for half way fast look up
  my %entries;
  my $number = 0;
  while (my $array_ref = $result->fetchrow_arrayref) {
    foreach my $element (@$array_ref) {
      $number++;
      $entries{$element} = '' if (defined $element);
    }
    printf("\rSo far %9d sequences extracted ...",$number);
  }
  $conn->disconnect or warn "Disconnection failed: $DBI::errstr\n"; 
  print "\n$number distinct sequences extracted\n";  

  
#### now check for blast db files 
  unless (-e $sprotfile) {
    print colored ("ERROR: $sprotfile not found", "red bold");
    exit;
  }
  unless (-e $tremblfile) {
    print colored ("ERROR: $tremblfile not found", "red bold");
    exit;
  }
  
  # Open both inputs before the output is truncated, so an unreadable input
  # cannot cost us an existing $GO_blastfile.
  my @fasta_handles;
  foreach my $fasta ($sprotfile, $tremblfile) {
    open(my $in_fh, '<', $fasta) or die "Can't open $fasta\n";
    push @fasta_handles, $in_fh;
  }
  open(my $out_fh, '>', $GO_blastfile) or die "Can't open $GO_blastfile\n"; 
  print "\nWriting $GO_blastfile  ... Please wait\n";
  
#### going through .fsa, writing all entries that are in db to $GO_blastfile   
  foreach my $in_fh (@fasta_handles) {
    my $write_flag = 0;  # true while inside a record whose identifier is known
    while (my $line = <$in_fh>) {
      if ($line =~ /^>(\w+)/)  {
        $write_flag = exists($entries{$1}) ? 1 : 0;
      }        
      print {$out_fh} $line if ($write_flag == 1);
    }
    close($in_fh);
  }
  # Buffered write errors (e.g. disk full) only surface at close.
  close($out_fh) or die "Can't write $GO_blastfile: $!\n";
  
#### Format new blastdb
  print "\nFormatting $GO_blastfile  ...\n";
  if (system("formatdb", "-i", $GO_blastfile) != 0) {
    print colored ("ERROR: formatdb failed for $GO_blastfile, please check your blast installation\n", "red bold");
    &query_for_exit();
    return;
  }
  print "\nFormatting $GO_blastfile done, now ready for running blast\n";

  query_for_exit();
}


##################################################################################
#################  3-BLAST run   #################################################
##################################################################################

sub do_blast () {
#### Menu option 3: do the actual blast search.
####
#### Interactive; takes no arguments. Asks for the fasta file to search with,
#### locates the GO-filtered blast database, asks for the e-value cut-off,
#### makes sure BLASTDB points at a usable directory and hands everything to
#### run_blast().
#### Sets the file-level variables $seqs2blast, $blast_output and $e_value.
#### Dies when standard input ends before an e-value was entered.
  system("clear");
  print colored("\n\t##### RUN GO-BLAST #####\n","white bold" , "on_black");
  print "\nThis facility takes the output of prot4est and blasts it vs a swtrembl\n";
  print "version that solely contains entries that have associated GO-terms to\n";
  print "finally extract GO-terms from the BLAST hits. If you don't use prot4est\n";
  print "you can provide any fasta file containing peptide sequences.\n";
  print "You can also run BLAST externally and put the output file in your\n";
  print "working directory for further processing\n";
 
  # The returned value is not used below; the call is kept as it was.
  my $blast_exec=find_program("blastall");

#### get sequences, add option for getting sequences from db later
  print "\nPlease enter the name of the file containing the fasta sequences you want to decorate.";
  print "\nFor example ";
  print colored ("yeti.fsa","green bold");  
  print "\nPlease give either the full or relative location of the file.\n";  
  $seqs2blast = &get_file();

#### now looking for blastdb  
  my $dbase_path;

#### if still defined from option 2
  if ($GO_blastfile && -e $GO_blastfile) {
    $dbase_path = $GO_blastfile;
  }  
      
#### look whether sth useful is around   
  else {
    # NOTE(review): the default $GO_blastfile name "GO_blast.fsa" does not
    # match this pattern - confirm which naming scheme is intended.
    my @dummy = glob("*_GO.fsa");
    if (scalar @dummy == 1) { 
      $dbase_path = $dummy[0]; ### this should be the one and only  
    }
    else { ### nothing found re-enter then
      print "\nPlease give the full path for the blastdb created in sub-menu 2\n";    
      $dbase_path = &get_file();
    }
  }    
  
  my $blast_method = "blastp"; 
  $blast_output = "$seqs2blast" . ".out";
  
#### get e-value
  print "\nPlease enter the e-value you want to use as a cut-off for GO-annotation.\n";
  print "If you intend to use only the most general levels of GO-annotation you could\n";
  print "use a rather high value eg 0.001\n";
  print "If you intend to use all levels of GO-annotation we recommend a far more\n";
  print "conservative cut-off to avoid mis-annotation eg 1e-25\n";
  print "\nIf you are not sure what to do, please consult the user guide\n";
  print "Otherwise enter e-value now.\n";
  while (1) {
    $e_value = <STDIN>;
    # End of input would otherwise keep this loop spinning forever.
    die "\nNo e-value entered (end of input)\n" unless (defined $e_value);
    chomp $e_value;    
    $e_value =~ s/^\s+//;
    $e_value =~ s/\s+$//;
    # Plain decimals (5, 0.001, .5) and scientific notation (1e-25, 2.5E-10).
    # At least one digit is required, so an empty line or a lone "." is
    # rejected instead of being passed on to blast.
    if ($e_value =~ /^(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?$/) {
      print "\n$e_value accepted";
      last;
    }
    print "\n$e_value not accepted, Please try again.\n";
  }

#### get blastenvironment right
  my $here = `pwd`;
  chomp $here;   # backticks keep the trailing newline, which must not end up in BLASTDB
  if ($ENV{'BLASTDB'})  {
    (my $blastdb_dir = $ENV{'BLASTDB'}) =~ s{/+$}{};
    my $file = "$blastdb_dir/$dbase_path";
#### user has put blastdb where it should be 
    unless (-e $file) {
#### user has blastdb in working directory, temporarily re-define BLASTDB for running blast     
      $ENV{'BLASTDB'}=$here;
    }
  }    
  else { ### no blastenv set at all, probably dodgy set up anyway, but try      
    print "\nHaven't found BLASTDB environmental variable. Setting one...\n";
    print "In case your blast fails, please check your blast installation.\n";
    $ENV{'BLASTDB'}=$here;
  }
    
  print "\nBlasting now ...\n";
  print "Please note this step may take some time.\n";
  &run_blast	($seqs2blast, $dbase_path, $blast_method, $e_value, $blast_output);
  print "\nBlasting done. Ready for annotation now\n";
  
  query_for_exit();

}



##################################################################################
#################  4-GO annotation  ##############################################
##################################################################################
sub do_GOannotation()  {
#### parse blast and create GO-annotation table
  system("clear");
  print colored("\n\t##### GO-annotation #####\n","white bold" , "on_black");
  print "\nThis menu takes GO-annotated uniprot entries and BLAST\n";
  print "results for your sequences to decorate your sequences with some GO-terms\n\n";
 

### First some checks...pg running,  swtrgo db, partigene db, blastoutput
   
  &postmaster_check;

  
### add flexibility later, but this will do it for now ...  
  my $update_flag = 2;
  if ($database) {
  
    print "You have already defined a database holding GO-annotation terms: $database\n";
    print "Do you want to use it?";
    my $answer = &yes_no();
    unless ($answer == 1) {
      print "Please enter alternative db name\n";
      $database = <STDIN>;
      chomp $database;
    }
  }  
  else  {
    print "Please enter name of database holding GO-annotation terms\n";
    $database = <STDIN>;
    chomp $database;
  }   
    
#### is GO annotation database available and does it contain go-table ?  
  my $conn=DBI->connect("dbi:Pg:dbname=$database", "", "", {PrintError => 0}); #Last two values would be user/pass.  
  if (! $conn)   { 
    print "\nCouldn't connect to the database $database";
    &query_for_exit();
  }
  else {
    my @table = $conn->tables('','',undef,'TABLE');
    my $GO_table_flag=0;        
    my $GOslim_table_flag=0;
    for(my $n=0; $n < @table; $n++)    {
      $table[$n] =~ s/public\.//; #get rid of "public." which is present in some versions of DBD.Pg 
      if($table[$n] eq "go") { $GO_table_flag=1; } 
      if($table[$n] eq "go_slim") { $GOslim_table_flag=1; }
    }
    if ($GO_table_flag==0) {
      print colored("\n$database does not contain GO table, please check $database\n","red bold"); 
      &query_for_continue();     
    }
    if ($GOslim_table_flag==0) {
      print colored("\n$database does not contain GO_slim table, GO_slim annotation will not be available.","red bold"); 

    }  
    $conn->disconnect or warn "Disconnection failed: $DBI::errstr\n"; 
  }   
  


#### check for partigene.conf file and partigene db
  my $filename = "~/.partigene.conf";
  my $pg_database; my $pg_flag = 0;
  $filename =~ s{ ^ ~ ( [^/]* ) }
              { $1
                    ? (getpwnam($1))[7]
                    : ( $ENV{HOME} || $ENV{LOGDIR}
                         || (getpwuid($>))[7]
                       )
}ex;

  if  (-e "$filename") {
    open (CONFILE,"$filename") ||  die "Can't open configuration file\n";
    while (my $line=<CONFILE>) {      
      if ($line=~/^DATABASE\=(.+)/i) { $pg_database=$1; }       
    }
    close (CONFILE);
#### user has a Partigene db defined in the config file

    print "\nDo you want to use PartiGene database: $pg_database to store your GO annotation?";
    $pg_flag  = &yes_no();
  }
   
#### don't use Pg database  
  if ($pg_flag != 1) {
    print "What database do you want to create or use?\n";
    $pg_database = <STDIN>; 
    chomp $pg_database; 
  }
  
 
  $conn=DBI->connect("dbi:Pg:dbname=$pg_database", "", "", {PrintError => 0}); #Last two values would be user/pass.
  if (! $conn)   { ### Couldn't connect to the database  
    print "\nCouldn't connect to the database $pg_database";
    print "\nDo you want to create it?";
    my $answer = &query_for_continue;

    &create_annodb($pg_database,"1","1");  ### arguments: <db, does db exist, does table exist>  
  }
  else {
    my @table = $conn->tables('','',undef,'TABLE');
    my $GO_table_flag=0;        
    for(my $n=0; $n < @table; $n++)    {
      $table[$n] =~ s/public\.//; #get rid of "public." which is present in some versions of DBD.Pg 
      if($table[$n] eq "blast_go") { $GO_table_flag=1; } 
    }
    if ($GO_table_flag==0) {
      print colored("\n$pg_database does not contain blast_go table, do you want to create this table?","red bold"); 
      my $answer = &yes_no();
      if ($answer == 1)  {
        &create_annodb($pg_database,"0","1");
      }
  
      else {&query_for_exit}
      print "\nblast_go table created.";
    }
    else {
      print "\nblast_go entries are already existing for $pg_database.";
      print "\nDo you want to remove old entries? (recommended)";
      print "\n\n- Otherwise you will add new entries to already existing entries;";
      print "\nprevious entries with identical protein identifier and GO terms";
      print "\nwill be overwritten";
      my $answer=yes_no();
      if($answer==1)   {#delete existing entries
        print "Deleting old entries. Please wait ...\n";
        my $result = $conn->do("DELETE from blast_go;");
        print "Old entries deleted.\n\n";                 
      }
      else {$update_flag = 1;}
            
    }  
    
    
    $conn->disconnect or warn "Disconnection failed: $DBI::errstr\n";
  }   
  
#### get e-value
  unless ($e_value) {
    print "\nPlease enter the e-value you want to use as a cut-off for GO-annotation.\n";
    print "If you intend to use only the most general levels of GO-annotation you could\n";
    print "use a rather high value eg 0.001\n";
    print "If you intend to use all levels of GO-annotation we recommend a far more\n";
    print "conservative cut-off to avoid mis-annotation eg 1e-25\n";
    print "\nIf you are not sure what to do, please consult the user guide\n";
    print "Otherwise enter e-value now.\n";
    my $e_check = 1;
    while ($e_check == 1) {
      $e_value = <STDIN>;
      chomp $e_value;    
      if ($e_value =~ /^1e-\d+$/) {### matches e- type
        print "\n$e_value accepted";
        $e_check = 2;
      }
      elsif ($e_value =~ /^\d*\.?\d*$/) {### matches numerical type
        print "\n$e_value accepted";
        $e_check = 2;
      }  
      else {print "\n$e_value not accepted, Please try again.\n";}
    }  
  }



#### file check and get blast results
  my $blast_result;
  if ($blast_output)  {
    unless (-e $blast_output)  {
      print colored("Couldn't find Blast results file $blast_output\n","red bold");   
      print "\nPlease give either the full or relative location of the blast output file.\n";  
      $blast_result = &get_file;
    }
    $blast_result = $blast_output;
  }
  else {
    print "\nPlease give either the full or relative location of the blast output file.\n";  
    $blast_result = &get_file;
  }       
  print "\nBlast output found. Now parsing results. Please wait ...\n";  
     
  
  my $conn1=DBI->connect("dbi:Pg:dbname=$database", "", "", {PrintError => 0}); #Last two values would be user/pass.  
  my $conn2=DBI->connect("dbi:Pg:dbname=$pg_database", "", "", {PrintError => 0}); #Last two values would be user/pass.
  my $in = new Bio::SearchIO( -format => 'blast',  
                             -file   => "$blast_result");
  
  my $slim=''; 
  my $score='';
  my $sig='';
  my $db_name=''; 
  my $acc=''; my $prot = ''; 
  my $desc='';
  my $name='';
  my $prog='';
  my $success='';  
  my $n = 0;
  my $m = 0;
  if ($update_flag == 1) { ### update
    my $max_m=$conn2->prepare("SELECT MAX(run_nr) FROM blast_go") ;
    $max_m->execute();
    my @ary = $max_m->fetchrow_array;  
    my $m_old = $ary[0]; $m=$m_old;
    unless ($m != 0) {print colored("\nWARNING: your database for update seems to be empty\n","red bold");}
    while( my $result = $in->next_result )  {
      $n++;
      printf("\r%9d sequences processed so far",$n);  
      while (my $hit = $result->next_hit) {
        $db_name=$result->database_name;
        $prot=$result->query_name;           
        $acc=$hit->accession; 
        $sig=$hit->significance;
        #### e-value filter here
	if ($sig > $e_value) {last;}
	
	$success=$conn1->prepare("SELECT go_id, evid, asp from go where dbo_sym~'$acc'");      
        $success->execute();
        my $array_ref = $success->fetchall_arrayref();
        
	
		 
        foreach my $output(@$array_ref)  {        
	  $m++;
          my ($go_id, $evid, $asp) =@$output;	
 	  my $success2=$conn1->prepare("SELECT slim from go_slim where go_id~'$go_id'");
	  $success2->execute();
	  my $go_ref = $success2->fetchrow_arrayref();
	  ($slim)=@$go_ref;# print "slim: $slim\n";
	  #print"$prot a $db_name b $acc c $sig d  $go_id plus $evid\n";	
	  my $check = $conn2->do("DELETE from blast_go WHERE protein_id='$prot' AND go_term='$go_id';");
	  my $insert = $conn2->do("INSERT INTO blast_go values ('$m','$prot','$db_name','$acc','$sig','$go_id','$asp','$evid','$slim');", {PrintError => 0});
        }      
      last;   #### so far it exits while loop after first hit, change here if you want to do sth different 
      }
    }
    $m=$m - $m_old;
    print "\n\nCreated $m new entries in $pg_database\n"; 
  }
    
  else { ### just enter new data
    while( my $result = $in->next_result )  {
      $n++;
      printf("\r%9d sequences processed so far",$n);  
      while (my $hit = $result->next_hit) { 
        $db_name=$result->database_name;
        $prot=$result->query_name;           
       	$acc=$hit->accession; 
        $sig=$hit->significance;
	#### e-value filter here
	if ($sig > $e_value) {last;}
	
        $success=$conn1->prepare("SELECT go_id, evid, asp from go where dbo_sym~'$acc'");      
        $success->execute();
        my $array_ref = $success->fetchall_arrayref();
      
        foreach my $output(@$array_ref)  {        
	  $m++;
          my ($go_id, $evid, $asp) =@$output;	 	  
	  #print "goid: $go_id evid: $evid asp: $asp\n";
	  my $success2=$conn1->prepare("SELECT slim from go_slim where go_id~'$go_id'");
	  $success2->execute();
	  my $go_ref = $success2->fetchrow_arrayref();
	  if ($go_ref) {
	    ($slim)=@$go_ref; 
	  }
	  else {
	    print colored("\nWARNING: Could not find GO-slim term for $go_id\n","red bold");
	    $slim = "null";
	  }
	  #print"$prot a $db_name b $acc c $sig d  $go_id plus $evid\n";	
          my $insert = $conn2->do("INSERT INTO blast_go values ('$m','$prot','$db_name','$acc','$sig','$go_id','$asp','$evid','$slim');", {PrintError => 0});
        
        }      
      last;   #### so far it exits while loop after first hit, change here if you want to do sth different 
      }
    }
    print "\n\nCreated $m new entries in $pg_database\n"; 
  }
 

  print "\nNow writing GO-slim output files.\n\n";  
#### preparing outputfiles for piecharts
 my @pie_flag = ("P", "C", "F");
 foreach (@pie_flag) {
   my $filename = "piedata_" . "$_";
   my $backup = "$filename" . ".old";
   if (-e "$filename") {system "mv $filename $backup";}
 }   
  

#### extracting data for pie charts
  $success = $conn2->prepare("SELECT DISTINCT protein_id, slim FROM blast_go");
  $success->execute();
  my $go_ref = $success->fetchall_arrayref();
  my %go_slim; my $go_slim; my $prot_id;

#### hash: go_slim id => number of entries in db  
  foreach my $output(@$go_ref) {
    ($prot_id, $go_slim) = @$output;
#    print "$prot_id, $go_slim\n";
    if (exists $go_slim{$go_slim}) {
      $go_slim{$go_slim}++;
    }      
    else {$go_slim{$go_slim}=0}
  }

##### create 3 files, loop through keys, write info to files
  open (FHC, ">piedata_C");
  print FHC "### go_slim for $database - cellular component ###\n";
  print FHC "### go_slim_id \t description\t occurences\n";
  open (FHP, ">piedata_P");
  print FHP "### go_slim for $database - biological process ###\n";
  print FHP "### go_slim_id \t description\t occurences\n";
  open (FHF, ">piedata_F");
  print FHF "### go_slim for $database - molecular function ###\n";
  print FHF "### go_slim_id \t description\t occurences\n";
        
  foreach my $key (keys %go_slim) { 
    $success = $conn1 ->prepare("SELECT go_id, descr, pcf FROM go_def WHERE go_id~'$key'"); 
    $success->execute(); 
    my $pie_ref = $success->fetchrow_arrayref();
    if ($pie_ref) {
      my ($go_id, $desc, $pcf)=@$pie_ref;  
      if ($pcf eq "C") {print FHC "$key \t $desc \t $go_slim{$key}\n";}  
      elsif ($pcf eq "P") {print FHP "$key \t $desc \t $go_slim{$key}\n";}
      elsif ($pcf eq "F") {print FHF "$key \t $desc \t $go_slim{$key}\n";}
      else {print colored ("WARNING: no valid GO-aspect (C, P or F) found for $key\n", "red bold")}
    }
    else {
      print colored("\nWARNING: couldn't process GO-slim term $key","red bold");  
    }
  }
  close FHC;
  close FHP;
  close FHF;
  $conn1->disconnect or warn "Disconnection failed: $DBI::errstr\n";
  $conn2->disconnect or warn "Disconnection failed: $DBI::errstr\n";

  &query_for_exit();     
}



###################################################################################
###################################################################################
###                                                                             ###
###                               all the subs                                  ###
###                                                                             ###
###################################################################################
###################################################################################

#########################################################################################################################
sub options () {
#### main menu loop: show the title page, run the selected task, and repeat
#### until option 5 is chosen; then clear the screen and leave the program
   my %task_for = (
      1 => \&do_database,      # Create GOdatabase from flatfile
      2 => \&prepare_blast,    # BLAST prep
      3 => \&do_blast,         # BLAST
      4 => \&do_GOannotation,  # do GO annotation
   );

   my $choice = 0;
   until ($choice == 5) {
      $choice = &title_page();
      # the menu answer may still carry its line ending, so numify it
      # before using it as a lookup key
      my $task = $task_for{ $choice + 0 };
      $task->() if $task;
   }
   system("clear");
   exit();   #exit program
}
############################################################################################################################

   
###################################################################################
sub title_page() {
#### intro & sub-menu selection
#### Prints the title banner and the main menu, then reads lines from STDIN
#### until one of them consists of a single digit 1-5.
#### Returns the accepted line exactly as it was read (line ending included),
#### or 5 (the "Quit" option) when STDIN is exhausted.
    print_title();
    print "\n\tBefore proceeding please make sure you are in an appropriate\n";
    print "\tdirectory.\n\n";
    print "\t\t1. Create GO-database\n";
    print "\t\t2. Prepare your Blast search\n";
    print "\t\t3. Blast your sequences\n";    
    print "\t\t4. Annotate your sequences based on Blast hit annotation\n";
    print "\t\t5. Quit.\n";

    my $answer;
    while (1) {
        # Read from STDIN explicitly, as the other prompts that use <STDIN>
        # do: a bare <> would read from files named in @ARGV if any were given.
        $answer = <STDIN>;

        # End of input (Ctrl-D, closed pipe): without this check the loop
        # would spin forever on undef. Treat it as "Quit".
        return 5 unless defined $answer;

        # [1-5], not [1|2|3|4|5]: inside a character class "|" is a literal
        # character, so the old pattern accepted "|" as a menu choice.
        last if $answer =~ /^[1-5]$/;

        print " You have entered $answer This is not an option. Please try again\n";
    }
    return $answer;
}
####################################################################################


############################################################################################################################
sub print_title() {
##### displays title
##### every banner line is printed separately in bold white on a black background
    my @style = ("white bold", "on_black");
    my $blank = "\t###                                                     ###\n";

    my @banner = (
        "\n\n\n",
        "\t###########################################################\n",
        $blank,
        "\t###             annot8r_blast2GO.pl                     ###\n",
        "\t###     a script to annotate protein sequences          ###\n",
        "\t###     with GO-terms based on blast results Vs $version_number    ###\n",
        $blank,
        "\t###     EGTDC 2004                                      ###\n",
        $blank,
        "\t###     For news and upgrades and help:                 ###\n",
        "\t###     nematodes.bioinf\@ed.ac.uk                       ###\n",
        $blank,
        "\t###     Help for EG-awardees:                           ###\n",
        "\t###     helpdesk\@envgen.nox.ac.uk                       ###\n",
        $blank,
        "\t###########################################################\n\n\n",
    );

    foreach my $line (@banner) {
        print colored($line, @style);
    }
}
###########################################################################################################################


#########################################################################################################################
sub postmaster_check() {
#### two start-up checks; the program exits if either of them fails:
####   1. "ps -e" must list a line matching "postmaster"
####   2. "psql -l" must finish with exit status 0

#### check for postmaster/postgresql process
  my $ps_lines = `ps -e|grep postmaster`; ### empty unless a matching process line exists
  unless ($ps_lines) {
    print colored("\n#### Postmaster is not running ####\n","red bold");
    print colored("Please ensure that postgreSQL is correctly installed and running\n","red bold");
    exit();
  }
  print "CHECK POSTGRESQL   OK => Postmaster running\n\n";

#### check whether user does exist
  my $psql_status = system ("psql -l > /dev/null"); ### command will fail unless (postgresql)user exists
  if ($psql_status != 0) {
    my $username = `whoami`;
    chomp $username;
    print colored("\n\t#### CONNECTION TO POSTGRESQL FAILED ####\n","red bold");
    print colored("Most likely you have forgotten to run \"createuser $username\"\n","red bold");
    print colored("during the postgreSQL setup\n","red bold");
    exit();
  }
}


###########################################################################################################################
sub query_for_exit() {
#### Asks "Would you like to continue?" and waits for an answer on STDIN.
####   answer starting with y/Y : goes back to the main menu (options)
####   answer starting with n/N : exits the program
####   anything else            : ignored, the next line is read
#### The program also exits when STDIN is exhausted before an answer arrives.
   print "\nWould you like to continue? ";
   print colored(" [y/n] : ","green bold");
   my $input='';
   # Anchored on purpose: the former /y|n/ accepted every line that merely
   # contained a y or an n (e.g. "any"); such input then matched neither
   # branch below and the sub returned without doing anything.
   while ($input !~ /^[yn]/i) {
     print "\b";
     $input = <STDIN>;
     unless (defined $input) {
       # end of input: nobody is left to answer, so do not loop forever
       print "Exiting the program\n";
       exit();
     }
     chomp $input;
     $input =~ s/^\s+//;   # tolerate blanks typed in front of the answer
   }
   if ($input =~ /^y/i) { print "Back to main menu\n"; options(); }
   else                 { print "Exiting the program\n"; exit(); }
}
####################################################################################


###########################################################################################################################
sub query_for_continue() {
#### Prints a [y/n] prompt and waits for an answer on STDIN.
####   answer starting with n/N : exits the program
####   answer starting with y/Y : returns to the caller, which carries on
####   anything else            : ignored, the next line is read
#### The program also exits when STDIN is exhausted before an answer arrives.
#   print "\nWould you like to continue? ";
   print colored(" [y/n] : ","green bold");
   my $input='';
   # Anchored on purpose: the former /y|n/ accepted every line that merely
   # contained a y or an n (e.g. "any") and silently treated it as "continue".
   while ($input !~ /^[yn]/i) {
     print "\b";
     $input = <STDIN>;
     unless (defined $input) {
       # end of input: no confirmation can arrive any more
       print "Exiting the program\n";
       exit();
     }
     chomp $input;
     $input =~ s/^\s+//;   # tolerate blanks typed in front of the answer
   }
   if ($input =~ /^n/i) { print "Exiting the program\n"; exit(); }
}
####################################################################################


####################################################################################
sub yes_no() {
#### Prints a [y/n] prompt and reads lines from STDIN until one starts with
#### y or n (case-insensitive, leading blanks ignored).
#### Returns 1 for y, 0 for n.
#### Returns 0 as well when STDIN is exhausted before an answer arrives.
  print colored(" [y/n] : ","green bold");
  my $input='';
  # Anchored on purpose: the former /y|n/ accepted every line that merely
  # contained a y or an n, so e.g. "maybe" ended the loop and counted as "no".
  while ($input !~ /^[yn]/i) {
    print "\b";
    $input = <STDIN>;
    return 0 unless defined $input;   # end of input: cannot be a "yes"
    chomp $input;
    $input =~ s/^\s+//;
  }
  return ($input =~ /^y/i) ? 1 : 0;
}
####################################################################################


###################################################################################
sub get_file() {
#### get file from user input, check existence
#### Reads a path from STDIN, strips surrounding whitespace and returns the
#### path as soon as it names something that exists. For a path that does not
#### exist the user is asked (yes_no) whether to try again; the program exits
#### on "no" and when STDIN is exhausted.
  while (1) {
    my $file_name = <STDIN>;
    unless (defined $file_name) {
      # end of input: there is nothing left to read a file name from
      print "\nNo file name given - exiting now\n";
      exit();
    }
    chomp $file_name;
    # Trim the ends only. The former s/\s+//g also removed blanks inside the
    # path, so a file such as "my results.out" could never be found.
    $file_name =~ s/^\s+//;
    $file_name =~ s/\s+$//;

    return "$file_name" if -e $file_name;

    print "\nCan't find $file_name, do you want to try again?";
    my $answer = &yes_no();
    if ($answer != 1) {exit();}
  }
}
###################################################################################

sub filecheck {
#### ends the program (after a two second pause) unless the file given as
#### first argument exists and has a size greater than zero
  my ($path) = @_;
  my $has_content = -e $path && (-s $path > 0);
  if (!$has_content) {
    print colored ("Download/unpacking error for $path - exiting now\n\n","red bold");
    sleep (2);
    exit
  }
}
#######################################################################################
sub find_program {
#### search for argument in path, exit if not found
#### Argument: name of the program to look for.
#### Returns "<directory>/<program>" for the first directory in @PATH that
#### holds an executable regular file of that name.
#### Prints an error message and exits when no directory does.
####
#### No "()" prototype here: the sub takes an argument, and an empty prototype
#### makes every call written as find_program("name") a compile-time error
#### (only the &find_program("name") form bypasses it).
  my ($prog) = @_;
  foreach my $dir (@PATH) {
    # an empty PATH entry stands for the current directory
    my $candidate = (length $dir ? $dir : '.') . "/$prog";
    # -x in addition to -f: a non-executable file of the same name earlier
    # in the path must not hide the real program further down
    return $candidate if -f $candidate && -x _;
  }
  print colored("\nCan't find the $prog utility on your system\n","red bold");
  print colored("Please ensure it is installed and in your path\n\n","red bold");
  exit();
}
###############################################################################################################


################################################################################################################
# nicked from james
sub run_blast	{
#### run blast, note alignment is needed for some of the info to be extracted downstream
#### arguments: sequences to blast, database path, blast program, e-value and
#### an optional 'o' value, which is only handed on when it is true
	my ($query_seqs, $db_location, $program, $evalue, $out) = @_;

	my @factory_args = (
		'd'       => "$db_location",
		'program' => "$program",
		'e'       => "$evalue",
		($out ? ('o' => "$out") : ()),
		'b'       => '1',
		'v'       => '1',
		'_READMETHOD' => "Blast",
	);

	my $report = Bio::Tools::Run::StandAloneBlast->new(@factory_args)->blastall($query_seqs);
	return ($report);
}
#######################################################################################


###################################################################################
###  database subs
####################################################################################
sub create_db {
#### should move to external db module
#### Optionally creates a PostgreSQL database and then the requested GO tables.
#### Positional arguments:
####   $database     - name of the database
####   $new_db       - 1: run "createdb" for $database first
####   $go_flag      - 1: create table go
####   $go_slim_flag - 1: create table go_slim
####   $go_def_flag  - 1: create table go_def
#### Dies if the database cannot be connected to; errors of individual
#### "create table" statements are printed and do not stop the others.
####
#### No "()" prototype: the sub takes arguments, and an empty prototype makes
#### every call not written with a leading "&" a compile-time error.
  my ($database, $new_db, $go_flag, $go_slim_flag, $go_def_flag) = @_;
  my $createdb_exe= &find_program("createdb");
  if ($new_db == 1) {
    # single-quote a word for /bin/sh so blanks or metacharacters in the
    # database name cannot change the command
    my $sh_quote = sub { my $word = shift; $word =~ s/'/'\\''/g; return "'$word'"; };
    # "> /dev/null 2>&1" instead of ">& /dev/null": the latter is csh/bash
    # syntax and is rejected by a strictly POSIX /bin/sh such as dash
    system($sh_quote->($createdb_exe) . " " . $sh_quote->($database) . " > /dev/null 2>&1");
  }

  my $conn=DBI->connect("dbi:Pg:dbname=$database", "", "")
    or die "Could not connect to database $database: $DBI::errstr\n";

  my @statements;
  if($go_flag==1) {
    push @statements, "create table go (db text null, dbo_id varchar(10) not null, dbo_sym varchar(10) not null, _not text null, 
    go_id varchar(12) not null, dbref text null, evid text null, w_f text null, asp text null, dbo_name text null,
    dbo_syn text null, dbo_typ text null, taxon text null, date int null, as_by text null);";
  }
  if ($go_slim_flag==1) {
    push @statements, "create table go_slim (go_id varchar(12) not null, slim varchar(50) not null);";
  }
  if ($go_def_flag==1) {
    push @statements, "create table go_def (go_id varchar(12) not null, descr text, pcf varchar(1) not null);";
  }

  foreach my $sql (@statements) {
    # Report right after each statement: the handle's error state is reset
    # by the next DBI call, so a single check at the end only ever showed
    # the outcome of the last statement.
    unless (defined $conn->do($sql)) {
      my $errorMessage = $conn->errstr;
      if ($errorMessage) {print "$errorMessage\n";}
    }
  }
  $conn->disconnect or warn "Disconnection failed: $DBI::errstr\n";
}
############################################################################################## 

##############################################################################################
sub create_annodb {
#### should be merged with previous sub, next version
#### Optionally creates a PostgreSQL database and then the blast_go table.
#### Positional arguments:
####   $database   - name of the database
####   $new_db     - 1: run "createdb" for $database first
####   $table_flag - 1: create table blast_go
#### Dies if the database cannot be connected to; an error of the
#### "create table" statement is printed.
####
#### No "()" prototype: the sub takes arguments, and an empty prototype makes
#### every call not written with a leading "&" a compile-time error.
  my ($database, $new_db, $table_flag) = @_;
  my $createdb_exe= &find_program("createdb");
  if ($new_db == 1) {
    # single-quote a word for /bin/sh so blanks or metacharacters in the
    # database name cannot change the command
    my $sh_quote = sub { my $word = shift; $word =~ s/'/'\\''/g; return "'$word'"; };
    # "> /dev/null 2>&1" instead of ">& /dev/null": the latter is csh/bash
    # syntax and is rejected by a strictly POSIX /bin/sh such as dash
    system($sh_quote->($createdb_exe) . " " . $sh_quote->($database) . " > /dev/null 2>&1");
  }

  my $conn=DBI->connect("dbi:Pg:dbname=$database", "", "")
    or die "Could not connect to database $database: $DBI::errstr\n";

  if($table_flag == 1) {
    my $result=$conn->do ("create table blast_go (run_nr int not null primary key, protein_id varchar(20) not null, blast_go_db varchar(24) not null, blast_hit varchar(16) not null,
	       blast_eval varchar(12) not null, go_term varchar(14) not null, go_asp varchar(6) not null, go_evid  varchar(12) not null, slim varchar(20));");
    # do() yields undef on failure; report the reason straight away
    unless (defined $result) {
      my $errorMessage = $conn->errstr;
      if ($errorMessage) {print "$errorMessage\n";}
    }
  }
  $conn->disconnect or warn "Disconnection failed: $DBI::errstr\n";
}

##############
###   fin  ###
##############
