#!/usr/bin/perl

#######################################################################################
#Name: prot4EST_2.1.1.pl
#Author: James Wasmuth, ICAPB, University of Edinburgh
#james.wasmuth@ed.ac.uk
#Contributions:  Al Anthony, Ann Hedley, Ralf Schmid, John Parkinson, Mark Blaxter
#
#Date created: 10/07/03
#Date last modified: 20/12/04 (JW)
#
#Usage: prot4EST.pl
#
#Copyright (C) 2004  James Wasmuth
#
#Reference: 
#
#This program is free software; you can redistribute it and/or
#modify it under the terms of the GNU General Public License
#as published by the Free Software Foundation; either version 2
#of the License, or (at your option) any later version.
#
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
#along with this program; if not, write to the Free Software
#Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
#
#Description: takes the seed sequence and blasts it against the rRNA blast database.
#If there is a "confident" hit then the seed is classified as a putative rRNA.
#Otherwise the mitochondrial databases are searched, and if there is a hit against these, the
#sequence is flagged as a possible (probable?) mito sequence and is translated as such.
#Any sequence that cannot be confidently assigned as rRNA or mitochondrial (most) will
#be searched over the nr protein set and any significant hits will be utilised in the
#translation.
#Finally any sequence escaping all these similarity searches will be forwarded down the
#conveyor belt to Decoder and ESTScan for more in-depth processing.
#######################################################################################

use strict;

#List modules to be utilised.
use Bio::Tools::Run::StandAloneBlast;
use Bio::SearchIO;
use Bio::SeqIO;
use Term::ANSIColor;
use File::Copy;
use Cwd;

#list the requirements
require 'p4e2/p4e_checker.pl';
require 'p4e2/tile_path.pl';
require 'p4e2/blaststuff.pl';
require 'p4e2/estscan_plugin.pl';
require 'p4e2/getDNA.pl';


# Switch on autoflush for the currently selected output handle so that
# prompts and progress output appear immediately.
$| = 1;


###########################################################################
###########################################################################
#SECTION 1 - FILE-SCOPED STATE
#
# Everything declared here is visible to all later sections of the script.
# Several of these may no longer be needed - to be checked.

# configuration / bookkeeping scalars
my $conf;
my $count_okay;
my $html;
my $hsp_ref_counter;
my $options;

# sequence scratch variables
my $seq_body;
my $seq_head;
my $seqname;
my $sixframe;

# containers
my @further;
my @mito_ids;
my %seeds;

my $version = '      version: 2.1.1  -  January \'05        ';

# Change this depending on the decoder executable
my $decoderbiastable = "human_high.cod";

my $i = 0;

# running tallies
my $count_rRNA    = 0;	# putative rRNAs
my $count_mito    = 0;	# putative mitochondrial sequences
my $count_nr      = 0;	# seeds which hit nr protein
my $count_further = 0;	# seeds with no significant hit, sent on to Decoder and beyond
my $total_count   = 0;
my $current_count = 0;

# cleared in section 2 when the user chooses to continue without DECODER
my $dec_flag = 1;

###########################################################################
###########################################################################
##SECTION 2 - do some checking before launching the script proper

{	#do this first

	# Pre-flight check that the external programs are reachable.
	# p4e2::p4e_checker::exe() returns a reference to the list of programs
	# it could not find.  A missing 'decoder' is negotiable (the user may
	# carry on without it, which clears $dec_flag); anything else missing
	# ends the run.

	print "\nstarting prot4EST checks...\n";
	#checks that all programs are present and correct.
	my $errors = p4e2::p4e_checker::exe("decoder","transeq","blastall","ESTScan","build_model_utils.pl" ,"build_model","makesmat","maskred");

	if (scalar @$errors)	{
		print "\nInitial tests are unable to find the following required programs:\n";

		# Anything other than decoder is a hard requirement.
		my @fatal   = grep { !m/decoder/ } @$errors;
		my $dec_err = scalar(@fatal) < scalar(@$errors);

		if (@fatal)	{
			print $_,"\t" foreach @$errors;
			print "\n\nPlease make sure these programs can be found in your PATH\n";
			exit;
		}

		if ($dec_err)	{	#could not find decoder.  Is it required?
			print   "Initial test are unable to locate the 'decoder' program.  It is not in your PATH.\n".
					"Is this intentional?\n  Your options are:\n\n".
					"1. If you do not have the program but wish to get hold of it please consult the prot4EST user guide\n".
					"2. Place the program into your PATH (e.g. \".bashrc\" / \".cshrc\")\n".
					"3. Continue prot4EST process without utilising 'decoder'\n\n";

			# The patterns are anchored so that only a lone 1, 2 or 3 is
			# accepted; unanchored, an entry such as "13" passed the
			# prompt and was then treated as a request to exit.
			my $choice = '';
			until ($choice =~ /^\s*[123]\s*$/)	{
				print "Please select a number 1-3:\n";
				$choice = <STDIN>;
				unless (defined $choice)	{
					# STDIN is closed: stop rather than prompt forever.
					print "\nNo selection could be read.  Now exiting...\n";
					exit;
				}
				chomp $choice;
			}
			if ($choice =~ /^\s*[12]\s*$/)	{exit;}
			else {$dec_flag=0;}		#don't run DECODER
		}
	}

	# pico is only a convenience for creating the configuration file.
	my $pico_err = p4e2::p4e_checker::exe("pico");

	if (scalar @$pico_err >= 1)	{
		print "Unable to find the pico text editor.  To create the configuration file use the example as a template in whatever text editor you prefer.\n\n";
	}
}


###########################################################################
###########################################################################
#SECTION 3 - Get User Input

#present title page.
#ask for/setup config file.

# Configuration loop: keep showing the title menu until a configuration file
# has been loaded and both check_files() and convert_e() stop reporting 1.
my $error=1;	#first value returned by check_files(); the loop repeats while it is 1
my $error_e=0;	#first value returned by convert_e(); the loop repeats while it is 1
START: while ($error eq 1 || $error_e eq 1)	{
	print "\n";
	my $answer=&title_page();
	if ($answer==1)    {&make_config(); print colored("\n\nNow select OPTION 2 to load the configuration file\n\n","green bold"); next START;}
	if ($answer==2)    {$conf=&upload();}
	if ($answer==3)    {system "less README";}
	if ($answer==4)    {print "Now exiting...\n\n\n"; exit;}
	if ($conf)	{
		$options =  &parse_conf($conf);		#perhaps use an array to tidy up.  But this works.

		my $date = (`date +%d%m%y_%H%M%S`);
		chomp $date;

		# LOG and ERROR are written to throughout the script.  A failed
		# open used to go unnoticed and every later message was lost.
		open(LOG, '>', "prot4EST_$date.log")
			or die "\nERROR: Cannot create prot4EST_$date.log: $!\n";
		open(ERROR, '>', "prot4EST_$date.errorlog")
			or die "\nERROR: Cannot create prot4EST_$date.errorlog: $!\n";

		print colored ("Created prot4EST_$date.log and prot4EST_$date.errorlog\n\n",'bold');

		($error, my $checkin, my $pre_blast) =  &check_files($options,$dec_flag);
		$options->{'in'}=$checkin;						#if the input was a directory it is now in one file.
		$options->{'pre-blast'}=$pre_blast;

		($error_e,$options->{blastnE},$options->{blastxE}) = &convert_e($options);

	}
	else	{
		print colored ("### Error! ###\n","red bold");
		print colored ("Cannot find configuration file or none selected\n","red bold");
		# The error log only exists once a configuration has been loaded.
		print ERROR "### Error! ###\nCannot find configuration file or none selected!\n"
			if defined fileno(ERROR);
	}
}

print ERROR "\n\nSUCCESS!  ALL VARIABLES CHECKED!\n\n";
print LOG "\n\nSUCCESS!  ALL VARIABLES CHECKED!\n\n";

###########################################################################
###########################################################################
#SECTION 4 - Some more checking and additional user input may be required.

#see if the selected output directory exists.  


# Refuse to reuse an existing directory: keep asking for a new name until
# the chosen output directory does not exist yet.
OUTDIR: while (-d $options->{'out'})	{
	print colored ("\n### WARNING ###\n ","green bold");
	print colored ("$options->{out} already exists!  Please choose another name...\n","green bold");

	my $new_name = <STDIN>;
	chomp $new_name;

	# an empty reply simply triggers the prompt again
	if ($new_name =~ m/\w/)	{
		$options->{'out'} = $new_name;
		print colored ("Thank-you.  Now using $options->{'out'}\n","green");
	}
}





###########################################################################
###########################################################################
#SECTION 5 - Start up script.  Creates directory for output files and splits up the input file into its constituent sequences

print colored ("\n\tStarting prot4EST\n","green bold");

# Boilerplate appended to messages for errors that should be reported.
my $er_msg="\nPlease report this error to me at nematodes.bioinf\@ed.ac.uk\nPlease send me the .log file and the message given on the terminal screen.\nI'll get back to you as quickly as possible\n\n";

#need to place options into scalars

my $input = $options->{'in'};
my $out_dir = $options->{'out'};
my $species = $options->{'species'};
my $rnafile = $options->{'rnafile'};
my $mitofile = $options->{'mitofile'};
my $blastnE = $options->{'blastnE'};
my $blastxE = $options->{'blastxE'};
my $pre_blast = $options->{'pre-blast'};
my $sequaldir = $options->{'sequaldir'};
my $seqsuff = $options->{'seqsuff'};
my $qualsuff = $options->{'qualsuff'};
my $pre_DEC = $options->{'pre-DEC'};
my $estscanmatrix = $options->{'estscanmatrix'};
my $cut = $options->{'cut'};
my $gen_cod_file=$options->{'gen_code'};

# The script is about to chdir into the output directory, so a relative
# input path has to be anchored to the present working directory first.
# Paths that are already absolute, or that start with '~', are left alone.
# (This replaces a later "if (!$input=~m/\~/)" test which, because '!'
# binds tighter than '=~', was always false and never adjusted the path.)
if (defined $input && length $input && $input !~ m{^[/~]})	{
	$input = cwd() . "/$input";
}

#make the output directory and move into it.
mkdir($out_dir)
	or die "\nERROR: Cannot create output directory '$out_dir': $!\n";
chmod 0755, "$out_dir";
chdir($out_dir)
	or die "\nERROR: Cannot move into output directory '$out_dir': $!\n";

open(INFO, '>', 'info.txt')
	or die "\nERROR: Cannot open $out_dir/info.txt: $!\n";

print INFO "Options entered:\n";
for my $key (keys %$options)	{
	print INFO "$key => ".$options->{$key}."\n";
}


#extract the individual seed sequences from the input file,
#keyed by display id
my $inSeqIO  = Bio::SeqIO->new(-file => "$input" , '-format' => 'Fasta');
while ( my $seq = $inSeqIO->next_seq() ) {
	$total_count++;
	my $id=$seq->display_id;
	$seeds{$id} = $seq;
}

print INFO "\n\nIn total $total_count sequences entered\n\n\n";

###########################################################################
###########################################################################
#SECTION 6 - Preprocessing...	
#				i) asks which genetic codes the user wishes to use
#				ii) Creates the 6 frame translations using both nuclear and mitochondrial codon tables
#				iii) Runs blast search of entire input against a database of choice, if required
#				iv) Runs DECoder program on all sequences, if required.  Quicker and easier than on individual sequences
########################################################
#LAUNCH THESE AS FACTORY'S IN BIOPERL?
#WE ALLOW IT TO BE DONE ON SEQ4SEQ BASIS AND
#ONLY IN DESIRED FRAME!
#Currently can't get this to work...
########################################################

# Genetic code selection.  $gcodes is indexed by genetic code number; an
# entry counts as present when its first element (the name printed in the
# menus below) is true.
my $gcodes=p4e2::getDNA::store_gen_code($gen_cod_file);
my $nuc_code=1;
my $mito_code=5;
print colored ("You need to choose which Genetic Codes to use for nuclear and mitochondrial translations\n",'green bold');
print colored ("Select a nuclear genetic code [default=$nuc_code]\n",'green bold');
for my $idx (0 .. $#{$gcodes})	{
	next unless $gcodes->[$idx] && $gcodes->[$idx]->[0];
	print "$idx: ",$gcodes->[$idx]->[0],"\n";
}

# Only a plain number naming a listed code replaces the default.  An empty
# reply, text, or a closed STDIN used to be treated as array index 0, and
# surrounding whitespace ended up in the transeq command line.
my $seln = <STDIN>;
$seln = '' unless defined $seln;
$seln =~ s/^\s+|\s+$//g;
if ($seln =~ /^\d+$/ && $gcodes->[$seln] && $gcodes->[$seln]->[0])	{	#is it valid?
	$nuc_code=$seln+0;
}

print colored ("Select a mitochondrial genetic code [default=$mito_code]\n",'green bold');
for my $idx (0 .. $#{$gcodes})	{
	next unless $gcodes->[$idx] && $gcodes->[$idx]->[0];
	next unless $gcodes->[$idx]->[0]=~/mitochondrial/i;
	print "$idx: ",$gcodes->[$idx]->[0],"\n";
}
$seln = <STDIN>;
$seln = '' unless defined $seln;
$seln =~ s/^\s+|\s+$//g;
if ($seln =~ /^\d+$/ && $gcodes->[$seln] && $gcodes->[$seln]->[0])	{	#is it valid?
	$mito_code=$seln+0;
}

print colored ("You have selected nuclear genetic code: $nuc_code and mitochodrial genetic code: $mito_code\n");


#create six frame translations of the input using
#the chosen nuclear and mitochondrial tables.
#code 1 is passed to transeq as table 0.
my $table1= ($nuc_code==1 ? 0 : $nuc_code);
my $table2= ($mito_code==1 ? 0 : $mito_code);

# Output is discarded with the POSIX form "> /dev/null 2>&1"; the csh-style
# ">& /dev/null" is rejected when /bin/sh is a strict POSIX shell.
system ("transeq $input nucl_6pep.fa -frame=6 -table=$table1 -alternative > /dev/null 2>&1");
system ("transeq $input mito_6pep.fa -frame=6 -table=$table2 -alternative > /dev/null 2>&1");
system "formatdb -i nucl_6pep.fa -p T > /dev/null 2>&1";

# working copy of the input, formatted as a nucleotide blast database
copy($input,'./input.fsa')
	or die "\nERROR: Cannot copy $input to input.fsa: $!\n";
system "formatdb -i input.fsa -p F > /dev/null 2>&1";


#fetch codon table and build ESTScan matrix.
# ESTScan matrix: use the one named in the configuration if there is one,
# otherwise ask the estscan plugin for one and offer the pseudo-CDS option.
my ($embl_db,$plus_pseudo,$escan_sp);
if ($estscanmatrix)	{
	#copy known matrix to directory
	copy("$estscanmatrix","./");
}
else	{
	($estscanmatrix, $embl_db, $escan_sp)=p4e2::estscan_plugin::get_embl_seqs($species, $out_dir);
	if ($estscanmatrix)	{
		print colored ("\nprot4EST includes the facility to complement the hidden Markov model training\n", 'green bold');
		print colored ("with CDS determined from the BLAST searches. These are filtered to ensure only\n", 'green bold');
		print colored ("the most robust are used.\nDo you wish to use this facility?\n",'green bold');

		# keep prompting until a lone 'y' or 'n' has been entered
		until (defined $plus_pseudo)	{
			print colored ("[y/n]\n",'green bold');
			my $reply=<STDIN>;
			chomp $reply;
			if ($reply =~ m/^\s*[y]\s*$/)	{
				$plus_pseudo=1;
			}
			elsif ($reply =~ m/^\s*[n]\s*$/)	{
				$plus_pseudo=0;
			}
		}
	}
}

#run decoder and all the preformatting steps
#run decoder and all the preformatting steps
# Only needed when DECODER is to be run by this script, i.e. it has not
# been switched off and no pre-computed DECODER results were supplied.
if ($dec_flag && !$pre_DEC)	{

	my $DECprog=p4e2::p4e_checker::wheredecoder();

#need to get hold of a codon bias table before able to run...
#also copy across the program to directory.
	#make a copy of the executable in pwd - requirement of decoder fortran code.
	copy($DECprog, './')
		or die "\nERROR: Cannot copy the decoder executable into $out_dir: $!\n";
	chmod 0755, "decoder";

	unless ($cut)	{	#no cut specified in config file
		&getcodon($species, $decoderbiastable);
	}
	else	{
		copy($cut, "./$decoderbiastable")
			or die "\nERROR: Cannot copy codon usage table $cut to $decoderbiastable: $!\n";
	}

	# These three options are handed to setup_decoder() later on; stop now
	# with a readable message instead of an anonymous "Died".
	if (!$sequaldir)	{die "\nERROR: DECODER is to be run but option 'sequaldir' is not set\n";}
	if (!$qualsuff) {die "\nERROR: DECODER is to be run but option 'qualsuff' is not set\n";}
	if (!$seqsuff)	{die "\nERROR: DECODER is to be run but option 'seqsuff' is not set\n";}
	$dec_flag=1;
}

#preprocess of blasting against SWISSALL (or whatever dbase specified);

#CHECK FOR BLAST RELATED ENVIRONMENT VARIABLES FOR THIS SCRIPT

# BLAST environment.  BLASTMAT and BLASTDB are requested interactively when
# they are missing, then (unless a blast report was supplied) the input is
# searched against a protein database chosen through finddb().
#
# The "quit" answer must be a lone Q/q.  The test used to match a 'q'
# anywhere in the reply, so any path containing that letter quit the program.

my $blastmat=0;
unless ($ENV{BLASTMAT})	{	#necessary to run blast jobs.  Make sure user has set this.
	print colored ("### Error! ###\nYou need to provide a path to the substitution matrices (e.g. BLOSUM62) for blastall.\n",'red bold');
	print colored ("This is usually done by setting the 'BLASTMAT' variable in your .bashrc or .cshrc file\n", 'red bold');
	print colored ("Please provide the full path below or type 'Q' to quit and consult the blastall documentation...\n",'bold red');
	my $choice = <STDIN>;
	exit unless defined $choice;	# STDIN closed: nothing can be entered
	chomp $choice;
	if ($choice=~m/^\s*Q\s*$/i)	{exit;}
	else {$ENV{'BLASTMAT'}=$choice;}	#this does not perform a check, rather presumes the user is correct.  Could change.
}

my $blastdb;	#determine now

unless ($options->{'pre-blast'})	{ 		#so it appears that the user wants to run the blasts themselves
	print colored ("\n\nIn order to BLAST the sequences you must have the sequence database on your system\nSearching...", 'green bold');
	unless (exists $ENV{BLASTDB})	{			#where are the blast databases?
		print colored ("\n\n### Error! ###\nThe 'BLASTDB' environmental variable has not been set\n", 'red bold');
		print colored ("Please enter the location of the blast database to use in this process or 'Q' to quit and consult the blastall documentation...\n", 'red');
		my $choice = <STDIN>;
		exit unless defined $choice;	# STDIN closed: nothing can be entered
		chomp $choice;
		if ($choice=~m/^\s*Q\s*$/i)	{exit;}
		else {$ENV{'BLASTDB'}=$choice;}
	}
	$blastdb=p4e2::blaststuff::finddb('prot','ask');
}

if ($blastdb)	{
	print colored ("\nSelected option to blast search against $blastdb\nThis take may some time.\nGo put kettle on...\n","green bold");
	mkdir 'blastresults', 0755;
	# If either chdir fails, every later relative path would point at the
	# wrong directory, so stop here.
	chdir 'blastresults'
		or die "\nERROR: Cannot move into blastresults: $!\n";
	&run_blast($input, $blastdb, 'blastx', $blastxE, 'blastreport', undef, $nuc_code);
	$pre_blast = 'blastresults/blastreport';
	chdir '../'
		or die "\nERROR: Cannot move back out of blastresults: $!\n";
}

#setup blast report file if specified
#setup blast report file if specified
# $blfile ends up holding the path of the plain-text blast report that is
# parsed in section 7.  A report that looks like HTML is first passed
# through strip_html().
my $blfile;
if ($pre_blast)	{
	if (-f $pre_blast)	{
		# Look for an <html> tag within the first 50 lines.  Each line is
		# read into a named variable: with "while (<FH> && ...)" the line
		# is not assigned to $_, so the tag test never saw the file.
		open(my $report_fh, '<', $pre_blast)
			or die "\nERROR: Cannot open blast report $pre_blast: $!\n";
		my $is_html = 0;
		my $checked = 0;
		while (defined(my $line = <$report_fh>))	{
			last if ++$checked > 50;
			if ($line =~ /<html>/i)	{
				$is_html = 1;
				last;
			}
		}
		close $report_fh;

		if ($is_html)	{
			print colored ("\nBlast Report file selected is HTML formatted.  This must first be stripped of tags\n",'green bold');
			print colored ("This may take a few minutes, please be patient.\n",'green bold');
			$pre_blast=strip_html($pre_blast);
		}
		$blfile=$pre_blast;
	}
	else {
		print colored ("\n\n### Error! ###\n", 'red bold');
		print colored ("Unable to determine the blast report file.\nPath given: $pre_blast\nIs this correct?\n\n",'red bold');
		exit(2);
	}
}

###########################################################################
#SECTION 7 - starts looking for sequences with similarities,
#this requires running blastall against mitochondrion and rRNA databases.
#(7a)Blast reports from mitochondrial and nuclear encoded proteins are parsed
#and the hsps are joined.
#(7b)These are then extended
###########################################################################


print colored ("\n\nSearching for homologues.  This may take a few minutes.\n",'green bold');
print colored ("\nribosomal RNA genes...\n",'green bold');

# run_blast() is handed an array of sequence objects, not the %seeds hash.
my @seeds = map { $seeds{$_} } keys %seeds;

# rRNA database holds nucleotide sequences, hence blastn
my $rRNA_blast = &run_blast(\@seeds, $rnafile, 'blastn', $blastnE, 'rna.bls', undef, undef);
my ($blast_ref_rRNA) = &split_blast($rRNA_blast, $blastnE, $total_count,\%seeds);

print colored ("\nmitochondrial genes...\n",'green bold');
my $mito_blast = &run_blast(\@seeds, $mitofile, 'blastx', $blastxE, 'mito.bls', undef, $mito_code);
my ($blast_ref_mito) = &split_blast($mito_blast, $blastxE, $total_count,\%seeds);

# the nuclear protein search is not run here: its report ($blfile) already exists
print colored ("\nnuclear proteins. For larger datasets this may take some time...\n",'green bold');
my ($blast_ref_nr, $check_ref) = &split_blast($blfile, $blastxE, $total_count,\%seeds);

# Every query should appear in the nuclear blast report.  A missing one is
# not fatal, so this is a separate pass that only issues warnings.
print colored ("\nChecking blast report files.  For large datasets this may take a couple of minutes...\n",'green bold');
for my $seed_obj (@seeds)	{
	my $seed_name = $seed_obj->display_id;
	next if exists $check_ref->{$seed_name};

	my $warning = "## Warning: sequence $seed_name was not found in the blast report file.  Non-fatal.  Process continuing...\n";
	print colored ($warning,"green bold");
	print INFO $warning;
}
print "\n";
# Assign every seed to exactly one class, in order of precedence
# rRNA > mitochondrial > nuclear (nr), by deleting it from the lower
# ranking hit tables.  Seeds with no hit at all are collected in $further
# for ESTScan / DECODER.
# (A stray "sub spl" fragment that had been pasted after the first "next;"
# and stopped the script compiling has been removed.)
my $further;
foreach my $seq (@seeds)	{
	my $id=$seq->display_id;
	if (exists $$blast_ref_rRNA{$id})	{
		delete $$blast_ref_mito{$id};
		delete $$blast_ref_nr{$id};
	}
	elsif (exists $$blast_ref_mito{$id})	{
		delete $$blast_ref_nr{$id};
	}
	elsif (!exists $$blast_ref_nr{$id})	{
		push @$further, $seq;
	}
}

# Class sizes after the redundancy removal above.
my $size_rRNA = keys (%$blast_ref_rRNA);
my $size_mito = keys (%$blast_ref_mito);
my $size_nr = keys (%$blast_ref_nr);
# $further stays undefined when every seed had a hit; report 0 in that case.
# (The former "my ... if COND" form left the count undefined, so the
# summary showed a blank.)
my $size_further = $further ? scalar(@$further) : 0;


print colored ("SUMMARY:\tnumber of rRNA: $size_rRNA\n\t\tnumber of mito: $size_mito\n\t\tnumber of nr hits: $size_nr\n\t\tnumber sent for further processing: $size_further\n\n",'blue bold');
print INFO "SUMMARY:\tnumber of rRNA: $size_rRNA\n\t\tnumber of mito: $size_mito\n\t\tnumber of nr hits: $size_nr\n\t\tnumber sent for further processing: $size_further\n\n";


# Putative rRNAs are not translated; they are only listed in rRNAhits.txt.
if ($size_rRNA)	{
	open (RNA, '>', 'rRNAhits.txt') or die "\nERROR: Cannot open $out_dir/rRNAhits.txt\n$er_msg";
	print colored ("Processing putative rRNA genes...\n",'green bold');
	for my $seed_id (keys %$blast_ref_rRNA)	{
		progress(++$count_rRNA,$size_rRNA);
		my $top_hit = $blast_ref_rRNA->{$seed_id}[0];	# first stored entry for this seed
		print RNA ">$seed_id\tputative rRNA\t",$top_hit->evalue,"\n";	#display: name, desc, evalue
		print RNA $top_hit->query_string,"\n";							#sequence
	}
	close RNA;
}
else	{print colored ("No rRNA genes to record.  Proceeding...\n", 'green bold')};

#now need to create a max scoring minimal tiling path

###########################################################################
#SECTION 7a
###########################################################################
my ($sixframe_file,$transln_mito,$transln_nr);
my @sixframe_seqs;

if (!$size_mito)	{
	print colored ("No mitochondrial genes to translate.  Proceeding...\n",'green bold');
}
else	{
	print colored ("\n\nTranslating putative mitochondrial genes...\n",'green bold');

	# load the six-frame translations made with the mitochondrial table
	$sixframe_file = Bio::SeqIO->new('-file' => "mito_6pep.fa" , '-format' => 'Fasta');
	while (my $frame_seq = $sixframe_file->next_seq())	{
		push @sixframe_seqs, $frame_seq;
	}

	# Max scoring minimal tiling path for every seed with a confident
	# mitochondrial homologue.  Per-seed record layout:
	#   [0] start of the tile path with regard to the EST
	#   [1] end of the tile path with regard to the EST
	#   [2] amino acid sequence from the tile path process
	#   [3] full length sequence after tile path and extension
	#   [4] notes/comments
	#   [5] anonymous array with information on each hsp
	#   [6] length of nucleotide sequence
	#   [7] nucleotide coding region, added later
	my $mito_tiles = p4e2::tile_path::tile_path($blast_ref_mito, \@sixframe_seqs, '7');

	###########################################################################
	#SECTION 7b - extend the tiling paths
	###########################################################################
	print colored ("\nExtending mitochondial protein hits...\n",'green bold');
	my $mito_extended = p4e2::tile_path::extend($mito_tiles, \@sixframe_seqs, 'blast');
	$transln_mito = p4e2::getDNA::fromBlast($mito_extended, \%seeds,$mito_code,$gen_cod_file);
}

# swap the mitochondrial six-frame set for the nuclear one
@sixframe_seqs = ();
$sixframe_file = Bio::SeqIO->new('-file' => "nucl_6pep.fa" , '-format' => 'Fasta');
while (my $frame_seq = $sixframe_file->next_seq())	{
	push @sixframe_seqs, $frame_seq;
}

###########################################################################
#SECTION 7a
###########################################################################
if (!$size_nr)	{
	print colored ("No nuclear genes to translate.  Proceeding...\n",'green bold');
}
else	{
	print colored ("\n\nTranslating nuclear encoded genes...\n",'green bold');

	# max scoring minimal tiling paths for all seeds with confident
	# homologues in the nuclear database
	my $nr_tiles = p4e2::tile_path::tile_path($blast_ref_nr, \@sixframe_seqs, '7');

	###########################################################################
	#SECTION 7b - extend the tiling paths
	###########################################################################
	print colored ("\nExtending nuclear protein hits...\n",'green bold');
	my $nr_extended = p4e2::tile_path::extend($nr_tiles, \@sixframe_seqs, 'blast');
	$transln_nr = p4e2::getDNA::fromBlast($nr_extended, \%seeds,$nuc_code,$gen_cod_file);

	if ($plus_pseudo)	{
		print colored ("\nBuilding pseudo CDS...\n",'green bold');

		# The original SRS search may have pulled back nothing, in which
		# case fall back on the configured species name.  build_model from
		# ESTScan does not allow multiple species names, so the name used
		# for the original model is kept when there is one.
		$escan_sp ||= $species;
		my ($pseudo_num,$pseudo_file)=p4e2::estscan_plugin::pseudo_collect($blast_ref_nr, $escan_sp, $embl_db, \%seeds);

		print colored("Found $pseudo_num sequences that are sufficient to create pseudo CDS entires\n");
		$estscanmatrix=p4e2::estscan_plugin::build_matrix_with_pseudo($pseudo_file,$embl_db, $escan_sp)
			if $pseudo_num > 0;
	}
}

###########################################################################
#SECTION 8
#
#Do some printing.  Need to print two versions.  One that includes the 'whole'
#translation, extension and all, and one that is just the tile_path made from
#the hsps.
###########################################################################	

# Write the similarity-based translations (mitochondrial first, then
# nuclear).  For every seed three FASTA records are appended:
#   translations_xtn.fsa   - element [3] of the translation record
#   nt_coding.fsa          - element [7]
#   translations_noxtn.fsa - element [2]
# and one comma-separated row goes to DB_MAIN plus one row per hsp to DB_HSPS.
# NOTE(review): outfiles() is defined outside this view; presumably it
# opens/closes the DB_MAIN and DB_HSPS handles - confirm.
&outfiles('open');

print colored ("\nWriting Translations...\n", 'green bold');
#print colored ("\nWriting Translations...\n", 'black bold');
my $count=1;							# running position handed to progress()
my $size2print=$size_mito+$size_nr;		# total handed to progress() for both loops
my $mito_write=0;						# number of mitochondrial translations written
for my $id (keys %$transln_mito)	{
	
	progress($count++,$size2print);
	# peptide id: copy of the seed id with a 'C' in third position
	# (after two word characters) replaced by 'P'
	(my $id_p = $id) =~ s/^(\w\w)C/$1P/;
	# $prop is returned but not used in this loop
	my ($main, $hsps, $prop) = &get_stats_4_psql($id_p,$transln_mito->{$id},$hsp_ref_counter);
	
	# the first four columns of the DB_MAIN row are set here
	$main->[0]=$id_p;
	$main->[1]=$id;
	$main->[2]='similarity';
	$main->[3]='mitochondrially-encoded';
	
	my $desc = "putative mitochondrial protein  Method: similarity and extension";
	
	# the three output files are re-opened in append mode for every seed
	my $xtn=Bio::SeqIO->new(-file=>">>translations_xtn.fsa", -format=>'fasta');
	my $noxtn=Bio::SeqIO->new(-file=>">>translations_noxtn.fsa", -format=>'fasta');
	my $coding=Bio::SeqIO->new(-file=>">>nt_coding.fsa",-format=>'fasta');
	# one sequence object is reused for all three records; the order of the
	# seq()/desc() updates below decides what each file receives
	my $seqobj = Bio::Seq->new(-display_id=>$id_p, -desc=>$desc, -seq=>$transln_mito->{$id}->[3]);
	$xtn->write_seq($seqobj);
	#change the object to write coding region
	$seqobj->seq($transln_mito->{$id}->[7]);
	$coding->write_seq($seqobj);
	# the no-extension record drops " and extension" from its description
	my $d=$seqobj->desc;
	$d=~s/\sand\sextension//g;
	$seqobj->desc($d);
	$seqobj->seq($transln_mito->{$id}->[2]);
	$noxtn->write_seq($seqobj);
	$mito_write++;
	print DB_MAIN join ",", @$main;
	print DB_MAIN "\n";
	
	# one row per hsp; $hsp_ref_counter advances with every row written
	foreach my $hsp_stats (@$hsps)	{
		print DB_HSPS join ",", @$hsp_stats;
		print DB_HSPS "\n";
		$hsp_ref_counter++;
	}
}
my $nuc_write=0;						# number of nuclear translations written
# same procedure for the nuclear encoded translations
for my $id (keys %$transln_nr)	{

	progress($count++,$size2print);
	(my $id_p = $id) =~ s/^(\w\w)C/$1P/;
	
	my ($main, $hsps, $prop) = &get_stats_4_psql($id_p,$transln_nr->{$id},$hsp_ref_counter);
	
	my $desc = "putative nuclear encoded protein  Method: similarity and extension";
	$main->[0]=$id_p;
	$main->[1]=$id;
	$main->[2]='similarity';
	$main->[3]='nuclear-encoded';
	
	my $xtn=Bio::SeqIO->new(-file=>">>translations_xtn.fsa", -format=>'fasta');
	my $noxtn=Bio::SeqIO->new(-file=>">>translations_noxtn.fsa", -format=>'fasta');
	my $coding=Bio::SeqIO->new(-file=>">>nt_coding.fsa",-format=>'fasta');
	my $seqobj = Bio::Seq->new(-display_id=>$id_p, -desc=>$desc, -seq=>$transln_nr->{$id}->[3]);
	$xtn->write_seq($seqobj);
	#change the object to write coding region
	$seqobj->seq($transln_nr->{$id}->[7]);
	$coding->write_seq($seqobj);
	my $d=$seqobj->desc;
	$d=~s/\sand\sextension//g;
	$seqobj->desc($d);
	$seqobj->seq($transln_nr->{$id}->[2]);
	$noxtn->write_seq($seqobj);
	$nuc_write++;
	print DB_MAIN join ",", @$main;
	print DB_MAIN "\n";
	
	foreach my $hsp_stats (@$hsps)	{
		print DB_HSPS join ",", @$hsp_stats;
		print DB_HSPS "\n";
		$hsp_ref_counter++;
	}
}
&outfiles('close');
# number of seeds that had no similarity hit (0 when $further was never filled)
my $size= $further ? scalar @$further : 0;
print colored ("\n\n$size sequences about to be passed through ESTScan and DECODER\n",'green bold');

#Look at DECoder results for those sequences without blast hits

###########################################################################
#SECTION 10
#
#Checking the ESTScan predictions
#There are issues with the location of peptides with respect to the est
#will be sorted out soon.
###########################################################################	

# ESTScan pass over the seeds that had no similarity hit.  The seeds are
# written to estscan.fa, ESTScan is run on that file, its predictions are
# parsed and written out, and $further is replaced by the list of seeds for
# which no prediction was returned.
my $estscan_write=0;	# number of ESTScan translations written
my $estscan_res;		# NOTE(review): never assigned in the code visible here
if ($estscanmatrix && ($size > 0))	{	
	print INFO "Carrying out ESTScan on ",scalar @$further," sequences\n";
	print colored ("Carrying out ESTScan on $size sequences...\n",'green bold');
	#print colored ("Carrying out ESTScan on $size2estscan sequences...\n",'black bold');
	open (TEMP, ">estscan.fa") || die "\nERROR: Cannot open estscan.fa\n$er_msg";
	
	# FASTA file holding just the seeds that still need a translation
	foreach my $seqobj (@$further)	{
		print TEMP ">",$seqobj->display_id,"\n",$seqobj->seq,"\n";
	}
	close TEMP;
	
	p4e2::estscan_plugin::run_estscan('estscan.fa',$estscanmatrix,'estscan_nt.fsa','estscan_pep.fsa','90');
	die "Unable to find estscan_nt.fsa\n" if (!-f 'estscan_nt.fsa');
	system "rm -f estscan.fa";		#could create a temp file instead!
	
#now have a fasta file with all the estscan predictions.
#it needs to be parsed similar to the DECoder results.

	my $transln_estscan = p4e2::estscan_plugin::parse_results($further,'estscan_nt.fsa','estscan_pep.fsa',\%seeds);
	# NOTE(review): this count is taken from $estscan_res, which is still
	# undefined here, and $size_estscan is declared again further down
	# before it is used - this line looks like a leftover.
	my $size_estscan=keys %$estscan_res;
		
	# shadows the file-level $count for the rest of this block
	my $count=1;
	
###########################################################################
#SECTION 10b
#
#printing the ESTScan predictions.
###########################################################################	
	my ($estscan_seq); 
	#go through all the seqobjs in further and see which gave 'robust' translations
	my $toDECODER;
	PARSE: foreach my $seqobj (@$further)	{
		my $id=$seqobj->display_id;
		#my $transln=$transln_estscan->{$id};
		# no entry from parse_results: queue the seed for DECODER
		unless (exists $transln_estscan->{$id})	{
			push @$toDECODER, $seqobj;
			next PARSE;
		}
		#if ($transln->[-1][-1] == '0')	{#poor translation
		#	push @$toDECODER, $seqobj;		#use the id rather than seqobj as the latter may not exist if ESTScan never translated the sequence
		#	delete $transln_estscan->{$id};
		#}
	}
	# $further becomes undef when every seed was given a prediction
	$further=$toDECODER;
	my $size_estscan= keys (%$transln_estscan);
	
	$transln_estscan=p4e2::getDNA::fromESTScan($transln_estscan,'estscan_nt.fsa',\%seeds,$nuc_code,$gen_cod_file);
	
	print colored ("\nWriting translations...\n",'green bold');
	#print colored ("\nWriting translations...\n",'black bold');
	
	&outfiles('open');
	
	for my $id (keys %$transln_estscan)	{
		progress($count++,$size_estscan);
				
		# peptide id: seed id with a 'C' in third position replaced by 'P'
		(my $id_p = $id) =~ s/^(\w\w)C/$1P/;
		my $dub=0;
		$dub=1 if scalar @{$transln_estscan->{$id}} == 2;	#there's two of them.
		
		# each seed maps to a list of translation records
		foreach my $transln (@{$transln_estscan->{$id}})	{
			$estscan_write++;
			my ($main,$hsp,$prop)=&get_stats_4_psql($id_p,$transln, $hsp_ref_counter);
			my $desc = "putative nuclear encoded protein  Method: ESTScan";
			# With two translations the id is given an _n or _p suffix
			# according to $main->[6] (-1 or 1).  $id_p is changed in place,
			# so the optional group replaces a suffix left by the previous
			# translation instead of stacking a second one.
			if ($dub)	{
				$id_p =~ s/(_p)?$/_n/ if $main->[6]=='-1';
				$id_p =~ s/(_n)?$/_p/ if $main->[6]=='1';
			}
			$main->[0]=$id_p;
			$main->[1]=$id;
			$main->[2]='estscan';
			$main->[3]='nuclear-encoded';
		
			my $xtn=Bio::SeqIO->new(-file=>">>translations_xtn.fsa", -format=>'fasta');
			my $noxtn=Bio::SeqIO->new(-file=>">>translations_noxtn.fsa", -format=>'fasta');
			my $coding=Bio::SeqIO->new(-file=>">>nt_coding.fsa",-format=>'fasta');
			my $seqobj = Bio::Seq->new(-display_id=>$id_p, -desc=>$desc, -seq=>$transln->[3]);
			$xtn->write_seq($seqobj);
			#change the object to write coding region
			$seqobj->seq($transln->[7]);
			$coding->write_seq($seqobj);
			# the ESTScan description has no " and extension" text, so this
			# substitution leaves it unchanged
			my $d=$seqobj->desc;
			$d=~s/\sand\sextension//g;
			$seqobj->desc($d);
			# NOTE(review): element [3] is written to the no-extension file
			# as well; the similarity section writes [2] there - confirm
			# that this is intended for ESTScan records.
			$seqobj->seq($transln->[3]);
			$noxtn->write_seq($seqobj);
			
			print DB_MAIN join ",", @$main;
			print DB_MAIN "\n";
		
		}
	}	
	&outfiles('close');
	
	# local to this block; number of seeds still without a translation
	my $size_further=0;
	if ($further)	{
		$size_further=scalar @$further;
	}
	print colored ("\n\nSUMMARY:\tESTScan completed\n\t\tNumber of sequences translated: $size_estscan\n\t\tNumber of sequences sent for further processing: $size_further\n\n",'blue bold');
	print INFO "\nESTScan parsing completed\n\t\tNumber of sequences translated: $size_estscan\n\t\tNumber of sequences sent for further processing: $size_further\n\n";

}

###########################################################################
#SECTION 9
#
#Checking the DECODER predictions
#There are issues with the location of peptides with respect to the est
#will be sorted out soon.
###########################################################################	
# DECODER pass over the seeds that are still untranslated.  Unless
# pre-computed results were supplied ($pre_DEC), DECODER is set up and run
# in both directions first.  Each seed's result is then parsed; usable
# predictions are written out and the remaining seeds are handed on through
# $further.
my $dec_write=0;	# number of DECODER translations written
if ($further && $dec_flag)	{

	my $off2further;	# array ref of seeds DECODER could not deal with

	unless ($pre_DEC)	{
		$off2further=&setup_decoder($sequaldir,$qualsuff,$seqsuff,$decoderbiastable,$further);
		&run_decoder('decoderfiles','5','../decoder');
		&run_decoder('decoderfiles','3','../decoder');
		$pre_DEC='decoderfiles';
	}

	my $count=1;
	my $dec_seq;
	print colored ("\n\nParsing the DECODER results...\n", 'green bold');
	my $size_input = @$further;
	foreach my $seqobj (@$further)	{
		my $id=$seqobj->display_id;
		my $pred=&decoder_parse($id, $pre_DEC);	#check the length of the putative translation

		progress($count++, $size_input);
		if ($pred->[5] == 0)	{		#if poor DECoder prediction
			push @$off2further, $seqobj;
			next;
		}

		# decoder_parse() fields used here: [0] start, [1] end, [2] frame,
		# [3] nucleotide length, [4] predicted sequence.  They are
		# rearranged into the record layout used for the other
		# translations: [0]=[frame,start], [1]=[frame,end],
		# [2]=sequence, [6]=nucleotide length.
		my ($start, $end, $frame, $ntlen, $pred_seq) = @{$pred}[0 .. 4];
		$dec_seq->{$id}->[0][1]=$start;
		$dec_seq->{$id}->[0][0]=$frame;
		$dec_seq->{$id}->[1][1]=$end;
		$dec_seq->{$id}->[1][0]=$frame;
		$dec_seq->{$id}->[6]=$ntlen;
		$dec_seq->{$id}->[2]=$pred_seq;
	}

	my $size_dec= keys (%$dec_seq);

	my $transln_dec=p4e2::getDNA::fromDECODER($dec_seq,$pre_DEC,$nuc_code,$gen_cod_file);

	print colored ("\nWriting Translations...\n", 'green bold');
	$count=1;

###########################################################################
#SECTION 9b
#
#printing the DECODER predictions.
###########################################################################
	&outfiles('open');

	for my $id (keys %$transln_dec)	{
		progress($count++,$size_dec);
		# peptide id: seed id with a 'C' in third position replaced by 'P'
		(my $id_p = $id) =~ s/^(\w\w)C/$1P/;

		my ($main,$hsp,$prop)=&get_stats_4_psql($id_p,$transln_dec->{$id}, $hsp_ref_counter);

		my $desc = "putative nuclear encoded protein  Method: decoder";
		$main->[0]=$id_p;
		$main->[1]=$id;
		$main->[2]='decoder';
		$main->[3]='nuclear-encoded';

		my $xtn=Bio::SeqIO->new(-file=>">>translations_xtn.fsa", -format=>'fasta');
		my $noxtn=Bio::SeqIO->new(-file=>">>translations_noxtn.fsa", -format=>'fasta');
		my $coding=Bio::SeqIO->new(-file=>">>nt_coding.fsa",-format=>'fasta');
		# one sequence object is reused: peptide, then coding region, then
		# the peptide again for the no-extension file
		my $seqobj = Bio::Seq->new(-display_id=>$id_p, -desc=>$desc, -seq=>$transln_dec->{$id}->[2]);
		$xtn->write_seq($seqobj);
		#change the object to write coding region
		$seqobj->seq($transln_dec->{$id}->[7]);
		$coding->write_seq($seqobj);
		my $d=$seqobj->desc;
		$d=~s/\sand\sextension//g;
		$seqobj->seq($transln_dec->{$id}->[2]);
		$seqobj->desc($d);
		$noxtn->write_seq($seqobj);
		$dec_write++;
		print DB_MAIN join ",", @$main;
		print DB_MAIN "\n";

	}
	&outfiles('close');

	# $off2further is undefined when nothing was rejected.  A plain truth
	# test on the reference replaces "defined @$off2further", which recent
	# versions of perl refuse to compile.
	my $size_further = $off2further ? scalar @$off2further : 0;

	print colored ("\n\nSUMMARY:\tDECODER process completed\n\t\tNumber of sequences translated: $size_dec\n\t\tNumber of sequences sent for further processing: $size_further\n\n",'blue bold');
	print INFO "\nDECODER parsing completed\n\t\tNumber of sequences translated: $size_dec\n\t\tNumber of sequences sent for further processing: $size_further\n\n";

	$further=$off2further;
}

###########################################################################
#SECTION 11
#
#finding and printing the longest Open Reading Frame
###########################################################################	

my $long_write=0;	#number of longest ORF based translations written out
if ($further)	{
	my $size2sixf = scalar (@$further);
	print colored ("\n\nParsing sixframe translations for $size2sixf sequences...\n",'green bold');

	#Index every six-frame peptide by its parent sequence id (the trailing _<number>
	#is stripped from the peptide id) so that each sequence is looked up only once.
	my %sixframe;
	my $pep_in=Bio::SeqIO->new(-file=>'nucl_6pep.fa');
	while (my $pep=$pep_in->next_seq)	{
		(my $parent=$pep->display_id) =~ s/_\d+$//;
		push @{$sixframe{$parent}}, $pep->seq;
	}

	#Let parse_sixframe choose a translation for each remaining sequence.
	my $transln_sixf;
	my $count=1;
	for my $nuc (@$further)	{
		my $id=$nuc->display_id;
		$transln_sixf->{$id}=&parse_sixframe($id,$size2sixf,$count++,$nuc->length,$sixframe{$id});
	}
	p4e2::getDNA::fromORF($transln_sixf,\%seeds,$nuc_code,$gen_cod_file);

	print colored ("\nWriting translations...\n",'green bold');
	$count=1;
	&outfiles('open');

	for my $id (keys %$transln_sixf)	{
		progress($count++,$size2sixf);
		my $record=$transln_sixf->{$id};
		(my $id_p = $id) =~ s/^(\w\w)C/$1P/;	#peptide id: third character C becomes P

		#Only the first returned element (the main table row) is used here.
		my ($main)=&get_stats_4_psql($id_p,$record);
		$main->[0]=$id_p;
		$main->[1]=$id;
		$main->[2]='longest_orf';
		$main->[3]='nuclear-encoded';

		my $xtn=Bio::SeqIO->new(-file=>">>translations_xtn.fsa", -format=>'fasta');
		my $noxtn=Bio::SeqIO->new(-file=>">>translations_noxtn.fsa", -format=>'fasta');
		my $coding=Bio::SeqIO->new(-file=>">>nt_coding.fsa",-format=>'fasta');
		my $seqobj = Bio::Seq->new(
			-display_id=>$id_p,
			-desc=>"putative nuclear encoded protein  Method: Longest ORF",
			-seq=>$record->[2],
		);

		#1. the peptide as predicted
		$xtn->write_seq($seqobj);
		#2. the same object carrying slot [7] of the record, written to the coding file
		$seqobj->seq($record->[7]);
		$coding->write_seq($seqobj);
		#3. the peptide again, with any "and extension" removed from the description
		(my $plain_desc=$seqobj->desc) =~ s/\sand\sextension//g;
		$seqobj->desc($plain_desc);
		$seqobj->seq($record->[2]);
		$noxtn->write_seq($seqobj);

		$long_write++;
		print DB_MAIN join ",", @$main;
		print DB_MAIN "\n";
	}
	&outfiles('close');
}

#Closing summary for the user: names the output directory and lists the primary output files.
#NOTE(review): the fourth entry is spelt "prot_hsps.pqsl" while the third is "prot_main.psql" -
#presumably a typo in the message; confirm against the file names created by outfiles().
print colored ("\n\nAll ESTs have now been processed.\nIn the directory $out_dir contains four primary output files:\n",'bold');
print colored ("1. translations_xtn.fsa - fasta file with the predicted peptides including extensions\n2. translations_noxtn.fsa - fasta file with the predicted peptides without extensions\n3. prot_main.psql - information and stats for each translation; CSV for easy database upload\n4. prot_hsps.pqsl - details on the hsps used to construct any similarity based translations; CSV for easy database upload\n\n",'bold');

#Everything below this point is subroutine definitions only.
exit(0);	#and finish...
##################
#SUBs
##################

	
################################################################################################################
sub print_title	{
	#Prints the prot4EST banner in bold, one row per print call.
	#$version is the file-level release string and is placed between the ### borders.
	#Returns the result of the last print.
	my $rule  = "##################################################";
	my $blank = "###                                            ###";
	my @banner = (
		$rule,
		$blank,
		"###               prot4EST                     ###",
		$blank,
		"###$version###",
		$blank,
		"###  a script that converts EST sequence into  ###",
		"###  amino acid sequence taking frame shift,   ###",
		"###  substitutions et al into consideration.   ###",
		$blank,
		$rule,
	);
	my $printed;
	foreach my $row (@banner)	{
		$printed = print colored("\t$row\n","bold");
	}
	return $printed;
}
################################################################################################################
sub title_page	{
	#Shows the banner and the start-up menu, then keeps reading input until the user
	#enters one of 1, 2, 3 or 4 (surrounding white space allowed).
	#Returns the accepted input line exactly as read, i.e. still carrying its newline.
	sleep(2);	#pregnant pause so that any error messages are not lost
	print_title();
	foreach my $menu_line (
		"\n\n\tPlease set up the config file:\n",
		"\n\t1. Create a configuration file.\n",
		"\n\t2. Use or Edit an existing configuration file.\n",
		"\n\t3. Get Help.\n",
		"\n\t4. Exit Program.\n\n\n",
	)	{
		print $menu_line;
	}

	my $answer;
	do	{
		$answer=<>;
	} until ($answer=~/^\s*[1234]\s*$/);
	return $answer;
}
################################################################################################################
sub make_config	{
	#Creates a template configuration file called 'config' in the current directory
	#and opens it in the pico editor so the user can fill it in.
	#
	#If 'config' already exists the user is asked whether to overwrite it:
	#  answer containing 'n' - the existing file is opened in pico; returns 'config'
	#  answer containing 'y' - the file is overwritten with a fresh template
	#  anything else         - the question is asked again
	#
	#Returns 'config' when an existing file was kept, 1 when a new template was written.
	#Dies if the template cannot be written.
	while (-f 'config')	{
		print colored ("\n### Warning! ###\n","green bold");
		print colored ("config already exists\noverwrite [Y/N]\n","green bold");
		my $decision=<STDIN>;
		if ($decision=~m/n/i)	{system "pico -w config"; return 'config';}
		last if ($decision=~m/y/i);
	}

	#'or' rather than '||': with '||' the test was applied to the file name string,
	#so a failed open was never noticed.
	open my $conf, '>', 'config' or die "\n\nCannot open config: $!\n";
	my $date = `date`;	#keeps its trailing newline, which ends the first line of the template
	print {$conf}	"\tConfig file for PROT4EST created $date\n\n".
				"For help on any of these please consult the README file\n\n".
				"#Full path to fasta input file, e.g. /home/joe/EST/rubellus.fsa\n".
				"1.   Input File [fasta format]:\n\n".
				"#prot4EST will create this directory, e.g. 'output' will be created in the\n".
				"#directory p4e is launched from\n".
				"2.   Output Directory:\n\n".
				"#e.g. Lumbricus rubellus\n".
				"3.   Organism Name (full):\n\n".
				"4.   Location of genetic code file: /usr/local/ncbi/data/gc.prt \n\n".
				"#Fasta and BLAST files containing these sequences are included in the prot4EST release.\n".
				"#Enter the full path.\n".
				"5.   Ribosomal RNA BLAST database:\n".
				"6.   Mitochondria BLAST database [protein]:\n\n".
				"#The defaults are shown\n".
				"7.   Evalue for rRNA search (BLASTN): 1e-65\n".
				"8.   Evalue for BLASTX: 1e-8\n\n".
				"#If you have previous carried out BLASTx search on these sequences then enter the path to the report file\n".
				"#or directory containing only these files\n".
				"#If left blank then prot4EST assumes you wish to carry out a BLASTx search on these sequences\n".
				"#You are advised to read the userguide regarding this option\n".
				"9.    Location of pre-computed BLASTX report files/directory:\n\n".
				#the DECODER entries are numbered 10a-10d; the hint used to point at 9a-9d
				"#Fill in all entries for 10a-c OR just 10d (if DECODER has already been run on these sequences)\n".
				"#e.g. /home/joe/partigene/protein\n".
				"10a.  Path to sequence and quality files [protein directory]:\n".
				"#defaults shown\n".
				"10b.  Suffix for EST sequence files: seq\n".
				"10c.  Suffix for EST quailty files: qlt\n".
				"	 or\n".
				"10d.  Path to pre-computed DECODER results:\n\n".
				"11. ESTScan Matrix File [optional]:\n\n".
				"12. Codon Usage Table (gcg format) [optional]:\n\n\n".
				"For help on any of these please consult the user guide\n";
	close $conf or die "\n\nCannot write config: $!\n";
	system "pico -w config";
	return 1;
}
################################################################################################################
sub upload	{
	#Asks the user for the path to an existing configuration file and offers to
	#use it as it is or to edit it in pico first.
	#
	#Returns the path as typed by the user, or 0 if it is not a plain file.
	my $pwd = `pwd`;
	chomp $pwd;
	print colored ("Please provide path to config file...(current directory: $pwd )\n",'bold');
	my $file = <STDIN>;
	chomp $file;
	return 0 unless (-f $file);

	while (1)	{
		print colored ("\nConfiguration file found\nWould you like to Use or Edit this file?\n[U/E]\n","bold");
		chomp (my $choice = (<STDIN>));
		#any answer containing a 'u' means use; otherwise any answer containing an 'e' means edit
		last if ($choice=~m/\s*U\s*/i);
		if ($choice=~m/\s*E\s*/i)	{
			#list form: the path is handed to pico as a single argument and never passes
			#through a shell, so spaces or shell characters in the name are harmless
			system 'pico', '-w', $file;
			last;
		}
	}
	return $file;
}
################################################################################################################
sub parse_conf	{
	#Reads a prot4EST configuration file.
	#
	#Argument: path to the configuration file.
	#Returns:  hash reference mapping option names (see %opt_key) to the values
	#          entered; options left blank are absent. undef if nothing was entered.
	#Dies if the file cannot be opened.
	#
	#An entry is a line of the form  <number>. <free text>: <value>
	#where the value is one or two "words" made of letters, digits, '_', '-', '/'
	#and '.', e.g. a path, an e-value or a two word species name.
	my $confile=shift;
	my $options;
	my %opt_key = (
		'1' => 'in',
		'2' => 'out',
		'3' => 'species',
		'4' => 'gen_code',
		'5' => 'rnafile',
		'6' => 'mitofile',
		'7' => 'blastnE',
		'8' => 'blastxE',
		'9' => 'pre-blast',
		'10a' => 'sequaldir',
		'10b' => 'seqsuff',
		'10c' => 'qualsuff',
		'10d' => 'pre-DEC',
		'11' => 'estscanmatrix',
		'12' => 'cut',
	);	#hash table of options - allows easier maintenance of code
	
	#'or' rather than '||': with '||' the test was applied to the file name string,
	#so a failed open was never noticed.
	open my $conf_fh, '<', $confile or die "\n\nCannot open config file '$confile': $!\n";
	while (my $line = <$conf_fh>)	{
		#this is quite strict, but that's what is best.
		#The second word is optional so that single character values are accepted too.
		next unless ($line =~ m/^(\d{1,2}(?:[abcde]?))\..*?:\s*([-\w\/.]+(?:\s+[-\w\/.]+)?)\s*/s);
		my ($number, $value) = ($1, $2);
		next unless (exists $opt_key{$number});	#not an entry we know about
		$options->{$opt_key{$number}}=$value;
	}
	close $conf_fh;
	return $options;
}
################################################################################################################
sub check_files	{
	#Validates everything named in the parsed configuration before the run starts.
	#
	#Arguments: $options  - hash reference as returned by parse_conf
	#           $dec_flag - true if the DECODER related entries are to be checked
	#Returns:   1 on the first problem found (after reporting it to the screen and,
	#           for most checks, to the ERROR handle);
	#           (0, $in, $pre_blast) on success, where $in and $pre_blast are the
	#           possibly rewritten paths (a directory of input files or BLAST reports
	#           is concatenated into a single file in the current directory).
	#Exits the program if the user declines to overwrite input_seqs.fsa or if the
	#DECODER sequence/quality files cannot all be found.
	#Writes to the LOG and ERROR handles, which are expected to be open already.
	my $options=shift;
	my $dec_flag=shift;
	
#need to convert the options href into scalar variables.  Make code easier to maintain...

	my $in = $options->{'in'};
	my $out = $options->{'out'};
	my $species = $options->{'species'};
	my $rnafile = $options->{'rnafile'};
	my $mitofile = $options->{'mitofile'};
	my $blastnE = $options->{'blastnE'};
	my $blastxE = $options->{'blastxE'};
	my $pre_blast = $options->{'pre-blast'};
	my $sequaldir = $options->{'sequaldir'};
	my $seqsuff = $options->{'seqsuff'};
	my $qualsuff = $options->{'qualsuff'};
	my $pre_DEC = $options->{'pre-DEC'};
	my $estscanmatrix = $options->{'estscanmatrix'};
	my $cut=$options->{'cut'};
	my $gen_code=$options->{gen_code};

	print LOG   "Environmental variables:\n".`printenv`."\n\n\n".
				"Options for prot4EST run:\n";
				
	for my $key (keys %$options)	{
		print LOG "$key => ".$options->{$key}."\n";
	}
	
	my $err_msg = "\n### Error! ###";
	
#check input file and format
	if (-d $in)	{	#the input is a bunch of files in a directory
					#what expected from partigene process
		my @files = glob("$in/*");
		my $count_prot=0;	#start at zero so the summary line always shows a number
		
		while (-e 'input_seqs.fsa')	{
			print "Have found a file called input_seqs.fsa.  Overwrite this file? [Y/N]\n";
			my $choice;
			chomp ($choice = <STDIN>);
			if ($choice =~ m/\s*Y(?:ES)?\s*/i)	{
				unlink 'input_seqs.fsa';
				last;
			}
			elsif ($choice =~ m/\s*N(?:O)?\s*/i)	{
				print "prot4EST will exit so you can move the file\n";
				exit;
			}
			else	{	#typed something else
				next;
			}
		}
		
		print "Searching for sequence input...\n";
		print "Found ", scalar @files," files...\t";
		foreach my $file (@files)	{
			my $error=p4e2::p4e_checker::sequence($file, 'IUPAC');
			if (scalar @$error == 0)	{	#valid sequence
				system "cat $file >> input_seqs.fsa"; 			
				$count_prot++;
			}
			else { next; }
		}
		print "Of which $count_prot were included in the input file\n";
		
		my $pwd=getcwd();
		
		$in = "$pwd/input_seqs.fsa";
		
		print colored ("\n$in accepted and format verified\n","green");
	}				
	elsif (-f $in)	{
		my $error= p4e2::p4e_checker::sequence($in, 'IUPAC');
		
		if (scalar @$error)	{
			print colored ("$err_msg\n","red bold");
			print colored ("Input file: $in is not in fasta format!\nProblem in sequence:\n","red bold");
			foreach (@$error)	{
			print "$_\n";
			print ERROR "Input file: $in is not in fasta format!\nProblem in sequence:\n$_\n\n";
			}
			return 1;		
		}
		print colored ("\n$in accepted and format verified\n","green");
	}
	else {
			print colored ("$err_msg\nCannot find input file: $in:\n","red bold");
			print ERROR "Cannot find input file: $in:\n";
			return 1; 
	}

#check organism name format
	{my $error=p4e2::p4e_checker::species($species);
	
		#only the first reported problem is shown, as the loop returns straight away
		foreach my $species (@$error)	{
			print colored ("$err_msg\nSpecies name: ","red bold");
			print "$species";
			print ERROR "Error in species name: $species\n";
			print colored (" is in an invalid format\n","red bold");
			print "If you are sure it is VALID then please refer to the user guide section \"Configuration Checks\"\n";
			return 1;
		}
	}

#the genetic code file must exist and contain a "Genetic-code-table ::=" line
	unless (-f $gen_code)	{
		print colored ("$err_msg\nCannot find Genetic Code file: ","red bold");	
		print "$gen_code\n";
		return 1;
	}
	else	{
		open GEN, "<$gen_code" or die "$!\n";
		my $check;
		while (<GEN>)	{
			$check++ if (m/Genetic-code-table\s::=/)
		}
		close GEN;
		unless ($check)	{
			print colored ("$err_msg\nGenetic Code file: ","red bold");	
			print "$gen_code is not in the correct format. Consult UserGuide\n\n";
			return 1;
		}
	}	
#is the RNA file present and accessible?	
	if (-f $rnafile)	{
		my $error1 = p4e2::p4e_checker::sequence($rnafile, 'IUPAC');
		
		if (scalar @$error1 >= 1)	{
			print colored ("$err_msg\nrRNA file: $rnafile is not in fasta format!\nProblem in sequence:\n","red bold");
			foreach (@$error1)	{
				print "$_\n";
				print ERROR "rRNA file: $rnafile is not in fasta format!\nProblem in sequence:\n$_\n\n";
				return 1;
			}
		}	
		my $error2 = p4e2::p4e_checker::bls_db($rnafile, 'nuc');	
		if (scalar @$error2 >= 1)	{
			#NOTE(review): missing BLAST index files are reported but do not stop the run - confirm this is intended
			print colored("$err_msg\nThe following blast database files were not found:\n", 'red bold');
			#no parentheses directly after print, otherwise the newline is not part of its argument list
			print join(", ", @$error2), "\n";
		}	
		print colored ("\n$rnafile accepted and format verified\n","green");
	}
	else	{
		print colored ("$err_msg\nCannot find rRNA file: $rnafile\nPlease check file\n",'red bold');
		print ERROR "Cannot find rRNA file: $rnafile\n";
		return 1;
	}

#do same for mitochondria protein file	
	if (-f $mitofile)	{
		my $error1 = p4e2::p4e_checker::sequence($mitofile, 'protein');
		if (scalar @$error1 >= 1)	{
			print colored ("$err_msg\nMitochondria Protein file: $mitofile is not in fasta format or is NOT PROTEIN!\nProblem in sequence:\n","red bold");
			foreach (@$error1)	{
				print "$_\n";
				print ERROR "Mitochondria Protein file: $mitofile is not in fasta format or is NOT PROTEIN!\nProblem in sequence:\n$_\n";
				return 1;
			}		
		}
		my $error2 = p4e2::p4e_checker::bls_db($mitofile, 'pro');
		if (scalar @$error2 >= 1)	{
			#NOTE(review): as for the rRNA database, this is reported but not treated as fatal
			print colored("$err_msg\nThe following blast database files were not found:\n", 'red bold');
			print join(", ", @$error2), "\n";
		}	
		print colored ("\n$mitofile accepted and format verified\n","green");
	}
	else	{
		print colored ("$err_msg\nCannot find mitochondria protein file: $mitofile\nPlease check file\n",'red bold');
		print ERROR "Cannot find mitochondria protein file: $mitofile\n";
		return 1;
	}

#so if pre-computed BLAST report file selected can it be found?	
	if ($pre_blast && -e $pre_blast)	{
		print colored ("\n$pre_blast found\n\n","green");
		if (-d $pre_blast)	{			#its a directory
			#gather every file that passes the blastfile check into one report file
			unlink 'all_blast_reports.bls' if -f 'all_blast_reports.bls';
			foreach my $file (glob("$pre_blast/*"))	{
				my $error =	p4e2::p4e_checker::blastfile($file);
				if ($error eq 'none')	{
					system "cat $file >> all_blast_reports.bls";
				}
				else	{ next;}
			}
			my $pwd=`pwd`;
			chomp $pwd;
			$pre_blast="$pwd/all_blast_reports.bls";
		}
	}
	elsif ($pre_blast)	{	#so it is an option but does not exist	
		print colored ("$err_msg\nCannot find BLAST results in file/directory: $pre_blast\n","red bold");
		print ERROR "Cannot find BLAST results in file/directory: $pre_blast\n";
		return 1;
	}
	
#now checking DECODER.
#make sure the options were selected correctly.
	if ($dec_flag)	{
		print colored ("checking sequence and quality files...\n",'green'); 
		if ((-d $sequaldir) && $seqsuff && $qualsuff)	{	#provided information for running DECODER
			if ($pre_DEC)	{
				print colored ("$err_msg\nYou have selected to run DECODER and provided a path to pre-computed results\n",'red bold');
				print colored ("Please return to the configuration file and choose the desired option\n",'red bold');
				print ERROR "Selected to run DECODER and provided path to pre-computed results\n"; 
				return 1;
			}
			if (p4e2::p4e_checker::decoder($sequaldir, $seqsuff, $qualsuff))	{#need to make sure that all the entries have sequence and quality files
				print colored ("$err_msg\nThere was an error is identifying all the files for DECODER.\nLook at file 'decoder_errors.txt'.\n",'red bold');
				print ERROR "There was an error is identifying all the files for DECODER.\nLook at file 'decoder_errors.txt'.\n";
				exit;
			}
			else	{print colored ("All components for DECODER located\n",'green');}
		}
		if ($pre_DEC && -d "$pre_DEC/Peptide.5" && -d "$pre_DEC/Peptide.3")	{print colored ("Located DECODER output Files\n",'green');}
		elsif ($pre_DEC)	{ #so provided the option but the files cannot be found
			print colored ("$err_msg\nCannot find DECODER output files.\nPlease check the entry in the config file or select to run DECODER\n",'red bold');
			print ERROR "Cannot find DECODER output files.\nPlease check the entry in the config file or select to run DECODER\n";
			return 1;
		}
		
		elsif (!$sequaldir)	{
			print colored ("$err_msg\nCannot find quality file directory: $sequaldir\n",'red bold');
			print ERROR "Cannot find quality file directory for DECODER: $sequaldir\n";
			return 1;
		}
		elsif (!-d $sequaldir)	{	#an entry was given but it is not a directory
			#this test used to sit inside the block above, which is only entered when the
			#directory exists, so a wrong path was never reported
			print colored ("$err_msg\nCannot find the directory containing the Phrap quality files\nIs '$sequaldir' correct?\n",'red bold');
			print ERROR "Cannot find the directory containing the Phrap quality files: $sequaldir\n";
			return 1;
		}
		elsif (!$seqsuff || !$qualsuff)	{	#both suffixes are needed to run DECODER
			print colored ("$err_msg\nHave not provided an entry for est file suffix or quality file suffix in the configuration file\n",'red bold');
			print ERROR "Have not provided an entry for est file suffix or quality file suffix in the configuration file\n";
			return 1;
		} 
		#has the user provided a codon usage table in file format?
		if ($cut && -f $cut)	{
			print colored ("Codon Usage Table provided,",'green');
			my $error=p4e2::p4e_checker::cut_file($cut);
			if ($error)	{
				print colored ("$err_msg\nCodon Usage Table file: $cut\t invalid format\n","red bold");
				print $error,"\n";
				print colored ("Expecting gcg format\n","red");
				return 1;
			}
			else	{print colored ("accepted",'green');}
		}
		elsif ($cut)	{	#an entry was given but it is not a file
			print colored("$err_msg\nCodon Usage Table file: $cut\tnot found or not accessible!\n","red bold");
			return 1;
		}
	}
	if ($estscanmatrix)	{	#the matrix is optional; only checked when an entry was given
		unless (-f $estscanmatrix)	{
			print colored ("$err_msg\nThe ESTScan matrix specified: '$estscanmatrix' cannot be found\n",'red bold');
			return 1;
		}
		my $error=p4e2::p4e_checker::estscanmatrix($estscanmatrix);
		
		if (scalar @$error>=1)	{
			print colored ("$err_msg\nMatrix file: $estscanmatrix\t invalid format\n","red bold");
			return 1;
		}
		else	{ print colored ("Located ESTScan matrix\n","green");}
	}
	
	print colored ("\n\nConfig file has been read and all variables accepted\n\n","green");
	
	return (0, $in, $pre_blast);
}	 
################################################################################################################
sub convert_e	{
	#Converts the two e-value cut-offs from the configuration with exp2lin.
	#
	#Argument: hash reference holding the entries 'blastnE' (rRNA search) and
	#          'blastxE' (BLASTX search).
	#Returns:  (undef, rRNA e-value, BLASTX e-value) when both were converted;
	#          1 as soon as one of them is missing or exp2lin returns a false value
	#          for it (the problem is reported on screen and to the ERROR handle).
	my $conf = $_[0];
	my $e_rRNA = $conf->{'blastnE'};
	my $e_blast = $conf->{'blastxE'};

	#--- rRNA search cut-off ---
	unless ($e_rRNA)	{
		print colored ("### Error! ###\n","red bold");
		print colored ("Unable to recognise e-value for rRNA search\n","red bold");
		print colored ("Is there a value entered?\n","red bold");
		print ERROR "Unable to recognise rRNA evalue: $e_rRNA\n";
		return 1;
	}
	my $rRNA_lin=&exp2lin($e_rRNA);
	unless ($rRNA_lin)	{
		print colored ("### Error! ###\n","red bold");
		print colored ("Unable to recognise e-value for rRNA search\n","red bold");
		print colored ("Is there a typo in $e_rRNA ?\n","red bold");
		print ERROR "Unable to recognise rRNA evalue: $e_rRNA\n";
		return 1;
	}
	$e_rRNA=$rRNA_lin;

	#--- BLASTX cut-off ---
	unless ($e_blast)	{
		print colored ("### Error! ###\n","red bold");
		print colored ("Unable to recognise e-value for blast\n","red bold");
		print colored ("Is there a value entered?\n","red bold");
		print ERROR "Unable to recognise blastx evalue: $e_blast\n";
		return 1;
	}
	my $blast_lin=&exp2lin($e_blast);
	unless ($blast_lin)	{
		print colored ("### Error! ###\n","red bold");
		print colored ("Unable to recognise e-value for Blast\n","red bold");
		print colored ("Is there a typo in $e_blast ?\n","red bold");
		print ERROR "Unable to recognise blastx evalue: $e_blast\n";
		return 1;
	}
	$e_blast=$blast_lin;

	return (undef, $e_rRNA, $e_blast);
}
################################################################################################################
sub getcodon	{
	#Fetches a codon usage table (GCG style) for a species from the Kazusa codon
	#usage database with wget and saves it to a file.
	#
	#Arguments: $spe_input   - species name to search for
	#           $output_file - file the codon table is written to
	#Returns:   1
	#Dies if the output file cannot be written.
	#
	#Interactive: when nothing is found the user is asked for another species name;
	#when matches are found the user picks one from a numbered list and is asked to
	#confirm a table derived from fewer than 5000 codons.
	#Uses the scratch files qret, fret and cret<N> in the current directory and
	#removes them at the end. Writes the chosen table to the INFO handle as well.

	my $spe_input=shift;
	my $output_file = shift;

	print colored ("fetching a codon bias table for $spe_input.\nthis may take a few seconds\n","bold");
	
	my $base_url = 'http://www.kazusa.or.jp/codon/cgi-bin';
	my (@choices,@choiceURLs,$numchoices);
	my $file_count = 1;

	#--- 1. search until at least one species with a codon table is found ---
	while (1) {
		chomp $spe_input;

		(my $spe_query = $spe_input )=~s/ /\+/g; #insert + for url

		print colored ("Searching, please wait...\n","bold");
		#get 'spsearch answer' web page for $spe_input
		#(list form of system: the species name never passes through a shell)
		system ('wget', '-w', '10', '-q', '-Oqret', "$base_url/spsearch.cgi?species=$spe_query&c=i");

		my ($choices, $choiceURLs) = ('', '');
		if (open (my $q_ret, '<', 'qret')) { #open the web page file
			while (my $line=<$q_ret>) {
				next unless ($line =~ /<A HREF\="\/codon\/cgi-bin\/showcodon.cgi\?species=(.*)">/);
				my $thisURL = $1;	#save part of this url
				#open each wepage returned for that sp
				system ('wget', '-q', "-Ocret$file_count", "$base_url/showcodon.cgi?species=$thisURL&aa=1&style=GCG");
				if (open (my $t_ret, '<', "cret$file_count")) {
					while (my $line1 =<$t_ret>) {
						if ($line1 =~ /<STRONG>Not found/) { #if there is no codon table
							$thisURL ='';	#empty thisURl if no codon table
						}
						elsif ($line1 =~ /<STRONG><i>(.*)<\/STRONG>/) {	#if codon table present, parse + keep the details
							my $ch_add = $1;
							$ch_add =~ s/<\/i>.*:/-/;
							$ch_add =~ s/<\/STRONG>//;
							$choices .= "$ch_add\n";
							#save thisURL, if it hasn't been deleted (cos no codon table)
							$choiceURLs .= "$thisURL\n" if ($thisURL);
						}
					}
					close $t_ret;
				}
				$file_count++;
			}
			close $q_ret;
		}

		@choiceURLs = split /\n/, $choiceURLs;
		@choices = split /\n/, $choices;
		$numchoices = @choices;
		last if ($numchoices);	#match(es) found, exit while loop

		print colored ("\n### Warning! ###\n","green bold");
		print colored ("Sorry, there are $numchoices matches for $spe_input.\nIs the organism name correct?\n","green bold");
		print colored ("Otherwise you may have to choose another, closely related organism.  For full list check out http://www.kazusa.or.jp/codon/\n\n","green bold");
		print colored ("Please select another species:\n","white");
		$spe_input=<STDIN>;
	}

	#--- 2. let the user choose one of the matches ---
	my ($si, $spe1);
	my $CDS_flag = 1;
	while ($CDS_flag) {	#display choices until told to continue
		my $count = 1;
		print colored ("There are $numchoices matches for $spe_input:\n","bold");
		foreach (@choices) {	
			print colored ("$count. $_\n","bold");
			$count++;
		}
		my $num_flag = 1;
		while ($num_flag) {
			print colored ("\nPlease select a number to use: ","bold");
			$si = <>;
			chomp $si;
			if ($si <= $numchoices && $si >= 1) {	#check that number entered is in correct range
				$num_flag = 0;
			} else { 
				print colored ("\nYou must choose a number between 1 and $numchoices.\n", "green bold");
			}
		}
		$si = $si -1;	#convert $si to computer talk

		$spe1 = $choices[$si];		#get readable form of spe (with codon numbers)
		#take the codon count from this match only; if the text has an unexpected form
		#the count is unknown and the user is asked to confirm, as for a small table
		my ($codons) = $spe1 =~ /\((\d+) codons\)$/;

		if (!defined $codons || $codons < 5000) {	#flag warning if number of codons in selected codon table is less than 5000
			my $shown = defined $codons ? $codons : 'an unknown number of';
			print colored ("The codon table you have selected was derived from just $shown codons.\n","green bold");
			print colored ("Enter 'c' to continue, or any other key to choose again\n","green bold");
			my $input = <>;
			chomp $input;
			if ($input =~ /c/i) {
				$CDS_flag = 0;
			}
		} else {
			$CDS_flag = 0;
		}
	}	
	#NOTE(review): @choices and @choiceURLs are filled side by side above; this lookup
	#assumes they stay in step - confirm against the current page layout
	my $spe = $choiceURLs[$si]; #url fragment of the selected species

	#--- 3. fetch the selected table and save it ---
	system ('wget', '-q', '-Ofret', "$base_url/showcodon.cgi?species=$spe&aa=1&style=GCG");
	
	my $return_data = '';
	if (open (my $f_ret, '<', 'fret')) {
		while (my $line=<$f_ret>) {
			$return_data .= $line;
		}
		close $f_ret;
	}
	
	my $codon_table = '';
	if ($return_data =~ /Fraction.*?\.\.\n(.*)<\/PRE>/s) { 	#extract codon table from web page
		 $codon_table = $1;
	}
	else {
		warn "No codon table could be extracted from the page returned for $spe1; $output_file will be empty\n";
	}

	open (my $output, '>', $output_file) or die "Cannot write codon table to $output_file: $!\n"; #save codon table to file
	print INFO "The Codon Bias table used for $spe:\n$codon_table\n";
	print {$output} $codon_table;
	close $output or die "Cannot write codon table to $output_file: $!\n";
	print colored ("\nThe codon table for $spe1 has been saved to $output_file\n\n\n","bold");
	unlink glob('cret*'), 'fret', 'qret';	#remove the scratch files
	return 1;
}

################################################################################################################
sub setup_decoder	{
	#Builds the directory tree DECODER works in and fills it with the sequence and
	#quality files of the sequences that are to be translated.
	#
	#Arguments: $path2files - directory holding the sequence and quality files
	#           $qualsuff   - suffix of the quality files
	#           $seqsuff    - suffix of the sequence files
	#           $cut        - codon usage table file; moved into decoderfiles/
	#           $seqs       - array reference of sequence objects. MODIFIED IN PLACE:
	#                         sequences without a quality file are removed from it
	#Returns:   array reference of the sequence objects that were removed.
	#Dies if a file cannot be copied.
	#
	#Changes into the new directories while working and finishes with two chdir '../'
	#calls, which lead back to the directory it was started from.
	print colored ("Setting up DECODER\nThis may take a few minutes\n","green bold");
	
	my $path2files=shift;
	my $qualsuff=shift;
	my $seqsuff=shift;
	my $cut=shift;
	my $seqs=shift;
	my $dir='decoderfiles';		
	mkdir "$dir", 0755;
	system('mv', $cut, "$dir/");
	my $T="$dir/Translate";
	my $T5="$dir/Translate.5";
	my $T3="$dir/Translate.3";
	my $P5="$dir/Peptide.5";
	my $P3="$dir/Peptide.3";
	foreach my $subdir ($T, $T5, $T3, $P5, $P3)	{ mkdir $subdir, 0755; }

	#Sort every sequence into "has files" / "has no files". Every element is visited:
	#the previous index loop stopped one short of the end and, because it spliced the
	#array while counting upwards, also skipped the element after each removal.
	my (@withFiles, @notThere);
	foreach my $seqobj (@$seqs)	{
		my $id=$seqobj->display_id;
		$id=~s/\.Contig/_/;	#the files are named <cluster>_<n> rather than <cluster>.Contig<n>
		if (-e "$path2files/$id.$qualsuff")	{
			copy("$path2files/$id.$qualsuff", $T5) or die "Cannot copy $path2files/$id.$qualsuff to $T5: $!\n";
			copy("$path2files/$id.$seqsuff", $T5) or die "Cannot copy $path2files/$id.$seqsuff to $T5: $!\n";
			push @withFiles, $seqobj;
		}
		else	{
			push @notThere, $seqobj;
		}
	}
	@$seqs=@withFiles;	#the caller's array now only holds what DECODER will see
		
	chdir $T5;
	FILENAME_CHANGE($qualsuff, 'qlt');	#give the copies the suffixes used from here on
	FILENAME_CHANGE($seqsuff, 'seq');
	chdir '../';	#move into decoder directory
	revDNA();	#write reversed copies into Translate.3
	chdir '../';	#move into outdir
	return \@notThere;
}
################################################################################################################
sub FILENAME_CHANGE {
	#Renames files in the current directory, replacing part of their name.
	#
	#Arguments: $_[0] - part of the file name to change, e.g. a suffix. It is used
	#                   as a regular expression.
	#           $_[1] - replacement string
	#A file is renamed when its name matches the old string, does not start with a
	#dot and does not already contain the replacement (unless the replacement is empty).
	#Dies if the current directory cannot be read; a failed rename is only warned about.
	my $old_symbol=$_[0];		#part of file name to change ie contigs.qual
	my $new_symbol=$_[1];		#replacement string
	opendir(my $dh, "./")  || die "Cannot open dir";		#open the current dir
	my @entries = readdir($dh);	#take the listing first, so renaming cannot disturb the scan
	closedir($dh);

	foreach my $file (@entries) {
#if the new string is empty or the current file name doesn't contain the new 
#string and doesn't start with a stop
		if(($new_symbol eq '' || $file!~/$new_symbol/) && $file!~/^\./ && $file=~m/$old_symbol/)	{
			my $new_file = $file;						#define $new_file as old file name
			$new_file=~s/$old_symbol/$new_symbol/g;	#substitute old string with new
			#rename() takes the names as they are; no shell is involved, so names that
			#contain quotes or spaces are handled too
			rename($file, $new_file) or warn "Cannot rename $file to $new_file: $!\n";
		}
	}
	return;
}
################################################################################################################
sub revDNA	{
	#Goes through ./Translate.5 and hands every fasta record found there to a writer:
	#records from files with 'seq' in their name go to gorev, records from files with
	#'qlt' in their name go to revqual. A name containing both is processed twice.
	#Takes no arguments; expects to be run from the directory holding Translate.5.
	#Dies if a matching file cannot be opened.
	#
	#The record buffer and the last header seen are kept outside the per-file
	#routine on purpose: they are not cleared when a new file is started, only when
	#a header line is read.
	#NOTE(review): lines are joined without a separator, so for quality files the last
	#value of one line and the first of the next run together unless each line ends
	#in white space - confirm against the quality file format in use.
	my ($buffer, $last_header);

	my $process = sub {
		my ($name, $writer) = @_;
		open(INPUT, "<./Translate.5/$name")  || die "Unable to open Input\n";
		my $header='';	#header of the record being collected
		while (my $line = <INPUT>)	{
			if ($line=~/^(>.+)/)	{
				$last_header=$1;
				#a new header closes the previous record, if there was one
				$writer->($buffer,$header,$name) if ($header=~/./);
				$buffer='';
				next;
			}
			chomp $line;
			$buffer.=$line;
			$header=$last_header;
		}
		$writer->($buffer,$header,$name);	#the record still open at end of file
		close(INPUT);
	};

	opendir(DIR,"./Translate.5/");
	while (my $entry=readdir(DIR))	{
		$process->($entry, \&gorev) if ($entry=~/seq/);
		$process->($entry, \&revqual) if ($entry=~/qlt/);
	}
}
################################################################################################################
sub gorev {
	#Appends the reverse complement of one sequence, in fasta format, to
	#./Translate.3/<file>.
	#
	#Arguments: $seq    - nucleotide sequence
	#           $title1 - fasta header line, including the leading '>'
	#           $file   - name of the file in ./Translate.3 to append to
	#Dies if the output file cannot be opened.
	#
	#A, C, G and T are complemented in either case and written in upper case; every
	#other character is passed on unchanged. Space characters are dropped. The
	#sequence is written 50 characters to a line.
	#(The comparisons used before, of the form  eq ('A'||'a'),  only ever tested for
	#the upper case letter, so lower case sequence was reversed but not complemented.)
	#The empty prototype was removed; this sub takes three arguments.
	my $seq=shift;
	my $title1=shift;
	my $file=shift;

	$seq = '' unless defined $seq;
	(my $bases = $seq) =~ tr/ //d;
	my $revcomp = scalar reverse $bases;
	$revcomp =~ tr/ACGTacgt/TGCATGCA/;

	open(my $out, '>>', "./Translate.3/$file") || die "Can't open ./Translate.3/$file: $!\n";
	print {$out} "$title1\n";
	while ($revcomp =~ /(.{1,50})/gs)	{
		print {$out} "$1\n";
	}
	close $out;
}
################################################################################################################
sub revqual() {
	#Appends one record of quality values, in reverse order, to ./Translate.3/<file>.
	#
	#Arguments: $seq    - white space separated quality values
	#           $title1 - header line, including the leading '>'
	#           $file   - name of the file in ./Translate.3 to append to
	#Dies if the output file cannot be opened.
	#
	#Each value is followed by a single space; 50 values are written per line.
	my ($seq, $title1, $file) = @_;
	my @values = reverse split(' ',$seq);

	open(OUTFILE, ">>./Translate.3/$file") || die "Can't open $file.rev\n";
	print OUTFILE "$title1\n";
	while (my @row = splice(@values, 0, 50))	{
		foreach my $value (@row)	{
			print OUTFILE "$value ";
		}
		print OUTFILE "\n";
	}
	close OUTFILE;
}
################################################################################################################
sub run_decoder	{
	#Runs DECODER on every sequence file of one strand.
	#
	#Arguments: $dir          - DECODER work directory (holds Translate.N and Peptide.N)
	#           $direction    - '5' or '3'; selects Translate.N / Peptide.N
	#           $path2decoder - command that starts DECODER, as seen from inside $dir
	#Returns:   the result of the final chdir '../'
	#Dies if $dir or the Translate directory cannot be entered/read.
	#
	#For every <name>.seq in Translate.N the sequence and its <name>.qlt are copied
	#to input.seq / input.qlt, DECODER is run, and its output files result.pep,
	#result.qlt and result.tfa are moved to Peptide.N as <name>.pep, <name>.rqt and
	#<name>.tfa.
	my $dir=shift;
	my $direction = shift;
	my $path2decoder = shift;
	chdir $dir or die "Cannot change into $dir: $!\n";

	my $in_dir="Translate.$direction";
	my $pep_dir="Peptide.$direction";
	opendir(my $dh,"./$in_dir") or die "Cannot read $in_dir: $!\n";			#open Translate.5 or .3 Dir
	my @seq_files = sort grep { /^[^.].*\.seq$/ } readdir($dh);
	closedir($dh);

	my $total_count=scalar(@seq_files);	#the progress figure is based on the files actually processed
	my $current_count=0;
	print colored ("\nTranslating $direction' sequences with DECODER\n","green bold");
	
	foreach my $seq_file (@seq_files)	{
		#Work from the name without its suffix; substituting the first 'seq', 'qlt', ...
		#found anywhere in the name goes wrong as soon as the id itself contains one.
		(my $stem = $seq_file) =~ s/\.seq$//;
		system('cp', "$in_dir/$seq_file", 'input.seq');	#copy the file to input.seq
		system('cp', "$in_dir/$stem.qlt", 'input.qlt');	#copy the file to input.qlt
		#'> /dev/null 2>&1' is understood by every sh; '>& /dev/null' is a csh/bash
		#form that fails with "Bad fd number" where /bin/sh is dash
		system("$path2decoder > /dev/null 2>&1");		#run decoder to make result.pep & .qlt & .tfa
		system('mv', 'result.pep', "./$pep_dir/$stem.pep");	#move result.pep to Peptide dir and rename
		system('mv', 'result.qlt', "./$pep_dir/$stem.rqt");	#move result.qlt to Peptide dir and rename to .rqt
		system('mv', 'result.tfa', "$pep_dir/$stem.tfa");	#move result.tfa to Peptide dir and rename
		$current_count++;
		my $progress = sprintf "%5.2f", ($current_count/$total_count)*100;
		print colored("\r$progress%",'green bold');
	}
	return chdir '../';
}

################################################################################################################
sub strip_html	{
#Removes the HTML mark-up from an HTML formatted BLAST report.
#Argument: path to the HTML file.
#Returns:  'blast_results.txt', the file in the current directory the stripped
#          text is written to.
#Dies if the input cannot be read or the output cannot be written.
#
#This works for all examples tested on, but there are many types of HTML formatted BLAST.
#If this doesn't seem to work then please email me with the problem or solution you may come across.
	my $html_file=shift;

	open HTML, "<$html_file" or die;
	my $string = join '', <HTML>;	#whole report in one string
	close HTML;

	for ($string)	{
		# Removing "<a name =...>" and adding the '>' character for 
		# HSP alignment listings.
		s/(\A|\n)<a name ?=[^>]+> ?/>/sgi;

		# Removing all "<>" tags. 
		s/<[^>]+>|&nbsp//sgi;

		# Re-uniting any lone '>' characters.
		s/(\A|\n)>\s+/\n\n>/sgi;
	}

	#NOTE(review): the text is written with a leading "string_after=" and one extra
	#newline at the end; this looks like a debugging leftover - confirm whether the
	#reader of blast_results.txt relies on it before removing it.
	open BLS, ">blast_results.txt" or die;
	print BLS "string_after=$string\n";
	close BLS;

	return 'blast_results.txt';
}


################################################################################################################
sub run_blast	{
#Builds a Bio::Tools::Run::StandAloneBlast factory from the arguments and
#runs blastall on $seqs2blast.
#
#Arguments (positional):
#	$seqs2blast    query handed to blastall
#	$dbase_path    value for the 'd' parameter
#	$blast_method  value for the 'program' parameter
#	$e             value for the 'e' parameter
#	$o             optional; when true it is passed as the 'o' parameter
#	$f             value for the 'F' parameter; defaults to 'T' when false
#	$gcode         value for the 'Q' parameter (passed as given, may be undef)
#
#Returns whatever blastall returns.  Progress lines are written to LOG.
	my($seqs2blast, $dbase_path, $blast_method, $e, $o, $f, $gcode) = @_;
	$f = 'T' unless $f;

	print LOG "<RUN_BLAST>\ndatabase = $dbase_path  method=$blast_method\n";

	#The 'o' pair is only present when an output name was supplied;
	#every other parameter is always sent, in the same order as before.
	my @output_param = $o ? ('o' => "$o") : ();
	my @params = (
		'd'           => "$dbase_path",
		'program'     => "$blast_method",
		'e'           => "$e",
		@output_param,
		'b'           => '25',
		'v'           => '25',
		'_READMETHOD' => "Blast",
		'F'           => $f,
		'Q'           => $gcode,
	);

	my $factory = Bio::Tools::Run::StandAloneBlast->new(@params);
	my $blast_report = $factory->blastall($seqs2blast);
	print LOG "Leaving <BLAST_SEARCH_DBASE>\n";
	return ($blast_report);
}
################################################################################################################
sub split_blast	{
#Walks a BLAST report and collects, per query, the leading HSPs whose
#e-value is below $evalue.
#
#Arguments:
#	$in           either an object whose class name contains "blast" (used as is)
#	              or a file name that is opened with Bio::SearchIO
#	$evalue       HSPs must have an e-value strictly below this
#	$total_count  when true, progress() is called for every accepted query
#	$seeds        ref to a hash keyed by query name; results for queries that
#	              are not in this hash are skipped.  This matters when the user
#	              supplies a previous set of results that may hold queries which
#	              are not in the current query set.
#
#Returns (\%hsp_box, $check_ref, $tophit_name):
#	%hsp_box      query name => array of accepted HSP objects
#	$check_ref    hash ref, query name => '1' for every query that was examined
#	$tophit_name  name of the first hit of the last result that had any hit
	my ($in, $evalue, $total_count, $seeds) = @_;

	my ($check_ref, %hsp_box, $tophit_name);
	my $current_count = 1;
	my $num_of_seeds = keys %$seeds;

	#the input is either a ready-made report object or a file to parse
	my $searchio = (ref($in) =~ m/blast/i)
		? $in
		: Bio::SearchIO->new(-format => 'blast',
		                     -file   => $in);

	BLAST: while (my $results = $searchio->next_result)	{
		(my $query_name = $results->query_name) =~ s/,.*//;	### Added for hgmp output
		$query_name =~ s/;//;					#stray semi-colons need removing

		#only queries we were asked about, and only the first result for each
		next BLAST if !exists $seeds->{$query_name} or exists $hsp_box{$query_name};

		progress($current_count++, $num_of_seeds) if $total_count;
		$check_ref->{$query_name} = '1';

		my $hsp_count = 0;
		my $have_top_hit = 0;
		while (my $hit = $results->next_hit)	{
			unless ($have_top_hit)	{	#remember the first hit of this result only
				$tophit_name = $hit->name;
				$have_top_hit = 1;
			}
			while (my $hsp = $hit->next_hsp)	{
				#the first HSP that fails the cutoff ends this query
				next BLAST unless $hsp->evalue < $evalue;
				push @{$hsp_box{$query_name}}, $hsp;
				#the count is tested after storing, so up to six HSPs are kept
				next BLAST if ++$hsp_count > 5;
			}
		}
	}
	return (\%hsp_box, $check_ref, $tophit_name);
}
##############################################################################
{
 #Index of the next frame to draw.  It lives outside the sub so that it
 #survives between calls; a "my" inside the sub restarted at 0 every time
 #and the spinner never moved.
 my $spinner_pos;

 sub spinner	{
 #Prints the next character of a four-frame text spinner followed by a
 #backspace, so that successive calls redraw the same screen cell.
 #Takes no arguments.
  my @frames = ('|', '/', '-', '\\');
  $spinner_pos = 0 unless defined $spinner_pos;
  print $frames[$spinner_pos], "\b";
  $spinner_pos = ($spinner_pos + 1) % scalar(@frames);	#wrap round to the first frame
 }
}
##############################################################################
sub progress	{
#Prints "\r<percentage>%" (two decimal places, green bold) for
#$current out of $total, overwriting the current terminal line.
#Nothing is printed when $total is zero, undefined or not a number,
#because a percentage cannot be computed (this used to be a fatal
#"Illegal division by zero").
	my $current=shift;
	my $total=shift;
	return if !$total || $total == 0;
	my $percentage = ($current/$total)*100;
	my $progress = sprintf "%5.2f", $percentage;
	print colored ("\r$progress%",'green bold');
}
############################################################################
sub get_fasta	{
#Looks up one translated frame of a sequence in an already loaded list of
#sequence objects.
#It is possible to use BioPerl modules to parse the sequences out of the
#previous alignment, however searching the loaded fasta entries is simpler.
#
#Arguments:
#	[0] sequence identifier
#	[1] frame of the HSP; a negative frame -N is looked up as N+3 because
#	    transeq numbers the six frames 1-6
#	[2] ref to an array of objects offering display_id() and seq()
#
#Returns the sequence string of the entry whose display_id is
#"<identifier>_<frame>", or undef (after printing a warning) when none is found.
	my ($query_id, $hsp_frame, $aref) = @_;

	$hsp_frame = $1 + 3 if ($hsp_frame =~ m/-(\d)/);	#map -1..-3 onto 4..6

	my $seq_id = join('_', $query_id, $hsp_frame);
	my @seqs_fromfile = @$aref;
	my $seq_full;

	foreach my $candidate (@seqs_fromfile)	{
		next unless ($candidate->display_id eq $seq_id);
		$seq_full = $candidate->seq;
		last if ($seq_full);	#stop at the first entry that yields a sequence
	}

	unless ($seq_full)	{
		print colored ("### WARNING ###\nNon-Fatal Error: $seq_id\tNo sequence found\n",'green bold');
	}
	print LOG "Leaving <GET_FASTA>\n";
	return $seq_full;
}
###################################################################################################################
sub decoder_parse	{
#Reads the DECODER prediction for one sequence and summarises it.
#
#Arguments:
#	$input_id  sequence identifier (".Contig" is rewritten to "_" to form the file name)
#	$path      directory holding Peptide.5/ and Peptide.3/.  When it contains
#	           "prot4estdecoderfiles" the "prot4est" tag is removed and the path
#	           is taken relative to the current directory.
#
#Returns an array ref:
#	[0],[1]  start and end read from the "cds N - N" part of the header
#	[2]      signed frame derived from the start position and the strand
#	[3]      length of the prediction after trailing X's are removed
#	[4]      the predicted sequence (undef when no prediction file exists)
#	[5]      '1' when the prediction is accepted, otherwise '0'
#
#Dies or exits when the fasta header of the prediction cannot be understood.
	my ($input_id, $path)=@_;
	my $decoder_pred;
	print LOG "<DECODER_PARSE>\n";

	$input_id =~ s/\.Contig/_/g;
	$path =~ s/\/$//;			#remove end / if in path.

	my $root;
	if ($path =~ m/prot4estdecoderfiles/)	{	#uses the results created by this process
		$path =~ s/prot4est//;			#remove identifier
		$root = "./$path";
	}
	else	{					#use the absolute path given in config file
		$root = $path;
	}
	my ($decoderfile5, $decoderfile3) = map { "$root/Peptide.$_/$input_id.pep" } (5, 3);

	#both of these could be the correct translation!
	#Pick the larger file; the 5' file wins a tie.
	my ($big, $strand) = ($decoderfile5, '1');
	if (-s "$decoderfile3" > -s "$decoderfile5")	{
		($big, $strand) = ($decoderfile3, '-1');
	}

	unless (-f $big)	{	#no prediction at all
		$decoder_pred->[5]='0';
		$decoder_pred->[4]=undef;
		print LOG "Leaving <DECODER_PARSE>\n";
		return $decoder_pred;
	}

	my $in = Bio::SeqIO->new(-file=>"$big" , -format=>'Fasta');
	while (my $seq = $in->next_seq())	{
		my $acc  = $seq->display_id;
		my $desc = $seq->desc();

		(my $trimmed = $seq->seq) =~ s/X+$//g;	#drop trailing X's
		$seq->seq($trimmed);
		my $seqlen = $seq->length;
		$decoder_pred->[3]=$seqlen;

		unless ($acc && $desc && $seqlen)	{
			die "\n\nERROR: acc=$acc\tdesc=$desc\tlength=$seqlen\n$er_msg";
		}
		$acc =~ s/^(\w+_?\d?).*/$1/i;
		$seq->display_id($acc);

		#the header carries the nucleotide and amino acid lengths
		my ($bp, $aa);
		if ($desc =~ m/\w+\s+(\d+)\sbp\s+(\d+)\saa.+/)	{
			($bp, $aa) = ($1, $2);
		}
		else	{
			print colored("\n\n### ERROR ###\n$acc - problem with header in DECODER proediction\n$er_msg",'red bold');
			exit(1);
		}

		#determine start and end of peptide w.r.t. the EST
		if ($desc =~ m/cds\s+(\d+)\s+-\s+(\d+)/)	{
			my ($start, $end) = ($1, $2);
			my @frame_of_rem = (3, 1, 2);	#remainder maps to frame.
			$decoder_pred->[0]=$start;
			$decoder_pred->[1]=$end;
			#plus strand: start modulo three indicates the frame.
			#minus strand: the distance from the start to $seqlen is used instead.
			#NOTE(review): $seqlen is the length of the sequence read from the .pep file,
			#not the bp value from the header - confirm this is intended.
			my $rem = ($strand eq '1')
				? $start % 3
				: ($seqlen - $start + 1) % 3;
			$decoder_pred->[2]=$strand*$frame_of_rem[$rem];
		}

		my $max_trans = sprintf("%.0f", $bp/3);		#convert length of contig to aa
		my $max_10pc = sprintf("%.0f", $max_trans*0.1);	#10% of $max_trans.  Dear user feel free to alter this.
								#Let me know how you get on.

		#rejected when shorter than 30 residues or shorter than 10% of what the contig could encode
		$decoder_pred->[5] = ($seqlen < 30 || $seqlen < $max_10pc) ? '0' : '1';
		$decoder_pred->[4] = $seq->seq;
	}

	print LOG "Leaving <DECODER_PARSE>\n";
	return $decoder_pred;
}

################################################################################################################
sub parse_sixframe	{
#Finds the longest uninterrupted stretch (between '*' characters) in the six
#frame translation of a sequence and maps it back on to the nucleotide sequence.
#
#Arguments:
#	$seqid         identifier, only used in error messages
#	$size, $count  total and current position, passed on to progress()
#	$seed_seq_len  length of the nucleotide sequence
#	$seqs          an aref to all sixframes of the sequence in question,
#	               frames 1-3 first, then the minus strand frames 4-6
#
#Returns an array ref:
#	[0][0], [1][0]  frame of the stretch (minus strand frames are returned as -1..-3)
#	[0][1], [1][1]  nucleotide start and end of the stretch
#	[2]             the stretch itself, with leading/trailing X's removed
#
#Dies when no stretch exists or the coordinates fall outside the sequence;
#prints diagnostics and exits with status 1 when the stretch cannot be relocated.
	my $seqid=shift;
	my $size=shift;
	my $count=shift;
	my $seed_seq_len=shift;
	my $seqs=shift;
	print LOG "<PARSE_SIXFRAME>\n";
	my $frame=0;
	my $longest=0;
	my ($longest_frame, $longest_seq, $aa_start);

	foreach my $seq (@$seqs)	{
		$frame++;
		while ($seq=~/\**(\w+)\**?/gs)	{ 	#this allows nnnnnnn* , *nnnnnn , *nnnnnn*
			my $current=$1;
			$current=~s/X+$//i;
			$current=~s/^X+//i;
			my $seq_len=length($current);
			if ($seq_len>$longest)	{
				$longest=$seq_len;
				$longest_seq=$current;
				$longest_frame=$frame;
			}
		}
	}

	#Without this guard an empty translation indexed $seqs->[-1] and carried on
	#with meaningless coordinates.
	if (!defined $longest_frame)	{
		die "No peptide found in any of the six frames for $seqid\n";
	}

	#now need to find its location w.r.t to the EST
	my $est_seq=$seqs->[$longest_frame-1];

	my $sixf_stats;
	if ($est_seq=~m/\Q$longest_seq\E/)	{
		$aa_start=$-[0]+1;	#1-based position of the stretch within its frame
	}
	else	{
		print colored("###Error###\nCannot determine location of putative peptide\n", 'red bold');
		print "$longest_frame\n$est_seq\n$longest_seq\n";
		print ":";
		print join "\n:", @$seqs;
		print "\n";
		exit(1);	#this is a failure, so do not report success to the shell
	}


	if ($longest_frame > 3)	{	#minus strand
		my $offset=($aa_start*3)-(7-$longest_frame);
		my $l1=$seed_seq_len;
		my $nt_end = $l1 - $offset;	#second term is the distance between end of EST and the start of peptide.
		my $l2=length($longest_seq)*3;
		my $nt_start = (($nt_end-$l2)+1);
		if ($nt_start<1)	{
			if ($longest_seq=~m/[LVSPTARG]/)	{	#examples where the 3rd position in codon is redundant
				$nt_start=1;						#so transeq uses a two ntide codon.
			}
		}
		die "DIED $nt_start, $longest_frame, $aa_start, $l1, $l2, $nt_end, $seqid,\n$longest_seq\n" if $nt_start < 1;
		die "DIED peptide end $nt_end lies beyond sequence length $l1 for $seqid\n" if $nt_end > $l1;
		$sixf_stats->[0][1]=$nt_start;
		$sixf_stats->[1][1]=$nt_end;
		$longest_frame = ($longest_frame-3)*-1;
	}
	else	{
		my $offset=$longest_frame-1;
		my $nt_start=$aa_start*3-2+$offset;
		$sixf_stats->[0][1]=$nt_start;
		$sixf_stats->[1][1]=$nt_start+3*length($longest_seq)-1;
	}
	$sixf_stats->[0][0]=$longest_frame;
	$sixf_stats->[1][0]=$longest_frame;
	$sixf_stats->[2]=$longest_seq;

	print LOG "Leaving <PARSE_SIXFRAME>\n";

	progress ($count, $size);

	return $sixf_stats;
}
################################################################################################################
sub exp2lin	{
#Converts a number written in exponential notation (e.g. 1e-5) into plain
#decimal notation (0.00001).
#
#Argument: the number as a string.
#Returns:
#	the decimal string, without trailing zeros, for exponential input;
#	the argument unchanged when it is already a plain decimal number;
#	0 when the argument is not recognised as a number.
	my $e=shift;
	if (defined $e && $e =~ m/((\d+\.?\d*|\.\d+)[eE][+-]?(\d+))/)	{
		my ($exp, $mantissa, $power) = ($1, $2, $3);

		#Decimals needed: the exponent plus the fractional digits of the
		#mantissa.  Using the exponent alone silently dropped digits
		#(1.23e-5 came back as 0.000012).  The extra one guarantees that a
		#decimal point is present so that only fractional zeros are stripped.
		my $frac_digits = ($mantissa =~ m/\.(\d+)/) ? length($1) : 0;
		my $limit = $power + $frac_digits + 1;

		#A lexical is used for the result; assigning to $_ here would
		#overwrite the caller's $_.
		my $linear = sprintf "%.${limit}f", $exp;
		$linear =~ s/0+$//;
		$linear =~ s/\.$//;
		return $linear;
	}
	elsif (defined $e && $e =~ m/^\s*(?:\d+\.?\d*|\.\d+)\s*$/)	{
		#already linear (this now includes single digit values such as "5")
		return $e;
	}
	else {
		return 0; #error - can't recognise the number
	}
}
################################################################################################################
sub outfiles	{
#Opens (in append mode) or closes the three psql output files through the
#package filehandles DB_MAIN, DB_HSPS and DB_PROP.
#
#Argument: 'open' or 'close'.  Any other value prints a message and dies.
	my ($action) = @_;

	if ($action ne 'open' and $action ne 'close')	{
		print "Cannot determine file action: $action!\n";
		die;
	}

	if ($action eq 'open')	{
		open DB_MAIN, ">>prot_main.psql" or die;
		open DB_HSPS, ">>prot_hsps.psql" or die;
		open DB_PROP, ">>prot_prop.psql" or die;
	}
	else	{
		close DB_MAIN;
		close DB_HSPS;
		close DB_PROP;
	}
}
################################################################################################################
sub get_stats_4_psql	{
#Re-arranges the statistics of one translation into the rows that are
#written to the psql files.
#
#Arguments:
#	$id         identifier placed in every HSP row
#	$seq_stats  array ref; [0] and [1] describe the start and end as
#	            [frame, confident position, extension position],
#	            [5] is a list of HSP records (array refs)
#	$counter    number given to the first HSP row; later rows count up from it
#
#Returns (\@main, \@hsps, \@prop):
#	@main  slots 4-9 are filled (extn start, conf start, start frame,
#	       conf end, extn end, end frame); a missing extension becomes ''
#	@hsps  one row per HSP: [row number, $id, first five fields of the HSP]
#	@prop  always empty here
	my ($id, $seq_stats, $counter) = @_;

	my (@main, @hsps, @prop);
	@main[5, 6, 7, 9] = (
		$seq_stats->[0][1],	#conf start
		$seq_stats->[0][0],	#start frame
		$seq_stats->[1][1],	#conf end
		$seq_stats->[1][0],	#end frame
	);
	$main[4] = $seq_stats->[0][2] ? $seq_stats->[0][2] : '';	#extn start
	$main[8] = $seq_stats->[1][2] ? $seq_stats->[1][2] : '';	#extn end

	#remove for release.
	#a '-' in any of the four positions is reported, once per offending value
	for my $position (@main[4, 5, 7, 8])	{
		next unless ($position =~ m/-/);
		print "$id caused a crash: $main[4],$main[5],$main[7],$main[8]\n";
	}

	foreach my $hsp (@{$seq_stats->[5]})	{
		push @hsps, [ $counter++, $id, map { $hsp->[$_] } 0 .. 4 ];
	}

	return (\@main,\@hsps,\@prop);
}
################################################################################################################
sub get_location	{
#Places a predicted translation on its nucleotide sequence by searching it
#(tblastn) against 'input.fsa' and extending the outermost HSPs to the full
#length of the translation.
#
#Arguments:
#	$id           identifier the top hit is expected to carry
#	$transln_seq  the translation; handed to run_blast and asked for length/seq
#	$evalue       e-value given to run_blast
#	$method       name of the prediction method, used in the INFO message only
#
#Returns ($seq_stat, $failed):
#	$seq_stat  undef on failure, otherwise
#	           [0] = [frame of the 5' most HSP, start position]
#	           [1] = [frame of the 3' most HSP, end position]
#	           [2] = the translation string
#	$failed    $id when the search did not confirm the prediction, else undef
	my ($id, $transln_seq, $evalue, $method) = @_;

	my ($seq_stat,$failed);

	my $bls_report = run_blast($transln_seq,'input.fsa','tblastn',$evalue,'out','F');
	my %seed = ($id => '');		#split_blast needs a $seeds entry
	my ($hsps,undef,$top_name) = split_blast($bls_report,'0.0001',undef,\%seed);

	#first ensure that the top hit matches what we think (hope) it is...
	if ($top_name ne $id)	{
		print INFO  "$method prediction for $id is not robust.  Fails in BLAST search.  Non-Fatal.\n";
		$failed = $id;
	}
	elsif (!$hsps->{$id}[0])	{	#no HSP passed the cutoff
		$failed = $id;
	}
	else	{
	#need to find the frame of the most 5' hsp, the start amino acid and extend
	#this has separate functionality to the tile_path sub routine.

		my $start_loc=10000000;
		my $end_loc;
		my ($five_frame,$three_frame);
		my ($q_start, $q_end);

		foreach my $hsp (@{$hsps->{$id}})	{
			#on the minus strand the hit's end/start are swapped
			my ($start, $end) = ($hsp->hit->strand =~m/-/)
				? ($hsp->hit->end,   $hsp->hit->start)
				: ($hsp->hit->start, $hsp->hit->end);

			if ($start<$start_loc)	{	#lets look at start.
				$start_loc=$start;
				$five_frame=($hsp->hit->frame+1)*$hsp->hit->strand;
				$q_start=$hsp->query->start;
			}
			if ($end>$end_loc)	{		#lets look at end.
				$end_loc=$end;
				$three_frame=($hsp->hit->frame+1)*$hsp->hit->strand;
				$q_end=$hsp->query->end;
			}
		}

		#the blast search gives the location of the translation mapped onto the nucleotide.
		#However the hsp may not contain the entire translation.

		my $query_fullength = $transln_seq->length;

		if ($q_start != 1)	{		#the blast hit is not at the start of the translation
			my $missing_nt = ($q_start-1)*3;
			$start_loc += ($five_frame=~m/-/) ? $missing_nt : -$missing_nt;
		}
		if ($q_end != $query_fullength)	{		#blast hit has not found the end of the translation
			my $missing_nt = ($query_fullength-$q_end)*3;
			$end_loc += ($three_frame=~m/-/) ? -$missing_nt : $missing_nt;
		}

		$seq_stat->[0][0]=$five_frame;
		$seq_stat->[1][0]=$three_frame;

		$seq_stat->[0][1]=$start_loc;
		$seq_stat->[1][1]=$end_loc;
		$seq_stat->[2]=$transln_seq->seq;
	}
	return ($seq_stat, $failed);
}

################################################################################################################

__END__

