#!/usr/bin/perl -w

#######################################################################
#######################################################################
#  Copyright 2008 Roney S. Coimbra

#  This file is part of genealiases

#  genealiases is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 3 of the License.
#  genealiases is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.

# You should have received a copy of the GNU General Public License along with genealiases (file: COPYING).  If not, see <http://www.gnu.org/licenses/>.

#######################################################################
#######################################################################


# Prompt for one parameter and return the answer.
#
# Arguments:
#   1. prompt text shown to the user
#   2. default value, shown in square brackets; an empty string marks the
#      parameter as mandatory
#
# Returns the line that was read, with its line terminator removed, or the
# default when the answer is empty.
#
# Dies with "input data missing" when the answer is empty and there is no
# default.
sub get_parameter {
  my ($prompt, $default) = @_;
  $default = '' unless defined $default;

  print "$prompt [$default]: ";
  my $rc = <>;

  # At end of input <> yields undef; treat that like an empty answer so the
  # default/mandatory logic below applies without "uninitialized" warnings.
  $rc = '' unless defined $rc;

  # Remove the line terminator, including the "\r" of CRLF input, so that
  # it cannot leak into file names or command arguments built from the value.
  $rc =~ s/[\r\n]+\z//;

  return $rc      if $rc ne '';
  return $default if $default ne '';
  die "input data missing";
}


# Show the copyright and licence notice before any question is asked.
# The heredoc is single-quoted, so the text is printed verbatim.
print <<'END_OF_BANNER';

##########################################################
Copyright 2008 Roney S. Coimbra
GeneAliases is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 3 of the License.
GeneAliases is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
##########################################################

END_OF_BANNER

# Collect the run configuration interactively, one prompt per value.
# Parameters whose default is "" are mandatory: get_parameter dies when
# they are left empty.  The others fall back to the default shown in
# square brackets.

# Input files (mandatory).
my $gene_names = get_parameter("List of canonical gene names and their aliases tab separated, one gene per line", "");
my $input_baseline = get_parameter("Baseline abstract set", "");

# "yes" runs main_program_baseline.pl on the baseline set before the genes
# are processed.
my $pre_proc_baseline = get_parameter("Pre-process baseline abstract corpora (yes or no)", "no");

my $exclud_dict = get_parameter("Dictionary of terms to be excluded by default", "");

# The remaining values are not interpreted here; they are forwarded
# unchanged to commander.pl for every gene.
my $number_of_abstracts = get_parameter("Maximal number of abstracts to be fetched from Medline", "100");
my $use_filters = get_parameter("Use filters (yes or no)", "yes");
my $filter1 = get_parameter("Select entries with at least this number of abstracts in Pubmed", "1");
my $min_size = get_parameter("Minimal number of abstracts retrieved to include names", "1");
my $cutoff_baseline = get_parameter("Maximal term frequency allowed in baseline - terms exceeding this threshold are excluded", ".05");
my $t = get_parameter("t value for filter 2 of main program", ".15");
my $k = get_parameter("k value for filter 2 of main program", "1.5");
my $filter4 = get_parameter("Discards words present in abstracts of more than the 1/f4 of all entries", "1");



# Stop early unless all three input files were named: the gene list, the
# baseline abstract set and the exclusion dictionary.
unless ($gene_names && $input_baseline && $exclud_dict) {
  print "No gene names, baseline abstracts, or exclusion dictionary\n";
  exit;
}

# Optionally pre-process the baseline abstract corpus.
# The command is given to system() as a list so that the baseline path is
# passed as one argument, untouched by the shell, even when it contains
# spaces or shell metacharacters.
if ($pre_proc_baseline eq "yes") {
  system('main_program_baseline.pl', $input_baseline) == 0
    or warn "main_program_baseline.pl failed (\$? = $?)\n";
}

# Process the gene list: one gene per line, tab-separated fields, canonical
# name in the first field.  For every gene:
#   1. make sure a directory named after the canonical name exists;
#   2. append every field of the line to ./NAME/NAME.list, one per line,
#      written as "alias<TAB>alias";
#   3. run commander.pl on that list together with the run configuration.
# The last three fields of the line are handed to commander.pl as the
# "unrelated" names (NOTE(review): presumably control names -- confirm
# against commander.pl).
open my $gene_list_fh, '<', $gene_names
  or die "Cannot open gene list '$gene_names': $!\n";

while (my $gene_line = <$gene_list_fh>) {

  # Strip the line terminator, including the "\r" of CRLF files.
  $gene_line =~ s/[\r\n]+\z//;

  my @aliases = split /\t/, $gene_line;

  # Blank lines and lines starting with a tab carry no canonical name.
  next unless @aliases && length $aliases[0];

  my $canonical_name = $aliases[0];

  # Last, second-to-last and third-to-last field, in that order.
  my @unrelated_names = map { $aliases[ @aliases - $_ ] } 1 .. 3;

  # A single-field line leaves the third name undefined, which would hand
  # commander.pl a misaligned argument list.
  if (grep { !defined } @unrelated_names) {
    warn "Skipping line $. of '$gene_names': too few fields for '$canonical_name'\n";
    next;
  }

  # Built-in mkdir instead of a shell command: names with spaces or shell
  # metacharacters are handled literally.  An existing directory is reused.
  unless (-d $canonical_name || mkdir $canonical_name) {
    warn "Cannot create directory '$canonical_name': $!\n";
    next;
  }

  my $aliases_list_file = join('/', ".", $canonical_name, $canonical_name) . ".list";

  # Append mode: entries from an earlier run on the same gene are kept.
  my $aliases_list_fh;
  unless (open $aliases_list_fh, '>>', $aliases_list_file) {
    warn "Cannot open '$aliases_list_file' for appending: $!\n";
    next;
  }
  for my $alias (@aliases) {
    print {$aliases_list_fh} "$alias\t$alias\n";
  }
  close $aliases_list_fh
    or warn "Error while closing '$aliases_list_file': $!\n";

  # List form of system(): every value reaches commander.pl as exactly one
  # positional argument, without shell interpretation.
  my @commander_command = (
    'commander.pl',
    $aliases_list_file,
    $canonical_name,
    @unrelated_names,
    $number_of_abstracts,
    $input_baseline,
    $exclud_dict,
    $use_filters,
    $filter1,
    $min_size,
    $cutoff_baseline,
    $t,
    $k,
    $filter4,
    $gene_names,
  );
  system(@commander_command) == 0
    or warn "commander.pl failed for '$canonical_name' (\$? = $?)\n";
}

close $gene_list_fh;

# Post-process the "<gene list>.jac" file with process_results.pl.
# List form of system() keeps the file name a single argument, untouched by
# the shell; a failing run is reported instead of being ignored.
system('process_results.pl', "$gene_names.jac") == 0
  or warn "process_results.pl failed (\$? = $?)\n";

