#!/usr/bin/perl -w

#######################################################################
#######################################################################
#  Copyright 2008 Roney S. Coimbra

#  This file is part of genealiases

#  genealiases is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 3 of the License.
#  genealiases is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.

# You should have received a copy of the GNU General Public License along with genealiases (file: COPYING).  If not, see <http://www.gnu.org/licenses/>.

#######################################################################
#######################################################################

use  Lingua::EN::Inflect qw ( PL PL_N PL_V PL_ADJ NO NUM
                                     PL_eq PL_N_eq PL_V_eq PL_ADJ_eq
                                     A AN
                                     PART_PRES
                                     ORD NUMWORDS
                                     inflect classical
                                     def_noun def_verb def_adj def_a def_an );

my $input_baseline = $ARGV[0];


if (! $input_baseline){
     print "No baseline abstracts provided\n";
     exit
}

my $BASE_ID_old;
my $BASE_ID_new;
my %hash_abs_baseline = ();
my %has_abs_BASE_ID = ();
my %hash_BASE_ID = ();
my %hash_stock_BASE_ID = ();
my %hash_baseline = ();
my $count_BASE_ID = 0;
my $counter_abs_baseline;


my $baseline_dbm = substr ($input_baseline, 0, -4);

dbmopen (%hash_baseline_2, $baseline_dbm, 0644) || die "Cannot open DBM baseline: $!";

open (BASELINE, "$input_baseline") || die "can't open input_baseline file";

while (<BASELINE>){
  next if (/^GENE\_NAME\t/);

  if (! $BASE_ID_old){ # First abstract of first gene

    ($BASE_ID_new, my $PMID, my $abstr) = split "\t";
    $BASE_ID_old = $BASE_ID_new; # Iinitialize ID baseline


    # Count words in the current abstract
    if ($abstr){
      $_ = $abstr;
      while (/(\w[\w\-\'][\w\-\']+)/g){
	my $key = lc($1); # filtro 0 : replace capital letters with small letters
	if (! defined $hash_abs_baseline{$key}){
	  $hash_abs_baseline{$key}=1;
	}
      }
    }

  } elsif ($BASE_ID_old){ # From second abstract on

    # merge words from current abstract in the hash of the current BASE_ID
    foreach (keys %hash_abs_baseline) {

      s/^[\'\:\(\;\:\.\,\-]+//og;
      s/[\'\:\(\;\:\.\,\-]+$//og;

      if (defined $hash_baseline {$_}){
	$hash_BASE_ID{$_}++;
      }else {
	$hash_BASE_ID{$_}= 1;
      }
    }
    %hash_abs_baseline = (); # Reinitializes the hash of abstract
    ($BASE_ID_new, my $PMID, my $abstr) = split "\t";


    if ($BASE_ID_new eq $BASE_ID_old){ # New abstract to the current gene name/alias
      $counter_abs_baseline++;

      # count words in the current line
     if ($abstr){
	$_ = $abstr;
	while (/(\w[\w\-\'][\w\-\']+)/g){
	  my $key = lc($1); # filtro 0 : replaces capital letters with small letters
	  if (! defined $hash_abs_baseline{$key}){
	    $hash_abs_baseline{$key}=1;
	  }
	}
      }

    } elsif ($BASE_ID_new ne $BASE_ID_old) { # New abstract to a new gene name/alias
      my $word_and_freq;
      foreach (keys %hash_BASE_ID){
	$word_and_freq .= "$_ " . ($hash_BASE_ID{$_}/$counter_abs_baseline) . "\t";
      }
      if (! defined $hash_stock_BASE_ID{$BASE_ID_old}){
	$hash_stock_BASE_ID{$BASE_ID_old} = $word_and_freq;
      }
      ($BASE_ID_new, my $PMID, my $abstr) = split "\t";
      $BASE_ID_old = $BASE_ID_new; # New BASE_ID
      %hash_BASE_ID = (); # Reinitialize the hash to the new BASE_ID
      $counter_abs_baseline = 1;

      # count words in the current line

      if ($abstr){
	$_ = $abstr;
	while (/(\w[\w\-\'][\w\-\']+)/g){
	  my $key = lc($1); # filtro 0 : Replace capital letters with small letters
	  if (! defined $hash_abs_baseline{$key}){
	    $hash_abs_baseline{$key}=1;
	  }
	}
      }
    }
  }
}
close BASELINE;

#####################################################################################
# Processes the last gene in abstracts baseline

# combinar as palavras no abstract corrente no hash da BASE_ID corrente
foreach (keys %hash_abs_baseline) {

  s/^[\'\:\(\;\:\.\,\-]+//og;
  s/[\'\:\(\;\:\.\,\-]+$//og;

  if (defined $hash_baseline {$_}){
    $hash_BASE_ID{$_}++;
  }else {
    $hash_BASE_ID{$_}= 1;
  }
}

my $word_and_freq;
foreach (keys %hash_BASE_ID){
  $word_and_freq .= "$_ " . ($hash_BASE_ID{$_}/$counter_abs_baseline) . "\t";
}
if (! defined $hash_stock_BASE_ID{$BASE_ID_old}){
  $hash_stock_BASE_ID{$BASE_ID_old} = $word_and_freq;
}


######################################################################################


# Calculate the absolute counts of each term to each BASE_ID

foreach (keys %hash_stock_BASE_ID){
    $count_BASE_ID++;
    if ($hash_stock_BASE_ID{$_}){
	my @words_and_freqs_selected_BASE = split ('\t', $hash_stock_BASE_ID{$_});
	foreach (@words_and_freqs_selected_BASE){
	    (my $each_word_BASE, my $its_freq_BASE) = split;
	    if (! defined $hash_baseline{$each_word_BASE}){
		$hash_baseline{$each_word_BASE} = $its_freq_BASE;
	    } else {
		$hash_baseline{$each_word_BASE} += $its_freq_BASE;
	    }
	}
    }
}

# Clear numbers, blank spaces and other special characters
foreach (keys %hash_baseline){
    if (/^[\d\#\(\)\%\&\;\,\-\.\:\s\+\*\[\]\=\/]+$/){
	delete $hash_baseline{$_};
    }
}

# Replace pluriels with their singular and merge their counts

foreach (sort keys %hash_baseline){


    if ($_ ne PL($_)){
	if (exists $hash_baseline{PL($_)}){
	    $hash_baseline{$_} += $hash_baseline{PL($_)};
	    delete $hash_baseline{PL($_)};
	}
    }
}

# replace absolut counts with frequencies in hash_baseline
foreach (keys %hash_baseline){
    my $freq_word_baseline = ($hash_baseline{$_} / $count_BASE_ID);
    $hash_baseline{$_} = $freq_word_baseline;

}

# duplicate baseline creating entries to singular and pluriels with the same frequency - ASSURE BOTH FORMS WILL BE FILTERED DURING EXPERIMENTAL DATA ANALISES

foreach (keys %hash_baseline){
    my $plural =  PL($_);
    if ($_ ne $plural){
	$hash_baseline_2{$_} = $hash_baseline{$_};
	$hash_baseline_2{$plural} = $hash_baseline_2{$_};
    }
}


dbmclose (%hash_baseline_2);
