#!/usr/bin/perl -w

#######################################################################
#######################################################################
#  Copyright 2008 Roney S. Coimbra

#  This file is part of genealiases

#  genealiases is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 3 of the License.
#  genealiases is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.

# You should have received a copy of the GNU General Public License along with genealiases (file: COPYING).  If not, see <http://www.gnu.org/licenses/>.

#######################################################################
#######################################################################


use  Lingua::EN::Inflect qw ( PL PL_N PL_V PL_ADJ NO NUM
                                     PL_eq PL_N_eq PL_V_eq PL_ADJ_eq
                                     A AN
                                     PART_PRES
                                     ORD NUMWORDS
                                     inflect classical
                                     def_noun def_verb def_adj def_a def_an );

my $exclud_dict = $ARGV[0];
my $input_baseline = $ARGV[1];
my $input_file = $ARGV[2];
my $use_filters = $ARGV[3];
my $filter1 = $ARGV[4];
my $min_size = $ARGV[5];
my $cutoff_baseline = $ARGV[6];
my $t = $ARGV[7];
my $k = $ARGV[8];
my $filter4 = $ARGV[9];

my $output_file = join '.', (substr ($input_file, 0, -14)),"wordfreq";

if (! $input_file || ! $exclud_dict){
     print "No baseline or input file or exclusion dictionary provided\n";
     exit
}

my $param_filter1;
my $AFFY_ID_old;
my $counter_abs = 1;
my %hash_exclud_default = ();
my %hash_abs = ();
my %hash_AFFY_ID = ();
my %hash_stock_AFFY_ID = ();
my %hash_words_average_freq = ();
my %hash_stock_AFFY_ID_cleaned = ();
my $count_AFFY_ID = 0;
my $abs_ID_old;



################################## Load dictionary of terms to be excluded by default

open (EXCLUD_DICT, "$exclud_dict") || die "can't open exclusion dictionary";
while (<EXCLUD_DICT>){
  chomp;

# to replace capital letters with small letters

  my $key = lc($_);
  if (!defined $hash_exclud_default{$key}){
    $hash_exclud_default{$key} = 1;
  }
  my $pluriel_key = PL($key);
  if (!defined $hash_exclud_default{$pluriel_key}){
    $hash_exclud_default{$pluriel_key} = 1;
  }
}

############################################ Load baseline DBM file

my $the_dbm = substr ($input_baseline, 0, -4);

dbmopen (%hash_baseline_2, $the_dbm, 0644) || die "Cannot open DBM baseline $!";;


############################################ Data analysis starts here


open (INPUT, "$input_file") || die "can't open input file";

while (<INPUT>){ 
  next if (/^GENE\_NAME\t/);
  next if (/^$/);
  if (! $AFFY_ID_old){

    ($AFFY_ID_new, my $PMID, my $abstr) = split "\t";

# to initialize a new entry

    $AFFY_ID_old = $AFFY_ID_new;

    if ($abstr){
      $_ = $abstr;
      while ((/(\w[\w\-\'][\w\-\']+)/g)){

# filtro 0 : Replace capital letters with small letters

	my $key = lc($1);
	if (! defined $hash_abs{$key}){
	  $hash_abs{$key}=1;
	}
      }
    }

  } elsif ($AFFY_ID_old){

# Add terms from current abstract to the hash of the current entry

    foreach (sort keys %hash_abs){
      if (($_ ne PL($_))&&(defined $hash_abs{PL($_)})){
	delete $hash_abs{PL($_)};
      }
    }

    foreach (keys %hash_abs){
      s/^[\'\:\(\;\:\.\,\-]+//og;
      s/[\'\:\(\;\:\.\,\-]+$//og;

      if (defined $hash_AFFY_ID{$_}){
	$hash_AFFY_ID{$_}++;
      } else {
	$hash_AFFY_ID{$_} = 1;
      }
    }
    %hash_abs = ();
    ($AFFY_ID_new, my $PMID, my $abstr) = split '\t';
    if ($AFFY_ID_new eq $AFFY_ID_old){
      $counter_abs++;
      if ($abstr){
	$_ = $abstr;
	while ((/(\w[\w\-\'][\w\-\']+)/g)){

# filtro 0 : Replace capital letters with small letters

	  my $key = lc($1);
	  if (! defined $hash_abs{$key}){
	    $hash_abs{$key}=1;
	  }
	}
      }

    } elsif ($AFFY_ID_new ne $AFFY_ID_old) {


# Calculate term frequencies to the current entry based on the value of $counter_abs corrente

# filter 01 : at least n abstracts per AFFY_ID

      if ($counter_abs >= $filter1){
	my $word_and_freq;


	if ($counter_abs <= 5) {
	  $param_filter1 = ($t + ($k / 5));
	}else {
	  $param_filter1 = ($t + ($k / $counter_abs));
	}
	
# Clear numbers, blank spaces and special characters

	foreach (keys %hash_AFFY_ID){
	  if (/^[\d\#\(\)\%\&\;\,\-\.\:\s\+\*\[\]\=\/]+$/){
	    delete $hash_AFFY_ID{$_};
	  }
	}
	
# Clear pluriels

	foreach (sort keys %hash_AFFY_ID){
	  if ($_ ne PL($_)){
	    if (exists $hash_AFFY_ID{PL($_)}){
	      $hash_AFFY_ID{$_} += $hash_AFFY_ID{PL($_)};
	      delete $hash_AFFY_ID{PL($_)};
	    }
	  }
	}
	
# filter 02a: if f_term_data exists in hash_baseline and f_term_in_baseline > $cutoff_baseline, discard term_data

# filter 02b : f_term_in_data minus f_term_in_baseline >= [t + (k/n)], where t = 0.15, k = 1.5 and n = $counter_abs

	foreach (keys %hash_AFFY_ID){


# Search the term in the dictionary of terms to be excluded by default

	  if (!defined $hash_exclud_default{$_}){

	    if ($use_filters eq "yes"){
	      my $plural = PL($_);
	
	
	      if (defined $hash_baseline_2{$_}){
		if ($hash_baseline_2{$_} <= $cutoff_baseline){
		  if ((($hash_AFFY_ID{$_}/$counter_abs) - $hash_baseline_2{$_}) > $param_filter1){
		    $word_and_freq .= "$_ " . ($hash_AFFY_ID{$_}/$counter_abs) . "\t";
		  }
		}
	      } elsif (defined $hash_baseline_2{$plural}){
		if ($hash_baseline_2{$plural} <= $cutoff_baseline){
		  if ((($hash_AFFY_ID{$_}/$counter_abs) - $hash_baseline_2{$plural}) > $param_filter1){
		    $word_and_freq .= "$plural " . ($hash_AFFY_ID{$_}/$counter_abs) . "\t";
		  }
		}
	      } else {
		if (($hash_AFFY_ID{$_}/$counter_abs) > $param_filter1){
		  $word_and_freq .= "$_ " . ($hash_AFFY_ID{$_}/$counter_abs) . "\t";
		}
		
	
	      }
	
	    } else {
	      $word_and_freq .= "$_ " . ($hash_AFFY_ID{$_}/$counter_abs) . "\t";
	    }
	  }
	}
	
	if (! defined $hash_stock_AFFY_ID{$AFFY_ID_old}){
	  $hash_stock_AFFY_ID{$AFFY_ID_old} = $word_and_freq;
	}
      }


      ($AFFY_ID_new, my $PMID, my $abstr) = split '\t';

# New entry

      $AFFY_ID_old = $AFFY_ID_new;

# Restart hash to a new entry

      %hash_AFFY_ID = ();

# Restart abstracts count to a new entry

      $counter_abs = 1;

      if ($abstr){
	$_ = $abstr;
	while ((/(\w[\w\-\'][\w\-\']+)/g)){

# filtro 0 : replace capital letters with small letter

	  my $key = lc($1);
	  if (! defined $hash_abs{$key}){
	    $hash_abs{$key}=1;
	  }
	}
      }
    }
  }
}


close INPUT;

############################################### Process the last AFFY_ID

# Clear pluriels in hash_abs only when their respective singular forms also occur in this hash

# Copy terms from last abstract of hash_abs to hash_AFFY_ID

   foreach (sort keys %hash_abs){

# Sort function assures singular forms precede their pluriels during analysis

      if (($_ ne PL($_))&&(defined $hash_abs{PL($_)})){
	delete $hash_abs{PL($_)};
      }
    }

    foreach (keys %hash_abs){
      s/^[\'\:\(\;\:\.\,\-]+//og;
      s/[\'\:\(\;\:\.\,\-]+$//og;

      if (defined $hash_AFFY_ID{$_}){
	$hash_AFFY_ID{$_}++;
      } else {
	$hash_AFFY_ID{$_} = 1;
      }
    }


# Calculate term frequencies to the current entry based on the value of $counter_abs corrente

# filter 01 : at least n abstracts per AFFY_ID


if ($counter_abs => $filter1){
  my $word_and_freq;


  if ($counter_abs <= 5) {
    $param_filter1 = ($t + ($k / 5));
  } else {
    $param_filter1 = ($t + ($k / $counter_abs));
  }

# Clear numbers, blank spaces and special characters

  foreach (keys %hash_AFFY_ID){
    if (/^[\d\#\(\)\%\&\;\,\-\.\:\s\+\*\[\]\=\/]+$/){
      delete $hash_AFFY_ID{$_};
    }
  }

# Clear pluriels

  foreach (sort keys %hash_AFFY_ID){
    if ($_ ne PL($_)){
      if (exists $hash_AFFY_ID{PL($_)}){
	$hash_AFFY_ID{$_} += $hash_AFFY_ID{PL($_)};
	delete $hash_AFFY_ID{PL($_)};
      }
    }
  }



#filter 02a: if f_term_data exists in hash_baseline and f_term_in_baseline > $cutoff_baseline, discard term_data

#filter 02b : f_term_in_data minus f_term_in_baseline >= [t + (k/n)], where t = 0.15, k = 1.5 and n = $counter_abs

  foreach (keys %hash_AFFY_ID){


# Search the current term in the dictionary of terms to be excluded by default

    if (!defined $hash_exclud_default{$_}){

      if ($use_filters eq "yes"){
	my $plural = PL($_);


	if (defined $hash_baseline_2{$_}){
	  if ($hash_baseline_2{$_} <= $cutoff_baseline){
	    if ((($hash_AFFY_ID{$_}/$counter_abs) - $hash_baseline_2{$_}) > $param_filter1){
	      $word_and_freq .= "$_ " . ($hash_AFFY_ID{$_}/$counter_abs) . "\t";
	    }
	  }
	} elsif (defined $hash_baseline_2{$plural}){
	  if ($hash_baseline_2{$plural} <= $cutoff_baseline){
	    if ((($hash_AFFY_ID{$_}/$counter_abs) - $hash_baseline_2{$plural}) > $param_filter1){
	      $word_and_freq .= "$plural " . ($hash_AFFY_ID{$_}/$counter_abs) . "\t";
	    }
	  }
	} else {
	  if (($hash_AFFY_ID{$_}/$counter_abs) > $param_filter1){
	    $word_and_freq .= "$_ " . ($hash_AFFY_ID{$_}/$counter_abs) . "\t";
	  }


	}

     } else {
       $word_and_freq .= "$_ " . ($hash_AFFY_ID{$_}/$counter_abs) . "\t";
     }
    }
  }

  if (! defined $hash_stock_AFFY_ID{$AFFY_ID_old}){
    $hash_stock_AFFY_ID{$AFFY_ID_old} = $word_and_freq;
  }
}


########################################################################################

# Calculate the frequency of each term in the text corpora of the full set of entries

foreach (keys %hash_stock_AFFY_ID){

    $count_AFFY_ID++;
    if ($hash_stock_AFFY_ID{$_}){
	my @words_and_freqs_selected = split ('\t', $hash_stock_AFFY_ID{$_});
	
	foreach (@words_and_freqs_selected){
	    (my $each_word, my $its_freq) = split;
	
	    if (! defined $hash_words_average_freq{$each_word}){
		$hash_words_average_freq{$each_word} = 1;
	    } else {
		$hash_words_average_freq{$each_word}++;
	    }
	}
    }
}

# Merge singular and pluriels to produce the repertoire of authorized terms

foreach (sort keys %hash_words_average_freq){

    if ($_ ne PL($_)){
	my $plural = PL($_);
	if (defined $hash_words_average_freq{$plural}){
	    $hash_words_average_freq{$_} += $hash_words_average_freq{$plural};
	    delete $hash_words_average_freq{$plural};
	}
    }
}

# Create repertoire of authorized terms	

foreach (keys %hash_words_average_freq){

# Discard numbers only

  if ((! /^\d+$/)
      &&

# filter 03: words present in abstracts of at least $min_size entries

      ($hash_words_average_freq{$_} >= $min_size)
      &&

# filter 04: words not present in abstracts of more than a fraction of the total AFFY_IDs

      ($hash_words_average_freq{$_} < ($count_AFFY_ID * $filter4))){

	$hash_allowed_words{$_}=1;
      }
}

# filters 03 and 04 eliminates from each entry terms missing in the repertoire of authorized terms.

foreach (keys %hash_stock_AFFY_ID){
    my $the_ID = $_;
    my $word_and_freq2;
    if ($hash_stock_AFFY_ID{$the_ID}){
	my @words_and_freqs_selected2 = split ('\t', $hash_stock_AFFY_ID{$the_ID});
	foreach (@words_and_freqs_selected2){
	    (my $each_word2, my $its_freq2) = split;
	    if (defined $hash_allowed_words{$each_word2}){
		$word_and_freq2 .= "$each_word2 " . "$its_freq2" . "\t";
	    } elsif (defined $hash_allowed_words{PL($each_word2)}){
		$word_and_freq2 .= PL($each_word2) . " " . $its_freq2 . "\t";
	    }
	}	
    }
    if ($word_and_freq2){
	$hash_stock_AFFY_ID_cleaned{$the_ID} = $word_and_freq2;
    }
}

# Print the final matrix

open (OUTPUT, ">$output_file");

print OUTPUT "UNIQID";
foreach (sort keys %hash_allowed_words){

	my $allowed_word = $_;
	print OUTPUT "\t$allowed_word";
}
print OUTPUT "\n";

foreach (keys %hash_stock_AFFY_ID_cleaned){
    my $key4 = $_;
    my %final_individual_hash = ();
    if ($hash_stock_AFFY_ID_cleaned{$key4}){

	my @words_and_freqs_selected3 = split ('\t', $hash_stock_AFFY_ID_cleaned{$key4});
	foreach (@words_and_freqs_selected3){
	    (my $each_word3, my $its_freq3) = split;
	    $_ = $each_word3;
		$final_individual_hash{$each_word3} = $its_freq3;
	}
    }

    foreach (keys %hash_allowed_words){
	my $allowed_word = $_;
	if (! $final_individual_hash{$allowed_word}){
	    $final_individual_hash{$allowed_word} = 0;
	}
    }
    print OUTPUT "$key4\t";

    foreach (sort keys %final_individual_hash) {
	print OUTPUT "$final_individual_hash{$_}" . "\t";
    }
    print OUTPUT "\n";
}

close OUTPUT;
dbmclose (%hash_baseline_2);
