#!/usr/bin/perl -w

#######################################################################
#######################################################################
#  Copyright 2008 Roney S. Coimbra

#  This file is part of genealiases

#  genealiases is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 3 of the License.
#  genealiases is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.

# You should have received a copy of the GNU General Public License along with genealiases (file: COPYING).  If not, see <http://www.gnu.org/licenses/>.

#######################################################################
#######################################################################


use strict;
use warnings;

############################################################
# Driver: reads a tab-separated gene/term-frequency table and appends,
# to "<project>.jac", one row per gene giving the Jaccard distance
# between the first gene in the table (the canonical gene, when it is
# present in the input) and that gene.
#
# Arguments (all positional):
#   input            path of the tab-separated table; the first column
#                    is the gene name, the remaining columns are the
#                    term frequencies
#   canonical_name   gene whose row is moved to the front of the table
#   unrelated_name1..3
#                    copied verbatim into every output row
#   project          prefix of the output file ("<project>.jac")
############################################################

die "usage: $0 input canonical_name unrelated_name1 unrelated_name2 unrelated_name3 project\n"
  if @ARGV < 6;

my ($input, $canonical_name, $unrelated_name1, $unrelated_name2,
    $unrelated_name3, $project) = @ARGV;

my $output = "$project.jac";

open(my $in_fh, '<', $input)
  or die "can't open input file '$input': $!";
open(my $out_fh, '>>', $output)
  or die "can't open output file '$output': $!";

my @genes;       # one [name, \@freqs] record per data row
my $vocab_size;  # number of frequency columns on the last data row read

while (my $line = <$in_fh>) {
  next if $line =~ /UNIQID/;   # skip the header row
  chomp $line;
  next if $line =~ /^\s*$/;    # a blank line is not a gene

  my ($gene, @freqs) = split /\t/, $line;
  $vocab_size = @freqs;

  if ($gene eq $canonical_name) {
    unshift @genes, [$gene, \@freqs];  # canonical gene goes to the beginning
  } else {
    push @genes, [$gene, \@freqs];
  }
}

close $in_fh;

############################################################
############################################################

print {$out_fh} "CANONICAL_NAME\tUNRELATED_NAME1\tUNRELATED_NAME2\tUNRELATED_NAME3\tNAME1\tNAME2\tJACCARD_DISTANCE\tVOCABULARY\n";

if (@genes) {
  my ($first_name, $first_freqs) = @{ $genes[0] };

  # With several genes, the first one is compared against every other one.
  # When only one synonym produced a valid abstract collection or
  # vocabulary, it is compared against itself.
  my @compared = @genes > 1 ? @genes[1 .. $#genes] : ($genes[0]);

  for my $record (@compared) {
    my ($name, $freqs) = @$record;
    my ($res) = jaccard_distance($first_freqs, $freqs);

    print {$out_fh} join("\t",
      $canonical_name, $unrelated_name1, $unrelated_name2, $unrelated_name3,
      $first_name, $name, $res, $vocab_size), "\n";
  }
}

# Buffered write errors only surface when the handle is closed.
close $out_fh
  or die "can't close output file '$output': $!";
#############################################################
#############################################################

# jaccard_distance(\@x, \@y)
#
# Computes the Jaccard distance between two term-frequency vectors,
# treating each position as "present" when its value is greater than 0.
#
#   $x, $y   array references holding numeric frequencies
#
# Returns 1 - p / (p + q + r), where
#   p = positions present in both vectors,
#   q = positions present only in $x,
#   r = positions present only in $y.
#
# A position missing from the shorter vector counts as 0, so the result
# does not depend on the argument order.  When no position is present in
# either vector the ratio is 0/0; the distance is then reported as 0
# (two empty term sets are identical) instead of dying with a division
# by zero.
sub jaccard_distance {
  my ($x, $y) = @_;
  my $p = 0;
  my $q = 0;
  my $r = 0;

  my $n_terms = @$x > @$y ? scalar @$x : scalar @$y;

  for (my $i = 0; $i < $n_terms; $i++) { # for each term
    my $xi = defined $x->[$i] ? $x->[$i] : 0;
    my $yi = defined $y->[$i] ? $y->[$i] : 0;

    if (($xi > 0) && ($yi > 0)) {
      $p++;
    } elsif (($xi > 0) && ($yi == 0)) {
      $q++;
    } elsif (($xi == 0) && ($yi > 0)) {
      $r++;
    }
  }

  my $union = $p + $q + $r;
  return 0 if $union == 0;  # both term sets empty: avoid 0/0

  my $jaccard_coef = $p / $union;

  return (1 - $jaccard_coef); # returns the jaccard distance
}


