#!/usr/bin/perl -w


#######################################################################
#######################################################################
#  Copyright 2008 Roney S. Coimbra

#  This file is part of genealiases

#  genealiases is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 3 of the License.
#  genealiases is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.

# You should have received a copy of the GNU General Public License along with genealiases (file: COPYING).  If not, see <http://www.gnu.org/licenses/>.

#######################################################################
#######################################################################

use LWP::Simple;

# ---------------------------------------------------------------------------
# Fetch section.
#
# Reads a tab-separated gene list (column 1: gene name, column 2: PubMed
# query), runs one ESearch + EFetch round trip per line and appends the raw
# EFetch output to a temporary file, each gene introduced by a ">gene" line.
#
# Arguments:
#   $ARGV[0]  path of the gene list file
#   $ARGV[1]  maximal number of abstracts to fetch per gene (sent as retmax)
#   $ARGV[2]  canonical name of the current gene
# ---------------------------------------------------------------------------

my $input_file = $ARGV[0]; # gene_name_list dir and file
my $abstract_number = $ARGV[1]; # maximal_number of abstracts to fetch
my $canonical_name = $ARGV[2]; # canonical name of the current gene
                               # (not referenced anywhere in this script)

die "usage: $0 <gene_name_list_file> <max_abstracts> [canonical_name]\n"
  unless defined $input_file && length $input_file;

# An absent limit is sent as an empty retmax value, as before, but without
# the "uninitialized value" warning.
my $retmax = defined $abstract_number ? $abstract_number : '';

# NOTE(review): NCBI's current documentation lists an https base URL on
# eutils.ncbi.nlm.nih.gov; confirm this http endpoint is still served.
my $utils = "http://www.ncbi.nlm.nih.gov/entrez/eutils";
my $db = "Pubmed";
my $report = "xml";

# The temporary file name is the input path minus its last five characters
# (presumably a dot plus a four-letter extension -- confirm with callers)
# followed by ".abstracts.tmp".
my $output_file1 = join '.', (substr ($input_file, 0, -5)), "abstracts.tmp";

open(my $abstracts_out, '>>', $output_file1)
  or die "can't open output file: $output_file1: $!";

open(my $genes_in, '<', $input_file)
  or die "can't open input file: $input_file: $!";

# The ESearch URL prefix does not depend on the gene, so build it once.
my $esearch = "$utils/esearch.fcgi?" .
  "db=$db&usehistory=y&term=";

while (my $entry = <$genes_in>) {
  chomp $entry;
  my ($gene_name, $query) = split /\t/, $entry;

  # Blank lines carry neither a gene nor a query: nothing to search for.
  next unless defined $gene_name && length $gene_name;

  print {$abstracts_out} ">$gene_name\n";

  unless (defined $query && length $query) {
    warn "no query for gene '$gene_name' in $input_file, skipping\n";
    next;
  }

  # NOTE(review): the query is sent verbatim; this assumes the input file
  # already holds URL-safe search terms -- confirm against its producer.
  my $esearch_result = get($esearch . $query);
  unless (defined $esearch_result) {
    warn "esearch request failed for gene '$gene_name'\n";
    next;
  }

  # Take the captures from the match itself so that a failed match can never
  # be mistaken for a valid history handle.
  my ($count, $QueryKey, $WebEnv) = $esearch_result =~
    m|<Count>(\d+)</Count>.*<QueryKey>(\d+)</QueryKey>.*<WebEnv>(\S+)</WebEnv>|s;

  unless (defined $WebEnv) {
    warn "unexpected esearch reply for gene '$gene_name', skipping\n";
    next;
  }

  # No hits: there is nothing to fetch for this gene.
  next if $count == 0;

  my $efetch = "$utils/efetch.fcgi?" .
    "rettype=$report&retmode=text&" .
      "db=$db&query_key=$QueryKey&WebEnv=$WebEnv&retmax=$retmax";

  my $efetch_result = get($efetch);
  unless (defined $efetch_result) {
    warn "efetch request failed for gene '$gene_name'\n";
    next;
  }

  print {$abstracts_out} "$efetch_result\n";
}

close($genes_in);

# Buffered write errors only surface when the handle is closed.
close($abstracts_out)
  or die "can't close output file: $output_file1: $!";

#---------------------------------------------------------------------------
# This section formats the text corpus for the literature profiling analysis.
#
# It reads the temporary file named by $output_file1 line by line and writes
# one row per <PubmedArticle> record to a ".tab" file with the columns
# GENE_NAME, PUBMED_ID and ABSTRACT. The ABSTRACT column is the article
# title followed by the abstract text, flattened onto a single line.
# The temporary file is removed once the table has been written.
#
# Recognised input lines:
#   >gene                     start of the records belonging to one gene
#   <PubmedArticle>           start of one record
#   </PubmedArticle>          end of the record; the row is written
#   <ArticleTitle>...         title text, possibly spanning several lines
#   <AbstractText ...>...     abstract text, possibly spanning several lines
#   <PMID ...>n</PMID>        record identifier (first occurrence only)

open(my $corpus_in, '<', $output_file1)
  or die "can't open input file: $output_file1: $!";

my $output_file2 = join '.', (substr ($output_file1, 0, -4)), "tab";
open(my $table_out, '>', $output_file2)
  or die "can't open output file: $output_file2: $!";

# Opening tags that may carry attributes. The look-behind keeps a
# self-closing <AbstractText .../> from opening the abstract gate.
my $abstract_open = qr{<AbstractText(?:\s[^>]*)?(?<!/)>};
my $pmid_open     = qr{<PMID(?:\s[^>]*)?>};

my $GENE_NAME    = '';
my $ABSTRACT     = '';
my $PMID         = '';
my $new_doc      = 0;   # true while inside a <PubmedArticle> element
my $titlegate    = 0;   # true while inside a multi-line <ArticleTitle>
my $abstractgate = 0;   # true while inside a multi-line <AbstractText>

print {$table_out} "GENE_NAME\tPUBMED_ID\tABSTRACT\n";

while (my $line = <$corpus_in>) {
  # Tabs are the column separator of the output table, so drop them all.
  $line =~ s/\t//g;

  if ($line =~ /^\>[\w\-\(\)\,\[\]\s]+$/) {
    chomp $line;
    $GENE_NAME = substr($line, 1);
  }
  elsif ($line =~ /<PubmedArticle>/) {
    # Start every record from a clean state so that nothing leaks over from
    # the previous one (in particular a stale PMID).
    $new_doc      = 1;
    $PMID         = '';
    $titlegate    = 0;
    $abstractgate = 0;
  }
  elsif ($line =~ /<\/PubmedArticle>/) {
    print {$table_out} "$GENE_NAME\t$PMID\t$ABSTRACT\n";

    $new_doc      = 0;
    $ABSTRACT     = "";
    $titlegate    = 0;
    $abstractgate = 0;
  }
  elsif ($new_doc) {
    my $is_text = 0;   # set when this line contributes to $ABSTRACT

    if ($line =~ s/<ArticleTitle>//) {
      # The gate stays open only when the closing tag is not on this line.
      $titlegate = ($line =~ s/<\/ArticleTitle>//) ? 0 : 1;
      $is_text   = 1;
    }
    elsif ($line =~ s/<\/ArticleTitle>//) {
      $titlegate = 0;
      $is_text   = 1;
    }
    elsif ($titlegate) {
      $is_text = 1;
    }
    elsif ($line =~ s/$abstract_open//) {
      $abstractgate = ($line =~ s/<\/AbstractText>//) ? 0 : 1;
      $is_text      = 1;
    }
    elsif ($line =~ s/<\/AbstractText>//) {
      $abstractgate = 0;
      $is_text      = 1;
    }
    elsif ($abstractgate) {
      $is_text = 1;
    }
    elsif ($PMID eq '' && $line =~ $pmid_open) {
      # Only the first PMID of a record is kept: later <PMID> elements in
      # the same record would otherwise overwrite it.
      chomp $line;
      $line =~ s/$pmid_open//;
      $line =~ s/<\/PMID>//;
      $PMID = $line;
    }

    if ($is_text) {
      # Join the lines of a record with spaces to keep one row per record.
      $line =~ s/\n/ /g;
      $ABSTRACT .= $line;
    }
  }
}

close($corpus_in);

# Buffered write errors only surface when the handle is closed.
close($table_out)
  or die "can't close output file: $output_file2: $!";

# Remove the temporary file without handing its name to a shell.
unlink($output_file1)
  or warn "can't remove temporary file: $output_file1: $!\n";
