#!/usr/bin/perl -w

# Extract CDS information from NCBI
#
# Deyra N. Rodriguez, Michael Zhang, Johan Stenberg
# Copyright Stanford University, 2007
#
# Purpose: Extract exon coordinates from NCBI's GenBank for a given set of genes
# Usage: perl extract_cds_from_ncbi.pl -i [gene name input file] -o [exon coord output file]
# Input: file with gene names; one gene name per line
# Output: tab delimited file containing columns:
# 	1. Exon ID/name
# 	2. Parent sequence (w. version)
# 	3. Start coordinate (1-based, always on top strand)
# 	4. End coordinate (1-based, always on top strand)
# 	5. Polarity of the region (1 for top strand, -1 for bottom strand)
# 	6. [Optional] Nucleotide sequence of region, when necessary. (currently not produced)

# The list considers splice variants. Overlapping CDSs are merged into larger ones in output file.


use strict;
use warnings;

# Load built-in modules
use LWP::Simple;
use FileHandle;
use Getopt::Long;

# add custom library directory to the path
use FindBin;
use lib $FindBin::Bin . "/../lib";

use Bio::Disperse::Utils;
use Log::Log4perl qw/:easy/;


# initialize log for perl level to error
Log::Log4perl->easy_init($WARN);
my $logger = get_logger();


my ($exon_name_infile, $exon_coord_outfile);

GetOptions(
	"i=s" => \$exon_name_infile,
	"o=s" => \$exon_coord_outfile
);

# if either input or output files were not specified, output usage
if (!defined($exon_name_infile) || !defined($exon_coord_outfile))
{
	&usage();
}

# Create input filehandle
my $in_fh = new FileHandle($exon_name_infile);
if (!defined($in_fh))
{
	$logger->error( "Could not open file: $exon_name_infile!");
	exit(1);
}

# Create output filehandle
my $out_fh = new FileHandle(">$exon_coord_outfile");
if (!defined($out_fh))
{
	$logger->error("Could not open file: $exon_coord_outfile!");
	exit(1);
}



# Iterate through each line of the file containing gene names

my %gene_seen;
my $gene_not_found = 0;

GENEINPUT: while(<$in_fh>) {


	# Parse for gene name

	my $geneinput;

	if (/^(\S+)/) {

		$geneinput = $1;


		# Ignore any duplicate gene entries

		next GENEINPUT if (exists $gene_seen{$geneinput});
		
		$gene_seen{$geneinput}++;
	}
	
	print "Processing $geneinput...\n";


	# Use each gene name and NCBI's E-utilities esearch to access and store
	# the NCBI page containing general gene information including geneid

	my $idsearch = get("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=gene&term=.$geneinput.[gene]%20AND%20HOMO%20sapiens%20[ORGN]");


	# Parse through the stored page for the geneid

	my $geneid;

	my @geneid_ary;
	while ($idsearch =~ /<Id>(\d+)<\/Id>/g) # look for the number inside the id tag
	{
		push @geneid_ary, $1;
	}
	
	if (scalar(@geneid_ary) == 0)
	{
		$gene_not_found = 1;
		$logger->warn("No CDS entry found in NCBI GenBank for gene name: $geneinput");
		next GENEINPUT;
	}
	
	my $version;
	my $genestart;
	my $geneend;

	
	GENEID: while (my $geneid = shift(@geneid_ary))
	{
		# Use the geneid and NCBI's E-utilities efetch to access and store the NCBI
		# page containing, within other info, accesion number and coordinates of gene
	
		my $url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=$geneid&rettype=xml";
		my $accandcoor = get ($url);
		
	
		# Parse through page for accession number and version. Concatenate them to
		# generate a $version variable
	
		my $accpart1;
		my $accpart2;
	
		if ($accandcoor =~ /Gene-commentary_accession&gt\;(NC_\d*)/) {
			$accpart1 = $1;
		}
		else {
			next GENEID;
		}
	
	
		if ($accandcoor =~ /Gene-commentary_version&gt\;(.*)&lt/) {
			$accpart2 = $1;
		}
		else {
			next GENEID;
		}
	
		$version = join ("." , $accpart1 , $accpart2);
	
	
	
		#Parse through page for gene coordinates and store them
	
		if ($accandcoor =~ /Seq-interval_from&gt\;(\d*)&lt\;/) {
			$genestart = $1 + 1;
		}
		else {
			next GENEID;
		}
	
		if ($accandcoor =~ /Seq-interval_to&gt\;(\d*)&lt\;/) {
			$geneend = $1 + 1;
		}
		else {
			next GENEID;
		}

	 	if (defined($version) && defined($genestart) && defined($geneend))
		{
			last GENEID;
		}
	}
	
	if (!defined($version) || !defined($genestart) || !defined($geneend))
	{
		$gene_not_found = 1;
		$logger->warn("No CDS entry found in NCBI GenBank for gene name: $geneinput");
		next GENEINPUT;
	}
	


	# Use gene version and coordinates to access and store the NCBI page
	# containing gene $out_fhrmation including CDS
	
	my $url = "http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?val=$version&from=$genestart&to=$geneend&dopt=gb&sendto=t";
	
	my $geneinfo = get ($url);



	# Parse though page and extract every CDS for the gene, including splice
	# variants. Store lines containing CDS $out_fhrmation

	my @cdslist;
	my $cdsstringb;
	my $strand;

	while ($geneinfo =~ /CDS\s{13}((complement\()?(join\()?[>\d\,\n\s\.]+\)?\)?)/gs) {

		my $cdsstring = $1;

		# call a subroutine to convert + and - strand $out_fh into 1 and -1 to be used
		# by Bioperl

		$strand = &convertstrand ($cdsstring);

		# Parse to get only the string of all CDS start and end position

		if ($cdsstring =~ /(\d.*\d)/s) {
			$cdsstringb = $1;
		}


		# Split CDS string by the "," and create array of (start..end) position of each CDS

		my @cdsstringc = split (/,\s*/, $cdsstringb);



		# Split CDS array elements (start..end) by the .. and assign each start and end position
		# to a variable $cdsstrart, $cdsend


		for (@cdsstringc) {
			my ($cdsstart, $cdsend);
			if (/^(\d+)\.\.(\d+)$/)
			{
				$cdsstart = $1;
				$cdsend = $2;
			}
			
			# check for '>' in coordinate (e.g. 304..>522)
			elsif (/^(\d+)\.\.\>(\d+)$/)
			{
				$cdsstart = $1;
				$cdsend = $2;
				
				# log this
				$logger->warn("$geneinput has '>' in coordinate: $_");
			}
			elsif (/^(\d+)$/)
			{
				$cdsstart = $1;
				$cdsend = $1;
				
				# log this
				$logger->warn("$geneinput has coordinate with single number, interpreted as both start and stop: $_");
			}
			else
			{
				# log this
				$logger->error("$geneinput has bad coordinate: $_");
				exit(1);
			}

			# Use BioPerl to create a location containing the start position and end
			# position and strand (+/-) for each CDS and store it in an array

			push (@cdslist , [$cdsstart, $cdsend]);
		}
	} # close while loop
	
	if (scalar(@cdslist) == 0)
	{
		$gene_not_found = 1;
		$logger->warn("No CDS entry found in NCBI GenBank for gene name: $geneinput");
		next GENEINPUT;
	}

	my $cdsuniqueformat = &Bio::Disperse::Utils::merge_coords(\@cdslist);		

	my $count = 0;
	for (@$cdsuniqueformat)
	{
		$count++;
		my $exon_start = $_->[0]+$genestart-1;
		my $exon_stop = $_->[1]+$genestart-1;
		if ($strand == 1)
		{
			my $cds_number = $count;
			my $gene_cds_name = $geneinput . "_CDS_${cds_number}";
			print $out_fh join "\t", ($gene_cds_name, $version, $exon_start, $exon_stop, $strand);
			print $out_fh "\n";
		}
		else
		{
			my $cds_number = @$cdsuniqueformat - $count + 1;
			my $gene_cds_name = $geneinput . "_CDS_${cds_number}";
			print $out_fh join "\t", ($gene_cds_name, $version, $exon_start, $exon_stop, $strand);
			print $out_fh "\n";
		}

	}

}

if ($gene_not_found)
{
	print "\nSome gene names were not found in the NCBI database.\n";
	exit(1);
}


# Subroutines

sub convertstrand{

	my $strandtoconvert = $_[0];

	if ($strandtoconvert =~ /complement/gs){
		return -1;
	}
	else {
		return 1;
	}
}


sub usage
{
	print "Bad input.  Usage:\n";
	print "\tperl extract_cds_from_ncbi.pl -i [exon name input file] -o [exon coord output file]\n";
	exit;
}
