#!/usr/bin/perl -w

# Exon coordinate generator
#
# Michael Zhang, Johan Stenberg
# Copyright Stanford University, 2007
#
# Purpose: Extract exon coordinates from CCDS for a given set of genes
# Usage: perl exon_coord_generator.pl -i [exon name input file] -ccds [CCDS file] -o [exon coord output file]
# Input: file with gene names; one gene name per line
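# Output: a "# gene names file: ..." header line followed by tab delimited
# rows containing columns:
# 	1. Exon ID/name (<gene name>_CDS_<exon number>)
# 	2. Comma-separated CCDS ID(s) the region came from ('NO_CCDS' for genes retrieved from GenBank)
# 	3. Parent sequence (w. version)
# 	4. Start coordinate (1-based, always on top strand)
# 	5. End coordinate (1-based, always on top strand)
# 	6. Polarity of the region (1 for top strand, -1 for bottom strand)
# 	7. [Optional] Nucleotide sequence of region, when necessary. (currently not produced)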


use strict;

use FileHandle;
use Getopt::Long;

# add custom library directory to the path
use FindBin;
use lib $FindBin::Bin."/../lib";


use Bio::Disperse::CcdsHandle;
use Bio::Disperse::Utils;
use Log::Log4perl qw/:easy/;


# Initialize Log4perl in "easy" mode at WARN level: warn() and error()
# messages are emitted, info() messages are suppressed.
Log::Log4perl->easy_init($WARN);
my $logger = get_logger();

# Directory containing this script (as reported by FindBin)
my $SCRIPTS_PATH = $FindBin::Bin;

# CCDS file used when no usable -ccds argument is supplied
my $DEFAULT_CCDS_FILE = $FindBin::Bin . '/../data/CCDS.20070227.txt';

my ($exon_name_infile, $exon_coord_outfile, $ccds_file);

# Parse the command line.  GetOptions returns false when it encounters an
# option it cannot parse; treat that as bad input rather than carrying on
# with a partially parsed command line.
GetOptions(
	"i=s" => \$exon_name_infile,
	"o=s" => \$exon_coord_outfile,
	"ccds=s" => \$ccds_file
) or usage();

# if either input or output files were not specified, output usage
if (!defined($exon_name_infile) || !defined($exon_coord_outfile))
{
	usage();
}

# Fall back to the default CCDS file when none was given, or when the given
# path does not exist.  The two cases are reported separately so that a
# mistyped -ccds path is not silently replaced by the default.
if (!defined($ccds_file))
{
	$ccds_file = $DEFAULT_CCDS_FILE;
	$logger->info("CCDS file not provided as an argument.  Using default: $DEFAULT_CCDS_FILE");
}
elsif (! -e $ccds_file)
{
	$logger->warn("CCDS file '$ccds_file' does not exist.  Using default: $DEFAULT_CCDS_FILE");
	$ccds_file = $DEFAULT_CCDS_FILE;
}

# Create CCDS handle to retrieve information from the CCDS file or DB
my $ccds = Bio::Disperse::CcdsHandle->new($ccds_file);

# Create input filehandle.  The mode is passed separately from the file name
# so that characters such as a leading '>' or a trailing '|' in the name are
# never interpreted as part of the open mode.
my $in_fh = FileHandle->new($exon_name_infile, 'r');
if (!defined($in_fh))
{
	$logger->error("Could not open file: $exon_name_infile! ($!)");
	exit(1);
}

# Create output filehandle (truncates any existing file of that name)
my $out_fh = FileHandle->new($exon_coord_outfile, 'w');
if (!defined($out_fh))
{
	$logger->error("Could not open file: $exon_coord_outfile! ($!)");
	exit(1);
}

# Print target file header
print $out_fh "# gene names file: $exon_name_infile\n";

# Gene names from the input file for which CCDS returned no information
my @non_ccds_genes;

# iterate through gene name input file
while (my $line = <$in_fh>) {
	# Strip the line terminator (LF or CRLF) and any surrounding whitespace,
	# so files saved on Windows or lines with trailing blanks still match.
	$line =~ s/^\s+|\s+$//g;

	# skip blank lines and comment lines
	next if $line eq "" || $line =~ /^\#/;

	# get gene name from input file
	my $gene_name = $line;

	# get information on the gene from CCDS
	my $gene_info = $ccds->get_gene_exon_coords($gene_name);
	if (!defined($gene_info))
	{
		# gene name was not found in CCDS; remember it for later
		push @non_ccds_genes, $gene_name;
		next;
	}

	my ($g_accession, $cds_strand);
	my $exon_coord_entry_ary = [];

	# Iterate through the various CCDS IDs.  The keys are sorted so the
	# entries are always built in the same order from run to run (plain
	# hash key order is not stable in Perl).
	for my $ccds_id (sort keys(%$gene_info)) {
		my $ccds_entry = $gene_info->{$ccds_id};
		# assume all exons have the same g_accession and cds_strand
		$g_accession = $ccds_entry->{'g_accession'};
		$cds_strand = $ccds_entry->{'cds_strand'};

		# iterate through all exons within each CCDS entry;
		# each location is a "start-stop" string
		for my $exon_coord ( @{$ccds_entry->{'cds_locations'}} ) {
			my $coord_ary = [split("-", $exon_coord)];
			push @$exon_coord_entry_ary, {'ids' => [$ccds_id], 'coords' => $coord_ary};
		}
	}

	# sometimes there will be multiple entries for the same gene
	# and there will be duplicate exons.  This process eliminates redundant exons as well as merges overlapping exons.
	my $merged_coords_entry_ary = Bio::Disperse::Utils::track_merged_coords($exon_coord_entry_ary);
	my $merged_count = scalar(@$merged_coords_entry_ary);

	my $exon_count = 1;

	for my $cds_coord_entry (@$merged_coords_entry_ary) {
		my $ccds_ids = join ',', @{$cds_coord_entry->{'ids'}};

		my ($start_0based, $stop_0based) = @{$cds_coord_entry->{'coords'}};

		# make 1-based coordinates from 0-based coordinates
		my $start_1based = $start_0based + 1;
		my $stop_1based = $stop_0based + 1;

		# have the exon number count according to strand (1: 5' - 3'; -1: 3' - 5'):
		# ascending for strand 1, descending otherwise.
		# NOTE(review): presumably track_merged_coords returns entries in
		# ascending coordinate order -- confirm against its implementation.
		my $exon_num = $cds_strand == 1 ? $exon_count : $merged_count + 1 - $exon_count;

		my @exon_entry = (
			$gene_name . "_CDS_" . $exon_num,
			$ccds_ids,
			$g_accession,
			$start_1based,
			$stop_1based,
			$cds_strand
		);

		# print entry to outfile
		print $out_fh join("\t", @exon_entry), "\n";

		$exon_count++;
	}
}

# For genes that CCDS did not know about, delegate to the helper script
# extract_cds_from_ncbi.pl and append whatever rows it produces to the
# output file, tagged with the placeholder CCDS ID 'NO_CCDS'.
my $extraction_failed = 0;

if (scalar(@non_ccds_genes) != 0)
{
	print "The following genes were not found in CCDS:\n";
	print join "\n", @non_ccds_genes;
	print "\n\nRetrieving CDS information for these genes from NCBI's GenBank...\n";

	# Write the missing gene names, one per line, to a scratch file in the
	# current directory for the helper script to read.
	my $non_ccds_file = "non_ccds";
	my $non_ccds_fh = FileHandle->new($non_ccds_file, 'w');
	if (!defined($non_ccds_fh))
	{
		$logger->error("Could not open file: $non_ccds_file! ($!)");
		exit(1);
	}
	print $non_ccds_fh join "\n", @non_ccds_genes;
	$non_ccds_fh->close;

	my $cds_file = "cds_file";

	# Remove any result file left over from an earlier run so that stale
	# rows cannot be appended if the helper fails before writing its output.
	unlink $cds_file if -e $cds_file;

	# List-form system() bypasses the shell, so a script path containing
	# spaces or shell metacharacters is passed through intact.
	my @extract_cds_from_ncbi_cmd = (
		'perl', "$SCRIPTS_PATH/extract_cds_from_ncbi.pl",
		'-i', $non_ccds_file,
		'-o', $cds_file
	);
	my $extract_cds_return_code = system(@extract_cds_from_ncbi_cmd);
	$extraction_failed = 1 if $extract_cds_return_code != 0;

	# Copy the rows the helper produced (possibly partial when it failed)
	# into the output file, inserting the placeholder CCDS ID column.
	my $cds_fh = FileHandle->new($cds_file, 'r');
	if (defined($cds_fh))
	{
		while (my $cds_line = <$cds_fh>)
		{
			chomp $cds_line;
			next if $cds_line eq "";
			my ($cds_id, $g_accession, $start, $stop, $strand) = split "\t", $cds_line;
			my $ccds_id = 'NO_CCDS';
			print $out_fh join "\t", ($cds_id, $ccds_id, $g_accession, $start, $stop, $strand);
			print $out_fh "\n";
		}
		$cds_fh->close;
	}
	else
	{
		$logger->error("Could not open file: $cds_file! ($!)");
		$extraction_failed = 1;
	}
}

# Close the main handles before exiting so buffered output reaches disk on
# both the success and the failure path.
$in_fh->close;
$out_fh->close;

if ($extraction_failed) { exit(1); }

# Print the command-line usage message and terminate the script.
# This is only called when the arguments are unusable, so the exit status is
# non-zero to let calling shells and pipelines detect the failure.
# Takes no arguments and never returns.
sub usage
{
	print "Bad input.  Usage:\n";
	print "\tperl exon_coord_generator.pl -i [exon name input file] -ccds [CCDS file] -o [exon coord output file]\n";
	exit(1);
}
