#!/usr/bin/perl -w

# ROI coordinate generator
#
# Michael Zhang, Johan Stenberg
# Copyright Stanford University, 2007
#
# Purpose: Create ROI coordinates for the set of CDS exons
# Usage: perl ROI_coord_generator.pl -i [exon coord input file] -o [ROI coord output file] -flank [ROI flank size]
# Input: tab delimited file; lines starting with '#' are copied to the output unchanged.
# Columns:
# 	1. CDS exon ID/name, of the form [gene name]_CDS_[number]
# 	2. CCDS ID(s) associated with the exon
# 	3. Parent sequence (w. version)
# 	4. Start coordinate (1-based, always on top strand)
# 	5. End coordinate (1-based, always on top strand)
# 	6. Polarity of the region (1 for top strand, -1 for bottom strand)
# 	Any further columns (e.g. nucleotide sequence of the region) are ignored.
# Output: tab delimited file containing columns:
# 	1. ROI ID/name, of the form [gene name]_ROI_[number]
# 	2. Comma-separated IDs of the CDS exons merged into the ROI
# 	3. Comma-separated CCDS ID(s) of those exons
# 	4. Parent sequence (w. version)
# 	5. Start coordinate (1-based, always on top strand), exon start minus the flank size
# 	6. End coordinate (1-based, always on top strand), exon end plus the flank size
# 	7. Polarity of the region (1 for top strand, -1 for bottom strand)
# 	Nucleotide sequence of the region is currently not produced.



use strict;

use FileHandle;
use Getopt::Long;

# add custom library directory to the path
use FindBin;
use lib $FindBin::Bin."/../lib";

use Bio::Disperse::Utils;
use Log::Log4perl qw/:easy/;


# Initialize Log4perl in easy mode at WARN level, so that both warnings and
# errors are reported.
Log::Log4perl->easy_init($WARN);
my $logger = get_logger();


my ($exon_coord_infile, $ROI_coord_outfile, $ROI_flank_size);

# Parse the command line.  GetOptions returns false when it meets an unknown
# option or a malformed value (e.g. a non-integer -flank); treat that like a
# missing argument instead of carrying on with partially parsed settings.
my $options_ok = GetOptions(
	"i=s" => \$exon_coord_infile,
	"o=s" => \$ROI_coord_outfile,
	"flank=i" => \$ROI_flank_size
);

# -i, -o and -flank are all required; print the usage message and exit otherwise
if (!$options_ok || !defined($exon_coord_infile) || !defined($ROI_coord_outfile) || !defined($ROI_flank_size))
{
	usage();
}

# Create input filehandle.  The mode is passed as a separate argument so the
# file name is taken literally: characters such as a leading '>' or a
# trailing '|' in a user-supplied name cannot change how the file is opened.
my $in_fh = FileHandle->new($exon_coord_infile, "r");
if (!defined($in_fh))
{
	# include the system error so the reason (missing file, permissions, ...) is visible
	$logger->error("Could not open file: $exon_coord_infile! ($!)");
	exit(1);
}

# Create output filehandle (write mode; an existing file is truncated)
my $out_fh = FileHandle->new($ROI_coord_outfile, "w");
if (!defined($out_fh))
{
	$logger->error("Could not open file: $ROI_coord_outfile! ($!)");
	exit(1);
}


# Print target file header.  The column list names all seven tab-separated
# fields written for each ROI row, including ccds_ids (third field).
print $out_fh "# CDS exon file: $exon_coord_infile\n";
print $out_fh "# ROI flank size: $ROI_flank_size\n";
print $out_fh "# ", join("\t", qw/roi_id cds_ids ccds_ids g_accession roi_start roi_stop strand/), "\n";


# Per-gene records keyed by gene name:
#   { g_accession => ..., strand => ..., ROI_coords => [ { ids => [cds_id], coords => [start, stop] }, ... ] }
my $exon_hash = {};
# Maps each CDS exon ID to the value of the second input column
my $ccds_id_hash = {};

# iterate through exon coordinate input file
LINE: while (my $line = <$in_fh>) {
	# comment lines are copied to the output unchanged
	if ($line =~ /^\#/) {
		print $out_fh $line;
		next LINE;
	}
	chomp $line;

	# blank lines carry no exon and would otherwise create a nameless gene entry
	next LINE if $line !~ /\S/;

	my $line_number = $.;
	my ($cds_id, $ccds_ids, $g_accession, $cds_start, $cds_stop, $strand) = split("\t", $line);
	if (!defined($strand)) {
		$logger->warn("Skipping line $line_number of $exon_coord_infile: expected 6 tab-delimited columns");
		next LINE;
	}

	# get gene name from the exon ID: the part preceding "_CDS_<number>".
	# The match result is checked so a malformed ID is never filed under an
	# undefined or stale gene name.
	my ($gene_name) = $cds_id =~ /^(.*)_CDS_\d+/;
	if (!defined($gene_name)) {
		$logger->warn("Skipping line $line_number of $exon_coord_infile: exon ID '$cds_id' does not match <gene>_CDS_<number>");
		next LINE;
	}

	# accession and strand are taken from the first exon seen for each gene
	if (!defined($exon_hash->{$gene_name})) {
		$exon_hash->{$gene_name} = { g_accession => $g_accession, strand => $strand, ROI_coords => [] };
	}

	# the initial ROI for an exon is the exon extended by the flank on both sides
	my $coord_hash = { 'ids'=>[$cds_id], 'coords'=>[$cds_start-$ROI_flank_size, $cds_stop+$ROI_flank_size] };
	$ccds_id_hash->{$cds_id} = $ccds_ids;

	push @{$exon_hash->{$gene_name}->{'ROI_coords'}}, $coord_hash;
}

# Output ROI coords for every gene.  Genes are visited in sorted name order
# so the output is reproducible from run to run (hash key order is not).
for my $gene_name (sort keys %$exon_hash) {
	my $gene = $exon_hash->{$gene_name};

	# merge the ROIs wherever overlap occurs; the result is used below as a
	# list of { ids => [...], coords => [start, stop] } records
	$gene->{'ROI_coords'} = Bio::Disperse::Utils::track_merged_coords($gene->{'ROI_coords'});

	my $total_ROIs = @{$gene->{'ROI_coords'}};
	my $ROI_count = 1;
	for my $ROI_coord ( @{$gene->{'ROI_coords'}} ) {
		# ROIs are numbered in list order on the top strand and in reverse
		# list order otherwise.
		# NOTE(review): presumably track_merged_coords returns the ROIs in
		# ascending coordinate order - confirm against Bio::Disperse::Utils.
		my $ROI_num = $gene->{'strand'} == 1 ? $ROI_count : $total_ROIs - $ROI_count + 1;

		my $CDS_IDs = join ",", @{$ROI_coord->{'ids'}};
		# second-column values of every exon in this ROI, in the same order as the exon IDs
		my $CCDS_IDs_entry = join ",", map { $ccds_id_hash->{$_} } @{$ROI_coord->{'ids'}};

		my @ROI_entry = (
			$gene_name . "_ROI_" . $ROI_num,
			$CDS_IDs,
			$CCDS_IDs_entry,
			$gene->{'g_accession'},
			$ROI_coord->{'coords'}->[0],
			$ROI_coord->{'coords'}->[1],
			$gene->{'strand'}
		);

		# print entry to outfile
		print $out_fh join("\t", @ROI_entry), "\n";

		$ROI_count++;
	}
}

$in_fh->close;

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so a failed close means the output file is incomplete.
if (!$out_fh->close) {
	$logger->error("Could not write file: $ROI_coord_outfile! ($!)");
	exit(1);
}

# Print the command-line usage message and terminate the script.
# The message goes to STDERR and the exit status is 1, matching the script's
# other error exits, so calling shells and pipelines can detect the failure.
# Takes no arguments and never returns.
sub usage
{
	print STDERR "Bad input.  Usage:\n";
	print STDERR "\tperl ROI_coord_generator.pl -i [exon coord input file] -o [ROI coord output file] -flank [ROI flank size]\n";
	exit(1);
}
