#!/usr/bin/perl -w

# Target sequence (and PieceMaker input) generator
#
# Michael Zhang, Johan Stenberg
# Copyright (c) 2006-2008 Stanford University
# This file is part of Disperse.
#
# Disperse is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# Disperse is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Disperse; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

# Purpose: 	Extract the target sequences for a given set of ROIs
#			Create the appropriate PieceMaker input file.
#			Create the appropriate reference sequence file.
# Usage: perl target_seq_generator.pl -i [ROI input file] -target [target output file] -refseq [reference sequence output file] -flank [target flank size] -blastdb [BLAST database] [-fastacmd [path to fastacmd program]]
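# Input (tab-delimited, one ROI per line; lines starting with '#' are copied to the target output):
# 	1. ROI ID/name
# 	2. CDS IDs (read but not used by this script)
# 	3. CCDS IDs (read but not used by this script)
# 	4. Parent sequence (w. version)
# 	5. Start coordinate (1-based, always on top strand)
# 	6. End coordinate (1-based, always on top strand)
# 	7. Polarity of the region (1 for top strand, -1 for bottom strand; read but not used, targets are always extracted from the top strand)
# 	8. [Optional] Nucleotide sequence of region, when necessary. (currently not produced)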
# Target (PieceMaker input) Output:
#    1. Target ID (Must be unique within file)
#    2. Parent Accession/ID (Should be versioned, e.g. NC_000001.9 for chr. 1)
#    3. Start coordinate of target in genome (1-based in chromosome, plus strand)
#    4. Region of interest start (1-based in target, plus strand)
#    5. Region of interest end (1-based in target, plus strand)
#    6. Target nucleotide sequence
# Reference Sequence Output:
#    1. ID (G_accession:Target_Start-Target_Stop)
#    2. Parent Accession/ID (Should be versioned, e.g. NC_000001.9 for chr. 1)
#    3. Start coordinate of target in genome (1-based in chromosome, plus strand)
#    4. Stop coordinate of target in genome (1-based in chromosome, plus strand)
#    5. Target nucleotide sequence



use strict;

use FileHandle;
use Getopt::Long;

# add custom library directory to the path
use FindBin;
use lib $FindBin::Bin."/../lib";

use Bio::Disperse::FastaCmd;
use Log::Log4perl qw/:easy/;


# initialize Log4perl in easy mode; messages at WARN level and above are logged
Log::Log4perl->easy_init($WARN);
my $logger = get_logger();


# Command-line settings; each one is filled in by GetOptions below.
#   $ROI_infile        - ROI coordinate input file            (-i, mandatory)
#   $target_outfile    - target / PieceMaker output file      (-target, mandatory)
#   $refseq_outfile    - reference sequence output file       (-refseq, mandatory)
#   $target_flank_size - flank added on each side of the ROI  (-flank, mandatory)
#   $fastacmd_bin      - fastacmd program to run              (-fastacmd, optional)
#   $fastacmd_blastdb  - BLAST database handed to FastaCmd    (-blastdb, mandatory)
my ($ROI_infile, $target_outfile, $refseq_outfile, $target_flank_size, $fastacmd_bin, $fastacmd_blastdb);

# GetOptions returns false when the command line could not be parsed
# (unknown option, missing or non-integer value, ...).
my $options_ok = GetOptions(
	"i=s" => \$ROI_infile,
	"target=s" => \$target_outfile,
	"refseq=s" => \$refseq_outfile,
	"flank=i" => \$target_flank_size,
	"fastacmd=s" => \$fastacmd_bin,
	"blastdb=s" => \$fastacmd_blastdb
);

# If parsing failed, or the input file, either output file, the flank size or
# the BLAST database was not specified, output usage (which exits).
if (!$options_ok
	|| !defined($ROI_infile)
	|| !defined($target_outfile)
	|| !defined($refseq_outfile)
	|| !defined($target_flank_size)
	|| !defined($fastacmd_blastdb))
{
	usage();
}

# A negative flank would place the target inside the ROI and yield
# non-positive ROI coordinates within the target, so reject it up front.
if ($target_flank_size < 0)
{
	$logger->error("Flank size must not be negative: $target_flank_size");
	exit(1);
}

# No explicit fastacmd location given: use the bare program name and hope it
# is on the PATH.
if (!defined($fastacmd_bin)) {
	$fastacmd_bin = "fastacmd";
}

# Open a file through FileHandle and return the handle.
#   $path - name of the file to open
#   $mode - FileHandle mode string ("r" to read, "w" to create/truncate)
# The mode is passed as a separate argument so that characters in the file
# name (leading '>' or '<', trailing '|', surrounding whitespace) are never
# interpreted as part of an open() mode.
# Logs an error and exits with status 1 when no name was given or the file
# cannot be opened.
sub _open_or_exit
{
	my ($path, $mode) = @_;

	if (!defined($path) || $path eq '')
	{
		$logger->error("Could not open file: no file name was given!");
		exit(1);
	}

	my $fh = FileHandle->new($path, $mode);
	if (!defined($fh))
	{
		# $! carries the reason the underlying open() failed
		$logger->error("Could not open file: $path! ($!)");
		exit(1);
	}

	return $fh;
}

# Create input filehandle
my $in_fh = _open_or_exit($ROI_infile, "r");

# Create output filehandles (existing files are truncated)
my $target_out_fh = _open_or_exit($target_outfile, "w");
my $refseq_out_fh = _open_or_exit($refseq_outfile, "w");

# Create FastaCmd handle to retrieve information from the BlastDB
my $fastacmd = Bio::Disperse::FastaCmd->new($fastacmd_bin, $fastacmd_blastdb);


# Print target file header
print $target_out_fh "# ROI file: $ROI_infile\n";
print $target_out_fh "# target flank size: $target_flank_size\n";

# Iterate through the tab-delimited ROI coordinate input file.  For every ROI
# one line is written to the target file and one to the reference sequence file.
while (my $line = <$in_fh>) {
	# comment lines are copied verbatim into the target file
	if ($line =~ /^\#/) {
		print $target_out_fh $line;
		next;
	}
	chomp $line;

	# blank lines (e.g. a trailing empty line) carry no ROI; skip them
	next if $line !~ /\S/;

	# columns: name, CDS IDs, CCDS IDs, accession, start, stop, strand;
	# the CDS/CCDS IDs and the strand are not needed here
	my ($ROI_name, undef, undef, $g_accession, $genomic_ROI_start, $genomic_ROI_stop) = split /\t/, $line;

	# refuse lines with missing columns or non-numeric coordinates instead of
	# letting undefined values flow into the coordinate arithmetic
	if (!defined($genomic_ROI_stop)
		|| $genomic_ROI_start !~ /^\s*\d+\s*$/
		|| $genomic_ROI_stop !~ /^\s*\d+\s*$/)
	{
		$logger->error("Malformed ROI line: expected tab-separated name, CDS IDs, CCDS IDs, accession, start, stop and strand.");
		$logger->error("Offending query: $line");
		exit(1);
	}

	# the target is the ROI extended by the flank on both sides
	# NOTE(review): the start is not clamped, so it drops below 1 for an ROI
	# closer than the flank size to the beginning of its parent sequence.
	my $genomic_target_start = $genomic_ROI_start - $target_flank_size;
	my $genomic_target_stop = $genomic_ROI_stop + $target_flank_size;

	# get the target sequence; always top strand
	my $target_sequence = $fastacmd->get_fasta($g_accession, $genomic_target_start, $genomic_target_stop, 1);

	# treat a missing result as an empty sequence so that the length check
	# below reports the failure
	$target_sequence = '' if !defined($target_sequence);

	# drop the first line of the result (presumably the FASTA definition
	# line) and join the remaining lines into one string
	$target_sequence =~ s/.*\n//;
	$target_sequence =~ s/\n//g;

	# the retrieved sequence must cover the requested interval exactly
	my $exp_length = $genomic_target_stop - $genomic_target_start + 1;
	my $got_length = length($target_sequence);
	if ($got_length != $exp_length) {
		$logger->error("fastacmd error. Make sure the location of fastacmd program is correclty specified in the configuration file.");
		$logger->error("Offending query: $line");
		$logger->error("Expected a sequence of length $exp_length but received $got_length.");
		exit(1);
	}

	# ROI position inside the target (1-based): it starts right after the
	# leading flank and has the same length as the genomic ROI
	my $target_ROI_start = $target_flank_size + 1;
	my $target_ROI_stop = $target_ROI_start + $genomic_ROI_stop - $genomic_ROI_start;

	print $target_out_fh join("\t", ($ROI_name, $g_accession, $genomic_target_start, $target_ROI_start, $target_ROI_stop, $target_sequence)), "\n";
	print $refseq_out_fh join("\t", ("${g_accession}:${genomic_target_start}-${genomic_target_stop}", $g_accession, $genomic_target_start, $genomic_target_stop, $target_sequence)), "\n";
}

$in_fh->close();

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so check the result for both output files.
if (!$target_out_fh->close())
{
	$logger->error("Could not close file: $target_outfile! ($!)");
	exit(1);
}
if (!$refseq_out_fh->close())
{
	$logger->error("Could not close file: $refseq_outfile! ($!)");
	exit(1);
}

# Print the command-line synopsis to STDOUT and exit with status 1.
# Takes no arguments and never returns.  The synopsis lists every mandatory
# option (-i, -target, -refseq, -flank, -blastdb) and the optional -fastacmd.
sub usage
{
	print "Bad input.  Usage:\n";
	print "\tperl target_seq_generator.pl -i [ROI input file] -target [target output file] -refseq [reference sequence output file] -flank [target flank size] -blastdb [BLAST database] [-fastacmd [path to fastacmd program]]\n";
	exit(1);
}
