#!/usr/bin/perl -w

# Output file consolidator
#
# Michael Zhang, Georges Natsoulis
#
# Copyright (c) 2006-2008 Stanford University
# This file is part of Disperse.
#
# Disperse is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# Disperse is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Disperse; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# Purpose: Create a directory with consolidated output files from the selector pipeline
# Usage: perl output_file_consolidator.pl -cds [cds coord file] -roi [roi coord file] -snp [snp info file] -target [target file]
#						-snptarget [snp target file] -frag [fragment file] -amp [amplicon file] -probe [probe file]
# Outfile fields:
# 	CDS file:
# 		cds_id
#		ccds_ids
# 		g_accession
# 		chr_num
# 		cds_start
# 		cds_stop
# 		strand
# 		
# 	SNP file: 
# 		dbsnp_id
# 		g_accession
# 		chr_num
# 		upstream_pos
# 		downstream_pos
# 		alleles
# 		
# 	SNP Target file: combines the target, SNP target, and ROI files
# 		roi_id
# 		gene_name
# 		roi_num
# 		cds_ids
#		ccds_ids
# 		g_accession
# 		chr_num
# 		chr_roi_start
# 		chr_roi_stop
# 		roi_strand
# 		chr_target_start
# 		target_roi_start
# 		target_roi_stop
# 		target_seq_clean
#		target_seq_degenerate
# 		
# 	Probe file: combines probe, fragment, amplicon, and ROI files
# 		fragment_id
# 		gene_name
# 		roi_num
# 		cds_ids
#		ccds_ids
# 		g_accession
# 		chr_num
# 		strand
# 		fragment_start
# 		fragment_stop
# 		flap_pos
# 		amplicon_start
# 		amplicon_stop
# 		probe_seq
		


use strict;

use FileHandle;
use Getopt::Long;

# add custom library directory to the path
use FindBin;
use lib $FindBin::Bin."/../lib";

use Bio::Disperse::Utils;
use File::Spec;
use Log::Log4perl qw/:easy/;


# initialize Log4perl in easy mode with the logging threshold set to WARN,
# then fetch the logger object used for all error reporting below
Log::Log4perl->easy_init($WARN);
my $logger = get_logger();


# Input file paths taken from the command line. A path that is not supplied
# stays undefined.
my $cds_infile;
my $roi_infile;
my $snp_infile;
my $target_infile;
my $snp_target_infile;
my $fragment_infile;
my $amplicon_infile;
my $probe_infile;

# Bind every command-line switch to its variable; if the command line cannot
# be parsed, print the usage text and terminate.
GetOptions(
	"cds=s"       => \$cds_infile,
	"roi=s"       => \$roi_infile,
	"snp=s"       => \$snp_infile,
	"target=s"    => \$target_infile,
	"snptarget=s" => \$snp_target_infile,
	"frag=s"      => \$fragment_infile,
	"amp=s"       => \$amplicon_infile,
	"probe=s"     => \$probe_infile,
) or usage();



# Output file names are built as <prefix><input file name><postfix>
my ($OUTFILE_PREFIX, $OUTFILE_POSTFIX) = ('OUT.', '.txt');

############ MAIN ##################

# Each step runs only when every input file it needs was given on the
# command line; otherwise a notice is printed and the step is skipped.

# CDS formatting needs the CDS coordinate file.
if (defined($cds_infile))
{
	cds_formatting($cds_infile);
}
else
{
	print "No CDS file specified. Skipping CDS file formatting.\n";
}

# SNP formatting needs the SNP info file.
if (defined($snp_infile))
{
	snp_formatting($snp_infile);
}
else
{
	print "No SNP file specified. Skipping SNP file formatting.\n";
}

# SNP target consolidation needs the target, SNP target and ROI files.
if (defined($target_infile) && defined($snp_target_infile) && defined($roi_infile))
{
	snp_target_consolidation($target_infile, $snp_target_infile, $roi_infile);
}
else
{
	# the notice names all three required inputs, including the target file
	print "No target, SNP target, or ROI file specified. Skipping SNP target consolidation.\n";
}

# Probe consolidation needs the probe, fragment, amplicon and ROI files.
if (defined($probe_infile) && defined($fragment_infile) && defined($amplicon_infile) && defined($roi_infile))
{
	probe_file_consolidation($probe_infile, $fragment_infile, $amplicon_infile, $roi_infile);
}
else
{
	print "No probe, fragment, amplicon, or ROI file specified. Skipping probe file consolidation.\n";
}

############### END MAIN ##################

# Build the output file path for an input file.
# The output file lives in the same volume and directory as the input file;
# its name is the input file name wrapped in $OUTFILE_PREFIX and
# $OUTFILE_POSTFIX (prefix.filename.postfix).
#
# Argument: path of the input file
# Returns:  path of the corresponding output file
sub get_out_file_name
{
	my ($infile) = @_;

	# take the path apart so that only the last component is renamed
	my ($volume, $directory, $basename) = File::Spec->splitpath($infile);

	# put the path back together around the decorated file name
	return File::Spec->catpath(
		$volume,
		$directory,
		join('', $OUTFILE_PREFIX, $basename, $OUTFILE_POSTFIX)
	);
}


# Reformat the CDS coordinate file: add a chr_num column (obtained from the
# genomic accession via Bio::Disperse::Utils::chr_num_from_g_accession) and a
# column-heading line.
#
# Argument: path of the tab-delimited CDS file. Non-comment lines are read as
#           cds_id, ccds_ids, g_accession, cds_start, cds_stop, strand.
# Output:   written to the path returned by get_out_file_name($infile).
#           Comment lines (starting with '#') of the input are copied first,
#           followed by the "# "-prefixed heading line and one row per record.
# Errors:   logs an error and exits with status 1 if a file cannot be opened
#           or the output file cannot be closed.
sub cds_formatting
{
	my ($infile) = @_;
	
	my $outfile = get_out_file_name($infile);
	
	# create filehandles; the mode is passed separately so that characters
	# such as '>', '<' or '|' in a file name are never taken as an open mode
	my $in_fh = FileHandle->new($infile, 'r');
	if (!defined($in_fh))
	{
		$logger->error( "Could not open file: $infile!");
		exit(1);
	}
	my $out_fh = FileHandle->new($outfile, 'w');
	if (!defined($out_fh))
	{
		$logger->error( "Could not open file: $outfile!");
		exit(1);
	}
	
	my @cds_rows;
	
	while (my $line = <$in_fh>)
	{
		# comment lines are passed straight through to the output
		if ($line =~ /^\#/)
		{
			print $out_fh $line;
			next;
		}
		
		chomp $line;
		
		# a blank line carries no record; skip it rather than emit an empty row
		next unless $line =~ /\S/;
		
		my ($cds_id, $ccds_ids, $g_accession, $cds_start, $cds_stop, $strand) = split(/\t/, $line);
		my $chr_num = Bio::Disperse::Utils::chr_num_from_g_accession($g_accession);
		push @cds_rows, [$cds_id, $ccds_ids, $g_accession, $chr_num, $cds_start, $cds_stop, $strand];
	}
	
	# print column headings
	my @col_headers = qw/cds_id ccds_ids g_accession chr_num cds_start cds_stop strand/;
	print $out_fh "# ", join("\t", @col_headers), "\n";
	
	# print cds content; missing fields are written as empty columns
	for my $cds_row (@cds_rows)
	{
		print $out_fh join("\t", map { defined($_) ? $_ : '' } @$cds_row), "\n";
	}
	
	# close files; buffered write errors only surface when the output is closed
	$in_fh->close;
	if (!$out_fh->close)
	{
		$logger->error( "Could not close file: $outfile!");
		exit(1);
	}

}

# Reformat the SNP info file: add a chr_num column (obtained from the
# genomic accession via Bio::Disperse::Utils::chr_num_from_g_accession) and a
# column-heading line.
#
# Argument: path of the tab-delimited SNP file. Non-comment lines are read as
#           dbsnp_id, g_accession, upstream_pos, downstream_pos, alleles.
# Output:   written to the path returned by get_out_file_name($infile).
#           Comment lines (starting with '#') of the input are copied first,
#           followed by the "# "-prefixed heading line and one row per record.
# Errors:   logs an error and exits with status 1 if a file cannot be opened
#           or the output file cannot be closed.
sub snp_formatting
{
	my ($infile) = @_;
	
	my $outfile = get_out_file_name($infile);
	
	# create filehandles; the mode is passed separately so that characters
	# such as '>', '<' or '|' in a file name are never taken as an open mode
	my $in_fh = FileHandle->new($infile, 'r');
	if (!defined($in_fh))
	{
		$logger->error( "Could not open file: $infile!");
		exit(1);
	}
	my $out_fh = FileHandle->new($outfile, 'w');
	if (!defined($out_fh))
	{
		$logger->error( "Could not open file: $outfile!");
		exit(1);
	}
	
	my @snp_rows;
	
	while (my $line = <$in_fh>)
	{
		# comment lines are passed straight through to the output
		if ($line =~ /^\#/)
		{
			print $out_fh $line;
			next;
		}
		
		chomp $line;
		
		# a blank line carries no record; skip it rather than emit an empty row
		next unless $line =~ /\S/;
		
		my ($dbsnp_id, $g_accession, $upstream_pos, $downstream_pos, $alleles) = split(/\t/, $line);
		my $chr_num = Bio::Disperse::Utils::chr_num_from_g_accession($g_accession);
		push @snp_rows, [$dbsnp_id, $g_accession, $chr_num, $upstream_pos, $downstream_pos, $alleles];
	}
	
	# print column headings
	my @col_headers = qw/dbsnp_id g_accession chr_num upstream_pos downstream_pos alleles/;
	print $out_fh "# ", join("\t", @col_headers), "\n";
	
	# print snp content; missing fields are written as empty columns
	for my $snp_row (@snp_rows)
	{
		print $out_fh join("\t", map { defined($_) ? $_ : '' } @$snp_row), "\n";
	}
	
	# close files; buffered write errors only surface when the output is closed
	$in_fh->close;
	if (!$out_fh->close)
	{
		$logger->error( "Could not close file: $outfile!");
		exit(1);
	}

}

# Consolidate the target, SNP target and ROI files into one table, adding
# columns for gene name, ROI number, chromosome number and cds/ccds ids.
#
# Arguments (all tab-delimited files, '#' lines are comments):
#   1. target file:     roi_id, g_accession, chr_target_start,
#                       target_roi_start, target_roi_stop, target sequence
#   2. SNP target file: same layout; only the sequence (6th column) is used
#   3. ROI file:        roi_id, cds_ids, ccds_ids, g_accession,
#                       chr_roi_start, chr_roi_stop, roi_strand
# Output:   written to the path returned by get_out_file_name() for the SNP
#           target file. Comment lines of the target and SNP target files are
#           copied first, then the heading line, then one row per target-file
#           record in target-file order. Rows of the SNP target and ROI files
#           whose roi_id does not occur in the target file are ignored.
# Errors:   logs an error and exits with status 1 if a file cannot be opened
#           or the output file cannot be closed.
sub snp_target_consolidation
{
	# all three paths come from the arguments; no file-level variable is read
	my ($target_infile, $snp_target_infile, $roi_infile) = @_;
	
	my $snp_target_outfile = get_out_file_name($snp_target_infile);
	
	# Open a file in the given mode or log an error and exit. The mode is
	# passed separately so that characters such as '>', '<' or '|' in a file
	# name are never taken as an open mode.
	my $open_or_exit = sub {
		my ($path, $mode) = @_;
		my $fh = FileHandle->new($path, $mode);
		if (!defined($fh))
		{
			$logger->error( "Could not open file: $path!");
			exit(1);
		}
		return $fh;
	};
	
	# create filehandles (inputs first, so the output file is only created
	# once every input could be opened)
	my $target_infh     = $open_or_exit->($target_infile, 'r');
	my $snp_target_infh = $open_or_exit->($snp_target_infile, 'r');
	my $roi_infh        = $open_or_exit->($roi_infile, 'r');
	my $out_fh          = $open_or_exit->($snp_target_outfile, 'w');
	
	my @roi_order_ary = ();
	my $snp_target_hash = {};
	
	# target file: defines which ROIs are reported and in which order
	while (my $line = <$target_infh>)
	{
		if ($line =~ /^\#/)
		{
			print $out_fh $line;
			next;
		}
		
		chomp $line;
		next unless $line =~ /\S/;	# skip blank lines
		
		my ($roi_id, $g_accession, $chr_target_start, $target_roi_start, $target_roi_stop, $target_seq) = split(/\t/, $line);
		my $chr_num = Bio::Disperse::Utils::chr_num_from_g_accession($g_accession);
		push @roi_order_ary, $roi_id;
		
		# gene name and roi number are taken from ids of the form
		# <gene name>_ROI_<number>; the captures are only used when the
		# match actually succeeded
		my ($gene_name, $roi_num);
		if ($roi_id =~ /^(.*)_ROI_(\d+)/)
		{
			($gene_name, $roi_num) = ($1, $2);
		}
		else
		{
			$logger->warn("ROI id '$roi_id' in $target_infile does not match <gene>_ROI_<number>; gene_name and roi_num left empty");
		}
		
		$snp_target_hash->{$roi_id} = {
			'roi_id'           => $roi_id,
			'gene_name'        => $gene_name,
			'roi_num'          => $roi_num,
			'g_accession'      => $g_accession,
			'chr_num'          => $chr_num,
			'chr_target_start' => $chr_target_start,
			'target_roi_start' => $target_roi_start,
			'target_roi_stop'  => $target_roi_stop,
			'target_seq_clean' => $target_seq,
		};
	}
	
	# SNP target file: contributes the degenerate target sequence
	while (my $line = <$snp_target_infh>)
	{
		if ($line =~ /^\#/)
		{
			print $out_fh $line;
			next;
		}
		
		chomp $line;
		next unless $line =~ /\S/;
		
		my ($roi_id, undef, undef, undef, undef, $snp_target_seq) = split(/\t/, $line);
		
		# only ROIs listed in the target file are ever printed
		next unless exists $snp_target_hash->{$roi_id};
		$snp_target_hash->{$roi_id}->{'target_seq_degenerate'} = $snp_target_seq;
	}
	
	# ROI file: contributes cds/ccds ids and the ROI coordinates
	# (its comment lines are not copied to the output)
	while (my $line = <$roi_infh>)
	{
		next if $line =~ /^\#/;
		
		chomp $line;
		next unless $line =~ /\S/;
		
		my ($roi_id, $cds_ids, $ccds_ids, undef, $chr_roi_start, $chr_roi_stop, $roi_strand) = split(/\t/, $line);
		
		next unless exists $snp_target_hash->{$roi_id};
		my $entry = $snp_target_hash->{$roi_id};
		$entry->{'cds_ids'}       = $cds_ids;
		$entry->{'ccds_ids'}      = $ccds_ids;
		$entry->{'chr_roi_start'} = $chr_roi_start;
		$entry->{'chr_roi_stop'}  = $chr_roi_stop;
		$entry->{'roi_strand'}    = $roi_strand;
	}

	my @col_headers = qw/roi_id gene_name roi_num cds_ids ccds_ids g_accession chr_num 
						chr_roi_start chr_roi_stop roi_strand
						chr_target_start target_roi_start
						target_roi_stop target_seq_clean target_seq_degenerate/;
	
	# print column headings
	print $out_fh "# ", join("\t", @col_headers), "\n";
	
	# print snp target content; values that no input file supplied are
	# written as empty columns
	for my $roi_id (@roi_order_ary)
	{
		my $entry = $snp_target_hash->{$roi_id};
		print $out_fh join("\t", map { defined($_) ? $_ : '' } @{$entry}{@col_headers}), "\n";
	}
	
	# close files; buffered write errors only surface when the output is closed
	$target_infh->close;
	$snp_target_infh->close;
	$roi_infh->close;
	if (!$out_fh->close)
	{
		$logger->error( "Could not close file: $snp_target_outfile!");
		exit(1);
	}

}

# Consolidate the probe, fragment, amplicon and ROI files into one table.
#
# Arguments (all tab-delimited files, '#' lines are comments and are not
# copied to the output):
#   1. probe file:    probe_id ("PROBE|" followed by the fragment id),
#                     probe_seq. The fragment id is split on '|' into
#                     roi id, restriction enzymes, strand ('+' or '-') and
#                     fragment number.
#   2. fragment file: fragment_id, g_accession, fragment_start,
#                     fragment_stop, flap_pos
#   3. amplicon file: fragment_id, g_accession, amplicon_start,
#                     amplicon_stop, strand
#   4. ROI file:      roi_id, cds_ids, ccds_ids, ... (only these are used)
# Output:   written to the path returned by get_out_file_name() for the probe
#           file: a heading line, then one row per probe in probe-file order.
#           Rows of the fragment and amplicon files whose fragment id does
#           not occur in the probe file are ignored.
# Errors:   exits with status 1 if a file cannot be opened, a probe id does
#           not start with "PROBE|", a strand is neither '+' nor '-', or the
#           output file cannot be closed.
sub probe_file_consolidation
{
	# all four paths come from the arguments; no file-level variable is read
	my ($probe_infile, $fragment_infile, $amplicon_infile, $roi_infile) = @_;
	
	my $probe_outfile = get_out_file_name($probe_infile);

	# Open a file in the given mode or log an error and exit. The mode is
	# passed separately so that characters such as '>', '<' or '|' in a file
	# name are never taken as an open mode.
	my $open_or_exit = sub {
		my ($path, $mode) = @_;
		my $fh = FileHandle->new($path, $mode);
		if (!defined($fh))
		{
			$logger->error( "Could not open file: $path!");
			exit(1);
		}
		return $fh;
	};

	# create filehandles (inputs first, so the output file is only created
	# once every input could be opened)
	my $probe_infh    = $open_or_exit->($probe_infile, 'r');
	my $fragment_infh = $open_or_exit->($fragment_infile, 'r');
	my $amplicon_infh = $open_or_exit->($amplicon_infile, 'r');
	my $roi_infh      = $open_or_exit->($roi_infile, 'r');
	my $out_fh        = $open_or_exit->($probe_outfile, 'w');
	
	my $roi_hash = {};
	# get cds and ccds ids from the roi file
	while (my $line = <$roi_infh>)
	{
		next if $line =~ /^\#/;
		
		chomp $line;
		next unless $line =~ /\S/;	# skip blank lines
		
		my ($roi_id, $cds_ids, $ccds_ids) = split(/\t/, $line);
		$roi_hash->{$roi_id} = {'cds_ids'=>$cds_ids, 'ccds_ids'=>$ccds_ids};
	}
	
	my $probe_hash = {};
	my @fragment_order_ary = ();
	
	# probe file: defines which fragments are reported and in which order
	while (my $line = <$probe_infh>)
	{
		next if $line =~ /^\#/;
		
		chomp $line;
		next unless $line =~ /\S/;
		
		my ($probe_id, $probe_seq) = split(/\t/, $line);
		
		# the fragment id is whatever follows the "PROBE|" prefix; report a
		# malformed id as such instead of carrying on with stale captures
		if ($probe_id !~ /^PROBE\|(.*)$/)
		{
			$logger->error("Unexpected probe id in probe file $probe_infile: $probe_id");
			exit(1);
		}
		my $fragment_id = $1;
		push @fragment_order_ary, $fragment_id;
		
		# split fragment id by the pipe character
		# (roi id, restriction enzymes, strand, fragment number)
		my ($roi_id, undef, $strand) = split(/\|/, $fragment_id);
		
		# change strand from + or - to 1 or -1 respectively
		if (defined($strand) && $strand eq '+') { $strand = 1; }
		elsif (defined($strand) && $strand eq '-') { $strand = -1; }
		else
		{
			$strand = '' unless defined($strand);
			print "Unknown strand string in probe file $probe_infile: $strand\n";
			exit(1);
		}
		
		# get gene name and roi number from the roi id. Ids of the form
		# <gene name>_ROI_<number> are matched as a whole so that a gene
		# name containing '_' is kept intact; any other id falls back to
		# plain splitting on '_' (first and third part).
		my ($gene_name, $roi_num);
		if ($roi_id =~ /^(.*)_ROI_(\d+)/)
		{
			($gene_name, $roi_num) = ($1, $2);
		}
		else
		{
			($gene_name, undef, $roi_num) = split(/_/, $roi_id);
		}
		
		# associate fragment with cds and ccds ids (empty if the roi is
		# not listed in the roi file)
		my $roi_entry = $roi_hash->{$roi_id} || {};
		
		$probe_hash->{$fragment_id} = {
			'fragment_id' => $fragment_id,
			'probe_seq'   => $probe_seq,
			'cds_ids'     => $roi_entry->{'cds_ids'},
			'ccds_ids'    => $roi_entry->{'ccds_ids'},
			'strand'      => $strand,
			'gene_name'   => $gene_name,
			'roi_num'     => $roi_num,
		};
	}
	
	# fragment file: contributes accession, chromosome and fragment coordinates
	while (my $line = <$fragment_infh>)
	{
		next if $line =~ /^\#/;
		
		chomp $line;
		next unless $line =~ /\S/;
		
		my ($fragment_id, $g_accession, $fragment_start, $fragment_stop, $flap_pos) = split(/\t/, $line);
		
		# only fragments that have a probe are ever printed
		next unless exists $probe_hash->{$fragment_id};
		
		my $entry = $probe_hash->{$fragment_id};
		$entry->{'g_accession'}    = $g_accession;
		$entry->{'fragment_start'} = $fragment_start;
		$entry->{'fragment_stop'}  = $fragment_stop;
		$entry->{'flap_pos'}       = $flap_pos;
		$entry->{'chr_num'}        = Bio::Disperse::Utils::chr_num_from_g_accession($g_accession);
	}

	# amplicon file: contributes the amplicon coordinates
	while (my $line = <$amplicon_infh>)
	{
		next if $line =~ /^\#/;
		
		chomp $line;
		next unless $line =~ /\S/;
		
		my ($fragment_id, undef, $amplicon_start, $amplicon_stop) = split(/\t/, $line);
		
		next unless exists $probe_hash->{$fragment_id};
		
		$probe_hash->{$fragment_id}->{'amplicon_start'} = $amplicon_start;
		$probe_hash->{$fragment_id}->{'amplicon_stop'} = $amplicon_stop;
	}

	my @col_headers = qw/fragment_id gene_name roi_num cds_ids ccds_ids g_accession chr_num 
						strand fragment_start fragment_stop flap_pos amplicon_start
						amplicon_stop probe_seq/;
	
	# print column headings
	print $out_fh "# ", join("\t", @col_headers), "\n";
	
	# print probe content; values that no input file supplied are written
	# as empty columns
	for my $fragment_id (@fragment_order_ary)
	{
		my $entry = $probe_hash->{$fragment_id};
		print $out_fh join("\t", map { defined($_) ? $_ : '' } @{$entry}{@col_headers}), "\n";
	}

	# close files; buffered write errors only surface when the output is closed
	$probe_infh->close;
	$fragment_infh->close;
	$amplicon_infh->close;
	$roi_infh->close;
	if (!$out_fh->close)
	{
		$logger->error( "Could not close file: $probe_outfile!");
		exit(1);
	}

}

# Print the command-line synopsis to standard output and terminate the
# script with exit status 1. Takes no arguments and does not return.
sub usage
{
	# one entry per supported switch, in the order they are listed to the user
	my @switch_help = (
		'-cds [cds coord file]',
		'-roi [roi coord file]',
		'-snp [snp info file]',
		'-target [target file]',
		'-snptarget [snp target file]',
		'-frag [fragment file]',
		'-amp [amplicon file]',
		'-probe [probe file]',
	);

	print "Bad input.\nUsage:\n",
		"\tperl output_file_consolidator.pl\n",
		map { "\t\t" . $_ . "\n" } @switch_help;

	exit(1);
}
