package PWMLib;
###################################################################################################
# PWM library of core promoter element, with algorithms to search and score sequences.
###################################################################################################

use strict;
use IO::File;

my %PWM;	# ref to hash of PWM matrices, which are each anonymouse arrays of arrays: @PWM{name} [nt] [pos]

#$PWM{TEST} = [ [ 10, 70, 10, 10, ], [ 10, 10, 10, 10, ], [ 10, 10, 10, 10, ], [ 70, 10, 70, 70, ], ];

$PWM{TATA} = [
# Buchers TATA-box HMM trained from 600 unrelated vertebrate promoter sequences
# Consensus - - T A T A W A D R - -
 [ 17.7 ,19.3 ,6.6 ,83.4 ,0.0  ,95.0 ,72.3 ,94.2 ,53.3 ,29.3 ,17.7 ,22.7], #A
 [ 21.1 ,36.1 ,14.8 ,0.0 ,0.0   ,0.0 ,0.0 ,0.0 ,0.0 ,9.0 ,32.5 ,33.0], #C
 [ 29.0 ,36.4 ,6.8 ,0.0  ,0.0   ,0.0 ,0.0 ,5.8 ,20.1 ,51.2 ,37.7 ,33.2], #G
 [ 32.2 ,8.2 ,71.8 ,16.6 ,100.0 ,5.0 ,27.7 ,0.0 ,26.6 ,10.5 ,12.1 ,11.1] #T
];
$PWM{INR} = [
#  Consensus  [CT] [TC] A - [TA](+1) [CT] [TC]
# from epd,Hs ; the Bucher one is a different consensus
#[       2.55,   0.89,   59.03,  8.86,   38.87,  2.99,   2.44,],
#[       53.49,  65.12,  14.51,  36.43,  6.64,   45.63,  46.40,],
#[       7.42,   4.21,   14.06,  36.10,  4.21,   10.08,  4.54,],
#[       36.54,  29.79,  12.40,  18.60,  50.28,  41.31,  46.62,],
# From Chalkley and Verrijzer 1999
[ 0.00, 0.00, 100.00, 23.21, 28.57, 16.07, 0.00, ],
[ 55.36, 75.00, 0.00, 28.57, 0.00, 42.86, 51.79, ],
[ 0.00, 0.00, 0.00, 26.79, 0.00, 0.00, 16.07, ],
[ 44.64, 25.00, 0.00, 21.43, 71.43, 41.07, 32.14, ],
];

$PWM{MTE} = [
# From 2006 Jin et al
# SVAGCSSRGCGS
[3.4,   24.1,   87.9,   8.6,    1.7,    1.7,    10.3,   43.1,   12.1,   5.2,    1.7,    17.2,   ],
[34.5,  41.4,   3.4,    5.2,    94.8,   41.4,   44.8,   0.0,    8.6,    86.2,   5.2,    34.5,   ],
[60.3,  31.0,   8.6,    74.1,   0.0,    53.4,   44.8,   56.9,   67.2,   3.4,    89.7,   46.6,   ],
[1.7,   3.4,    0.0,    12.1,   3.4,    3.4,    0.0,    0.0,    12.1,   5.2,    3.4,    1.7,    ],
];

$PWM{DPE} = [
# Constructed from EPD
# [GA]G[AT][CT][GCA]
#[  42.2 ,3.2  ,44.7 ,3.7  ,21.8],
#[  2.7  ,6.6  ,8.3  ,47.5 ,30.9],
#[  52.3 ,87.7 ,5.3  ,10.0 ,44.7],
#[  2.8  ,2.5  ,41.7 ,38.9 ,2.7]
# From 2006 Jin et al
[51.7,  0.0,    58.8,   0.0,    21.5,   ],
[0.0,   0.0,    0.0,    55.2,   30.5,   ],
[48.3,  100.0,  0.0,    0.0,    48.0,   ],
[0.0,   0.0,    41.2,   44.8,   0.0,    ],
];

$PWM{BRE} = [
# [GC][GC][GA]CGCC
# Constructed from EPD
#[       0.72,   2.16,   23.26,  5.04,   1.68,   2.40,   5.04,],
#[       53.72,  44.84,  11.03,  80.58,  5.52,   84.41,  77.46,],
#[       42.93,  51.08,  63.31,  10.55,  88.25,  9.83,   11.03,],
#[       2.64,   1.92,   2.40,   3.84,   4.56,   3.36,   6.47,],
# From 2006 Jin et al
[0.0,   0.0,    35.1,   0.0,    0.0,    0.0,    0.0,    ],
[68.9,  67.6,   0.0,    100.0,  0.0,    100.0,  100.0,  ],
[31.1,  32.4,   64.9,   0.0,    100.0,  0.0,    0.0,    ],
[0.0,   0.0,    0.0,    0.0,    0.0,    0.0,    0.0,    ],
];
my $CUTOFF_DEFAULT=0.8;

my $core_only=1;

if ($core_only != 1) {
$PWM{SP1} = [
# from M00195.pwm
[ 29, 22, 12, 15, 00, 00, 17, 1, 00, 19, 15, 2, 8, ],
[ 19, 18, 9, 00, 00, 1, 74, 4, 00, 4, 9, 50, 37, ],
[ 32, 51, 60, 82, 100, 98, 00, 91, 91, 70, 66, 19, 29, ],
[ 18, 7, 17, 00, 00, 00, 8, 1, 7, 5, 8, 26, 25, ],
];

$PWM{SP2} = [
#Matrix Name: V$SP2.01
#Family: V$SP1F (GC-Box factors SP1/GC)
[ 21.05, 31.58, 5.26, 0.00, 0.00, 5.26, 0.00, 5.26, 94.74, 0.00, 21.05, 63.16, ],
[ 5.26, 0.00, 5.26, 0.00, 94.74, 10.53, 0.00, 0.00, 0.00, 110.53, 0.00, 0.00, ],
[ 15.79, 78.95, 100.00, 110.53, 15.79, 94.74, 110.53, 100.00, 15.79, 0.00, 0.00, 0.00, ],
[ 57.89, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 5.26, 0.00, 0.00, 78.95, 26.32, ],
];

$PWM{NRSF} = [
#NRSE: ttCAGCACCacGGAcAGcgcC
#from M00256.pwm
[07, 00, 00, 100, 00, 00, 100, 00, 00, 60, 14, 03, 00, 82, 00, 100, 00, 35, 25, 07, 14],
[07, 14, 96, 00, 00, 96, 00, 100, 100, 00, 71, 00, 00, 03, 92, 00, 00, 60, 14, 57, 78],
[10, 10, 00, 00, 100, 00, 00, 00, 00, 25, 07, 96, 100, 00, 07, 00, 100, 00, 57, 28, 00],
[75, 75, 03, 00, 00, 03, 00, 00, 00, 14, 07, 00, 00, 14, 00, 00, 00, 03, 03, 07, 07],
];

$PWM{NF1} = [
#from M00193.pwm
[22.67, 22.67, 6.67, 0.00, 0.00, 1.33, 6.67, 42.86, 30.67, 21.33, 26.67, 45.33, 13.33, 13.33, 14.67, 40.00, 36.00, 29.33,],
[24.00, 32.00, 14.67, 0.00, 0.00, 0.00, 89.33, 18.57, 26.67, 21.33, 25.33, 9.33, 22.67, 54.67, 53.33, 24.00, 16.00, 18.67,],
[18.67, 18.67, 1.33, 1.33, 100.00, 98.67, 2.67, 11.43, 26.67, 42.67, 24.00, 25.33, 36.00, 12.00, 16.00, 12.00, 28.00, 26.67,],
[34.67, 26.67, 77.33, 98.67, 0.00, 0.00, 1.33, 27.14, 16.00, 14.67, 24.00, 20.00, 28.00, 20.00, 16.00, 24.00, 20.00, 25.33,],
];
$PWM{CCAAT} = [
# from MO0254
[32.00, 18.29, 14.29, 58.29, 29.14, 0.00, 0.00, 100.00, 68.00, 9.71, 13.14, 66.29,],
[31.43, 29.71, 26.86, 0.57, 3.43, 98.86, 99.43, 0.00, 4.57, 0.00, 51.43, 3.43,],
[6.86, 24.57, 13.71, 40.00, 56.57, 0.57, 0.00, 0.00, 12.00, 8.57, 33.71, 29.71,],
[29.71, 27.43, 45.14, 1.14, 10.86, 0.57, 0.57, 0.00, 15.43, 81.71, 1.71, 0.57,],
];
$PWM{GRE} = [
# from M00205
[0.00, 0.00, 0.00, 88.68, 0.00, 70.86, 58.44, 30.18, 31.26, 19.30, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00,],
[10.30, 0.00, 22.13, 0.00, 79.28, 29.14, 11.62, 9.82, 37.01, 0.00, 0.00, 0.00, 32.22, 100.00, 0.00, 16.58,],
[61.08, 71.22, 9.88, 0.00, 0.00, 0.00, 19.64, 20.12, 31.74, 0.00, 100.00, 0.00, 9.58, 0.00, 0.00, 44.27,],
[28.62, 28.78, 67.98, 11.32, 20.72, 0.00, 10.30, 39.88, 0.00, 80.70, 0.00, 100.00, 58.20, 0.00, 100.00, 39.15,],
];
$PWM{IK2} = [
# from M00087
[25.00, 30.56, 13.89, 16.67, 16.67, 0.00, 0.00, 100.00, 50.00, 27.78, 33.33, 25.00,],
[27.78, 13.89, 19.44, 27.78, 0.00, 0.00, 0.00, 0.00, 13.89, 19.44, 22.22, 50.00,],
[33.33, 19.44, 19.44, 0.00, 83.33, 100.00, 100.00, 0.00, 2.78, 16.67, 25.00, 13.89,],
[13.89, 36.11, 47.22, 55.56, 0.00, 0.00, 0.00, 0.00, 33.33, 36.11, 19.44, 11.11,],
];
$PWM{NFE2} = [
# from M00037
[22.22, 22.22, 11.11, 0.00, 0.00, 100.00, 0.00, 0.00, 0.00, 88.89, 11.11,],
[22.22, 0.00, 88.89, 0.00, 0.00, 0.00, 33.33, 0.00, 88.89, 11.11, 44.44,],
[0.00, 77.78, 0.00, 0.00, 100.00, 0.00, 66.67, 11.11, 0.00, 0.00, 0.00,],
[55.56, 0.00, 0.00, 100.00, 0.00, 0.00, 0.00, 88.89, 11.11, 0.00, 44.44,],
];
}

my %ScoreSet;	# scores above cutoff, saved for each pwm in a HoH {matrix}{seqno}
my @Syncom;		# AoH to hold hits for syncom search
my $ci_vector;	# ref to Consensus index vector 

###################################################################################################
# motif offsets from TSS(optional param), HoH. Allow TSS to be 1 nt either side of the given pos
###################################################################################################
my %OFFSET = (
    "TEST"=> {stop=> -10,start=> -20},
    "BRE"=> {stop=> -31,start=> -43},
    "TATA"=> {stop=> -24,start=> -35},
    "INR"=> {stop=> -1,start=> -3},
    "DPE"=> {start=>27,stop=>29},
    "MTE"=> {start=>16,stop=>18},
);

###################################################################################################
# Reversible?
###################################################################################################
my %REVERSIBLE = (
    "TEST"=> 1,
    "BRE"=> 0,
    "TATA"=> 0,
    "INR"=> 0,
    "DPE"=> 0,
    "MTE"=> 0,
    "SP1"=> 1,
    "SP2"=> 1,
    "NRSF"=> 1,
    "NF1"=> 1,
    "CCAAT"=> 1,
    "GRE"=> 1,
    "IK2"=> 1,
    "NFE2"=> 1,
);

###################################################################################################
# Synergistic combinations, hash of hashes
###################################################################################################
my %SYNCOM =  (
	"TATA-INR"=>
	{
	element1=>"TATA",
	element2=>"INR",
	#35-25 TATA -> inr : -9-+2 TATA -> (-35->+2) - (-25->-9)
	minoffs=>16,
	maxoffs=>37,
	},
	"INR-DPE"=>
	{
	element1=>"INR",
	element2=>"DPE",
	minoffs=>28,	# allow 1 bp out, either way (should be exactly +29)
	maxoffs=>30,
	},
	"BRE-TATA"=>
	{
	element1=>"BRE",
	element2=>"TATA",
	minoffs=>5,
	maxoffs=>5,
	},
	"BRE-INR"=>
	{
	element1=>"BRE",
	element2=>"INR",
	minoffs=>21,
	maxoffs=>42,
	},
	"BRE-DPE"=>
	{
	element1=>"BRE",
	element2=>"DPE",
	minoffs=>53,
	maxoffs=>73,
	},
	"TATA-DPE"=>
	{
	element1=>"TATA",
	element2=>"DPE",
	minoffs=>48,
	maxoffs=>68,
	},
	"TATA-MTE"=>
	{
	element1=>"TATA",
	element2=>"MTE",
	minoffs=>38,
	maxoffs=>58,
	},
	"MTE-DPE"=>
	{
	element1=>"MTE",
	element2=>"DPE",
	minoffs=>10,
	maxoffs=>10,
	},
);

my $pwml; # pwm len
my @nseq; # numericised target seq
my @score;
my @MSS;	# array of matrix similarity scores

my ($i,$j,$s);

sub new {
###################################################################################################
# Constructor
###################################################################################################
    my $invocant = shift;
    my $class    = ref($invocant) || $invocant;

    my $self = {
		pwmlib => \%PWM,
    };

    bless $self;
    return $self,$class;
}

sub read_pwmlib {
###################################################################################################
# FIX ME ! Load pwms from file into %PWM HoAoA: @PWM{name} [nt] [pos]
###################################################################################################

	my $fh_pwm = IO::File->new;
	my $pwmf="./cpe.pwms";
	my ($mat,$i,@vals);

	unless ($fh_pwm->open("<$pwmf")) {
		die "ERROR: Unable to open file $pwmf\n";
	}

	while (<$fh_pwm>) {
		next if (/#/);		#skip comments
		if (s/^>//) {
			$mat = $_;
			chomp($mat);
			print "pwm = $mat\n";
			for ($i=0;$i<4;$i++) {	# loop thro sequence
				<$fh_pwm>;
				push @{$PWM{$mat}[$i]}, [ split ];
			}
		}
	}
	$fh_pwm->close;
}

sub scanall {
###################################################################################################
# scan the sequence for each pwm in lib
# Args: ref to numeric sequence array,  cutoff (0=use default), TSS pos (optional, to restrict search)
# Returns: ref to H0H of seqnos/scores above cutoff for each pwm
###################################################################################################
	my ($self,$nseqref,$cutoff,$tss)=@_;
	my ($m,$pwml,$i,$j);

	foreach $m (keys %PWM) {
		$ScoreSet{$m} = scan($self,$m,$nseqref,$cutoff,$tss); #if $m eq 'TEST';
		if ($REVERSIBLE{$m} == 1) {
			$pwml=$#{ $PWM{$m}[0] };	# pwm len
			$j=0;
			for ($i=$pwml;$i>=0;$i--) {	#loop back thro pwm
				$PWM{"$m(-)"} [0] [$j] = $PWM{$m} [3] [$i];
				$PWM{"$m(-)"} [1] [$j] = $PWM{$m} [2] [$i];
				$PWM{"$m(-)"} [2] [$j] = $PWM{$m} [1] [$i];
				$PWM{"$m(-)"} [3] [$j] = $PWM{$m} [0] [$i];
				$j++;
			}
			$ScoreSet{"$m(-)"} = scan($self,"$m(-)",$nseqref,$cutoff,$tss); #if $m eq 'TEST';
		}
	}
	return \%ScoreSet;
}

sub scan {
###################################################################################################
# Slide along sequence scoring each pos for PWM, adding scores above cutoff to the matrix scoreset
# Args: PWM name, ref to numeric sequence array, cutoff (0=use default),TSS pos (optional, to restrict search)
# Returns: ref to Hash of seqnos/scores above cutoff
###################################################################################################

	my ($self,$m,$nseq,$cutoff,$tss)=@_;
	$tss-- if (defined ($tss)); # TSS index is one less than the pos given by user

	$pwml=$#{ $PWM{$m}[0] };	# pwm len
	my ($max,$k,$cval,$cmax,$start,$stop);	# used to calc matrix similarity score
	
	$ci_vector = calc_ci_vector($m);

	#print "PWM = $m, len = " . ($pwml+1) . "\n";

	my $seqlen=$#{$nseq};

	if ($cutoff == 0 ) {
		$cutoff = $CUTOFF_DEFAULT;
	}

	if (defined ($tss) && defined $OFFSET{$m} ) {
		$start = $tss + $OFFSET{$m}{start};
		$stop = $tss + $OFFSET{$m}{stop};
		print "Searching for $m motif starting at pos " . ($start+1) . " to " .  ($stop+1) . "\n";
	}
	else {
		$start = 0;
		$stop = $seqlen-$pwml;
	}

	for ($i=$start;$i<=$stop;$i++) {	# loop thro sequence
		my $nt= $$nseq[$i];
		#print "NT $i=$nt  ";
		$s=$i;
		$score[$i]=1;	# initialise score to 1
		$MSS[$i]=0;		# initialise matrix similarity scor
		$cval=0;
		$cmax=0;

		for ($j=0;$j<=$pwml;$j++) {	#loop thro pwm

			# find most frequent nt
			$max=0;
			for ($k=0;$k<4;$k++) {
				$max = $PWM{$m} [$k] [$j] if ($PWM{$m} [$k] [$j] > $max);
			}
			
			if ($$nseq[$s] == 9 || $PWM{$m} [$$nseq[$s]] [$j] <= 2) {	# abandon if any site < 2%
				#print " Bad NT or zero score, quitting";
				$score[$i]=-9;
				next;
			}	
			next if $score[$i]==-9;	# fall out to next pos in sequence

			#print ":" . $$nseq[$s] . "/" . $j . "=" . $PWM{$m} [$$nseq[$s]] [$j];
			$score[$i] += log ( ($PWM{$m}[$$nseq[$s]][$j] / 100) / 0.25); #  log-odds ratio against random model 
			$cval += $$ci_vector[$j] * $PWM{$m}[$$nseq[$s]][$j];
			$cmax += $$ci_vector[$j] * $max;
	
			$s++;
		}
		$MSS[$i] = $cval / $cmax if $score[$i] != -9;
		#print " SCORE=$score[$i];MSS = $MSS[$i]\n";
		if ($MSS[$i] > $cutoff ) {
			$ScoreSet{$m}{$i} = $MSS[$i];
		}
	}

	return $ScoreSet{$m};

} #scan

sub calc_ci_vector {
###################################################################################################
# Calculates the CI (Conservation Index) vector for a matrix, after Frech 93
# 	Ci(i) = (100 / ln5) * ( sum(P(i,b) * ln P(i,b)) + ln5)
# Args: name of PWM
# Returns: ci_vector array ref
###################################################################################################

	my ($m)=@_;

	my ($sum,$freq,@vector);

	$pwml=$#{ $PWM{$m}[0] };	# pwm len

	for ($i=0;$i<=$pwml;$i++) {	#loop thro pwm

		$sum = 0;
		for ($j=0;$j<4;$j++) {
			$freq = $PWM{$m}[$j][$i]/100;
			$sum += $freq * log($freq) if $freq > 0;
		}

		$sum += log(5);
		$vector[$i] = (100 / log(5)) * $sum;
		#print "$m, vector $i =  $vector[$i] \n";
	}
	return \@vector;
}


sub searchcomb {
###################################################################################################
# scan the sequence for all sysnergistic combinations
# Args: ref to numeric sequence array,  TSS pos (optional, to restrict search), cutoff (0=use default)
# Returns: AoH results set
###################################################################################################
	my ($self,$nseqref,$cutoff,$tss)=@_;

	#first build the individual scores 
	my $m;
	foreach $m (keys %PWM) {
		$ScoreSet{$m} = scan($self,$m,$nseqref,$cutoff,$tss); #if $m eq 'TEST';
	}

	my $c;
	foreach $c (keys %SYNCOM) {
		&findcomb($c);
	}
	return \@Syncom;
}

sub findcomb {
###################################################################################################
# Find combinations of motifs within defined max and minimum offsets using generated scoresets. 
# Called by searchcomb().
# Args: key to hash containing parameters of combination
###################################################################################################

	my ($comb)=@_;
	#print "Scanning for $comb synergistic combination...\n";

	my $e1 = $SYNCOM{$comb}{element1};
	my $e2 = $SYNCOM{$comb}{element2};
	my $min = $SYNCOM{$comb}{minoffs};
	my $max = $SYNCOM{$comb}{maxoffs};
	my $pwml1=$#{ $PWM{$e1}[0] };	# pwm len, 1st element
	my $pwml2=$#{ $PWM{$e2}[0] };	# pwm len, 2nd element

	# let predicted tss = middle of min/max offset for second element in combo
	my $tss =  ($OFFSET{$e2}{stop}+$OFFSET{$e2}{start})/2;
	$tss *=-1 if $OFFSET{$e2}{stop} <0;
	#my $tss = int((abs $OFFSET{$e2}{stop})+(abs $OFFSET{$e2}{start}));

	my ($k1,$k2,$to);
	my $i=$#Syncom + 1;	# position ind at end of @Syncom
	foreach $k1 (sort{$a <=> $b}(keys(%{$ScoreSet{$e1}}))) {	# access element 1 hits in pos order
			foreach $k2 (sort{$a <=> $b}(keys(%{$ScoreSet{$e2}}))) {	# access element 2 hits in pos order

				if ( (($k2-$k1) >= $min ) && (($k2-$k1) <= $max ) ) {
					#print " $e1 at $k1 -> ";		# . substr($seq,$k1,$pwml1+1);
					#print ", $e2 at $k2 -> ";	# . substr($seq,$k2,$pwml2+1);
					#print ": Combined Score: " . ($ScoreSet{$e1}{$k1} + $ScoreSet{$e2}{$k2}) . "\n";
					#$to = ($k2+$pwml2+1) - $k1;
					#print substr($seq,$k1,$to) . "\n";
					$Syncom[$i]{e1} = $e1;
					$Syncom[$i]{p1} = $k1;
					$Syncom[$i]{e2} = $e2;
					$Syncom[$i]{p2} = $k2;
					$Syncom[$i]{score} = $ScoreSet{$e1}{$k1} + $ScoreSet{$e2}{$k2};
					$Syncom[$i]{tss} = $k2 - $tss;
					$i++;
				}
			} #k2
	} #k1


} #findcomb

sub getkeys {
###################################################################################################
# return an array ref of the PWM lib keys
# Args: none
# Returns: array ref
###################################################################################################

	my $keys = [(keys %PWM)];
	return $keys;

}

sub pwmlen {
###################################################################################################
# return length of PWM in lib
# Args: pwm name
# Returns: length
###################################################################################################

	my ($self,$m)=@_;
	return $#{ $PWM{$m}[0] } + 1;

}

sub pwmcutoff {
###################################################################################################
# return default cutoff score 
# Args: none
# Returns:  cutoff score
###################################################################################################

	my ($self)=@_;

	return $CUTOFF_DEFAULT;
}

sub getlib() {
###################################################################################################
#return a handle to the matrices
###################################################################################################
	return \%PWM;
}
1;
