#!/usr/bin/perl
#
# Copyright 2003 Sashidhar Gadiraju, Peter K. Rogan
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#
#
#Program : scandiff
#Version : 1.2
#1.2 : producing a combined BED file
#1.1 : removing the chromosome checking based on 23, 22 
#	for making it work with multiple organisms
#1.0 : stabilized 0.10.1 as 1.0
#0.10.1 : changed the processing of the command line a bit
#0.9 : Creating a single inst file instead of different delila calls for each piece of coordinate
#0.8 : Added the stadens's substitution
#0.7 : Removing some bugs from 0.6
#0.6 : Reading from two ribls instead of from a single ribl,completed
#0.5 : Adding the help for the command line
#0.4 : Adding the option of -z for comparing zscores
#		 Also computing NL, num of seq for each of the columns
#		 Included is the -c flag to take the chromosome name
#		 as this may not be found in the scan output
#		 Also included the processing of BED files
#0.2 : Adding the code for the comparison of BS with same coordinates. 
#The method for comparison is the statistical method based on confidence intervals
#Description 
#Take two scan output files and take compare their results based on the change in Binding Sites
#
use strict;
#use IO::Handle;	#to let file handles use the ->autoflush option 
use Getopt::Long;
Getopt::Long::Configure("no_ignore_case");
#main subroutine declarations
my %OPT = {};	#the options variable
#datapos stores the positions of fields(the first pos is zero) in data file,"the output of scan"
my %datapos={};
$datapos{'coord'}=3;
$datapos{'dir'}=4;
$datapos{'Ri'}=5;
$datapos{'Z'}=6;
$datapos{'accession'}=10;	#position of the accession leader in psdataop file 
my %ribl1={};	#to store the parameters read from ribl1 
my %ribl2={};	#to store the parameters read from ribl2 
my @RIM1;	#to hold the Ri values of a,c,g,t from the ribl matrix 1
my @RIM2;	#to hold the Ri values of a,c,g,t from the ribl matrix 2
my @FRM1; #to hold the frequencies of a,c,g,t from the ribl matrix 1
my @FRM2; #to hold the frequencies of a,c,g,t from the ribl matrix 2

#these 3 indicate which output file to produce, defined from -o
my $opf = 1,my $bedf=1;	#BS in only first file
my $ops = 1,my $beds=1; 	#BS in only second file
my $opb = 1,my $bedb=1; 	#BS common to both the files
my $opc = 1;
#the below variables hold the output file descriptors pointers
my $IP1, my $IP2;
my $INST, my $BK;
my $OTD1, my $ITD1, my $OTD2, my $ITD2;	#temporary data files
my $OPF, my $OPS, my $OPB1, my $OPB2;
my $BEDF, my $BEDS, my $BEDB;
my $RIBL1="ribl1";	#name of ribl file 1
my $RIBL2="ribl2";	#name of ribl file 2
#process the command line options
&procopts();
#open the input and output files
&openfiles();
#print "done with openfiles()\n";
&readribl($RIBL1,\%ribl1,\@RIM1,\@FRM1);
&readribl($RIBL2,\%ribl2,\@RIM2,\@FRM2);
#run the comp subroutine
&comp(\%ribl1, \%ribl2, \@FRM1, \@FRM2);

#close all the open files
print "Closing all file descriptors\n";
closef( $IP1, $IP2, $OPF, $OPS, $OPB1, $OPB2, $INST, $OTD1, $OTD2) ;
closef( $BEDB, $BEDF, $BEDS );

#combine the generated files if specified
if( $opc )
{	combineBED();	}		
print "End of $0 execution\n";
#END OF MAIN SUB

sub getpos
{
	my $par= shift;
	return $datapos{$par};
}
sub getline()
{
	my $file = shift;	#get the file name
	my $ipline;
	my $flag = 1;
	while( $flag)
	{
		if( $ipline = <$file>)
		{
			my $line = &trim($ipline);
			if( $line =~ /^\*/ )	{}
			else	{	$flag = 0; }
		}
		else{ return 0 };
	}
	#print $ipline;
	return $ipline;
}#getline()

sub comp()
{
	my ($ribl1, $ribl2, $frm1, $frm2)  = @_;
	if( $opf )
	{
		&printcomments($IP1, $OPF) if ( &getparams('D') );
		&printBEDheader(&getparams('c'), $BEDF, $ribl1, "100,50,0") if( &getparams('B') );
	}
	if( $ops )
	{
		&printcomments($IP2, $OPS) if ( &getparams('D') );
		&printBEDheader(&getparams('c'), $BEDS, $ribl1, "0,60,120") if( &getparams('B') );
	}
	if( $opb )
	{
		#the comments to opb1 and opb2 can be from ip1 or ip2 or both 
		#as opb1 and opb2 differ only in the ri, z score value
		#everything else should be same
		&printcomments($IP1, $OPB1) if ( &getparams('D') );
		&printcomments($IP1, $OPB2) if ( &getparams('D') );
		&printBEDheader(&getparams('c'), $BEDB, $ribl1, "128,128,128") if( &getparams('B') );
		&printINSTheader($INST);
	}

	my $firstcnt=0;	#sites in first file
	my $seccnt=0;	#sites in second file
	my $bothtotcnt=0;	#total sites common to both the files
	my $bothsigcnt=0;	#sites in both the files, with sig. diff
	#todo print the headers of bed files
	my $l1=&getline($IP1); 
	my $l2=&getline($IP2);
	#read a line while both the files are non-empty
	while ( $l1  && $l2) 
	{
		$l1 = &trim($l1);
		$l2 = &trim($l2);
		my @list1 = split(/\s+/, $l1);
		my @list2 = split(/\s+/, $l2);
		#if both BS have same coord, compare for significant changes
		my $coord1 = $list1[ $datapos{'coord'} ];
		my $coord2 = $list2[ $datapos{'coord'} ];
		
		#print $coord1." ". $coord2. "\n";
		if( $coord1 == $coord2 )
		{
			$bothtotcnt++;
			my $res=0;
			$res=&sameBS($l1, $l2, $OPB1, $OPB2, $BEDB, $ribl1, $ribl2, $frm1, $frm2) unless ( !$opb);
			$bothsigcnt++ if($res);
			#forward both the file pointers
			$l1= &getline($IP1); 
			$l2=&getline($IP2);
		}#if same BS
		elsif($coord1 < $coord2)
		{
			&singleBS($l1, $OPF, $BEDF, $ribl1) unless( !$opf );
			#forward the first file only
			$l1=&getline($IP1); 
			$firstcnt++;
		}
		elsif($coord1 > $coord2)
		{
			&singleBS($l2, $OPS, $BEDS, $ribl2 ) unless (!$ops);
			#forward the second file only
			$l2=&getline($IP2) ;
			$seccnt++;
		}
	}#while both files are open

	#if only the first file is open 
	if($opf)
	{
		while( $l1 )
		{
			$l1 = &trim($l1);
			&singleBS($l1, $OPF, $BEDF, $ribl1) unless( !$opf );
			$l1 = &getline($IP1);
			$firstcnt++;
		}
	}
	
	#if only the second file is open
	if($ops)
	{
		while( $l2 )
		{
			$l2 = &trim($l2);
			&singleBS($l2, $OPS, $BEDS, $ribl2) unless( !$ops );
			$l2 = &getline($IP2);
			$seccnt++;
		}
	}
	# enable  autoflush for intermediate files
	my $oldfh=select($OTD1);	$|=1;
	select($OTD2);	$|=1;
	select($INST);	$|=1;
	select( $oldfh );

	closef($OTD1, $OTD2, $INST);
	#system("tail -10 tmpd2");

	#now process all the common entries for comparison by confidence intervals
	if( &getparams('s') && $opb)	# both SD comp and print common sites file
	{
		my $result = system("delila");
		#&err("Error in extracting the dna pieces using delila") and exit if($result);
	
		open( ITD1, "tmpd1") || die "cannot open input temporary file tmpd1";
		$ITD1=\*ITD1;
		open( ITD2, "tmpd2") || die "cannot open input temporary file tmpd2";
		$ITD2=\*ITD2;
		open( BK, "book") || die "cannot open input file book";
		$BK=\*BK;
		#todo print the headers of bed files
		my $l1=&getline($ITD1); 
		my $l2=&getline($ITD2);
		#read a line while both the files are non-empty
		#print "before while loop\n";
		my $comcnt = 0; #the number of common sites with significant difference
		while ( $l1  && $l2) 
		{
			$l1 = &trim($l1);
			$l2 = &trim($l2);
			my @list1 = split(/\s+/, $l1);
			my @list2 = split(/\s+/, $l2);
			#if both BS have same coord, compare for significant changes
			my $coord1 = $list1[ $datapos{'coord'} ];
			my $coord2 = $list2[ $datapos{'coord'} ];
			
			#print $coord1." ". $coord2. "\n";
			#print "line1 = $l1\n";
			#print "line2 = $l2\n";
			if( $coord1 == $coord2 )
			{
				my $res=0;
				$res=&sameBSsd($l1, $l2, $OPB1, $OPB2, $BEDB, $ribl1, $ribl2, $frm1, $frm2, $BK) unless ( !$opb);
				$bothsigcnt++ if($res);	#if there was a signific. change, increment the count
				#forward both the file pointers
				$l1= &getline($ITD1); 
				$l2=&getline($ITD2);
			}#if same BS
			else
			{
				print "Error in reading from data temporary files : coordinates do not match\n";
				print "coord1 = $coord1 ; coord2 = $coord2\n";
				$l1= &getline($ITD1); 
				$l2=&getline($ITD2);
			}
		}#second while loop
		closef( $ITD1, $ITD2);
		unlink 'tmpd1', 'tmpd2';

	}#if comparison is by confidence intervals
	print $OPF "* $firstcnt sites found with scandiff in only the first scan results\n" 
		if($opf && &getparams('D') );
	print $OPS "* $seccnt sites found with scandiff in only the second scan results\n" 
		if($ops && &getparams('D') );
	print $OPB1 "* $bothsigcnt sites found with scandiff,significantly different, common to both the scan results\n" 
		if($opb  && &getparams('D') );
	print $OPB2 "* $bothsigcnt sites found with scandiff,significantly different, common to both the scan results\n" 
	if($opb && &getparams('D') );
	print $OPB1 "* $bothtotcnt sites found with scandiff, common to both the scan results\n" 
		if($opb && &getparams('D') );
	print $OPB2 "* $bothtotcnt sites found with scandiff, common to both the scan results\n" 
		if($opb && &getparams('D') );

	
}# comp()

sub combineBED
{
	if(! &getparams('B') )	# nothing to combine
	{
		print "No BED files to combine\n";
		open(F, ">BED.txt") || die "Cannot create empty file BED.txt\n";
		close F;		
	}
	else
	{
		my $bedc = "BED.txt";
		my @bedlist = ();
		push(@bedlist, 'BEDb.txt') if($opb);
		push(@bedlist, 'BEDf.txt') if($opf);
		push(@bedlist, 'BEDs.txt') if($ops);
		my $bedfile = shift(@bedlist);
		print "No BED files to combine\n" and return if(! $bedfile);
		my $res=system("cp $bedfile $bedc");
		print "WARNING: in combining the BED files: $!\n" and return
			if($res);
		while( my $bf = shift(@bedlist) )
		{	
			$res=system(" sed  '1d' $bf >>$bedc");	
			print "WARNING: in combining the BED file $bf: $!\n" if($res);
		}
	}
}# combineBED()
		
sub singleBS()
{
	my ($l1 , $file1, $bed1, $ribl) = @_;
	print $file1 "$l1\n" if( &getparams('D') );	#print the line to output data file
	printopBEDline($l1, $bed1, $ribl)  if( &getparams('B') );	#print the line to output bed file
}#singleBS

sub sameBS()
{
	my ( $l1, $l2, $file1, $file2, $bedfile, $ribl1, $ribl2, $frm1, $frm2) = @_;
	my @list1 = split( /\s+/, $l1);
	my @list2 = split( /\s+/, $l2);
	my $ri1 = $list1[ $datapos{'Ri'} ]; 
	my $co1 = $list1[ $datapos{'coord'} ];
	my $ri2 = $list2[ $datapos{'Ri'} ]; 
	my $co2 = $list2[ $datapos{'coord'} ];
	my $dir1 = $list1[ getdatapos('dir') ];
	my $dir2 = $list2[ getdatapos('dir') ];
	if( $dir1 ne $dir2 ) 
	{	print "For sites at $co1, direction is different in both the versions\n";	}
	#print "l1: $l1\n";
	#print "l2: $l2\n";
	#print "ri1=",$ri1," co1=",$co1," $ri2=", $ri2," co2=",$co2," \n";
	my $printflag = 0;	#indicates whether a significant change has occured
	my $res=0;	#indicate whether a sig. change or not
	if( &getparams('s') )	#if the comparison method is standard deviation
	{		
		my $rfrom = getriblpar($ribl1, 'from');
		my $rto = getriblpar($ribl1, 'to');
		#&usage("cannot find program xtrpiece") and exit unless ( -e "xtrpiece" );		
		#my $xtrstr = "perl xtrpiece.pl $co1 $rfrom $rto";
		#print "Calling xtrpiece : $xtrstr\n";
		#my $result = system($xtrstr);
		#&err("Error in extracting the dna piece using xtrpiece") and exit if($result);
		#if( ! -f "piece" ) { die "Cannot find file \'piece\' \n";	}
		#rename("piece","piece1") || die "Cannot rename file piece to piece1";

		printINSTline( $co1, $rfrom, $rto, $dir1 );
		print $OTD1 "$l1\n";
		print $OTD2 "$l2\n";
		
		
		#$rfrom = getriblpar($ribl2, 'from');
		#$rto = getriblpar($ribl2, 'to');
		#my $xtrstr = "perl xtrpiece.pl $co2 $rfrom $rto";
		#print "Calling xtrpiece : $xtrstr\n";
		#my $result = system($xtrstr);
		#&err("Error in extracting the dna piece using xtrpiece") and exit if($result);
		#if( ! -f "piece" ) { die "Cannot find file \'piece\' \n";	}
		#rename("piece","piece2") || die "Cannot rename file piece to piece1";

		#open(P1, "piece1") || die "cannot open piece1\n";
		#my @p1 = <P1>;
		#my $p1 = join("",@p1);
		#undef @p1;
	
		#open(P2, "piece2") || die "cannot open piece2\n";
		#my @p2 = <P2>;
		#my $p2 = join("",@p2);
		#undef @p2;
		
		#my $p2 = $p1;	#remove the additional xtrpiece call
		#my @conf1 = &CI($p1, $ribl1, $frm1);	#get the first confidence interval
		#my @conf2 = &CI($p2, $ribl2, $frm2);	#get the second confidence interval
	
		#if( ! &overlap(\@conf1, \@conf2))	#if conf int do not overlap, a significant BS
		#{	$printflag = 1;	}
			
	}
	elsif ( &getparams('b'))	#if comparison is to be done by Ri bit diff
	{	
		#print $ri1." ". $ri2."\n";
		if( abs($ri1 - $ri2) > &getparams('b') )	#a significant BS
		{	$printflag = 1;	}
	}
	elsif( &getparams('z') )	#if comparison is to be done by z scores
	{
		my $z1 = $list1[ $datapos{'Z'} ];
		my $z2 = $list2[ $datapos{'Z'} ];
		if( abs($z1 - $z2) > &getparams('z') )	#a significant BS
		{	$printflag = 1;	}
	}
	
	if(  $printflag )		#if there is a significant change, print op files
	{
		$res=1;
		#print the data file 
		if( &getparams('D') ) {
			print $file1 "$l1\n";
			print $file2 "$l2\n";
		}
		#print the BED file for the BS with the largest Ri Value
		if( &getparams('B') ) {
			if( $ri1 >= $ri2 )
			{	printopBEDline($l1, $bedfile, $ribl1);	}
			else
			{	printopBEDline($l2, $bedfile, $ribl2);	}
		}#if: inner
	}#if: outer
		
}#sameBS()


#same BS with the option of SD
sub sameBSsd()
{
	my ( $l1, $l2, $file1, $file2, $bedfile, $ribl1, $ribl2, $frm1, $frm2, $book) = @_;
	my @list1 = split( /\s+/, $l1);
	my @list2 = split( /\s+/, $l2);
	my $ri1 = $list1[ $datapos{'Ri'} ]; 
	my $co1 = $list1[ $datapos{'coord'} ];
	my $ri2 = $list2[ $datapos{'Ri'} ]; 
	my $co2 = $list2[ $datapos{'coord'} ];
	my $dir1 = $list1[ getdatapos('dir') ];
	my $dir2 = $list2[ getdatapos('dir') ];
	if( $dir1 ne $dir2 ) 
	{	print "For sites at $co1, direction is different in both the versions\n";	}
	#print "l1: $l1\n";
	#print "l2: $l2\n";
	#print "ri1=",$ri1," co1=",$co1," $ri2=", $ri2," co2=",$co2," \n";
	my $printflag = 0;	#indicates whether a significant change has occured
	my $res=0;	#whether a sig. change or not
	if( &getparams('s') )	#if the comparison method is standard deviation
	{
		my $p1 = getdnapiece($book);
		
		my $p2 = $p1;	#remove the additional xtrpiece call
		my @conf1 = &CI($p1, $ribl1, $frm1);	#get the first confidence interval
		my @conf2 = &CI($p2, $ribl2, $frm2);	#get the second confidence interval
	
		if( ! &overlap(\@conf1, \@conf2))	#if conf int do not overlap, a significant BS
		{	$printflag = 1;	}
			
	}
	
	if(  $printflag )		#if there is a significant change, print op files
	{
		$res=1;
		#print the data file 
		if( &getparams('D') ) {
			print $file1 "$l1\n";
			print $file2 "$l2\n";
		}
		#print the BED file for the BS with the largest Ri Value
		if( &getparams('B') ) {
			if( $ri1 >= $ri2 )
			{	printopBEDline($l1, $bedfile, $ribl1);	}
			else
			{	printopBEDline($l2, $bedfile, $ribl2);	}
		}#if: inner
	}#if: outer
	return $res;	
}#sameBSsd()

#retrieve the dna 
sub getdnapiece
{
	my ($fip) = shift;
	my $return = ""; 
	my $flag = 0;	#condition for loop exit
	while( my $libline = <$fip>)
	{	if( $libline =~ /^dna/)	{	$flag = 1; last;	}	}
	&err("Error in parsing book to retrieve dna sequence")	and exit
		unless( $flag );
	$flag = 0;
	while( my $libline = <$fip>)
	{
		$libline = &trim( $libline);
		if( $libline =~ /^\*\s*(.*)$/)
		{
			my $temp = &trim( $1 );
			$return .= $temp;
		}
		elsif( $libline =~ /^\s*dna/ )
		{	$flag = 1; last;	}
		else	{	$flag =0; last;	}
	}
	&err("Warning: Could not retrieve complete dna sequence from book") 
		unless($flag);
	if( $return eq "" )
	{	return 0;	}
	else
	{	return $return;	}

}#getdnapiece()



sub getparams
{
	my $opt = shift;
	return $OPT{ $opt };
}#getparams()

#remove leading and trailing ws from a line
sub trim
{
	my $ret = shift;
	$ret =~ s/^\s+//;	#remove leading whitespaces
	$ret =~ s/\s+$//;	#remove trailing whitespaces
	return $ret;
}#trim()

#display the usage of this program
sub usage
{
	my $errmesg = shift;
	print "\n";
	print "***  $errmesg  ***\n";
	print "usage: $0 -[bcfoszBD]  [FILE1 [FILE2] ]
	
	INPUT FILES NEEDED
	ribl1, ribl2
	
	-f/--file filename,
		Give this option if the options are given in a file.
		Enter the options just as you would enter on the command line
		The options given in the file are inserted at the start of the 
		actual commandline.
	
	INPUT OPTIONS
	FILE1 first input data file
		Default : data1
	FILE2 second input data file
		Default : data2
	-c/--chrm chromosome, values [for humans: 1..22,X,Y]
		This option is needed for producing the BED file
		Default : 1
		
	COMPARISON OPTIONS
	-s/--sd Number of standard deviations 
	-b/--bits Significant difference in bits
	-z/--zscore Significant difference in zscore
	
	OUTPUT OPTIONS
-o/--opstring Output string, values [F/f,S/s,B/b,C/c]
		Capital letter produces output
		Small letter supresses output
		F/f Produce data/BED results for binding sites(BS) in first file only
			The files generated are dataf/BEDf.txt
		S/s Produce data/BED results for binding sites(BS) in second file only
			The files generated are datas/BEDs.txt
		B/b Produce data/BED results for binding sites(BS) common to both the files
			The files generated are datab/BEDb.txt
		C/c Produce the BED file combining the above defined BED files 
			The file BED.txt is generated
		Note: C combines only the other BED files generated.
		For ex: For '-o FsBC', the combined BED.txt file includes BEDf.txt and
		BEDb.txt only.
		
		This option is used in combination with -D/-B
		Default : FSBC 

	-D/--Data Produce data files
		Absence of this flag results in the supression of output data files
		
	-B/--BED Produce BED files
		Absence of this flag results in the supression of output BED files
	
	BED FILES GENERATION
		Common sites BED Track Color : 128,128,128	(Grey)
		First only sites BED Track Color : 100,50,0	(Brown)
		Second only sites BED Track Color : 0,60,120	(Blue)
	
	Notes: $0 takes the data output file 'psdataop' of promotsite as the input and 
	NOT the 'data' from scan.

	Examples:

	1.  $0 -c 10 -s 1 -o FsBC -D -B data1 data2
			Input files are data1 and data2
			Chromosome 10
			Number of Standard Deviations 1
			Produce data/BED output for BS in data1 only and for BS common to data1 and data2
			Produce the combined BED file for 'F' and 'B' output BED files
			Suppress data/BED output for BS in data2 only
			Produce output data files
			Produce output BED files\n";
}

sub initopts
{
	$OPT{'D'} = 0;	#controls the output data file generation
	$OPT{'B'} = 0;	#controls the output BED file generation
	$OPT{'f'} = 0;	#specifies whether the options should be specified thru a file
	$OPT{'C'} = "128,128,128";	#the output BED track color
	GetOptions(	
					"chrm=s" => \$OPT{'c'},
					"sd=f" => \$OPT{'s'},
					"bits=f" => \$OPT{'b'},
					"zscore=f" => \$OPT{'z'},
					"opstring=s" => \$OPT{'o'},
					"Data" => \$OPT{'D'},
					"BED" => \$OPT{'B'},
					"file=s" => \$OPT{'f'}
				);
					# removing the below 3 options
					# "1=s" => \$OPT{'1'},
					# "2=s" => \$OPT{'2'} 
					#"COLOR=s" => \$OPT{'C'},
}#initopts()

#process the input options
sub procopts()
{
	#print %OPT;
	#$OPT{'1'} defines the name of scan 1 input file  
	#$OPT{'2'} defines the name of scan 2 input file  
	#print "@ARGV";
	#print "\n";
	initopts();
	
	if( $OPT{'f'} )
	{
		#@ARGV=();	#reset ARGV to empty array list
		open( F, "$OPT{'f'}" ) || die "cannot open $OPT{'f'}\n";
		my $cmdline = "";
		foreach my $line( <F> )
		{ 
			$cmdline .= "  " . trim($line);
		}
		my @list = split( /\s+/, &trim($cmdline) );
		unshift(@ARGV, @list);
		print "command line options after reading ",$OPT{'f'},"  : \n";
		print "@ARGV";
		print "\n";
		
		#print "@ARGV";
		#print "\n";
		%OPT = {};	#reset OPT
		initopts();	#call this once again
		if( $OPT{'f'} )
		{	print "-f option is not allowed in a file\n";	exit;	}
	}
	
	if( !defined $OPT{'s'} && !defined $OPT{'b'} && !defined $OPT{'z'})
	{	&usage("") and exit;	}
	my $tmpcnt = 0;
	$tmpcnt++ if( defined $OPT{'s'} ) ;
	$tmpcnt++ if( defined $OPT{'b'} ) ;
	$tmpcnt++ if( defined $OPT{'z'} ) ;
	
	if( $tmpcnt >1 )
	{	&usage("-b , -s and -z are mutually exclusive options") and exit;	}
	if( !defined $OPT{'c'} ){
		warn("chromosome name not specified with -c or --chrm option\n");
		warn("           taking default value of 1\n");
		$OPT{'c'} = "1";
	}
	else	#chrm specified with -c
	{
		my $tch = lc( $OPT{'c'} );
		#commenting the below lines to accomodate organisms other than 'humans'
		#$tch = 23 if( $tch eq "x" );
		#$tch = 24 if( $tch eq "y" );
		#&usage("Incorrect chromosome value for -c/--chrm option") and exit 
		#	if( $tch<1 || $tch>24 ); 
		#$OPT{'c'} = "X" if($tch == 23);
		#$OPT{'c'} = "Y" if($tch == 24);
	}
	if( defined $OPT{'s'}){	
		&usage("sd range : [1,3]") and exit unless ($OPT{'s'}>=1 && $OPT{'s'}<=3);	}
	if( ! defined $OPT{'o'}){
		warn "-o/--output not specified, taking default value of \'FSB\'\n";	
		$OPT{'o'} = "FSB";	}
	#print @ARGV;
	print "\n";
	if(@ARGV)	{$OPT{'1'} = shift @ARGV;	}
	if( !defined $OPT{'1'} ){
		warn("file1 not specified, taking default file name \'data1\'\n");
		$OPT{'1'} = "data1";	}
	if(@ARGV)	{$OPT{'2'} = shift @ARGV;	}
	if( !defined $OPT{'2'} ){
		warn("file2 not specified, taking default file name \'data2\'\n");
		$OPT{'2'} = "data2";	}

	#Now print the parameters with which this program is running
	print "\n*****************PROGRAM PARAMETERS*************************\n\n";
	
	print "Scan 1 data file name : $OPT{'1'}\n";
	print "Scan 2 data file name : $OPT{'2'}\n";
	print "Chrm: $OPT{'c'}\n";
	my $compname;
	$compname = "confidence interval" if ( defined $OPT{'s'} );
	$compname = "bits" if ( defined  $OPT{'b'} );
	$compname = "Z scores" if ( defined $OPT{'z'} );
	print "Comparison Method : By $compname\n";
	print "Output Print String : $OPT{'o'}\n";
	if( $OPT{'D'} )
	{	print "Printing output data files\n";	} 
	else {	print "Skipping output data files\n";	}
	if( $OPT{'B'} )
	{	print "Printing output BED files\n";	} 
	else {	print "Skipping output BED files\n";	}
	print "\n******************END OF PARAMETERS***********************\n\n";
	
}#procopts	

sub openfiles()
{
	#open input files
	open (IP1, &getparams('1') ) || die "Cannot open file \'".&getparams('1')."\'\n";
	open (IP2, &getparams('2') ) || die "Cannot open file \'".&getparams('2')."\'\n";
	$IP1=\*IP1;	$IP2=\*IP2;
	
	#open necessary output files
	my $opstr = &getparams('o');
	open(INS, ">inst") || die "cannot open output file inst\n";
		$INST=\*INS;
	open(OTD1, ">tmpd1") || die "cannot open temporary file tmpd1";
		$OTD1=\*OTD1;
	open(OTD2, ">tmpd2") || die "cannot open temporary file tmpd2";
		$OTD2=\*OTD2;

	if( rindex($opstr, "F" ) >= 0)	# first scan output
	{	$opf = 1;	}
	elsif( rindex($opstr, "f" ) >= 0)
	{	$opf = 0;	}

	if( rindex($opstr, "S" ) >= 0)	# second scan output
	{	$ops = 1;	}
	elsif( rindex($opstr, "s" ) >= 0)
	{	$ops = 0;	}

	if( rindex($opstr, "B" ) >= 0)	# common scan output
	{	$opb = 1;	}
	elsif( rindex($opstr, "b" ) >= 0)
	{	$opb = 0;	}
	
	if( rindex($opstr, "C" ) >= 0)	# combined BED output
	{	$opc = 1;	}
	elsif( rindex($opstr, "c" ) >= 0)
	{	$opc = 0;	}
	
	my $nullf="/dev/null";
	if( &getparams('D') )
	{
		if( $opf > 0)
		{ 	open (OPF, ">dataf") || die "cannot open output file data1\n";	}
		else{ 	open (OPF, ">$nullf") || die "cannot open null output file $nullf\n";	}			
		$OPF=\*OPF;
		if( $ops > 0 )
		{ open (OPS, ">datas") || die "cannot open output file data2\n"; }
		else{ 	open (OPS, ">$nullf") || die "cannot open null output file $nullf\n";	}
		$OPS=\*OPS;
		if( $opb > 0 )
		{ 
			open (OPB1, ">datab1") || die "cannot open output file data1\n";
			open (OPB2, ">datab2") || die "cannot open output file data2\n";
		
		}
		else
		{
			open (OPB1, ">$nullf") || die "cannot open null output file $nullf\n";
			open (OPB2, ">$nullf") || die "cannot open null output file $nullf\n";
		}
		$OPB1=\*OPB1; $OPB2=\*OPB2;
	}
	
	if( &getparams('B') )
	{
		if( $opf > 0)
		{open (BEDF, ">BEDf.txt") || die "cannot open output file data1\n";}
		else{ 	open (BEDF, ">$nullf") || die "cannot open null output file $nullf\n";	}
		$BEDF=\*BEDF;
		if( $ops > 0 )
		{open (BEDS, ">BEDs.txt") || die "cannot open output file data2\n"; }
		else{ 	open (BEDS, ">$nullf") || die "cannot open null output file $nullf\n";	}
		$BEDS=\*BEDS;
		if( $opb > 0 )
		{open (BEDB, ">BEDb.txt") || die "cannot open output file BEDb.txt\n"; }
		else{ 	open (BEDB, ">$nullf") || die "cannot open null output file $nullf\n";	}
		$BEDB=\*BEDB;

	}

	&err("Cannot find file(s) lib1/cat1") and exit unless( (-f "lib1") && (-f "cat1") );
	&err("Cannot create input file(s)") and exit unless( &crfile("lib2","cat2","lib3","cat3")); 

}#openfiles	

#get the ribl parameters 
sub getriblpar
{
	my ( $RIBL, $par) = @_;
	return ${$RIBL}{$par};
}

#read the ribl matrix
sub readribl()
{
	my $fribl = $_[0];	
	my %ribl = {};	#this is the $_[1] argument
	my $RIM = $_[2];	#reference to the RIM array
	my $FRM = $_[3];	#reference to the FRM array
	open(RIBL, "<$fribl") || die "cannot open file $fribl\n"; 
	#read the ribl site name
	my $flag = 0; #while exit condition
	while(my $ip = <RIBL>)
	{
		if( $ip =~ /^\s*ribl\s+\"(\S+)\"/ )
		{	$ribl{'site'} = &trim( $1 );	$flag = 1;	last;	}#if :ribl sitename
	}
	&err("Error in $fribl : Cannot find sitename") and exit unless($flag);
	#read the from and to params
	$flag=0;
	while( my $ip = <RIBL>)
	{
		if( $ip =~ /^\s*(\S+)\s+(\S+)\s+frombase.*tobase\s*$/ )
		{
			$ribl{'from'} = $1;
			$ribl{'to'} = $2;
			$flag=1;
			last;
		}
	}
	&err("Error in $fribl : Cannot find frombase, tobase") and exit unless($flag);
	#read the ribl frequency matrix
	$flag=0;
	my $loopc=0; my $i = 0;
	$loopc = abs( $ribl{'to'} - $ribl{'from'} ) + 1;
	while( $i<$loopc && (my $ip=<RIBL>) )
	{
		$ip = &trim( $ip);
		if($ip eq ""){	next;	}
		elsif( $ip =~ /^\s*\*/ )
		{	$flag=0; last;	}
		else
		{
			my @rrow = split(/\s+/, $ip);
			&putrim($i, "a", $rrow[0], $RIM);	&putrim($i, "c", $rrow[1], $RIM);
			&putrim($i, "g", $rrow[2], $RIM);	&putrim($i, "t", $rrow[3], $RIM);

			&putfrm($i, "a", $rrow[5], $FRM);	&putfrm($i, "c", $rrow[6], $FRM);
			&putfrm($i, "g", $rrow[7], $FRM);	&putfrm($i, "t", $rrow[8], $FRM);
			#sum the number of sequences in each column
			$ribl{'N'}[$i] = $rrow[5] +  $rrow[6] +  $rrow[7] +  $rrow[8]; 
		}
		$i++;
		$flag=1 if( $i==$loopc);
	}
	&err("Error in $fribl : Cannot read $loopc(frombase-tobase+1) rows from the matrix") 
		unless($flag);
	#read the mean(Rsequence)
	$flag=0;
	while( my $ip = <RIBL>)
	{
		if( $ip =~ /^\s*(\S+)\s+.*mean/ )
		{
			$ribl{'Rsequence'} = $1;
			$flag=1;	last;
		}
	}
	&err("Error in $fribl : Cannot read mean(Rsequence)") and exit unless($flag);
	#read the standard deviation
	$flag=0;
	while( my $ip = <RIBL>)
	{
		if( $ip =~ /^\s*(\S+)\s+.*standard\s*deviation/ )
		{
			$ribl{'SD'} = $1;
			$flag=1;	last;
		}
	}
	&err("Error in $fribl : Cannot read standard deviation") and exit unless($flag);
	
	#read the Ri consensus value
	$flag=0;
	while( my $ip = <RIBL>)
	{
		if( $ip =~ /^\s*(\S+)\s+.*\s+consensus/ )
		{
			$ribl{'Ricon'} = $1;
			$flag=1;	last;
		}
	}
	&err("Error in $fribl : Cannot read consensus value") and exit unless($flag);


	#read the number of sequences used in the ribl
	$flag=0;
	while( my $ip = <RIBL>)
	{
		if( $ip =~ /^\s*(\S+)\s+.*number\s+of\s+sequences/i )
		{
			$ribl{'numofseq'} = $1;
			$flag=1;	last;
		}
	}
	&err("Error in $fribl : Cannot read the number of sequences") and exit unless($flag);
	%{$_[1]} = %ribl;	#assign the ribl values to the input ribl hash
}#readribl	

#store the ri values of a,c,g,t
sub putrim()
{
	my ($row, $nucl, $val, $RIM) = @_;	#todo
	
	${$RIM}[$row]{$nucl}=$val;
}
#get the ri values of a,c,g,t
sub getrim()
{
	my ($row, $nucl,$RIM) = @_;
	return( ${$RIM}[$row]{$nucl} );
}

#store the frequencies of a,c,g,t
sub putfrm()
{
	my ($row, $nucl, $val, $FRM) = @_;	
	${$FRM}[$row]{$nucl}=$val;
}
#get the frequencies of a,c,g,t
sub getfrm()
{
	my ($row, $nucl,$FRM) = @_;
	return( ${$FRM}[$row]{$nucl} );
}#getfrm()

sub err()
{	print "$_[0]\n";	}

#check if the confidence intervals overlap
sub overlap()
{
	my ($l1, $h1)= @{$_[0]};	#dereference conf int 1
	my ($l2, $h2) = @{$_[1]};	#dereference conf int 2
	my $ret = 0;	#0 implies no overlap, 1 implies overlap
	#correctly order in ascending the intervals
	($l1, $h1)=($h1, $l1) if ($l1 > $h1);
	($l2, $h2)=($h2, $l2) if ($l2 > $h2);
	if($l1 <= $l2)
	{	$ret = 1 unless($h1<$l2);	}
	elsif($l2 > $l1)
	{	$ret = 1 unless($h2<$l1);	}
	return $ret;
}#overlap()

#calculate the confidence intervals
sub CI()
{
	my @dna = split(//, &trim($_[0]) );
	my %ribl = %{$_[1]};	#dereference the hash ribl array
	my $frm = $_[2];	#the frequency array list reference
	my @conf;	#the conf interval , this is the return value
	my $L = abs( $ribl{'to'} - $ribl{'from'}) + 1;	#the length of the BS
	#my $N = abs( $ribl{'numofseq'} );	#no of sequences, n_bl
	warn "dna piece length not equal to abs(from-to)+1 from ribl\n"
		unless($L == @dna);
	my $exp1=0; my $exp2=0;	#these are expr 1 and 2 in the mean computation
	for (my $i=0; $i<$L; ++$i)
	{
		#getpi() is just getfrm()/N for now
		my $NL = $ribl{'N'}[$i];
		my $pi_bl = &getpi($i,$dna[$i],$NL,$frm );
		$exp1 += &logn(2,$pi_bl);
		$exp2 += (1 / $NL) * (1 - $pi_bl) / $pi_bl;
	}
		
	#compute mean and variance
	my $mean = 2*$L + $exp1 - $exp2 /2 ;
	my $variance = $exp2;
	#my $sd = $ribl{'SD'};	# doubt! can also take the value of sqrt($variance)
	my $sd = sqrt( $variance );	#this might be the correct calculation

	#compute confidence intervals
	my @ci = (0,0);
	if( ! &getparams('s') )
	{	&usage("SD value not specified") and exit;	}
	#confidence interval  = M +- Z*SD/sqrt(n) ;  
	#here M =mean, Z=user defined num of stand dev(-s option)
	#SD = ribl{sd} or  sqrt($variance),  n =1
	$ci[0] = $mean - &getparams('s')*$sd;
	$ci[1] = $mean + &getparams('s')*$sd;
	($ci[0],$ci[1]) = ($ci[1], $ci[0]) if($ci[0]>$ci[1]);
	return @ci;
}#CI()

sub logn()
{
	my ($base, $val) = @_;
	my $ret = 0.0;
	$ret = log($val)/log($base);
}#logn()

sub getpi()
{
	my( $i, $ncl, $NL, $frm) = @_;
	my $ret = 0;
	my $freq = &getfrm($i, $ncl, $frm);
	if ( $freq == 0 )
	{	$ret = 1/($NL + 2);	}	#This is staden's substitution
	else
	{	$ret = $freq / $NL;	}
	return $ret;
}#getpi()

sub printcomments()
{
	my ($ip,$op)=@_;
	my $ipline;
	my $flag = 1;
	print $op "* This file is generated by : $0 \n";
	my $t = &getdate();
	print $op "* Current Time : $t\n";
	print $op "* The comments from the input file are listed below\n";
	print $op "****************************************************";
	seek( $ip, 0, 0);	#go to starting point in the file
	while( $flag)
	{
		if( $ipline = <$ip>)
		{
			my $line = &trim($ipline);
			if( $line =~ /^\*/ )	
			{
				#print to output if its a comment line
				print $op "$line\n";
			}
			else	{	$flag = 0; }
		}
		else{ seek( $ip,0,0);	return 0 };
	}
	seek($ip, 0 ,0);
	return $ipline;
}#printcomments()

#read the data position values, for ex getdatapos('chrm')
sub getdatapos()
{
	my $pos = shift;
	return $datapos{$pos};
}#getdatapos()

#print 1 line of the output BED file
sub printopBEDline()
{
	my ($ipline, $BED, $ribl) = @_;
	my @l = split( /\s+/, &trim( $ipline) );	#list containing the data elements
	my %rmrms;
	my $opline = "";
	$rmrms{'chrm'} = "chr".&getparams('c');	#read the -c switch value i.e. chrm name 
	$rmrms{'bscoord'} = $l[ &getpos('coord') ];
	my $bsdirec = $l[ &getpos('dir') ];
	if ( $bsdirec == 1 )
	{	$rmrms{'bsdir'} = "+";	}
	else
	{	$rmrms{'bsdir'} = "-";	}
	$rmrms{'ri'} = $l[ &getpos('Ri') ];
	my $accpos =  &getpos('accession') ;#accession number position
	#the below line is for compatibility with the scan o/p rather than promotsite o/p
	#in scan o/p, there is no accession number. So set the coordinate to that name
	#the accpos is generally the last position in the data file
	if( $accpos >= @l)
	{ $accpos = &getpos('coord');	}
	$rmrms{'transid'} = $l[$accpos];	#geneleader
				
	$/="\n";
	chomp($rmrms{'chrm'});		#CHROM
	$opline .= $rmrms{'chrm'}." ";
	
	my $BEDstart=0;	#BED start
	my $BEDend=0;		#BED end
	$bsdirec = $rmrms{'bsdir'};
	#in this scenario, the Binding Site occupies the 0 position
	
	if($bsdirec eq "+")
	{
		$BEDstart = $rmrms{'bscoord'} - abs( &getriblpar($ribl, 'from') );
		if($BEDstart<0) {	$BEDstart = 0;	}	#lower bound is 0

		$BEDend = $rmrms{'bscoord'} + abs( &getriblpar($ribl,'to'));
	}#if: dir "+"
	elsif($bsdirec eq "-")
	{
		$BEDstart = $rmrms{'bscoord'} - abs( &getriblpar($ribl, 'to') );
		if($BEDstart<0) {	$BEDstart = 0;	}	#lower bound is 0
		
		$BEDend = $rmrms{'bscoord'} + abs( &getriblpar($ribl, 'from'));
	}#elsif: dir "-"
	else
	{	print ("Binding Site orientation not +/-\n");	}

	($BEDstart>0)?( $BEDstart-- ):$BEDstart;
	$opline .= $BEDstart." ";	#ChrmStart coordinate
	$opline .= $BEDend." ";	#ChrmEnd coordinate
	
	my $rival = $rmrms{'ri'};
	my $score = ($rival * 1000) / &getriblpar($ribl, 'Ricon') ;	

	my @sitel = split(/_/,getriblpar($ribl, 'site') );
	my $site = $sitel[0];
	#my $BEDname = $rmrms{'transid'}."_R".int($rival)."_S".int($score);
	my $BEDname = $rmrms{'transid'}."_".$site."_R".int($rival);
	$opline .= $BEDname." ";	#Name
	$opline .= $score." ";		#score
	
	chomp($rmrms{'bsdir'});	#strand direction
	$opline .= $rmrms{'bsdir'}." ";
	
	my $mid  = int( ($BEDstart + $BEDend)/2 );
	my $thickStart = ( $bsdirec eq "+")?($BEDstart):($mid);
	my $thickEnd = ( $bsdirec eq "+") ?($mid):($BEDend);

	$opline .= $thickStart." ";	#thickStart
	$opline .=  $thickEnd." ";	#thickEnd
	$opline .= "0"." ";	#reserved
	$opline .= "1"." ";	#blockCount

	my $blockSize = $BEDend - $BEDstart;
	$opline .=  $blockSize." ";
	
	my $blockStart = 0;
	$opline .=  $blockStart;
	$opline .= "\n";
	print $BED "$opline";
	return $opline;
}#printopBEDline()

sub printBEDheader()
{
	my ($chrm, $opfile, $ribl, $tcolor) = @_;
	if( ! defined($tcolor) )
	{	$tcolor = "128,128,128";	}
	#print the browser lines
	#the blank spaces in the first printed line below is for overwriting it
	#with a suitable  browser position like the highest Ri value at a later position
	print $opfile ("browser position chr".$chrm.":1-1000000                              \n");
	#print track lines
	my $tname = &trim( &getriblpar($ribl, 'site') );	#track name
	print $opfile ("track name=${tname} description=\"${tname}\" color=$tcolor useScore=1\n");
}

sub printINSTline
{
	my ($coord, $from, $to, $dir) = @_;

	my $lowsign="-", my $highsign="+";
	($from<0)?($lowsign="-"):($lowsign="+");
	($to<0)?($highsign="-"):($highsign="+");
	my $delilaline="";
	my $low = abs($from);
	my $high = abs($to);
	
	if( &trim($dir) eq "-1")
	{	$delilaline="get from $coord ${highsign}$low to same ${lowsign}$high direction -;\n";	}
	else
	{	$delilaline="get from $coord ${lowsign}$low to same ${highsign}$high;\n";	}

	print $INST ($delilaline);
}#printINSTline()
	
sub printINSTheader
{
	my $INST = shift;

	my $org = 0;
	my $chr = 0;
	my $piece = 0;
	&err("Error in processing delila book lib1:\nCould not find organism name") and exit
		unless($org = &parselib("organism","lib1") );
	&err("Error in processing delila book lib1:\nCould not find chromosome name") and exit
		unless($chr = &parselib("chromosome","lib1" ));
	&err("Error in processing delila book lib1:\nCould not find piece name") and exit
		unless($piece = &parselib("piece","lib1" ) );
	print "$org $chr $piece\n";			
	
	my $cdate = &getdate();
	print $INST ("title \"GenBank $cdate\";\n");
	print $INST ("organism $org;\n");
	print $INST ("chromosome $chr;\n");
	print $INST ("piece $piece;\n\n");
}#printINSTheader()

sub parselib
{
	my ($opt, $file) = @_;
	my $ret = "-";
	open(LIB1, "<$file") || die "unable to open file $file";
	#print "searching for $opt\n";
	while( my $libline = <LIB1>)
	{
		if($libline =~ /^\s*\*/)	#if it begins with a *
		{	next;	}
		else
		{
			my $temp = $libline;
			$temp =~ /^\s*(\S+)\s*/;
			if($1 eq "$opt")
			{
				my $val = <LIB1>;
				$val =~ /^\s*\*\s+(\S+)\s*/;
				$ret = $1;
				last;
			}#if: asked value
		}
	}
	if( $ret eq "-" )	{	$ret = 0;	}
	#print "$opt = $ret\n";
	close LIB1;
	return $ret;
}#parselib()

sub getdate
{
	my $ret = localtime;
	return $ret;
}#getdate

#this file creates a file if not present,else does nothing
sub crfile
{
	my $flag = 1;
	foreach my $file( @_ )
	{
		if(! -f "$file")
		{
			print "File $file not found\n";
			print "Creating empty file $file\n";
			if(! open(L,">>$file")) { warn "cannot open file $file\n"; $flag=0; last; }
			close L;
		}
	}
	return $flag;
}#crfile()

# close file handle references if defined 
sub closef
{
	foreach( @_ )
	{	close $_ if( defined $_ );	}
}



