#!/bin/bash

# Copyright 2003 Sashidhar Gadiraju, Peter K. Rogan
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#


# program : runscandiff
# version : 0.2
# Description:
# Run scandiff on the list of chromosomes given

# First check whether the Delila Genome environment variable is set.
# BGLIB (the base directory for genome books) is taken from it, so nothing
# useful can be done without it.
if [ -z "$DELGEN_GENOMES" ]; then
	echo "ENV variable DELGEN_GENOMES not set"
	# Exit non-zero: a bare `exit` would propagate the status of the echo
	# above (success) and hide the configuration error from callers.
	exit 1
fi

# Script-wide defaults.
SCANOP=0	#decides whether the input data is to be removed 1=remove (not referenced elsewhere in this script)
CONP=1	#number of concurrent processes
BGLIB="$DELGEN_GENOMES"		#base dir for genome books
CHRPFX=chrm_		#chromosome directory prefix
CHRSTCNT=1	#Chromosome starting count
CHRENDCNT=24	#Chromosome ending count
ALLPSBED=AllpsBED.txt	#output filename for the catallBED script (not referenced elsewhere in this script)
# Base name of this script: strip everything up to the last "/" of $0.
# Parameter expansion avoids the word splitting and globbing that an
# unquoted `echo $0 | tr | tail` pipeline is subject to.
PRNAME=${0##*/}
PARSRC=$(pwd) #the actual destination directory for the output
SUMMARY="summary.txt"
function showusage
{
	# Usage: showusage [message]
	# Print an optional message followed by the usage text on stdout, then
	# terminate the script.
	# Globals:   PRNAME (read), BGLIB (read)
	# Arguments: $1 - message shown between the two separator lines; may be
	#                 empty or omitted
	# Exits:     0 when called without any argument (the -h help request),
	#            1 when called with a message argument, even an empty one,
	#            so that usage errors are visible to the caller.
	local tab=$'\t'

	# Inside a here-document $1 is expanded but neither word-split nor
	# globbed, so the message is printed exactly as it was passed in.
	cat <<EOF

***************************************
$1
***************************************
usage: $PRNAME [-dh] libsrcdir srcdir1 srcdir2 [ chrmstart [chrmend] ]

-d destination_directory : the directory where the results are stored
${tab}default : current directory
-h : print this help message
libsrcdir is searched in ${BGLIB}
The range of chromosomes to be used is specified by
chrmstart: default=The smallest chromosome in libsrcdir/chrmlist
chrmend: default=The largest chromosome  in libsrcdir/chrmlist

Sample 'chrmlist' file contents are given below
In this file, default chrmstart=21, chrmend=Y
chrmend can also be specified as 24

*Chrm file starts here
*Comment lines start with a *
*ChrDirectoryNumber ChrName
${tab}21 21
${tab}22 22
${tab}23 X
${tab}24 Y

EOF

	if [ $# -eq 0 ]; then
		exit 0
	fi
	exit 1
}

function abspath
{
	# Usage: abspath path
	# Store an absolute form of path in the global ABSPATH.
	# The directory part is resolved by changing into it in a subshell; if
	# that fails (e.g. the directory does not exist) the directory part is
	# used unchanged. The last path component is appended as given and does
	# not have to exist.
	# Globals:   ABSPATH (written)
	# Returns:   1 unless exactly one argument is given, 0 otherwise
	[ $# -eq 1 ] || return 1

	local dir base resolved
	dir=$(dirname -- "$1")
	base=$(basename -- "$1")

	# cd's own output is discarded: with CDPATH set, cd may print the
	# directory it changed to, which would end up in the result.
	if ! resolved=$(cd -- "$dir" >/dev/null 2>&1 && pwd); then
		resolved=$dir
	fi

	ABSPATH="$resolved/$base"
	return 0
}

function isdigit
{
	# Usage: isdigit string
	# Returns: 0 when string is non-empty and consists only of the
	#          characters 0-9; 1 otherwise, including when the number of
	#          arguments is not exactly one.
	[ $# -eq 1 ] || return 1

	case "$1" in
		*[!0-9]*|"") return 1 ;;
		*) return 0 ;;
	esac
}
 
# Optional override of the number of concurrent processes through the
# DELGEN_CONP environment variable. The value is quoted everywhere so that
# it reaches the validation unchanged (no word splitting or globbing).
if [ -n "$DELGEN_CONP" ]; then
	CONP=$DELGEN_CONP
	if ! isdigit "$CONP"; then
		echo "Environment variable DELGEN_CONP not an integer"
		exit 1
	fi
	# Only digits can get here, so "less than 1" means zero.
	if [ "$CONP" -lt 1 ]; then
		echo "Environment variable DELGEN_CONP < 1, setting it to 1"
		CONP=1
	fi
fi

# Command-line options:
#   -d dir : destination directory (stored in PARSRC)
#   -h     : print the usage text and exit
# The leading ":" in the option string selects getopts' silent mode: for an
# unknown option Option is "?" and for a missing option argument it is ":";
# in both cases OPTARG holds the offending option character.
while getopts ":d:h" Option
do
	case $Option in
	d )   PARSRC=$OPTARG ;;
	h )   showusage ;;
	: )   showusage "-$OPTARG Option requires an argument ";;
	* )   showusage "-$OPTARG Option unimplemented ";;
esac
done

# Drop the parsed options so that $1.. are the positional arguments.
shift $((OPTIND - 1))

# libsrcdir, srcdir1 and srcdir2 are all mandatory ($3 is read below), so
# at least three positional arguments are required.
if [ $# -lt 3 ]; then
	showusage ""
fi

GLIB="${BGLIB}/$1" 	#The genome assembly version directory to use
# Make the directories absolute before changing the working directory
# below, so that relative arguments keep referring to the caller's
# directory.
SRC1="$2" ; abspath "$SRC1" ; SRC1="$ABSPATH"
SRC2="$3" ; abspath "$SRC2" ; SRC2="$ABSPATH"
abspath "$PARSRC"; PARSRC="$ABSPATH"

if [ ! -d "$PARSRC" ]; then
	showusage "Invalid destination directory"
fi
# The rest of the script works with paths relative to the destination.
if ! cd "$PARSRC"; then
	echo "Cannot change to destination directory $PARSRC"
	exit 1
fi
if [ ! -d "$SRC1" ]; then
	showusage "Invalid srcdir1 directory"
fi
if [ ! -d "$SRC2" ]; then
	showusage "Invalid srcdir2 directory"
fi
if [ ! -d "$GLIB" ]; then
	showusage "Invalid libsrc directory"
fi
if [ ! -f "$GLIB/chrmlist" ]; then
	echo "$GLIB/chrmlist not found"
	exit 1
fi
###################################################################
function findCHRM
{
	# Usage: findCHRM key
	# Look key up in $GLIB/chrmlist and publish the matching entry.
	# key is matched as a whole word anywhere on a line, so it may be either
	# the directory number or the chromosome name. It is used as a grep
	# regular expression. Lines starting with optional spaces and a "*" are
	# comments; when several lines match, the last one wins.
	# Globals:   GLIB (read), CHRMNUM (written: first field of the match),
	#            CHRMNAME (written: second field of the match)
	# Returns:   the status of isdigit on CHRMNUM, i.e. non-zero when no
	#            entry was found or its first field is not a number.
	local line

	# The pipeline is run directly instead of being assembled into a string
	# and eval'ed, so characters in key are never interpreted by the shell.
	line=$(grep -- "\<$1\>" "$GLIB/chrmlist" | grep -v '^ *\*' | tail -n 1)

	CHRMNUM=
	CHRMNAME=
	# Split on blanks; any further fields are discarded.
	read -r CHRMNUM CHRMNAME _ <<< "$line"

	isdigit "$CHRMNUM"
}
						 
#initialize the chromosome range with the values taken from chrmlist
#empty lines and comment lines (beginning with a *) are removed, then the
#first field of every remaining line is sorted numerically; the smallest
#value becomes the start and the largest the end of the range.
#(`sort -n` and separate -e patterns replace the obsolete `sort +0n` key
#syntax and the GNU-only `\|` alternation.)
chrmnums=$(grep -v -e '^ *$' -e '^ *\*' "$GLIB/chrmlist" | awk '{print $1}' | sort -n)
CHRSTCNT=$(printf '%s\n' "$chrmnums" | head -n 1)
CHRENDCNT=$(printf '%s\n' "$chrmnums" | tail -n 1)
if [ -z "$CHRSTCNT" ] || [ -z "$CHRENDCNT" ]; then
	echo "No chromosomes listed in $GLIB/chrmlist"
	exit 1
fi

# Optional 4th argument: chrmstart. findCHRM accepts the directory number
# or the chromosome name and leaves the directory number in CHRMNUM.
if [ $# -ge 4 ]; then
	if ! findCHRM "$4"
	then
		echo "chrmstart $4 not listed in  $GLIB/chrmlist"
		exit 1
	fi
	CHRSTCNT=$CHRMNUM
fi

# Optional 5th argument: chrmend, resolved the same way.
if [ $# -ge 5 ]; then
	if ! findCHRM "$5";	then 
		echo "chrmend $5 not listed in  $GLIB/chrmlist"
		exit 1
	fi
	CHRENDCNT=$CHRMNUM
fi
###
# The range is compared by directory number.
if [ "$CHRSTCNT" -gt "$CHRENDCNT" ]; then
	showusage "chrmstart > chrmend";	
fi

#store the list of  chromosomes to_be_processed in an array
# CHRML holds the directory numbers and CHRMN the matching names, at the
# same index. Numbers in the range that are not in chrmlist are skipped.
count=$CHRSTCNT
cnt=0
while [ "$count" -le "$CHRENDCNT" ];
do
	if findCHRM "$count"; then
		CHRML[$cnt]=$CHRMNUM
		CHRMN[$cnt]=$CHRMNAME
		cnt=$((cnt + 1))
	else
		echo "Skipping chromosome $count : not listed in $GLIB/chrmlist "
	fi

	# 10# forces decimal, so a value with leading zeros taken from chrmlist
	# (e.g. 08) is not rejected as an invalid octal number.
	count=$((10#$count + 1))
done

if [ "${#CHRML[@]}" -le 0 ]; then
	echo "No valid chromosomes to process in the range $CHRSTCNT to	$CHRENDCNT"
	exit 1
fi

###################################################################

# The parameter file must exist in the destination directory before any
# chromosome is started: doscandiff copies it from there into each
# chromosome directory.
[ -f "$PARSRC/scandiffp" ] || showusage "scandiffp file not present in $PARSRC"


function doscandiff
{
	# Usage: doscandiff chrnum workdir node
	# Run scandiff.pl for one chromosome inside workdir.
	# Arguments: $1 - the directory number (appended to $CHRPFX)
	#            $2 - directory in which the run takes place
	#            $3 - the node on which to run the process (accepted for
	#                 compatibility; not used)
	# Globals:   PARSRC, GLIB, CHRPFX, SRC1, SRC2 (read);
	#            CHRMNUM, CHRMNAME (written through findCHRM)
	# Intended to be called in a subshell: it changes the working directory
	# and calls exit (status 1) when an input is missing.
	local curcnt=$1
	local curdir=$2
	local chrname src

	date
	if ! cd "$curdir"; then
		echo "cannot change to directory $curdir"
		exit 1
	fi

	# Both scans must provide psdataop for this chromosome. Check before
	# anything is copied or linked, with an explicit existence test.
	for src in "$SRC1" "$SRC2"; do
		if [ ! -e "$src/${CHRPFX}$curcnt/psdataop" ]; then
			echo "psdataop not found in $src/${CHRPFX}$curcnt"
			exit 1
		fi
	done

	cp "$PARSRC/scandiffp" .
	# Single-argument ln: the link is created in the current directory
	# under the target's own name (lib1, cat1).
	ln -sf "${GLIB}/${CHRPFX}$curcnt/lib1"
	ln -sf "${GLIB}/${CHRPFX}$curcnt/cat1"
	ln -s  "$SRC1/${CHRPFX}$curcnt/psdataop" data1
	ln -s  "$SRC1/ribl" ribl1
	ln -s  "$SRC2/${CHRPFX}$curcnt/psdataop" data2
	ln -s  "$SRC2/ribl" ribl2

	# Append the chromosome name to the local copy of the parameter file.
	findCHRM "$curcnt"
	chrname=$CHRMNAME
	echo "chrname  = $chrname"
	echo >> scandiffp
	echo "-c $chrname" >> scandiffp

	scandiff.pl -f scandiffp

	# -f: not all of these are created here, so some may not exist.
	rm -f lib1 cat1 lib2 cat2 lib3 cat3
	date
}

echo "Starting scandiff on chromosomes $CHRSTCNT TO $CHRENDCNT"

# Keep a copy of the current stdout on fd 3 and send stdout to the run log.
# Duplicating the descriptor (rather than reopening the device reported by
# `tty`, which describes stdin) also works without a terminal and when
# stdout has been redirected by the caller.
exec 3>&1
exec >"$PARSRC/runscandiff.log"
#echo "$PRNAME"


date 
loopcnt=0
# for count in ${CHRML[@]}	
while [ "$loopcnt" -lt "${#CHRML[@]}" ];
do	
	count=${CHRML[$loopcnt]}
	# Throttle on the number of processes that carry this script's name.
	# NOTE(review): the "- 2" presumably discounts the ps header line and
	# this script's own process; the count also includes same-named
	# processes that do not belong to this run — confirm before changing.
	#pnum=`ps -e | grep -c $PRNAME`
	pnum=$(ps -C "$PRNAME" | wc -l)
	pnum=$((pnum - 2))
	#echo "pnum=$pnum conp=$CONP"
	if [ "$pnum" -le "$CONP" ];	then
		echo "$count"
		#run background function
		cur=${PARSRC}/${CHRPFX}$count
		if [ ! -d "$cur" ]; then
			echo "creating ${CHRPFX}$count"
			mkdir "$cur"
		fi
		echo "cur=$cur count=$count "
		# Each chromosome runs in its own background subshell and logs to
		# its own directory; the saved stdout (fd 3) is closed for it.
		( doscandiff "$count" "$cur" -1 1>"$cur/scandiff.log" 2>&1 3>&- )&
		loopcnt=$((loopcnt + 1))
		sleep 5
	else 
		#echo " "
		# All slots busy: look again in ten minutes.
		sleep 600
	fi
	
done

#waiting for the child processes created by this shell
wait
date #this is stored in runscandiff.log
# Restore the original stdout and release the spare descriptor.
exec 1>&3 3>&-
echo 
echo
cat runscandiff.log
echo "****************************"
for count in "${CHRML[@]}"
do
	findCHRM "$count"
	echo "Printing chromosome $CHRMNAME log"
	cat "${CHRPFX}${count}/scandiff.log"
	echo "****************************"
done


#****************** PRINT THE SUMMARY.TXT FILE ****************#
# From here on stdout is written to the summary file; it is not switched
# back afterwards.
exec >"$SUMMARY"
echo "***************Runscandiff Summary*****************"
tmpdate=$(date)
echo "Execution date : " "$tmpdate"
echo "Genome draft directory : " "$GLIB"
echo "Scan 1 results directory : " "$SRC1"
echo "Scan 2 results directory : " "$SRC2"
echo "The chromosome range for this run :"
# Quoted array expansion: each stored name is printed as is, without
# word splitting or pathname expansion.
for chrtmp in "${CHRMN[@]}"
do
	echo "chr " "$chrtmp"
done
echo 
echo '*********************************************************'
echo "global scandiff parameters file (the -c switch not shown)"
echo '*********************************************************'
cat scandiffp
echo 
# find the number of sites 
echo '************************************'
echo "Counting the number of binding sites"
echo '************************************'
sigchnum=0		# significantly changed number
commonnum=0		# common number
insigchnum=0	# insignificantly changed number
fsitenum=0		# sites in first scan
ssitenum=0		# sites in second scan

# Usage: lastsitecount file pattern [exclude]
# Print the second field of the last line of file that matches pattern
# (case-insensitive) and, when exclude is given, does not match exclude.
# Returns 1 and prints nothing when there is no such line or the field is
# not a plain non-negative integer.
function lastsitecount
{
	local n
	if [ $# -ge 3 ]; then
		n=$(grep -i -- "$2" "$1" | grep -i -v -- "$3" | tail -n 1 | awk '{ print $2 }')
	else
		n=$(grep -i -- "$2" "$1" | tail -n 1 | awk '{ print $2 }')
	fi
	case $n in
		''|*[!0-9]*) return 1 ;;
	esac
	echo "$n"
}

# Totals are accumulated over all chromosome directories. 10# forces
# decimal so a zero-padded count is not read as octal.
for count in "${CHRML[@]}"
do
	if num=$(lastsitecount "${CHRPFX}${count}/datab1" 'sites.*found.*scandiff.*significantly'); then
		sigchnum=$((sigchnum + 10#$num))
	else
		echo "Cannot find the number of significantly changed sites from ${CHRPFX}${count}/datab1"
	fi
	# Common sites: the scandiff count line that is not the "significantly" one.
	if num=$(lastsitecount "${CHRPFX}${count}/datab1" 'sites.*found.*scandiff' 'sites.*found.*scandiff.*significantly'); then
		commonnum=$((commonnum + 10#$num))
	else
		echo "Cannot find the number of common sites from ${CHRPFX}${count}/datab1"
	fi
	if num=$(lastsitecount "${CHRPFX}${count}/dataf" 'sites.*found.*scandiff'); then
		fsitenum=$((fsitenum + 10#$num))
	else
		echo "Cannot find the number of sites only in the first scan from ${CHRPFX}${count}/dataf"
	fi
	if num=$(lastsitecount "${CHRPFX}${count}/datas" 'sites.*found.*scandiff'); then
		ssitenum=$((ssitenum + 10#$num))
	else
		echo "Cannot find the number of sites only in the second scan from ${CHRPFX}${count}/datas"
	fi
done
insigchnum=$((commonnum - sigchnum))
echo "Number of sites common to both the scans : " "$commonnum"
echo "Number of sites changing significantly : " "$sigchnum"
echo "Number of sites with no significant change : " "$insigchnum"
echo "Number of sites only in the first scan : " "$fsitenum"
echo "Number of sites only in the second scan : " "$ssitenum"
echo

# now find the number of genes
cgenes=0	# common genes
fgenes=0	# first scan genes only 
sgenes=0	# second scan genes only 
echo '****************************'
echo "Counting the number of genes"
echo '****************************'
for count in "${CHRML[@]}"
do
	# genenumbers.pl is run once per chromosome and its report is searched
	# three times.
	# NOTE(review): assumes repeated runs on the same inputs print the same
	# report, as the three separate runs did before — confirm.
	geneout=$(genenumbers.pl "${CHRPFX}${count}/data1" "${CHRPFX}${count}/data2")

	# In each case the last field of the matching report line is the count;
	# anything that is not a plain non-negative integer is reported as
	# missing. 10# forces decimal for zero-padded values.
	num=$(printf '%s\n' "$geneout" | grep -i 'common.*genes' | awk '{print $NF}')
	case $num in
		''|*[!0-9]*) echo "Cannot find the number of common genes from ${CHRPFX}${count}" ;;
		*) cgenes=$((cgenes + 10#$num)) ;;
	esac

	num=$(printf '%s\n' "$geneout" | grep -i 'genes.*file1' | awk '{print $NF}')
	case $num in
		''|*[!0-9]*) echo "Cannot find the number of genes only in the first scan from ${CHRPFX}${count}" ;;
		*) fgenes=$((fgenes + 10#$num)) ;;
	esac

	num=$(printf '%s\n' "$geneout" | grep -i 'genes.*file2' | awk '{print $NF}')
	case $num in
		''|*[!0-9]*) echo "Cannot find the number of genes only in the second scan from ${CHRPFX}${count}" ;;
		*) sgenes=$((sgenes + 10#$num)) ;;
	esac
done

# These totals are gene counts, so they are labelled as genes.
echo "Number of genes common to both the scans : " "$cgenes"
echo "Number of genes only in the first scan : " "$fgenes"
echo "Number of genes only in the second scan : " "$sgenes"
echo
exit 0

