#!/bin/bash

# Copyright 2003 Sashidhar Gadiraju, Peter K. Rogan
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#


#program : runscan
#version : 2.5.1
#Description:
#This program starts the scan and/or promotsite on a particular chromosome range
#2.5 : removing sortmrna.pl and scan2ps.pl and including them in
#promotsite.pl
#2.4 : including the code for parallelizing based on the chromosome size
#2.3 : removing extraneous stuff
#2.2 : Stores results in only the current directory
		 #Does not parse the orglist and assemblylist	
#2.1 : Modified according to the genome directory structure
#2.0 : Fully working, backward compatible version of runscan
#1.6 : Made 1.5 backward compatible with 1.3
#1.5 : Added the option of chromosome range to scan 
#1.4 : Added a few output messages
#1.3 : Added the functionality to specify promotsite/scan or both programs from the command line
#1.2 : Added the log file creation in the respective chrm directories

#First check whether the Delila Genome environment variables are set.
#DELGEN_GENOMES is the base genome directory; nothing can run without it.
if [ -z "$DELGEN_GENOMES" ]; then
	echo "ENV variable DELGEN_GENOMES not set"
	#a missing setting is fatal: report failure to the caller instead of
	#falling through with the (successful) status of the echo above
	exit 1
fi

#Global defaults. Several of these are overridden further down by
#environment variables, command-line arguments or the contents of 'chrmlist'.
PRNAME=${0##*/}		#program name: basename of the invocation path
SCANOP=0	#1 = remove the scan output file 'data' once promotsite has produced 'psdatainp'
CONP=1	#number of concurrent processes 
LOADBAL=0	# 0=load balancing OFF ; 1=load balancing ON ; 
# Set the LOADBAL to 1 using the env variable DELGEN_LOADBAL
#SCRSRC=$DELGEN_SCRIPT	#script source directory
#BINSRC=$DELGEN_BIN		#binary source directory
BGLIB=$DELGEN_GENOMES	#base genome directory
CHRPFX=chrm_		#chromosome directory prefix
CHRSTCNT=1	#Chromosome starting count
CHRENDCNT=24	#Chromosome ending count
#CHRSTCNT, CHRENDCNT are set later on the first and last chrm in file 'chrmlist'
ALLPSBED=AllpsBED.txt	#output filename for the catallBED script
CURDIR=$(pwd)
PARSRC=$(pwd) #the actual destination directory for the output

#showusage [message]
#Prints the usage text and terminates the script.
#  $1 (optional) : error message, printed verbatim inside a banner
#Exit status: 1 when an error message was given, 0 for plain help (-h).
#Reads the globals PRNAME and BGLIB for the text.
function showusage
{
	local status=0

	echo
	if [ "$1" != "" ]; then
		echo "***************************************"
		#printf keeps the message intact (no word splitting or globbing)
		printf '%s\n' "$1"
		echo "***************************************"
		status=1
	fi
	echo "usage: $PRNAME [-dh] libsrcdir [ program_option [ chrmstart [chrmend] ] ]"
	echo
	echo "-d destination_directory : the directory where the results are stored"
	echo "	default : current directory"
	echo "-h : print this help message"	
	echo "libsrcdir is searched in ${BGLIB}"
	echo "program_option: 1->scan 2->promotsite 3->scan+promotsite"
	echo "The range of chromosomes to be used in scan/promotsite is specified by"
	echo "chrmstart: default=The smallest chromosome in libsrcdir/chrmlist"
	echo "chrmend: default=The largest chromosome  in libsrcdir/chrmlist"
	echo
	echo "Sample 'chrmlist' file contents are given below"
	echo "In this file, default chrmstart=21, chrmend=Y"
	echo "chrmend can also be specified as 24"
	echo 
	echo "*Chrm file starts here"
	echo "*Comment lines start with a *"
	echo "*ChrDirectoryNumber ChrName"
	echo "	21 21"
	echo "	22 22"
	echo "	23 X"
	echo "	24 Y"
	echo
	exit $status
}

#isdigit value
#Returns 0 when exactly one argument is given and it is a non-empty string
#made up only of the characters 0-9; returns 1 otherwise (wrong argument
#count, empty string, sign, or any other non-digit character).
function isdigit
{
	#status 1 instead of -1: -1 is outside the 0..255 status range and is
	#treated as an invalid option by older bash releases
	[ $# -eq 1 ] || return 1

	case $1 in
		''|*[!0-9]*) return 1 ;;
		*) return 0 ;;
	esac
}

#abspath path
#Stores the absolute form of 'path' in the global ABSPATH.
#The parent directory is resolved with cd/pwd; if it cannot be entered the
#directory part is kept exactly as given. The last path component is never
#required to exist.
#Returns 1 (ABSPATH untouched) unless exactly one argument is given.
function abspath
{
	[ $# -eq 1 ] || return 1

	local dir base
	dir=$(dirname "$1")
	base=$(basename "$1")
	#the cd happens inside the command substitution, so the caller's working
	#directory is not changed
	dir=$(cd -- "$dir" 2>/dev/null && pwd || echo "$dir")
	#avoid "//name" for entries directly under the root, and "///" for "/"
	if [ "$dir" = "/" ]; then
		dir=""
	fi
	if [ "$base" = "/" ]; then
		base=""
	fi
	ABSPATH="$dir/$base"
}
# DELGEN_CONP, when set, overrides the number of concurrent processes.
# It must be a non-negative integer; values below 1 are raised to 1.
if [ -n "$DELGEN_CONP" ]; then
	CONP=$DELGEN_CONP
	if isdigit $CONP; then
		if [ $CONP -lt 1 ]; then
			echo "Environment variable DELGEN_CONP < 1, setting it to 1"
			CONP=1
		fi
	else
		echo "Environment variable DELGEN_CONP not an integer"
		exit 1
	fi
fi
# Load balancing is switched on only when DELGEN_LOADBAL is exactly "1";
# any other value (or no value) leaves LOADBAL as it was.
case "$DELGEN_LOADBAL" in
	1)
		LOADBAL=1
		echo "Load balancing option ON"
		echo "This feature supported only on Scyld OS systems"
		;;
	*)
		echo "Load balancing feature OFF"
		;;
esac

#Parse the command-line options.
#The leading ':' in the option string selects getopts' silent mode: for an
#unknown option Option is '?', for a missing option argument Option is ':',
#and in both cases OPTARG holds the offending option letter.
while getopts ":d:h" Option
do
	case $Option in
		d )   PARSRC=$OPTARG ;;
		h )   showusage ;;
		: )   showusage "Option -$OPTARG requires an argument" ;;
		* )   showusage "-$OPTARG Option unimplemented " ;;
	esac
done
#drop the parsed options so that $1 is libsrcdir
shift $((OPTIND - 1))

if [ $# -lt 1 ]; then
	showusage "libsrcdir not given"
fi

#Resolve the destination directory (PARSRC) to an absolute path; abspath
#leaves its result in ABSPATH.
abspath "$PARSRC"
PARSRC=$ABSPATH

if [ ! -d "$PARSRC" ]; then
	echo "Missing destination directory $PARSRC "
	exit 1
fi

#everything that follows works relative to the destination directory, so
#stop here if it cannot be entered (e.g. no execute permission)
if ! cd "$PARSRC"; then
	echo "Cannot change to destination directory $PARSRC"
	exit 1
fi

RUNOPTION=${2-3} 	#default option is 3

#in the below backtick output,  line 1 is an empty line
#ORG=`echo $2 | tr "/" "\n" | sed -n '1p'`
#ASSEMBLY=`echo $2 | tr "/" "\n" | sed -n '2p'`

#now search the org and assembly in the respective
#org and assembly list file stored
#CMD=`echo "grep -v '^ *\*' $BGLIB/orglist | grep '\<$ORG\>' "`
#echo $CMD
#eval $CMD
#if [ $? -ne 0 ]; then
#	echo "Cannot find the org $ORG in file $BGLIB/orglist"
#	exit
#fi

#CMD=`echo " grep -v '^ *\*'  $BGLIB/$ORG/assemblylist | grep '\<$ASSEMBLY\>'"`
#echo $CMD
#eval $CMD
#if [ $? -ne 0 ]; then
#	echo "Cannot find the assembly $ASSEMBLY in file $BGLIB/$ORG/assemblylist"
#	exit
#fi

GLIB=$BGLIB/$1		#the actual genome directory
if [ ! -d "$GLIB" ]; then
	showusage "Invalid libsrcdir : $GLIB"
fi
#the program option must be an integer in the range 1..3;
#the numeric tests are only reached once isdigit has accepted the value
if ! isdigit "$RUNOPTION" || [ "$RUNOPTION" -lt 1 ] || [ "$RUNOPTION" -gt 3 ]; then
	showusage "Program_Option not 1/2/3"
fi
#the chromosome list drives everything else; without it this is a failure
if [ ! -f "$GLIB/chrmlist" ]; then
	echo "$GLIB/chrmlist not found"
	exit 1
fi

#findCHRM key
#Looks 'key' up as a whole word in $GLIB/chrmlist, ignoring comment lines
#(first non-blank character is '*'). When several lines match, the last one
#is used.
#Sets the globals:
#  CHRMNUM  : first field of the matching line (the chromosome directory number)
#  CHRMNAME : second field of the matching line (the chromosome name)
#Both are empty when nothing matches.
#Returns the status of isdigit on CHRMNUM: 0 when a line with a numeric
#first field was found, non-zero otherwise.
function findCHRM
{
	local entry rest

	#the key is handed to grep as a pattern argument; it is never passed
	#through eval, so quotes or ';' in it cannot run commands
	entry=$(grep -v '^ *\*' "$GLIB/chrmlist" | grep -e "\<$1\>" | tail -1)
	#split on blanks/tabs; any further fields end up in 'rest' and are ignored
	read -r CHRMNUM CHRMNAME rest <<< "$entry"
	isdigit "$CHRMNUM"
}
						 
#initialize the chromosome range with the values taken from chrmlist
#empty lines and comment lines (beginning with a *) are removed, the rest is
#sorted numerically on the first field and the first/last entries are taken.
#'sort -k1,1n' is used instead of the obsolete 'sort +0n' spelling, which
#recent sort implementations interpret as a file name.
CHRSTCNT=$(grep -v '^ *$\|^ *\*' "$GLIB/chrmlist" | sort -k1,1n | head -1 | awk '{print $1}')
CHRENDCNT=$(grep -v '^ *$\|^ *\*' "$GLIB/chrmlist" | sort -k1,1n | tail -1 | awk '{print $1}')
#echo "chrmst=$CHRSTCNT"
#echo "chrmend=$CHRENDCNT"

#optional chrmstart argument: may be a directory number or a chromosome name
if [ $# -ge 3 ]; then
	if ! findCHRM "$3"; then
		echo "chrmstart $3 not listed in  $GLIB/chrmlist"
		exit 1
	fi
	CHRSTCNT=$CHRMNUM
fi

#optional chrmend argument, same lookup rules as chrmstart
if [ $# -ge 4 ]; then
	if ! findCHRM "$4"; then
		echo "chrmend $4 not listed in  $GLIB/chrmlist"
		exit 1
	fi
	CHRENDCNT=$CHRMNUM
fi
###
#both bounds must be plain integers before they are compared numerically
#(an empty or malformed chrmlist leaves them empty or non-numeric)
if ! isdigit "$CHRSTCNT" || ! isdigit "$CHRENDCNT"; then
	echo "No usable chromosome numbers found in $GLIB/chrmlist"
	exit 1
fi
if [ "$CHRSTCNT" -gt "$CHRENDCNT" ]; then
	showusage "chrmstart > chrmend"
fi

#store the list of chromosomes to_be_processed in two parallel arrays:
#CHRML holds the directory numbers, CHRMN the matching chromosome names
count=$CHRSTCNT
cnt=0
while [ "$count" -le "$CHRENDCNT" ]; do
	if findCHRM "$count"; then
		CHRML[$cnt]=$count
		CHRMN[$cnt]=$CHRMNAME
		cnt=$((cnt + 1))
	else
		echo "Skipping chromosome $count : not listed in $GLIB/chrmlist "
	fi

	#10# forces base 10 so a value with a leading zero is not read as octal
	count=$((10#$count + 1))
done

if [ ${#CHRML[@]} -le 0 ]; then
	echo "No valid chromosomes to process in the range $CHRSTCNT to	$CHRENDCNT"
	exit 1
fi

#parallelJobs
#Assigns every chromosome listed in CHRML to a cluster node.
#Reads  : CHRML, GLIB, CHRPFX
#Writes : UPNODES  (index 0 = master node -1, then the nodes bpstat reports as up)
#         CHRMNODE (indexed by chromosome directory number -> node)
#         LOADBAL  (reset to 0 when mpprun/bpstat are not available)
#Works with the files chrsize.tmp, chrsize.tmp1 and jobs.out in the current
#directory and removes them afterwards.
#Exits the script with status 1 when the size of a 'book' file cannot be
#determined.
function parallelJobs
{
	########## paral ###########
	# WARNING: The below assignment of different nodes to different jobs is
	# highly SCYLD dependant. So enable with care.
	local prc procnum chrm duout jsize curjob curnode nodes
	local ifjob='chrsize.tmp'
	local ifjob1='chrsize.tmp1'
	local ofjob='jobs.out'

	if ! command -v mpprun >/dev/null 2>&1 || ! command -v bpstat >/dev/null 2>&1; then
		echo "Scyld OS commands mpprun and bpstat not found"
		echo "Disabling load balancing"
		LOADBAL=0
		return
	fi

	# First find the nodes which are up and running
	# UPNODES array contains the nodes which are currently up and running
	# the master node is -1 in scyld; it is stored at index 0 because bash
	# does not support -1 as an array subscript
	prc=0	# process number
	UPNODES[$prc]=-1
	# bpstat does not list the master node which is designated as -1
	# the first column of bpstat lists the upnodes
	while read -r procnum
	do
		prc=$((prc + 1))
		UPNODES[$prc]=$procnum
	done < <(bpstat | grep -i '\<up\>' | awk '{print $1}' | grep -v '^[ 	]*$')

	# now assign the different chromosome jobs to the upnodes
	nodes=${#UPNODES[@]}
	echo "Number of up nodes is $nodes"
	: > "$ifjob"	#start from an empty file
	for chrm in "${CHRML[@]}"
	do
		# capture du on its own so that its exit status is the one tested
		# (after 'du | awk' the status would be awk's)
		if ! duout=$(du -kL "${GLIB}/${CHRPFX}${chrm}/book"); then
			echo "Cannot find the size of ${GLIB}/${CHRPFX}${chrm}/book "
			exit 1
		fi
		jsize=$(printf '%s\n' "$duout" | awk '{print $1}')
		#echo "jsize=$jsize"
		echo "$chrm $jsize" >> "$ifjob"
	done
	# below, [ 	] is actually [ \t]
	# largest job first: numeric, reversed, on the size column.
	# (-k2,2nr replaces the obsolete '+1nr' key syntax)
	grep -v '^[ 	]*$' "$ifjob" | sort -k2,2nr > "$ifjob1"
	# joborder.pl presumably writes "<chromosome> <UPNODES index>" lines to
	# jobs.out in the current directory - that is what is read back below
	joborder.pl "$nodes" "$ifjob1"
	if [ -r "$ofjob" ]; then
		while read -r curjob curnode
		do
			CHRMNODE["$curjob"]=${UPNODES["$curnode"]}
		done < "$ofjob"
	else
		echo "joborder.pl did not produce $ofjob, no node assignment made"
	fi
	rm -f "$ifjob" "$ifjob1" "$ofjob"
	#echo ${CHRMNODE[@]}
	######### paral ###########
}
# parallelJobs()

#dopsite chrmnumber workdir
#Runs sortmrna.pl and promotsite.pl for one chromosome inside 'workdir'.
#  $1 : chromosome directory number (selects ${GLIB}/${CHRPFX}<number>/mrna.txt)
#  $2 : working directory; must already contain a non-empty file 'data'
#Reads the globals GLIB, CHRPFX, PARSRC and SCANOP.
#Changes the working directory of the calling shell, so it is meant to be
#run inside a subshell.
#Returns 1 when 'workdir' cannot be entered.
function dopsite
{	
	local curcnt=$1
	local curdir=$2

	#never continue in the wrong directory: the links created and the files
	#removed below are all relative to the working directory
	if ! cd "$curdir"; then
		echo "Cannot change to directory $curdir"
		return 1
	fi
	echo "chromosome $curdir : Promotsite log"
	if [ ! -s "data" ]; then		
		echo "'data' empty/not found"
		return
	fi

	echo "executing promotsite "
	#make links	
	ln -sf "${GLIB}/${CHRPFX}${curcnt}/mrna.txt" .
	ln -sf "$PARSRC/psparams" .
	ln -sf "$PARSRC/ribl" .
	#run programs
	echo "Running sortmrna"
	sortmrna.pl mrna.txt
	echo "running promotsite"
	promotsite.pl
	echo "promotsite completed"
	#remove the links
	#rm promotsite scan2ps.pl sortmrna.pl 
	rm mrna.txt psparams ribl	
	# if scan option is 1, then remove the scan data file
	# (only once 'psdatainp' exists and is non-empty)
	if [ -s "psdatainp" ] && [ "$SCANOP" -eq 1 ]; then
		rm data
	fi
} #dopsite

#doscan chrmnumber workdir node
#Runs 'scan' for one chromosome inside 'workdir'.
#  $1 : chromosome directory number (selects ${GLIB}/${CHRPFX}<number>/book)
#  $2 : working directory in which the links are made and scan is run
#  $3 : the node on which to run the process (only used with load balancing)
#Reads the globals GLIB, CHRPFX, PARSRC and LOADBAL.
#Changes the working directory of the calling shell, so it is meant to be
#run inside a subshell.
#Returns 1 when fewer than three arguments are given or 'workdir' cannot be
#entered.
function doscan
{
	if [ $# -lt 3 ]; then
		echo "Not enough number of arguments to doscan()"
		return 1
	fi

	local curcnt=$1
	local curdir=$2
	local procn=$3

	#never continue in the wrong directory: the links created and removed
	#below are all relative to the working directory
	if ! cd "$curdir"; then
		echo "Cannot change to directory $curdir"
		return 1
	fi
	echo "chromosome $curcnt : Scan log"
	ln -sf "${GLIB}/${CHRPFX}${curcnt}/book" .
	ln -sf "$PARSRC/scanp" .
	ln -sf "$PARSRC/ribl" .

	#Load balance only on SCYLD operating systems with the 'mpprun' command
	if [ "$LOADBAL" = "1" ] && command -v mpprun >/dev/null 2>&1; then
		echo "mpprun -map $procn scan"
		mpprun -map "$procn" scan
	else
		scan
	fi

	rm ribl scanp book
}
# doscan()

#announce which program(s) are about to run and on which chromosome range
if [ "$RUNOPTION" = "1" ]; then
	echo "Starting Scan on chromosomes $CHRSTCNT TO $CHRENDCNT"
elif [ "$RUNOPTION" = "2" ]; then
	echo "Starting Promotsite on chromosomes $CHRSTCNT TO $CHRENDCNT"
elif [ "$RUNOPTION" = "3" ]; then
	echo "Starting Scan+Promotsite on chromosomes $CHRSTCNT TO $CHRENDCNT"
fi

#node assignment is only attempted when load balancing is switched on
[ "$LOADBAL" != "1" ] || parallelJobs

#Keep the original stdout on fd 3 while the run log is being written.
#(Restoring through the name printed by 'tty' only works when stdin is a
#terminal; with no terminal it prints "not a tty", and it ignores any
#redirection the caller applied to stdout.)
exec 3>&1
exec >"$PARSRC/runscan.log"

date
loopcnt=0
# for count in ${CHRML[@]}	
while [ "$loopcnt" -lt "${#CHRML[@]}" ]
do
	count=${CHRML[$loopcnt]}
	# Throttle on the number of processes that carry this script's name.
	pnum=$(ps -C "$PRNAME" | wc -l)
	#ps -ef | grep $PRNAME
	#echo "Before decrementing, pnum=$pnum"
	# decrement pnum by 2, or else the current process is included as well
	# The first line of ps -C is the format of ps line
	pnum=$((pnum - 2))
	#echo "pnum=$pnum conp=$CONP"

	if [ "$pnum" -le "$CONP" ]; then
		echo "$count"
		#run background function
		cur=${PARSRC}/${CHRPFX}$count
		if [ ! -d "$cur" ]; then
			echo "creating ${CHRPFX}$count"
			if ! mkdir "$cur"; then
				# without the directory the per-chromosome logs cannot
				# even be opened, so skip this chromosome
				echo "Cannot create $cur : skipping chromosome $count"
				loopcnt=$((loopcnt + 1))
				continue
			fi
		fi
		curproc=${CHRMNODE[$count]}
		if ! isdigit "$curproc"; then
			curproc=-1	# assign master node if an error occured
		fi
		echo "cur=$cur count=$count $curproc runoption=$RUNOPTION "
		# each job runs in its own background subshell (doscan/dopsite
		# change directory); fd 3 is closed there because only this
		# shell needs the saved stdout
		if [ "$RUNOPTION" -eq 1 ]; then
			( doscan "$count" "$cur" "$curproc" 1>"$cur/scan.log" 2>&1 ) 3>&- &
		elif [ "$RUNOPTION" -eq 2 ]; then
			( dopsite "$count" "$cur" 1>"$cur/ps.log" 2>&1 ) 3>&- &
			#dopsite $count $cur &
		elif [ "$RUNOPTION" -eq 3 ]; then
			( doscan "$count" "$cur" "$curproc" 1>"$cur/scan.log" 2>&1 ; dopsite "$count" "$cur" 1>"$cur/ps.log" 2>&1 ) 3>&- &
		fi
		loopcnt=$((loopcnt + 1))
		sleep 5
	else
		# limit reached: poll again later
		sleep 300
	fi
done

#waiting for the child processes created by this shell
wait
#now combine all the psBED.txt files into a single file
cd "$PARSRC" || exit 1

# produce the big psBED file:
# the first two lines of the first chromosome's file are copied once, then
# every line not containing 'browser' or 'track' from each chromosome
sed 2q "${CHRPFX}${CHRML[0]}/psBED.txt" > "$ALLPSBED"
for count in "${CHRML[@]}"
do
	sed -n '/browser\|track/!p' "${CHRPFX}${count}/psBED.txt" >> "$ALLPSBED"
done

# produce the sorted BED file using sortBED 
sortBED.pl "$ALLPSBED"
#gettopBED $ALLPSBED 100
date #this is stored in runscan.log
#switch back to the original stdout and release the spare descriptor
exec >&3 3>&-
echo 
echo
echo "Printing the execution logs"
cat "$PARSRC/runscan.log"
echo "****************************"
for count in "${CHRML[@]}"
do
	findCHRM "$count"
	echo "Printing chromosome $CHRMNAME log"
	# which logs exist depends on the program option, so print only those
	# that are present (promotsite log first, then scan log)
	for logfile in "${CHRPFX}${count}/ps.log" "${CHRPFX}${count}/scan.log"
	do
		if [ -f "$logfile" ]; then
			cat "$logfile"
		fi
	done
	echo "****************************"
done
exit 0

