#!/usr/bin/perl
#
# Copyright 2003 Sashidhar Gadiraju, Peter K. Rogan
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
#
#
#Program: sortBED
#version: 0.5
#0.5 : Combined sortBED.pl and BED.pm
#0.3 : Removing the merge option, instead using the -T option of 'sort'
#0.3 : to specify the temporary directory
#0.2 : Command line options provided
#0.2 : Also the sortBED() is now a module to be imported
#This program takes BED file(s) as input and sorts them.  
#Note: This program is a BED sort wrapper for the linux sort command
#

use Getopt::Long;
#use BED;

Getopt::Long::Configure("no_ignore_case");

use strict;

#parse the command line, then hand the resulting key/value list to the sorter
sortBED( procopts() );

#Prints the command line help text to STDOUT.
#Returns 1, so that a caller can write "usage() and exit".
sub usage
{
	print "Usage: $0 [-cdhiotw]  [BEDfile]
	-c sorted_column
		Default: 4
		The default column is the BED score field 
		To sort on coordinate, choose 1
		Note: The first col in the BED file is taken as zero
		Note: The sort is numerical, not alphabetic 
	-d direction
		Default: -
		The direction of sort, whether ascending(+) or descending(-)
	-h 
		this help menu
	-i input_BED_file
		Default: psBED.txt
		The input file name can be specified as the last
		command line argument as well.
		If both -i option is given, and a file is mentioned 
		as the last argument, then the latter is taken.
	-o output_sorted_BED_file
		Default: sorted_psBED.txt
		The name of the sorted BED file
	-t TMP_DIR
		Default: /tmp
		This is where the 'sort' command stores the temporary
		sorted files. 
		If the /tmp directory does not have space for 
		the whole sorted file, then the current directory is 
		checked. If space is present, then a tmp_bed_sort is created
		in the current directory to store the temporary files.
		If any other directory is specified with -t, only that
		directory is selected.
	-w window_size
		Default: 10000
		The UCSC browser position of the sorted BED file is set
		to (start - window_size) and (end + window_size) for 
		the first binding site in the file.
		\n";
	return 1;
}

#Processes the command line options.
#The options are read from @ARGV with GetOptions; a bare argument left over
#afterwards is taken as the input file and overrides -i.
#With the help option the usage text is printed and the program exits.
#An option that GetOptions rejects also prints the usage text and exits
#with status 1.
#Returns a flat (key, value, ...) list holding only the options that were
#actually given, under the argument names read by sortBED():
#OBED, IBED, TMPDIR, DIRECTION, SORTCOLUMN, WSIZE
sub procopts
{
	print "in procopts\n";
	my %OPT;
	$OPT{'h'} = 0;	#help option
	my $parsed = GetOptions(
		"o=s" => \$OPT{'o'},
		"i=s" => \$OPT{'i'},
		"t=s" => \$OPT{'t'},
		"d=s" => \$OPT{'d'},
		"c=s" => \$OPT{'c'},
		"w=s" => \$OPT{'w'},
		"help" => \$OPT{'h'},
	);
	if( ! $parsed )	#GetOptions has already reported the offending option
	{
		usage();
		exit 1;
	}
	if(@ARGV)	{$OPT{'i'} = shift @ARGV;	}	#read the last argument
	if( $OPT{'h'} )
	{	usage() and exit;	}

	#option letter => sortBED() argument name, in the order they are returned
	#the sort column must go out as SORTCOLUMN, the key sortBED() looks up
	my @keymap = (
		[ 'o', 'OBED' ],
		[ 'i', 'IBED' ],
		[ 't', 'TMPDIR' ],
		[ 'd', 'DIRECTION' ],
		[ 'c', 'SORTCOLUMN' ],
		[ 'w', 'WSIZE' ],
	);
	my @str = ();	#the array to hold the hash value inputs to sortBED()
	foreach my $pair (@keymap)
	{
		my( $letter, $key ) = @$pair;
		push(@str, $key, $OPT{$letter}) if( defined $OPT{$letter} );
	}

	#join explicitly instead of changing the global list separator $"
	print join(";", @str), "\n";
	return @str;
}#procopts()


#*********** BED.pm module***********#
#sortBED is now a pluggable module
#Sorts a BED file numerically on one column using the system 'sort' command.
#Arguments are key/value pairs; a key that is not given keeps its default:
#	IBED       - input BED file (default psBED.txt)
#	OBED       - sorted output BED file (default sorted_psBED.txt)
#	TMPDIR     - directory for the temporary files (default /tmp)
#	DIRECTION  - "-" sorts descending, anything else ascending (default -)
#	SORTCOLUMN - column to sort on, the first column is 0 (default 4)
#	             the key SORTCOL is accepted as an alias
#	WSIZE      - window size handed to printBEDheaderOver() (default 10000)
#Returns the result of printBEDheaderOver() when the sort went through,
#0 when one of the steps failed.
#checkspace() dies when the input file cannot be stat'ed, and
#printBEDheader() dies when the input or output file cannot be opened.
sub sortBED
{
	my $Ripos = 4;	#position of Ri value(Score) field in the BED file - first field is taken as 0
	my %given = @_;
	my %args = (
		IBED => 'psBED.txt',
		OBED =>	'sorted_psBED.txt',
		TMPDIR => '/tmp',
		DIRECTION => '-',
		SORTCOLUMN => $Ripos,
		WSIZE => 10000,
		%given,
	);
	#the short key is honoured only when the long one was not passed
	if( exists $given{'SORTCOL'} && ! exists $given{'SORTCOLUMN'} )
	{	$args{'SORTCOLUMN'} = $given{'SORTCOL'};	}

	my $iBED = $args{'IBED'};	#input BED file
	my $oBED = $args{'OBED'};	#sorted BED file
	my $dir = $args{'DIRECTION'};	#sort ascending or descending; default = - i.e. descending
	my $sortcol = $args{'SORTCOLUMN'};	#the column on which the file is sorted
	my $tmpdir = $args{'TMPDIR'};	#the temporary directory used by the sort program
	my $wsize = $args{'WSIZE'};
	my $tmpdirnew = "";

	#the column ends up in the key argument of sort, so insist on a number
	if( ! defined $sortcol || $sortcol !~ /^\s*\d+\s*$/ )
	{
		$sortcol = "" if( ! defined $sortcol );
		print "The sort column must be a non-negative integer, got '$sortcol'\n";
		return 0;
	}
	$sortcol += 0;

	if( ! checkspace(".",$iBED,1) )
	{
		print "The current directory does not have the space to hold the full sorted file\n";
		print "Trying to sort anyways using $tmpdir as the temporary dir \n";
	}
	elsif (! checkspace("$tmpdir", $iBED, 3) )
	{
		print "$tmpdir does not have the space to hold the temporary files\n";
		if($tmpdir eq "/tmp")
		{
			$tmpdirnew = "./tmp_bed_sort";
			if( ! -d $tmpdirnew )
			{	mkdir $tmpdirnew, 0777;	}
			if(checkspace("$tmpdirnew", $iBED, 2) )
			{
				print "Using $tmpdirnew as the new temporary directory\n";
				$tmpdir = $tmpdirnew;
			}
		}#if: inner
	}#elsif:

	printBEDheader($iBED,$oBED);	#print the first 2 lines of BED

	#a uniquely named temporary file; tempfile() croaks on failure, hence the eval
	require File::Temp;
	my( $tmpfh, $tmpfile ) = eval {
		File::Temp::tempfile("sortBED_XXXXXX", DIR => $tmpdir, SUFFIX => ".txt");
	};
	if( ! defined $tmpfile )
	{
		print "Error in creating temporary file in $tmpdir: $@ \n";
		return 0;
	}

	#copy everything except the browser and track lines into the temporary file
	#this is done in perl, not with grep, so that a tab really is a tab and
	#odd characters in the file names never reach a shell
	print "Copying the binding site lines of $iBED to $tmpfile\n";
	my $copied = 0;
	if( open(my $in, '<', $iBED) )
	{
		$copied = 1;
		while( my $line = <$in> )
		{
			next if( $line =~ /^[ \t]*(?:browser|track)/ );
			print {$tmpfh} $line or $copied = 0;
		}
		close($in);
	}
	close($tmpfh) or $copied = 0;
	if( ! $copied )
	{
		print "Error in creating temporary file $tmpfile: $! \n";
		unlink($tmpfile);
		return 0;
	}

	#sort -k counts the fields from 1, the BED column from 0
	#the key runs from that field to the end of the line, as the old +N did
	my $keyspec = "-k" . ($sortcol + 1) . "n";
	$keyspec .= "r" if( $dir eq "-" );	#descending
	my @sortcmd = ( "sort", "-T", $tmpdir, $keyspec, $tmpfile );
	print "Executing: ", join(" ", @sortcmd), " >> $oBED\n";

	#run sort without a shell and append its output below the header lines
	my $sorted_ok = 0;
	if( open(my $out, '>>', $oBED) )
	{
		if( open(my $sorted, '-|', @sortcmd) )
		{
			$sorted_ok = 1;
			while( my $line = <$sorted> )
			{
				print {$out} $line or $sorted_ok = 0;
			}
			close($sorted) or $sorted_ok = 0;	#false when sort exits non-zero
		}
		close($out) or $sorted_ok = 0;
	}
	if( ! $sorted_ok )
	{	print "Error in sorting temp file $tmpfile: $? : $!\n";	}

	print "Deleting temporary files\n";
	unlink($tmpfile);	#delete the temporary files
	return 0 if( ! $sorted_ok );

	#now print the sorted file position in the browser line of BED
	print "Setting the UCSC browser position in the sorted file\n";
	return printBEDheaderOver($oBED, $wsize, "w");
}#sortBED()


#Checks if a directory can hold a file whose size is scaled by a factor.
#Assuming that there is no similar parallel execution going on.
#	$dir   - directory on the target mount
#	$file  - full path of the file whose size is taken
#	$scale - factor the file size is multiplied with
#Returns 1 if there is enough space in the dir, else 0.
#A free space that cannot be determined counts as 0 bytes.
#Dies if the file cannot be stat'ed.
sub checkspace
{
	my( $dir, $file, $scale ) = @_;

	my @st = stat($file);
	@st || die "cannot stat $file - probably no file\n";
	my $fsize = $st[7] * $scale;	#scaled file size in bytes

	#run df -kP on the directory, without a shell so that the directory
	#name needs no quoting
	#the lines after the header are joined before the 4th field is taken,
	#because in some OSs the fields are printed in separate lines
	#maintaining the order
	my $availkb = 0;	#free space in kilobytes
	if( open(my $df, '-|', 'df', '-kP', $dir) )
	{
		my @lines = <$df>;
		close($df);
		shift @lines;	#drop the header line
		my @fields = split(' ', join(' ', @lines));
		if( defined $fields[3] && $fields[3] =~ /^\d+$/ )
		{	$availkb = $fields[3];	}
	}

	my $tmpsize = $availkb * 1024;	#size in bytes
	print "Dir:$dir,Size:$tmpsize ; File:$file,ScaledSize:$fsize\n";
	return ( $tmpsize < $fsize ) ? 0 : 1;
}#checkspace()


#Appends spaces to a string so that the whole string has a fixed length.
#	$str     - the text; leading and trailing whitespace is removed first
#	$flength - the wanted total length in characters (default: 100)
#Returns the trimmed text followed by as many spaces as are needed to
#reach $flength. A text longer than $flength is returned trimmed, not cut.
sub getFixedLengthStr
{
	my($str, $flength) = @_;
	$str = "" if( ! defined $str );
	$flength = 100 if( ! defined $flength );

	#the whitespace is stripped right here, so the result does not depend
	#on the context a list-returning helper would be called in
	$str =~ s/^\s+//;
	$str =~ s/\s+$//;

	my $spacenum = $flength - length($str);	#number of trailing spaces
	$spacenum = 0 if( $spacenum < 0 );
	return $str . " " x $spacenum;
}#getFixedLengthStr()
	
#Starts the output BED file with its two header lines.
#	$iBED  - input BED file; its 1st line is skipped as the browser line,
#	         its 2nd line is copied as the track line
#	$oBED  - output BED file, created or truncated
#	$width - optional length the browser line is padded to (default: 100)
#The browser line written is "browser position chr1" padded with spaces,
#which leaves room for printBEDheaderOver() to overwrite it in place.
#Dies if a file cannot be opened or the output cannot be written.
sub printBEDheader
{
	my( $iBED, $oBED, $width ) = @_;	#input, output files, browser line length
	$width = 100 if( ! defined $width );

	open(my $in, '<', $iBED) || die "cannot open input file $iBED: $!";
	my $trackline = <$in>;	#read first line which is the browser position
	$trackline = <$in>;	#read the actual track line
	close($in);

	#an input with less than two lines has no track line to copy
	$trackline = "" if( ! defined $trackline );
	#the sorted lines are appended later, so the track line must end its line
	$trackline .= "\n" if( $trackline ne "" && $trackline !~ /\n$/ );

	open(my $out, '>', $oBED) || die "cannot open output file $oBED: $!";
	my $bline = sprintf("%-*s", $width, "browser position chr1");
	print {$out} "$bline\n", $trackline or die "cannot write to output file $oBED: $!";
	close($out) || die "cannot write to output file $oBED: $!";
}#printBEDheader


#This opens a BED file and overwrites its first line, in place,
#with a new browser position line of the same length.
#	$iBED - the BED file; line 1 is the browser line, line 3 is taken
#	        as the first binding site
#	$pos  - a browser position or a window size, depending on $flag
#	$flag - "b": $pos is the position text to write
#	        "w": $pos is a window size; the position written is
#	             (start - $pos) to (end + $pos) of the first binding site,
#	             with the start never below 1
#	        with any other flag only "browser position" is written
#Returns 1 when the line was overwritten, 0 when the file could not be
#opened, has no browser line or binding site, or when the existing browser
#line is too short for the new one.
sub printBEDheaderOver
{
	my ( $iBED, $pos, $flag) = @_;
	#bed file, position or window size, flag to determine window or position

	my $in;
	if( ! open($in, '<', $iBED) )
	{
		warn "cannot open input file $iBED: $!";
		return 0;
	}
	my $bline = <$in>;	#read the  browser line i.e 1st line
	my $chline = <$in>; $chline = <$in>;	#read the first binding site line i.e. 3rd line
	close($in);
	if( ! defined $bline )
	{
		print "Unable to overwrite the browser line, $iBED is empty\n";
		return 0;
	}
	chomp($bline);	#the trailing \n is not part of the overwritten text

	#strip the whitespace around the inputs
	foreach my $input ($pos, $flag, $chline)
	{
		$input = "" if( ! defined $input );
		$input =~ s/^\s+//;
		$input =~ s/\s+$//;
	}

	my $newbline = "browser position ";
	if( $flag eq "b" )	#browser position is specified explicitly
	{
		$newbline .= $pos;
	}
	elsif( $flag eq "w" )	#updown window size is specified
	{
		my @bed = split( /\s+/, $chline );
		if( @bed < 3 )
		{
			print "Unable to find the first binding site in $iBED\n";
			return 0;
		}
		my $st = $bed[1] - $pos;
		my $end = $bed[2] + $pos;
		$st = ($st<1)?1:$st;
		$newbline .= $bed[0].":".$st."-".$end;
	}
	$newbline =~ s/\s+$//;

	if( length($bline) < length($newbline) )	#unsuccessful attempt to overwrite the header line
	{
		print "Unable to overwrite the browser line with new coordinate information\n";
		print "The browser line does not have enough columns to accomodate the new browser line\n";
		return 0;
	}
	#pad to exactly the old length, so that the overwrite leaves no
	#remains of the old line and does not touch the line end
	$newbline = sprintf("%-*s", length($bline), $newbline);

	my $out;
	if( ! open($out, '+<', $iBED) )
	{
		warn "cannot open outputfile $iBED: $!";
		return 0;
	}
	print {$out} $newbline;
	close($out) || return 0;
	return 1;
}#printBEDheaderOver()
		

#Removes leading and trailing whitespace from the input arguments.
#Similar to chomp, the arguments themselves are changed as well, so an
#argument with surrounding whitespace has to be a modifiable variable.
#Returns the trimmed strings in list context and the first trimmed string
#in scalar context, so that both "@a = trim(@a)" and "$s = trim($s)" work.
#Without arguments the empty list, resp. undef, is returned.
#Undefined arguments are passed through unchanged.
sub trim
{
	return ( wantarray ? () : undef ) if( ! @_ );
	my @retarr=();
	foreach my $str (@_)
	{
		if( defined $str )
		{
			$str =~ s/^\s+//;   #remove leading whitespaces
			$str =~ s/\s+$//;   #remove trailing whitespaces
		}
		push(@retarr, $str);
	}
	#a bare "return @retarr" would give the element count in scalar context
	return wantarray ? @retarr : $retarr[0];
}
 

#Returns 1 if the argument is a whole number with an optional + or - sign,
#whitespace around it being allowed, else 0.
sub isInt
{
	my ($value) = @_;
	return 1 if( $value =~ /^\s*[\+-]?\d+\s*$/ );
	return 0;
}
 
#Returns 1 if the argument is a whole number without a minus sign
#(an optional + and whitespace around it are allowed), else 0.
sub isPosInt
{
	my ($value) = @_;
	my $matched = ( $value =~ /^\s*[\+]?\d+\s*$/ );
	return $matched ? 1 : 0;
}
 
#Returns 1 if the argument is a whole number with a leading minus sign,
#whitespace around it being allowed, else 0.
#The minus sign is required: an unsigned number is not a negative one.
sub isNegInt
{  return ($_[0] =~ /^\s*-\d+\s*$/)?1:0;  }
 
#********** BED.pm ************#
