#!/usr/bin/perl  

# Copyright 2003 Sashidhar Gadiraju, Peter K. Rogan
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#


#Program : n2rand.pl
#version : 1.1
#Description:
#This program replaces every run of 'N' or more consecutive occurrences of a nucleotide character 'CH' with random nucleotides drawn from 'a g c t'
#The character sequence can have '\n' in between which are ignored and not included in the sequence size
#IMPORTANT: This reads lines of text from the input file. 
#If this file has no line breaks, and if all the file is read as one single line,
#'out of memory' errors may occur.
#All characters other than \n and \r are counted (even spaces are counted)
#1.1 : Writing a 'n2rchanges' file indicating which portions of the input file are being substituted
#The numbers in this file indicate the character count excluding \n and \r

use strict;
use warnings;

#File-level state. @bp, $buf, $bufcnt and @chlist are read/written by the
#helper subs further down in this file, so their names must not change.
my @bp = qw(a g c t);	#alphabet the random replacement bases are drawn from
my $opfile = "subseq";	#output: copy of the input with qualifying runs randomized
my $changesfile = "n2rchanges";	#output: one "start to end" line per substituted run
my $buf = "";	#line buffer used by getnext()
my $bufcnt = -1;	#read position inside the line buffer; -1 => a new line must be read
my @chlist;	#characters of the current line
my %OPT;
($0) = $0 =~ /([\w\.]+)$/;	#keep only the trailing file-name part of $0 for the usage text

#Consume leading dash-arguments. Only 'h' (help) is recognised; any other
#dash-argument is silently dropped.
while( @ARGV && $ARGV[0] =~ s/^-// )
{
	my $opt = shift;
	$OPT{'h'} = 1 if $opt =~ /h/;
}

#Exit unconditionally after showing the usage text, so a failed print can
#never let execution continue with missing arguments.
if( $OPT{'h'} || @ARGV < 3 )
{
	usage();
	exit;
}

my $ipfile = $ARGV[0];	#read the input file name
#the character to be compared and substituted: first character of the
#second argument, lowercased
my $CH = substr( lc($ARGV[1]), 0, 1 );
if( $CH eq "" )
{	die "char argument must not be empty\n";	}

my $N = $ARGV[2];	#minimum number of continuous positions of the input character
if( $N !~ /^[\+]?\d+$/ )
{	die "not a valid digit sequence";	}
#A run always has at least one character, so a threshold of 0 could never
#be hit by the "$ncount == $N" test below and the program would silently
#substitute nothing.
if( $N < 1 )
{	die "seqlength must be at least 1\n";	}

print "starting the random substitution of \'$CH\' for lengths of $N or more\n";
#3-argument open: the file name is never interpreted as part of the mode.
#The bareword handles IP and OP are kept because getnext() and printop() use them.
open(IP, '<', $ipfile) or die "cannot open input file $ipfile: $!";
open(OP, '>', $opfile) or die "cannot open output file $opfile: $!";
open(COP, '>', $changesfile) or die "cannot open output file $changesfile: $!";
print COP "*Substituting $CH sequences of length >= $N with random(a,g,c,t)\n";

#mode values: 0 => non-CH chars, 1 => CH count < N, 2 => CH count >= N
#In mode 0 and 2, char-by-char printing takes place
#In mode 1, output is held back in $seq until the run reaches length N
my $mode = 0;
my $totcount = 0;	#count of characters read so far (\n and \r are not counted)
my $nstart = 1;		#position of the first character of the current run;
			#the first character of the input is position 1
my $ncount = 0;		#the count of continuous '$CH's
my $seq = "";		#the held-back run of $CH (plus embedded line breaks)

#The helper subs are called with a leading '&' so that any prototype on
#their declarations is bypassed.
while ( defined( my $ncl = &getnext() ) )
{
	#$ncl is the original character, $lcncl its lowercased form
	my $is_eol = ( $ncl eq "\n" || $ncl eq "\r" );
	$totcount++ unless $is_eol;
	my $lcncl = lc($ncl);

	if( $mode == 0 )
	{
		if( $lcncl eq $CH )
		{	$mode = 1; $ncount = 1; $seq = $ncl; $nstart = $totcount;	}
		else
		{	&printop($ncl,0);	}
	}
	elsif( $mode == 1 )
	{
		if( $lcncl eq $CH )
		{
			$ncount++;
			$seq .= $ncl;
		}
		elsif( $is_eol )
		{	$seq .= $ncl;	}	#line breaks do not interrupt a run
		else
		{
			#run ended before reaching N: emit it unchanged
			$mode = 0;
			&printop($seq);
			&printop($ncl,0);
		}
	}
	elsif( $mode == 2 )
	{
		if( $lcncl eq $CH || $is_eol )
		{	&printop(&getrandseq($ncl),0);	}
		else
		{
			#run ended: the previous counted character was its last one
			$mode = 0;
			&printop($ncl);
			print COP "$nstart to ".($totcount-1)."\n";
		}
	}

	#Once the held-back run reaches length N it is randomized and written
	#in one go; from then on (mode 2) every further character of the run
	#is handled individually so the whole run never has to sit in memory.
	if( $mode == 1 && $ncount == $N )
	{
		&printop(&getrandseq($seq), 0);
		$seq = "";	#reset seq
		$mode = 2;
	}
}#while(getnext())

if( $mode == 1 )
{
	#Input ended inside a run shorter than N: the held-back characters
	#have not been written yet, so emit them unchanged.
	&printop($seq);
}
elsif( $mode == 2 )
{	print COP "$nstart to $totcount\n";	}

close IP;
#Buffered write errors only surface at close time, so check the output handles.
close(OP) or die "error closing output file $opfile: $!";
close(COP) or die "error closing output file $changesfile: $!";

#END OF MAIN SUBROUTINE
	
#Does a buffered read of characters from the input sequence.
#Reads one line at a time from the IP filehandle into $buf, splits it into
#@chlist and hands the characters out one per call; $bufcnt is the index of
#the next character to return, or -1 when a fresh line has to be read.
#Takes no arguments.
#Returns the next character (line terminators included), or undef once the
#input is exhausted.
sub getnext()
{
	if( $bufcnt < 0 )
	{
		#Test for definedness, not truth: a final line consisting of just
		#"0" with no trailing newline is valid data, not end of file.
		$buf = <IP>;
		return unless defined $buf;

		@chlist = split(//, $buf);
		$bufcnt = 0;
	}

	my $retch = $chlist[$bufcnt++];
	#line fully consumed => force a read on the next call
	if( $bufcnt >= @chlist ){	$bufcnt = -1;	}
	return $retch;
}#getnext()

sub printop()
{
	#Writes a string to the OP filehandle.
	#Arguments:
	#  1: the text to write
	#  2: an option flag that callers may pass (0 or 1); it is accepted but
	#     has no effect on how the text is written
	#Returns whatever print returns.
	my ($text, $unused_opt) = @_;

	return print OP "$text";
}#printop()

#Builds a randomized copy of the input sequence.
#Every character other than "\n" and "\r" is replaced by a base picked at
#random from @bp; line-break characters are copied through untouched.
#Letter case follows the input: a character that equals its own lowercase
#form yields a lowercase base, anything else yields an uppercase base.
#Argument: the input sequence string.
#Returns: the randomized string, same length as the input.
sub getrandseq()
{
	my ($input) = @_;
	my @pieces;

	for my $ch ( split(//, $input) )
	{
		if( $ch eq "\n" || $ch eq "\r" )
		{
			push @pieces, $ch;
			next;
		}

		#exactly one random draw per replaced character
		my $base = $bp[rand @bp];
		push @pieces, ( $ch eq lc($ch) ) ? lc($base) : uc($base);
	}

	return join("", @pieces);
}

#Prints the command-line synopsis, using the script name held in $0.
#Returns the result of print (true when the message was written).
sub usage
{
	my $synopsis = "usage: $0 inpfile char seqlength\n";
	my $note = "The output file created is \'subseq\'\n";
	my $message = $synopsis . $note;

	return print($message);
}

