#!/usr/bin/perl
use strict;
use List::Util 'shuffle';
use List::Util 'max';
use POSIX;

# Script-wide working storage.
my @list;                 # never filled; it only feeds the max() call below
my $max = max(@list);     # max() of an empty list is undef; the real value is assigned later
my @all_lines;            # raw lines of the DNA input file
my @longest_seq;          # length of each input line
my @bin;                  # motif matches counted per bin
my @total_base_count;     # bases counted per bin

##################
### Data input ###
##################

print "\n\n ################################\n";
print " ### Motif Counting into Bins ###\n";
print " ################################ \n\n";

# Motif to search for. It is upper-cased before use.
print " Enter the sequence of the motif:= ";
my $motif = <STDIN>;
die "No motif given (end of input)\n" unless defined $motif;
chomp $motif;
$motif=uc $motif;
# An empty motif matches at every offset without advancing the scan
# position, which would make the counting loop run forever.
die "The motif must not be empty\n" if $motif eq '';

# Bin width in bases. It is used as a divisor and as a substr() length,
# so it has to be a positive whole number.
print " Enter the bin size:= ";
my $bin_size = <STDIN>;
die "No bin size given (end of input)\n" unless defined $bin_size;
chomp $bin_size;
$bin_size =~ s/\A\s+|\s+\z//g;
die "The bin size must be a positive integer, got \"$bin_size\"\n"
    unless $bin_size =~ /\A[0-9]+\z/ && $bin_size > 0;

print " Open the DNA sequence file:= ";
my $dna_filename = <STDIN>;
die "No file name given (end of input)\n" unless defined $dna_filename;
chomp $dna_filename;
# Two-argument open() used to strip surrounding whitespace from the name;
# do it explicitly so the same input keeps working with the three-argument
# form, which no longer interprets characters such as "|" or ">" in the name.
$dna_filename =~ s/\A\s+|\s+\z//g;

# Slurp the whole input file; every line is processed individually later.
open(my $dna_fh, '<', $dna_filename)
    or die "Cannot open file \"$dna_filename\": $!\n";
@all_lines = <$dna_fh>;
close $dna_fh;

# Report file, written once all lines have been counted.
open my $ofh, '>','output_normalization.txt'
    or die "Cannot write file \"output_normalization.txt\": $!\n";

#############################
### motif frequency count ###
#############################

# Record the length of every line. Header lines (first character ">") are
# given length 0: the counting loop skips them, so letting a long header
# decide the number of bins would create trailing bins that contain no
# bases at all, and the normalisation would then divide by zero.
for my $i (0 .. $#all_lines)
{
  chomp $all_lines[$i];
  if (substr($all_lines[$i], 0, 1) eq ">") {
    $longest_seq[$i] = 0;
  }
  else {
    $longest_seq[$i] = length $all_lines[$i];
  }
}
$max = max( @longest_seq ) || 0; # Longest sequence in the dataset (0 when there is none)

# Number of bins needed to cover the longest sequence.
my $longest = ceil($max / $bin_size);

# Start every per-bin counter at zero.
for my $i (0 .. $longest - 1) {
  $bin[$i] = 0;
  $total_base_count[$i] = 0;
}


# Process the input line by line: add each sequence line's bases to the
# per-bin base totals, then scan it for the motif and credit every match
# to a bin.
# NOTE(review): the motif is upper-cased when it is read, but the sequence
# text is compared as-is, so lower-case sequence would never match --
# confirm this is intended.
for my $i (0 .. $#all_lines) {
    print "$i\n";

    # Lines whose first character is ">" are skipped entirely.
    next if substr($all_lines[$i], 0, 1) eq ">";

    chomp $all_lines[$i];
    my $sequence = $all_lines[$i];

    # Bases contributed by this line to each bin. A bin that starts past
    # the end of the line yields no substring and therefore adds nothing.
    my $bin_index = 0;
    while ($bin_index < $longest) {
        my $chunk = substr($sequence, $bin_index * $bin_size, $bin_size);
        my $chunk_len = length $chunk;
        $total_base_count[$bin_index] += $chunk_len;
        $bin_index++;
    }

    my $motif_len = length($motif);
    my $cursor    = 0;    # 0-based scan offset; carried over from one bin to the next

    for my $k (1 .. $longest) {
        my $bin_end = $bin_size * $k;    # 1-based position of the last base of bin $k

        while ($cursor < $bin_end) {
            my $window = substr($sequence, $cursor, $motif_len);

            # No match at this offset: slide the window by one base.
            unless ($window =~ m/$motif/g) {
                $cursor++;
                next;
            }

            # Number of matched bases lying beyond the right edge of
            # bin $k. When less than half of the motif overhangs, the
            # match is counted in bin $k, otherwise in the following bin.
            my $overhang = $cursor + $motif_len - $bin_end;
            if ($overhang * 2 < $motif_len) {
                $bin[$k - 1]++;
            }
            else {
                $bin[$k]++;
            }

            # Continue after the match, so matches never overlap.
            $cursor += $motif_len;
        }
    }
}

     
	 # print out total number of matched motifs across all sequences in each bin range.
     # Report: a header row, then one row per bin with its 1-based
     # position range, the number of motif matches, the number of bases
     # seen in that range, and matches per base.
     print $ofh "Bin_Range\tTotal_Match\tTotal_Base\tNormalized_Count(#motif/per base)\n";

     for my $i (1 .. $longest) {
         my $left_boundary  = $bin_size * ($i - 1) + 1;
         my $right_boundary = $bin_size * $i;
         my $matches = $bin[$i-1] || 0;
         my $bases   = $total_base_count[$i-1] || 0;

         # A bin can end up with no bases at all, because the number of
         # bins is derived from the longest input line while ">" header
         # lines contribute no bases. Report "NA" for such a bin instead
         # of aborting the whole report with a division by zero.
         my $Normalized_Count = $bases ? $matches / $bases : 'NA';

         print $ofh "$left_boundary ~ $right_boundary\t $matches\t$bases\t         $Normalized_Count\n";
     }

     # Buffered write errors (e.g. a full disk) only surface on close.
     close $ofh
         or die "Cannot finish writing \"output_normalization.txt\": $!\n";


print " Please see the results in the output file \"output_normalization.txt\"\n\n";

### END ###

