#!/usr/bin/perl

# Rene Warren 2006,2007
# Short Sequence Assembly by Kmer search and 3' Extension (SSAKE)

#NAME
#   SSAKE v2.0  Rene Warren, September 2007
#   error-handling idea from VCAKE v1.0  William Jeck, May 2007 

#SYNOPSIS
#   Progressive clustering of millions of short DNA sequences by Kmer extension

#LICENSE
#   SSAKE Copyright (c) 2006-2007 Canada's Michael Smith Genome Science Centre.  All rights reserved.
#   Using a complete re-write of error-handling by consensus derivation (VCAKE) with its Copyright (c) 2007 University of North Carolina at Chapel Hill. All rights Reserved.

#   This program is free software; you can redistribute it and/or
#   modify it under the terms of the GNU General Public License
#   as published by the Free Software Foundation; either version 2
#   of the License, or (at your option) any later version.

#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.


use strict;
use Data::Dumper;
use Benchmark;
require "getopts.pl";
use vars qw($opt_f $opt_m $opt_o $opt_v $opt_r);
&Getopts('f:m:l:o:v:r:');
my ($base_overlap,$min_overlap,$verbose,$MIN_READ_LENGTH,$SEQ_SLIDE,$min_base_ratio)=(2,16,0,16,1,0.6);
my $per;

if(! $opt_f){
   print "Usage: $0\n";
   print "-f <A single fasta file containing all the sequence reads (.fa file)>\n";
   print "-m  Minimum number of overlapping bases with the seed/contig during overhang consensus build up (default -m 16)\n";
   print "-o <Minimum number of reads needed to call a base during an extension (default -o 2)>\n";
   print "-r <Minimum base ratio used to accept a overhang consensus base (default -r 0.6)>\n";
   die "-v <Runs in Verbose mode (1=yes 0=no, default -v 0)> (optional)\n";
}

my $file = $opt_f;

$min_overlap = $opt_m if ($opt_m);
$base_overlap = $opt_o if ($opt_o);
$min_base_ratio = $opt_r if ($opt_r);
$verbose = $opt_v if ($opt_v);

if(! -e $file){
   die "Invalid file $file.\n";
}

my $pid_num = getpgrp(0);
my $pid = ".ssake_m" . $min_overlap . "_o" . $base_overlap . "_r" . $min_base_ratio . "_pid" . $pid_num;
my $contig = $file . $pid .  ".contigs";
my $singlet = $file . $pid . ".singlets";
my $short = $file . $pid . ".short";
my $log = $file . $pid . ".log";

open (LOG, ">$log") || die "can write to $log..Exiting.\n";;

if($min_overlap < 11 || $min_overlap > 50){
   my $outofbound_message = "-m must be a number between 11-50 ...Exiting.\n";
   print $outofbound_message;
   print LOG $outofbound_message;
   close LOG;
   exit;
}

if($base_overlap < 1){
   my $outofbound_message = "-o must be set to 1 or higher ...Exiting.\n";
   print $outofbound_message;
   print LOG $outofbound_message;
   close LOG;
   exit;
}

if($min_base_ratio <= 0.5 || $min_base_ratio > 1){
   my $outofbound_message = "-r must be a number between 0.51 and 1.00 ...Exiting.\n";
   print $outofbound_message;
   print LOG $outofbound_message;
   close LOG;
   exit;
}


my $init_message = "\nRunning: $0\n-f $file\n-m $min_overlap\n-o $base_overlap\n-r $min_base_ratio\nContigs file: $contig\nSinglets file: $singlet\nExcluded short reads file: $short\nLog file: $log\n";


print $init_message;
print LOG $init_message;

my $t0 = new Benchmark;
my $date = `date`;
chomp($date);


my $reading_reads_message = "\n\nReading sequences initiated $date\n";
print $reading_reads_message;
print LOG $reading_reads_message;
my ($set,$bin) = &readFasta($file,$short);

my $t1 = new Benchmark;
$date = `date`;
chomp($date);

my $ssake_start_message = "\n\nSequence assembly initiated $date\n";
print $ssake_start_message;
print LOG $ssake_start_message;

#-------------------------------------------------
my ($tig_count,$previous_index) = (0,0);

open (TIG, ">$contig") || die "can write to $contig. Exiting.\n";
open (SIN, ">$singlet") || die "can write to $singlet. Exiting.\n";

eval{

my $status_bar = "+";
for(my $i=1;$i<=99;$i++){
   $per->{$i}++;
   my $ct = $i /10;
   if($ct == int($ct)){$status_bar .= $ct;}else{$status_bar .= "-";}
}
$status_bar .= "+ x 10 [% complete]";
print "$status_bar\n.";

my $keys_start = keys ( %$set );

ASSEMBLY:
foreach my $seq (sort {$set->{$b}<=>$set->{$a}} keys %$set){#cycle through the input [normalized] reads

   my $orig_mer = length($seq);

   if(defined $set->{$seq}){#sequence read hasn't been used, is longer than 11 nt and the user-defined overlap minimum -m
      #### Delete keys ref
      my @o=split(//,$seq);                               

      my $start_sequence = $seq;
      my $reads_needed = $set->{$seq};                      #tracks coverage
      my $total_bases = $orig_mer * $reads_needed;

      deleteData($bin,$set,$seq);                           #remove kmer from hash table and prefix tree
     
      print "\n\n>>> START SEED SEQUENCE :: $seq <<<\n\n" if ($verbose);

      ($seq, $set, $bin, $reads_needed, $total_bases) = doExtension("3", $orig_mer, $seq, $set, $bin, $reads_needed, $total_bases, $min_overlap, $base_overlap, $min_base_ratio, $verbose);

      ####end of 3' extension, beginning of 5' extension  (via 3' RC)
      my $seqrc = reverseComplement($seq);
      ($seqrc, $set, $bin, $reads_needed, $total_bases) = doExtension("5", $orig_mer, $seqrc, $set, $bin, $reads_needed, $total_bases, $min_overlap, $base_overlap, $min_base_ratio, $verbose);

      ####end of 5' extension

      $tig_count++;
      my $leng = length($seqrc);
      my $reversetig = reverseComplement($seqrc);

      if($start_sequence ne $seqrc && $start_sequence ne $reversetig){
         my $cov =  $total_bases / $leng;
         printf TIG ">contig%i|size%i|read%i|cov%.2f\n%s\n", ($tig_count,$leng,$reads_needed,$cov,$reversetig);    #print contigs to file
      }else{
         my $cov = $reads_needed;
         printf SIN ">contig%i|size%i|read%i|cov%.2f\n%s\n", ($tig_count,$leng,$reads_needed,$cov,$start_sequence);    #print singlets to file
      }
   }

   my $keys_left = keys( %$set );
   my $index = (int((($keys_start-$keys_left)/$keys_start)*100));
   if(defined $per->{$index}){
      print "." x ($index - $previous_index);
      $|=1; ###clear buffer
      delete $per->{$index};
   }
   $previous_index = $index;

   last ASSEMBLY if (! $keys_left);
}
print ".";
};###end eval block

my $t2 = new Benchmark;
$date = `date`;
chomp($date);

if($@){
   my $message = $@;
   my $failure = "\n\nSomething went wrong running $0 $date\n$message\n\n";
   print $failure;
   print LOG $failure; 
}else{
   my $success = "\n\n$0 executed normally $date\n";
   print $success;
   print LOG $success;
}

my $read_time = timediff($t1,$t0);
my $assembly_time = timediff($t2,$t1);
my $total_time = timediff($t2,$t0);

printf LOG "\nTime to read the input file: %s \nTime to assemble: %s\nTotal run time: %s", (timestr($read_time), timestr($assembly_time), timestr($total_time));

close TIG;
close SIN;
close LOG;
close SHO;

exit;


#-----------------
sub readFasta{
   my ($file,$short) = @_;

   my ($set,$bin);

   my $ctrd=0;

   open(IN,$file) || die "Can't open $file. Exiting.\n";
   open (SHO, ">$short") || die "can write to $short. Exiting.\n";

   while(<IN>){
      chomp;
      if(/^([ACGT]*)$/i){
         
         my $seq=$1;
         my $orig=uc($seq);  
         my $orig_mer = length($orig);

         if ($orig ne '' && $orig_mer >= $MIN_READ_LENGTH && $orig_mer >= $min_overlap){ 
            ####show progress
            my $s10k = $ctrd / 10000;
            print "." if ($s10k == int($s10k) && $s10k && $ctrd);
            $|=1; ###clear buffer
             
            my $s100k = $ctrd / 100000;
            printf "%i sequences inputted\n", ($ctrd) if ($s100k == int($s100k) && $s100k && $ctrd);

            my @f=split(//,$orig);
            $set->{$orig}++;

            $_ = $orig;
            tr/ACTG/TGAC/;
            my $rc=reverse();
            
            my @r=split(//,$rc);

            $bin->{$f[0]}{$f[1]}{$f[2]}{$f[3]}{$f[4]}{$f[5]}{$f[6]}{$f[7]}{$f[8]}{$f[9]}{$f[10]}{$orig}++;
            $bin->{$r[0]}{$r[1]}{$r[2]}{$r[3]}{$r[4]}{$r[5]}{$r[6]}{$r[7]}{$r[8]}{$r[9]}{$r[10]}{$rc}++;
            $ctrd++;
         }else{
            if($orig_mer < $MIN_READ_LENGTH){
               print SHO "$seq\tInput sequence shorter than minimum read length allowed ($orig_mer < $MIN_READ_LENGTH nt)\n";
            }elsif($orig_mer < $min_overlap){
               print SHO "$seq\tInput sequence shorter than minimum overlap specified($orig_mer < -m $min_overlap)\n";
            }
         }
      }
   }
   my $read_number_message = "$ctrd total sequences (" . keys( %$set ) . " unique)\n";
   printf $read_number_message;
   print LOG $read_number_message;

   close IN;
   close SHO;
   return $set,$bin;
}

#-----------------
sub doExtension{

   my ($direction, $orig_mer, $seq, $set, $bin, $reads_needed, $total_bases, $min_overlap, $base_overlap, $min_base_ratio, $verbose) = @_;

   my $previous = $seq;
   my $extended = 1;

   while($extended){

      my ($ct,$pos,$current_reads,$current_bases,$span) = (0,0,0,0,$orig_mer);

      my $overhang = {};
      my @overlapping_reads = ();
      for (my $x=1;$x <= ($orig_mer *2);$x++){
         ($overhang->{$x}{'A'},$overhang->{$x}{'C'},$overhang->{$x}{'G'},$overhang->{$x}{'T'}) = (0,0,0,0);
      }

      EXTENSION:
      while ($span > $min_overlap){                                                                         #will slide the subseq, until the user-defined min overlap size
         $span = $orig_mer - $ct;
         $pos = length($seq) - $span;

         my $subseq = substr($seq, $pos, $span);                                                            #make a sub-sequence of length l-(1..i) for searching

         my @s=split(//,$subseq);
         my $subset = $bin->{$s[0]}{$s[1]}{$s[2]}{$s[3]}{$s[4]}{$s[5]}{$s[6]}{$s[7]}{$s[8]}{$s[9]}{$s[10]}; #Will grab everything even the reverse complement ones

         print "####$direction' SEARCH Counter:$ct .. Span:$span .. Subseq:$subseq Previous:$previous\n" if ($verbose);

         SEARCH:   #this cycles through limited kmer space
         foreach my $pass (sort {$subset->{$b} <=> $subset->{$a}} keys %$subset){
            if($pass =~ /^$subseq([ACGT]*)/){ 
               #can we align perfectly that subseq to another rd start?
               my $dangle = $1;
               print "\n", "=" x 80, "\n$direction'- FOUND sequence: $pass -> subset: $subseq -> overhang: $dangle\n", "=" x 80, "\n\n" if ($verbose);

               # Collect all overhangs
               push @overlapping_reads, $pass;                  ### all overlapping reads
               my @over = split(//,$dangle);
               my $ct_oh = 0;
 
               foreach my $bz(@over){
                  $ct_oh++;                                     ### tracks overhang position passed the seed  
                  if(defined $set->{$pass}){
                     $overhang->{$ct_oh}{$bz} += $set->{$pass}; ### reflects read coverage (often real duplicates)
                  }else{
                     my $pass_rc = reverseComplement($pass);
                     $overhang->{$ct_oh}{$bz} += $set->{$pass_rc};
                  }
                  print "$ct_oh - $bz = $overhang->{$ct_oh}{$bz}\n" if($verbose);
               }
            }elsif($subseq =~ /$pass/){ ###cases where the read is fully embedded in the search sequence - want to include for coverage calculations
               my $complement_pass = reverseComplement($pass);

               print "$pass found in $subseq ($set->{$pass}) - deleting read: $pass and complement ($set->{$complement_pass}): $complement_pass\n\n" if ($verbose);
               if(defined $set->{$pass}){
                  $current_reads = $set->{$pass};
                  $current_bases = length($pass) * $current_reads;
                  $reads_needed += $current_reads;
                  $total_bases += $current_bases;
                  deleteData($bin,$set,$pass);
               }
               if(defined $set->{$complement_pass}){
                  $current_reads = $set->{$complement_pass};
                  $current_bases = length($complement_pass) * $current_reads;
                  $reads_needed += $current_reads;
                  $total_bases += $current_bases;
                  deleteData($bin,$set,$complement_pass);
               }
            }
         }
         $ct += $SEQ_SLIDE;
      }#while overlap >= user-defined -m minimum
 
      my $consensus = "";
      print "Finished Collecting Overlapping Reads - BUILDING CONSENSUS...\n" if ($verbose);
      print Dumper(@overlapping_reads) if ($verbose);

      ### Build consensus
      CONSENSUS:
      foreach my $ohpos (sort {$a<=>$b} keys %$overhang){
         if($ohpos){

            my $coverage = $overhang->{$ohpos}{'A'}+$overhang->{$ohpos}{'C'}+$overhang->{$ohpos}{'G'}+$overhang->{$ohpos}{'T'};
            print "pos:$ohpos cov:$coverage A:$overhang->{$ohpos}{'A'} C:$overhang->{$ohpos}{'C'} G:$overhang->{$ohpos}{'G'} T:$overhang->{$ohpos}{'T'}\n" if($verbose);

            if ($coverage < $base_overlap){
               print "COVERAGE BELOW THRESHOLD: $coverage < -o $base_overlap @ $ohpos :: will extend by: $consensus\n" if ($verbose);
               last CONSENSUS;
            }
            my $baselist = $overhang->{$ohpos};

            my $ct_dna=0;
            my $previous_bz = "";

            BASE:
            foreach my $bz (sort {$baselist->{$b}<=>$baselist->{$a}} keys %$baselist){
               #print "\t$ct_dna -> $bz..$baselist->{$previous_bz} > $baselist->{$bz}\n";
               if($ct_dna){## the two most abundant bases at that position
                  #print "\t\t$ct_dna\n";
                  if($previous_bz ne "" && ($baselist->{$previous_bz} / $coverage) >= $min_base_ratio && $baselist->{$previous_bz} > $baselist->{$bz}){### a simple consensus btw top 2 
                     $consensus .= $previous_bz;                                         ### build consensus
                     print "Added base $previous_bz (cov = $baselist->{$previous_bz}) to $consensus **\n" if ($verbose);
                     last BASE;
                  }else{
                     print "ISSUES EXTENDING: best base = $previous_bz (cov=$baselist->{$previous_bz}) at $ohpos.  Second-Best: $bz (cov=$baselist->{$bz}) (ratio best=$baselist->{$previous_bz} / total=$coverage) >= $min_base_ratio (-r) -- will terminate with $consensus\n" if($verbose);
                     last CONSENSUS;
                  }
               }
               $previous_bz = $bz;                 
               $ct_dna++;
            }
         }
      }

      ### deal with sequence reads making up the consensus/newly formed contig
      if($consensus ne ""){
        print "Will extend $seq\nwith: $consensus\n\n" if($verbose);
        my $temp_sequence = $seq . $consensus;  ## this is the contig extension
        my $integral = 0;
        foreach my $ro (@overlapping_reads){

           if($temp_sequence =~ /$ro/){                                   ### read found integral in the newly built sequence

             my $complement_ro = reverseComplement($ro);
             $integral=1;

             print "$ro found in $seq ($set->{$ro}) - deleting read: $ro and complement ($set->{$complement_ro}): $complement_ro\n\n" if ($verbose); 
             if(defined $set->{$ro}){
                 $current_reads = $set->{$ro};
                 $current_bases = length($ro) * $current_reads;
                 $reads_needed += $current_reads;
                 $total_bases += $current_bases;
                 deleteData($bin,$set,$ro);
             }

             if(defined $set->{$complement_ro}){
                 $current_reads = $set->{$complement_ro};
                 $current_bases = length($complement_ro) * $current_reads;
                 $reads_needed += $current_reads;
                 $total_bases += $current_bases;
                 deleteData($bin,$set,$complement_ro);
             }
           }
        }
        if(! $integral){### no reads are found overlapping with the consensus might be indicative of low complexity regions -- Stop the extension
           print "No overlapping reads agree with the consensus sequence.   Stopping extension" if ($verbose);
           $extended = 0;
        }else{
           $seq = $temp_sequence;
           print "New Contig is: $seq\n" if ($verbose);
           $extended = 1;
        }

        $previous = $seq;
      }else{### no consensus built, will stop the extension
        $extended = 0;
      }
   }
   print "\n*** NOTHING ELSE TO BE DONE IN $direction prime- PERHAPS YOU COULD DECREASE THE MINIMUM OVERLAP -m (currently set to -m $min_overlap) ***\n\n" if ($verbose);

   return $seq, $set, $bin, $reads_needed, $total_bases;
}

#-------------------
sub deleteData {
        my ($bin,$set,$sequence) = @_;
   
        my @o=split(//,$sequence);
        my $comp_seq = reverseComplement($sequence);
        my @c=split(//,$comp_seq);

        #remove kmer from hash table and prefix tree
        delete $bin->{$o[0]}{$o[1]}{$o[2]}{$o[3]}{$o[4]}{$o[5]}{$o[6]}{$o[7]}{$o[8]}{$o[9]}{$o[10]}{$sequence};
        delete $bin->{$c[0]}{$c[1]}{$c[2]}{$c[3]}{$c[4]}{$c[5]}{$c[6]}{$c[7]}{$c[8]}{$c[9]}{$c[10]}{$comp_seq};
        delete $set->{$sequence};
}
#-----------------------
sub reverseComplement {
        $_ = shift;
        $_ = uc();
        tr/ATGC/TACG/;
        return (reverse());
}

