#!/usr/bin/env perl

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# This file is part of G-language Genome Analysis Environment package
#
#     Copyright (C) 2001-2007 Keio University
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 
#   $Id: Util.pm,v 1.1.1.1 2002/04/02 20:25:43 gaou Exp $
#
# G-language GAE is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# 
# G-language GAE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with G-language GAE -- see the file COPYING.
# If not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# 
#END_HEADER
#
# written by Kazuharu Arakawa <gaou@sfc.keio.ac.jp> at
# G-language Project, Institute for Advanced Biosciences, Keio University.
#

package G::Seq::Util;

use SubOpt;
use G::Messenger;
use G::Seq::Primitive;
use G::Tools::Graph;

use strict;
use Cwd;
use SelfLoader;


require Exporter;

our @ISA = qw(Exporter);

our @EXPORT = qw(
		 find_king_of_gene
		 atcgcon
		 cds_echo
		 print_gene_function_list
		 seqinfo
		 maskseq
		 pasteseq
		 molecular_weight
		 view_cds
		 oligomer_translation
		 );


#:::::::::::::::::::::::::::::::::
#       Perldoc
#:::::::::::::::::::::::::::::::::


=head1 NAME

  G::Seq::Util - Miscellaneous analysis methods related to sequence analysis.

=head1 DESCRIPTION

    This class is a part of G-language Genome Analysis Environment, 
    collecting miscellaneous sequence analysis methods.

=cut

#::::::::::::::::::::::::::::::
#        Constants
#::::::::::::::::::::::::::::::

my %COG_fcode = (
		 J=>"Translation, ribosomal structure and biogenesis",
		 K=>"Transcription",
		 L=>"DNA replication, recombination and repair",
		 D=>"Cell division and chromosome partitioning",
		 O=>"Posttranslational modification, protein turnover, chaperones",
		 M=>"Cell envelope biogenesis, outer membrane",
		 N=>"Cell motility and secretion",
		 P=>"Inorganic ion transport and metabolism",
		 T=>"Signal transduction mechanisms",
		 C=>"Energy production and conservation",
		 G=>"Carbohydrate transport and metabolism",
		 E=>"Amino acid transport and metabolism",
		 F=>"Nucleotide transport and metabolism",
		 H=>"Coenzyme metabolism",
		 I=>"Lipid metabolism",
		 Q=>"Secondary metabolites biosynthesis, transport and catabolism",
		 R=>"General function prediction only",
		 S=>"Function unknown",
		 '-'=>"Non COG"
		 );

my %COG_fcolor = (
		  J=>"plum",
		  K=>"fuchsia",
		  L=>"pink",
		  D=>"lightgreen",
		  O=>"green",
		  M=>"khaki",
		  N=>"greenyellow",
		  P=>"darkkhaki",
		  T=>"cyan",
		  C=>"blue",
		  G=>"mediumturquoise",
		  E=>"lightskyblue",
		  F=>"mediumpurple",
		  H=>"aqua",
		  I=>"blueviolet",
		  Q=>"lightskyblue",
		  R=>"gainsboro",
		  S=>"darkgrey",
		  '-'=>"aliceblue"
		  );

__DATA__

#::::::::::::::::::::::::::::::
#        Methods Start
#::::::::::::::::::::::::::::::





=head2 seqinfo

  Name: seqinfo   -   prints out basic nucleotide sequence statistics

  Description:
    This method prints out basic compositional statistics of the 
    given nucleotide sequence, in a format similar to the one printed
    right after calling new G().

 Usage: 
    seqinfo($genome)

 Options:
   none

  Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20020207-01 initial posting

=cut


sub seqinfo {
    my @args = opt_get(@_);
    my $this = opt_as_gb(shift @args);
    my $length = length($this->{SEQ});

    my $a = $this->{SEQ} =~ tr/a/a/;
    my $t = $this->{SEQ} =~ tr/t/t/;
    my $g = $this->{SEQ} =~ tr/g/g/;
    my $c = $this->{SEQ} =~ tr/c/c/;
    my $others = $length - $a - $t - $g - $c;
    my $msg;

    $msg .= sprintf "\n  Length of Sequence : %9d\n" , $length;
    $msg .= sprintf "           A Content : %9d (%.2f\%)\n" , 
    $a , $a / $length * 100;
    $msg .= sprintf "           T Content : %9d (%.2f\%)\n" , 
    $t , $t / $length * 100;
    $msg .= sprintf "           G Content : %9d (%.2f\%)\n" , 
    $g , $g / $length * 100;
    $msg .= sprintf "           C Content : %9d (%.2f\%)\n" , 
    $c , $c / $length * 100;
    $msg .= sprintf "              Others : %9d (%.2f\%)\n" , 
    $others,  $others / $length * 100;
    $msg .= sprintf "          AT Content :    %.2f\%\n", 
    ($a + $t) / $length * 100;
    $msg .= sprintf "          GC Content :    %.2f\%\n\n", 
    ($g + $c) / $length * 100;

    &msg_send($msg);

    return ($a, $t, $g, $c);
}



=head2 molecular_weight

  Name: molecular_weight   -   calculates the molecular weight of given nucleotide sequence

  Description:
    This method calculates the molecular weight of the given
    nucleotide sequence, taking into account the hydrogen bonds 
    between molecules.. Molecular weight used in this method
    is as follows:
       A: 313.15
       T: 304.19
       G: 329.19
       C: 289.13
       N: 308.915

 Usage: 
    -strand      "single" or "double" strand DNA molecule (default:single)

 Options:
   none

  Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20011029-01 initial posting

=cut

sub molecular_weight {
    opt_default(strand=>"single");
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my $strand = opt_val("strand");

    my %mw = ("a", 313.15, "t", 304.19, "g", 329.19, "c", "289.13", "n", 308.915);
    my $i = 0;
    my $weight = 0;

    while(substr($gb->{SEQ}, $i, 1) ne ''){
	if (substr($gb->{SEQ}, $i, 1) =~ /[atgc]/){
	    $weight += $mw{substr($gb->{SEQ}, $i, 1)};
	}else{
	    $weight += $mw{"n"};
	}
	$i ++;
    }
    my $double = $weight * 2;

    msg_send(sprintf "  Molecular Weight of Nucleotides:\n");
    msg_send(sprintf "    single strand:  %12d\n",$weight); 
    msg_send(sprintf "    double strand:  %12d\n\n\n",$double); 

    $weight *= 2 if ($strand eq "double");


    return $weight;
}


#find_king_of_gene ver.20010608-01
#scripting by Koya Mori(s98982km@sfc.keio.ac.jp)
#This program finds king of gene.
#have fun:)
#(string)=&find_king_of_gene(pointer GENOME,  boolean debug);

sub find_king_of_gene{
    my $nuc=shift;
    my $gene='you have just found the king of genes.'."\n";
    
    system("wget http://www.stagnightout.com/pics/what-to-wear/21280.jpg -O /tmp/afro.jpg -q");
    msg_gimv('/tmp/afro.jpg');
    
    return $gene;
}


#atcgcon ver.20010623-01
#scripting by Koya Mori(mory@g-language.org)
#This program counts bases in CDS.
#(hash result)=&bun(pointer Genome,  boolean debug);
sub atcgcon{
    &opt_default(output=>"stdout",filename=>"cds_info.csv");
    my @args=opt_get(@_);

    my $gb=opt_as_gb(shift @args);
    my $output=opt_val("output");
    my $filename=opt_val("filename");
    my $start;
    my $end;
    my $seq;
    my $num=1;
    my %hash;


    foreach($gb->feature()){
	if($gb->{"FEATURE$num"}->{type} eq 'CDS'){
	    $start=$gb->{"FEATURE$num"}->{start};
	    $end=$gb->{"FEATURE$num"}->{end};
	    $seq=$gb->getseq($start-1,$end-1);
	    $hash{a} += $seq =~tr/a/a/;
	    $hash{t} += $seq =~tr/t/t/;
	    $hash{g} += $seq =~tr/g/g/;
	    $hash{c} += $seq =~tr/c/c/;
	    $hash{total}+=length($seq);
	}
	$num++;
    }

    if($output eq "stdout"){
	&msg_send(sprintf("total:\t%10d base\n",$hash{total}));
	&msg_send(sprintf("a:\t%10d / %2.2f\%\n", $hash{a}, 100.0*$hash{a}/$hash{total}));
	&msg_send(sprintf("t:\t%10d / %2.2f\%\n", $hash{t}, 100.0*$hash{t}/$hash{total}));
	&msg_send(sprintf("c:\t%10d / %2.2f\%\n", $hash{c}, 100.0*$hash{c}/$hash{total}));
	&msg_send(sprintf("g:\t%10d / %2.2f\%\n", $hash{g}, 100.0*$hash{g}/$hash{total}));
	
	&msg_send(sprintf("GC content:\t%.2f\%\n", 100.0*($hash{c} + $hash{g}) / $hash{total}));
    }
    if($output eq "f"){
	open(FILE,">$filename");
	printf FILE "total:\t%10d base\n",$hash{total};
	printf FILE "a:\t%10d / %2.2f\%\n", $hash{a}, 100.0*$hash{a}/$hash{total};
	printf FILE "t:\t%10d / %2.2f\%\n", $hash{t}, 100.0*$hash{t}/$hash{total};
	printf FILE "c:\t%10d / %2.2f\%\n", $hash{c}, 100.0*$hash{c}/$hash{total};
	printf FILE "g:\t%10d / %2.2f\%\n", $hash{g}, 100.0*$hash{g}/$hash{total};
	
	printf FILE "GC content:\t%.2f\%\n", 100.0*($hash{c} + $hash{g}) / $hash{total};
	close(FILE);
    }

    return \%hash;
}


#cds_echo ver.20010623-01
#scripting by Koya Mori(mory@g-language.org)
#This program displayes CDS start and end positions.
#(hash result)=&bun(pointer Genome,  boolean debug);
sub cds_echo{
    my $gb=opt_as_gb(shift);
    my $start;
    my $end;
    my $i=1;

    foreach($gb->feature()){
        if($gb->{"FEATURE$i"}->{type} eq 'CDS'){
	    if($gb->{"FEATURE$i"}->{direction} eq 'direct'){
		$start = $gb->{"FEATURE$i"}->{start};
		$end = $gb->{"FEATURE$i"}->{end};
		&msg_send(sprintf("%d..%d\n",$start,$end));
	    }
	        
	    elsif($gb->{"FEATURE$i"}->{direction} eq 'complement'){
		$start = $gb->{"FEATURE$i"}->{end};
		$end = $gb->{"FEATURE$i"}->{start};
		&msg_send(sprintf("%d..%d\n",$start,$end));
	    }
	}
	$i++;
    }
}


#print_gene_function_list ver.20010718-01
#Author: Kazuharu Arakawa (gaou@sfc.keio.ac.jp)
#1 = &print_gene_function_list (pointer G instance, string sequence)
#This subroutine prints out the number of genes for each functions,
#and the number of specified oligomers in each gene function groups.
#The default for string sequence is 'gctggtgg'.
sub print_gene_function_list {
    my $gb = opt_as_gb(shift);
    my $seq = shift;
    $seq = 'gctggtgg' unless ($seq);
    my $revseq = complement($seq);
    my $i = 1;
    my %chi;
    my %cds;
    my $key;

    while(defined(%{$gb->{"CDS$i"}})){
        my $id = $gb->{"CDS$i"}->{feature};
        my ($function, $tmp) = split(/;/, $gb->{"FEATURE$id"}->{function}, 2);
        my $cdsseq = $gb->get_cdsseq("CDS$i");
        $cds{$function}++;
        
        my $iStart = -1;
        while(0 <= ($iStart = index($cdsseq, $seq, $iStart +1))){
            $chi{$function}++;
        }
        $iStart = -1;
        while(0 <= ($iStart = index($cdsseq, $revseq, $iStart +1))){
	    $chi{$function}++;
	}
        
        $i++;
    }
    
    &msg_send("=== $seq ===\n");
    my $tot = 0;
    foreach $key (sort keys %chi){
        &msg_send(sprintf("%20s: %8d\n",$key, $chi{$key}));
        $tot += $chi{$key};
    }
    
    &msg_send("total: $tot\n\n");
    
    
    &msg_send("=== CDS ===\n");
    $tot = 0;
    foreach $key (sort keys %cds){
        next if ($key !~ /[a-z]/);
        &msg_send(sprintf("%20s: %8d\n", $key, $cds{$key}));
        $tot += $cds{$key};
    }
    
    &msg_send("total: $tot\n");

    return 1;
}


sub maskseq{
    &opt_default(pattern=>"",start=>1,end=>"");
    my @args=opt_get(@_);

    my $gb=opt_as_gb(shift @args);
    my $seq=\$gb->{SEQ};
    my $start=opt_val("start");
    my $end=opt_val("end");
    my $pat=opt_val("pattern");
    my $masked;
    my $null;
 

    $$seq=~tr/ \n[0-9]//d;
    $$seq=~tr/A-Z/a-z/;
    
    $end=length($$seq) if($end eq "");

    for(my $i=0;$i<length($pat);$i++){
	$null.="n";
    }

    if($pat){
	$masked=substr($$seq,$start-1,$end-$start+1);
	$masked=~s/$pat/$null/g;
	substr($$seq,$start-1,$end-$start+1)=$masked;
    }
    else{
	$masked=substr($$seq,$start-1,$end-$start+1);
	$masked=~tr/a-zA-Z/n/;
	substr($$seq,$start-1,$end-$start+1)=$masked;
    }

    return $seq;
}


sub pasteseq{
    &opt_default();
    my @args=opt_get(@_);
    
    my $gb=opt_as_gb(shift);
    my $seq=\$gb->{SEQ};
    my $paste=shift @_;
    my $pos=shift @_;
    
    $$seq=~tr/ \n[0-9]//d;
    $$seq=~tr/A-Z/a-z/;
    $$paste=~tr/A-Z/a-z/;

    substr($$seq,$pos-1,0)=$$paste;

    return $seq;
}







# oligomer_translation ver.20011103-01
# Author: Kazuharu Arakawa
# Usage: (string translation) = &_oligomer_translation(string oligomer,
#         int frame);
# Options:
#   none
# Description:
#   This method returns the mixture of translated amino acid sequence
#   with untranslated nucleotide sequence of an oligomer of given
#   reading frame.

sub oligomer_translation {
    my @args = opt_get(@_);
    my $seq = shift @args;
    my $frame = shift @args;
    my $len = length($seq);
    if ($frame > 3){
	$seq = G::Seq::Util::complement($seq);
	$frame -= 3;
    }

    my %CodonTable = (
               'gac', 'D', 'caa', 'Q', 'gca', 'A', 'ctg', 'L',
               'gat', 'D', 'cag', 'Q', 'gcc', 'A', 'ctt', 'L',
               'gaa', 'E', 'agc', 'S', 'gcg', 'A', 'ata', 'I',
               'gag', 'E', 'agt', 'S', 'gct', 'A', 'atc', 'I',
               'aga', 'R', 'tca', 'S', 'gga', 'G', 'att', 'I',
               'agg', 'R', 'tcc', 'S', 'ggc', 'G', 'cca', 'P',
               'cga', 'R', 'tcg', 'S', 'ggg', 'G', 'ccc', 'P',
               'cgc', 'R', 'tct', 'S', 'ggt', 'G', 'ccg', 'P',
               'cgg', 'R', 'aca', 'T', 'gta', 'V', 'cct', 'P',
               'cgt', 'R', 'acc', 'T', 'gtc', 'V', 'atg', 'M',
               'aaa', 'K', 'acg', 'T', 'gtg', 'V', 'tgg', 'W',
               'aag', 'K', 'act', 'T', 'gtt', 'V', 'tgc', 'C',
               'cac', 'H', 'tac', 'Y', 'tta', 'L', 'tgt', 'C',
               'cat', 'H', 'tat', 'Y', 'ttg', 'L', 'taa', '/',
               'aac', 'N', 'ttc', 'F', 'cta', 'L', 'tag', '/',
               'aat', 'N', 'ttt', 'F', 'ctc', 'L', 'tga', '/'
                  );

    my $return = '';
    my $i;
    for ($i = 0; $i < $len; $i ++){
	if ($i < $frame - 1){
	    $return .= substr($seq, $i, $frame - 1) . '-';
	    $i += $frame - 2;
	} elsif ($i + 3 <= $len){
	    $return .= $CodonTable{substr($seq, $i, 3)};
	    $i += 2;
	    $return .= '-' unless ($i >= $len - 1);
	} else {
	    $return .= substr($seq, $i);
	    last;
	}
    }
    return $return;
}




=head2 view_cds

  Name: view_cds   -   displays a graph of nucleotide contents around start and stop codons

  Description:
    This method creates a graph showing the average A,T,G,C contents
    around start/stop codons. This is useful to view consensus around
    start/stop codons and to find characteristic pattern in CDS. 
    
  Usage : 
    view_cds(G instance);

  Options:
    -length    length in bases to show around start/stop codons
               (default: 100)
    -gap       gap shown in graph in between start/stop codon neighbors
               (default: 3)
    -filename  outfile name   (default: view_cds.png for graph, 
               view_cds.csv for file)
    -output    "f" for file, "g" for graph, "show" to display graph. 
               (default: "show")

  Author: 
    Kazuharu Arakawa (gaou@sfc.keio.ac.jp)

  History:
    20070707-01 moved to G::Seq::Util from G::Seq::GCskew
    20010906-01 initial posting

=cut



sub view_cds{
    &opt_default(length=>100, filename=>"view_cds.png", 
		  gap=>3, output=>"show", application=>"gimv");
    my @args = opt_get(@_);
    my $gb = opt_as_gb(shift @args);
    my (@a, @t, @g, @c, @pos);
    my $numcds = 0;
    my $i = 0;
    my $length = opt_val("length");
    my $filename = opt_val("filename");
    my $output = opt_val("output");
    my $application = opt_val("application");

    $filename = "view_cds.csv" if ($output eq "f" &&
				   opt_val("filename") eq "view_cds.png");
    my $gap = opt_val("gap");

    while(defined %{$gb->{"CDS$numcds"}}){ $numcds ++ }

    for ($i = 0; $i < $length * 4 + 6 + $gap; $i++){
	$a[$i] = 0;
	$t[$i] = 0;
	$g[$i] = 0;
	$c[$i] = 0;
    }

    foreach my $cds ($gb->cds()){
	my $seq;
	$seq  = $gb->before_startcodon($cds, $length);
	$seq .= $gb->startcodon($cds);
	$seq .= $gb->after_startcodon($cds, $length);
	
	for ($i = 0; $i < length($seq); $i ++){
	    if     (substr($seq, $i, 1) eq 'a'){
		$a[$i] += 100/$numcds;
	    }elsif (substr($seq, $i, 1) eq 't'){
		$t[$i] += 100/$numcds;
	    }elsif (substr($seq, $i, 1) eq 'g'){
		$g[$i] += 100/$numcds;
	    }elsif (substr($seq, $i, 1) eq 'c'){
		$c[$i] += 100/$numcds;
	    }
        }
	
	$seq  = $gb->before_stopcodon($cds, $length);
	$seq .= $gb->stopcodon($cds);
	$seq .= $gb->after_stopcodon($cds, $length);
	
	for ($i = 0; $i < length($seq); $i ++){
	    if     (substr($seq, $i, 1) eq 'a'){
		$a[$i + length($seq) + $gap] += 100/$numcds;
	    }elsif (substr($seq, $i, 1) eq 't'){
		$t[$i + length($seq) + $gap] += 100/$numcds;
	    }elsif (substr($seq, $i, 1) eq 'g'){
		$g[$i + length($seq) + $gap] += 100/$numcds;
	    }elsif (substr($seq, $i, 1) eq 'c'){
		$c[$i + length($seq) + $gap] += 100/$numcds;
	    }
	}
    }
    
    for ($i = 1; $i <= $length * 4 + 6 + $gap; $i ++){
	push(@pos, $i);
    }

    if ($output eq "g" || $output eq "show"){
	_UniMultiGrapher(
			 \@pos, -x => "position", -y => "percentage",
			 \@a, -x1=>"A", \@t, -x2=>"T",
			 \@g, -x3=>"G", \@c, -x4=>"C",
			 -filename => $filename,
			 -title => "Base Contents Around Start/Stop Codons"
			 );
	msg_gimv("graph/$filename") if($output eq "show");
    }elsif ($output eq "f"){
	open(OUT, '>data/' . $filename);
	print OUT "position,A,T,G,C\n";
	
	for ($i = 0; $i < $length * 4 + 6 + $gap; $i ++){
	    printf OUT "%d,%3.2f,%3.2f,%3.2f,%3.2f\n", $i + 1, 
	    $a[$i], $t[$i], $g[$i], $c[$i];
	}
	close(OUT);
    }
}


1;
