#!/usr/bin/env perl
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::
#             GCskew.pm odyssey
#
#     Copyright (C) 2001 Keio University
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 
#   $Id: GCskew.pm,v 1.12 2001/10/07 05:23:55 t98901ka Exp $

package G::Seq::GCskew;

use SubOpt;
use strict;
use G::Tools::Graph;
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
use Chart::Graph qw(gnuplot);

require Exporter;

# Inherit import() from Exporter so that `use G::Seq::GCskew` installs the
# functions listed in @EXPORT.
# NOTE(review): AutoLoader is named as a base class, but no
# `require AutoLoader` / `use AutoLoader` is visible in this file -- confirm
# that it is loaded elsewhere or drop it from @ISA.
@ISA = qw(Exporter AutoLoader);
# Functions exported into the caller's namespace by default: the six public
# analysis routines defined in this file.
@EXPORT = qw(
	     find_ori_ter
	     gcskew
	     cum_gcskew
	     genomicskew
	     gcwin
	     view_cds
);
# Module version string.
$VERSION = '0.01';

#::::::::::::::::::::::::::::::
#        Methods Start
#::::::::::::::::::::::::::::::
# Placeholder constructor.
#   $pkg      - class name (unused)
#   $filename - unused
#   $option   - unused
# Returns an undefined value: no object is built or blessed here.
sub new {
    my ($pkg, $filename, $option) = @_;

    my $this;
    return $this;
}


# view_cds ver.20010906-01
# Author: Kazuharu Gaou Arakawa
# Usage: &view_cds(pointer G instance);
# Options:
# -length    length in bases to show around start/stop codons
# -gap       gap shown in graph in between start/stop codon neighbors
# -filename  outfile name   (default: view_cds.gif for graph, 
#                                     view_cds.csv for file)
# -output    "f" for file, "g" for graph, "show" to display graph.
#            (default: "show")
# -application  application used to open the graph image when
#            -output is "show" (default: "gimv")
# Description:
#   This method creates a graph showing the average A,T,G,C contents
#   around start/stop codons. This is useful to view consensus around
#   start/stop codons and to find characteristic pattern in CDS. 
# Requirements
#   sub _UniMultiGrapher
#   SubOpt

# Tabulates the percentage of a/t/g/c at every position around the start and
# stop codons of all CDS, then graphs the table or writes it as CSV.
#
# Arguments (after SubOpt option parsing):
#   $gb  - G instance; must provide cds(), before_startcodon(), startcodon(),
#          after_startcodon(), before_stopcodon(), stopcodon() and
#          after_stopcodon().
# Options: -length, -gap, -filename, -output ("f", "g" or "show"),
#          -application.
#
# Column layout (0-based index into the count arrays):
#   [0 .. 2*length+2]                      around the start codon
#   [2*length+3+gap .. 4*length+5+gap]     around the stop codon
# Only lowercase a/t/g/c are counted.
sub view_cds{
    &opt::default(length=>100, filename=>"view_cds.gif", 
		  gap=>3, output=>"show", application=>"gimv");
    my @args = opt::get(@_);
    my $gb = shift @args;
    my $length = opt::val("length");
    my $filename = opt::val("filename");
    my $output = opt::val("output");
    my $application = opt::val("application");
    my $gap = opt::val("gap");

    # File output gets a .csv default name unless the caller chose a name.
    $filename = "view_cds.csv" if ($output eq "f" &&
				   $filename eq "view_cds.gif");

    my $flank       = 2 * $length + 3;     # upstream + codon + downstream
    my $width       = 2 * $flank + $gap;   # == $length * 4 + 6 + $gap
    my $stop_offset = $flank + $gap;       # first column of the stop region

    my @a = (0) x $width;
    my @t = (0) x $width;
    my @g = (0) x $width;
    my @c = (0) x $width;
    my %column_for = (a => \@a, t => \@t, g => \@g, c => \@c);

    # Normalise by the number of CDS actually iterated below, so that the
    # four percentages at a fully covered position add up to 100.
    my @cds_ids = $gb->cds();
    my $numcds  = scalar @cds_ids;
    my $share   = $numcds ? 100 / $numcds : 0;

    # Adds one CDS worth of percentage to each column matched by $seq.
    my $tally = sub {
	my ($seq, $offset) = @_;
	for my $k (0 .. length($seq) - 1){
	    my $column = $column_for{ substr($seq, $k, 1) } or next;
	    $column->[$offset + $k] += $share;
	}
    };

    foreach my $cds (@cds_ids){
	my $seq;
	$seq  = $gb->before_startcodon($cds, $length);
	$seq .= $gb->startcodon($cds);
	$seq .= $gb->after_startcodon($cds, $length);
	$tally->($seq, 0);

	$seq  = $gb->before_stopcodon($cds, $length);
	$seq .= $gb->stopcodon($cds);
	$seq .= $gb->after_stopcodon($cds, $length);
	# Fixed offset: a shorter-than-expected sequence must not spill
	# back into the start-codon columns.
	$tally->($seq, $stop_offset);
    }

    my @pos = (1 .. $width);

    if ($output eq "g" || $output eq "show"){
	_UniMultiGrapher(
			 \@pos, -x => "position", -y => "percentage",
			 \@a, -x1=>"A", \@t, -x2=>"T",
			 \@g, -x3=>"G", \@c, -x4=>"C",
			 -filename => $filename,
			 -title => "Base Contents Around Start/Stop Codons"
			 );
	# NOTE(review): the command line goes through the shell so that '&'
	# backgrounds the viewer; -application and -filename must be trusted.
	system($application . ' graph/' . $filename . ' &') 
	    if ($output eq "show");
    }elsif ($output eq "f"){
	mkdir ("data", 0777);
	if (open(my $out, '>', 'data/' . $filename)){
	    print $out "position,A,T,G,C\n";
	    for my $i (0 .. $width - 1){
		printf $out "%d,%3.2f,%3.2f,%3.2f,%3.2f\n", $i + 1, 
		    $a[$i], $t[$i], $g[$i], $c[$i];
	    }
	    close($out) or warn "view_cds: error closing data/$filename: $!\n";
	}else{
	    warn "view_cds: cannot write data/$filename: $!\n";
	}
    }
}


# find_ori_ter ver. 20010905-01
# Author: Kazuharu Gaou Arakawa (gaou@g-language.org)
# Usage:
#   (int origin, int terminus) = &find_ori_ter(pointer GENOME);
# Options:
#   -window    window size to analyze the cumulative GC skew (default: 1000)
# Description:
#   Predicts the replicational origin and terminus
# Requirements:
#   use SubOpt;
# History:
#   20010905-1 options update
#   20010326-1 initial posting

# Predicts the replication origin and terminus from the cumulative skew
# (c - g) / (c + g) over consecutive, non-overlapping windows.
#
# Arguments (after SubOpt option parsing):
#   $ref_Genome - hash reference whose {SEQ} entry holds the sequence;
#                 only lowercase g and c are counted.
# Options:
#   -window  window size in bases (default: 1000)
# Returns:
#   (origin, terminus): centre positions of the windows where the
#   cumulative skew reaches its maximum and minimum. Both default to the
#   centre of the first window when the skew never leaves 0.
#   Returns an empty list if the window size is not positive.
sub find_ori_ter {
    &opt::default(window=>1000);
    my @args = opt::get(@_);
    my $ref_Genome = shift @args;
    my $iWindow = opt::val("window");
    my ($max, $maxi, $min, $mini) = (0, 0, 0, 0);
    my $cumulative = 0;

    &msg::send("\nfind_ori_ter:\n");
    &msg::send("   Window size = $iWindow\n");

    # A window of 0 or less would never advance through the sequence.
    if ($iWindow < 1){
	&msg::error("ERROR: window size must be a positive number\n");
	return;
    }

    my $genome_length = length($ref_Genome->{SEQ});

    # Only complete windows are analysed; a shorter tail is ignored.
    for (my $i = 0; $genome_length - $i * $iWindow >= $iWindow; $i++){
	my $chunk = substr($ref_Genome->{SEQ}, $i * $iWindow, $iWindow);
	my $g = ($chunk =~ tr/g//);
	my $c = ($chunk =~ tr/c//);

	if ($c + $g == 0){
	    # Report the window being skipped (not the one after it).
	    &msg::error("ERROR: Window " , $i * $iWindow, "-" ,
			$i * $iWindow + $iWindow, "bp contains no C or G\n");
	    next;
	}

	$cumulative += ($c-$g)/($c+$g);
	if ($cumulative > $max){
	    $max = $cumulative;
	    $maxi = $i;
	}elsif($cumulative < $min){
	    $min = $cumulative;
	    $mini = $i;
	}
    }

    my $origin   = $maxi * $iWindow + $iWindow / 2;
    my $terminus = $mini * $iWindow + $iWindow / 2;

    &msg::send("   Predicted Origin:   " , $origin , "\n");
    &msg::send("   Predicted Terminus: " , $terminus , "\n\n");

    return ($origin, $terminus);
}


# gcskew ver.20010905-01
# Author: Kazuharu "Gaou" Arakawa (gaou@g-language.org)
# Usage: array gcskew = &gcskew(pointer sequence);
# Options:
#   -window      window size to observe (default: 10000)
#   -at          1 when observing AT skew instead of GC skew (default: 0)
#   -output      f for file output in directory "data", 
#                g for graph output in directory "graph",
#                show for graph output and display (default: "show")
#   -filename    output filename (default: "gcskew.gif" for -output=>"g",
#                                          "gcskew.csv" for -output=>"f")
#   -application application to open gif image (default: "gimv")
#
# Description:
#   This program calculates the GC skew. 
# Requirements: 
#   use Chart::Graph qw(gnuplot);
#   use SubOpt;
#   gimv
# History:
#   20010905-01 update with options
#   20010727-01 initial posting

# Computes the skew (c - g) / (c + g) -- or (t - a) / (t + a) with -at --
# for consecutive, non-overlapping windows, and optionally graphs the
# values or writes them as CSV.
#
# Arguments (after SubOpt option parsing):
#   $ref - reference to the sequence string; only lowercase bases count.
# Options: -window, -at, -output ("f", "g" or "show"), -filename,
#          -application.
# Returns:
#   list of skew values, one per complete window, each formatted "%.6f".
#   A window containing neither of the two counted bases yields 0.000000.
# Dies if the window size is not positive.
sub gcskew {
    &opt::default(window=>10000, at=>0, output=>"show", application=>"gimv",
		  filename=>"gcskew.gif");
    my @args = opt::get(@_);
    &opt::default(filename=>"gcskew.csv") if (opt::val("output") eq 'f');
    my $ref = shift @args;
    my $window = opt::val("window");
    my $application = opt::val("application");
    my $at = opt::val("at");
    my $output = opt::val("output");
    my $filename = opt::val("filename");
    my $title = $at ? "AT skew" : "GC skew";
    my (@gcskew, @location);

    die "gcskew: window size must be a positive number\n" if ($window < 1);

    my $total = length($$ref);
    for (my $i = 0; $total - $window * $i >= $window; $i++){
	my $chunk = substr($$ref, $window * $i, $window);
	# $minus is g (or a), $plus is c (or t).
	my $minus = $at ? ($chunk =~ tr/a//) : ($chunk =~ tr/g//);
	my $plus  = $at ? ($chunk =~ tr/t//) : ($chunk =~ tr/c//);
	my $sum   = $plus + $minus;
	# An empty denominator means "no skew" rather than a fatal error.
	$gcskew[$i] = sprintf("%.6f", $sum ? ($plus - $minus) / $sum : 0);
	$location[$i] = $i * $window;
    }

    if ($output eq 'g' || $output eq 'show'){
	mkdir ("graph", 0777);
	gnuplot(
		{"title" => "$title",
		 "output file" => 'graph/' . $filename,
		 "x-axis label" => "bp",
		 "y-axis label" => "$title"},

		[{"title" => "$title",
		  "style" => "lines",
		  "type" => "columns"},
		 \@location, \@gcskew, ],
		);
	# NOTE(review): goes through the shell so '&' backgrounds the viewer;
	# -application and -filename must be trusted.
	system("$application graph/" . $filename . ' &') 
	    if ($output eq 'show');
    }elsif ($output eq 'f'){
	mkdir ("data", 0777);
	if (open(my $out, '>', "data/" . $filename)){
	    print $out "location,$title\n";
	    for my $j (0 .. $#gcskew){
		print $out $location[$j], ",", $gcskew[$j], "\n";
	    }
	    close($out) or warn "gcskew: error closing data/$filename: $!\n";
	}else{
	    warn "gcskew: cannot write data/$filename: $!\n";
	}
    }

    return @gcskew;
}


# cum_gcskew ver.20010905-01
# Author: Kazuharu "Gaou" Arakawa (gaou@g-language.org)
# Usage: array cum_gcskew = &cum_gcskew(pointer sequence);
# Options:
#   -window      window size to observe (default: 10000)
#   -at          1 when observing AT skew instead of GC skew (default: 0)
#   -output      f for file output in directory "data", 
#                g for graph output in directory "graph",
#                show for graph output and display (default: "show")
#   -filename    output filename (default: "cum_gcskew.gif" for -output=>"g",
#                                          "cum_gcskew.csv" for -output=>"f")
#   -application application to open gif image (default: "gimv")
#
# Description:
#   This program calculates the cumulative GC skew. 
# Requirements: 
#   use Chart::Graph qw(gnuplot);
#   use SubOpt;
#   gimv
# History:
#   20010905-01 update with options
#   20010727-01 initial posting

# Computes the running sum of the per-window skew (c - g) / (c + g) -- or
# (t - a) / (t + a) with -at -- over consecutive, non-overlapping windows,
# and optionally graphs the values or writes them as CSV.
#
# Arguments (after SubOpt option parsing):
#   $ref - reference to the sequence string; only lowercase bases count.
# Options: -window, -at, -output ("f", "g" or "show"), -filename,
#          -application.
# Returns:
#   list of cumulative skew values, one per complete window. Each window's
#   skew is rounded to six decimals before it is added; a window containing
#   neither of the two counted bases adds 0.
# Dies if the window size is not positive.
sub cum_gcskew {
    &opt::default(window=>10000, at=>0, output=>"show", application=>"gimv",
		  filename=>"cum_gcskew.gif");
    my @args = opt::get(@_);
    &opt::default(filename=>"cum_gcskew.csv") if (opt::val("output") eq 'f');
    my $ref = shift @args;
    my $window = opt::val("window");
    my $application = opt::val("application");
    my $at = opt::val("at");
    my $output = opt::val("output");
    my $filename = opt::val("filename");
    my $title = $at ? "Cumulative AT skew" : "Cumulative GC skew";
    my (@gcskew, @location);
    my $running = 0;

    die "cum_gcskew: window size must be a positive number\n"
	if ($window < 1);

    my $total = length($$ref);
    for (my $i = 0; $total - $window * $i >= $window; $i++){
	my $chunk = substr($$ref, $window * $i, $window);
	# $minus is g (or a), $plus is c (or t).
	my $minus = $at ? ($chunk =~ tr/a//) : ($chunk =~ tr/g//);
	my $plus  = $at ? ($chunk =~ tr/t//) : ($chunk =~ tr/c//);
	my $sum   = $plus + $minus;
	# An empty denominator contributes nothing instead of dying.
	$running += sprintf("%.6f", $sum ? ($plus - $minus) / $sum : 0);
	$gcskew[$i] = $running;
	$location[$i] = $i * $window;
    }

    if ($output eq 'g' || $output eq 'show'){
	mkdir ("graph", 0777);
	gnuplot(
		{"title" => "$title",
		 "output file" => 'graph/' . $filename,
		 "x-axis label" => "bp",
		 "y-axis label" => "$title"},

		[{"title" => "$title",
		  "style" => "lines",
		  "type" => "columns"},
		 \@location, \@gcskew, ],
		);
	# NOTE(review): goes through the shell so '&' backgrounds the viewer;
	# -application and -filename must be trusted.
	system("$application graph/" . $filename . ' &') 
	    if ($output eq 'show');
    }elsif ($output eq 'f'){
	mkdir ("data", 0777);
	if (open(my $out, '>', "data/" . $filename)){
	    print $out "location,$title\n";
	    for my $j (0 .. $#gcskew){
		print $out $location[$j], ",", $gcskew[$j], "\n";
	    }
	    close($out)
		or warn "cum_gcskew: error closing data/$filename: $!\n";
	}else{
	    warn "cum_gcskew: cannot write data/$filename: $!\n";
	}
    }

    return @gcskew;
}



# genomicskew ver.20010905-01
# Author: Kazuharu "Gaou" Arakawa (gaou@g-language.org)
# Usage: 1 = &genomicskew(pointer G instance);
# Options:
#   -divide      window number to divide into (default: 250)
#   -at          1 when observing AT skew instead of GC skew (default: 0)
#   -output      f for file output in directory "data", 
#                g for graph output in directory "graph",
#                show for graph output and display (default: "show")
#   -filename    output filename (default: "genomicskew.gif" for -output=>"g",
#                                          "genomicskew.csv" for -output=>"f")
#   -application application to open gif image (default: "gimv")
#
# Description:
#   This program graphs the GC skew for the whole genome, coding regions,
#   intergenic regions, and the third codon.
# Requirements:
#   use Chart::Graph qw(gnuplot);
#   use SubOpt;
# History:
#   20010905-01 updated options
#   20010727-01 initial posting

# Private helper: splits $seq into $divide windows of int(length / $divide)
# bases plus one trailing remainder window, and returns the skew of each as
# a list of $divide + 1 values. The skew is (c - g) / (c + g), or
# (t - a) / (t + a) when $at is true, formatted "%.6f"; a window with
# neither counted base yields 0.
sub _skew_by_window {
    my ($seq, $divide, $at) = @_;
    my $window = int(length($seq) / $divide);
    my @skew;

    for my $i (0 .. $divide){
	my $chunk = substr($seq, $window * $i, $window);
	# $minus is g (or a), $plus is c (or t).
	my $minus = $at ? ($chunk =~ tr/a//) : ($chunk =~ tr/g//);
	my $plus  = $at ? ($chunk =~ tr/t//) : ($chunk =~ tr/c//);
	my $sum   = $plus + $minus;
	$skew[$i] = 0;
	$skew[$i] = sprintf("%.6f", ($plus - $minus) / $sum) unless ($sum < 1);
    }

    return @skew;
}

# Graphs (or writes as CSV) the skew of four sequences derived from a
# genome: the whole sequence, the concatenated CDS regions, the concatenated
# regions between CDS, and the concatenated "third codon" bases.
#
# Arguments (after SubOpt option parsing):
#   $gb - G instance: a hash with {SEQ}, {"CDS<n>"} entries numbered from 1
#         (fields start, end, direction, feature), {"FEATURE<n>"} entries,
#         and a get_gbkseq(start, end) method. CDS whose feature has a true
#         {join} field are skipped.
# Options: -divide, -at, -output ("f", "g" or "show"), -filename,
#          -application.
# Returns 1.
sub genomicskew {
    &opt::default(divide=>250, at=>0, output=>"show", application=>"gimv",
		  filename=>"genomicskew.gif");
    my @args = opt::get(@_);
    &opt::default(filename=>"genomicskew.csv") if (opt::val("output") eq 'f');
    
    my $gb = shift @args;
    my $divide = opt::val("divide");
    my $at = opt::val("at");
    my $opt = opt::val("output");
    my $application = opt::val("application");
    my $filename = opt::val("filename");
    my ($CDS, $BET, $THIRD) = ('', '', '');
    my $before = 0;

    # Walk CDS1, CDS2, ... until the first missing or empty entry.
    for (my $i = 1;
	 ref $gb->{"CDS$i"} && %{ $gb->{"CDS$i"} };
	 $i++){
	my $feature = $gb->{"CDS$i"}->{feature};
	next if ($gb->{"FEATURE$feature"}->{join});

	my $seq = $gb->get_gbkseq(
				  $gb->{"CDS$i"}->{start},
				  $gb->{"CDS$i"}->{end}
				  );
	$CDS .= $seq;

	# NOTE(review): if get_gbkseq returns the forward-strand sequence,
	# these two branches look swapped (index from the start would be the
	# third base on the direct strand) -- confirm against get_gbkseq.
	for (my $j = 2; $j < length($seq); $j += 3){
	    if ($gb->{"CDS$i"}->{direction} eq 'complement'){
		$THIRD .= substr($seq, $j, 1);
	    }else{
		$THIRD .= substr($seq, -($j + 1), 1);
	    }
	}

	# Sequence between the previous CDS end and this CDS start.
	$BET .= substr($gb->{SEQ}, $before, $gb->{"CDS$i"}->{start} - $before)
	    unless ($gb->{"CDS$i"}->{start} - $before < 1);
	$before = $gb->{"CDS$i"}->{end};
    }

    my @location  = (0 .. $divide);
    my @gcskew    = _skew_by_window($gb->{SEQ}, $divide, $at);
    my @geneskew  = _skew_by_window($CDS,       $divide, $at);
    my @betskew   = _skew_by_window($BET,       $divide, $at);
    my @thirdskew = _skew_by_window($THIRD,     $divide, $at);

    my $title = $at ? "AT skew" : "GC skew";

    if ($opt eq "show" || $opt eq "g"){
	mkdir ("graph", 0777);
	gnuplot(
		{"title" => "$title",
		 "output file" => "graph/" . $filename, 
		 "x-axis label" => "bp",
		 "y-axis label" => "$title"},
	        
		[{"title" => "whole genome",
		  "style" => "lines",
		  "type" => "columns"},
		 \@location, \@gcskew, ],
	        
		[{"title" => "coding region",
		  "style" => "lines",
		  "type" => "columns"},
		 \@location, \@geneskew, ],
	        
		[{"title" => "intergenic region",
		  "style" => "lines",
		  "type" => "columns"},
		 \@location, \@betskew, ],
	        
		[{"title" => "third codon",
		  "style" => "lines",
		  "type" => "columns"},
		 \@location, \@thirdskew, ],
	        
		);

	# NOTE(review): goes through the shell so '&' backgrounds the viewer;
	# -application and -filename must be trusted.
	system("$application graph/" . $filename . ' &') 
	    if ($opt eq 'show');
    }elsif ($opt eq 'f'){
	mkdir ("data", 0777);
	if (open(my $out, '>', "data/" . $filename)){
	    print $out "location,$title,coding,intergenic,third codon\n";
	    for my $j (0 .. $divide){
		# One value per header column, comma separated.
		print $out join(",", $location[$j], $gcskew[$j],
				$geneskew[$j], $betskew[$j],
				$thirdskew[$j]), "\n";
	    }
	    close($out)
		or warn "genomicskew: error closing data/$filename: $!\n";
	}else{
	    warn "genomicskew: cannot write data/$filename: $!\n";
	}
    }
    
    return 1;
}


# gcwin ver.20010905-01
# Author: Kazuharu "Gaou" Arakawa (gaou@g-language.org)
# Usage: array gcwin = &gcwin(pointer sequence);
# Options:
#   -window      window size to observe (default: 10000)
#   -at          1 when observing AT content instead of GC content (default: 0)
#   -output      f for file output in directory "data", 
#                g for graph output in directory "graph",
#                show for graph output and display (default: "show")
#   -filename    output filename (default: "gcwin.gif" for -output=>"g",
#                                          "gcwin.csv" for -output=>"f")
#   -application application to open gif image (default: "gimv")
#
# Description:
#   This program calculates the GC content.
# Requirements 
#   use Chart::Graph qw(gnuplot);
#   use SubOpt;
# History:
#   20010905-01 updated options
#   20010729-01 initial posting

# Computes the GC content (g + c) / window -- or the AT content
# (a + t) / window with -at -- for consecutive, non-overlapping windows,
# and optionally graphs the values or writes them as CSV.
#
# Arguments (after SubOpt option parsing):
#   $ref - reference to the sequence string; only lowercase bases count.
# Options: -window, -at, -output ("f", "g" or "show"), -filename,
#          -application.
# Returns:
#   list of content fractions, one per complete window, formatted "%.6f".
# Dies if the window size is not positive.
sub gcwin  {
    &opt::default(window=>10000, at=>0, output=>"show", application=>"gimv",
		  filename=>"gcwin.gif");
    my @args = opt::get(@_);
    &opt::default(filename=>"gcwin.csv") if (opt::val("output") eq 'f');

    my $ref = shift @args;
    my $window = opt::val("window");
    my $at = opt::val("at");
    my $application = opt::val("application");
    my $opt = opt::val("output");
    my $filename = opt::val("filename");
    my $title = $at ? "AT content" : "GC content";
    my (@gcwin, @location);

    die "gcwin: window size must be a positive number\n" if ($window < 1);

    my $total = length($$ref);
    for (my $i = 0; $total - $window * $i >= $window; $i++){
	my $chunk = substr($$ref, $window * $i, $window);
	# Count both bases of the pair: g and c, or a and t with -at.
	my $first  = $at ? ($chunk =~ tr/a//) : ($chunk =~ tr/g//);
	my $second = $at ? ($chunk =~ tr/t//) : ($chunk =~ tr/c//);
	$gcwin[$i] = sprintf("%.6f", ($first + $second) / $window);
	$location[$i] = $i * $window;
    }

    if ($opt eq 'g' || $opt eq 'show'){
	mkdir ("graph", 0777);
	gnuplot(
		{"title" => "$title",
		 "output file" => "graph/" . $filename,
		 "x-axis label" => "bp",
		 "y-axis label" => "$title"},

		[{"title" => "$title",
		  "style" => "lines",
		  "type" => "columns"},
		 \@location, \@gcwin, ],
		);
	# NOTE(review): goes through the shell so '&' backgrounds the viewer;
	# -application and -filename must be trusted.
	system("$application graph/" . $filename . ' &')
	    if ($opt eq 'show');
    }elsif ($opt eq 'f'){
	mkdir ("data", 0777);
	if (open(my $out, '>', "data/" . $filename)){
	    print $out "location,$title\n";
	    for my $j (0 .. $#gcwin){
		print $out $location[$j], ",", $gcwin[$j], "\n";
	    }
	    close($out) or warn "gcwin: error closing data/$filename: $!\n";
	}else{
	    warn "gcwin: cannot write data/$filename: $!\n";
	}
    }

    return @gcwin;
}


# Destructor hook: takes the object being destroyed and releases nothing.
# Defined explicitly in this file; evaluates to its argument.
sub DESTROY {
    my $self = $_[0];
    return $self;
}

1;
__END__
# Below is the stub of documentation for your module. You better edit it!

=head1 NAME

G::Seq::GCskew - GC/AT skew, GC content and replication origin/terminus analysis

=head1 SYNOPSIS

  use G::Seq::GCskew;
  blah blah blah

=head1 DESCRIPTION

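This module provides sequence composition analyses. C<gcskew> and
C<cum_gcskew> compute the windowed and cumulative GC (or AT) skew,
C<gcwin> computes the windowed GC (or AT) content, C<genomicskew> compares
the skew of the whole genome, coding regions, intergenic regions and third
codon positions, C<find_ori_ter> predicts the replication origin and
terminus from the cumulative skew, and C<view_cds> reports the base
composition around start and stop codons.

All six functions are exported by default. Their options are described in
the comment block above each function in the source.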

=head1 AUTHOR

Kazuharu Gaou Arakawa, gaou@g-language.org

=head1 SEE ALSO

perl(1).

=cut
