#!/usr/bin/perl
# gccontent - perl script that analyses the GC content of CDSs on the genome 
# and finds regions with GC content deviation.
#
#
# Written by: Siomar C. Soares, Federal University of Minas Gerais (UFMG), 
#   Laboratory of Cellular and Molecular Genetics, Brazil
#
# Date Written: Sep 27, 2010

# Usage: gccontent <genome.fasta> <cds.fasta>
#
# Prints one line per CDS record to STDOUT in the form "<locus_tag>\t<label>",
# where <label> is LOW_GC, HIGH_GC or NORMAL. A CDS is LOW_GC / HIGH_GC when
# its GC fraction lies more than DEVIATION_FACTOR standard deviations below /
# above the GC fraction of the genome file. The standard deviation is the
# root-mean-square difference between each CDS GC fraction and the genome GC
# fraction (not the mean of the CDSs).

use strict;
use warnings;

# Number of standard deviations from the genome GC fraction that a CDS must
# exceed before it is reported as deviating.
use constant DEVIATION_FACTOR => 1.5;

# Read a FASTA file and return one [ $locus_tag, $gc_count, $length ] entry
# per record, in file order.
#
# The locus tag is the header line up to the first space, with every '>'
# removed and trailing whitespace stripped. G and C are counted in either
# case. Whitespace (including line terminators) is excluded from the length;
# every other character counts. Sequence lines that appear before the first
# header belong to no record and are ignored.
#
# Dies if the file cannot be opened.
sub read_gc_records {
    my ($path) = @_;

    open(my $in, '<', $path)
        or die "gccontent: cannot open '$path': $!\n";

    my @records;
    my $current;
    while (my $line = <$in>) {
        if ($line =~ /^>/) {
            my ($tag) = split(/ /, $line, 2);
            $tag =~ s/>//g;
            $tag =~ s/\s+\z//;
            $current = [ $tag, 0, 0 ];
            push @records, $current;
            next;
        }
        next unless defined $current;

        # Drop line terminators and stray blanks so they do not inflate the
        # sequence length.
        $line =~ s/\s+//g;
        $current->[1] += ($line =~ tr/GCgc//);
        $current->[2] += length $line;
    }
    close($in);

    return @records;
}

# Return the GC fraction pooled over all given records (total G+C divided by
# total length), or undef when the records contain no sequence at all.
sub pooled_gc_fraction {
    my (@records) = @_;

    my $gc_total  = 0;
    my $len_total = 0;
    for my $record (@records) {
        $gc_total  += $record->[1];
        $len_total += $record->[2];
    }
    return $len_total ? $gc_total / $len_total : undef;
}

# Map a GC fraction to its label. Values exactly on a threshold are not
# outside the band, so they are NORMAL.
sub classify_gc {
    my ($gc, $low, $high) = @_;

    return 'LOW_GC'  if $gc < $low;
    return 'HIGH_GC' if $gc > $high;
    return 'NORMAL';
}

# Entry point. Takes the genome FASTA path and the CDS FASTA path, prints the
# classification table to STDOUT and returns 0.
#
# Dies with a message when an argument is missing, a file cannot be opened,
# the genome file holds no sequence, or the CDS file holds no non-empty record.
sub main {
    my ($genome_path, $orf_path) = @_;

    die "Usage: gccontent <genome.fasta> <cds.fasta>\n"
        unless defined $genome_path && defined $orf_path;

    # Reference value: GC fraction over every record of the genome file, so a
    # multi-record genome is not reduced to its last record.
    my $genome_gc = pooled_gc_fraction(read_gc_records($genome_path));
    die "gccontent: no sequence data found in '$genome_path'\n"
        unless defined $genome_gc;

    # [ $locus_tag, $gc_fraction ] per CDS, in file order.
    my @cds;
    for my $record (read_gc_records($orf_path)) {
        my ($tag, $gc_count, $length) = @{$record};
        if (!$length) {
            # GC fraction is undefined for an empty sequence; counting it as
            # 0 would distort the deviation estimate.
            warn "gccontent: skipping '$tag': empty sequence\n";
            next;
        }
        push @cds, [ $tag, $gc_count / $length ];
    }
    die "gccontent: no CDS sequences found in '$orf_path'\n" unless @cds;

    my $sum_sq_dev = 0;
    for my $entry (@cds) {
        $sum_sq_dev += ($entry->[1] - $genome_gc) ** 2;
    }
    my $stdev = sqrt($sum_sq_dev / scalar(@cds));

    my $low  = $genome_gc - DEVIATION_FACTOR * $stdev;
    my $high = $genome_gc + DEVIATION_FACTOR * $stdev;

    for my $entry (@cds) {
        my ($tag, $gc) = @{$entry};
        print $tag, "\t", classify_gc($gc, $low, $high), "\n";
    }

    return 0;
}

# Run only when executed as a script, so the subs above can be loaded and
# exercised without touching @ARGV.
exit(main(@ARGV)) unless caller();
