#!/usr/bin/perl
# paifinder2 - Perl script (2 of 2) that analyses all PAI features
# of the gene clusters that are absent from the non-pathogenic bacterium
# in order to find Putative Pathogenicity Islands.
#
# Written by: Siomar C. Soares, Federal University of Minas Gerais (UFMG), 
#   Laboratory of Cellular and Molecular Genetics, Brazil
#
# Date Written: Sep 27, 2010

# Load "<prefix>.clusters2", where <prefix> is the first command-line
# argument, and derive the four decision thresholds from its first line.
#
# The first line carries four tab-separated numbers.  Each one is scaled by
# a fixed weight to give the cut-off that the matching per-cluster fraction
# is compared against later in the script:
#   field 0 * 1.9 -> $gclimit   (GC deviation)
#   field 1 * 1.5 -> $codlimit  (codon usage deviation)
#   field 2 * 1   -> $vflimit   (virulence factors)
#   field 3 * 1.3 -> $hyplimit  (hypothetical proteins)
# NOTE(review): the four numbers are presumably genome-wide baseline
# fractions written by the first paifinder script -- confirm there.
#
# $in, @file, $info and @limits are left as package variables because the
# rest of the script refers to them without declarations.
$in = $ARGV[0];
die "Usage: $0 <prefix>   (reads <prefix>.clusters2)\n"
    unless defined $in && length $in;

# Three-argument open keeps characters such as '>' or '|' in the prefix from
# being interpreted as an open mode, and the check stops the script from
# carrying on with an empty table when the file is missing.
open(my $clusters_fh, '<', "$in.clusters2")
    or die "Cannot open '$in.clusters2' for reading: $!\n";
@file = <$clusters_fh>;
close($clusters_fh)
    or die "Error while reading '$in.clusters2': $!\n";

# The first line is the threshold header; everything left in @file is
# cluster data.
$info = shift(@file);
@limits = split("\t", $info);

# A short header would silently turn the missing thresholds into 0.
warn "Warning: '$in.clusters2' header has fewer than four fields; "
    . "missing thresholds are treated as 0\n"
    if @limits < 4;

my $gclimit  = $limits[0] * 1.9;
my $codlimit = $limits[1] * 1.5;
my $vflimit  = $limits[2] * 1;
my $hyplimit = $limits[3] * 1.3;

# Glue the remaining input lines back into one string and cut it into one
# record per cluster.  Every record is introduced by '>'; the piece in front
# of the first '>' (empty when the data starts with '>') is dropped.
$temp = join('', @file);
@file4 = split(/>/, $temp);
shift @file4;

# Starting values for the per-cluster tallies and verdict that the main
# loop updates and resets.
$gctotal  = 0;
$codtotal = 0;
$vftotal  = 0;
$cdstotal = 0;
$hyptotal = 0;
$res      = "Negative";
$cc       = 1;          # running number of the next reported island
$force    = "Weak";
$sum      = 0;
$sumtotal = 0;

# Result files: a tab-separated report and a feature-table file.
# The handles keep their bareword names OUT and OUT2 because the main loop
# prints to them by name.  Three-argument open plus a check means an
# unwritable location is reported instead of silently discarding output.
open(OUT, '>', "$in.Putative_Islands")
    or die "Cannot create '$in.Putative_Islands': $!\n";
open(OUT2, '>', "$in.PAI.tab")
    or die "Cannot create '$in.PAI.tab': $!\n";

# Column header of the tab-separated report.
print OUT ("Putative Pathogenicity Island\tGC Deviation\tCodon Usage Deviation\tVirulence Factors\tHypothetical Proteins\tGene Composition\tPosition\tPrediction Force\n");
# Score every cluster record and report the ones classified as putative
# pathogenicity islands.
#
# Each record in @file4 is a header line followed by one tab-separated line
# per CDS.  The CDS columns used here are:
#   0  gene identifier (first and last are printed as "Gene Composition")
#   1  GC flag           -- anything not containing "NORMAL" counts
#   2  codon usage flag  -- anything not containing "NORMAL" counts
#   4  virulence search  -- anything not containing "No hits found" counts
#   5  product           -- counts when it contains "ypothetical"
#
# Positive clusters are written to OUT (one tab-separated row) and to OUT2
# (a misc_feature entry); $cc numbers them consecutively.

# The thresholds are the same for every cluster, so the combined limit and
# the minimum combined score derived from it are computed once.
my $limit_sum = $gclimit + $codlimit + $vflimit + $hyplimit;
my $min_score = (1 - $codlimit) * $limit_sum;

# Clusters with fewer CDS than this are never reported.
my $min_cds = 6;

for my $record (@file4) {
    my @record_lines = split("\n", $record);

    # Header line: drop everything up to and including "= <tab>", then turn
    # the first remaining tab into ".." to get a "start..end" position.
    my $cluster = shift(@record_lines);
    $cluster = '' unless defined $cluster;
    $cluster =~ s/.+\= \t//;
    $cluster =~ s/\t/\.\./;

    # Ignore blank lines: with no fields at all they would otherwise be
    # tallied as a CDS that deviates in GC, in codon usage and as a
    # virulence hit.
    my @cds_lines = grep { /\S/ } @record_lines;
    my $cds_count = scalar @cds_lines;
    next if $cds_count < $min_cds;

    # Tally the CDS that show each feature.
    my ($gc_hits, $codon_hits, $vf_hits, $hyp_hits) = (0, 0, 0, 0);
    for my $cds_line (@cds_lines) {
        my @field = split("\t", $cds_line);
        $gc_hits++    if $field[1] !~ /NORMAL/;
        $codon_hits++ if $field[2] !~ /NORMAL/;
        $vf_hits++    if $field[4] !~ /No hits found/;
        $hyp_hits++   if $field[5] =~ /ypothetical/;
    }

    # Fractions of the cluster showing each feature.  $hy is the plain
    # hypothetical-protein fraction used in the report; $hypp is the value
    # used for scoring, which divides by at least 10 so that short clusters
    # are not favoured.
    my $gcp  = $gc_hits    / $cds_count;
    my $codp = $codon_hits / $cds_count;
    my $vfp  = $vf_hits    / $cds_count;
    my $hy   = $hyp_hits   / $cds_count;
    my $hypp = $cds_count >= 10 ? $hy : $hyp_hits / 10;
    my $sum  = $gcp + $codp + $vfp + $hypp;

    # A cluster without any virulence factor hit is never reported.
    next if $vfp == 0;

    # The rules below are applied in order and later ones override earlier
    # ones, so their sequence matters.
    my $res = "Negative";
    $res = "Positive"
        if $gcp > $gclimit
        || $codp > $codlimit
        || $vfp > $vflimit
        || $hypp > $hyplimit;
    $res = "Negative" if $gcp == 0  && $codp == 0;
    $res = "Negative" if $gcp == 0  && $hypp == 0;
    $res = "Negative" if $codp == 0 && $hypp == 0;
    $res = "Negative" if $sum < $min_score;
    $res = "Positive" if $hypp > $hyplimit;
    $res = "Negative" if $sum < 2 * $vflimit;
    next unless $res eq "Positive";

    # Prediction strength grows with the combined score.
    my $force = "Weak";
    $force = "Normal" if $sum >= 1.2 * $min_score;
    $force = "Strong" if $sum > 2 * $min_score;

    # Identifiers of the first and last CDS of the cluster.
    my @first_cds = split("\t", $cds_lines[0]);
    my @last_cds  = split("\t", $cds_lines[-1]);

    # The report uses a decimal comma instead of a decimal point.
    for my $value ($gcp, $codp, $vfp, $hy) {
        $value =~ tr/./,/;
    }

    print OUT join("\t",
        "Putative Pathogenicity Island $cc",
        $gcp, $codp, $vfp, $hy,
        $first_cds[0] . "-" . $last_cds[0],
        $cluster,
        $force), "\n";
    print OUT2 ("FT   misc_feature    $cluster\nFT                   \/note\=\"Putative Pathogenicity Island $cc\"\n");
    $cc++;
}

# Buffered write errors (for example a full disk) only surface on close.
close(OUT)  or die "Error while writing '$in.Putative_Islands': $!\n";
close(OUT2) or die "Error while writing '$in.PAI.tab': $!\n";
