#!/usr/bin/perl
# paifinder - Perl script (1 of 2) that analyses all PAI features
# of the gene clusters that are absent from non-pathogenic bacteria,
# in order to find putative pathogenicity islands (PAIs).
#
# Written by: Siomar C. Soares, Federal University of Minas Gerais (UFMG), 
#   Laboratory of Cellular and Molecular Genetics, Brazil
#
# Date Written: Sep 27, 2010


use strict;
use warnings;

# Usage: <this script> PREFIX
#
# Inputs (all derived from PREFIX):
#   PREFIX.faa.vs.virulenceDB.txt  tab-separated, one CDS per line; columns
#                                  0..4 are: CDS id, a field tested against
#                                  /NORMAL/ (stored as "gc"), a second field
#                                  tested against /NORMAL/ (stored as "cod"),
#                                  a field stored as "tnp", and a field
#                                  tested against /No hits found/ ("vf").
#   PREFIX.list                    space-separated, one CDS per line:
#                                  ">id begin end <4th field> product..."
#                                  NOTE(review): the 4th field is presumably
#                                  the strand -- confirm against the producer.
#   PREFIX.clusters                tab-separated; columns 1 and 2 are the
#                                  cluster start and end coordinates.
# Output:
#   PREFIX.clusters2               first line: the four genome-wide fractions
#                                  (gc, cod, vf, hypothetical), tab-separated;
#                                  then, for every cluster spanning at least
#                                  $MIN_CLUSTER_LENGTH, a ">"-prefixed copy
#                                  of the cluster line followed by one line
#                                  per CDS lying entirely inside the cluster.

# Clusters whose (end - start) is below this value are ignored.
my $MIN_CLUSTER_LENGTH = 5000;

# Read a whole text file and return its lines (newlines kept).
# Dies with the file name and OS error if the file cannot be read.
sub read_lines {
    my ($path) = @_;
    open(my $fh, '<', $path)
        or die "Cannot open '$path' for reading: $!\n";
    my @lines = <$fh>;
    close($fh)
        or die "Error while closing '$path': $!\n";
    return @lines;
}

# Return $count / $total, or 0 when $total is 0 (empty input file).
sub fraction {
    my ($count, $total) = @_;
    return $total ? $count / $total : 0;
}

# Map undef to the empty string so missing columns print as empty fields.
sub or_empty {
    my ($value) = @_;
    return defined $value ? $value : '';
}

my $embl = $ARGV[0];
die "Usage: $0 PREFIX\n"
    unless defined $embl && length $embl;

######## organism vs virulence factor DB ########
my (%gc_of, %cod_of, %tnp_of, %vf_of);
my ($gc_total, $cod_total, $vf_total, $feature_rows) = (0, 0, 0, 0);

for my $row (read_lines("$embl.faa.vs.virulenceDB.txt")) {
    next unless $row =~ /\S/;    # blank lines must not be counted as CDSs
    chomp(my $text = $row);

    # Limit -1 keeps trailing empty columns instead of dropping them.
    my ($cds, $gc, $cod, $tnp, $vf) = split /\t/, $text, -1;
    $_ = or_empty($_) for ($gc, $cod, $tnp, $vf);

    $gc_of{$cds}  = $gc;
    $cod_of{$cds} = $cod;
    $tnp_of{$cds} = $tnp;
    $vf_of{$cds}  = $vf;

    $feature_rows++;
    $gc_total++  if $gc  !~ /NORMAL/;
    $cod_total++ if $cod !~ /NORMAL/;
    $vf_total++  if $vf  !~ /No hits found/;
}

my $gc_limit  = fraction($gc_total,  $feature_rows);
my $cod_limit = fraction($cod_total, $feature_rows);
my $vf_limit  = fraction($vf_total,  $feature_rows);

######## list file ########
# Each record is [cds id, begin, end, product text]; the file is parsed once
# here rather than once per cluster.
my @cds_records;
my ($hyp_total, $list_rows) = (0, 0);

for my $row (read_lines("$embl.list")) {
    next unless $row =~ /\S/;

    # Limit 5: the fifth piece is everything after the first four columns,
    # i.e. the product description. Taking it from split avoids building a
    # regex out of raw field text, where metacharacters in the fields or
    # runs of several spaces prevented the leading columns being stripped.
    my ($tag, $begin, $end, undef, $product) = split /[ ]+/, $row, 5;

    (my $cds = $tag) =~ s/>//;

    # Fewer than five columns: keep the whole line as the product text.
    $product = $row unless defined $product;
    $product .= "\n" unless $product =~ /\n\z/;    # last line may lack a newline

    $list_rows++;
    $hyp_total++ if $row =~ /ypothetical protein/;

    # Rows without both coordinates still count towards the totals above,
    # but can never be placed inside a cluster.
    next unless defined $begin && defined $end;
    push @cds_records, [$cds, $begin, $end, $product];
}

my $hyp_limit = fraction($hyp_total, $list_rows);

######## clusters file ########
my $report = '';

for my $row (read_lines("$embl.clusters")) {
    chomp(my $text = $row);
    my (undef, $start, $stop) = split /\t/, $text;
    next unless defined $start && defined $stop;
    next if $stop - $start < $MIN_CLUSTER_LENGTH;

    $report .= ">$text\n";
    for my $record (@cds_records) {
        my ($cds, $begin, $end, $product) = @{$record};

        # Only CDSs lying entirely inside the cluster are reported.
        next unless $begin >= $start && $end <= $stop;

        $report .= join("\t",
            $cds,
            or_empty($gc_of{$cds}),
            or_empty($cod_of{$cds}),
            or_empty($tnp_of{$cds}),
            or_empty($vf_of{$cds}),
            $product);
    }
}

######## output ########
my $out_path = "$embl.clusters2";
open(my $out, '>', $out_path)
    or die "Cannot open '$out_path' for writing: $!\n";
print {$out} "$gc_limit\t$cod_limit\t$vf_limit\t$hyp_limit\n", $report
    or die "Cannot write to '$out_path': $!\n";
# Buffered write errors (e.g. disk full) only surface at close.
close($out)
    or die "Error while closing '$out_path': $!\n";

exit;
