#!/usr/bin/perl
# plasticity - Perl script (1 of 2) that analyses protein similarity searches
# and clusters regions of genes that are absent from a non-pathogenic bacterium
# in order to find Putative Pathogenicity Islands.
#
# Written by: Siomar C. Soares, Federal University of Minas Gerais (UFMG), 
#   Laboratory of Cellular and Molecular Genetics, Brazil
#
# Date Written: Sep 27, 2010

#####################################
$faa = $ARGV[0];
open (IN, "$faa");

my %cnpb;
my %cnpe;
@file = <IN>;

for $i (@file)	{
	if ($i =~ m/^\>/){
	@line = split ("[ ]", $i);
	$cnp = $line[0];
	$cnp =~ s/\>//;
	chomp $cnp;
	$cnpb->{$cnp}=$line[1]; chomp $cnpb->{$cnp};
	$cnpe->{$cnp}=$line[2]; chomp $cnpe->{$cnp};
			}
		}
close IN;

#####################################
$faa = $ARGV[1];
open (IN, "$faa");

my %cdsb;
my %cdse;
@file = <IN>;

for $i (@file)	{
	if ($i =~ m/^\>/){
	@line = split ("[ ]", $i);
	$cds = $line[0];
	$cds =~ s/\>//;
	chomp $cds;
	$cdsb->{$cds}=$line[1]; chomp $cdsb->{$cds};
	$cdse->{$cds}=$line[2]; chomp $cdse->{$cds};
			}
		}
close IN;

###################################

$tab = $ARGV[2];
open (IN3, "$tab");
@file3 = <IN3>;

for $i (@file3)	{
	@line2 = split ("\t", $i);
	$cnp = $line2[0];
	$cds = $line2[1];
	chomp $cnp;
	chomp $cds;
	$positions .= $cnp."\t".$cnpb->{$cnp}."\t".$cnpe->{$cnp}."\t".$cds."\t".$cdsb->{$cds}."\t".$cdse->{$cds}."\n";
		}

#################################
##############################
###############################
##################

@file3 = split ("\n", $positions);
my %beg;
my %end;
my @ordered;
for $i(@file3)	{@line4 = split ("\t", $i); 
		$key=$line4[4];
		$beg->{$key}=$line4[0]."\t".$line4[1]."\t".$line4[2]."\t".$line4[3];
		chomp $line4[5];
		$end->{$key}=$line4[5];
		}
$ccc=0;
for $i(@file3)	{@line4 = split ("\t", $i); 
		$key[$ccc].=$line4[4]; $ccc++;
		}
@ordered = sort {$a <=> $b} @key;
for $i(@ordered)	{
	$positions2 .= "$beg->{$i}"."\t"."$i"."\t"."$end->{$i}"."\n";
			}
close IN3;
close OUT;


##################

my %pose;
my %posb1;
my %posb2;
my $cc=1;
my @line3;
my $e;
@file1 = split ("\n", $positions2);
for $i(@file1)	{@line3 = split ("\t", $i); 
		chomp $line3[5];
		chomp $line3[4];
		$pose->{$cc}=$line3[5];
		$b=$line3[4];
		$posb1->{$cc}=$b-1500; 
		$posb2->{$cc}=$b+1500; $cc++;}
$numblines = $cc;

############################
my $end;
my @line4;
$c=1;
$cc=2;
$p = 0;
@line = split ("\n", $positions2);
while ($c <$numblines){
	@line4 = split ("\t", $line[$p]);
	my $begin = $line4[3];
	my $end = $line4[3];
	my $locb = $line4[4];
	chomp $locb;
	my $loce = $line4[5];
	if (($pose->{$c} > $posb1->{$cc})&&($pose->{$c} < $posb2->{$cc}))	{
		while (($pose->{$c} > $posb1->{$cc})&&($pose->{$c} < $posb2->{$cc}))
		{$p++; @line4 = split ("\t", $line[$p]); 
			if ($locb > $line4[4]){$locb = $line4[4];}
			if ($loce < $line4[5]){$loce = $line4[5];} 
		$end=$line4[3]; $c++; $cc++;}
										}
	else	{$p++; $c++; $cc++;
		}
	chomp $loce; $indel1 .= $begin."\t".$end."\t".$locb."\t".$loce."\n";
			}
close OUT;

###################################


my $indel2;
my @file= split ("\n", $indel1);
for $i (@file)	{
	@line = split ("\t", $i);
	if ($line[0]!~$line[1]){
		if ($line[3] > $line[2]){
		$indel2 .= $line[0]."\-".$line[1]."\t".$line[2]."\t".$line[3]."\n";}		
		else{$indel2 .= $line[1]."\-".$line[0]."\t".$line[2]."\t".$line[3]."\n";}
				}
		}


##################################
##################################
#abrir indel2 e positions2
@file1 = split ("\n", $indel2);
@file2 = split ("\n", $positions2);
#se positions4 maior ou igual indel1 e menor ou igual a indel2
for $i1 (@file1)	{
	@indel = split ("\t", $i1);
	$list .= "\>".$i1."\n";
	for $i2 (@file2)	{
	@positions = split ("\t", $i2);
		if (($positions[4] >= $indel[1]) && ($positions[4] <= $indel[2]))
			{$list .= $i2."\n";}
				}
			}
$name = $ARGV[1];
$name =~ s/\.faa//g;
open (OUT, ">$name.indel");#####################
print OUT ($list);###########################################################################################################################
close OUT;#########################

exit
