#!/usr/bin/perl
# embl2product - Perl script that extracts the products of all CDSs to a list.
#
#
# Written by: Siomar C. Soares, Federal University of Minas Gerais (UFMG), 
#   Laboratory of Cellular and Molecular Genetics, Brazil
#
# Date Written: Sep 27, 2010
#
#
# Usage: ./embl2product.pl myinput myoutput
#   Note: myinput is rewritten in place by the formatting step.

use strict;
use warnings;
use File::Basename qw(basename);

# Feature-key column of a CDS line; the location starts right after it.
my $CDS_KEY = qr/CDS             /;

# Product lines describing RNAs rather than proteins; these are ignored.
my $RNA_PRODUCT = qr/transfer RNA|ribosomal RNA|[0-9]+S rRNA|"tRNA [a-zA-Z][a-z][a-z]"|"Trna-[a-zA-Z][a-z][a-z]"/;

my $embl = $ARGV[0] || die "input file missing\n";
my $list = $ARGV[1] || die "output file missing\n";

# Step 1: format the EMBL file in place.  The input is read completely and
# the handle closed before the same path is opened for writing, and the
# output is closed (flushed) before anything else happens.
my @formatted = format_embl( [ read_lines($embl) ], entry_id($embl) );
write_lines( $embl, @formatted );

# Step 2: build the product list from the formatted text.  Formatting can
# put several physical lines into one element (e.g. "XX\nSQ ..."), so the
# text is split into physical lines again before it is parsed.
my @records = collect_cds( split /^/, join( '', @formatted ) );
write_lines( $list, map { product_line($_) } @records );

exit;

# Reads a whole file.
#   $path - file to read
# Returns the list of lines (newlines kept); dies if the file cannot be read.
sub read_lines {
    my ($path) = @_;
    open( my $fh, '<', $path ) or die "cannot read $path: $!\n";
    my @lines = <$fh>;
    close($fh) or die "cannot close $path: $!\n";
    return @lines;
}

# Replaces the content of a file.
#   $path  - file to (over)write
#   @lines - text to write, already newline-terminated
# Dies if the file cannot be opened, written or closed.
sub write_lines {
    my ( $path, @lines ) = @_;
    open( my $fh, '>', $path ) or die "cannot write $path: $!\n";
    print {$fh} @lines or die "cannot write $path: $!\n";
    close($fh) or die "cannot close $path: $!\n";
    return;
}

# Derives the entry identifier from the input path: the file name without
# directories, up to its first dot (so "./genome.embl" gives "genome").
# Falls back to the whole file name when nothing precedes the first dot.
sub entry_id {
    my ($path) = @_;
    my $base = basename($path);
    my ($id) = split /\./, $base;
    return ( defined $id && length $id ) ? $id : $base;
}

# Formats the lines of an EMBL file.
#   $lines - array ref of the original lines
#   $id    - identifier for the ID line added when the file starts with FT
# Returns the formatted lines:
#   * a file starting directly with a feature table gets "ID" and "XX" lines,
#   * "<" and ">" are removed from CDS lines,
#   * an "XX" line is inserted between the feature table and "SQ   Sequence".
sub format_embl {
    my ( $lines, $id ) = @_;
    my @out;
    my $previous = '';

    for my $index ( 0 .. $#{$lines} ) {
        my $line = $lines->[$index];

        # Only a line that really starts with FT lacks a header.
        $line =~ s/^FT/ID   $id\nXX\nFT/ if $index == 0;

        $line =~ tr/<>//d if $line =~ $CDS_KEY;

        # /m because the first line may now hold "ID ...\nXX\nFT ...".
        if ( $line =~ /^SQ   Sequence/ && $previous =~ /^FT/m ) {
            $line =~ s/^SQ/XX\nSQ/;
        }

        push @out, $line;
        $previous = $line;
    }
    return @out;
}

# Collects one record per CDS feature in a single pass.
#   @lines - physical lines of the formatted EMBL file
# Returns a list of hash refs with the keys location, strand, name
# (locus_tag or systematic_id), gene and product.
sub collect_cds {
    my @pending = @_;
    my @records;
    my $cds;    # record of the CDS being read; undef outside a CDS feature

    while ( defined( my $line = shift @pending ) ) {
        if ( $line =~ /$CDS_KEY(.*)/s ) {

            # A long location may wrap onto continuation lines.
            my $raw = join '', $1, take_continuations( \@pending );
            my ( $location, $strand ) = parse_location($raw);
            $cds = { location => $location, strand => $strand };
            push @records, $cds;
            next;
        }

        # Qualifiers belong to the feature whose key precedes them, so any
        # other feature key (or the end of the table) closes the CDS.
        if ( ends_feature($line) ) {
            undef $cds;
            next;
        }
        next unless $cds;

        # The first locus_tag/systematic_id wins; a gene name is only kept
        # as a fallback while no such identifier has been seen.
        if ( $line =~ m{/(?:locus_tag|systematic_id)=(.*)}s ) {
            my $name = qualifier_text($1);
            $cds->{name} = $name if is_blank( $cds->{name} );
        }
        elsif ( $line =~ m{/gene=(.*)}s ) {
            my $gene = qualifier_text($1);
            $cds->{gene} = $gene if is_blank( $cds->{name} );
        }

        if ( my ($value) = $line =~ /product=(.*)/s ) {
            next if $line =~ $RNA_PRODUCT;

            # Only genuine continuation lines are consumed; the next
            # feature key or qualifier stays queued for the main loop.
            my $product = join ' ', map { qualifier_text($_) } $value,
              take_continuations( \@pending );

            $cds->{product} = $product if is_blank( $cds->{product} );
            $cds->{product} = 'Hypothetical protein'
              if is_blank( $cds->{product} );
        }
    }
    return @records;
}

# Removes and returns the continuation lines at the front of the queue.
#   $pending - array ref of unread lines (modified in place)
# A continuation line starts with "FT", at least six blanks and text that
# does not begin with "/" (which would start a new qualifier).  The returned
# texts have the FT prefix and trailing whitespace removed.
sub take_continuations {
    my ($pending) = @_;
    my @parts;
    while ( @{$pending} && $pending->[0] =~ /^FT\s{6,}([^\s\/].*?)\s*$/ ) {
        push @parts, $1;
        shift @{$pending};
    }
    return @parts;
}

# True for a line that starts another feature ("FT" plus a key within the
# first columns) or leaves the feature table ("XX", "SQ", "//").
sub ends_feature {
    my ($line) = @_;
    return 1 if $line =~ /^FT {1,5}\S/;
    return 1 if $line =~ m{^(?:XX|SQ|//)};
    return 0;
}

# Reduces a CDS location to "start end" and determines its strand.
#   $location - raw location text, e.g. "complement(join(1..10,20..30))"
# Returns ($coordinates, $strand) where $strand is "R " for complement
# locations and "F " otherwise (trailing blank included for printing).
sub parse_location {
    my ($location) = @_;
    my $strand = $location =~ /complement/ ? 'R ' : 'F ';

    $location =~ s/complement|join|order//g;
    $location =~ tr/<>()\r\n//d;

    # Keep only the first start and the last end of a multi-part location.
    $location =~ s/\.\..+\.\./ /;
    $location =~ s/\.\./ /g;
    $location =~ tr/\t/ /;
    return ( $location, $strand );
}

# Strips the quotes and line ends from a qualifier value.
sub qualifier_text {
    my ($text) = @_;
    $text =~ tr/"\r\n//d;
    return $text;
}

# True when a value is undefined or holds no text.
sub is_blank {
    my ($value) = @_;
    return !defined $value || $value !~ /.+/;
}

# Formats one record as ">name start end strand product\n", using the gene
# name when no locus_tag/systematic_id is available.
sub product_line {
    my ($cds) = @_;
    my $label = $cds->{name} ? $cds->{name} : $cds->{gene};
    $label = '' unless defined $label;
    my $product = defined $cds->{product} ? $cds->{product} : '';
    return ">$label $cds->{location} $cds->{strand}$product\n";
}
