#!/usr/bin/perl -w
# Bulk Import script for Genquire I

# by David Block <dblock@gene.pbir.nrc.ca>

# Copyright David Block and NRC-PBI

# POD Documentation - main docs before the code

=head1 NAME

=head1 SYNOPSIS

perl bulk_parse_contigs.pl

=head1 DESCRIPTION

this parses TIGR XML for contigs.  It has only been tested on the
Arabidopsis genome.

The contigs for each chromosome must be present in a series of
folders named as follows:
    
  CHR1/nnn1.xml
      /nnn2.xml
      /nnn3.xml
  CHR2/nnn1.xml
      /nnn2.xml
      /..  and so on

You will first be asked which organism you are assigning these
sequences to.  If you have no already set up an organism in genquire
(e.g. using the init.db routine), then you will need to do so by hand.
See the org.data file for an example of how to do this.

You will then be asked where the contig files are located.  Your
response must take the format:
    
/home/arabidopsis/bacs/CHR?/*.xml

where ? will be sequentially auto-replaced by the chromosome number, and
* will be sequentially replaced by each contig in that folder.

This may take some time...

Once this is complete, you should then run the script
create_tilingpath.pl in order to set the absolute start positions
of each contig in your database. If you fail to do this, all of your
genes will map on top of each other!!  Caveat Emptor!!



=head1 FEEDBACK

Like it or lump it, report to dblock@gene.pbi.nrc.ca.  Feel free to add
to the docs available at
http://bioinfo.pbi.nrc.ca/dblock/wiki

=head2 Reporting Bugs

Email the author with any bug reports.

=head1 AUTHOR - David Block

email dblock@gene.pbi.nrc.ca

=cut


use strict;
use DBI;
use Term::ReadKey;
use XML::Parser::PerlSAX;
use lib "/home/markw/OPEN_Genquire/Genquire";
use lib "/home/markw/BIOPERL/bioperl/bioperl-live";
use lib "/home/markw/BIOPERL/bioperl/bioperl-gui";
use lib "/home/markw/BIOPERL/go-dev/perl-api";

use lib "..";
use GQ::Server::DB::DbObj;
use GQ::Server::DB::Context;
use Carp;

$SIG{'INT'}=\&Quit;
my $dbobj = get_database_object();
my $context = GQ::Server::DB::Context->new($dbobj);
my $dbh=$dbobj->dbh;
my $header = "*" x 30 . "\n";

my @orgs=$context->all_orgs_by_id;
my %orgs;

print "\n\n\n\nChoose the id and version of the organism this is being\n";
print "imported to. If organismsare not present, abort this script\n";
print "now and add an organism to your database.  If you have run\n";
print "the init.db script, this has already been done for you!!\n";
print "\n(See the sample file org.data for an overview of the database\n";
print "fields that must be filled to create an organism if you want to do this by hand\n";
print $header;
print join "\t\t",qw(latin common id version);
print "\n";
foreach (@orgs) {
    print $_->latin,"\t",$_->common,"\t",$_->id,"\t",$_->version,"\n";
    $orgs{$_->id}{$_->version}=$_;
}
print $header;
print "id: "; chomp(my $org=<STDIN>);
print "version: "; chomp(my $ver=<STDIN>);
if (!exists $orgs{$org}{$ver}) {
    Quit();
} else {
    $context->organism($orgs{$org}{$ver});
}

my $chr_flag=1;
my %contig;

foreach (get_xml_file_list()) {
    next if /chr.v06/;
    print $_,"\n";
    my $handler = ContigHandler->new($context);
    my $parser = XML::Parser::PerlSAX->new(Handler => $handler);

    eval {
	$parser->parse( Source => { SystemId => $_ } );
	};
    if ($@) {
	print "Transaction aborted because: $@";
	$context->rollback;
    } else {
	$context->commit;
    }

}

# now we have to build the tiling path.  Start from the 5' end and walk!
#my $sth = $dbh->prepare("select Contig.name from Contig,Tiling_Path where fiveprime='' and contig_id = id");
#
#$sth->execute;
#
#while (my ($ctg) = $sth->fetchrow_array){
#    do {# starting from the first contig...
#        my $sth_this = $dbh->prepare("select abs_start, length from Contig, Tiling_Path where contig_id = id and name='$ctg'");
#        $sth_this->execute;
#        my ($abs_start, $length) = $sth_this->fetchrow_array;
#        
#        my $sth_next = $dbh->prepare("select name, id, threeprime, overlap5 from Tiling_Path, Contig where contig_id = id and fiveprime = '$ctg'");
#        $sth_next->execute;
#        my ($ctg, $threeprime, $ov5) = $sth_next->fetchrow_array; # change $ctg to next contig in line
#        my $new_abs_start = $abs_start + $length - $ov5;
#        $dbh->do("update Tiling_Path set abs_start = $new_abs_start where contig_id = $id");
#        
#
#
#    }
#}


$dbh->disconnect;
print "\n\n\n\aYOU MUST NOW RUN THE SCRIPT\n\n\t\t'create_tilingpath.pl'\n\nin order to calculate the\nabsolute_start position of each contig.\n";
exit;


sub get_database_object {
    my ($default_database, $default_address) =
	("genquire", "localhost");
    print "Enter your username: ";
    chomp(my $user=<STDIN>);
    print "Enter your password: ";
    ReadMode 'noecho';
    my $pass = ReadLine 0;
    chomp $pass;
    ReadMode 'normal';
    print "\n";
    print "Enter your database's name (default: $default_database): ";
    chomp(my $database = <STDIN>);
    $database ||= $default_database;
    print "Enter your database's address (default: $default_address): ";
    chomp(my $ip = <STDIN>);
    $ip ||= $default_address;
    return GQ::Server::DB::DbObj->new($user, $pass, $database, $ip);
}


sub get_xml_file_list {
    print "Enter where chromosome files are located.\n";
    my $default = "/home/arabidopsis/bacs/CHR?/*.xml";
    print "Source (default: $default): ";
    chomp(my $source = <STDIN>);
    $source ||= $default;
    my @files = glob($source);
    print scalar(@files), " file(s) found.\n";
    return @files;
}


sub Quit {
    print "quitting...";
    ReadMode 'normal';
    $context->rollback if $context;
    $dbh->disconnect if $dbh;
    print "done.\n";
    exit;
}



package ContigHandler;

use strict;
use lib "/home/markw/OPEN_Genquire/Genquire";
use lib "/home/markw/BIOPERL/bioperl/bioperl-live";
use GQ::Server::DB::Context;
use Storable qw(dclone);

sub new {
    my $type = shift;
    my $context= shift;
    my $dbh=$context->dbh;
    my $self=bless {}, $type;
    $self->{dbh}=$dbh;
    $self->{sth_seq}=$dbh->prepare("INSERT INTO Sequence ".
				   "(seq,contig_id) VALUES (?,?)"
				  );
    $self->{sth_gb}=$dbh->prepare("UPDATE ContigAssembly SET ".
				   "genbank_acc=? WHERE id=?"
				  );
    $self->{sth_tpcheck}=$dbh->prepare("SELECT count(*) FROM Tiling_Path ".
				       "WHERE contig_id=?");
    $self->{sth_tp}=$dbh->prepare("INSERT INTO Tiling_Path ".
				  "(contig_id,length,VC_start,VC_length,fiveprime,overlap5,threeprime,overlap3) ".
				  "VALUES (?,?,?,?,?,?,?,?)"
				 );
    return $self;
}

my $current_element = '';
my ($cid,
    %current,
    @GeneFeatures,
    @Transcripts,
    $name,
    $gb,
    $strand,
    $sequence,
    $length,
    $assembly,
   );

sub start_element {
    my ($self, $element) = @_;

    my %attrs = %{$element->{Attributes}};
    $current_element = $element->{Name};


    if ($current_element eq 'MODEL') {
		%current=$self->new_current($current_element,%current);
		push @{$current{taghash}{'model comment'}},$attrs{COMMENT};
    } elsif ($current_element eq 'TRNA') {
		%current=$self->new_current($current_element,%current);
		push @{$current{taghash}{anticodon}},$attrs{ANTICODON};
    } elsif ($current_element eq 'EVIDENCE') {
		push @{$current{taghash}{evidence}},'GENE_PREDICTION' if $attrs{GENE_PREDICTION};
		push @{$current{taghash}{evidence}},'PROTEIN_MATCHES' if $attrs{PROTEIN_MATCHES};
		push @{$current{taghash}{evidence}},'GENE_INDEX_MATCHES' if $attrs{GENE_INDEX_MATCHES};
    } elsif ($current_element eq 'ASMBL_ID') {
		$name=$attrs{CLONE_NAME};
			print "Name:   $name\n";
		$cid=$context->get_contig_id($name);
		unless ($cid) {
		    my $contig=$context->insert_contig(contig_name => $name);
		    $cid=$context->insert_contigAssembly(
						contig_id => $contig,
						 version => $context->version,
						 assembly => $assembly);
		}
    } elsif ($current_element eq 'LEFT_ASMBL') {
		$current{LEFT_ASMBL}=$attrs{CLONE_NAME};
    } elsif ($current_element eq 'RIGHT_ASMBL') {
		$current{RIGHT_ASMBL}=$attrs{CLONE_NAME};
    } elsif ($current_element eq 'ASSEMBLY') {
		$assembly=$context->insert_Assembly(chr => $attrs{CHROMOSOME});
    } elsif ($current_element =~ /^TU$|^EXON$|^CDS$|LEFT_UTR|RIGHT_UTR|PRE-TRNA|RNA-EXON|RRNA|SNRNA|SNORNA/) {
		%current=$self->new_current($current_element,%current);
    }
}

sub new_current {
    my ($self,$current_element,%current)=@_;
    my %parent = %{ dclone(\%current) };
    %current=();
    $current{parent}=\%parent;
    $current{element}=$current_element;
    return %current;
}

sub end_element {
    my ($self, $element) = @_;
    if ($element->{Name} eq 'ASSEMBLY_SEQUENCE') {
		$self->{sth_seq}->execute($sequence, $cid);
		} elsif ($element->{Name} eq 'ASSEMBLY') {
		$cid='';
		%current=();
		@GeneFeatures=();          #clear global variables!
		@Transcripts=();
		$name='';
		$gb='';
		$strand='';
		$sequence='';
		$length=0;
		$assembly='';
		return;
    } elsif ($element->{Name} eq 'HEADER') {
		$self->{sth_gb}->execute($gb,$cid);
    } elsif ($element->{Name} eq 'TILING_PATH') {
		$self->tiling_path;
    } elsif ($element->{Name} eq 'TU') {
		#$self->new_Gene('gene');
		$self->new_Gene('Gene');
		@Transcripts=();
    } elsif ($element->{Name} eq 'MODEL') {
		$self->new_Transcript;
    } elsif ($element->{Name} eq 'CDS') {
		$self->new_Feature('Intr');
    } elsif ($element->{Name} eq 'LEFT_UTR') {
		$self->new_Feature('UTR');
    } elsif ($element->{Name} eq 'RIGHT_UTR') {
		$self->new_Feature('UTR');
    } elsif ($element->{Name} eq 'REPEAT') {
		$self->new_Feature('Repeat');
    } elsif ($element->{Name} eq 'PRE-TRNA') {
		$self->new_Gene('TRNA Gene');
		@Transcripts=();
    } elsif ($element->{Name} eq 'TRNA') {
		#$self->new_Transcript;
		$self->new_Transcript('Transcript');
    } elsif ($element->{Name} eq 'RNA-EXON') {
		$self->new_Feature('RNA Exon');
    } elsif ($element->{Name} eq 'RRNA') {
		$self->new_Feature('RRNA');
    } elsif ($element->{Name} eq 'SNRNA') {
		$self->new_Feature('SNRNA');
    } elsif ($element->{Name} eq 'SNORNA') {
		$self->new_Feature('SNORNA');
    } elsif ($element->{Name} eq 'COORDSET') {
	#	if ($strand eq '-') {
	#	    $current{start}=$length-$current{start}+1;
	#	    $current{end}=$length-$current{end}+1;
	#	}
		if ($current{start}>$current{end}) {
			$current{strand}='-';
			@current{'start','end'}=@current{'end','start'};
		} else {
			$current{strand}='+';
		}
    }
    if ($element->{Name} =~ /^MODEL$|^TRNA$|^TU$|^EXON$|^CDS$|LEFT_UTR|RIGHT_UTR|PRE-TRNA|RNA-EXON|RRNA|SNRNA|SNORNA/) {
	%current=%{$current{parent}};
    }
}

sub characters {
    my ($self, $characters) = @_;
    my $text = $characters->{Data};
    $text =~ s/^\s*//; #strip leading whitespace
    $text =~ s/\s*$//; #strip trailing whitespace
    return unless ($text || $text eq "0");

    if ($current_element eq 'GB_ACCESSION') {
	$gb=$text;
    } elsif ($current_element eq 'ASSEMBLY_SEQUENCE') {
	$sequence.=$text;
    } elsif ($current_element =~/FROM_CONNECT|TO_CONNECT|FROM_OVERLAP|FROM_OVERHANG|TO_OVERHANG/) {
		#print "found $current_element\n";
	$current{$current_element}=$text;
    } elsif ($current_element eq 'END5') {
	$current{start}=$text;
    } elsif ($current_element eq 'END3') {
	unless ($length) {
	    $length=$text;
	    $current{element}='Contig';
	}
	$current{end}=$text;
    } elsif ($current_element eq 'ORIENTATION') {
	$strand=$text;
    } elsif ($current_element eq 'PUB_LOCUS') {
	push @{$current{taghash}{locus}},$text;
    } elsif ($current_element eq 'COM_NAME') {
	push @{$current{taghash}{function}},$text;
    } elsif ($current_element eq 'PUB_COMMENT') {
	push @{$current{taghash}{comment}},$text;
    } elsif ($current_element eq 'IS_PSEUDOGENE') {
	if ($text) {
	    $text='yes';
	} else {
	    $text='no';
	}
	push @{$current{taghash}{pseudogene}},$text;
    } elsif ($current_element eq 'GENE_SYM') {
	push @{$current{taghash}{gene_symbol}},$text;
    } elsif ($current_element eq 'EC_NUM') {
	push @{$current{taghash}{ec_num}},$text;
    } elsif ($current_element eq 'FEAT_NAME') {
	$current{feature_name}=$text;
    }
}

sub new_Feature {
    use GQ::Server::Feature;
    my ($self,$tag)=@_;
    push @GeneFeatures,GQ::Server::Feature->new(
				    context     => $context,
				    contig_id   => $cid,
				    lockid      => 'admin',
				    source_tag  => 'TIGR',
				    primary_tag => $tag,
					type		=> $tag,
				    contig_start=> $current{start},
				    contig_stop => $current{end},
				    strand      => $current{strand},
				    length      => $current{end}-$current{start}+1,
				    tagvalues   => $current{taghash},
				   );
    if ($tag eq 'Repeat') {
	pop @GeneFeatures;  #repeats are not part of genes, but they are features!
    }
    $context->commit;
}

sub new_Transcript {
    my ($self,$tag)=@_;
    my @Features;
    for(0..$#GeneFeatures) {
	push @Features, (pop @GeneFeatures);
    }
    push @Transcripts,{
		       context       => $context,
		       contig_id     => $cid,
		       primary_tag   => $tag,
			   type => 			$tag,
		       tagvalues     => $current{taghash},
		       features      => \@Features,
		      };
    $context->commit;
}

sub new_Gene {
    require GQ::Server::Gene;

    my ($self,$tag)=@_;
    my (@Features);
    foreach (@Transcripts) {
	push @Features, @{$_->{features}};
    }
    my $gene = GQ::Server::Gene->new(context    => $context,
				     contig_id  => $cid,
					contig_start => $current{start},
					contig_stop => $current{end},
				    length      => $current{end}-$current{start}+1,
				    strand      => $current{strand},
				     lockid     => 'admin',
				     source_tag => 'TIGR',
					type		=> $tag,
				     primary_tag=> $tag,
				     par_pos    => 1,
				     #transcripts=> \@Transcripts,
				     features   => \@Features,
				     tagvalues  => $current{taghash},
				     );
    
    $context->commit;
    
}

sub tiling_path {
    my ($self)=@_;

    my ($overlap3,
	$overlap5,
	$vc_start,
	$vc_length,
       );
    $overlap3=$current{TO_OVERHANG_SIZE};
    $overlap5=$current{FROM_OVERHANG_SIZE}+$current{FROM_OVERLAP_SIZE};
    $vc_start=$overlap5 + 1;
    $vc_length=$length-($overlap5 + $overlap3);
    $self->{sth_tpcheck}->execute($cid);
    my ($check)=$self->{sth_tpcheck}->fetchrow_array;
    $self->error("Contig $cid already in database") if $check;  #don't want to re-enter data that's already there!
    $self->{sth_tp}->execute($cid,
			     $length,
			     $vc_start,
			     $vc_length,
			     $current{LEFT_ASMBL},
			     $overlap5,
			     $current{RIGHT_ASMBL},
			     $overlap3,
			    );
    $context->commit;
}

sub error {
    my ($self,$message)=@_;
    die $message;

}


1;
