#!/usr/bin/env perl

#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# This file is part of G-language Genome Analysis Environment package
#
#     Copyright (C) 2001-2007 Keio University
#:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# 
#   $Id: G.pm,v 1.4 2002/07/30 17:40:56 gaou Exp $
#
# G-language GAE is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
# 
# G-language GAE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public
# License along with G-language GAE -- see the file COPYING.
# If not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# 
#END_HEADER
#
# written by Kazuharu Arakawa <gaou@g-language.org> at
# G-language Project, Institute for Advanced Biosciences, Keio University.
#

package G;

use strict;

# import prelude extensions
use SubOpt;
use Rcmd;

# import messenger
use G::Messenger;

# import skyline extensions
require G::Skyline;
require G::DynamicLoader;
use G::Inspire;

# import db classes
use G::DB::SDB;
use G::DB::GDBI;

# import odyssey classes
use G::Seq::AminoAcid;
use G::Seq::Codon;
use G::Seq::Consensus;
use G::Seq::Eliminate;
use G::Seq::FreeEnergy;
use G::Seq::GCskew;
use G::Seq::Operon;
use G::Seq::OverLapping;
use G::Seq::Primitive;
use G::Seq::PatSearch;
use G::Seq::Tandem;
use G::Seq::Util;
use G::Seq::Markov;
use G::Seq::ORF;
use G::Seq::Align;
use G::Seq::Usage;
use G::Seq::COMGA;
use G::Seq::ImaGene;
use G::Tools::Graph;
use G::Tools::H2v;
use G::Tools::Mapping;
use G::Tools::Blast;
use G::Tools::Fasta;
use G::Tools::Alignment;
use G::Tools::HMMER;
use G::Tools::Repeat;
use G::Tools::EPCR;
use G::Tools::SIM4;
use G::Tools::PBS;
use G::Tools::Cap3;
use G::Tools::COGs;
use G::Tools::GPAC;
use G::Tools::GOA;
use G::Tools::GlimmerM;
use G::Tools::Literature;
use G::Tools::KEGG_API;
use G::Tools::KEGG_API3;
use G::Tools::RCluster;
use G::Tools::PEC;
use G::SystemsBiology::Serizawa;
use G::SystemsBiology::Pathway;
use G::SystemsBiology::PathwayAlignment;
use G::SystemsBiology::EcellReader;
use G::SystemsBiology::KEGG;
use G::SystemsBiology::BioLayout;
use G::SystemsBiology::Interaction;
use G::SystemsBiology::DotE;

# import presage classes
use G::System::BAS;
use G::System::GEMS;
use G::System::COMGA;
use G::System::STeP;
use G::System::CHI;
use G::System::ReL8;
use G::System::FuncD;

# Package globals used by Exporter (@ISA, @EXPORT, @EXPORT_OK) and for the
# module version ($VERSION).  @INC is kept in the declaration as before.
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK @INC);

require Exporter;

# Method lookup on G falls through to G::Skyline first, then to Exporter
# (which provides import() for the @EXPORT list assembled below).
@ISA = qw(G::Skyline Exporter);

# Packages whose @EXPORT lists are re-exported by G.
# Order matters: each list is appended to @EXPORT in exactly this sequence,
# and the plugin loader must stay last.
my @reexported_packages = (
    # odyssey functions
    qw(
        G::Seq::AminoAcid
        G::Seq::Codon
        G::Seq::Consensus
        G::Seq::Eliminate
        G::Seq::FreeEnergy
        G::Seq::GCskew
        G::Seq::Operon
        G::Seq::OverLapping
        G::Seq::Primitive
        G::Seq::PatSearch
        G::Seq::Tandem
        G::Seq::Util
        G::Seq::Markov
        G::Seq::ORF
        G::Seq::Align
        G::Seq::Usage
        G::Seq::COMGA
        G::Seq::ImaGene
        G::Tools::Graph
        G::Tools::H2v
        G::Tools::Mapping
        G::Tools::Blast
        G::Tools::Fasta
        G::Tools::Alignment
        G::Tools::HMMER
        G::Tools::Repeat
        G::Tools::EPCR
        G::Tools::SIM4
        G::Tools::PBS
        G::Tools::Cap3
        G::Tools::COGs
        G::Tools::GPAC
        G::Tools::GOA
        G::Tools::GlimmerM
        G::Tools::Literature
        G::Tools::KEGG_API
        G::Tools::KEGG_API3
        G::Tools::RCluster
        G::Tools::PEC
        G::SystemsBiology::Serizawa
        G::SystemsBiology::Pathway
        G::SystemsBiology::PathwayAlignment
        G::SystemsBiology::EcellReader
        G::SystemsBiology::KEGG
        G::SystemsBiology::BioLayout
        G::SystemsBiology::Interaction
        G::SystemsBiology::DotE
    ),

    # presage (G::System) functions
    qw(
        G::System::BAS
        G::System::GEMS
        G::System::COMGA
        G::System::STeP
        G::System::CHI
        G::System::ReL8
        G::System::FuncD
    ),

    # database, option-parsing and messenger functions
    qw(
        G::DB::SDB
        SubOpt
        G::Messenger
    ),

    # plugin functions (overrides all) -- keep this entry last
    qw(
        G::DynamicLoader
    ),
);

{
    # Symbolic references are needed to reach each package's @EXPORT by name.
    no strict 'refs';
    for my $package (@reexported_packages) {
        push @EXPORT, @{"${package}::EXPORT"};
    }
}


$VERSION = $G::Skyline::VERSION;

# Best-effort creation of the per-user data directory ~/.glang/data.
# Plain mkdir() is not recursive, so it silently failed whenever ~/.glang
# did not exist yet; mkpath() creates missing parents and is a no-op when
# the directory is already present.  Skip entirely when HOME is unset so
# that we never try to create "/.glang/data" at the filesystem root.
# Failures stay non-fatal (as before): loading G must not die just because
# the directory cannot be created.
if (defined $ENV{HOME} && length $ENV{HOME}) {
    my $data_dir = $ENV{HOME} . '/.glang/data';
    local $@;    # do not leak the eval status to whoever is loading G
    eval {
        require File::Path;
        File::Path::mkpath($data_dir);
        1;
    };
}


#::::::::::::::::::::::::::::::
#          Perldoc
#::::::::::::::::::::::::::::::

1;

__END__

=head1 NAME

G - G-language Genome Analysis Environment Version 2.x core module (Skyline)

=head1 SYNOPSIS

 use G;                          # Imports G-language GAE module 
   
 $gb = new G("ecoli.gbk");       # Creates G's instance at $gb 
                                 # At the same time, read in ecoli.gbk. 
                                 # Read the annotation and sequence 
                                 # information 
                                 # See DESCRIPTION for details
   
 $gb->seq_info();                # Prints the basic sequence information.

 find_ori_ter(\$gb->{SEQ});      # Gives sequence as a reference to
                                 # odyssey functions

=head1 DESCRIPTION

 The G-language GAE fully supports most sequence databases.

=head2 stored annotation information:

=over 4   

 LOCUS  
         $gb->{LOCUS}->{id}              -accession number 
         $gb->{LOCUS}->{length}          -length of sequence  
         $gb->{LOCUS}->{nucleotide}      -type of sequence ex. DNA, RNA  
         $gb->{LOCUS}->{circular}        -1 when the genome is circular.
                                          otherwise 0
         $gb->{LOCUS}->{type}            -type of species ex. BCT, CON  
         $gb->{LOCUS}->{date}            -date of accession 

 HEADER  
    $gb->{HEADER}  

 COMMENT  
    $gb->{COMMENT}  

 FEATURE  
         Each FEATURE is numbered(FEATURE1 .. FEATURE1172), and is a 
         hash structure that contains all the keys of Genbank.   
         In other words,  in most cases, FEATURE$i's hash at least 
         contains the information listed below: 
         $gb->{FEATURE$i}->{start}  
         $gb->{FEATURE$i}->{end}  
         $gb->{FEATURE$i}->{direction}
         $gb->{FEATURE$i}->{join}
         $gb->{FEATURE$i}->{note}  
         $gb->{FEATURE$i}->{type}        -CDS,gene,RNA,etc.

         To analyze each FEATURE, write: 

         $i = 1;  
         while(exists $gb->{"FEATURE$i"}){  
   
                 $i ++;  
         }  

         Each CDS is stored in a similar manner.
         There are 
         $gb->{CDS$i}->{start}
         $gb->{CDS$i}->{end}
         $gb->{CDS$i}->{direction}
         $gb->{CDS$i}->{join}
         $gb->{CDS$i}->{feature}         -number $n for $gb->{FEATURE$n}
                                          where "CDS$i" = "FEATURE$n"

         In the same manner, to analyze all CDS, write:  
   
         $i = 1;  
         while(exists $gb->{"CDS$i"}){  
   
                 $i ++;  
         }

 BASE COUNT  
         $gb->{BASE_COUNT}  

 SEQ  
         $gb->{SEQ}              -sequence data following "ORIGIN" 

=back

=head2 supported methods of G-language Genome Analysis Environment

=over 2

=item new()

         Creates a G instance.
         First option is the filename of the database. Default format is
         the GenBank database.
         Second option specifies detailed actions.

           'no msg'                  suppresses all STDOUT messages printed 
                                     when loading a database, including the
                                     copyright info and sequence statistics.

           'no cache'                suppresses the use of database caching.
                                     By default, databases are cached for
                                     optimized performance. (since v.1.6.4)

           'force cache'             rebuilds database cache.

           'without annotation'      this option skips the annotation.

           'multiple locus'          this option merges multiple loci in the 
                                     database and loads the information
                                     as a single G-language instance.

           'long sequence'           this option uses a pointer of the filehandle 
                                     to read the genome sequence. See 
                                     next_seq() method below for details.

           'bioperl'                 this option creates a G instance from 
                                     a bioperl object. 
                                     eg. $bp = $bp->next_seq();       # bioperl
                                         $gb = new G($bp, "bioperl"); # G

           'longest ORF annotation'  this option predicts genes with longest ORF
                                     algorithm (longest frame from start codon
                                     to stop codon, with more than 17 amino 
                                     acids) and annotates the sequence.

           'glimmer annotation'      this option predicts genes using glimmer2,
                                     a gene prediction software for microbial
                                     genomes available from TIGR.
                                     http://www.tigr.org/softlab/
                                     Local installation of glimmer2 and setting
                                     of PATH environment value is required.

               - following options require bioperl installation -

           'Fasta'              this option loads a Fasta format database.
           'EMBL'               this option loads an EMBL format database.
           'swiss'              this option loads a swiss format database.
           'SCF'                this option loads an SCF  format database.
           'PIR'                this option loads a PIR   format database.
           'GCG'                this option loads a GCG   format database.
           'raw'                this option loads a raw   format database.
           'ace'                this option loads an ace  format database.
           'net GenBank'        this option loads a GenBank format database from 
                                NCBI database. With this option, the first value to 
                                pass to new() function will be the accession 
                                number of the database.

=item output()

         Given a filename and an option, outputs the G-language data object 
         to the specified file in a flat-file database of a given format.
         The options are the same as those of new().  Default format is 'GenBank'.
         eg. $gb->output("my_genome.embl", "EMBL");
             $gb->output("my_genome.gbk"); # with GenBank you can omit the option.

=item complement()

         Given a sequence, returns its complement.
         eg. complement('atgc');  returns 'gcat'

=item translate()

         Given a sequence, returns its translated sequence.
         Regular codon table is used.
         eg. translate('ctggtg'); returns 'LV'

=item $gb->seq_info()

         Prints the basic information of the genome to STDOUT.

=item $gb->DESTROY()

         Destroys the G instance

=item $gb->del_key()

         Given an object, deletes it from the G instance structure
         eg. $gb->del_key('FEATURE1'); deletes 'FEATURE1' hash

=item $gb->getseq()

         Given the start and end positions (starting from 0 as in Perl),
         returns the sequence specified.
         eg. $gb->getseq(1,3); returns the 2nd, 3rd, and 4th nucleotides.

=item $gb->get_gbkseq()

         Given the start and end positions (starting from 1 as in 
         Genbank), returns the sequence specified.
         eg. $gb->get_gbkseq(1,3); returns the 1st, 2nd, and 3rd 
             nucleotides.

=item $gb->get_cdsseq()

         Given a CDS ID, returns the CDS sequence. 
         'complement' is properly parsed.
         eg. $gb->get_cdsseq('CDS1'); returns the 'CDS1' sequence.

=item $gb->get_geneseq()

         Given a CDS ID, returns the CDS sequence, or the exon sequence
         if introns are present.
         'complement' is properly parsed, and introns are spliced out.
         eg. $gb->get_geneseq('CDS1'); returns the 'CDS1' sequence or 
             exon.

=item $gb->feature()

         Returns the array of all feature object names.
         foreach ($gb->feature()){
             print $gb->get_cdsseq($_), "\n";
         }
         prints all feature sequences.

=item $gb->cds()

         Returns the array of all cds object names.

         !CAUTION! the object name is actually the FEATURE OBJECT NAME,
         to enable access to all feature values. However, most of the
         time you do not need to be aware of this difference.

         foreach ($gb->cds()){
             print $gb->get_geneseq($_), "\n";
         }
         prints all gene sequences.

=item $gb->startcodon()

         Given a CDS ID, returns the start codon.
         eg. $gb->startcodon('CDS1'); returns 'atg'

=item $gb->stopcodon()

         Given a CDS ID, returns the stop codon.
         eg. $gb->stopcodon('CDS1'); returns 'tag'

=item $gb->before_startcodon()

         Given a CDS ID and length, returns the sequence upstream of 
         start codon.
         eg. $gb->before_startcodon('CDS1', 100); returns 100 bp  
             sequence upstream of the start codon of 'CDS1'.

=item $gb->after_startcodon()

         Given a CDS ID and length, returns the sequence downstream of 
         start codon.
         eg. $gb->after_startcodon('CDS1', 100); returns 100 bp  
             sequence downstream of the start codon of 'CDS1'.

=item $gb->before_stopcodon()

         Given a CDS ID and length, returns the sequence upstream of 
         stop codon.
         eg. $gb->before_stopcodon('CDS1', 100); returns 100 bp  
             sequence upstream of the stop codon of 'CDS1'.

=item $gb->after_stopcodon()

         Given a CDS ID and length, returns the sequence downstream of 
         stop codon.
         eg. $gb->after_stopcodon('CDS1', 100); returns 100 bp  
             sequence downstream of the stop codon of 'CDS1'.

=item $gb->get_intron()

         Given a CDS ID, returns the intron sequences as array of 
         sequences.
         eg. $gb->get_intron('CDS1'); 
             returns ($1st_intron, $2nd_intron,..)

=item $gb->pos2feature()

         Given a GenBank position (sequence starting from position 1) 
         returns the G-instance ID (ex. FEATURE123) of the feature at
         the given position. If multiple features exist for the given
         position, the first feature to appear is returned. Returns 
         NULL if no feature exists.

=item $gb->pos2gene()

         Given a GenBank position (sequence starting from position 1) 
         returns the G-instance ID (ex. FEATURE123) of the gene at
         the given position. If multiple genes exist for the given
         position, the first gene to appear is returned. Returns 
         NULL if no gene exists.

=item $gb->gene2id()

         Given a GenBank gene name, returns the G object feature ID
         (ex. FEATURE123). Returns NULL if no gene exists.

=item $gb->get_exon()

         Given a CDS ID, returns the exon sequence.
         'complement' is properly parsed, and introns are spliced out.
         eg. $gb->get_exon('CDS1'); returns the 'CDS1' exon.

=item $gb->next_locus()

         Reads the next locus.
         the G instance is then updated.

         do{

         }while($gb->next_locus());

         Enables multiple loci analysis.        

=item $gb->next_seq()

         If the G instance is created with the 'long sequence' option, 
         the $gb->next_seq() method replaces $gb->{SEQ} with the next 
         chunk of sequence.

         while($gb->next_seq(100000)){
             print $gb->{SEQ};
         }

         Enables continuous analysis.

=item $gb->rewind_genome()

         If G instance is created with 'long sequence' option, 
         $gb->rewind_genome() method puts the filehandle pointer back 
         to the ORIGIN position.

=back

=head1 AUTHOR

Kazuharu Gaou Arakawa, gaou@g-language.org

=head1 SEE ALSO

perl(1).

=cut



