"""Martel based parser to read GenBank formatted files.

This is a huge regular expression for GenBank, built using
the 'regular expressions on steroids' capabilities of Martel.

Notes:
Just so I remember -- the new end of line syntax is:
  New regexp syntax - \R
     \R    means "\n|\r\n?"
     [\R]  means "[\n\r]"

This helps us have endlines be consistent across platforms.

Documentation for GenBank format that I found:

o GenBank/EMBL feature tables are described at:
http://www.ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.htm

o There are also descriptions of different GenBank lines at:
http://www.ibc.wustl.edu/standards/gbrel.txt
"""
# standard library
import string
     
# Martel
import Martel
from Martel import Expression

# Whitespace building blocks.  Blank space can be significant in GenBank
# flat files, so the fixed-width runs get their own named expressions.
def _spaces(count):
    """Return Martel.RepN applied to a single-space string and `count`."""
    return Martel.RepN(Martel.Str(" "), count)

# a run of at least one space
blank_space = Martel.Rep1(Martel.Str(" "))
# fixed-width runs used for indentation at various levels
indent_space = _spaces(2)
big_indent_space = _spaces(5)
qualifier_space = _spaces(21)

# first line
# LOCUS       AC007323    86436 bp    DNA             PLN       19-JAN-2000

# the locus name: a run of word characters
# (raw string, so the regex escape is not treated as a Python string escape)
locus = Martel.Group("locus",
                     Martel.Re(r"[\w]+"))
# the sequence length, built from Martel's integer expression
size = Martel.Group("size",
                    Martel.Rep1(Martel.Integer()))
# deal with the different kinds of residues we can have
valid_residue_types = ["DNA", "RNA", "mRNA", "PROTEIN"]
# regular expression alternation of all valid residue types; str.join is
# used rather than the string-module function, which newer Pythons lack
residue_or = "|".join(valid_residue_types)
# the residue type field of the LOCUS line, one of valid_residue_types
residue_type = Martel.Group("residue_type",
                            Martel.Re(residue_or))
# the date field of the LOCUS line: word characters and dashes,
# e.g. 19-JAN-2000 (raw string keeps the regex escape intact)
date = Martel.Group("date",
                    Martel.Re(r"[-\w]+"))

# the PLN, etc stuff indicates data file divisions
valid_divisions = ["PRI", "ROD", "MAM", "VRT", "INV", "PLN", "BCT", "RNA",
                   "VRL", "PHG", "SYN", "UNA", "EST", "PAT", "STS", "GSS",
                   "HTG"]
# regular expression alternation of all division codes; str.join is used
# rather than the string-module function, which newer Pythons lack
divisions_or = "|".join(valid_divisions)
# the division code field of the LOCUS line, one of valid_divisions
data_file_division = Martel.Group("data_file_division",
                                  Martel.Re(divisions_or))

# the complete LOCUS line: the keyword followed by the name, size, unit
# (bp or aa), residue type, division and date, each separated by blank
# space, and terminated by a platform-independent end of line (\R)
locus_line = Martel.Group("locus_line",
                          Martel.Str("LOCUS") +
                          blank_space +
                          locus +
                          blank_space +
                          size +
                          blank_space +
                          Martel.Re("bp|aa") +
                          blank_space +
                          residue_type +
                          blank_space +
                          data_file_division +
                          blank_space +
                          date +
                          Martel.Re(r"\R"))

# DEFINITION line; the text may continue on following lines, e.g.:
# DEFINITION  Genomic sequence for Arabidopsis thaliana BAC T25K16 from
#             chromosome I, complete sequence.

# one or more physical lines, each being blank space followed by the
# text up to the end of that line
definition = Martel.Group(
    "definition",
    Martel.Rep1(blank_space + Martel.ToEol()))

# the keyword followed by the definition text
definition_line = Martel.Group(
    "definition_line",
    Martel.Str("DEFINITION") + definition)

# accession line
# ACCESSION   AC007323

# a single accession: a run of word characters and digits
# (raw string keeps the regex escapes intact)
accession = Martel.Group("accession",
                         Martel.Re(r"[\w\d]+"))

# the keyword followed by one or more blank-space separated accessions
# and a platform-independent end of line (\R)
accession_line = Martel.Group("accession_line",
                              Martel.Str("ACCESSION") +
                              Martel.Rep1(blank_space +
                                          accession) +
                              Martel.Re(r"\R"))

# NID         g44010

# the identifier itself: a run of word characters and digits
# (raw string keeps the regex escapes intact)
nid = Martel.Group("nid",
                   Martel.Re(r"[\w\d]+"))
# the keyword, blank space, the identifier and an end of line (\R)
nid_line = Martel.Group("nid_line",
                        Martel.Str("NID") +
                        blank_space +
                        nid +
                        Martel.Re(r"\R"))

# version and GI line
# VERSION     AC007323.5  GI:6587720

# the versioned accession: word characters, digits and periods
# (raw strings keep the regex escapes intact)
version = Martel.Group("version",
                       Martel.Re(r"[\w\d\.]+"))

# the numeric GI identifier that follows the "GI:" prefix
gi = Martel.Group("gi",
                  Martel.Re(r"[\d]+"))

# NOTE(review): this is the only line expression in this file that starts
# with Expression.AtBeginning() -- confirm whether the anchor is needed.
version_line = Martel.Group("version_line",
                            Expression.AtBeginning() +
                            Martel.Str("VERSION") +
                            blank_space +
                            version +
                            blank_space +
                            Martel.Str("GI:") +
                            gi +
                            Martel.Re(r"\R"))

# KEYWORDS line; the keyword list may wrap onto following lines, e.g.:
# KEYWORDS    antifreeze protein homology; cold-regulated gene; cor6.6 gene;
#             KIN1 homology.

# one or more physical lines, each being blank space followed by the
# text up to the end of that line
keywords = Martel.Group(
    "keywords",
    Martel.Rep1(blank_space + Martel.ToEol()))

# the keyword followed by the keyword text
keywords_line = Martel.Group(
    "keywords_line",
    Martel.Str("KEYWORDS") + keywords)

# SOURCE      thale cress.

# one or more repetitions of: blank space, the text up to the end of the
# line, and then an indent_space run taken from the start of the next line
source = Martel.Group(
    "source",
    Martel.Rep1(blank_space + Martel.ToEol() + indent_space))

# the keyword followed by the source text
source_line = Martel.Group(
    "source_line",
    Martel.Str("SOURCE") + source)

# ORGANISM  Arabidopsis thaliana
#           Eukaryota; Viridiplantae; Embryophyta; Tracheophyta; Spermatophyta;
#           Magnoliophyta; eudicotyledons; core eudicots; Rosidae; eurosids II;
#            Brassicales; Brassicaceae; Arabidopsis.

# the organism name: periods, word characters and spaces
# (raw string keeps the regex escapes intact)
organism = Martel.Group("organism",
                        Martel.Re(r"[\.\w ]+"))

# the taxonomy listing: one or more lines of blank space followed by the
# text up to the end of that line
taxonomy = Martel.Group("taxonomy",
                        Martel.Rep1(blank_space +
                                    Martel.ToEol()))

# the keyword, the organism name on the same line, an end of line (\R)
# and then the taxonomy lines
organism_line = Martel.Group("organism_line",
                             Martel.Str("ORGANISM") +
                             blank_space +
                             organism +
                             Martel.Re(r"\R") +
                             taxonomy)

# REFERENCE   1  (bases 1 to 86436)
#   AUTHORS   Thomashow,M.F.
#   TITLE     Direct Submission
#   JOURNAL   Submitted (01-FEB-1991) M.F. Thomashow, Dept. Crop and Soil
#             Sciences, Dept. Microbiology, Michigan State University, East
#             Lansing, Michigan 48824, USA

# the number of the reference (raw strings keep the regex escapes intact)
reference_num = Martel.Group("reference_num",
                             Martel.Re(r"[\d]+"))

# can have normal references, like that shown above, or references like:
# REFERENCE   1  (sites)
# with no base information or even:
# REFERENCE   2  (bases 1 to 105654; 110423 to 111122)
# The parenthesised text may contain semicolons, word characters, digits,
# spaces and end of line characters ([\R] means "[\n\r]").
reference_bases = Martel.Group("reference_bases",
                               Martel.Str("(") +
                               Martel.Re(r"[;\w\d \R]+") +
                               Martel.Str(")"))
# the keyword, the reference number, the parenthesised base information
# and an end of line
reference_line = Martel.Group("reference_line",
                              Martel.Str("REFERENCE") +
                              blank_space +
                              reference_num +
                              blank_space +
                              reference_bases +
                              Martel.Re(r"\R"))

# AUTHORS: one or more repetitions of blank space, the text up to the end
# of the line, and an indent_space run from the start of the next line
authors = Martel.Group(
    "authors",
    Martel.Rep1(blank_space + Martel.ToEol() + indent_space))
# unlike TITLE and JOURNAL, this expression matches its own leading indent
author_line = Martel.Group(
    "author_line",
    indent_space + Martel.Str("AUTHORS") + authors)

# TITLE: same repetition shape as the authors text
title = Martel.Group(
    "title",
    Martel.Rep1(blank_space + Martel.ToEol() + indent_space))
title_line = Martel.Group(
    "title_line",
    Martel.Str("TITLE") + title)

# JOURNAL: as above, except that the trailing indent_space is optional
journal = Martel.Group(
    "journal",
    Martel.Rep1(blank_space + Martel.ToEol() + Martel.Opt(indent_space)))
journal_line = Martel.Group(
    "journal_line",
    Martel.Str("JOURNAL") + journal)

#  MEDLINE   92119220

# the numeric MEDLINE identifier (raw strings keep the regex escapes intact)
medline_id = Martel.Group("medline_id",
                          Martel.Re(r"[\d]+"))
# the keyword, blank space, the identifier and an end of line (\R)
medline_line = Martel.Group("medline_line",
                            Martel.Str("MEDLINE") +
                            blank_space +
                            medline_id +
                            Martel.Re(r"\R"))

# PUBMED   10617197

# the numeric PubMed identifier
pubmed_id = Martel.Group("pubmed_id",
                         Martel.Re(r"[\d]+"))
# optional leading blank space, the keyword, blank space, the identifier
# and an end of line
pubmed_line = Martel.Group("pubmed_line",
                           Martel.Opt(blank_space) +
                           Martel.Str("PUBMED") +
                           blank_space +
                           pubmed_id +
                           Martel.Re(r"\R"))

# REMARK    This sequence is of BAC F10O3 from Arabidopsis thaliana chromosome

# one or more repetitions of blank space, the text up to the end of the
# line, and an optional indent_space run from the start of the next line
remark = Martel.Group(
    "remark",
    Martel.Rep1(blank_space + Martel.ToEol() + Martel.Opt(indent_space)))
# the keyword followed by the remark text
remark_line = Martel.Group(
    "remark_line",
    Martel.Str("REMARK") + remark)

# A complete reference entry.  The REFERENCE, AUTHORS and JOURNAL lines
# are required; TITLE, MEDLINE, PUBMED and REMARK are optional.
reference = Martel.Group(
    "reference",
    reference_line +
    author_line +
    Martel.Opt(title_line) +
    journal_line +
    Martel.Opt(medline_line) +
    Martel.Opt(pubmed_line) +
    Martel.Opt(remark_line))

# COMMENT     On Dec 16, 1999 this sequence version replaced gi:5729683.

# one or more physical lines, each being blank space followed by the
# text up to the end of that line
comment = Martel.Group(
    "comment",
    Martel.Rep1(blank_space + Martel.ToEol()))
# the keyword followed by the comment text
comment_line = Martel.Group(
    "comment_line",
    Martel.Str("COMMENT") + comment)

# start on the feature table. Eeek -- This is the part I was afraid of
# most!

# the header, so that we know we are heading into some features
# FEATURES             Location/Qualifiers
# NOTE(review): the group is named "feature_line" although the variable is
# features_line; the name is left untouched here because code consuming
# the parse events may depend on it -- confirm before renaming.
features_line = Martel.Group("feature_line",
                             Martel.Str("FEATURES") +
                             blank_space +
                             Martel.Str("Location/Qualifiers") +
                             Martel.Re(r"\R"))

# --- now we need to read in the features one at a time
# -- first, set up the feature keys and locations
# a listing of valid feature keys
valid_f_keys = ["allele", "attenuator", "C_region", "CAAT_signal",
                "CDS", "conflict", "D-loop", "D_segment", "enhancer",
                "exon", "GC_signal", "gene", "iDNA", "intron",
                "J_segment", "LTR", "mat_peptide", "misc_binding",
                "misc_difference", "misc_feature", "misc_recomb",
                "misc_RNA", "misc_signal", "misc_structure",
                "modified_base", "mRNA", "mutation", "N_region",
                "old_sequence", "polyA_signal", "polyA_site",
                "precursor_RNA", "prim_transcript", "primer_bind",
                "promoter", "protein_bind", "RBS", "repeat_region",
                "repeat_unit", "rep_origin", "rRNA", "S_region",
                "satellite", "scRNA", "sig_peptide", "snRNA",
                "source", "stem_loop", "STS", "TATA_signal",
                "terminator", "transit_peptide", "tRNA",
                "unsure", "V_region", "V_segment", "variation",
                "3'clip", "3'UTR", "5'clip", "5'UTR", "-10_signal",
                "-35_signal", "-"]
# regular expression alternation of all feature keys.  Order matters: the
# bare "-" key stays after "-10_signal" and "-35_signal" so that it cannot
# match just their first character.  str.join is used rather than the
# string-module function, which newer Pythons lack.
feature_or = "|".join(valid_f_keys)

# the key naming the kind of feature, one of valid_f_keys
feature_key = Martel.Group("feature_key",
                           Martel.Re(feature_or))

# handle lots of different kinds of locations
# complement(10..20)
# join(10..20,30..40)
# 10..20
# we can have an optional reference to another accession number, ie:
# J00194:(100..202)
# (raw strings below keep the regex escapes intact)
location_ref = Martel.Group("location_ref",
                            Martel.Re(r"[_\d\w]+") +
                            Martel.Str(":"))
# one line's worth of location text: any mix of location punctuation and
# digits, the operator names, a quoted string, or an accession reference.
# NOTE(review): the quoted alternative allows at most one word character
# between the quotes (Opt of a single \w) -- confirm whether longer quoted
# strings are meant to be accepted.
location_part = Martel.Group("location_part",
                             Martel.Rep1(Martel.Re(r"[\<\>\(\)\^\.\,\d]") |
                                         Martel.Str("complement") |
                                         Martel.Str("join") |
                                         Martel.Str("order") |
                                         Martel.Str("replace") |
                                         (Martel.Str('"') +
                                          Martel.Opt(Martel.Re(r"\w")) +
                                          Martel.Str('"')) |
                                         location_ref))


# a full location: blank space followed by one or more location parts,
# each terminated by an end of line (\R); the whole thing may repeat
location = Martel.Group("location",
                        Martel.Rep1(blank_space +
                                    Martel.Rep1(location_part +
                                                Martel.Re(r"\R"))))

# the first line of a feature: the indent, the feature key and its location
feature_key_line = Martel.Group("feature_key_line",
                                big_indent_space +
                                feature_key +
                                location)

# -- now set up all of the info we can have for qualifiers
# a listing of valid qualifier keys
#
# The keys are joined into a regular expression alternation, which takes
# the first alternative that matches.  A key that is a prefix of a longer
# key must therefore come AFTER the longer key ("clone_lib" before "clone",
# "codon_start" before "codon"); otherwise the short key wins and the rest
# of the name spills into the qualifier value.
#
# "mitochondrion" is the correctly spelled qualifier.  The historical
# misspelling "mitochrondrion" is retained so that everything this list
# used to accept is still accepted.
valid_q_keys = ["allele", "anticodon", "bound_moiety", "cell_line",
                "cell_type", "chromosome", "chloroplast", "chromoplast",
                "citation", "clone_lib", "clone", "codon_start", "codon",
                "cons_splice", "country", "cultivar", "cyanelle",
                "db_xref", "dev_stage", "direction", "EC_number",
                "evidence", "exception", "focus", "frequency",
                "function", "gene", "germline", "haplotype",
                "insertion_seq", "isolate", "kinetoplast", "label",
                "lab_host", "map", "macronuclear", "mitochondrion",
                "mitochrondrion",
                "mod_base", "note", "number", "organelle", "organism",
                "partial", "PCR_conditions",
                "pop_variant", "phenotype", "plasmid", "product",
                "protein_id", "proviral", "pseudo", "rearranged",
                "replace", "rpt_family", "rpt_type", "rpt_unit",
                "sequenced_mol", "serotype", "sex", "specific_host",
                "specimen_voucher", "standard_name", "strain", "sub_clone",
                "sub_species", "sub_strain", "tissue_lib", "tissue_type",
                "translation", "transl_table", "transposon", "usedin",
                "variety", "virion"]
# str.join is used rather than the string-module function, which newer
# Pythons lack
qualifier_or = "|".join(valid_q_keys)

# the qualifier name: optional blank space, a slash, one of the valid
# qualifier keys and an optional equals sign
qualifier_key = Martel.Group(
    "qualifier_key",
    Martel.Opt(blank_space) +
    Martel.Str("/") +
    Martel.Re(qualifier_or) +
    Martel.Opt(Martel.Str("=")))

# the qualifier value: the remainder of the key's line, then any number
# of following lines that begin with qualifier_space and whose next
# character is not a slash (a slash would begin another qualifier)
qualifier_value = Martel.Group(
    "qualifier_value",
    Martel.ToEol() +
    Martel.Rep(qualifier_space + Martel.AnyBut("/") + Martel.ToEol()))

# a key together with its value
qualifier = Martel.Group(
    "qualifier",
    qualifier_key + qualifier_value)

# a whole feature: the key/location line and zero or more qualifiers
feature = Martel.Group(
    "feature",
    feature_key_line + Martel.Rep(qualifier))


# BASE COUNT    28300 a  15069 c  15360 g  27707 t

# the counts themselves: word characters, digits and spaces
# (raw strings keep the regex escapes intact)
base_count = Martel.Group("base_count",
                          Martel.Re(r"[\w\d ]+"))
# the keyword, blank space, the counts and an end of line (\R)
base_count_line = Martel.Group("base_count_line",
                               Martel.Str("BASE COUNT") +
                               blank_space +
                               base_count +
                               Martel.Re(r"\R"))

# ORIGIN      
#       1 ggacaaggcc aaggatgctg ctgctgcagc tggagcttcc gcgcaacaag taaacagata

# the ORIGIN keyword, optional trailing blank space and an end of line (\R)
# (raw strings keep the regex escapes intact)
origin_line = Martel.Group("origin_line",
                           Martel.Str("ORIGIN") +
                           Martel.Opt(blank_space) +
                           Martel.Re(r"\R"))
# the number that starts each sequence line
base_number = Martel.Group("base_number",
                           Martel.Re(r"[\d]+"))
# the sequence text on one line: word characters and spaces
sequence = Martel.Group("sequence",
                        Martel.Re(r"[\w ]+"))
# one line of sequence: blank space, the base number, blank space, the
# sequence text and an end of line
sequence_line = Martel.Group("sequence_line",
                             blank_space +
                             base_number +
                             blank_space +
                             sequence +
                             Martel.Re(r"\R"))
# the ORIGIN line followed by one or more sequence lines
sequence_entry = Martel.Group("sequence_entry",
                              origin_line +
                              Martel.Rep1(sequence_line))

# all done!
# //
# the record terminator: two slashes followed by one or more ends of line
# (raw string keeps the \R escape intact)
record_end = Martel.Group("record_end",
                          Martel.Str("//") +
                          Martel.Rep1(Martel.Re(r"\R")))

# A complete GenBank record: the line expressions in the order in which
# they must appear.  The NID, VERSION and COMMENT lines are optional; at
# least one reference and at least one feature are required.
record = (locus_line +
          definition_line +
          accession_line +
          Martel.Opt(nid_line) +
          Martel.Opt(version_line) +
          keywords_line +
          source_line +
          organism_line +
          Martel.Rep1(reference) +
          Martel.Opt(comment_line) +
          features_line +
          Martel.Rep1(feature) +
          base_count_line +
          sequence_entry +
          record_end)


                          



