[Biophp-dev] Spidey parser

Thu Nov 25 05:51:51 EST 2004

Dear all,

Here is stuff for people who wants to parse spidey file (sorry it is not yet Pearish):
This parser is useful if you to check the locations of several variants of a gene (from Refseq and/or Ensembl) in a genomic region.
In few weeks I will append a BioPHP module to draw the genomic region and the mapped associated variants. Then it will be also possible to append the target sequences of the Affymetrix qualifier in order to visualize which qualifier is specific to a variant.
Any Good or Bad critics are welcome.

FYI I used the BIOPHP files available at this url : http://bioinformatics.org/cgi-bin/cvsweb.cgi/biophp/genephp/

Part(1/5): Links to data in order to test the parser (just have to save the sequence as fasta files):
######################################################################################################
NM_002576: http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?val=42794768&db=Nucleotide&dopt=GenBank
NT_033927: http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?cmd=&txt=&save=&cfm=&term=&_from=7255810&_to=7407682&_strand=2&list_uids=NT_033927.7&db=&extrafeat=24&view=def&dispmax=20&SendTo=on&__from=7250810&__to=7412682&__strand=

Part(2/5): Link to the spidey exec :
######################################################################################################
http://www.ncbi.nlm.nih.gov/spidey/spideyexec.html

Part(3/5): The code of the spidey parser :
######################################################################################################
<?php
/* 

This sourcecode is licensed under the GPL v2.0.
Finally a paser submitted by Fred

http://bioinformatics.org/biophp - a "BioPHP" project
*/

class parse_spidey
{
    var $source;                //might be a string, or file handle(resource)
    var $source_lines=Array();  //if source is a string, this holds its lines
    var $source_fh_opened;      //flag is set when a file handle was opened in parser
    var $eoRecords;             //flag is set when we reach the last record
    var $mrna_features;

//################class constructor###################

    function parse_spidey(&$source) 
    {
        if($source != "") {
            $this->setSource($source);
        }
    }

//###############Get/Set functions##################

    // return the next sequence record as an array,
    function fetchNext() 
    {
        if(!$this->eoRecords) {
            return $this->readRecord();
        }
        return false;
    }

    // declare or change the source data
    // can be an actual string containing data, a file handle, or
    // a filename.
    function setSource($source) {
        if(is_resource($source)) {
            $this->source=$source;
        }  elseif(is_array($source)) {
            //assume an already-split array of lines
            $this->source_lines=$source;
        } elseif(@file_exists($source)) {
            //if passed a filename, opens it
            $this->source=fopen($source,"r");
            if($this->source) {
                $this->source_fh_opened=true;
            }
        } else {
            // assume source is a string containing data
            $this->source_lines=preg_split("/[\r\n]/",$source);
        }
    }

//################"Internal" functions#############

    function readRecord() 
    {
        // a "wrapper" - calls the appropriate function for string or file
        if(is_resource($this->source)) {
            return $this->readfromFile();
        } else {
            return $this->readfromString($this->source_lines);
        }
    }

    // Test if this line contains the start of another result : --SPIDEY version 1.35--
    // Actually it is the first line of the next result in case of sevral results
    //so I did a little trck with the counter 
    // in the function readfromFile() (see below)
    function isEor($line, $line_cpt) 
    {
        // Genbank records end with a double slash
        if (ereg("--SPIDEY version",$line) && $line_cpt > 1) {         
            return true;
        }
        return false;
    }

    // read to next label or end of file, return an array with label and
    // sequence,  or false if no more records
    function readfromFile() 
    {
        // we keep on reading untill we hit an end of record mark
        // accumulate read lines in an array that we feed to readfromString
        $line_cpt = 0;
        while (!(feof($this->source) || $this->isEor($line, $line_cpt)) ) {
            $line_cpt++;
            $line=fgets($this->source,2048);
            $lines[]=$line;
        }
        if (feof($this->source)) {
            $this->eoRecords=true;
            if ($this->source_fh_opened) {
                fclose($this->source);
            }
        }
        return $this->readfromString($lines);
    }

    // parses a Spidey data file 
    function readfromString(&$sourcelines) 
    {

        $record = array();
        $genomic_description ;
        $mrna_description;
        $strand;
        $nbexons;
        $exons = array();
        $exons_location = array();
        $nb_exons = 0;

        while (list($no, $linestr) = each($sourcelines)) {

            if (left($linestr, 7) == "Genomic") {
                $genomic_description = substr($linestr, 9);
            }

            if (left($linestr, 4) == "mRNA") {
                $mrna_description = substr($linestr, 6);
            }

            if (left($linestr, 6) == "Strand") {
                $strand = substr($linestr, 8);
            }
            if (left($linestr, 15) == "Number of exons") {
                $nbexons = substr($linestr, 17);
            }

            if (ereg("^Exon ([[:digit:]]+): ([[:digit:]]+)-([[:digit:]]+) \(gen\) +([[:digit:]]+)-([[:digit:]]+) \(mRNA\) +id ([[:digit:]]+\.[[:digit:]]+%) mismatches ([[:digit:]]+) gaps ([[:digit:]]+) +splice site \(d  a\): +(.+)", $linestr, $infos)) {

                $exons[$infos[1]] = array (

                    'genomic_start'                     => $infos[2],
                    'genomic_end'                       => $infos[3],
                    'mrna_start'                        => $infos[4],
                    'mrna_end'                          => $infos[5],
                    'id'                                => $infos[6],
                    'mismatch'                          => $infos[7],
                    'gaps'                              => $infos[8],
                    'splice_site_donor_acceptor'        => $infos[9]
                );

               $exons_location[] =     $infos[2] . "-" .  $infos[3];

            }

            if (left($linestr, 22) == "Number of splice sites") {
                $nbsplicesites = substr($linestr, 24);
            }

            if (left($linestr, 13) == "mRNA coverage") {
                $mrnacoverage = substr($linestr, 15);
            }

            if (left($linestr, 24) == "overall percent identity") {
                $overallpercentid = substr($linestr, 26);
            }

            if (left($linestr, 17) == "Missing mRNA ends") {
                $missingmrnaends = substr($linestr, 19);
            }

            if (left($linestr, 33) == "Non-aligning poly(A)+ tail length") {
                $polyatailslength = substr($linestr, 35);
            }

        } // CLOSES 1st (outermost) while ( list($no, $linestr) = each($flines) ) 

        $record[genomic]          = $genomic_description  ; 
        $record[mrna]             = $mrna_description  ;
        $record[strand]           = $strand  ;
        $record[nbexons]          = $nbexons  ;
        $record[exons]            = $exons;
        $record[exons_location]   = $exons_location;
        $record[nbsplicesites]    = $nbsplicesites  ; 
        $record[mrnacoverage]     = $mrnacoverage  ;
        $record[overallpercentid] = $overallpercentid  ;
        $record[missingmrnaends]  = $missingmrnaends  ;
        $record[polyatailslength] = $polyatailslength  ;

        //The following method build an array for the rna_features of a genomic sequence
        $this->seqRnaFeatures($record);

        return $record;                        
    } // CLOSES parse_spidey()

    function seqRnaFeatures($record) {

        $location = $record[exons_location];

        //db_xref part
        $db_xref = array();

        if (ereg('gi\|([[:digit:]]+)\|', $record[mrna], $tab))                               $db_xref[] = "GI=$tab[1]";
        if (ereg('ref\|([[:alpha:]]{2}_[[:digit:]]+\.[[:digit:]]+)\|', $record[mrna], $tab)) $db_xref[] = "Refseq=$tab[1]";
        if (ereg('(NM_[[:digit:]]+) ', $record[mrna], $tab))                                 $db_xref[] = "Refseq=$tab[1]";
        if (ereg('(ENST[[:digit:]]+) ', $record[mrna], $tab))                                $db_xref[] = "Ensembl=$tab[1]";

        $this->mrna_features = array(
            'acc'      => $tab[1],
            'location' => $record[exons_location],
            'strand'   => $record[strand]
        );

    }

}
?>

Part(4/5): The code of the fasta parser that I modified a little.
######################################################################################################
<?php

/**
 *  fasta_parser_class.php (renamed to parse_fasta for GenePHP compatability)
 *
 * v0.0.01 - 15-March-2003
 *
 * v0.0.1 - 5-May-2003 - converted to class for GenePHP import module
 *
 * For iterating through a file(handle) or array of lines, returns
 * an array for each record
 *
 * Initially written and maintained by Sean M. Clark
 * Consider this licensed under the GPL v2.0.
 *
 * @package biophp
 * @author Sean Clark
 * @license http://opensource.org/licenses/gpl-license.php GNU Public License
 * @link http://bioinformatics.org/biophp 
 * - a "BioPHP" project
*/

/**
 * Support class for seqIOimport class.  Parses Fasta files
 * 
 * @package biophp
 * @author Sean Clark
 */ 
class parse_fasta
{
    var $source; //might be a string, or file handle(resource)
    var $next_label; //contents of just-read label
    var $source_lines=Array(); //if source is a string, this holds its lines
    var $sourcetype; //tracks whether we need to close filehandle ourselves

//################class constructor###################

    /**
     * Parse_Fasta constructor
     */
    function parse_fasta(&$source) {
        if($source != "") {
            $this->setSource($source);
        }
    }

//###############Get/Set functions##################

    /** 
     * Return the next sequence record as an array,
     * @return Array
     */
    function fetchNext()
    {
        $record=Array(); //for returning results
        if($this->next_label) {
            list($id,$sequence)=$this->findNextLabel();
            //treat the Defline used at NCBI
            //>gi|42794768|ref|NM_002576.3| Homo sapiens p21/Cdc42/Rac1-activated kinase 1 (STE20 homolog, yeast) (PAK1), mRNA
            if(ereg("^gi", $id)) {
                genbankDefline($id, $record);
            }
            elseif(ereg("^ref", $id)) {
                refseqDefline($id, $record);
            }
            else {
            $record['id']=$id;
            }
            $record['sequence']=$sequence;
            return $record;
        }
        else {
            //no more records in file
            return false;
        }
    }

    /**
     * <p>Determines whether the source is a file or a string</p>
     *  Mainly for checking whether or not you've accidentally
     * passed an invalid filename
     */
    function isFromFile() {
        if(is_resource($this->source)) {
            return true;
        }
        else {
            return false;
        }
    }

    /**
     * <p>Declare or change the source data</p>
     * Can be an actual string containing data, a file handle, or
     * a filename.
     */
    function setSource(& $source) {
        if(is_resource($source)) {
            $this->source=$source;
            $this->sourcetype='resource';
            $this->readToLabel(); //find first label

        }
        elseif(is_array($source)) {
            //assume an already-split array of lines
            $this->sourcetype='text';
            $this->source_lines=$source;
        }
        elseif(@file_exists($source)) {
            //if passed a filename, opens it
            echo("$source");
            $this->sourcetype='file';
            $this->source=fopen($source,"r");
            $this->readToLabel();
        }
        else {
        // assume source is a string containing data
            $this->sourcetype='text';
            $this->source_lines=preg_split("/[\r\n]/",$source);
        }
    }

//################"Internal" functions#############

    function findNextLabel()
    {
        // a "wrapper" - calls the appropriate function for string or file
        if($this->isFromFile()) {
            return $this->findNextInFile();
        }
        else {
            return $this->findNextInString();
        }
    }

    /**
     * <p>Read to next label or end of file</p>
     * Return an array with label and
     * sequence,  or false if no more records
     */
    function findNextInFile()
    {
        $sequencedata="";  //hold collected sequence lines
        $label=$this->next_label;
        while(!(preg_match("/^>/",$line=fgets($this->source,1024)) || feof($this->source))) {
            //keep adding remaining lines up until end of file or next label
            $sequencedata.=trim($line);
        }
        if(!(feof($this->source))) {
            $this->next_label=trim(substr($line,1)); //remove the leading > character
        }
        else {
            if ($this->sourcetype=='file') {
                fclose($this->source);
            }

            $this->next_label=false;
        }
        return Array($label,$sequencedata);
    }

    /** 
     * <p>Pulls the next sequence from the string array</p>
     * Returns an array with label,sequence
     */
    function findNextInString()
    {
        $label=$this->next_label;
        while ((!preg_match("/^>".$label."/",$line=array_shift($this->source_lines))) && (count($this->source_lines) > 0) ) {
            //keep adding remaining lines up until end of file or next FASTA label
            $sequencedata.=trim($line);
        }
        if(count($this->source_lines) > 0) {
            $this->next_label=trim(substr($line,1)); //remove the leading > character
        }
        else {
            $this->next_label=false;
        }
        return Array($label,$sequencedata);
    }

    /**
     * "wrapper" - calls appropriate function type
     */
    function readToLabel($label=".*")
    {
        if($this->isFromFile()) {
            return $this->readFileToLabel($label);
        }
        else {
            return $this->readStringToLabel($label);
        }
    }

    /**
     * scans to next label by default, or to the label name specified
     */
    function readFileToLabel($label=".*")
    {
        while(!(preg_match("/^>".$label."/",$line=fgets($this->source,1024)) || feof($this->source))) {
             //do nothing, really...just scan until that point
        }
        if(feof($this->source)) {
            $this->next_label=false;
            if($this->sourcetype=='file') {
                fclose($this->source);
            }
        }
        else {
            $this->next_label=trim(substr($line,1));
        }
    }

    /**
     * Advances through the source array to the specified label
     */
    function readStringToLabel($label=".*")
    {
        while(!(preg_match("/^>".$label."/",$line=array_shift($this->source_lines))) && (count($this->source_lines) > 0)) {
         //do nothing, really...just scan until that point
        }
        if(count($this->source_lines) < 1) {
            $this->next_label=false;
        }
        else {
            $this->next_label=trim(substr($line,1));
        }
    }

}

//################functions to treat get informations from the defline#############

    function genbankDefline($id, &$record)
    {
        $id = substr($id, 3); //take off gi|
        list($id, $db, $accession_version, $definition) = explode("|", $id);
        list($accession, $version) = explode(".", $accession_version);
        $record['ncbi_gi_id']   =$id;
        $record['id']           =$accession;
        $record['accession']    =$accession;
        $record['version']      =$accession_version;
        $record['definition']   =$definition;
        if (ereg("mRNA", $definition)) $record['moltype'] = "mRNA";
        if (ereg("Homo sapiens", $definition)) {
            $record['source']   = "Homo sapiens (human)";
            $record['organism'] = "Homo sapiens";
        }

    }

    function refseqDefline($id, &$record)
    {
        $id = substr($id, 4); //take off ref|
        list($accession_version, $definition) = explode("|", $id);
        list($accession, $version) = explode(".", $accession_version);
        $record['accession']    =$accession;
        $record['version']      =$accession_version;
        $record['definition']   =$definition;
        if (ereg("mRNA", $definition)) $record['moltype'] = "mRNA";
        if (ereg("Homo sapiens", $definition)) {
            $record['source']   = "Homo sapiens (human)";
            $record['organism'] = "Homo sapiens";
        }

    }

?>

Part(5/5): The code of the test file
######################################################################################################
<?php
define("GENEPHP_DIR","/home/ffleche/GENEPHP/");
define("MY_DIR","/home/ffleche/data_from_me/");

include(GENEPHP_DIR . 'parsers/spidey.inc.php');
include(GENEPHP_DIR . 'parsers/fasta.inc.php');
include(GENEPHP_DIR . 'genephp.inc.php');

###########partie ou on transforme genomique fasta en ensembl
$parser=new seqIOimport(MY_DIR . 'pak1_genomique.fa', 'fasta');

if ($parser) {
   while (!$parser->eof()) {
      $s=$parser->fetch();
      //Test the data concerning the Genomic sequence
      echo "Id: $s->id<br>\n";
      echo "Accession: $s->accession<br>\n";
      echo "Version: $s->version<br>\n";
      echo "ncbi_gi_id: $s->ncbi_gi_id<br>\n";
      echo "definition: $s->definition<br>\n";
      echo "moltype: $s->moltype<br>\n";
      echo "source: $s->source<br>\n";
      echo "organism: $s->organism<br>\n";
      //echo "Sequence: $s->sequence<br>\n";
      echo "Length: $s->seqlength<br>\n";
      echo "Date: $s->date<br>\n";
      $parser->Move_Next();
   }

}

###########Execute Spidey
exec('/home/ffleche/spidey.linux -i /home/ffleche/data_from_me/pak1_genomique.fa -m /home/ffleche/data_from_me/NM_002576.fa -p 0 -o /home/ffleche/data_from_me/resultatspidey -R /home/ffleche/data_from_me/repbase');

###########Parse Spidey
if(!$fh_spidey = fopen(MY_DIR . "resultatspidey", r)) die("problem to create resultatspidey");

$importer = new parse_spidey($fh_spidey);
$mrnafeatures = array();

while(!feof($importer->source)) {

$f = $importer->fetchNext();

}

###########Append the mRNA features to the Genomic sequence
$s->features[mRNA] =  $importer->mrna_features;
print_r($s->features);

?>