--Boundary-00=_Pucs+8O1mjqp0a3 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Content-Disposition: inline However, I have nonetheless gotten it all working, or so it appears to me. All, that is, except the other three parser objects, but those shouldn't cause too much trouble to turn into classes, now that I've got the rest worked out. I believe I've managed to keep the parse interface completely untouched - that is, the original test script's "fasta file" test ought to work completely unchanged, even though I ended up messing around a bit with the internal structure. Rather than "clobber" the existing code, I'm attaching my "replacement" files here for review. They are: seq_factory.inc.php - the new seq_factory object (feed it information, call the "createSeq" method, and you get back a seq object). parse2.inc.php - the rearranged version of Nico's parse object (I changed the name so as to keep the original there for me to refer back to as I worked - I replaced the reference to "parse.inc.php" with "parse2.inc.php" in my local copy of "genephp.inc.php" to test it). parse_fasta_class.inc.php - the "class" version of the original parser. test.php - Nico's test script, with the non-fasta stuff commented out (until the other parsers are class-ified). I modified the output slightly just so I could double-check that my changes were still behaving as they were expected to and make the output a little more readable on the command-line as well, but otherwise it's the same script. If everyone finds these changes acceptable to go in, the only requirements for the filetype parsers will be: 1) The parsers must be classes. 2) Memory-based parsers MUST accept an array of lines as a data source. 3) Parsers SHOULD also accept raw text, filehandles, or filenames (only relevant when autodetection is being bypassed). 
/**
 * Converts DOS ("\r\n") and old Mac ("\r") line endings to Unix ("\n").
 *
 * @param string $text text with arbitrary line endings
 * @return string the same text using "\n" line endings only
 */
function normalizeNewline($text)
{
    // str_replace() returns the converted copy instead of editing its
    // argument, so the result must be returned. "\r\n" is listed first so a
    // DOS newline collapses to one "\n" rather than two.
    return str_replace(array("\r\n", "\r"), "\n", $text);
}

/**
 * Reads sequence records from a file or from a block of text and hands them
 * out one at a time as seq objects. Parsed records are kept in a bounded
 * history so the caller can step backwards as well as forwards.
 */
class parse
{
    var $seqfiletype;            // holds the current filetype
    var $seqfiletypes;           // Array with seqfiletypes known in biophp
    var $func;                   // name of function needed to parse the seqfiletype
    var $filename;               // holds path to the sequence file
    var $fp;                     // handle to open file
    var $flines = array();       // array of strings holding file contents
    var $index = array();        // Array holding pointers to parsed records
    var $parsedRecords = array(); // stack of result arrays returned by filetype parser
    var $maxHistory = 1000;      // default history holds up to 1000 records
    var $eof;                    // true when at last record
    var $bof;                    // true when at first record
    var $parserObj;              // parser for the particular filetype
    var $seqfactory;             // instance of seq object factory

    /**
     * Loads the source, works out its file type, builds the matching
     * filetype parser and reads the first record.
     *
     * A constructor cannot make "new parse(...)" evaluate to false, so
     * failure (unreadable/empty source, unknown or unsupported file type,
     * no parsable record) is reported by leaving eof() true and having
     * fetch() return false.
     *
     * @param string $source      path of a readable file, or the sequence text itself
     * @param mixed  $seqfiletype file type name to use, or false to autodetect
     */
    function __construct($source, $seqfiletype = false)
    {
        // set the sequence file types we know about
        //$this->seqfiletypes=array('fasta','genbank','pdraw','swissprot');
        $this->seqfiletypes = array('fasta'); //add rest back in as they get class-ified

        $this->index[] = 0;
        $this->bof = true;
        // Stay "at the end" until a first record has actually been read, so
        // every early exit below leaves the object in a safe state.
        $this->eof = true;

        // check if a file or text was given, fill array $flines.
        // "@" is deliberate: $source may be a long block of sequence text,
        // which is not a sensible path to probe.
        if (is_string($source) && @is_readable($source)) {
            $this->filename = $source;
            $this->flines = @file($this->filename);
        } else { //assume source is text containing data
            $this->flines = explode("\n", normalizeNewline((string) $source));
        }

        if (!$this->flines) {
            $this->flines = array();
            return;
        }

        $this->seqfiletype = $this->autodetect($seqfiletype);
        // Only types listed in $seqfiletypes have been class-ified; anything
        // else is treated as unsupported rather than requiring a parser file.
        if (!$this->seqfiletype || !in_array($this->seqfiletype, $this->seqfiletypes, true)) {
            return;
        }

        require_once(GENPHP_DIR.'/parsers/parse_'.$this->seqfiletype.'_class.inc.php');
        require_once(GENPHP_DIR.'seq_factory.inc.php');

        // A variable class name does what the former eval() did, without
        // executing a constructed string.
        $parserClass = 'parse_' . $this->seqfiletype;
        $this->parserObj = new $parserClass($this->flines);
        $this->seqfactory = new seq_factory();

        //the filetype parser has the flines data now, so we don't need it up here
        $this->flines = array();

        //grab the first record; only a real record goes into the history
        $firstRecord = $this->parserObj->fetchNext();
        if (!$firstRecord) {
            return; //nothing could be read by filetype parser
        }
        $this->parsedRecords[] = $firstRecord;
        $this->eof = false;
    }

    /**
     * Class-named constructor kept so engines that do not know __construct()
     * still initialise the object; it only forwards to __construct().
     */
    function parse($source, $seqfiletype = false)
    {
        $this->__construct($source, $seqfiletype);
    }

    /**
     * Returns the current parsed record as a seq object.
     *
     * @return mixed whatever the factory's createSeq() returns, or false
     *               when there is no current record
     */
    function fetch()
    {
        $record = current($this->parsedRecords);
        if (!$record || !$this->seqfactory) {
            return false;
        }
        return $this->seqfactory->createSeq($record);
    }

    /**
     * Returns the "raw" array of parsed record data for the current record.
     *
     * @return mixed the record array, or false when there is none
     */
    function fetchRawRecord()
    {
        return current($this->parsedRecords);
    }

    /**
     * Returns $seqfiletype when it names a known type, otherwise guesses
     * the type from the first line of $flines.
     *
     * @param mixed $seqfiletype requested type name, or false
     * @return mixed type name, or false when nothing matched
     */
    function autodetect($seqfiletype)
    {
        // test whether we know $seqfiletype (strict, so 0 or true cannot
        // loosely "match" a type name)
        if (in_array($seqfiletype, $this->seqfiletypes, true)) {
            return $seqfiletype;
        }

        $firstLine = isset($this->flines[0]) ? $this->flines[0] : '';

        // now autodetect
        if (substr($firstLine, 0, 5) == "LOCUS") {
            return 'genbank';
        }
        if (substr($firstLine, 0, 1) == ">") {
            return 'fasta';
        }
        if (substr($firstLine, 0, 2) == "ID") {
            return 'swissprot';
        }
        if (strstr($firstLine, 'pDRAW')) {
            return 'pdraw';
        }
        return false;
    }

    /**
     * Moves the history pointer to position $target, counting from the
     * oldest record still held. Does nothing when already there.
     *
     * @param int $target zero-based position within $parsedRecords
     */
    function set_Array_Pointer($target)
    {
        if (key($this->parsedRecords) !== $target) {
            reset($this->parsedRecords);
            for ($i = 0; $i < $target; $i++) {
                next($this->parsedRecords);
            }
        }
    }

    /**
     * Advances to the next record, pulling it from the filetype parser when
     * the pointer is already on the newest record in the history.
     *
     * @return bool true when a next record is now current, false at the end
     */
    function move_Next()
    {
        //check whether there are even any more records
        if ($this->eof) {
            return false;
        }

        $position = key($this->parsedRecords);
        $newest = count($this->parsedRecords) - 1;

        if ($position === null || $position >= $newest) {
            //at the end of the current stack: try to pull the next record
            $record = $this->parserObj->fetchNext();
            if (!$record) {
                //if we couldn't get another record, that must be the end
                $this->eof = true;
                return false;
            }
            $this->parsedRecords[] = $record;

            //drop the oldest records to get back within the history limit,
            //always keeping at least the record just read
            $limit = max(1, (int) $this->maxHistory);
            while (count($this->parsedRecords) > $limit) {
                array_shift($this->parsedRecords);
            }
            // array_shift() rewinds the internal pointer, so position it
            // explicitly on the record that was just appended.
            end($this->parsedRecords);
        } else {
            next($this->parsedRecords);
        }

        // Normally we have moved away from the beginning; position 0 is only
        // possible when the history has been trimmed down to one record.
        $this->bof = (key($this->parsedRecords) === 0);
        return true;
    }

    /**
     * Steps back to the previous record held in the history.
     *
     * @return bool true when the previous record is now current, false when
     *              already at the oldest record
     */
    function move_Previous()
    {
        if ($this->bof) {
            return false;
        }
        if (prev($this->parsedRecords) === false) {
            // Walked off the front of the history: go back to the oldest
            // record instead of leaving the pointer invalid.
            reset($this->parsedRecords);
            $this->bof = true;
            return false;
        }
        if (key($this->parsedRecords) === 0) {
            $this->bof = true;
        }
        $this->eof = false;
        return true;
    }

    /** @return bool true when no further record can be read */
    function eof()
    {
        return (bool) $this->eof;
    }

    /** @return bool true when positioned on the oldest record held */
    function bof()
    {
        return (bool) $this->bof;
    }

    /**
     * Sets how many parsed records are kept for stepping backwards.
     *
     * @param int $maximum history size
     */
    function setMaxHistory($maximum)
    {
        $this->maxHistory = $maximum;
    }
} // end of class parse
// Driver script for class parse: walks forward through every entry of a
// fasta file, then walks back to the first one, printing each record.
ob_implicit_flush(true);

include('../genephp.inc.php');

/* put these back in as they're re-implemented
// test parsing a swissprot file
$parser=new parse('../testdata/lamin.sp');
$s=$parser->fetch();
if ($s) {
  print_r($s->id);
  echo "<br>";
  print_r($s->sequence);
  echo "<br>";
  echo "Just parsed a swissprot file.<br>";
} else echo "Failed parsing the swissprot file.<br>";

// test parsing from text
$lines=file('../testdata/lamin.gb');
foreach ($lines as $line) {
  if (strlen($line)) $sequence.=$line;
}
$p=new parse($sequence);
if ($p) {
  $s=$p->fetch();
  echo "Id: {$s->id}<br>";
  echo "Length: {$s->seqlength}<br>";
  echo "Sequence: {$s->sequence}<br>";
  echo "<br>Just parsed fom text source.<br>";
}

// test parsing from file
$parser=new parse('../testdata/lamin.gb');
if ($parser) {
  $s=$parser->fetch();
  print_r($s);
  echo "<br>Just parsed genbank from file.<br>";
}
*/

// Forward pass: print the current record, then advance, until eof() is true.
$parser = new parse('../testdata/lamin.fasta');
for (; $parser && !$parser->eof(); $parser->move_Next()) {
    $sq = $parser->fetch();
    echo "id: {$sq->id}<br>\n";
    echo "Length: {$sq->seqlength}<br>\n";
    echo "Sequence: {$sq->sequence}<br>\n";
}
echo "At the end.<br>\n";

// Backward pass: step back first, then print, until bof() is true.
while ($parser && !$parser->bof()) {
    $parser->move_Previous();
    $sq = $parser->fetch();
    echo "moved back to:", $sq->id, "\nwhich is:", $sq->seqlength, "bp long<br>\n.";
    echo "<br>\n";
}
echo "Back at the beginning.<br>\n";
/*
  Parser designed and coded by Nicos
  converted to a class by Sean
*/

/**
 * Memory-based FASTA parser. The whole source is read and split into
 * records up front; fetchNext() then returns one record per call.
 */
class parse_fasta
{
    var $flines = array();   //array of lines from the fasta text
    var $records = array();  //individual fasta records, each stored as ">label\nsequence"
    var $currentRecord = ""; //string containing current record (false once exhausted)
    var $position = 0;       //index into $records of the record fetchNext() returns next

    /**
     * Loads the FASTA data and splits it into records.
     *
     * @param mixed $source array of lines, an open file handle, the name of
     *                      an existing file, or the FASTA text itself
     */
    function __construct($source)
    {
        if (is_array($source)) { //assume pre-split flines
            $this->flines = $source;
        } else if (is_resource($source)) { //jump straight to reading
            $this->readLines($source);
        } else if (is_string($source) && @is_file($source)) {
            // "@" is deliberate: $source may be a long block of FASTA text,
            // which is not a sensible path to probe.
            $fileHandle = fopen($source, "r");
            if ($fileHandle !== false) {
                $this->readLines($fileHandle);
            }
        } else { //assumed to be text containing data
            $this->flines = preg_split("/\r\n|\r|\n/", (string) $source);
        }
        $this->splitRecords();
    }

    /**
     * Class-named constructor kept so engines that do not know __construct()
     * still initialise the object; it only forwards to __construct().
     */
    function parse_fasta($source)
    {
        $this->__construct($source);
    }

    /**
     * Appends every line read from $fhandle to $flines, then closes the
     * handle (including one that was supplied by the caller).
     *
     * @param resource $fhandle open, readable file handle
     */
    function readLines($fhandle)
    {
        if (!is_resource($fhandle)) {
            return;
        }
        // Compare against false explicitly: a line consisting of "0" is
        // falsy and must not end the loop. No length limit, so long lines
        // are not cut into pieces.
        while (($line = fgets($fhandle)) !== false) {
            $this->flines[] = $line;
        }
        fclose($fhandle);
    }

    /**
     * Rebuilds $records from $flines. A record starts at a line whose first
     * character is ">" and runs up to the next such line; its sequence lines
     * are joined with all whitespace removed. Text before the first label
     * line is ignored.
     */
    function splitRecords()
    {
        $this->records = array();
        $this->position = 0;

        // Join and re-split so elements that carry their own line ending, or
        // hold several lines, all end up as individual lines.
        $lines = preg_split("/\r\n|\r|\n/", implode("\n", $this->flines));

        $label = null; //null until the first label line has been seen
        $sequence = "";
        foreach ($lines as $line) {
            // Only a ">" in the first column starts a record; a ">" inside a
            // description (e.g. "a -> b") is ordinary text.
            if ($line !== "" && $line[0] === ">") {
                $this->_storeRecord($label, $sequence);
                $label = trim((string) substr($line, 1));
                $sequence = "";
            } else if ($label !== null) {
                $sequence .= preg_replace("/\s+/", "", $line);
            }
        }
        $this->_storeRecord($label, $sequence);
    }

    /**
     * Adds one record to $records; entries with neither label nor sequence
     * are skipped as blank.
     */
    function _storeRecord($label, $sequence)
    {
        if ($label === null || ($label === "" && $sequence === "")) {
            return;
        }
        $this->records[] = ">" . $label . "\n" . $sequence;
    }

    /**
     * Returns the next record as an array with the keys "id", "sequence"
     * and "seqlength" (in that order).
     *
     * @return mixed the record array, or false when there are no more
     *               records; further calls keep returning false
     */
    function fetchNext()
    {
        // An explicit index is used instead of testing $currentRecord, which
        // cannot tell "not started" from "exhausted" and used to restart the
        // iteration after the end had been reported.
        if (!isset($this->records[$this->position])) {
            $this->currentRecord = false;
            return false;
        }
        $this->currentRecord = $this->records[$this->position];
        $this->position++;

        $parts = explode("\n", $this->currentRecord, 2);
        $label = $parts[0];
        if (strncmp($label, ">", 1) === 0) {
            $label = (string) substr($label, 1);
        }

        $record = array();
        $record["id"] = $label;
        // A label with no sequence lines is still a record; it simply has an
        // empty sequence of length 0.
        $record["sequence"] = isset($parts[1]) ? preg_replace("/\s/", "", $parts[1]) : "";
        $record["seqlength"] = strlen($record["sequence"]);
        return $record;
    }
} // end of class parse_fasta.inc.php