<?php

class UniGene {
	
	// UniGene DLST: Dynamic Local Storage Tool
	// R. Hart (c) 2005 Rutgers, The State University
	// rhart@rci.rutgers.edu
	
	// unigene.class.inc 
	
	// DLST is free software; you can redistribute it and/or
	// modify it under the terms of the GNU General Public License
	// as published by the Free Software Foundation; either version 2
	// of the License, or (at your option) any later version.
	//
	// DLST is distributed in the hope that it will be useful,
	// but WITHOUT ANY WARRANTY; without even the implied warranty of
	// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	// GNU General Public License for more details.
	//
	// http://www.gnu.org/copyleft/gpl.html
	//
	// You should have received a copy of the GNU General Public License
	// along with this program; if not, write to the Free Software
	// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
	
	// Database info:
	// table unigene - create by running "unigene.sql" in main folder
	// holds records of genomes available at ncbi and which are stored locally
	
	// table XXXdata: (XXX=genome code)
	// holds records of clusters, one cluster per row
	
	// table XXXsequence: (XXX=genome code)
	// holds records of genbank accession numbers clustered into a specific unigene cluster
	// many rows per cluster
	
	// table XXXexpress: (XXX=genome code)
	// holds tissue expression data for each cluster
	// many rows per cluster

	// Class Members
	
	// FTP NCBI data -- Set $ftp_pw to your email address
	// (used as the password for the anonymous FTP login in ncbi_connect())
	var $ftp_server="ftp.ncbi.nih.gov";
	var $ftp_directory="repository/UniGene";
	var $ftp_uname="anonymous";
	var $ftp_pw="a@b.com";  // insert your email address
	var $ftp_info="*.info";  // file pattern passed to ftp_nlist()/ftp_rawlist() to list the per-genome info files
	var $genome_code;  // filled in as an array of genome codes by ncbi_get_titles() and refresh_genome_table()

	// Database info -- Change these to match your MySQL database server
	var $db_server = "localhost";  // "localhost" is default
	var $db_uname = "root"; // "root" is default, but you may wish to create a dedicated username for this purpose
	var $db_pw = ""; // defaults to a blank for no password
	var $db_name = "chips"; // defaults to "chips" but you can use any database name you like
	var $db; // leave unassigned; db_connect() stores the MySQL link here
	
	// Temporary local (web server) folder to hold files.  MUST have write access for the user running Apache.
	// The value is concatenated directly with file names, so it must end with a slash.
	var $local_file_dir = "/Library/WebServer/Documents/dlst/files/";
	
	// Admin - variable to switch on administrative access
	// this will be replaced with a proper authentication system in future versions
	// but for now you can turn on or off admin access here
	// NOTE(review): no method in this file reads $this->admin; the methods take an
	// $admin parameter instead, so presumably the calling page passes this value in.
	var $admin = true;
	
	// Miscellaneous re-used variables
	var $graphic = "img/unigene_banner_dlst.gif";  // image used in header
	var $search_page = "http://www.mysite.com"; // link to a local search page using database
	var $crlf = "\r\n";  // line terminator appended to emitted HTML

	
	// Constructor
	function UniGene(){
		// Constructor (named after the class); there is nothing to
		// initialise yet, so it simply reports success.
		return true;
	}
	
	
	// Functions
	function page_head($the_heading="", $the_page="", $admin=false){
		// Emits the opening HTML of a page: <head> with title and stylesheet,
		// the banner image (linked to NCBI UniGene) and the navigation bar.
		//   $the_heading - text placed in the <title> element
		//   $the_page    - current page name; "status" suppresses the Status link
		//   $admin       - when true, adds the "Refresh Table From NCBI" link
		// Always returns true.
		$nl = $this->crlf;

		// document head
		$html  = "<html>".$nl;
		$html .= "<head>".$nl;
		$html .= "<title>".$the_heading."</title>".$nl;
		$html .= "<link rel='stylesheet' href='include/style.css' type='text/css'>".$nl;
		$html .= "</head>".$nl;
		$html .= "<body>".$nl;

		// banner row
		$html .= "<table border='0'><tr>";
		$html .= "<td><a href='http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=unigene'>";
		$html .= "<img border='0' src='".$this->graphic."'></a></td></tr>".$nl;

		// navigation row
		$html .= "<tr>";
		$html .= "<td class='center'>| <a href='".$this->search_page."'>Search Page</a> ";
		if ($the_page != "status") {
			$html .= "| <a href='index.php'>Status</a> ";
		}
		if ($admin) {
			$html .= "| <a href='index.php?go=refresh'>Refresh Table From NCBI</a> ";
		}
		$html .= "|</td></tr></table>".$nl;

		print $html;
		return true;
	}
	
	
	function page_tail(){
		// Emits the page footer: the copyright line followed by the closing
		// </body> and </html> tags.  Always returns true.
		$nl = $this->crlf;

		$footer  = "<p class='small'>Copyright &copy; 2005, R. Hart, Rutgers, The State University</p>";
		$footer .= "</body>".$nl;
		$footer .= "</html>".$nl;

		print $footer;
		return true;
	}
	
	function genome_table($admin){
		// Reads every row of the `unigene` table and prints it as an HTML
		// status table (genome code, title, build available at NCBI, build
		// stored locally, date stored).
		//   $admin - when true, each row gets a radio button and a "Go"
		//            submit button is added so a genome can be selected for
		//            update (the form submits go=update to index.php).
		// Returns true when the table was printed, false when the database
		// could not be reached or the query failed.
		if($admin){
			print "<!--Admin permissions ON-->".$this->crlf;  //debug html comment
		}

		// open connection to the local MySQL database
		if(!$this->db_connect()){
			echo "<p>Unable to read the UniGene table: no database connection.</p>".$this->crlf;
			return(false);
		}

		$sql = "select * from unigene order by genomecode asc";
		$result = mysql_query($sql);
		if(!$result){
			echo "<!--Error running sql: $sql ".mysql_error()."-->".$this->crlf;
			return(false);
		}
		
		echo "<form name='getug' method='get' action='index.php'>".$this->crlf;
		echo "<input type='hidden' name='go' value='update'>".$this->crlf;
		
		echo "<h2>Current status of UniGene tables</h2>".$this->crlf;
		echo "<table border='1'>".$this->crlf;
		echo "<tr>"
			.($admin?"<th>Update</th>":"")
			."<th>Genome<br>Code</th><th>Genome</th><th>Build<br>Available</th>"
			."<th>Build<br>Stored</th><th>Date<br>Stored</th>"
			."</tr>"
			.$this->crlf;
		
		while($row=mysql_fetch_assoc($result)){
			// Values come from the database (ultimately from NCBI files), so
			// escape them before placing them in HTML text or in the
			// single-quoted attributes used below.
			$code  = htmlspecialchars($row['genomecode'], ENT_QUOTES);
			$title = htmlspecialchars($row['title'], ENT_QUOTES);
			// first 200 characters of the stored info file become the link tooltip
			$tip   = htmlspecialchars(substr(stripslashes($row['infofilencbi']),0,200), ENT_QUOTES);
			$href  = "ftp://ftp.ncbi.nih.gov/repository/UniGene/"
				.htmlspecialchars(ucfirst($row['genomecode']), ENT_QUOTES).".info";
			
			echo "<tr>";
			if($admin) {
				echo "<td class='center'><input type='radio' name='genome' value='".$code."'></td>";
			}
			echo "<td class='center'>".$code."</td>";
			echo "<td><a target='_blank' href='".$href."' title='".$tip."'>".$title."</a></td>";
			echo "<td class='center'>".htmlspecialchars($row['buildncbi'], ENT_QUOTES)."</td>";
			echo "<td class='center'>".($row['buildlocal']?htmlspecialchars($row['buildlocal'], ENT_QUOTES):"&nbsp;")."</td>";
			// "0000-00-00" is what MySQL stores for a date that was never set
			echo "<td class='center'>".(($row['infodatelocal'] != "0000-00-00")?htmlspecialchars($row['infodatelocal'], ENT_QUOTES):"&nbsp;")."</td>";

			echo "</tr>".$this->crlf;
		}
		echo "</table>".$this->crlf;
		if($admin) {
			echo "<input type='submit' name='button' value='Go'>".$this->crlf;
		}
		// The form is opened unconditionally above, so it must be closed
		// unconditionally too (previously it was only closed for admins).
		echo "</form>".$this->crlf;
		return(true);
	}
	

	
	function ncbi_connect(){
		// Opens an FTP session to $this->ftp_server, logs in with
		// $this->ftp_uname / $this->ftp_pw, changes into $this->ftp_directory
		// and switches to passive mode.
		// Returns the FTP connection handle on success, or false on any
		// failure (the connection, if one was opened, is closed first).
		// Progress and errors are written to the page, mostly as HTML comments.

		// establish connection to ncbi ftp server
		$conn_id = ftp_connect($this->ftp_server); 

		// Only attempt the login when the connect succeeded; ftp_login()
		// must not be handed a failed connection.
		$login_result = false;
		if ($conn_id) {
			$login_result = ftp_login($conn_id, $this->ftp_uname, $this->ftp_pw); 
		}

		// check connection
		if ((!$conn_id) || (!$login_result)) { //error condition
			echo "<p>FTP connection has failed!<br>";
			echo "Attempted to connect to $this->ftp_server for user $this->ftp_uname </p>"; 
			if ($conn_id) {
				ftp_close($conn_id);  // connected but login refused: don't leak the socket
			}
			return(false); 
		}

		//success, hide in html comment
		echo "<!--Connected to $this->ftp_server, for user $this->ftp_uname -->".$this->crlf;

		// change the directory 
		if (!ftp_chdir($conn_id, $this->ftp_directory)) { //error
			echo "<!--Couldn't change directory-->".$this->crlf;
			ftp_close($conn_id);
			return(false);
		}
		//success, hide in html comment
		echo "<!--Current directory is now: " . ftp_pwd($conn_id) . "-->".$this->crlf;
		
		//set passive mode = true, this seems to help on some servers
		ftp_pasv($conn_id, true);

		return($conn_id);
	}
	
	function ncbi_get_titles($conn_id){
		// Lists the files matching $this->ftp_info (one *.info file per
		// genome) in the current FTP directory and stores the part of each
		// file name before the first "." in the member array genome_code.
		//   $conn_id - FTP connection handle as returned by ncbi_connect()
		// Returns true on success, false when the listing could not be
		// obtained (genome_code is then left untouched).
		
		$info_tables = ftp_nlist($conn_id, $this->ftp_info);

		// ftp_nlist() returns false on failure; looping over that would
		// store a single bogus empty genome code.
		if (!is_array($info_tables)) {
			echo "<!--Failed to list info files on ftp server-->".$this->crlf;
			return(false);
		}

		// start from a clean array so codes from an earlier, longer listing
		// do not linger past the end of the new one
		$this->genome_code = array();

		foreach (array_values($info_tables) as $i => $file_name) {
			$dot = strpos($file_name,".");
			// a name without a dot is kept whole instead of becoming ""
			$this->genome_code[$i] = ($dot === false) ? $file_name : substr($file_name,0,$dot);
		}

		return(true);
	}


	
	function ncbi_close($conn_id){
		// Closes the given FTP stream.
		// Returns true when ftp_close() reports success, false otherwise.
		$closed = ftp_close($conn_id);
		return $closed ? true : false;
	}
	
	function db_connect(){
		// Connects to the MySQL server described by $this->db_server,
		// $this->db_uname and $this->db_pw, stores the link in $this->db and
		// selects the database $this->db_name.
		// Returns true on success, false when either the connection or the
		// database selection fails.  The outcome is reported in an HTML comment.
		$this->db = mysql_connect($this->db_server, $this->db_uname, $this->db_pw);

		// Check the link first: mysql_select_db() must not be called with a
		// failed connection.
		if(!$this->db){
			echo "<!--Unable to connect to database: ".mysql_error()."-->".$this->crlf;
			return false;
		}

		if(mysql_select_db($this->db_name,$this->db)){
			echo "<!--Connected to database-->".$this->crlf;
			return true;
		}

		echo "<!--Unable to connect to database: ".mysql_error($this->db)."-->".$this->crlf;
		return false;
	}
	
	function refresh_genome_table($admin){
		// Connects to NCBI, lists the *.info files, downloads and parses each
		// one (read_info) and writes one row per genome into the `unigene`
		// table with "replace".  The locally stored build and date
		// (buildlocal / infodatelocal) of an existing row are read first and
		// written back, so a refresh does not forget what has been downloaded.
		// Each stored genome is also printed as a row of an HTML table.
		//   $admin - only used to emit a debug HTML comment here
		// Returns true when all rows were stored; false when the FTP or
		// database connection fails, the listing fails, or a replace fails.
		
		if($admin){
			print "<!--Admin permissions ON-->".$this->crlf;
		}
		
		echo "<h2>Updating Local UniGene Table</h2>".$this->crlf;
		$conn_id = $this->ncbi_connect();
		if(!$conn_id){
			return(false);  // ncbi_connect() has already reported the problem
		}
		if(!$this->db_connect()){
			$this->ncbi_close($conn_id);
			return(false);
		}
		
		//get full directory list for info files to array
		$buff = ftp_rawlist($conn_id,$this->ftp_info);
		if(!is_array($buff)){
			echo "<!--Failed to list info files on ftp server-->".$this->crlf;
			$this->ncbi_close($conn_id);
			return(false);
		}
		
		//create table output
		echo "<table border='1'>".$this->crlf;
		echo "<tr><th>Code</th><th>Genome</th><th>Build</th><th>Date</th></tr>".$this->crlf;
		
		//loop through each info file directory entry
		for ($i=0;$i<sizeof($buff);$i++) {
			
			//parse i-th buff entry
			$dir = $this->read_dir_line($buff[$i]);
			if(!isset($dir['name']) || $dir['name'] == ""){
				continue;  // not a recognisable directory line
			}
			$dot = strpos($dir['name'],".");
			$this->genome_code[$i] = ($dot === false) ? $dir['name'] : substr($dir['name'],0,$dot);
			$genome_code = strtolower($this->genome_code[$i]);
			$genome_info_date = date("Y-m-d",$dir['date']);
			
			//parse info file
			$info = $this->read_info($dir['name'],$conn_id);
			if(!is_array($info)){
				// Download failed (read_info has reported it).  Skip this
				// genome rather than overwrite its row with blank values.
				continue;
			}
			
			//recover data about previously loaded genomes
			$localdata = array('buildlocal'=>"", 'infodatelocal'=>"");
			$sql = "select buildlocal, infodatelocal from unigene where genomecode='"
					.mysql_real_escape_string($genome_code)."'";
			$return = mysql_query($sql);
			if($return) {
				// mysql_fetch_assoc() yields false when the genome has no row yet
				$row = mysql_fetch_assoc($return);
				if($row) {
					$localdata = $row;
				}
			}
			
			//construct sql string containing new ncbi info and previously loaded genomes
			//NOTE: $info['infofile'] arrives already escaped with addslashes()
			//by read_info(), so it must not be escaped a second time here.
			$sql = "replace unigene (genomecode, title, buildlocal, buildncbi, infodatelocal, infodatencbi, infofilencbi) "
					."values ('".mysql_real_escape_string($genome_code)."', "
						."'".mysql_real_escape_string($info['title'])."', "
						."'".mysql_real_escape_string($localdata['buildlocal'])."', "
						."'".mysql_real_escape_string($info['build'])."', "
						."'".mysql_real_escape_string($localdata['infodatelocal'])."', "
						."'".$genome_info_date."', "
						."'".$info['infofile']."')";

			//execute and check results		
			if(!(mysql_query($sql))){
				echo "<!--Failed to replace genome ".htmlspecialchars($this->genome_code[$i], ENT_QUOTES)."-->".$this->crlf;
				// leave the page and the FTP session in a clean state
				echo "</table>".$this->crlf;
				$this->ncbi_close($conn_id);
				return(false);
			}
			
			//success--output table row
			echo "<tr><td class='center'>".htmlspecialchars($this->genome_code[$i], ENT_QUOTES)."</td><td>"
				.htmlspecialchars($info['title'], ENT_QUOTES)."</td><td class='center'>"
				.htmlspecialchars($info['build'], ENT_QUOTES)."</td><td class='center'>"
				.$genome_info_date."</td></tr>".$this->crlf;
		}
		
		echo "</table>".$this->crlf;
		$this->ncbi_close($conn_id);
		return(true);
	}
	
	function read_info($gc,$ftp_id) {
		// Downloads the info file named $gc over the FTP connection $ftp_id
		// into a temporary file (tmp.txt in $this->local_file_dir) and parses it.
		//   $gc     - remote file name of the info file
		//   $ftp_id - open FTP connection handle
		// Returns an array with keys
		//   'build'    - digits following "UniGene Build #" on the first line
		//   'title'    - remainder of the first line, trimmed
		//   'infofile' - the whole file, passed line by line through addslashes()
		//                (callers embed it in SQL without further escaping)
		// or false when the temp file cannot be created or the download fails.
		// 'build' and 'title' are empty strings when the first line does not
		// have the expected form.  The temp file is always removed.
		
		$tmp_name = $this->local_file_dir."tmp.txt";
		
		// "w+" rather than "x+": a tmp.txt left behind by an interrupted run
		// must not make every later call fail.
		// NOTE(review): the fixed file name is shared by all requests, so two
		// simultaneous calls could overwrite each other's download.
		$tfile = fopen($tmp_name,"w+");
		if(!$tfile){
			echo "<!--Error creating temp file-->".$this->crlf;
			return(false);
		}
		
		$tmp = false;
		
		// get the info file and grab first line
		if(!(ftp_fget($ftp_id,$tfile,$gc,FTP_ASCII,0))){
			echo "<!--Failed to get info file via ftp: ".$gc."-->".$this->crlf;
		} else {
			$tmp = array('build'=>"", 'title'=>"", 'infofile'=>"");
			
			rewind($tfile);
			$line = fgets($tfile); //retrieve first line
			if(preg_match('/^UniGene Build #([0-9]*)[[:space:]](.*)/',(string)$line,$reg)){
				$tmp['build'] = $reg[1];
				$tmp['title'] = trim($reg[2]);
			}
			
			// second pass: keep the complete file text
			rewind($tfile);
			while(($chunk = fgets($tfile)) !== false){
				$tmp['infofile'] .= addslashes($chunk);
			}
		}
		
		// clean up on every path, including a failed download
		fclose($tfile);
		unlink($tmp_name);
		return($tmp);
	}
	
	function read_dir_line($line){
		// Parses one line of a long-format FTP directory listing, e.g.
		//   -r--r--r--   1 ftp  anonymous  1234 Jan 15  2004 Hs.info
		// into an array with keys:
		//   'size'     - 5th column
		//   'mon'      - 6th column (month name)
		//   'day'      - 7th column
		//   'timeyear' - 8th column, either a year or an "hh:mm" time
		//   'name'     - everything after the 8th column (file name)
		//   'date'     - timestamp from strtotime() for the entry's date
		// Columns missing from a short line come back as empty strings.

		// Split on runs of whitespace.  The limit of 9 keeps a file name that
		// itself contains spaces in one piece.
		$parsed_array = preg_split('/\s+/', trim((string)$line), 9);
		$parsed_array = array_pad($parsed_array, 9, "");

		$tmp_array = array();
		$tmp_array['size']=$parsed_array[4];
		$tmp_array['mon']=$parsed_array[5];
		$tmp_array['day']=$parsed_array[6];
		$tmp_array['timeyear']=$parsed_array[7];
		$tmp_array['name']=$parsed_array[8];

		$month_day = $tmp_array['mon']." ".$tmp_array['day'].", ";

		if(strpos($tmp_array['timeyear'],":") !== false) {
			//timeyear contains a time, so the listing omitted the year:
			//try the current year first
			$this_year = strtotime($month_day.date("Y"));
			if($this_year > time()){
				//date is from previous year since can't be in future
				$tmp_array['date']=strtotime($month_day.(date("Y")-1));
			} else {
				//date is from earlier this year
				$tmp_array['date']=$this_year;
			}
		} else {
			//timeyear contains year
			$tmp_array['date']=strtotime($month_day.$tmp_array['timeyear']);
		}
		return $tmp_array;

	}	
	
	function gettoken($ls, &$li, &$eol) {
		// Returns the next token of the data-file line $ls, scanning from
		// offset $li.
		//   $ls  - the line being parsed
		//   $li  - (by reference) scan position; on return it points at the
		//          character that ended the token, or at the end of the line
		//   $eol - (by reference) set to 'Y' when the scan reached the end
		//          of the line, otherwise 'N'
		// Leading spaces and tabs are skipped; the token ends at a space, a
		// tab, ';' or '='.  (The tab tests used to compare a single character
		// with a run of spaces and so could never match.)

		$ls = (string)$ls;
		$len = strlen($ls);

		// skip leading white space
		while ($li < $len and ($ls[$li] == ' ' or $ls[$li] == "\t")) {
			$li += 1;
		}

		$i1 = $li;

		// advance to the first delimiter or the end of the line
		while ($li < $len and strpos(" \t;=", $ls[$li]) === false) {
			$li += 1;
		}

		$eol = ($li < $len) ? 'N' : 'Y';

		return substr($ls,$i1,$li - $i1);

	}


function gettissue($ls, &$li, &$eol) {
	// Returns the next ';'-terminated entry of the data-file line $ls,
	// scanning from offset $li (used for the tissue list of EXPRESS lines).
	//   $ls  - the line being parsed
	//   $li  - (by reference) scan position; on return it points at the ';'
	//          that ended the entry, or at the end of the line
	//   $eol - (by reference) set to 'Y' when the scan reached the end of
	//          the line, otherwise 'N'
	// Leading spaces and tabs are skipped; everything up to the next ';' is
	// returned, embedded spaces included.  (The tab test used to compare a
	// single character with a run of spaces and so could never match.)

	$ls = (string)$ls;
	$len = strlen($ls);

	// skip leading white space
	while ($li < $len and ($ls[$li] == ' ' or $ls[$li] == "\t")) {
		$li += 1;
	}

	$i1 = $li;

	// advance to the next ';' or the end of the line
	while ($li < $len and $ls[$li] != ';') {
		$li += 1;
	}

	$eol = ($li < $len) ? 'N' : 'Y';

	return substr($ls,$i1,$li - $i1);

}

function get_data_file($gc) {
	// Downloads the gzipped data file for genome code $gc from NCBI
	// (remote name: ucfirst($gc).".data.gz") into
	// $this->local_file_dir.$gc.".data.gz".
	// Returns true when the file was downloaded, false when the local file
	// cannot be opened, the FTP connection fails or the transfer fails.
	// A partial file left by a failed transfer is removed.
	
	$local_file_name = $this->local_file_dir.$gc.".data.gz";
	
	if(file_exists($local_file_name)) {
		echo "<!--File exists: $local_file_name -->".$this->crlf;
	} else {
		echo "<!--File being created: $local_file_name -->".$this->crlf;
	}
	
	// "w+" creates the file or truncates an existing one.  Opening an old
	// copy with "r+" would leave its tail in place whenever the new
	// download is shorter, producing a corrupt archive.
	$fp=fopen($local_file_name,"w+");
	if(!$fp) {
		echo "<!--Failed to open file: $local_file_name -->".$this->crlf;
		return(false);
	}
	
	$ncbiFn = ucfirst($gc.".data.gz");
	$conn_id = $this->ncbi_connect();
	if(!$conn_id) {
		// ncbi_connect() has already reported the problem
		fclose($fp);
		unlink($local_file_name);
		return(false);
	}
	
	$downloaded = ftp_fget($conn_id, $fp, $ncbiFn, FTP_BINARY, 0);
	fclose($fp);
	$this->ncbi_close($conn_id);
	
	if(!$downloaded) {
		echo "<!--Failed to get data file via ftp: $ncbiFn -->".$this->crlf;
		unlink($local_file_name);
		return(false);
	}
	
	return(true);
	
}

function parser($gc) {
	// Parses the downloaded, gzipped UniGene data file for genome code $gc
	// ($this->local_file_dir.$gc.".data.gz") into three tables that are
	// dropped and re-created first:
	//   <gc>data     - one row per cluster
	//   <gc>sequence - one row per SEQUENCE line of a cluster
	//   <gc>express  - one row per tissue of a cluster's EXPRESS line
	// Afterwards the data file is deleted, the current build number is read
	// from the genome's info file at NCBI, and the genome's row in `unigene`
	// is stamped with today's date and that build.
	//
	// Lines are recognised by their leading tag.  A cluster starts at its
	// "ID" line; it is written to <gc>data when the next "ID" line arrives
	// or when the file ends, so the last cluster is stored as well.
	//
	// Returns true on success; false when $gc is not a plain identifier,
	// the file or database cannot be opened, a table cannot be re-created,
	// or the final update of `unigene` fails.

	// $gc becomes part of table names, which cannot be quoted like values,
	// so accept only letters, digits and underscore.
	if(!preg_match('/^[A-Za-z0-9_]+$/', (string)$gc)) {
		echo "<!--Invalid genome code-->".$this->crlf;
		return(false);
	}

	// data file has been downloaded and stored, open file
	// data is gzipped
	$data_file = $this->local_file_dir.$gc.".data.gz";
	$fp = gzopen($data_file,"r");
	if(!$fp) {
		echo "<!--Failed to open file: $data_file -->".$this->crlf;
		return(false);
	}

	// establish db connection
	if(!$this->db_connect()) {
		gzclose($fp);
		return(false);
	}

	// Drop and re-create the three genome-specific tables.  %s is replaced
	// by the genome code.
	$schema = array(
		"DROP TABLE IF EXISTS `%sdata`",
		"CREATE TABLE IF NOT EXISTS `%sdata` (
  `id` int(11) NOT NULL default '0',
  `title` varchar(255) default NULL,
  `gene` varchar(20) default NULL,
  `cytoband` varchar(20) default NULL,
  `mgi` varchar(20) default NULL,
  `locuslink` varchar(20) default NULL,
  `chromosome` varchar(20) default NULL,
  PRIMARY KEY  (`id`),
  KEY `%sDataLocuslink` (`locuslink`)
)",
		"DROP TABLE IF EXISTS `%sexpress`",
		"CREATE TABLE IF NOT EXISTS `%sexpress` (
  `id` int(11) NOT NULL default '0',
  `seq_no` int(11) NOT NULL default '0',
  `tissue` varchar(255) default NULL,
  PRIMARY KEY  (`id`,`seq_no`)
)",
		"DROP TABLE IF EXISTS `%ssequence`",
		"CREATE TABLE IF NOT EXISTS `%ssequence` (
  `id` int(11) NOT NULL default '0',
  `seq_no` int(11) NOT NULL default '0',
  `acc` varchar(20) default NULL,
  PRIMARY KEY  (`id`,`seq_no`),
  KEY `seq_acc` (`acc`(10)),
  KEY `acc_index` (`acc`(10))
)"
	);
	foreach ($schema as $sqlmaker) {
		// sprintf() ignores the surplus argument for one-placeholder templates
		$sql = sprintf($sqlmaker,$gc,$gc);
		if(!(mysql_query($sql))) {
			echo "<!--Error running sql: $sql <br>".mysql_error()."<br>-->".$this->crlf;
			gzclose($fp);
			return(false);
		}
	}

	// fields of the cluster currently being read
	$blank = array('title'=>"", 'gene'=>"", 'cytoband'=>"", 'mgi'=>"", 'locus'=>"", 'chromo'=>"");
	$rec = $blank;
	$have_record = false;  // becomes true once an ID line has been seen
	$id = 1;
	$seq_seq_no = 0;
	$li = 0;
	$eol = 'N';

	echo "<p>Parsing downloaded data (one dot for each cluster stored) ";
	while (($ls = gzgets($fp)) !== false) {
		$li = 0;
		$token = $this->gettoken($ls, $li, $eol);
		
		switch ($token) {
			case "ID":
				// a new cluster begins: store the one just completed
				if($have_record) {
					$this->_store_cluster($gc, $id, $rec);
				}
				// start clean so optional fields missing from this cluster
				// do not inherit the previous cluster's values
				$rec = $blank;
				$seq_seq_no = 0;
				$have_record = true;
				// the cluster number is what follows the "." (e.g. "Hs.2" -> 2)
				$rest = trim(substr($ls,$li));
				$id = (int)(substr($rest,strpos($rest,".")+1));
				break;
			
			case "TITLE":
				$rec['title'] = trim(substr($ls,$li));
				break;
			
			case "GENE":
				$rec['gene'] = trim(substr($ls,$li));
				break;
				
			case "CYTOBAND":
				$rec['cytoband'] = trim(substr($ls,$li));
				break;
				
			case "MGI":
				$rec['mgi'] = trim(substr($ls,$li));
				break;
				
			case "LOCUSLINK":
				$rec['locus'] = trim(substr($ls,$li));
				break;
				
			case "CHROMOSOME":
				$rec['chromo'] = trim(substr($ls,$li));
				break;
				
			case "SEQUENCE":
				$seq_seq_no = $seq_seq_no + 1;
				// first token after the tag is the key ("ACC"), then skip
				// the "=" and read the accession itself
				$token = $this->gettoken($ls, $li, $eol);
				$li = $li + 1;
				$acc = $this->gettoken($ls, $li, $eol);
				//trim any trailing ".n" from acc before inserting
				if (!(strpos($acc,".") === false)) {
					$acc = substr($acc,0,strpos($acc,"."));
				}

				$c = "insert into ".$gc."sequence values ($id, $seq_seq_no, '".mysql_real_escape_string($acc)."')";
				mysql_query($c);
				break;
				
			case "EXPRESS":
				// NOTE(review): only entries followed by a ';' are stored, so
				// a final entry without a trailing ';' is not inserted -
				// confirm against the data file format.
				$exp_seq_no = 1;
				$tissue = $this->gettissue($ls, $li, $eol);
				while ($eol == 'N') {
					$c = "insert into ".$gc."express values ($id, $exp_seq_no, '".mysql_real_escape_string($tissue)."')";
					mysql_query($c);
					$exp_seq_no += 1;
					$li += 1;  // step over the ';'
					$tissue = $this->gettissue($ls, $li, $eol);
				}
				break;

			// every other tag (STS, PROTSIM, SCOUNT, the "//" record
			// separator, blank lines, ...) carries nothing that is stored
		}
	}

	// store the final cluster of the file
	if($have_record) {
		$this->_store_cluster($gc, $id, $rec);
	}

	mysql_query('commit');
	echo "<br><br>Parse Complete</p>".$this->crlf;
	
	//close and erase temp file
	gzclose($fp);
	unlink($data_file);
	
	//get and store build number
	$build = "";
	$conn_id = $this->ncbi_connect();
	if($conn_id) {
		$info = $this->read_info(ucfirst($gc).".info",$conn_id);
		$this->ncbi_close($conn_id);
		if(is_array($info)) {
			$build = $info['build'];
		}
	}
	if($build == "") {
		echo "<!--Build number could not be determined for ".$gc."-->".$this->crlf;
	}

	//set new infodatelocal in unigene to mark download
	$sql = "update unigene set infodatelocal='".date("Y-m-d")."', buildlocal='".mysql_real_escape_string($build)."' where genomecode='".$gc."'";
	if(!(mysql_query($sql))) {
		echo "<!--Error updating unigene table: $sql".$this->crlf.mysql_error()."-->".$this->crlf;
		return(false);
	} else {
		return(true);
	}

}

function _store_cluster($gc, $id, $rec) {
	// Helper for parser(): inserts one cluster row into the <gc>data table.
	//   $gc  - genome code (already validated by parser())
	//   $id  - numeric cluster id
	//   $rec - array with keys title, gene, cytoband, mgi, locus, chromo
	// Prints "." on success or "x" on failure (details in an HTML comment)
	// and returns true/false accordingly.
	$values = array();
	foreach (array('title','gene','cytoband','mgi','locus','chromo') as $key) {
		$values[] = "'".mysql_real_escape_string($rec[$key])."'";
	}
	$c = "insert into ".$gc."data values (".(int)$id.", ".implode(", ", $values).")";

	if(mysql_query($c)) {
		echo ".<!-- $id ".htmlspecialchars($rec['title'])." -->".$this->crlf;
		return(true);
	}
	echo "x<!--Error $c ".$this->crlf.mysql_error()."-->".$this->crlf;
	return(false);
}
	
}
?>