/*
 *  eqtl/genome.c 
 * 
 *  $Author: baptiste $, $Date: 2008-05-13 15:33:47 $, $Version$
 *  
 *  Libgdl : a C library for statistical genetics
 * 
 *  Copyright (C) 2003-2006  Jean-Baptiste Veyrieras, INRA, France.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA * 
 */

#include <math.h>

#include <gdl/gdl_common.h>
#include <gdl/gdl_errno.h>
#include <gdl/gdl_string.h>
#include <gdl/gdl_io.h>
#include <gdl/gdl_hash.h>
#include <gdl/gdl_list.h>
#include <gdl/gdl_snp_genome.h>
#include <gdl/gdl_genex_genome.h>
#include <gdl/gdl_eqtl_genome.h>

#include "gene_entry.c"

/*
 * Build the path of the database file that stores chromosome i.
 *
 * The path is formatted as "<g->dbdir>/<g->chroms[i]>.db". The string
 * produced by gdl_string_sprintf is returned unchanged; the callers in this
 * file release it with gdl_string_free.
 */
static gdl_string *
gdl_eqtl_genome_chromfile (const gdl_eqtl_genome * g, size_t i)
{
	const gdl_string * chrom_name = g->chroms[i];
	gdl_string * path;

	path = gdl_string_sprintf ("%s/%s.db", g->dbdir, chrom_name);

	return path;
}

/*
 * Build an eQTL genome from SNP and expression inputs, writing one database
 * file per chromosome under dbdir.
 *
 * config_file, format and dbdir are handed to gdl_snp_genome_alloc;
 * config_file, probe_dir, refseq_dir and dbdir to gdl_genex_genome_alloc.
 * annot_dir and gmap_dir are optional and skipped when null; a true cnv_data
 * triggers gdl_snp_genome_cnv on config_file.
 *
 * Returns the newly allocated genome. None of the helper calls is checked
 * for failure here.
 */
gdl_eqtl_genome *
gdl_eqtl_genome_alloc (const gdl_string * config_file, 
							  const gdl_string * probe_dir,
							  const gdl_string * annot_dir,
							  const gdl_string * refseq_dir,
							  const gdl_string * gmap_dir,
							  const gdl_string * dbdir,
                       gdl_snp_data_format * format,
                       gdl_boolean cnv_data)
{
	size_t i, j, ngene=0;
	gdl_eqtl_chromosome * chrom;
	gdl_genex_genome * genex;
	gdl_snp_genome   * gsnp;
	gdl_eqtl_genome  * g;
	
	/* Zero-initialised, so fields not assigned below start out as 0. */
	g = GDL_CALLOC (gdl_eqtl_genome, 1);
	
	g->dbdir = gdl_string_clone (dbdir);
	
	gsnp  = gdl_snp_genome_alloc (config_file, format, dbdir);
	genex = gdl_genex_genome_alloc (config_file, probe_dir, refseq_dir, dbdir);
	
	if (annot_dir)
	{
		gdl_snp_genome_annotation (gsnp, annot_dir);
	}
	if (cnv_data)
	{
		gdl_snp_genome_cnv (gsnp, config_file);
	}
	if (gmap_dir)
	{
		gdl_snp_genome_gmap (gsnp, gmap_dir);	
	}
	
	g->nchrom  = gsnp->nchrom;
	g->chroms  = GDL_MALLOC (gdl_string *, g->nchrom);
	/* Take the dictionary pointer over from the SNP genome and clear it on
	 * the source -- presumably so gdl_snp_genome_free below does not release
	 * it; confirm against gdl_snp_genome_free. */
	g->dico    = gsnp->dico;
	gsnp->dico = 0;
	
	/* Pass 1: combine SNP and expression data per chromosome and write the
	 * result to the chromosome's database file. Chromosome i of genex is
	 * paired with chromosome i of gsnp. */
	for (i = 0; i < gsnp->nchrom; i++)
	{
		gdl_snp_chromosome   * snp_data = gdl_snp_genome_get (gsnp, i);
		gdl_genex_chromosome * exp_data = gdl_genex_genome_get (genex, i);
		
		g->chroms[i] = gdl_string_clone (gsnp->chroms[i]);
		chrom = gdl_eqtl_chromosome_alloc (snp_data, exp_data);
		ngene += chrom->ngene;
		gdl_eqtl_genome_set (g, i, chrom);
		
		/* Only the genes/probes arrays and the two input chromosomes are
		 * released here.
		 * NOTE(review): the chrom structure itself is not freed in this
		 * loop -- confirm whether that is an intentional ownership rule or
		 * a leak. */
		GDL_FREE (chrom->genes);
		GDL_FREE (chrom->probes);
		gdl_snp_chromosome_free (snp_data);
		gdl_genex_chromosome_free (exp_data);
	}
	
	/* NOTE(review): presumably removes the intermediate genomes' on-disk
	 * data before their in-memory structures are freed -- confirm. */
	gdl_genex_genome_rm (genex);
	gdl_snp_genome_rm (gsnp);
	
	gdl_genex_genome_free (genex);
	gdl_snp_genome_free (gsnp);
	
	/* Pass 2: reload every chromosome from disk and index its genes by name;
	 * each entry is built from (chromosome index, gene index).
	 * NOTE(review): the local ngene sizes the table but is never stored in
	 * g->ngene -- confirm whether any caller relies on that field. */
	g->genes = gdl_hashtable_alloc (gdl_eqtl_gene_entry_interface, ngene);
	for (i = 0; i < g->nchrom; i++)
	{
		chrom = gdl_eqtl_genome_get (g, i);
		for (j = 0; j < chrom->ngene; j++)
		{
			gdl_eqtl_gene * gene = chrom->genes[j];
			gdl_eqtl_gene_entry * entry = gdl_eqtl_gene_entry_alloc (i, j);
			/* The meaning of the trailing 1 is defined by gdl_hashtable_add
			 * (presumably an ownership flag -- confirm). */
			gdl_hashtable_add (g->genes, gene->name, entry, 1);
		}
		gdl_eqtl_chromosome_free (chrom); 
	}
	
	return g;
}

/*
 * Release a genome and the members it holds: every chromosome name, the
 * name array, the database directory string, the annotation dictionary, the
 * gene hashtable, the sample information and finally the structure itself.
 *
 * A null v is accepted and ignored.
 */
void
gdl_eqtl_genome_free (gdl_eqtl_genome * v)
{
	size_t chrom_idx;

	if (!v)
	{
		return;
	}

	for (chrom_idx = 0; chrom_idx < v->nchrom; chrom_idx++)
	{
		gdl_string_free (v->chroms[chrom_idx]);
	}
	GDL_FREE (v->chroms);

	gdl_string_free (v->dbdir);
	gdl_snp_annot_dico_free (v->dico);
	gdl_hashtable_free (v->genes);
	gdl_eqtl_sample_info_free (v->sample_info);

	GDL_FREE (v);
}

/*
 * Return the number of chromosomes recorded in the genome (g->nchrom).
 */
size_t
gdl_eqtl_genome_size (const gdl_eqtl_genome * g)
{
	const size_t chrom_count = g->nchrom;

	return chrom_count;
}

/*
 * Load chromosome i from its database file.
 *
 * The file path comes from gdl_eqtl_genome_chromfile and the content is read
 * with gdl_eqtl_chromosome_fread. On success the chromosome's name and dico
 * fields are set to the genome's own g->chroms[i] and g->dico pointers (no
 * copy is made), so the result is only meaningful while g is alive.
 *
 * Returns the chromosome, or NULL when the file could not be opened or
 * nothing could be read from it. The callers in this file release the result
 * with gdl_eqtl_chromosome_free.
 */
gdl_eqtl_chromosome *
gdl_eqtl_genome_get (const gdl_eqtl_genome * g, size_t i)
{
	gdl_string * file;
	FILE * stream;
	gdl_eqtl_chromosome * c = NULL;
	
	file = gdl_eqtl_genome_chromfile (g, i);
	
	stream = gdl_fileopen (file, "r");
	
	/* How gdl_fileopen reports failure is not visible from this file; guard
	 * against a null stream so it is neither read from nor closed. */
	if (stream)
	{
		c = gdl_eqtl_chromosome_fread (stream);
		
		/* Only touch the chromosome if the read produced one. */
		if (c)
		{
			c->name = g->chroms[i];
			c->dico = g->dico;
		}
		
		gdl_fileclose (file, stream);
	}
	
	gdl_string_free (file);
	
	return c;
}

/*
 * Load the chromosome whose name equals `name`.
 *
 * The genome's chromosome names are compared with strcmp in index order; the
 * first match is loaded through gdl_eqtl_genome_get. Returns NULL when no
 * chromosome has that name.
 */
gdl_eqtl_chromosome *
gdl_eqtl_genome_lookup (const gdl_eqtl_genome * g, const gdl_string * name)
{
	size_t chrom_idx = 0;

	while (chrom_idx < g->nchrom)
	{
		if (strcmp (g->chroms[chrom_idx], name) == 0)
		{
			return gdl_eqtl_genome_get (g, chrom_idx);
		}
		chrom_idx++;
	}

	return NULL;
}

/*
 * Write chromosome c to the database file of chromosome i.
 *
 * The file path comes from gdl_eqtl_genome_chromfile and the content is
 * written with gdl_eqtl_chromosome_fwrite.
 *
 * Returns the status of gdl_eqtl_chromosome_fwrite, or GDL_EINVAL when the
 * file could not be opened for writing.
 */
int
gdl_eqtl_genome_set (const gdl_eqtl_genome * g, size_t i, gdl_eqtl_chromosome * c)
{
	int status = GDL_EINVAL;
	gdl_string * file;
	FILE * stream;
	
	file = gdl_eqtl_genome_chromfile (g, i);
	
	stream = gdl_fileopen (file, "w");
	
	/* How gdl_fileopen reports failure is not visible from this file; guard
	 * against a null stream so it is neither written to nor closed. */
	if (stream)
	{
		status = gdl_eqtl_chromosome_fwrite (stream, c);
		
		gdl_fileclose (file, stream);
	}
	
	gdl_string_free (file);
	
	return status;
}

/*
 * Flag the probes of every chromosome as kept (ignore='n') or discarded
 * (ignore='y') and write each modified chromosome back to its database file.
 *
 * v                genome whose chromosomes are loaded with
 *                  gdl_eqtl_genome_get and saved with gdl_eqtl_genome_set
 * prop             proportion of probes to keep; must satisfy 0 < prop <= 1
 * extreme_outlier  when true, probes for which
 *                  gdl_genex_probe_extreme_outlier returns a non-zero count
 *                  are discarded first and excluded from the ranking
 *
 * Probes are scored with gdl_genex_probe_var_score and ranked from the
 * gdl_sort_index permutation; the floor(n*prop) best-ranked probes are kept,
 * where n is the number of probes that entered the ranking.
 *
 * Returns 0 when prop is out of range, the number of outlier-discarded
 * probes when prop == 1, and n - floor(n*prop) otherwise.
 */
size_t
gdl_eqtl_genome_select_most_variable_probe (gdl_eqtl_genome * v, const double prop, gdl_boolean extreme_outlier)
{
	size_t i, j, k, n, m, size, out, rm=0;
	size_t * idx;
	double * vpr;
	gdl_eqtl_chromosome * c;
	gdl_eqtl_probe ** p;
	
	/* Negated range test so that a NaN prop is rejected as well. */
	if (!(prop > 0 && prop <= 1)) return 0;
	
	/* Pass 1: count the probes entering the ranking. Flags are only rewritten
	 * (and the chromosome saved) when everything is kept or when the outlier
	 * filter is active. */
	for (n = i = 0; i < v->nchrom; i++)
	{
		c = gdl_eqtl_genome_get (v, i);
		if (prop == 1 || extreme_outlier)
		{
			m = c->nprobe;
			p = c->probes;
			for (k = 0; k < m; k++)
			{
				if (extreme_outlier
				    && (out=gdl_genex_probe_extreme_outlier (p[k], c->pop_sizes, c->npop))!=0)
				{
					if (v->logger)
					{
						/* size_t is printed through unsigned long: %d does
						 * not match a size_t argument. */
						fprintf (v->logger, "Discard probe %s: contains %lu extreme outlier(s)\n", p[k]->name, (unsigned long) out);
					}
					p[k]->ignore='y';
					rm++;
				}
				else
				{
					p[k]->ignore='n';
					n++;
				}
			}
			if (prop == 1)
			{
				gdl_eqtl_chromosome_select_gene (c);
			}
			gdl_eqtl_genome_set (v, i, c);
		}
		else
		{
			n += c->nprobe;
		}
		gdl_eqtl_chromosome_free (c);
	}
	
	if (prop == 1) return rm;
	
	size = (size_t) floor (n*prop);
	/* NOTE(review): this replaces the outlier count accumulated in pass 1, so
	 * outlier-discarded probes are not part of the returned value when
	 * prop < 1 -- kept as in the original; confirm the intended contract. */
	rm   = n - size;
	
	if (v->logger)
	{
		fprintf (v->logger, "--\n");
		fprintf (v->logger, "Select the %lu most variable probes (of %lu)\n", (unsigned long) size, (unsigned long) n);
		fprintf (v->logger, "--\n");
		fprintf (v->logger, "Rank\tChrom\tName\tScore\n");
	}
	
	vpr = GDL_MALLOC (double, n);
	
	/* Pass 2: score every ranked probe, in chromosome then probe order. The
	 * outlier flags read here are the ones saved to disk during pass 1. */
	for (j = i = 0; i < v->nchrom; i++)
	{
		c = gdl_eqtl_genome_get (v, i);
		m = c->nprobe;
		p = c->probes;
		for (k = 0; k < m; k++)
		{
			if (extreme_outlier && p[k]->ignore == 'y') continue;
			vpr[j++] = gdl_genex_probe_var_score (p[k], c->pop_sizes, c->npop);
		}
		gdl_eqtl_chromosome_free (c);
	}
	
	/* idx[0..n-1] receives the sort permutation; idx[n..2n-1] is then filled
	 * with the rank of each scored probe. Ranks run from 1 to n, the last
	 * position of the permutation getting rank 1 (assumes gdl_sort_index
	 * orders ascending, so rank 1 is the highest score -- confirm). */
	idx = GDL_MALLOC (size_t, 2*n);
	gdl_sort_index (idx, vpr, 1, n);
	for (i = 0; i < n; i++) idx[n+idx[i]]=n-i;
	
	/* Pass 3: keep the `size` best-ranked probes and save each chromosome. */
	for (j = i = 0; i < v->nchrom; i++)
	{
		c = gdl_eqtl_genome_get (v, i);
		m = c->nprobe;
		p = c->probes;
		for (k = 0; k < m; k++)
		{
			if (extreme_outlier && p[k]->ignore == 'y') continue;
			/* Ranks start at 1, so <= keeps exactly `size` probes, which is
			 * what the log header and the returned count announce. */
			if (idx[n+j] <= size) 
			{
				p[k]->ignore='n';
				if (v->logger)
				{
					fprintf (v->logger, "%lu\t%s\t%s\t%g\n", (unsigned long) idx[n+j], c->name, p[k]->name, vpr[j]);
				}
			}
			else 
			{
				p[k]->ignore='y';
			}
			j++;
		}
		gdl_eqtl_chromosome_select_gene (c);
		gdl_eqtl_genome_set (v, i, c);
		gdl_eqtl_chromosome_free (c);
	}
	GDL_FREE (idx);
	GDL_FREE (vpr);
	
	return rm;
}

/*
 * Run SNP selection on every chromosome and save the result.
 *
 * Each chromosome is loaded, its SNP data is given the chromosome name and
 * the genome's logger, gdl_snp_chromosome_select_snp is applied with fmin,
 * gmin and poly_pop, and the chromosome is written back before being freed.
 *
 * Returns the sum of the values returned by gdl_snp_chromosome_select_snp.
 */
size_t
gdl_eqtl_genome_select_snp (gdl_eqtl_genome * v, const double fmin, const double gmin, const gdl_boolean poly_pop)
{
	size_t chrom_idx;
	size_t removed = 0;

	for (chrom_idx = 0; chrom_idx < v->nchrom; chrom_idx++)
	{
		gdl_eqtl_chromosome * chrom = gdl_eqtl_genome_get (v, chrom_idx);

		chrom->snp_data->name = v->chroms[chrom_idx];
		gdl_snp_chromosome_set_logger (chrom->snp_data, v->logger);

		removed += gdl_snp_chromosome_select_snp (chrom->snp_data, fmin, gmin, poly_pop);

		gdl_eqtl_genome_set (v, chrom_idx, chrom);
		gdl_eqtl_chromosome_free (chrom);
	}

	return removed;
}

/*
 * Return the total number of probes in the genome, obtained by loading each
 * chromosome from disk and summing its nprobe field.
 */
size_t
gdl_eqtl_genome_probe_size (const gdl_eqtl_genome * g)
{
	size_t chrom_idx;
	size_t total = 0;

	for (chrom_idx = 0; chrom_idx < g->nchrom; chrom_idx++)
	{
		gdl_eqtl_chromosome * chrom = gdl_eqtl_genome_get (g, chrom_idx);

		total += chrom->nprobe;
		gdl_eqtl_chromosome_free (chrom);
	}

	return total;
}

/*
 * Return the sum of snp_data->chrom->size over all chromosomes, each of
 * which is loaded from disk for the purpose and freed afterwards.
 */
size_t
gdl_eqtl_genome_snp_size (const gdl_eqtl_genome * g)
{
	size_t chrom_idx;
	size_t total = 0;

	for (chrom_idx = 0; chrom_idx < g->nchrom; chrom_idx++)
	{
		gdl_eqtl_chromosome * chrom = gdl_eqtl_genome_get (g, chrom_idx);

		total += chrom->snp_data->chrom->size;
		gdl_eqtl_chromosome_free (chrom);
	}

	return total;
}

/*
 * Apply gdl_eqtl_chromosome_collapse_gene to every chromosome: each one is
 * loaded from disk, processed, written back and freed.
 */
void
gdl_eqtl_genome_collapse_gene (gdl_eqtl_genome * g)
{
	size_t chrom_idx;

	for (chrom_idx = 0; chrom_idx < g->nchrom; chrom_idx++)
	{
		gdl_eqtl_chromosome * chrom = gdl_eqtl_genome_get (g, chrom_idx);

		gdl_eqtl_chromosome_collapse_gene (chrom);
		gdl_eqtl_genome_set (g, chrom_idx, chrom);
		gdl_eqtl_chromosome_free (chrom);
	}
}

/*
 * Apply gdl_eqtl_chromosome_meta_probeset with meta_type to every
 * chromosome: each one is loaded from disk, processed, written back and
 * freed. When the genome has a logger, it is assigned to the chromosome
 * before processing.
 */
void
gdl_eqtl_genome_meta_probeset (gdl_eqtl_genome * g, const size_t meta_type)
{
	size_t chrom_idx;

	for (chrom_idx = 0; chrom_idx < g->nchrom; chrom_idx++)
	{
		gdl_eqtl_chromosome * chrom = gdl_eqtl_genome_get (g, chrom_idx);

		if (g->logger)
		{
			chrom->logger = g->logger;
		}

		gdl_eqtl_chromosome_meta_probeset (chrom, meta_type);
		gdl_eqtl_genome_set (g, chrom_idx, chrom);
		gdl_eqtl_chromosome_free (chrom);
	}
}

/*
 * Look `name` up in the genome's gene hashtable and return whatever
 * gdl_hashtable_lookup yields for it.
 */
const gdl_eqtl_gene_entry *
gdl_eqtl_genome_search_gene (const gdl_eqtl_genome * v, const gdl_string * name)
{
	const gdl_eqtl_gene_entry * entry = gdl_hashtable_lookup (v->genes, name);

	return entry;
}

/*
 * Read a genome description from a binary stream.
 *
 * Layout consumed, in order: the database directory string, the chromosome
 * count, one name string per chromosome, a one-byte flag ('1' when an
 * annotation dictionary follows), the optional dictionary, the gene count,
 * the gene hashtable and, unless the stream is at its end, the sample
 * information.
 *
 * Returns the genome, or 0 when stream is null. Each read is checked with
 * GDL_FREAD_STATUS.
 * NOTE(review): the behaviour of GDL_FREAD_STATUS on a mismatch is defined
 * elsewhere; if it returns early, v is not released on that path -- confirm.
 */
gdl_eqtl_genome *
gdl_eqtl_genome_fread (FILE * stream)
{
	if (stream)
	{
		int status;
		size_t i;
		unsigned char has;
		gdl_eqtl_genome * v;
		
		/* Zero-initialise: dico, sample_info and logger are not assigned on
		 * every path below, and gdl_eqtl_genome_free releases dico and
		 * sample_info unconditionally. */
		v = GDL_CALLOC (gdl_eqtl_genome, 1);
		
		v->dbdir = gdl_string_fread (stream);
		GDL_FREAD_STATUS (v->dbdir!=0, 1);
		status = fread (&v->nchrom, sizeof(size_t), 1, stream);
		GDL_FREAD_STATUS (status, 1);
		v->chroms = GDL_MALLOC (gdl_string *, v->nchrom);
		for (i = 0; i < v->nchrom; i++)
		{
			v->chroms[i] = gdl_string_fread (stream);
			GDL_FREAD_STATUS (v->chroms[i]!=0, 1);
		}
		status = fread (&has, sizeof(unsigned char), 1, stream);
		GDL_FREAD_STATUS (status, 1);
		if (has == '1')
		{
			v->dico = gdl_snp_annot_dico_fread (stream);
			GDL_FREAD_STATUS (v->dico!=0, 1);
		}
		status = fread (&(v->ngene), sizeof(size_t), 1, stream);
		GDL_FREAD_STATUS (status, 1);
		v->genes = gdl_hashtable_alloc (gdl_eqtl_gene_entry_interface, v->ngene);
		status = gdl_hashtable_fread (stream, v->genes);
		GDL_FREAD_STATUS (status, GDL_SUCCESS);
		
		/* The sample information is an optional trailing section. */
		if (gdl_eoff(stream))
		{
			return v;
		}
		
		v->sample_info = gdl_eqtl_sample_info_fread (stream);
		
		return v;
	}
	return 0;
}

/*
 * Write a genome description to a binary stream.
 *
 * Layout produced, in order: the database directory string, the chromosome
 * count, one name string per chromosome, a one-byte flag ('1' when an
 * annotation dictionary is present, '0' otherwise), the dictionary when
 * present, the size of the gene hashtable, the hashtable itself and, when
 * present, the sample information.
 *
 * Returns GDL_EINVAL when stream or v is null and GDL_SUCCESS once
 * everything has been written. Each write is checked with
 * GDL_FWRITE_STATUS.
 */
int
gdl_eqtl_genome_fwrite (FILE * stream, const gdl_eqtl_genome * v)
{
	int status;
	size_t chrom_idx, gene_count;
	unsigned char dico_flag;

	if (!stream || !v)
	{
		return GDL_EINVAL;
	}

	/* Database directory and chromosome names. */
	status = gdl_string_fwrite (stream, v->dbdir);
	GDL_FWRITE_STATUS (status, GDL_SUCCESS);

	status = fwrite (&v->nchrom, sizeof(size_t), 1, stream);
	GDL_FWRITE_STATUS (status, 1);

	for (chrom_idx = 0; chrom_idx < v->nchrom; chrom_idx++)
	{
		status = gdl_string_fwrite (stream, v->chroms[chrom_idx]);
		GDL_FWRITE_STATUS (status, GDL_SUCCESS);
	}

	/* Presence flag, then the optional annotation dictionary. */
	if (v->dico)
	{
		dico_flag = '1';
	}
	else
	{
		dico_flag = '0';
	}
	status = fwrite (&dico_flag, sizeof(unsigned char), 1, stream);
	GDL_FWRITE_STATUS (status, 1);

	if (v->dico)
	{
		status = gdl_snp_annot_dico_fwrite (stream, v->dico);
		GDL_FWRITE_STATUS (status, GDL_SUCCESS);
	}

	/* Gene index: entry count followed by the table. */
	gene_count = gdl_hashtable_size (v->genes);
	status = fwrite (&gene_count, sizeof(size_t), 1, stream);
	GDL_FWRITE_STATUS (status, 1);

	status = gdl_hashtable_fwrite (stream, v->genes);
	GDL_FWRITE_STATUS (status, GDL_SUCCESS);

	/* Optional trailing sample information. */
	if (v->sample_info)
	{
		status = gdl_eqtl_sample_info_fwrite (stream, v->sample_info);
		GDL_FWRITE_STATUS (status, GDL_SUCCESS);
	}

	return GDL_SUCCESS;
}

/*
 * Install `stream` as the genome's logger and return the stream that was
 * installed before the call.
 */
FILE *
gdl_eqtl_genome_set_logger (gdl_eqtl_genome * g, FILE * stream)
{
	FILE * const previous = g->logger;

	g->logger = stream;

	return previous;
}
