/*
 *  eqtl/chrom.c 
 * 
 *  $Author: baptiste $, $Date: 2008-05-13 15:33:47 $, $Version$
 *  
 *  Libgdl : a C library for statistical genetics
 * 
 *  Copyright (C) 2003-2006  Jean-Baptiste Veyrieras, INRA, France.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA * 
 */
#include <math.h>

#include <gdl/gdl_common.h>
#include <gdl/gdl_errno.h>
#include <gdl/gdl_math.h>
#include <gdl/gdl_string.h>
#include <gdl/gdl_io.h>
#include <gdl/gdl_hash.h>
#include <gdl/gdl_list.h>
#include <gdl/gdl_pca.h>
#include <gdl/gdl_matrix.h>
#include <gdl/gdl_vector.h>
#include <gdl/gdl_sort_double.h>
#include <gdl/gdl_statistics_double.h>
#include <gdl/gdl_snp_chromosome.h>
#include <gdl/gdl_snp_annotation.h>
#include <gdl/gdl_genex_chromosome.h>
#include <gdl/gdl_eqtl_chromosome.h>

/**
 * Anchor every expression block of the chromosome onto the SNP array.
 *
 * Each block first receives the txStart/txEnd of its gene as start/end.
 * The SNP array is then scanned once and snp_start/snp_end of each block
 * are filled with indexes into that array.
 *
 * During the scan the value 0 means "not assigned yet"; the final pass
 * converts the recorded values into the indexes stored in the blocks.
 */
void
gdl_eqtl_chromosome_anchor_gene (gdl_eqtl_chromosome * chrom)
{
	const size_t snp_count = chrom->snp_data->chrom->size;
	gdl_snp ** snp_map = chrom->snp_data->chrom->snps;
	gdl_genex_gene ** gene_tab = chrom->genes;
	gdl_genex_block ** flat;
	gdl_genex_block * blk;
	size_t g, b, s, n, total = 0;

	// total number of blocks over all the genes
	for (g = 0; g < chrom->ngene; g++)
	{
		total += gene_tab[g]->size;
	}

	// flatten the blocks into one array and reset their anchors
	flat = GDL_MALLOC (gdl_genex_block *, total);
	n = 0;
	for (g = 0; g < chrom->ngene; g++)
	{
		for (b = 0; b < gene_tab[g]->size; b++)
		{
			blk = gene_tab[g]->blocks[b];
			flat[n] = blk;
			blk->start = gene_tab[g]->txStart;
			blk->end   = gene_tab[g]->txEnd;
			blk->snp_start = blk->snp_end = 0;
			n++;
		}
	}

	for (s = 0; s < snp_count; s++)
	{
		for (n = 0; n < total; n++)
		{
			blk = flat[n];
			if (snp_map[s]->position < blk->start)
			{
				// this SNP lies before the block
				continue;
			}
			if (snp_map[s]->position > blk->end && !blk->snp_end)
			{
				// first SNP seen past the end of the block
				blk->snp_end = s;
				if (!blk->snp_start)
				{
					blk->snp_start = s;
				}
			}
			else if (!blk->snp_start)
			{
				// recorded shifted by one so that 0 keeps meaning "unset"
				blk->snp_start = s + 1;
			}
		}
	}

	// shift the recorded values back by one
	for (n = 0; n < total; n++)
	{
		blk = flat[n];
		if (blk->snp_start)
		{
			blk->snp_start--;
			if (blk->snp_end)
			{
				blk->snp_end--;
			}
			else
			{
				// no SNP was found past the end of the block
				blk->snp_end = snp_count - 1;
			}
		}
		else if (blk->snp_end)
		{
			blk->snp_end--;
			blk->snp_start = blk->snp_end;
		}
	}

	GDL_FREE (flat);
}

/**
 * Build an eQTL chromosome from SNP data and expression data.
 *
 * The population sizes array and the snp_data pointer are stored as-is
 * (no copy is made here). The ploidy is read from the first population
 * of snp_data. Once the fields are set, the gene blocks are anchored on
 * the SNP array with gdl_eqtl_chromosome_anchor_gene().
 *
 * @param snp_data  SNP data of the chromosome
 * @param exp_data  expression data of the chromosome
 * @return the newly allocated chromosome
 */
gdl_eqtl_chromosome *
gdl_eqtl_chromosome_alloc (gdl_snp_chromosome * snp_data, gdl_genex_chromosome * exp_data)
{
	gdl_eqtl_chromosome * eqtl = GDL_CALLOC (gdl_eqtl_chromosome, 1);
	size_t pop;

	eqtl->snp_data  = snp_data;
	eqtl->ploidy    = snp_data->pops[0]->P;
	eqtl->npop      = exp_data->npop;
	eqtl->pop_sizes = exp_data->pop_sizes;

	// nindiv is accumulated on top of the value left by GDL_CALLOC
	for (pop = 0; pop < eqtl->npop; pop++)
	{
		eqtl->nindiv += eqtl->pop_sizes[pop];
	}

	eqtl->ngene  = gdl_genex_chromosome_gene_size (exp_data);
	eqtl->nprobe = gdl_genex_chromosome_probe_size (exp_data);
	eqtl->genes  = gdl_genex_chromosome_genes (exp_data);
	eqtl->probes = gdl_genex_chromosome_probes (exp_data);

	gdl_eqtl_chromosome_anchor_gene (eqtl);

	return eqtl;
}

/**
 * Release an eQTL chromosome together with everything it references:
 * the population sizes, every gene, every probe, the SNP data and
 * finally the structure itself. A null pointer is accepted and ignored.
 */
void
gdl_eqtl_chromosome_free (gdl_eqtl_chromosome * chrom)
{
	size_t n;

	if (!chrom)
	{
		return;
	}

	GDL_FREE (chrom->pop_sizes);

	for (n = 0; n < chrom->ngene; n++)
	{
		gdl_genex_gene_free (chrom->genes[n]);
	}
	GDL_FREE (chrom->genes);

	for (n = 0; n < chrom->nprobe; n++)
	{
		gdl_genex_probe_free (chrom->probes[n], chrom->npop);
	}
	GDL_FREE (chrom->probes);

	gdl_snp_chromosome_free (chrom->snp_data);
	GDL_FREE (chrom);
}

/**
 * Compute the flanking SNP window of every block of every gene.
 *
 * For each block, snp_up is moved upstream from snp_start while the
 * preceding SNP lies at or after (txStart - window_size), and snp_down is
 * moved downstream from snp_end while the next SNP lies at or before
 * (txEnd + window_size).
 *
 * When 'dico' is not null, every SNP that extends the window on either
 * side is passed to gdl_genex_annot_dico_add_count() for the block.
 *
 * Fix: the downstream loop used to test snps[k] but count snps[k-1], so
 * the block's own last SNP was counted and the last SNP of the window was
 * skipped. It now counts the SNP that was actually tested.
 *
 * @param chrom        the chromosome; its blocks must already be anchored
 * @param dico         optional annotation dictionary (may be null)
 * @param window_size  size of the window on each side of the transcript
 */
void
gdl_eqtl_chromosome_init_window (gdl_eqtl_chromosome * chrom, gdl_snp_annot_dico * dico, const long window_size)
{
	int k;
	size_t i, j, nsnp=chrom->snp_data->chrom->size;
	gdl_snp ** snps = chrom->snp_data->chrom->snps;
	gdl_genex_gene  * gene;
	gdl_genex_block  * block;
	
	for (i = 0; i < chrom->ngene; i++)
	{
		gene = chrom->genes[i];
		for (j = 0; j < gene->size; j++)
		{
			block = gene->blocks[j];
			// walk upstream: snps[k-1] is the candidate SNP
			for (k = block->snp_start; k > 0 && snps[k-1]->position >= (gene->txStart-window_size);k--)
			{
				if (dico)
					gdl_genex_annot_dico_add_count (block, snps[k-1], dico);
			}
			block->snp_up = k;
			// walk downstream: snps[k] is the candidate SNP
			for (k = block->snp_end+1; k < nsnp && snps[k]->position <= gene->txEnd+window_size;k++)
			{
				if (dico)
					gdl_genex_annot_dico_add_count (block, snps[k], dico);
			}
			block->snp_down = k-1;
			// NOTE(review): when the window spans the whole SNP array both
			// bounds are collapsed onto the last SNP; intent not visible here.
			if (block->snp_up==0 && block->snp_down==nsnp-1)
				block->snp_up=block->snp_down=nsnp-1;
		}
	}	
}

/**
 * Refresh the 'ignore' flags of the blocks and genes from their probes.
 *
 * A block is flagged 'y' when none of its probes has ignore == 'n'
 * (this includes a block without any probe), otherwise 'n'.
 * A gene is flagged 'y' when all of its blocks are flagged 'y'
 * (this includes a gene without any block), otherwise 'n'.
 */
void
gdl_eqtl_chromosome_select_gene (gdl_eqtl_chromosome * chrom)
{
	size_t g, b, p, ignored;

	for (g = 0; g < chrom->ngene; g++)
	{
		gdl_genex_gene * gene = chrom->genes[g];

		ignored = 0;
		for (b = 0; b < gene->size; b++)
		{
			gdl_genex_block * block = gene->blocks[b];
			int active = 0;

			// look for the first probe which is not ignored
			for (p = 0; p < block->size; p++)
			{
				if (block->probes[p]->ignore == 'n')
				{
					active = 1;
					break;
				}
			}

			if (active)
			{
				block->ignore = 'n';
			}
			else
			{
				block->ignore = 'y';
				ignored++;
			}
		}

		gene->ignore = (ignored == gene->size) ? 'y' : 'n';
	}
}

/**
 * Collapse the overlapping blocks of every gene holding more than one block.
 *
 * Every ordered pair of distinct blocks (b1 = blocks[j], b2 = blocks[k]) is
 * compared on its start/end coordinates:
 *  - b2 entirely inside b1      : b2 is removed from the gene
 *  - b2 starts inside b1        : gdl_genex_gene_merge_block (gene, j, k)
 *  - b1 starts inside b2        : gdl_genex_gene_merge_block (gene, j, k)
 *
 * NOTE(review): the loops run over gene->blocks while the helpers are
 * called on it. Only j is adjusted (when k < j); k is not, and b1 is not
 * reloaded after a call. Whether this is safe depends on what the two
 * helpers do to gene->blocks and gene->size -- to be confirmed.
 */
void
gdl_eqtl_chromosome_collapse_gene (gdl_eqtl_chromosome * c)
{
	size_t i, j, k;
	
	for (i = 0; i < c->ngene; i++)
	{
		gdl_eqtl_gene * gene = c->genes[i];
		if (gene->size > 1)
		{
			for (j = 0; j < gene->size; j++)
			{
				gdl_eqtl_block * b1 = gene->blocks[j];
				for (k = 0; k < gene->size; k++)
				{
					if (k == j) continue; 
					gdl_eqtl_block * b2 = gene->blocks[k];
					// b2 is entirely contained in b1
					if (b2->start >= b1->start
					    && b2->end <= b1->end)
					{
						gdl_genex_gene_remove_block (gene, k);
						// presumably the blocks after k shift down by one -- confirm
						if (k < j) j--;
					}
					// b2 starts inside b1 and (previous test failed) ends after it
					else if (b2->start >= b1->start && b2->start < b1->end)
					{
						gdl_genex_gene_merge_block (gene, j, k);
						if (k < j) j--;
					}
					// b1 starts inside b2
					else if (b1->start >= b2->start && b1->start < b2->end)
					{
						gdl_genex_gene_merge_block (gene, j, k);
						if (k < j) j--;
					}
				}
			}
		}	
	}
}

/**
 * Replace the probes of every block by a single "meta" probe.
 *
 * For each block a meta probe is allocated from the block probes and its
 * per-population data are computed from the probes whose ignore flag is
 * not 'y', according to meta_type:
 *   0 : mean over the probes, for each individual
 *   1 : median over the probes, for each individual
 *   3 : same as 0, after gdl_genex_probe_gaussian_quantile_normalize()
 *       has been called on each non-ignored probe
 *   4 : first axis of the projection returned by the PCA workspace,
 *       computed on the probe values minus the probe mean
 *
 * NOTE(review): for any other meta_type value the data buffer obtained
 * from GDL_MALLOC is not written before the mean/variance are computed
 * from it.
 *
 * When a block has no usable probe, its meta probe is flagged ignore='y'
 * and receives buffers obtained from GDL_CALLOC.
 *
 * Finally all the original probes are freed, c->probes / c->nprobe are
 * replaced by the meta probes (one per block, in gene/block order) and
 * each block is left with its meta probe only.
 */
void
gdl_eqtl_chromosome_meta_probeset (gdl_eqtl_chromosome * c, const size_t meta_type)
{
	 size_t g,b,p,pp,i,j,np;
	 gdl_list * meta_probes = gdl_list_alloc (gdl_list_default);
	 
	 for(g = 0; g < c->ngene; g++)
	 {
	 	  gdl_eqtl_gene * gene = c->genes[g];
	 	  
	 	  for (b = 0; b < gene->size; b++)
	 	  {
	 	  	   gdl_eqtl_block * block = gene->blocks[b];
	 	  	   // Count the non-ignored probes (np); normalize them first when meta_type is 3
	 	  	   for(np = p = 0; p < block->size; p++)
	 	  	   {
	 	  	   	if (block->probes[p]->ignore=='y')
	 	  	   	{
	 	  	   		continue;
	 	  	   	}
	 	  	   	if (meta_type==3)
	 	  	   	{
	 	  	   		gdl_genex_probe_gaussian_quantile_normalize (block->probes[p], c->pop_sizes, c->npop);
	 	  	   	}
	 	  	   	np++;
	 	  	   }
	 	  	  	
	 	  	  	// One meta probe per block, queued in gene/block order
	 	  	  	gdl_eqtl_probe * meta_probe = gdl_genex_meta_probe_alloc (block->probes, block->size, c->npop); 
		 	  	gdl_list_push_back (meta_probes, meta_probe, 0);
	 	  	   
	 	  	   if (!np)
	 	  	   {
	 	  	   	// No usable probe: flag the meta probe and give it GDL_CALLOC'ed buffers
	 	  	   	meta_probe->ignore = 'y';
	 	  	   	for(i = 0; i < c->npop; i++)
	 	  	   		meta_probe->data[i] = GDL_CALLOC (double, c->pop_sizes[i]);
	 	  	   }
	 	  	   else
	 	  	   {
		 	  	   for(i = 0; i < c->npop; i++)
		 	  	   {
		 	  	   	// x: one row per individual of population i, one column per non-ignored probe
		 	  	   	gdl_matrix * x = gdl_matrix_alloc (c->pop_sizes[i], np);
		 	  	   	for(pp = p = 0; p < block->size; p++)
		 	  	   	{
		 	  	   		if (block->probes[p]->ignore=='y')
		 	  	   			continue;
		 	  	   		gdl_eqtl_probe * probe = block->probes[p];
		 	  	   		for(j = 0; j < c->pop_sizes[i]; j++)
		 	  	   		{
		 	  	   			double xjp = probe->data[i][j];
		 	  	   			if (meta_type==4)
		 	  	   			{
		 	  	   				// center on the probe mean before the PCA
		 	  	   				xjp -= probe->mean[i];
		 	  	   			}
		 	  	   			gdl_matrix_set (x, j, pp,  xjp);
		 	  	   		}
		 	  	   		pp++;
		 	  	   	}
		 	  	   	
		 	  	   	meta_probe->data[i] = GDL_MALLOC (double, c->pop_sizes[i]);
		 	  	   	
		 	  	   	if (meta_type != 4)
		 	  	   	{
			 	  	   	// Summarize each individual (row of x) over the probes
			 	  	   	for(j = 0; j < c->pop_sizes[i]; j++)
			 	  	   	{
			 	  	   		gdl_vector_view  xj = gdl_matrix_row (x, j);
			 	  	   		switch(meta_type)
				 	  	   	{
				 	  	   		case 0:
				 	  	   		case 3:
				 	  	   			meta_probe->data[i][j] = gdl_stats_mean (xj.vector.data, xj.vector.stride, np);
				 	  	   			break;
				 	  	   		case 1:
				 	  	   			// the row of x is sorted in place before taking the median
				 	  	   			gdl_sort (xj.vector.data, xj.vector.stride, np);
				 	  	   			meta_probe->data[i][j] = gdl_stats_median_from_sorted_data (xj.vector.data, xj.vector.stride, np);
				 	  	   			break;
				 	  	   	}
			 	  	   	}
		 	  	   	}
		 	  	   	else
		 	  	   	{
		 	  	   		gdl_pca_workspace * pca = gdl_pca_workspace_alloc (gdl_pca_covariance);
		 	  	   		// check the dimensionality of the matrix:
		 	  	   		// with fewer individuals than probes the transposed matrix is used
		 	  	   		if (c->pop_sizes[i] >= np)
		 	  	   		{
		 	  	   			gdl_pca_workspace_perform (pca, x);
		 	  	   		}
		 	  	   		else
		 	  	   		{
		 	  	   			gdl_matrix * tx = gdl_matrix_alloc (x->size2, x->size1);
								gdl_matrix_transpose_memcpy (tx, x);
		 	  	   			gdl_pca_workspace_perform_transpose (pca, tx);
		 	  	   			gdl_matrix_free (tx);
		 	  	   		}
		 	  	   		const gdl_matrix * U = gdl_pca_workspace_projection (pca);
		 	  	   		const gdl_vector * S = gdl_pca_workspace_weights (pca);
		 	  	   		if (c->logger)
		 	  	   		{
		 	  	   			// log the ratio of the first weight to the total variance
		 	  	   			double prop_first = gdl_vector_get (S, 0)/gdl_pca_workspace_tot_var (pca);
		 	  	   			fprintf (c->logger, "%s %s %s %d %g\n", c->name, gene->name, meta_probe->name, meta_probe->size, prop_first);	
		 	  	   		}
		 	  	   		// Get only the first axis
		 	  	   		for(j = 0; j < c->pop_sizes[i]; j++)
			 	  	   	{
			 	  	   		meta_probe->data[i][j] = gdl_matrix_get (U, j, 0);
			 	  	   	}
		 	  	   		gdl_pca_workspace_free (pca);
		 	  	   	}
		 	  	   	// compute the mean and the variance of the meta probe in population i
						meta_probe->mean[i] = gdl_stats_mean (meta_probe->data[i], 1, c->pop_sizes[i]);
						meta_probe->var[i]  = gdl_stats_variance_with_fixed_mean (meta_probe->data[i], 1, c->pop_sizes[i], meta_probe->mean[i]);
						// clean buffer
						gdl_matrix_free (x);
		 	  	   }
	 	  	   }
	 	  	}
	 }
	 // Drop the original probes ...
	 for(p = 0; p < c->nprobe; p++)
	 {
	 	 gdl_genex_probe_free (c->probes[p], c->npop);
	 }
	 GDL_FREE (c->probes);
	 // ... and install the meta probes, walking the list in the order it was filled
	 c->nprobe = gdl_list_size (meta_probes);
	 c->probes = GDL_MALLOC (gdl_genex_probe *, c->nprobe);
	 gdl_list_itr * itr = gdl_list_iterator_front (meta_probes);
	 i=0;
	 for(g = 0; g < c->ngene; g++)
	 {
	 	  gdl_eqtl_gene * gene = c->genes[g];
	 	  for (b = 0; b < gene->size; b++)
	 	  {
	 	  	   gdl_eqtl_block * block = gene->blocks[b];
	 	  	   gdl_genex_block_rm_all_probe (block);
	 	  	  	// and add the meta_probe
	 	  	  	c->probes[i]=gdl_list_iterator_value (itr);
	         c->probes[i]->idx=i;
	 	  	  	gdl_genex_block_add_probe (block, c->probes[i]);
	 	  	  	i++;
	 	  	  	gdl_list_iterator_next (itr);
	 	  }
	 } 
	 gdl_list_iterator_free (itr);
	 gdl_list_free (meta_probes);
}

/**
 * Classify a SNP from the genotype counts (0, 1, 2) of the individuals,
 * pooled over the populations in which the SNP is polymorphic.
 *
 * 0  = snp has at least one individual in each genotypic class
 * 1  = snp has no heterozygous (both homozygous classes are present)
 * 2  = snp has one homozygous class empty (heterozygous class present)
 * -1 = at most one genotypic class found
 *
 * Fix: the test for class 1 used to read (n1 == 0) && (n2 != 0 && n1 != 0),
 * which can never hold, so such SNPs were reported as -1.
 */
int
gdl_eqtl_chromosome_get_snp_class (const gdl_eqtl_chromosome * chrom, const size_t snp_idx)
{
	size_t p,i,n0=0,n1=0,n2=0;
	
	for(p = 0; p < chrom->npop; p++)
	{
		// populations where the SNP is not polymorphic do not contribute
		if (!gdl_snp_chromosome_is_polymorphic (chrom->snp_data, p, snp_idx))
				continue;
		for(i = 0; i < chrom->pop_sizes[p]; i++)
		{
			switch(gdl_snp_chromosome_get_genotype (chrom->snp_data, p, i, snp_idx))
			{
				case 2:
					n2++;
					break;
				case 1:
					n1++;
					break;
				case 0:
					n0++;
					break;
				default:
					// any other genotype value is not counted
					break;
			}
		}
	}
	
	if (n0 > 0 && n1 > 0 && n2 > 0)
	{
		return 0;
	}
	else if ((n0==0 && (n1!=0 && n2!=0)) || (n2==0 && (n1!=0 && n0!=0)))
	{
		return 2;
	}
	else if ((n1 == 0) && (n2 != 0 && n0 != 0))
	{
		return 1;
	}
	else
	{
		return -1;
	}
}

/**
 * Compute the fold change expression between the mean scores
 * of the two homozygous genotypes (genotype 2 over genotype 0).
 * When one homozygous class is empty, the heterozygous class is used
 * in its place (2 over 1, or 1 over 0).
 *
 * The 'imap' function is optional and allows user to transform
 * the expression level before computing the fold change (e.g if
 * the expression levels have been previously log-transformed).
 * The 'rm_indiv' argument is an optional per-population array of flags:
 * an individual whose flag is non-zero is skipped.
 * Populations in which the SNP is not polymorphic are skipped as well.
 *
 * The number of individuals counted in each genotypic class is returned
 * through 'ng0', 'ng1' and 'ng2'.
 *
 * The fold change is given in log_2 scale; 0 is returned when fewer than
 * two classes are populated or when one of the two sums is zero.
 *
 * Fix: the diagnostic printed for an unexpected genotype used "%d" for
 * size_t arguments; it now uses "%zu". The genotype is read only once.
 */
double
gdl_eqtl_chromosome_snp_fold_change (const gdl_eqtl_chromosome * c,
                                     const gdl_eqtl_probe * probe,
                                     const size_t snp,
                                     double (*imap)(double x),
                                     size_t * rm_indiv[],
                                     size_t * ng0,
                                     size_t * ng1,
                                     size_t * ng2)
{
	size_t p,i,n0=0,n1=0,n2=0;
	double x,s0=0,s1=0,s2=0;
	
	for(p = 0; p < c->npop; p++)
	{
		if (!gdl_snp_chromosome_is_polymorphic (c->snp_data, p, snp))
				continue;
		for(i = 0; i < c->pop_sizes[p]; i++)
		{
			int genotype;
			
			if (rm_indiv && rm_indiv[p][i])
				continue;
			x = probe->data[p][i];
			if (imap) x = (*imap)(x);
			genotype = gdl_snp_chromosome_get_genotype (c->snp_data, p, i, snp);
			switch(genotype)
			{
				case 2:
					s2 += x;
					n2++;
					break;
				case 1:
					s1 += x;
					n1++;
					break;
				case 0:
					s0 += x;
					n0++;
					break;
				default:
					// unexpected genotype: report it and leave it out of the sums
					fprintf (stderr, "%zu %zu %zu %d\n", p, i, snp, genotype);
					break;
			}
		}
	}
	
	*ng0 = n0;
	*ng1 = n1;
	*ng2 = n2;
	
	// log2 of the ratio of the class means: (s_a/n_a) / (s_b/n_b)
	if (n2 != 0 && n0 != 0)
	{
		return (s2!=0 && s0!=0) ? log((n0*s2)/(n2*s0))/M_LN2 : 0;
	}
	else if (n2 != 0 && n1 != 0)
	{
		return (s2!=0 && s1!=0) ? log((n1*s2)/(n2*s1))/M_LN2 : 0;
	}
	else if (n0 != 0 && n1 != 0)
	{
		return (s0!=0 && s1!=0) ? log((n0*s1)/(n1*s0))/M_LN2 : 0;
	}
	return 0;
	
}  

/**
 * Look up a gene of the chromosome by name.
 *
 * @param c     the chromosome
 * @param name  the gene name to search for (compared with strcmp)
 * @return the first gene whose name matches, or 0 when there is none
 *
 * Fix: the function used to fall off its end without returning a value
 * when no gene matched.
 */
gdl_eqtl_gene *
gdl_eqtl_chromosome_lookup_gene (const gdl_eqtl_chromosome * c, const gdl_string * name)
{
	size_t g;
	for(g = 0; g < c->ngene; g++)
	{
		if (!strcmp (c->genes[g]->name, name))
		{
			return c->genes[g];
		}	
	}
	return 0;
}

gdl_snp *
gdl_eqtl_chromosome_lookup_snp (const gdl_eqtl_chromosome * c, const gdl_string * name)
{
	size_t g;
	for(g = 0; g < c->snp_data->chrom->size; g++)
	{
		if (!strcmp (c->snp_data->chrom->snps[g]->rs, name))
		{
			c->snp_data->chrom->snps[g]->idx = g;
			return c->snp_data->chrom->snps[g];
		}	
	}
	return 0;
}

/**
 * Install a new logger stream on the chromosome.
 *
 * @param c       the chromosome
 * @param stream  the new logger stream
 * @return the stream that was installed before the call
 */
FILE *
gdl_eqtl_chromosome_set_logger (gdl_eqtl_chromosome * c, FILE * stream)
{
	FILE * previous;

	previous  = c->logger;
	c->logger = stream;

	return previous;
}

gdl_eqtl_chromosome *
gdl_eqtl_chromosome_fread (FILE * stream)
{
	if (stream)
	{
		int status;
		size_t i;
		gdl_eqtl_chromosome * chrom;
		
		chrom = GDL_MALLOC (gdl_eqtl_chromosome, 1);
		
		status = fread (&chrom->ploidy, sizeof(size_t), 1, stream);
		GDL_FREAD_STATUS (status, 1);
		status = fread (&chrom->npop, sizeof(size_t), 1, stream);
		GDL_FREAD_STATUS (status, 1);
		status = fread (&chrom->ngene, sizeof(size_t), 1, stream);
		GDL_FREAD_STATUS (status, 1);
		status = fread (&chrom->nprobe, sizeof(size_t), 1, stream);
		GDL_FREAD_STATUS (status, 1);
		status = fread (&chrom->nindiv, sizeof(size_t), 1, stream);
		GDL_FREAD_STATUS (status, 1);
		chrom->pop_sizes = GDL_MALLOC (size_t, chrom->npop);
		status = fread (chrom->pop_sizes, sizeof(size_t), chrom->npop, stream);
		GDL_FREAD_STATUS (status, chrom->npop);
		chrom->probes = GDL_MALLOC (gdl_genex_probe *, chrom->nprobe);
		for (i = 0; i < chrom->nprobe; i++)
		{
			chrom->probes[i] = gdl_genex_probe_fread (stream, chrom->npop, chrom->pop_sizes);
			GDL_FREAD_STATUS (chrom->probes[i]!=0, 1);
			chrom->probes[i]->idx = i; // dont't forget it... to read the genes
		}
		chrom->genes = GDL_MALLOC (gdl_genex_gene *, chrom->ngene);
		for (i = 0; i < chrom->ngene; i++)
		{
			chrom->genes[i] = gdl_genex_gene_fread (stream, chrom->probes);
			GDL_FREAD_STATUS (chrom->genes[i]!=0, 1);
		}
		chrom->snp_data = gdl_snp_chromosome_fread (stream);
		GDL_FREAD_STATUS (chrom->snp_data!=0, 1);
		
		return chrom;
	}
	return 0;	
}

/**
 * Write an eQTL chromosome to a binary stream.
 *
 * The stream layout is: ploidy, npop, ngene, nprobe, nindiv (one size_t
 * each), the npop population sizes, the probes, the genes and finally
 * the SNP data. Each write is checked with GDL_FWRITE_STATUS.
 * The 'idx' field of every probe is set to its rank in the probe array
 * right after the probe has been written, i.e. before the genes are.
 *
 * @param stream  the output stream
 * @param chrom   the chromosome to write
 * @return GDL_SUCCESS once everything has been written, GDL_EINVAL when
 *         stream or chrom is null
 */
int
gdl_eqtl_chromosome_fwrite (FILE * stream, const gdl_eqtl_chromosome * chrom)
{
	int rc;
	size_t n;

	if (!stream || !chrom)
	{
		return GDL_EINVAL;
	}

	// scalar header fields, in stream order
	const void * header[] = {
		&chrom->ploidy,
		&chrom->npop,
		&chrom->ngene,
		&chrom->nprobe,
		&chrom->nindiv
	};
	const size_t header_len = sizeof (header) / sizeof (header[0]);

	for (n = 0; n < header_len; n++)
	{
		rc = fwrite (header[n], sizeof(size_t), 1, stream);
		GDL_FWRITE_STATUS (rc, 1);
	}

	rc = fwrite (chrom->pop_sizes, sizeof(size_t), chrom->npop, stream);
	GDL_FWRITE_STATUS (rc, chrom->npop);

	for (n = 0; n < chrom->nprobe; n++)
	{
		rc = gdl_genex_probe_fwrite (stream, chrom->probes[n], chrom->npop, chrom->pop_sizes);
		GDL_FWRITE_STATUS (rc, GDL_SUCCESS);
		chrom->probes[n]->idx = n;
	}

	for (n = 0; n < chrom->ngene; n++)
	{
		rc = gdl_genex_gene_fwrite (stream, chrom->genes[n]);
		GDL_FWRITE_STATUS (rc, GDL_SUCCESS);
	}

	rc = gdl_snp_chromosome_fwrite (stream, chrom->snp_data);
	GDL_FWRITE_STATUS (rc, GDL_SUCCESS);

	return GDL_SUCCESS;
}
