/*
 *  snp/chrom.c 
 * 
 *  $Author: baptiste $, $Date: 2008-05-13 15:22:03 $, $Version$
 *  
 *  Libgdl : a C library for statistical genetics
 * 
 *  Copyright (C) 2003-2006  Jean-Baptiste Veyrieras, INRA, France.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA * 
 */

#include <gdl/gdl_common.h>
#include <gdl/gdl_errno.h>
#include <gdl/gdl_string.h>
#include <gdl/gdl_io.h>
#include <gdl/gdl_list.h>
#include <gdl/gdl_snp_map.h>
#include <gdl/gdl_snp_data.h>
#include <gdl/gdl_snp_chromosome.h>

/*
 * Merge the SNP map of population p (c2) into the chromosome-wide map
 * cv->chrom, replacing cv->chrom->snps and cv->idx by the merged arrays.
 *
 * The two maps are walked in parallel, comparing snps[]->position.
 * NOTE(review): this only yields a position-ordered union if both maps
 * are already sorted by increasing position - nothing here checks it.
 *
 * For every merged SNP s a row cv->idx[s] of cv->npop entries is kept:
 * entry q is 1 + the index of that SNP inside population q's own map,
 * and 0 (from GDL_CALLOC) when the SNP is absent from population q.
 *   - p == 1 : cv->idx does not supply rows yet, so rows are created here
 *              and column 0 is filled for the SNPs coming from cv->chrom.
 *   - p  > 1 : the existing rows of cv->idx are carried over.
 *
 * SNPs coming from cv->chrom are moved (pointer copy); SNPs only found in
 * c2 are duplicated with gdl_snp_clone, so c2 stays untouched. Any SNP
 * found in only one of the two maps gets polypop = 'n'; SNPs at matching
 * positions keep whatever flag they already had.
 */
static void
gdl_snp_map_pop_merge (gdl_snp_chromosome * cv, const gdl_snp_map * c2, const size_t p)
{
	gdl_snp_map * c1 = cv->chrom;
	gdl_snp ** merged;
	size_t  ** rows;
	size_t a = 0, b = 0, total = 0;
	
	/* Pass 1: count the entries of the union, one merge step at a time */
	while (a < c1->size && b < c2->size)
	{
		if (c1->snps[a]->position > c2->snps[b]->position)
		{
			b++;
		}
		else if (c2->snps[b]->position > c1->snps[a]->position)
		{
			a++;
		}
		else
		{
			a++;
			b++;
		}
		total++;
	}
	/* whatever is left in either map once the other one is exhausted */
	total += (c1->size - a) + (c2->size - b);
	
	merged = GDL_MALLOC (gdl_snp *, total);
	rows   = GDL_MALLOC (size_t  *, total);
	
	/* Pass 2: build the merged SNP array and its index rows */
	a = b = total = 0;
	while (a < c1->size || b < c2->size)
	{
		int from_c1, from_c2;
		
		if (a == c1->size)
		{
			/* c1 exhausted: tail of c2 */
			from_c1 = 0;
			from_c2 = 1;
		}
		else if (b == c2->size)
		{
			/* c2 exhausted: tail of c1 */
			from_c1 = 1;
			from_c2 = 0;
		}
		else
		{
			/* both flags are set when the two positions match */
			from_c1 = !(c1->snps[a]->position > c2->snps[b]->position);
			from_c2 = !(c2->snps[b]->position > c1->snps[a]->position);
		}
		
		if (from_c1)
		{
			merged[total] = c1->snps[a];
			if (p > 1)
			{
				rows[total] = cv->idx[a];
			}
			else
			{
				rows[total]    = GDL_CALLOC (size_t, cv->npop);
				rows[total][0] = a+1;
			}
		}
		else
		{
			merged[total] = gdl_snp_clone (c2->snps[b]);
			rows[total]   = GDL_CALLOC (size_t, cv->npop);
		}
		
		if (from_c2)
		{
			rows[total][p] = b+1;
		}
		
		if (!(from_c1 && from_c2))
		{
			/* present in only one of the two maps */
			merged[total]->polypop = 'n';
		}
		
		a += from_c1;
		b += from_c2;
		total++;
	}
	
	/* only the old pointer arrays are released: their elements moved */
	GDL_FREE (c1->snps);
	c1->size = total;
	c1->snps = merged;
	GDL_FREE (cv->idx);
	cv->idx = rows;
}

/*
 * Build a chromosome from the per-population files of `name`.
 *
 * For each population i in [0, npop) two files are read:
 *   "<popdir[i]>/<name>.snp"  with gdl_snp_data_fscanf (using `format`)
 *   "<popdir[i]>/<name>.map"  with gdl_snp_map_fscanf
 * The map of population 0 becomes v->chrom; the map of every further
 * population is merged into it with gdl_snp_map_pop_merge and then freed.
 * A "READ <popdir> <name>" line is printed on stdout per population.
 *
 * Returns the newly allocated chromosome.
 *
 * NOTE(review): the streams returned by gdl_fileopen are used without a
 * check here, and `name` is not stored in the returned object although
 * gdl_snp_chromosome_select_snp prints c->name - confirm it is set by
 * the caller.
 */
gdl_snp_chromosome *
gdl_snp_chromosome_alloc (const gdl_string * name, gdl_string ** popdir, const size_t npop, const gdl_snp_data_format * format)
{
	size_t i;
	gdl_snp_chromosome * v;
	
	v = GDL_CALLOC (gdl_snp_chromosome, 1);
	
	v->npop = npop;
	v->pops = GDL_MALLOC (gdl_snp_data *, npop);
	
	for (i = 0; i < npop; i++)
	{
		FILE * stream;
		gdl_string * file;
		
		/* genotype data of population i */
		file   = gdl_string_sprintf ("%s/%s.snp", popdir[i], name);
		stream = gdl_fileopen (file, "r");
		fprintf(stdout, "READ %s %s\n", popdir[i], name);
		v->pops[i] = gdl_snp_data_fscanf (stream, format);
		gdl_fileclose (file, stream);
		/* the data file must not be re-opened here: the handle would be
		 * overwritten below and never closed */
		gdl_string_free (file);
		
		/* SNP map of population i */
		file   = gdl_string_sprintf ("%s/%s.map", popdir[i], name);
		stream = gdl_fileopen (file, "r");
		if (i)
		{
			gdl_snp_map * tmp = gdl_snp_map_fscanf (stream);
			gdl_snp_map_pop_merge (v, tmp, i);
			gdl_snp_map_free (tmp);
		}
		else
		{
			v->chrom = gdl_snp_map_fscanf (stream);
		}
		gdl_fileclose (file, stream);
		/* release the map file name as well (it used to leak) */
		gdl_string_free (file);
	}
	
	return v;
}

/*
 * Release a chromosome: the per-SNP index rows and their table, every
 * population data set and the array holding them, the SNP map, and the
 * object itself. A NULL argument is accepted and ignored.
 *
 * NOTE(review): v->cnvs, v->name and v->logger are not touched here;
 * confirm they are owned elsewhere.
 */
void
gdl_snp_chromosome_free (gdl_snp_chromosome * v)
{
	if (v)
	{
		size_t i;
		if (v->idx)
		{
			/* one row per SNP of the (merged) map */
			for (i = 0; i < v->chrom->size; i++)
			{
				GDL_FREE (v->idx[i]);
			}
			GDL_FREE (v->idx);
		}
		if (v->pops)
		{
			for (i = 0; i < v->npop; i++)
			{
				gdl_snp_data_free (v->pops[i]);
			}
			/* the pointer array itself is allocated in
			 * gdl_snp_chromosome_alloc and was never released */
			GDL_FREE (v->pops);
		}
		gdl_snp_map_free (v->chrom);
		GDL_FREE (v);
	}
}

/*
 * Total number of individuals: the sum of the N field of every
 * population data set held by the chromosome.
 */
size_t
gdl_snp_chromosome_nindiv (const gdl_snp_chromosome * v)
{
	size_t total = 0;
	size_t pop;
	
	for (pop = 0; pop < v->npop; ++pop)
	{
		total += v->pops[pop]->N;
	}
	
	return total;
}

/*
 * Collect the genotypes of every individual at SNP `snp`.
 *
 * Individuals are laid out population after population, in the order of
 * v->pops. For a population in which gdl_snp_chromosome_is_polymorphic
 * reports the SNP as absent, all its individuals get -1.
 *
 * v    the chromosome
 * snp  index of the SNP in the chromosome map
 * n    out: number of entries of the returned array
 *      (gdl_snp_chromosome_nindiv (v))
 *
 * Returns an array of *n genotypes allocated with GDL_MALLOC; it is not
 * released here, so presumably the caller frees it.
 */
int * 
gdl_snp_chromosome_get_genotypes (const gdl_snp_chromosome * v, size_t snp, size_t * n)
{
	size_t i, j, k = 0;
	int * G;
	
	*n = gdl_snp_chromosome_nindiv (v);
	G  = GDL_MALLOC (int, *n);
	
	for (i = 0; i < v->npop; i++)
	{
		/* size of population i: the bound must be indexed by the
		 * population counter, not by the individual counter */
		const size_t ni = v->pops[i]->N;
		
		if (gdl_snp_chromosome_is_polymorphic (v, i, snp))
		{
			for (j = 0; j < ni; j++, k++)
			{
				G[k] = gdl_snp_chromosome_get_genotype (v, i, j, snp);
			}
		}
		else
		{
			for (j = 0; j < ni; j++, k++)
			{
				G[k] = -1;
			}
		}
	}
	
	return G;
}

/*
 * Genotype of individual `indiv` of population `pop` at SNP `snp`.
 *
 * Without an index table (v->idx == NULL) `snp` is used directly as the
 * SNP index in the population data. With an index table, v->idx[snp][pop]
 * stores the population-local index plus one; a stored 0 makes the
 * function return -1 without querying the data.
 */
int
gdl_snp_chromosome_get_genotype (const gdl_snp_chromosome * v, size_t pop, size_t indiv, size_t snp)
{
	size_t slot;
	
	if (!v->idx)
	{
		return gdl_snp_data_get (v->pops[pop], indiv, snp);
	}
	
	slot = v->idx[snp][pop];
	if (slot == 0)
	{
		return -1;
	}
	
	return gdl_snp_data_get (v->pops[pop], indiv, slot-1);
}

/*
 * Value returned by gdl_snp_data_hget for individual `indiv` of
 * population `pop` at SNP `snp`, with `k` forwarded unchanged
 * (presumably the haplotype/strand selector - confirm against
 * gdl_snp_data_hget).
 *
 * The SNP index is resolved like in gdl_snp_chromosome_get_genotype:
 * `snp` itself when there is no index table, v->idx[snp][pop]-1
 * otherwise, and -1 is returned when the stored entry is 0.
 */
int
gdl_snp_chromosome_get_haplotype (const gdl_snp_chromosome * v, size_t pop, size_t indiv, size_t snp, size_t k)
{
	size_t slot;
	
	if (!v->idx)
	{
		return gdl_snp_data_hget (v->pops[pop], indiv, snp, k);
	}
	
	slot = v->idx[snp][pop];
	if (slot == 0)
	{
		return -1;
	}
	
	return gdl_snp_data_hget (v->pops[pop], indiv, slot-1, k);
}

/*
 * Tell whether SNP `snp` has an entry for population `pop`.
 *
 * Returns gdl_false only when an index table exists and its entry
 * v->idx[snp][pop] is 0; returns gdl_true otherwise, including when
 * there is no index table at all. No genotype data is inspected.
 */
gdl_boolean
gdl_snp_chromosome_is_polymorphic (const gdl_snp_chromosome * v, size_t pop, size_t snp)
{
	if (v->idx && !v->idx[snp][pop])
	{
		return gdl_false;
	}
	return gdl_true;
}

/*
 * Frequency stats->afreq[1] of SNP `snp` within population `pop`, as
 * computed by gdl_snp_data_snp_stats.
 *
 * The SNP index inside the population data is resolved the same way as
 * in gdl_snp_chromosome_get_genotype:
 *   - no index table (c->idx == NULL): `snp` is used as is;
 *   - index table with a non-zero entry: c->idx[snp][pop]-1;
 *   - index table with a zero entry: the SNP has no entry for this
 *     population and 0 is returned without computing anything.
 * The last two cases used to be swapped with the first one, which
 * returned 0 for every SNP when there was no index table and queried
 * the population data with a merged-map index for absent SNPs.
 */
double
gdl_snp_chromosome_get_pop_frequency (const gdl_snp_chromosome * c, size_t pop, size_t snp)
{
	size_t s;
	double f;
	gdl_snp_stats * stats;
	
	if (!c->idx)
	{
		s = snp;
	}
	else if (c->idx[snp][pop])
	{
		s = c->idx[snp][pop]-1;
	}
	else
	{
		return 0;
	}
	
	stats = gdl_snp_stats_alloc ();
	
	gdl_snp_data_snp_stats (c->pops[pop], s, stats);
	
	f = stats->afreq[1];
	
	gdl_snp_stats_free (stats);
	
	return f;
}

/*
 * Frequency of SNP `snp` over all populations: the per-population
 * frequencies (gdl_snp_chromosome_get_pop_frequency) averaged with the
 * population sizes N as weights.
 *
 * Populations reporting a frequency of exactly 0 are left out of both
 * the weighted sum and the total weight. When no population contributes
 * the result is 0.0 divided by 0.
 */
double
gdl_snp_chromosome_get_frequency (const gdl_snp_chromosome * c, size_t snp)
{
	double weighted = 0;
	size_t total = 0;
	size_t pop = 0;
	
	while (pop < c->npop)
	{
		const double freq = gdl_snp_chromosome_get_pop_frequency (c, pop, snp);
		
		if (freq != 0)
		{
			weighted += c->pops[pop]->N*freq;
			total    += c->pops[pop]->N;
		}
		pop++;
	}
	
	return weighted/total;
}

/*
 * Minor allele frequency of SNP `snp` over all populations.
 *
 * The pooled frequency f is the size-weighted mean of the per-population
 * frequencies, skipping populations that report exactly 0 (same
 * computation as gdl_snp_chromosome_get_frequency).
 *
 * which  out: 1 when f itself is returned (f is not above 0.5),
 *             0 when 1-f is returned (f above 0.5).
 */
double
gdl_snp_chromosome_get_MAF (const gdl_snp_chromosome * c, size_t snp, size_t * which)
{
	size_t pop, total = 0;
	double sum = 0, freq;
	
	*which = 1;
	
	for (pop = 0; pop < c->npop; pop++)
	{
		const double x = gdl_snp_chromosome_get_pop_frequency (c, pop, snp);
		
		if (x != 0)
		{
			sum   += c->pops[pop]->N*x;
			total += c->pops[pop]->N;
		}
	}
	
	freq = sum/total;
	
	if (!(freq > 0.5))
	{
		return freq;
	}
	
	/* the tracked allele is the major one: report its complement */
	*which = 0;
	return 1.0-freq;
}

/*
 * Flag the SNPs of the chromosome to be ignored.
 *
 * Every SNP first gets ignore = 'n', then is switched to 'y' when:
 *   - poly_pop is set and the SNP carries polypop == 'n'; or, failing
 *     that,
 *   - a threshold is active (fmin or gmin strictly between 0 and 1) and
 *     the pooled statistics fall under it. Pooled values are the
 *     size-weighted means, over the populations holding the SNP, of
 *     afreq[0], gfreq[0] and gfreq[1]. The SNP is discarded when f or
 *     1-f is below fmin, or when g0, g1 or 1-g0-g1 is below gmin.
 * Each discarded SNP is reported on c->logger when one is set.
 *
 * Returns the number of SNPs flagged 'y' by this call.
 */
size_t
gdl_snp_chromosome_select_snp (gdl_snp_chromosome * c, const double fmin, const double gmin, const gdl_boolean poly_pop)
{
	/* the frequency filter only runs for thresholds inside (0,1) */
	const int use_freq = (fmin < 1 && fmin > 0) || (gmin < 1 && gmin > 0);
	size_t s, pop, discarded = 0;
	gdl_snp ** snps = c->chrom->snps;
	gdl_snp_stats * stats = gdl_snp_stats_alloc ();
	
	for (s = 0; s < c->chrom->size; s++)
	{
		gdl_snp * snp = snps[s];
		double f = 0, g0 = 0, g1 = 0;
		size_t N = 0;
		
		snp->ignore = 'n';
		
		if (poly_pop && snp->polypop == 'n')
		{
			snp->ignore = 'y';
			if (c->logger)
			{
				fprintf (c->logger, "Discard SNP %s %d on chromosome %s: not polymorphic in all populations\n", snp->rs, snp->position, c->name);
			}
			discarded++;
			continue;
		}
		
		if (!use_freq)
		{
			continue;
		}
		
		/* pool the statistics over the populations holding this SNP */
		for (pop = 0; pop < c->npop; pop++)
		{
			size_t col;
			
			if (!c->idx)
			{
				col = s;
			}
			else if (c->idx[s][pop])
			{
				col = c->idx[s][pop]-1;
			}
			else
			{
				continue; /* SNP has no entry for this population */
			}
			
			gdl_snp_data_snp_stats (c->pops[pop], col, stats);
			f  += c->pops[pop]->N*stats->afreq[0];
			g0 += c->pops[pop]->N*stats->gfreq[0];
			g1 += c->pops[pop]->N*stats->gfreq[1];
			N  += c->pops[pop]->N;
		}
		f  /= N;
		g0 /= N;
		g1 /= N;
		
		if (f < fmin || 1-f < fmin || g0 < gmin || g1 < gmin || 1-g0-g1 < gmin)
		{
			snp->ignore = 'y';
			if (c->logger)
			{
				fprintf (c->logger, "Discard SNP %s %d on chromosome %s: frequency %1.5f under threshold %1.5f\n", snp->rs, snp->position, c->name, f, fmin);
			}
			discarded++;
		}
	}
	
	gdl_snp_stats_free (stats);
	
	return discarded;
}

/*
 * Collect the CNV clones of c->cnvs[0] lying entirely inside
 * [from, to], i.e. with start >= from and end <= to.
 *
 * c     the chromosome; only its first CNV data set is searched
 * from  lower bound on clone->start
 * to    upper bound on clone->end
 * n     out: number of clones returned (0 when none)
 *
 * Returns an array of *n clone pointers allocated with GDL_MALLOC, or
 * NULL when the chromosome has no CNV data or no clone matches. The
 * array holds the clones of the data set themselves, not copies.
 *
 * Side effect: the idx field of every selected clone is set to its
 * position in data->clones.
 */
gdl_cnv_clone **
gdl_snp_chromosome_get_cnv_clones (const gdl_snp_chromosome * c, long from, long to, size_t * n)
{
	size_t i;
	gdl_cnv_data * data;
	gdl_list * tmp;
	gdl_list_itr * itr;
	gdl_cnv_clone ** clones;
	
	if (!c->cnvs)
	{
		/* report an empty result instead of leaving *n unset; n is only
		 * guarded on this path because it was not dereferenced here
		 * before */
		if (n)
		{
			*n = 0;
		}
		return NULL;
	}
	
	data = c->cnvs[0];
	tmp  = gdl_list_alloc (gdl_list_default);
	
	for (i = 0; i < data->L; i++)
	{
		if (data->clones[i]->start >= from && data->clones[i]->end <= to)
		{
			data->clones[i]->idx = i;
			gdl_list_push_back (tmp, data->clones[i], 0);
		}
	}
	
	*n = gdl_list_size (tmp);
	
	if (!*n)
	{
		/* nothing selected: the temporary list must still be released */
		gdl_list_free (tmp);
		return NULL;
	}
	
	clones = GDL_MALLOC (gdl_cnv_clone *, *n);
	
	/* the list is known to be non-empty here, hence the do/while */
	i   = 0;
	itr = gdl_list_iterator_front (tmp);
	do
	{
		clones[i] = (gdl_cnv_clone *) gdl_list_iterator_value (itr);
		i++;
	}
	while (gdl_list_iterator_next (itr));
	
	gdl_list_iterator_free (itr);
	
	gdl_list_free (tmp);
	
	return clones;
}

/*
 * Install `logger` as the stream on which the chromosome reports
 * discarded SNPs and hand back the stream that was installed before.
 * Neither stream is opened, flushed or closed here.
 */
FILE *
gdl_snp_chromosome_set_logger (gdl_snp_chromosome * c, FILE * logger)
{
	FILE * const previous = c->logger;
	
	c->logger = logger;
	
	return previous;
}
