/*  
 * 	hview/resolve.c
 * 
 *  $Author: baptiste $, $Date: 2008-05-13 15:33:43 $, $Version$
 *
 *  Libgdl : a C library for statistical genetics
 * 
 *  Copyright (C) 2003-2006  Jean-Baptiste Veyrieras, INRA, France.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
 
#include <gdl/gdl_common.h>
#include <gdl/gdl_errno.h>
#include <gdl/gdl_string.h>
#include <gdl/gdl_permutation.h>
#include <gdl/gdl_hash.h>
#include <gdl/gdl_gentity.h>
#include <gdl/gdl_gview.h>
#include <gdl/gdl_hview.h>

/*
 * Forward declarations: both routines call themselves recursively and
 * are defined further down in this file.
 */
static double _swap_haplotypes (_gdl_hview * wh, size_t i, size_t j, size_t jj);
static double _swap_genotypes (_gdl_hview * wh, size_t i, size_t j, size_t jj);

/*
 * Returns the number of decimal digits needed to print x:
 * 1 for 0..9, 2 for 10..99, 3 for 100..999, and so on.
 *
 * The result is used as a printf-style field width and as the length
 * of a fixed-width key slice, so it has to cover every digit of x.
 */
static size_t
_msd (size_t x)
{
	size_t ndigit = 1;
	
	while (x >= 10)
	{
		x /= 10;
		ndigit++;
	}
	
	return ndigit;
}

/*
 * Record stored in the working haplotype dictionary (wh->hdict),
 * one per distinct haplotype key (see _add_haplotypes).
 */
typedef struct
{
	/* rank of the haplotype: size of the dictionary when it was inserted */
	size_t i;
	/* copy of the wh->nl allele indices taken from the haplotype buffer */
	size_t * h;
} _gdl_haplo;

/*
 * A haplotype configuration: a set of haplotype indices together with
 * a weight. See gdl_hconfig_alloc / gdl_hconfig_clone / gdl_hconfig_free.
 */
struct _gdl_hconfig
{
	/* weight of the configuration; gdl_hconfig_alloc sets it to 1 */
	double pr;
	/* haplotype indices; the array length is the size given at allocation */
	size_t * idx;
};

/**
 * Allocates a haplotype configuration holding `size` haplotype indices.
 *
 * The weight (pr) is set to 1; the index array is not filled in here.
 *
 * Returns the new configuration, or NULL when memory cannot be obtained.
 * The result is released with gdl_hconfig_free().
 */
gdl_hconfig *
gdl_hconfig_alloc (size_t size)
{
	gdl_hconfig * hc = GDL_MALLOC (gdl_hconfig, 1);
	
	if (hc == NULL)
	{
		return NULL;
	}
	
	hc->pr  = 1;
	hc->idx = GDL_MALLOC (size_t, size);
	
	/* a zero-size request may legitimately yield NULL: only size > 0 is a failure */
	if (hc->idx == NULL && size > 0)
	{
		free (hc);
		return NULL;
	}
	
	return hc;
}

/**
 * Returns a deep copy of configuration `h`.
 *
 * `size` is the number of entries of h->idx to duplicate; the weight
 * is copied as well.
 *
 * Returns NULL when `h` is NULL or when memory cannot be obtained.
 * The result is released with gdl_hconfig_free().
 */
gdl_hconfig *
gdl_hconfig_clone (const gdl_hconfig * h, size_t size)
{
	gdl_hconfig * n;
	
	if (h == NULL)
	{
		return NULL;
	}
	
	n = GDL_MALLOC (gdl_hconfig, 1);
	
	if (n == NULL)
	{
		return NULL;
	}
	
	n->pr  = h->pr;
	n->idx = GDL_MALLOC (size_t, size);
	
	if (size > 0)
	{
		if (n->idx == NULL)
		{
			free (n);
			return NULL;
		}
		/* nothing is copied for size == 0, so memcpy never sees a NULL buffer */
		memcpy (n->idx, h->idx, sizeof (size_t) * size);
	}
	
	return n;
}

/**
 * Releases a haplotype configuration and its index array.
 * A NULL argument is accepted and ignored.
 */
void
gdl_hconfig_free (gdl_hconfig * hc)
{
	if (hc == NULL)
	{
		return;
	}
	
	GDL_FREE (hc->idx);
	GDL_FREE (hc);
}

/*
 * Data interface given to the haplotype dictionary (wh->hdict).
 * Only the first callback is provided (the C library free); the four
 * remaining slots are left NULL.
 *
 * NOTE(review): free() would release a _gdl_haplo record but not the
 * array it points to; that array is handed over to the hview in
 * _build_haplotype_table. Confirm what happens to it on error paths.
 */
static const gdl_data_interface _haplo_type =
{
	free,
	NULL,
	NULL,
	NULL,
	NULL
};

/*
 * Writes, for each of the wh->np haplotype rows, the allele index of
 * locus j into the haplotype buffer (wh->hbuf[k][j]) and appends it to
 * the row key at offset wh->hkeyx[j] using the format wh->hkeyf[j].
 *
 * site < 0  : the alleles are read from the genotype view for
 *             accession i through gbuf.
 * site >= 0 : the alleles come from the genotype currently selected
 *             for that ambiguous site (wh->gpath[site]), in the order
 *             given by the permutation wh->hpath[site]; wh->pr is
 *             multiplied by the weight of the selected genotype.
 */
static void
_extract_alleles (_gdl_hview * wh,
                  const gdl_gview * g,
                  const gdl_mask * m,
                  size_t i,
                  size_t j,
                  gdl_gvalues_get * gbuf,
                  int site)
{
	size_t k;
	
	if (site < 0)
	{
		/* unambiguous locus: take the alleles straight from the view */
		for (k = 0; k < wh->np; k++)
		{
			const gdl_gvalues * x;
			
			GDL_GVIEW_GET_ALLELE (g, m, i, j, k, gbuf);
			
			x = gdl_gvalues_get_gvalues (gbuf);
			
			wh->hbuf[k][j] = x->values[0]->idx;
			
			gdl_string_scat (&wh->hkeys[k][wh->hkeyx[j]], wh->hkeyf[j], x->values[0]->idx);
		}
		
		return;
	}
	
	/* ambiguous site: use the genotype and allele order currently on the path */
	gdl_genotype * geno = wh->genotypes[site][wh->gpath[site]];
	
	wh->pr *= wh->proba[site][wh->gpath[site]];
	
	for (k = 0; k < wh->np; k++)
	{
		size_t kk = gdl_permutation_get (wh->hpath[site], k);
		gdl_allele * allele = gdl_genotype_get_allele (geno, kk);
		
		wh->hbuf[k][j] = allele->idx;
		
		gdl_string_scat (&wh->hkeys[k][wh->hkeyx[j]], wh->hkeyf[j], allele->idx);
	}
}

/*
 * Allocates the working storage shared by the whole extraction pass:
 *   - wh->hbuf   : np x nl buffer of allele indices,
 *   - wh->hdict  : dictionary of distinct haplotypes,
 *   - wh->hkeyx  : offset of each locus inside a haplotype key,
 *   - wh->hkeyf  : per-locus "%<width>d" format used to print an index,
 *   - wh->hkeys  : one key string per haplotype row,
 *   - wh->locus  : table of loci,
 *   - wh->hconfs : one list of configurations per accession.
 *
 * Everything allocated here is released by _clean_extract_haplotypes.
 */
static void
_init_extract_haplotypes (_gdl_hview * wh)
{
	size_t i, s;
	const gdl_gview * g = wh->h->data;
	const gdl_mask  * m = wh->h->mask;
	
	wh->hbuf  = GDL_MATRIX_ALLOC (size_t, wh->np, wh->nl);
	
	wh->hdict = gdl_hashtable_alloc (&_haplo_type, 0);
	
	wh->hkeyx = GDL_CALLOC (size_t, wh->nl);
	wh->hkeyf = GDL_CALLOC (gdl_string * , wh->nl);
	
	for (s = i = 0; i < wh->nl; i++)
	{
		gdl_locus * l = GDL_GVIEW_GET_LOCUS (g, m, i);
		size_t na     = gdl_locus_allele (l);
		
		/* 
		 * The width is taken from (allele count - 1), presumably the
		 * largest allele index. A count of 0 must not wrap around.
		 */
		na = (na > 0) ? na - 1 : 0;
		na = _msd (na);
		
		wh->hkeyx[i]  = s;
		s += na;
		
		/* "%d" consumes an int: the width is small, so the cast is safe */
		wh->hkeyf[i]  = gdl_string_sprintf ("%%%dd", (int) na);
	}
	
	wh->hkeys = GDL_CALLOC (gdl_string * , wh->np);
	
	for (i = 0; i < wh->np; i++)
	{
		wh->hkeys[i] = gdl_string_alloc (s);
	}
	
	wh->locus 
	  = gdl_hashtable_alloc (gdl_entity_interface, wh->nl);
	  
	wh->hconfs = GDL_MALLOC (gdl_list *, wh->na);
	
	for (i = 0; i < wh->na; i++)
	{
		wh->hconfs[i] 
		   = gdl_list_alloc (gdl_list_default);
	}
}

/*
 * Releases everything allocated by _init_extract_haplotypes.
 *
 * NOTE(review): configurations are pushed on wh->hconfs with a trailing
 * 0 flag in _add_haplotypes; whether gdl_list_free releases them when
 * _build_hview has not run should be confirmed.
 */
static void
_clean_extract_haplotypes (_gdl_hview * wh)
{
	size_t i;
	
	/* per-locus key formats */
	for (i = 0; i < wh->nl; i++)
	{
		gdl_string_free (wh->hkeyf[i]);
	}
	GDL_FREE (wh->hkeyf);
	GDL_FREE (wh->hkeyx);
	
	/* per-row haplotype keys */
	for (i = 0; i < wh->np; i++)
	{
		gdl_string_free (wh->hkeys[i]);
	}
	GDL_FREE (wh->hkeys);
	
	/* per-accession configuration lists */
	for (i = 0; i < wh->na; i++)
	{
		gdl_list_free (wh->hconfs[i]);
	}
	GDL_FREE (wh->hconfs);
	
	/* allele buffer and dictionaries */
	GDL_MATRIX_FREE (wh->hbuf, wh->np);
	gdl_hashtable_free (wh->hdict);
	gdl_hashtable_free (wh->locus);
}

/*
 * Prepares the per-accession state used to enumerate the configurations
 * of the i-th ambiguous accession:
 *   - one freshly initialised permutation per ambiguous site (wh->hpath),
 *   - zeroed path / skip-flag / count arrays (wh->gpath, wh->gskeep, wh->ng),
 *   - empty per-site genotype and weight tables (wh->genotypes, wh->proba).
 *
 * The loci that are NOT listed in wh->lidx[i] are extracted right away
 * into the haplotype buffer; wh->lidx[i] is walked in step with the
 * locus index, so it is presumably sorted in increasing order.
 *
 * The state is released by _clean_extract_genotypes.
 */
static void
_init_extract_genotypes (_gdl_hview * wh, size_t i)
{
	size_t j, jj, aidx;
	const gdl_gview * g = wh->h->data;
	const gdl_mask  * m = wh->h->mask;
	gdl_clustering * gclust = wh->h->guniq;
	gdl_gvalues_get   * gbuf = GDL_GVIEW_GET_NEW (g, m);
	
	wh->hpath = GDL_MALLOC (gdl_permutation *, wh->nas[i]);
	
	j = 0;
	while (j < wh->nas[i])
	{
		wh->hpath[j] = gdl_permutation_alloc (wh->np);
		gdl_permutation_init (wh->hpath[j]);
		j++;
	}
	
	wh->gpath     = GDL_CALLOC (size_t, wh->nas[i]);
	wh->gskeep    = GDL_CALLOC (gdl_boolean, wh->nas[i]);
	wh->ng        = GDL_CALLOC (size_t, wh->nas[i]);
	wh->genotypes = GDL_CALLOC (gdl_genotype_ptr *, wh->nas[i]);
	wh->proba     = GDL_CALLOC (double *, wh->nas[i]);
	
	aidx = gdl_clustering_clust_idx (gclust, wh->aidx[i]);
	
	for (jj = j = 0; j < wh->nl; j++)
	{
		if (jj >= wh->nas[i] || j != wh->lidx[i][jj])
		{
			/* unambiguous locus: its alleles are known now */
			_extract_alleles (wh, g, m, aidx, j, gbuf, -1);
		}
		else
		{
			/* ambiguous locus: handled later, move to the next listed one */
			jj++;
		}
	}
	
	gdl_gvalues_get_free (gbuf);
}

/*
 * Releases the per-accession state created by _init_extract_genotypes
 * and filled by _extract_genotypes for the i-th ambiguous accession.
 */
static void
_clean_extract_genotypes (_gdl_hview * wh, size_t i)
{
	size_t j = 0;
	
	/* per-site entries first */
	while (j < wh->nas[i])
	{
		gdl_permutation_free (wh->hpath[j]);
		GDL_FREE (wh->genotypes[j]);
		GDL_FREE (wh->proba[j]);
		j++;
	}
	
	/* then the arrays themselves */
	GDL_FREE (wh->hpath);
	GDL_FREE (wh->genotypes);
	GDL_FREE (wh->proba);
	GDL_FREE (wh->gpath);
	GDL_FREE (wh->gskeep);
	GDL_FREE (wh->ng);
}

/*
 * For every ambiguous site j of the i-th ambiguous accession, collects
 * the candidate genotypes and their weights:
 *   wh->ng[j]           number of candidates,
 *   wh->genotypes[j][k] k-th candidate genotype,
 *   wh->proba[j][k]     value attached to the k-th candidate.
 *
 * The locus used for the lookup is taken from wh->locus; when it is not
 * there yet it is built with gdl_locus_resolve_genotype and added to
 * the table under its name.
 */
static void
_extract_genotypes (_gdl_hview * wh, size_t i)
{
	size_t j, k, aidx;
	const gdl_gview * g = wh->h->data;
	const gdl_mask  * m = wh->h->mask;
	gdl_clustering * gclust = wh->h->guniq;
	
	aidx = gdl_clustering_clust_idx (gclust, wh->aidx[i]);
	
	for (j = 0; j < wh->nas[i]; j++)
	{
		gdl_gvalues_get   * gbuf;
		const gdl_gvalues * x;
		gdl_locus * tmp   = GDL_GVIEW_GET_LOCUS (g, m, wh->lidx[i][j]);
		gdl_locus * locus = gdl_hashtable_lookup (wh->locus, tmp->name);
		
		if (!locus)
		{
			locus = gdl_locus_resolve_genotype (tmp);
			gdl_hashtable_add (wh->locus, locus->name, locus, 1);
		}
		
		gbuf = gdl_gvalues_get_alloc (gdl_locus_genotype (locus));
		
		GDL_GVIEW_GET_GENOTYPE_TEMPLATE (g, m, aidx, locus, gbuf);
		
		x = gdl_gvalues_get_gvalues (gbuf);
		
		wh->ng[j]        = x->size;
		wh->genotypes[j] = GDL_CALLOC (gdl_genotype *, wh->ng[j]);
		wh->proba[j]     = GDL_CALLOC (double, wh->ng[j]);
		
		k = 0;
		while (k < x->size)
		{
			wh->genotypes[j][k] =
				gdl_locus_get_genotype (locus, x->values[k]->idx);
			wh->proba[j][k] = x->values[k]->value;
			k++;
		}
		
		gdl_gvalues_get_free (gbuf);
	}
}

/*
 * Records the haplotypes currently held in the haplotype buffer as one
 * configuration of accession i.
 *
 * Each of the wh->np rows is looked up in the dictionary by its key;
 * an unknown row is registered with the next free rank and a copy of
 * its wh->nl allele indices, and wh->nh is updated accordingly.
 * The configuration receives the current weight wh->pr and is pushed
 * at the front of wh->hconfs[i].
 */
static void
_add_haplotypes (_gdl_hview * wh, size_t i)
{
	size_t k;
	gdl_hconfig * hconf = gdl_hconfig_alloc (wh->np);
	
	hconf->pr = wh->pr;
	
	for (k = 0; k < wh->np; k++)
	{
		_gdl_haplo * h
		   = (_gdl_haplo *) gdl_hashtable_lookup (wh->hdict, wh->hkeys[k]);
		
		if (h == NULL)
		{
			/* first occurrence of this haplotype: register it */
			h = GDL_MALLOC (_gdl_haplo, 1);
			h->h = GDL_MALLOC (size_t, wh->nl);
			memcpy (h->h, wh->hbuf[k], sizeof(size_t)*wh->nl);
			h->i = gdl_hashtable_size (wh->hdict);
			gdl_hashtable_add (wh->hdict, wh->hkeys[k], h, 0);
			wh->nh = h->i+1;
		}
		
		hconf->idx[k] = h->i;
	}
	
	gdl_list_push_front (wh->hconfs[i], hconf, 0);
}

/*
 * Extracts one haplotype configuration and stores it with
 * _add_haplotypes. wh->pr is reset to 1 first and returned at the end.
 *
 * mode 0 : i is an accession index; all wh->nl loci are read from the
 *          genotype view and the configuration is stored for accession i.
 * mode 1 : i is an ambiguous-accession index; only the ambiguous sites
 *          wh->lidx[i][...] are re-extracted along the current path and
 *          the configuration is stored for accession wh->aidx[i].
 * Any other mode stores nothing and returns 1.
 *
 * NOTE(review): the buffer is obtained with gdl_gview_get_new (g) here,
 * while _init_extract_genotypes uses GDL_GVIEW_GET_NEW (g, m); confirm
 * that ignoring the mask is intended.
 */
static double
_extract_haplotypes (_gdl_hview * wh, size_t i, size_t mode)
{
	size_t j, aidx;
	const gdl_gview * g = wh->h->data;
	const gdl_mask  * m = wh->h->mask;
	gdl_clustering * gclust = wh->h->guniq;
	gdl_gvalues_get * gbuf = gdl_gview_get_new (g);
	
	wh->pr = 1;
	
	if (mode == 0)
	{
		aidx = gdl_clustering_clust_idx (gclust, i);
		
		for (j = 0; j < wh->nl; j++)
		{
			_extract_alleles (wh, g, m, aidx, j, gbuf, -1);
		}
		
		_add_haplotypes (wh, i);
	}
	else if (mode == 1)
	{
		aidx = gdl_clustering_clust_idx (gclust, wh->aidx[i]);
		
		for (j = 0; j < wh->nas[i]; j++)
		{
			_extract_alleles (wh, g, m, aidx, wh->lidx[i][j], gbuf, j);
		}
		
		_add_haplotypes (wh, wh->aidx[i]);
	}
	
	gdl_gvalues_get_free (gbuf);
	
	return wh->pr;
}

/*
 * Recursively walks the ambiguous sites j .. nas[i]-1 of the i-th
 * ambiguous accession and enumerates allele orderings.
 *
 * For a site whose gskeep flag is clear, the permutation wh->hpath[j]
 * is advanced until gdl_permutation_next stops returning GDL_SUCCESS;
 * for each ordering the walk either recurses into site j+1 or, at the
 * last site, extracts a configuration with _extract_haplotypes.
 * A site whose gskeep flag is set is stepped over.
 *
 * jj is the site at which the enumeration was started by the caller.
 *
 * Returns the sum of the values returned by _extract_haplotypes.
 */
static double
_swap_haplotypes (_gdl_hview * wh, size_t i, size_t j, size_t jj)
{
	double pr = 0;
	
	if (wh->gskeep[j])
	{
		/*
		 * NOTE(review): when the skipped site is the last one, nothing
		 * is extracted and 0 is returned; confirm this is intended.
		 */
	    if (j < wh->nas[i] - 1)
			pr = _swap_haplotypes (wh, i, j+1, jj);
	}
	else
	{
		/* at the starting site the current ordering is stepped past first */
		if (j == jj)
			gdl_permutation_next (wh->hpath[j]);
		do
		{
			if (j == wh->nas[i] - 1)
				pr += _extract_haplotypes (wh, i, 1);
			if (j < wh->nas[i] - 1)
				pr += _swap_haplotypes (wh, i, j+1, jj);
		}
		while (gdl_permutation_next (wh->hpath[j]) == GDL_SUCCESS);	
		
		/* re-initialise the permutation of this site before returning */
		gdl_permutation_init (wh->hpath[j]);
	}
	
	return pr;
}

/*
 * Enumerates the haplotype configurations compatible with the genotype
 * path currently selected in wh->gpath for the i-th ambiguous accession.
 *
 * Sites whose selected genotype is homozygous are flagged in wh->gskeep.
 * The configuration for the current permutations is extracted first;
 * then, going from the last site down to site 1, _swap_haplotypes is
 * started at each non-flagged site for as long as more than one
 * non-flagged site remains to be processed.
 *
 * Returns the sum of the values returned by the extractions.
 */
static double
_path_in_haplotypes (_gdl_hview * wh, size_t i)
{
	double pr;
	size_t j, jj, n = 0;
	
	for (j = 0; j < wh->nas[i]; j++)
	{
		if (!gdl_genotype_is_homozygous (wh->genotypes[j][wh->gpath[j]]))
		{
			wh->gskeep[j] = gdl_false;
			n++;
			continue;
		}
		wh->gskeep[j] = gdl_true;
	}
	
	pr = _extract_haplotypes (wh, i, 1);
	
	/* visit sites nas[i]-1 .. 1; site 0 is never the starting point */
	jj = wh->nas[i];
	while (jj > 1)
	{
		jj--;
		if (!wh->gskeep[jj] && n > 1)
		{
			pr += _swap_haplotypes (wh, i, jj, jj);
			n--;
		}
	}
	
	return pr;
}

/*
 * Recursively enumerates the candidate genotypes of sites j .. nas[i]-1
 * for the i-th ambiguous accession.
 *
 * At the starting site (j == jj) candidate 0 is skipped; every other
 * site goes through all of its wh->ng[j] candidates. For each choice
 * the walk recurses into site j+1 or, at the last site, enumerates the
 * haplotypes with _path_in_haplotypes. wh->gpath[j] is reset to 0
 * before returning.
 *
 * Returns the sum of the values returned by _path_in_haplotypes.
 */
static double
_swap_genotypes (_gdl_hview * wh, size_t i, size_t j, size_t jj)
{
	double pr = 0;
	size_t k = (j == jj) ? 1 : 0;
	
	while (k < wh->ng [j])
	{
		wh->gpath[j] = k;
		
		if (j == wh->nas[i] - 1)
		{
			pr += _path_in_haplotypes (wh, i);
		}
		else if (j < wh->nas[i] - 1)
		{
			pr += _swap_genotypes (wh, i, j+1, jj);
		}
		
		k++;
	}
	
	wh->gpath[j] = 0;
	
	return pr;
}

/*
 * Enumerates every configuration of the i-th ambiguous accession and
 * normalises their weights.
 *
 * The initial genotype path is processed first, then _swap_genotypes is
 * started at each site from the last one down to site 0. The values
 * returned are summed, and the weight of every configuration stored in
 * wh->hconfs[wh->aidx[i]] is divided by that sum.
 */
static void
_path_in_genotypes (_gdl_hview * wh, size_t i)
{
	double total;
	size_t jj;
	gdl_list_itr * itr;
	
	total = _path_in_haplotypes (wh, i);
	
	jj = wh->nas[i];
	while (jj > 0)
	{
		jj--;
		total += _swap_genotypes (wh, i, jj, jj);
	}
	
	itr = gdl_list_iterator_front (wh->hconfs[wh->aidx[i]]);
	
	for (;;)
	{
		gdl_hconfig * hc
			  = (gdl_hconfig *) gdl_list_iterator_value (itr);
		
		hc->pr /= total;
		
		if (!gdl_list_iterator_next (itr))
		{
			break;
		}
	}
	
	gdl_list_iterator_free (itr);
}

/*
 * Copies the haplotypes gathered in the dictionary into the hview:
 * the allele array of each record is stored, by pointer, at the rank
 * of the record in h->haplotypes.
 *
 * Always returns GDL_SUCCESS.
 */
static int
_build_haplotype_table (_gdl_hview * wh)
{
	gdl_hview * h = wh->h;
	gdl_hashtable_itr * itr = gdl_hashtable_iterator (wh->hdict);
	
	for (;;)
	{
		/* the key is fetched but not used */
		char * key = gdl_hashtable_iterator_key (itr);
		_gdl_haplo * haplo
		   = (_gdl_haplo *) gdl_hashtable_iterator_value (itr);
		
		(void) key;
		
		h->haplotypes[haplo->i]	= haplo->h;
		
		if (!gdl_hashtable_iterator_next (itr))
		{
			break;
		}
	}
	
	gdl_hashtable_iterator_free (itr);
	
	return GDL_SUCCESS;
}

/*
 * Moves the configurations collected in wh->hconfs into the hview.
 *
 * For each accession i, h->nhc[i] receives the number of configurations
 * and h->hconfigs[i] an array holding them. Each haplotype referenced
 * by a configuration gets (nc * pr) / nn added to h->mult, where nc is
 * the cluster size of the accession, pr the configuration weight and
 * nn the clustering size times wh->np.
 *
 * An accession without any configuration is left with nhc == 0 and
 * contributes nothing to h->mult.
 *
 * Always returns GDL_SUCCESS.
 */
static int
_build_hconfig_table (_gdl_hview * wh)
{
	size_t i, j, k, nc;
	double nn;
	gdl_hview * h = wh->h;
	gdl_clustering * gclust = h->guniq;
	
	nn = (double) gdl_clustering_size (gclust) * wh->np;
	
	for (i = 0; i < wh->na; i++)
	{
		gdl_list_itr * itr;
		
		nc = gdl_clustering_clust_size (gclust, i);
		
		h->nhc[i] = gdl_list_size (wh->hconfs[i]);
		
		h->hconfigs[i] = GDL_MALLOC (gdl_hconfig *, h->nhc[i]);
		
		/* the do/while below stores at least one entry: skip empty lists */
		if (h->nhc[i] == 0)
		{
			continue;
		}
		
		itr = gdl_list_iterator_front (wh->hconfs[i]);
		
		j = 0;
		do
		{
			gdl_hconfig * hc
			  = (gdl_hconfig *) gdl_list_iterator_value (itr);
			
			h->hconfigs[i][j] = hc;
			
			for (k = 0; k < wh->np; k++)
			{
				h->mult[hc->idx[k]] += (nc*hc->pr)/nn;
			}
			j++;
		}
		while (gdl_list_iterator_next (itr));
		
		gdl_list_iterator_free (itr);
	}
	
	return GDL_SUCCESS;
}

/*
 * Initialises the hview with the final dimensions (accessions, loci,
 * distinct haplotypes) and fills its haplotype and configuration tables.
 * The values returned by the two builders are not checked.
 *
 * Always returns GDL_SUCCESS.
 */
static int
_build_hview (_gdl_hview * wh)
{
	gdl_hview_init (wh->h, wh->na, wh->nl, wh->nh);
	
	/* haplotypes first, then the configurations that refer to them */
	_build_haplotype_table (wh);
	_build_hconfig_table (wh);
	
	return GDL_SUCCESS;
}

/*
 * Checks that the i-th ambiguous accession does not have too many
 * configurations to enumerate.
 *
 * The running count starts at wh->ng[0] and is multiplied by
 * wh->ng[j] * wh->np for each further site; as soon as it exceeds
 * GDL_HVIEW_MAX an error message is stored in wh->error and
 * GDL_FAILURE is returned. Otherwise GDL_SUCCESS is returned.
 */
static int
_check_underflow (_gdl_hview * wh, size_t i)
{
	gdl_clustering * gclust = wh->h->guniq;
	size_t nc   = wh->ng[0];
	size_t aidx = gdl_clustering_clust_idx (gclust, wh->aidx[i]);
	size_t j;
	
	for (j = 1; j < wh->nas[i]; j++)
	{
		gdl_accession * a;
		
		nc *= wh->ng[j]*wh->np;
		
		if (!(nc > GDL_HVIEW_MAX))
		{
			continue;
		}
		
		a = GDL_GVIEW_GET_ACCESSION (wh->h->data, wh->h->mask, aidx);
		wh->error
		  = gdl_string_sprintf ("Accession %s has more than %d compatible haplotype configurations. Remove it or reduce the number of locus", a->name, GDL_HVIEW_MAX);
		return GDL_FAILURE;
	}
	
	return GDL_SUCCESS;
}

/*
 * Compares the proportion of missing data of the i-th ambiguous
 * accession with the threshold pmt.
 *
 * The proportion is the missing size reported by the genotype view
 * divided by wh->nl * wh->np.
 *
 * Returns GDL_SUCCESS (after printing the proportion on stdout) when
 * the proportion is strictly greater than pmt, GDL_FAILURE otherwise.
 */
static int
_check_missing (_gdl_hview * wh, size_t i, double pmt)
{
	gdl_clustering * gclust = wh->h->guniq;
	size_t ii, nm;
	double pm;
	
	ii = gdl_clustering_clust_idx (gclust, wh->aidx[i]);
	nm = GDL_GVIEW_ACCESSION_MISSING_SIZE (wh->h->data, wh->h->mask, ii);
	pm = nm/(double)(wh->nl*wh->np);
	
	if (!(pm > pmt))
	{
		return GDL_FAILURE;
	}
	
	printf ("PROP MISSING %g\n", pm);
	
	return GDL_SUCCESS;
}

/*
 * Runs one resolution pass over all wh->na accessions and builds the
 * hview from the result.
 *
 * i walks the accessions; j counts the ambiguous ones met so far and
 * indexes the per-ambiguous-accession arrays (wh->aidx, wh->nas,
 * wh->lidx), so it has to advance once for every flagged accession.
 *
 *   - a non-ambiguous accession is extracted directly;
 *   - an ambiguous accession has all its configurations enumerated and
 *     its wh->isa flag cleared;
 *   - an ambiguous accession with too many configurations is deferred
 *     (its flag stays set) when _check_missing accepts it, otherwise
 *     the whole pass is aborted.
 *
 * pmt is the missing-data threshold forwarded to _check_missing.
 *
 * Returns GDL_SUCCESS, or GDL_FAILURE when the pass is aborted; in that
 * case the message is left in wh->error by _check_underflow.
 */
static int
_resolve_ambiguous (_gdl_hview * wh, double pmt)
{
	size_t i, j;
	
	_init_extract_haplotypes (wh);
	
	for (j = i = 0; i < wh->na; i++)
	{
		if (!wh->isa[i])
		{
			_extract_haplotypes (wh, i, 0);
			continue;
		}
		
		_init_extract_genotypes (wh, j);
		
		_extract_genotypes (wh, j);
		
		if (_check_underflow (wh, j) != GDL_SUCCESS)
		{
			// test prop of missing data sites
			// if prop < thresh then exit
			// else keep in for the end...
			if (_check_missing (wh, j, pmt) != GDL_SUCCESS)
			{
				_clean_extract_genotypes (wh, j);
				_clean_extract_haplotypes (wh);
				return GDL_FAILURE;
			}
			
			/*
			 * Deferred accession: its per-accession buffers must still
			 * be released, and j must move on so that the next flagged
			 * accession uses its own entries.
			 *
			 * NOTE(review): wh->error still holds the message written
			 * by _check_underflow at this point.
			 */
			_clean_extract_genotypes (wh, j);
			j++;
			continue;
		}
		
		_path_in_genotypes (wh, j);
		
		_clean_extract_genotypes (wh, j);
		
		wh->isa[i] = gdl_false;
		
		j++;
	}
	
	_build_hview (wh);
	
	_clean_extract_haplotypes (wh);
	
	return GDL_SUCCESS;
}
