/*
 *  eqtl/info.c 
 * 
 *  $Author: baptiste $, $Date: 2008-05-13 15:33:47 $, $Version$
 *  
 *  Libgdl : a C library for statistical genetics
 * 
 *  Copyright (C) 2003-2006  Jean-Baptiste Veyrieras, INRA, France.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA * 
 */

#include <stdio.h>

#include <gdl/gdl_common.h>
#include <gdl/gdl_errno.h>
#include <gdl/gdl_string.h>
#include <gdl/gdl_matrix.h>
#include <gdl/gdl_vector.h>
#include <gdl/gdl_eqtl_genome.h>

/**
 * Allocate a sample-info record describing npop populations.
 *
 * pop_sizes[k] is the number of samples of population k.  The record keeps
 * its own copy of these sizes, the index of the first sample of every
 * population (cumpop_sizes) and the total number of samples (nsample).
 * The population/sample name arrays and the sample -> population index
 * (popx) are allocated with GDL_CALLOC and left for the caller to fill.
 *
 * The result is released with gdl_eqtl_sample_info_free().
 */
gdl_eqtl_sample_info *
gdl_eqtl_sample_info_alloc (const size_t npop, const size_t pop_sizes[])
{
	size_t k, total = 0;
	gdl_eqtl_sample_info * info = GDL_CALLOC (gdl_eqtl_sample_info, 1);

	info->npop         = npop;
	info->pop_names    = GDL_CALLOC (gdl_string *, npop);
	info->pop_sizes    = GDL_MALLOC (size_t, npop);
	info->cumpop_sizes = GDL_CALLOC (size_t, npop);

	// One pass: copy the size, record the running offset, grow the total.
	for (k = 0; k < npop; k++)
	{
		info->pop_sizes[k]    = pop_sizes[k];
		info->cumpop_sizes[k] = total;
		total                += pop_sizes[k];
	}
	info->nsample = total;

	info->sample_names = GDL_CALLOC (gdl_string *, total);
	info->popx         = GDL_CALLOC (size_t, total);

	return info;
}

/**
 * Release a sample-info record together with every array it owns.
 *
 * A null p is ignored.  The record may be only partially built: counts
 * such as nfactor can be set before the arrays they describe exist (for
 * instance when gdl_eqtl_sample_info_fscanf stops on an error after the
 * first data row), so every array of pointers is checked before it is
 * walked.
 */
void
gdl_eqtl_sample_info_free (gdl_eqtl_sample_info * p)
{
	size_t i, j;

	if (!p)
	{
		return;
	}

	GDL_FREE (p->pop_sizes);
	GDL_FREE (p->cumpop_sizes);
	GDL_FREE (p->popx);

	if (p->pop_names)
	{
		for (i = 0; i < p->npop; i++)
		{
			GDL_FREE (p->pop_names[i]);
		}
		GDL_FREE (p->pop_names);
	}
	if (p->sample_names)
	{
		for (i = 0; i < p->nsample; i++)
		{
			GDL_FREE (p->sample_names[i]);
		}
		GDL_FREE (p->sample_names);
	}
	if (p->var_names)
	{
		for (i = 0; i < p->nvar; i++)
		{
			GDL_FREE (p->var_names[i]);
		}
		GDL_FREE (p->var_names);
	}
	GDL_FREE (p->var_types);

	// The matrices are only allocated when the matching count is non-zero,
	// and possibly not at all if parsing was interrupted.
	if (p->factors)
	{
		GDL_MATRIX_FREE (p->factors, p->nfactor);
	}
	if (p->vectors)
	{
		GDL_MATRIX_FREE (p->vectors, p->nvector);
	}

	if (p->levels)
	{
		for (i = 0; i < p->nfactor; i++)
		{
			// The labels of factor i can only be walked when both the
			// label array and its length are available.
			if (p->levels[i] && p->level_by_factor)
			{
				for (j = 0; j < p->level_by_factor[i]; j++)
				{
					GDL_FREE (p->levels[i][j]);
				}
			}
			GDL_FREE (p->levels[i]);
		}
		GDL_FREE (p->levels);
	}
	GDL_FREE (p->level_by_factor);
	GDL_FREE (p);
}

/**
 * Tell whether a token is written as a decimal floating-point number.
 *
 * Accepted form: an optional leading '+' or '-', then digits containing
 * exactly one '.', with at least one digit overall ("0.3", "-1.", ".5").
 * Tokens without a decimal point (plain integers such as "3") are NOT
 * floats; gdl_eqtl_sample_info_fscanf in this file classifies such columns
 * as factors.  A null token, an empty token and a lone "." are rejected.
 */
static gdl_boolean
_is_float (const gdl_string * tok)
{
	size_t i = 0, ndigit = 0, ndot = 0;

	if (tok == 0)
	{
		return gdl_false;
	}
	if (tok[i] == '+' || tok[i] == '-')
	{
		i++;
	}
	for (; tok[i] != '\0'; i++)
	{
		// <ctype.h> functions require a value representable as unsigned char.
		if (isdigit ((unsigned char) tok[i]))
		{
			ndigit++;
		}
		else if (tok[i] == '.' && ndot == 0)
		{
			ndot = 1;
		}
		else
		{
			return gdl_false;
		}
	}
	return (ndot == 1 && ndigit > 0) ? gdl_true : gdl_false;
}
/**
 * Read a sample description table from stream into p.
 *
 * WARNING: within each population the samples MUST appear in the same
 * order as the individuals in the .exp and .snp files, and the
 * populations MUST appear in the same order as in the configuration file.
 *
 * Expected layout (one header line, then one line per sample):
 *
 * population sample sexe V2 V3 ...
 * CEU NA0001 male 0.3 yes ...
 * YRI NA... 
 *
 * Header: the first two column names are discarded; the remaining ones
 * become p->var_names and must be unique.
 *
 * Data line l (1-based): the first token is the population name, the
 * second the sample name (stored in p->sample_names[l-1]).  The population
 * index stored in p->popx[l-1] advances each time the population name
 * differs from the current one.  The remaining tokens are the variables.
 *
 * Variable types are decided once, from the first data line, with
 * _is_float(): 'v' columns are parsed with atof() into a row of
 * p->vectors, every other column is an 'f' factor whose values are coded
 * 0,1,2,... in order of first appearance into a row of p->factors.  The
 * labels of the codes are stored in p->levels once the stream is exhausted.
 *
 * p is indexed through pop_names, popx and sample_names without any
 * allocation here, so these arrays must already exist with npop / nsample
 * entries (as gdl_eqtl_sample_info_alloc provides).
 *
 * Returns GDL_SUCCESS.  Duplicate column names, more data lines than
 * p->nsample, or more populations than p->npop are reported through
 * GDL_ERROR_VAL with GDL_EINVAL.
 *
 * NOTE(review): the error paths do not release line, the current token or
 * the hashtables; the cleanup expected around GDL_ERROR_VAL is not visible
 * from this file - confirm.
 * NOTE(review): the header tokens are stored in the hashtable with owner
 * flag 0 and never freed here; this looks like a leak unless the table
 * takes ownership of its keys - confirm against gdl_hashtable.
 */
int
gdl_eqtl_sample_info_fscanf (FILE * stream, gdl_eqtl_sample_info * p)
{
	size_t i,ii,j,jj,n,k,l=0,popx=0,fx,vx,*lxx;
	gdl_string * line = 0, * tok;
	gdl_hashtable * columns;      // header names, used to detect duplicates
	gdl_hashtable * codes;        // level label -> code, for one factor
	gdl_hashtable * _levels = 0;  // factor name -> its codes table
	
	while (gdl_getline (&line, &n, stream) != -1)
	{
		i = j = 0;
		if (l == 0) // header
		{
			// Allocated here rather than at function entry so that an
			// empty stream does not leak the table.
			columns = gdl_hashtable_alloc (gdl_hash_default, 0);
			// skip 2 first column names (pop + sample)
			for(k = 0; k < 2; k++)
			{
				tok = gdl_string_next_token (line, n, &i, &j);
				gdl_string_free (tok);
			}
			ii=i;jj=j;
			// First pass: count the variables and reject duplicates.
			while (j != n)
			{
				tok = gdl_string_next_token (line, n, &i, &j);
				if (gdl_hashtable_lookup (columns, tok))
				{
					GDL_ERROR_VAL (gdl_string_sprintf ("Two columns share the same name %s\n", tok), GDL_EINVAL, GDL_EINVAL);
				}
				gdl_hashtable_add (columns, tok, tok, 0);  
			}
			if ((p->nvar=gdl_hashtable_size (columns)) > 0)
			{
				p->var_names = GDL_MALLOC (gdl_string *, p->nvar);
				// Second pass over the same tokens: store the names in
				// column order.
				i=ii;j=jj;
				for(k = 0; k < p->nvar; k++)
				{
					tok = gdl_string_next_token (line, n, &i, &j);
					p->var_names[k] = gdl_string_clone (tok);
					gdl_string_free (tok);		   
				}
			}
			gdl_hashtable_free (columns);
		}
		else if (l > p->nsample) 
		{
			GDL_ERROR_VAL ("There are more lines than samples", GDL_EINVAL, GDL_EINVAL);
		}
		else
		{
			tok = gdl_string_next_token (line, n, &i, &j);
			// pop name: a new population starts when no name is recorded
			// yet for the current slot or when the name changes.
			if (p->pop_names[popx]==0 || strcmp(p->pop_names[popx], tok))
			{
				if (p->pop_names[popx])
				{
					// Moving to the next slot must stay inside pop_names.
					if (popx + 1 >= p->npop)
					{
						GDL_ERROR_VAL ("There are more populations than expected", GDL_EINVAL, GDL_EINVAL);
					}
					popx++;
				}
				p->pop_names[popx] = gdl_string_clone (tok);
			}
			gdl_string_free (tok);
			// population index
			p->popx[l-1] = popx;
			// sample name (the token itself is kept, not a copy)
			tok = gdl_string_next_token (line, n, &i, &j);
			p->sample_names[l-1] = tok;
			ii = i;jj = j;
			if (p->nvar)
			{
				if (l == 1) // determine the variable type
				{
					p->var_types = GDL_CALLOC (unsigned char, p->nvar);
					p->nvector   = 0;
					p->nfactor   = 0;
					for(k = 0; k < p->nvar; k++)
					{
						tok = gdl_string_next_token (line, n, &i, &j);
						if (_is_float (tok))
						{
							p->var_types[k]='v';
							(p->nvector)++;
						}
						else
						{
							p->var_types[k]='f';
							(p->nfactor)++;
						}
						gdl_string_free (tok);
					}
					if (p->nfactor)
					{
						_levels    = gdl_hashtable_alloc (gdl_hash_default, p->nfactor);
						p->factors = GDL_MATRIX_ALLOC (size_t, p->nfactor, p->nsample);
					}
					if (p->nvector)
						p->vectors =  GDL_MATRIX_ALLOC (double, p->nvector, p->nsample);
					// Rewind to the first variable so the values of this
					// line are read by the loop below.
					i=ii;j=jj;
				}
				for(fx = vx = k = 0; k < p->nvar; k++)
				{
					tok = gdl_string_next_token (line, n, &i, &j);
					switch(p->var_types[k])
					{
						case 'f':
							codes = gdl_hashtable_lookup(_levels, p->var_names[k]);
							if (!codes)
							{
								codes = gdl_hashtable_alloc (gdl_interface_uint, 0);
								gdl_hashtable_add (_levels, p->var_names[k], codes, 0);
							}
							if ((lxx=gdl_hashtable_lookup (codes, tok))==0)
							{
								// New label: its code is the number of
								// labels seen so far for this factor.
								lxx = GDL_MALLOC (size_t, 1);
								*lxx = gdl_hashtable_size (codes);
								gdl_hashtable_add (codes, tok, lxx, 1);
							}
							p->factors[fx][l-1] = *lxx;
							(fx)++;
							break;
						case 'v':
							p->vectors[vx][l-1] = (double)atof(tok);
							(vx)++;
							break;
					}
					gdl_string_free (tok);
				}	
			}
		}
		// end of line
		gdl_string_free (line);
		line=0;
		l++;
	}
	// Create the level matrix for factors
	if (p->nfactor && _levels)
	{
		p->level_by_factor = GDL_MALLOC (size_t, p->nfactor);
		p->levels = GDL_MALLOC (gdl_string **, p->nfactor);
		for(fx = k = 0; k < p->nvar; k++)
		{
			gdl_hashtable_itr * itr;
			
			if (p->var_types[k] != 'f')
				continue;
			codes = gdl_hashtable_lookup (_levels, p->var_names[k]);
			p->level_by_factor[fx] = gdl_hashtable_size (codes);
			p->levels[fx]          = GDL_MALLOC (gdl_string *, p->level_by_factor[fx]); 
			itr = gdl_hashtable_iterator (codes);
			do
			{
				// The stored value is the code, i.e. the slot of the label.
				size_t * itmp = (size_t *) gdl_hashtable_iterator_value (itr);
				p->levels[fx][*itmp] = gdl_string_clone (gdl_hashtable_iterator_key (itr));
			}
			while(gdl_hashtable_iterator_next (itr));
			gdl_hashtable_iterator_free (itr);
			gdl_hashtable_free (codes);
			// Next factor goes to the next row, matching the row order
			// used for p->factors while parsing.
			fx++;
		}
		gdl_hashtable_free (_levels);
	}
	return GDL_SUCCESS;
}
 
int
gdl_eqtl_sample_info_fwrite (FILE * stream, const gdl_eqtl_sample_info * p)
{
	if (stream && p)
	{
		size_t i,j;
		int status;
		
		status = fwrite (&(p->npop), sizeof(size_t), 1, stream);
		GDL_FWRITE_STATUS (status, 1);
		status = fwrite (&(p->nsample), sizeof(size_t), 1, stream);
		GDL_FWRITE_STATUS (status, 1);
		status = fwrite (&(p->nvar), sizeof(size_t), 1, stream);
		GDL_FWRITE_STATUS (status, 1);
		status = fwrite (&(p->nfactor), sizeof(size_t), 1, stream);
		GDL_FWRITE_STATUS (status, 1);
		status = fwrite (&(p->nvector), sizeof(size_t), 1, stream);
		GDL_FWRITE_STATUS (status, 1);
		
		status = fwrite (p->pop_sizes, sizeof(size_t), p->npop, stream);
		GDL_FWRITE_STATUS (status, p->npop);
		status = fwrite (p->cumpop_sizes, sizeof(size_t), p->npop, stream);
		GDL_FWRITE_STATUS (status, p->npop);
		status = fwrite (p->popx, sizeof(size_t), p->nsample, stream);
		GDL_FWRITE_STATUS (status, p->nsample);
		
		for(i = 0; i < p->npop; i++)
		{
			status = gdl_string_fwrite (stream, p->pop_names[i]);
			GDL_FWRITE_STATUS (status, GDL_SUCCESS);
		}
		for(i = 0; i < p->nsample; i++)
		{
			status = gdl_string_fwrite (stream, p->sample_names[i]);
			GDL_FWRITE_STATUS (status, GDL_SUCCESS);
		}
		if (p->nvar)
		{
			status = fwrite (p->var_types, sizeof(unsigned char), p->nvar, stream);
			GDL_FWRITE_STATUS (status, p->nvar);
			for(i = 0; i < p->nvar; i++)
			{
				status = gdl_string_fwrite (stream, p->var_names[i]);
				GDL_FWRITE_STATUS (status, GDL_SUCCESS);
			}
			if (p->nfactor)
			{
				status = fwrite (p->level_by_factor, sizeof(size_t), p->nfactor, stream);
				GDL_FWRITE_STATUS (status, p->nfactor);
				for(i = 0; i < p->nfactor; i++)
				{
					status = fwrite (p->factors[i], sizeof(size_t), p->nsample, stream);
					GDL_FWRITE_STATUS (status, p->nsample);
				}
				for(i = 0; i < p->nfactor; i++)
				{
					for(j = 0; j < p->level_by_factor[i]; j++)
					{
						status = gdl_string_fwrite (stream, p->levels[i][j]);
						GDL_FWRITE_STATUS (status, GDL_SUCCESS);
					}
				}				
			}
			for(i = 0; i < p->nvector; i++)
			{
				status = fwrite (p->vectors[i], sizeof(double), p->nsample, stream);
				GDL_FWRITE_STATUS (status, p->nsample);
			}
		}		
		return GDL_SUCCESS;
	}
	return GDL_EINVAL;
}

/**
 * Read back a sample-info record written by gdl_eqtl_sample_info_fwrite.
 *
 * The stream is consumed in the order used by the writer: the five
 * dimensions, the population layout (pop_sizes, cumpop_sizes, popx), the
 * population and sample names, then - when nvar != 0 - the variable types
 * and names, the factor tables (when nfactor != 0) and the vector rows
 * (when nvector != 0).
 *
 * Returns a newly allocated record, or 0 when stream is null.  Every read
 * is checked with GDL_FREAD_STATUS.
 */
gdl_eqtl_sample_info *
gdl_eqtl_sample_info_fread (FILE * stream)
{
	size_t a, b;
	int rc;
	gdl_eqtl_sample_info * info;

	if (!stream)
	{
		return 0;
	}

	info = GDL_CALLOC (gdl_eqtl_sample_info, 1);

	// Dimensions
	rc = fread (&(info->npop), sizeof(size_t), 1, stream);
	GDL_FREAD_STATUS (rc, 1);
	rc = fread (&(info->nsample), sizeof(size_t), 1, stream);
	GDL_FREAD_STATUS (rc, 1);
	rc = fread (&(info->nvar), sizeof(size_t), 1, stream);
	GDL_FREAD_STATUS (rc, 1);
	rc = fread (&(info->nfactor), sizeof(size_t), 1, stream);
	GDL_FREAD_STATUS (rc, 1);
	rc = fread (&(info->nvector), sizeof(size_t), 1, stream);
	GDL_FREAD_STATUS (rc, 1);

	// Storage sized from the dimensions just read
	info->pop_sizes    = GDL_MALLOC (size_t, info->npop);
	info->cumpop_sizes = GDL_MALLOC (size_t, info->npop);
	info->popx         = GDL_MALLOC (size_t, info->nsample);
	info->pop_names    = GDL_MALLOC (gdl_string *, info->npop);
	info->sample_names = GDL_MALLOC (gdl_string *, info->nsample);

	// Population layout
	rc = fread (info->pop_sizes, sizeof(size_t), info->npop, stream);
	GDL_FREAD_STATUS (rc, info->npop);
	rc = fread (info->cumpop_sizes, sizeof(size_t), info->npop, stream);
	GDL_FREAD_STATUS (rc, info->npop);
	rc = fread (info->popx, sizeof(size_t), info->nsample, stream);
	GDL_FREAD_STATUS (rc, info->nsample);

	// Names
	for (a = 0; a < info->npop; a++)
	{
		info->pop_names[a] = gdl_string_fread (stream);
		GDL_FREAD_STATUS (info->pop_names[a]!=0, 1);
	}
	for (a = 0; a < info->nsample; a++)
	{
		info->sample_names[a] = gdl_string_fread (stream);
		GDL_FREAD_STATUS (info->sample_names[a]!=0, 1);
	}

	// A record without variables ends here.
	if (!info->nvar)
	{
		return info;
	}

	info->var_types = GDL_MALLOC (unsigned char, info->nvar);
	info->var_names = GDL_MALLOC (gdl_string *, info->nvar);

	rc = fread (info->var_types, sizeof(unsigned char), info->nvar, stream);
	GDL_FREAD_STATUS (rc, info->nvar);
	for (a = 0; a < info->nvar; a++)
	{
		info->var_names[a] = gdl_string_fread (stream);
		GDL_FREAD_STATUS (info->var_names[a]!=0, 1);
	}

	if (info->nfactor)
	{
		info->level_by_factor = GDL_MALLOC (size_t, info->nfactor);
		info->factors = GDL_MATRIX_ALLOC (size_t, info->nfactor, info->nsample);
		info->levels  = GDL_MALLOC (gdl_string **, info->nfactor);

		rc = fread (info->level_by_factor, sizeof(size_t), info->nfactor, stream);
		GDL_FREAD_STATUS (rc, info->nfactor);

		// All the coded rows come first, then all the labels.
		for (a = 0; a < info->nfactor; a++)
		{
			rc = fread (info->factors[a], sizeof(size_t), info->nsample, stream);
			GDL_FREAD_STATUS (rc, info->nsample);
		}
		for (a = 0; a < info->nfactor; a++)
		{
			info->levels[a] = GDL_MALLOC (gdl_string *, info->level_by_factor[a]);
			for (b = 0; b < info->level_by_factor[a]; b++)
			{
				info->levels[a][b] = gdl_string_fread (stream);
				GDL_FREAD_STATUS (info->levels[a][b]!=0, 1);
			}
		}
	}

	if (info->nvector)
	{
		info->vectors = GDL_MATRIX_ALLOC (double, info->nvector, info->nsample);
		for (a = 0; a < info->nvector; a++)
		{
			rc = fread (info->vectors[a], sizeof(double), info->nsample, stream);
			GDL_FREAD_STATUS (rc, info->nsample);
		}
	}

	return info;
}

/**
 * Return the values of the variable called name as an array of doubles.
 *
 * The variables are searched in column order.  For a vector ('v') the
 * stored row is copied; for a factor ('f') each level code is converted
 * to double.
 *
 * Returns a newly allocated array of p->nsample values owned by the
 * caller, or 0 when no variable has that name.
 */
double * 
gdl_eqtl_sample_info_get_var (const gdl_eqtl_sample_info * p,
                              const gdl_string * name)
{
	size_t k, s;
	size_t frow = 0, vrow = 0;
	double * out;

	// Walk the columns until the name matches, counting how many factor
	// and vector columns precede it: these counts are the row indices
	// into p->factors and p->vectors.
	for (k = 0; k < p->nvar && strcmp (p->var_names[k], name) != 0; k++)
	{
		if (p->var_types[k] == 'f')
		{
			frow++;
		}
		else if (p->var_types[k] == 'v')
		{
			vrow++;
		}
	}
	if (k == p->nvar)
	{
		return 0;
	}

	out = GDL_MALLOC (double, p->nsample);
	if (p->var_types[k] == 'f')
	{
		for (s = 0; s < p->nsample; s++)
		{
			out[s] = (double) p->factors[frow][s];
		}
	}
	else if (p->var_types[k] == 'v')
	{
		memcpy (out, p->vectors[vrow], sizeof(double)*p->nsample);
	}
	return out;
}

/**
 * Return a copy of the level codes of the factor called name.
 *
 * On success the result is a newly allocated array of p->nsample codes
 * owned by the caller and *nz receives the number of levels of the factor.
 * When no variable has that name, or when the variable is not a factor,
 * 0 is returned and *nz is set to 0.
 */
size_t * 
gdl_eqtl_sample_info_get_factor (const gdl_eqtl_sample_info * p, const gdl_string * name, size_t *nz)
{
	size_t i, s, f = 0;
	size_t * x;
	
	(*nz) = 0;
	
	// Find the column; f counts the factor columns that precede it and is
	// therefore its row in p->factors.
	for(i = 0; i < p->nvar; i++)
	{
		if (!strcmp(p->var_names[i], name))
		{
			break;	
		}
		if (p->var_types[i] == 'f')
		{
			f++;
		}
	}
	// Every non-factor outcome returns explicitly, so control can no
	// longer run off the end of the function for an unexpected type code.
	if (i == p->nvar || p->var_types[i] != 'f')
	{
		return 0;
	}
	x = GDL_MALLOC (size_t, p->nsample);
	for(s = 0; s < p->nsample; s++)
	{
		x[s] = p->factors[f][s];
	}
	(*nz) = p->level_by_factor[f];
	return x;
}
