/*
 *  genex/gene.c
 * 
 *  $Author: baptiste $, $Date: 2008-05-13 15:33:52 $, $Version$
 *  
 *  Libgdl : a C library for statistical genetics
 * 
 *  Copyright (C) 2003-2006  Jean-Baptiste Veyrieras, INRA, France.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
#include <gdl/gdl_common.h>
#include <gdl/gdl_errno.h>
#include <gdl/gdl_string.h>
#include <gdl/gdl_io.h>
#include <gdl/gdl_hash.h>
#include <gdl/gdl_list.h>
#include <gdl/gdl_statistics_double.h>
#include <gdl/gdl_genex_chromosome.h>

/*
 * Consumes the two tokens that describe one gene entry on an annotation
 * line and advances the cursor (I, J) past them.
 *
 * The first token is returned through NAME; the caller is in charge of
 * releasing it or handing it over to another owner. The second token is
 * read only to move the cursor forward and is released right away (it
 * used to be leaked on every call).
 *
 * NOTE: ID, START, END and STRAND belong to the signature but are NOT
 * written by this function. The numeric parsing that used to fill them is
 * disabled, so callers must initialise these variables themselves before
 * reading them.
 */
static void
gdl_genex_gene_parse (const gdl_string * line,
                      const size_t n,
                      size_t * i,
                      size_t * j,
                      gdl_string ** name,
                      long * id,
                      long * start,
                      long * end,
                      unsigned char * strand)
{
	gdl_string * skipped;

	// gene name: ownership goes to the caller
	*name = gdl_string_next_token (line, n, i, j);
	// second field: currently ignored, but it must still be consumed
	skipped = gdl_string_next_token (line, n, i, j);
	if (skipped)
	{
		gdl_string_free (skipped);
	}
}

/*
 * Converts a comma-separated list of integers (e.g. "100,250,900") into a
 * newly allocated array of long.
 *
 * tok  : zero-terminated list; must not be NULL.
 * npos : receives the number of entries, i.e. the number of commas plus
 *        one. An empty field (including an empty TOK) yields the value 0.
 *
 * Returns the array obtained from GDL_MALLOC; the caller owns it.
 */
static long *
_get_probe_positions (const gdl_string * tok, size_t * npos)
{
	size_t i, field, count, len = strlen (tok);
	long * pos;

	// one value per comma-delimited field
	for (count = 1, i = 0; i < len; i++)
	{
		if (tok[i] == ',')
		{
			count++;
		}
	}
	*npos = count;
	pos   = GDL_MALLOC (long, count);

	// atol() stops at the first character that cannot belong to a number,
	// hence at the next ',' or at the terminating zero: every field can be
	// converted in place, without copying it into a temporary string.
	pos[0] = atol (tok);
	for (field = 1, i = 0; i < len; i++)
	{
		if (tok[i] == ',')
		{
			pos[field++] = atol (tok + i + 1);
		}
	}

	return pos;
}

/*
 * Reads the probe description at the beginning of an annotation line and
 * builds the corresponding probe.
 *
 * Three tokens are consumed from LINE, moving the cursor (I, J) forward:
 *   1. the probe name (passed as is to gdl_genex_probe_alloc),
 *   2. a comma-separated list of start positions,
 *   3. a comma-separated list of end positions.
 *
 * The strand is not read from the line; the probe is created with '?'.
 *
 * The number of positions given to gdl_genex_probe_alloc is the smaller of
 * the two list lengths, so that a line whose start and end lists disagree
 * can never make the probe refer to entries beyond the shorter array.
 */
static gdl_genex_probe *
gdl_genex_probe_parse (const gdl_string * line, const size_t n, size_t * i, size_t * j)
{
	size_t nstart, nend;
	gdl_string * name, * tok;
	long * starts, * ends;

	// probe name
	name = gdl_string_next_token (line, n, i, j);
	// starts
	tok    = gdl_string_next_token (line, n, i, j);
	starts = _get_probe_positions (tok, &nstart);
	gdl_string_free (tok);
	// ends
	tok  = gdl_string_next_token (line, n, i, j);
	ends = _get_probe_positions (tok, &nend);
	gdl_string_free (tok);

	return gdl_genex_probe_alloc (name, '?', starts, ends, (nstart < nend) ? nstart : nend);
}

/*
 * Reads a probe annotation from STREAM and registers it into CHROM.
 *
 * Each line starts with a probe description (see gdl_genex_probe_parse)
 * followed by any number of gene entries (see gdl_genex_gene_parse). For
 * every gene entry:
 *   - the gene is looked up by name in CHROM and created if missing;
 *   - a block matching (start, end, probe strand) is looked up in an
 *     already known gene and created if missing;
 *   - the probe is attached to that block.
 *
 * gdl_genex_gene_parse does not fill in the id, start and end values, so
 * they are initialised to 0 here instead of being read while still
 * indeterminate.
 *
 * Returns GDL_SUCCESS once the stream is exhausted, or GDL_EINVAL when
 * STREAM or CHROM is NULL.
 */
int
gdl_genex_chromosome_fscanf (FILE * stream, gdl_genex_chromosome * chrom)
{
	if (stream && chrom)
	{
		size_t i, j, n;
		gdl_string * name, * line=0;
		long id=0, start=0, end=0;
		unsigned char strand='?';
		gdl_genex_probe * probe;
		gdl_genex_gene  * gene;
		gdl_genex_block * block;
		
		while (gdl_getline (&line, &n, stream) != -1)
		{
			i=j=0;
			probe = gdl_genex_probe_parse (line, n, &i, &j);
			gdl_genex_chromosome_add_probe (chrom, probe);
			// remaining tokens: the genes the probe belongs to
			while (j < n)
			{
				gdl_genex_gene_parse (line, n, &i, &j, &name, &id, &start, &end, &strand);
				gene  = gdl_genex_chromosome_search_gene (chrom, name);
				block = 0;
				if (!gene)
				{
					// new gene: the name string is handed over to it
					gene = gdl_genex_gene_alloc (name, id);
					gdl_genex_chromosome_add_gene (chrom, gene);
				}
				else
				{
					block = gdl_genex_gene_search_block (gene, start, end, probe->strand);
					// known gene: the freshly parsed name is not needed
					gdl_string_free (name);
				}
				if (!block)
				{
					block = gdl_genex_block_alloc (start, end, probe->strand);
					gdl_genex_gene_add_block (gene, block);
				}
				gdl_genex_block_add_probe (block, probe);
			}
			
			GDL_FREE (line);
			line=0;	
		}
		
		return GDL_SUCCESS;
	}
	return GDL_EINVAL;
}

/*
 * Reads the expression values of population POP from STREAM and stores
 * them into the probes of CHROM.
 *
 * Every line holds a probe name followed by one value per individual. The
 * first line is read once to count the individuals (number of tokens minus
 * the name), then the stream is rewound to where it was on entry and all
 * lines are processed. Lines whose probe name is unknown to CHROM are
 * skipped. For each known probe the mean and the variance over the
 * individuals are computed once its line has been read.
 *
 * Safety checks:
 *   - an unreadable first line or a failed rewind returns GDL_EINVAL
 *     instead of working on indeterminate data;
 *   - a first line without any token gives a population size of 0 rather
 *     than wrapping around;
 *   - values beyond the number of individuals found on the first line are
 *     ignored (the per-probe array is presumably sized by
 *     gdl_genex_chromosome_set_popsize -- TODO confirm);
 *   - a line that yields no token at all does not touch any probe.
 *
 * Returns GDL_SUCCESS once the stream is exhausted, GDL_EINVAL when STREAM
 * or CHROM is NULL or when one of the failures above occurs.
 */
int
gdl_genex_chromosome_fscanf_expr (FILE * stream, const size_t pop, gdl_genex_chromosome * chrom)
{
	if (stream && chrom)
	{
		size_t i, j, k, n, m;
		gdl_string * tok, * line=0;
		gdl_genex_probe * probe;
		long offset;
		
		offset=ftell (stream);
		// count the number of indiv using the first line
		if (gdl_getline (&line, &n, stream) == -1)
		{
			return GDL_EINVAL;
		}
		for(i=j=m=0;j<n;m++)
		{
			gdl_string_free (gdl_string_next_token (line, n, &i, &j));
		}
		gdl_string_free (line);line=0;
		// go back so that the first line is processed like the others
		if (fseek (stream, offset, SEEK_SET) != 0)
		{
			return GDL_EINVAL;
		}
		// the first token is the probe name, not an individual
		if (m)
		{
			m--;
		}
		gdl_genex_chromosome_set_popsize (chrom, pop, m);
		
		while (gdl_getline (&line, &n, stream) != -1)
		{
			// never reuse the probe of a previous line
			probe = 0;
			for(i=j=k=0;j<n;k++)
			{
				tok = gdl_string_next_token (line, n, &i, &j);
				if (!k)
				{
					probe = gdl_genex_chromosome_search_probe (chrom, tok);
					if (!probe)
					{
						// unknown probe: skip the whole line
						gdl_string_free (tok);
						break;
					}
				}
				else if (k <= m)
				{
					probe->data[pop][k-1] = (double)atof(tok);
				}
				gdl_string_free (tok);
			}
			if (probe)
			{
				// compute the mean and the variance
				probe->mean[pop] = gdl_stats_mean (probe->data[pop], 1, m);
				probe->var[pop]  = gdl_stats_variance_with_fixed_mean (probe->data[pop], 1, m, probe->mean[pop]);
			}
			GDL_FREE (line);
			line=0;
		}
		
		return GDL_SUCCESS;
	}
	return GDL_EINVAL;
}

/*
 * Converts a comma-separated list of exon coordinates into a newly
 * allocated array of EXONCOUNT long values.
 *
 * token     : zero-terminated list, with or without a trailing comma
 *             ("10,20," and "10,20" give the same result); must not be
 *             NULL.
 * exonCount : number of entries of the returned array. Fields in excess
 *             are ignored; entries for which TOKEN provides no field are
 *             set to 0.
 *
 * Returns the array obtained from GDL_MALLOC; the caller owns it.
 */
static long *
_exon_split (const gdl_string * token, const size_t exonCount)
{
	size_t i, n, first, len;
	long * pos;

	len = strlen (token);
	pos = GDL_MALLOC (long, exonCount);
	if (!pos)
	{
		return pos;
	}

	// entries that are not matched by a field must hold a defined value
	for (n = 0; n < exonCount; n++)
	{
		pos[n] = 0;
	}

	// atol() stops at the ',' closing the field (or at the end of the
	// string), so each field is converted in place.
	for (n = first = i = 0; n < exonCount && i < len; i++)
	{
		if (token[i] == ',')
		{
			pos[n++] = atol (token + first);
			first    = i + 1;
		}
	}
	// last field when the list is not closed by a comma
	if (n < exonCount && first < len)
	{
		pos[n] = atol (token + first);
	}

	return pos;
}

/*
 * Reads a RefSeq-like gene table from STREAM and fills in the genes of
 * CHROM.
 *
 * Each line is expected to hold, in this order: gene name, strand,
 * txStart, txEnd, cdsStart, cdsEnd, exonCount, exonStarts, exonEnds (the
 * last two being comma-separated lists).
 *
 * The gene is looked up by name in CHROM:
 *   - found: its fields are overwritten with the values of the line;
 *   - not found and FORCE is set: the gene is created, added to CHROM and
 *     then filled in;
 *   - not found and FORCE is not set: the line is skipped.
 * Whenever FORCE is set, a block covering [txStart, txEnd] on the gene
 * strand is also added to the gene.
 *
 * The name token is handed over to gdl_genex_gene_alloc when a gene is
 * created (same convention as in gdl_genex_chromosome_fscanf) and is
 * released here in every other case; it used to be leaked whenever the
 * gene already existed.
 *
 * Returns GDL_SUCCESS once the stream is exhausted, or GDL_EINVAL when
 * STREAM or CHROM is NULL.
 */
int
gdl_genex_chromosome_fscanf_refseq (FILE * stream,
                                    gdl_genex_chromosome * chrom,
                                    gdl_boolean force)
{
	if (stream && chrom)
	{
		size_t i, j, n;
		gdl_string * name, * tok, * line=0;
		gdl_genex_gene * gene;
		
		while (gdl_getline (&line, &n, stream) != -1)
		{
			i=j=0;
			name = gdl_string_next_token (line, n, &i, &j);
			gene = gdl_genex_chromosome_search_gene (chrom, name);
			if (force && !gene)
			{
				gene  = gdl_genex_gene_alloc (name, 0);
				gdl_genex_chromosome_add_gene (chrom, gene);
				// the new gene keeps the name string
				name = 0;
			}
			if (name)
			{
				gdl_string_free (name);
			}
			if (gene)
			{
				// strand
				tok = gdl_string_next_token (line, n, &i, &j);
				gene->strand = (unsigned char) tok[0];
				gdl_string_free (tok);
				// txStart
				tok = gdl_string_next_token (line, n, &i, &j);
				gene->txStart = atol(tok);
				gdl_string_free (tok);
				// txEnd
				tok = gdl_string_next_token (line, n, &i, &j);
				gene->txEnd = atol(tok);
				gdl_string_free (tok);
				// cdsStart
				tok = gdl_string_next_token (line, n, &i, &j);
				gene->cdsStart = atol(tok);
				gdl_string_free (tok);
				// cdsEnd
				tok = gdl_string_next_token (line, n, &i, &j);
				gene->cdsEnd = atol(tok);
				gdl_string_free (tok);
				// exonCount
				tok = gdl_string_next_token (line, n, &i, &j);
				gene->exonCount = atoi(tok);
				gdl_string_free (tok);
				// exonStarts
				tok = gdl_string_next_token (line, n, &i, &j);
				gene->exonStarts = _exon_split (tok, gene->exonCount);
				gdl_string_free (tok);
				// exonEnds
				tok = gdl_string_next_token (line, n, &i, &j);
				gene->exonEnds = _exon_split (tok, gene->exonCount);
				gdl_string_free (tok);
				if (force)
				{
					gdl_genex_block * block = gdl_genex_block_alloc (gene->txStart, gene->txEnd, gene->strand);
					gdl_genex_gene_add_block (gene, block);
				}
			}
			GDL_FREE (line);
			line=0;
		}
		
		return GDL_SUCCESS;
	}
	return GDL_EINVAL;
}
