/*  
 *  dna/fasta.c
 * 
 *  $Author: baptiste $, $Date: 2008-05-13 15:33:52 $, $Version$
 *
 *  Libgdl : a C library for statistical genetics
 * 
 *  Copyright (C) 2003-2006  Jean-Baptiste Veyrieras, INRA, France.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <stdio.h>
#include <ctype.h>
 
#include <gdl/gdl_common.h>
#include <gdl/gdl_errno.h>
#include <gdl/gdl_io.h>
#include <gdl/gdl_util.h>
#include <gdl/gdl_string.h>
#include <gdl/gdl_hash.h>
#include <gdl/gdl_gpoint.h>
#include <gdl/gdl_gview.h>
#include <gdl/gdl_gmap.h>
#include <gdl/gdl_dna_reader.h>

/*
 * Release a line buffer with gdl_string_free() and reset the pointer to
 * NULL, so that the following gdl_getline (&line, ...) call starts from a
 * NULL pointer again.
 *
 * The expansion is two statements that carry their own semicolons: invoke
 * it without a trailing ';' and only inside a braced block (never as the
 * sole body of an un-braced if/else/loop).
 */
#define CLEAN(line) gdl_string_free (line);line=NULL;

/*
 * Extract the sequence identifier from a FASTA header line.
 *
 * The identifier is the run of characters that follows line[0] (the '>'
 * marker for every caller in this file; it is skipped without being
 * inspected) and stops at the first whitespace character or at index n,
 * whichever comes first.
 *
 * line : header line
 * n    : number of characters available in line
 *
 * Returns a newly allocated string holding the identifier (possibly
 * empty); the caller releases it with gdl_string_free().
 */
static gdl_string *
gdl_dna_reader_fasta_get_name (const gdl_string * line, size_t n)
{
	size_t end = 1;
	gdl_string * name;
	
	/* <ctype.h> classifiers are only defined for EOF and values that fit
	 * in an unsigned char, so a plain (possibly negative) char must be
	 * converted before the call. */
	while (end < n && !isspace ((unsigned char) line[end]))
	{
		end++;
	}
	
	name = gdl_string_alloc (end - 1);
	/* NOTE(review): strncpy() does not terminate the copy by itself; this
	 * relies on gdl_string_alloc() handing back a zero-filled buffer with
	 * room for the terminator - confirm against its implementation. */
	strncpy (name, &line[1], end - 1);
	
	return name;
}

/*
 * Scan the stream from its current position to EOF and index every FASTA
 * header line (a line whose first character is '>').
 *
 * Each sequence name is stored in a hashtable together with a heap
 * allocated long holding the ftell() offset recorded just before the
 * header line was read.
 *
 * reader : reader whose stream is scanned; the stream position is not
 *          restored by this function
 *
 * Returns the hashtable, or NULL after reporting through
 * gdl_dna_reader_error() when the same sequence name is met twice.
 */
static gdl_hashtable *
gdl_dna_reader_fasta_check_header (gdl_dna_reader * reader)
{
	int ch;
	long fpos, * tmp;
	size_t n;
	gdl_string    * name;
	gdl_string    * line   = NULL;
	gdl_hashtable * header;
		
	header = gdl_hashtable_alloc (gdl_interface_long, 0);
	
	fpos = ftell (reader->stream);
	ch   = gdl_getline (&line, &n, reader->stream);
	while (ch != EOF)
	{
		if (line[0] == '>')
		{
			name = gdl_dna_reader_fasta_get_name (line, n);
			if (gdl_hashtable_lookup (header, name))
			{
				/* Duplicate identifier: give up and release everything
				 * acquired so far. */
				gdl_dna_reader_error (reader, "Sequence [ %s ] is not unique\n", name);
				gdl_string_free (name);
				gdl_hashtable_free (header);
				CLEAN(line)
				return NULL;
			}
			tmp  = GDL_MALLOC (long, 1);
			*tmp = fpos;
			/* NOTE(review): the last argument (1) presumably hands the
			 * ownership of tmp over to the table - confirm in gdl_hash.h. */
			gdl_hashtable_add (header, name, tmp, 1);
			gdl_string_free (name);
		}
		CLEAN(line)
		/* Remember where the next line starts before consuming it. */
		fpos = ftell (reader->stream);
		ch   = gdl_getline (&line, &n, reader->stream);
	}
	
	return header;
}

/*
 * Measure the length of the first sequence found from the current stream
 * position.
 *
 * Lengths of non-header lines are accumulated until a header line ('>')
 * is met after at least one character has been counted, or until EOF.
 * The stream is then moved back to the position it had on entry.
 *
 * Returns the accumulated number of characters.
 */
static size_t
gdl_dna_reader_fasta_get_sequence_size (gdl_dna_reader * reader)
{
	gdl_string * buf = NULL;
	size_t len;
	size_t total = 0;
	long start;
	int is_header;
	
	start = ftell (reader->stream);
	
	while (gdl_getline (&buf, &len, reader->stream) != EOF)
	{
		is_header = (buf[0] == '>');
		if (is_header && total)
		{
			/* A header after some sequence data: the first record is over. */
			CLEAN(buf)
			break;
		}
		if (!is_header)
		{
			total += len;
		}
		CLEAN(buf)
	}
	
	/* Leave the stream where the caller had it. */
	fseek (reader->stream, start, SEEK_SET);
	
	return total;
}

/*
 * Validate one fragment of a sequence and fold it into the running
 * reference sequence / polymorphism counters.
 *
 * refseq : reference characters for the n positions covered by line;
 *          '\0' marks a position that has no reference yet
 * poly   : per-position counters, incremented each time a non-ambiguous
 *          character differs from the current reference
 * line   : fragment to check
 * n      : number of characters in line
 *
 * Returns GDL_SUCCESS, or GDL_EINVAL as soon as a character is rejected
 * by gdl_dna_reader_is_valid(); positions located before the offending
 * character have already been updated at that point.
 */
static int
gdl_dna_reader_fasta_check_sequence (gdl_string * refseq, size_t * poly, const gdl_string * line, const size_t n)
{
	size_t i;
	char c;
	
	for (i = 0; i < n; i++)
	{
		/* toupper() is only defined for EOF and unsigned char values, so
		 * the plain char is converted first. */
		c = (char) toupper ((unsigned char) line[i]);
		if (!gdl_dna_reader_is_valid (c))
		{
			return GDL_EINVAL;
		}
		if (gdl_dna_reader_is_ambiguous (c))
		{
			/* Ambiguous codes neither define nor contradict the reference. */
			continue;
		}
		if (refseq[i] == '\0')
		{
			refseq[i] = c;
		}
		else if (c != refseq[i])
		{
			(poly[i])++;
		}
		/* A gap always replaces the reference character at this position. */
		if (gdl_dna_reader_is_gap (c))
		{
			refseq[i] = c;
		}
	}
	
	return GDL_SUCCESS;
}

/*
 * Read every sequence from the current stream position, check that they
 * all have the same length and only contain accepted characters, and
 * build one gdl_dna_locus per polymorphic position.
 *
 * reader : reader whose stream is consumed up to EOF (the position is not
 *          restored here)
 * nl     : out, number of loci returned; only written once the whole
 *          stream has been read without error
 * length : out, length of the first sequence as measured by
 *          gdl_dna_reader_fasta_get_sequence_size()
 *
 * Returns a newly allocated array of *nl loci, or NULL after reporting
 * through gdl_dna_reader_error() when a sequence is too short, too long,
 * contains a rejected character, or when no position is polymorphic.
 */
static gdl_dna_locus **
gdl_dna_reader_fasta_check_sites (gdl_dna_reader * reader, size_t * nl, size_t * length)
{
	int ch;
	size_t i, j, n, ns, ons;
	gdl_string    * name = NULL;
	gdl_string    * line = NULL;
	gdl_string    * refseq;
	size_t        * poly;
	gdl_dna_locus ** sites = NULL;
	
	ns  = 0;
	*length = ons = gdl_dna_reader_fasta_get_sequence_size (reader);
	
	refseq = gdl_string_alloc (ons);
	poly   = GDL_CALLOC (size_t, ons);
	
	ch   = gdl_getline (&line, &n, reader->stream);
	while (ch != EOF)
	{
		if (line[0] == '>')
		{
			if (name)
			{
				/* A new header closes the previous sequence. Overflow is
				 * caught while reading, so a mismatch here means the
				 * sequence is too short. */
				if (ns != ons)
				{
					gdl_dna_reader_error (reader, "Sequence [ %s ] has less sites [ %lu ] than expected [ %lu ]", name, (unsigned long) ns, (unsigned long) ons);
					goto fail;
				}
				ns = 0;
				gdl_string_free (name);
			}
			name = gdl_dna_reader_fasta_get_name (line, n);
		}
		else
		{
			ns += n;
			if (ns > ons)
			{
				gdl_dna_reader_error (reader, "Sequence [ %s ] has more sites [ %lu ] than expected [ %lu ]", name, (unsigned long) ns, (unsigned long) ons);
				goto fail;
			}
			/* ns-n is the offset of this fragment inside the sequence. */
			if (gdl_dna_reader_fasta_check_sequence (&refseq[ns-n], &poly[ns-n], line, n) != GDL_SUCCESS)
			{
				gdl_dna_reader_error (reader, "Sequence [ %s ] contains not IUPAC characters in fragment:\n[ %s ]", name, line);
				goto fail;
			}
		}
		CLEAN(line)
		ch = gdl_getline (&line, &n, reader->stream);
	}
	
	/* The last sequence is closed by EOF rather than by a header, so it
	 * needs the same length check as the others. */
	if (name && ns != ons)
	{
		gdl_dna_reader_error (reader, "Sequence [ %s ] has less sites [ %lu ] than expected [ %lu ]", name, (unsigned long) ns, (unsigned long) ons);
		goto cleanup;
	}
			
	*nl = 0;
	for (i = 0; i < ns; i++)
	{
		if (poly[i])
		{
			(*nl)++;
		}
	}
	if (*nl)
	{
		sites = GDL_MALLOC (gdl_dna_locus *, *nl);
		for (j = i = 0; i < ns; i++)
		{
			if (poly[i])
			{
				sites[j] = gdl_dna_locus_alloc (i, refseq[i]);
				j++;
			}
		}
		/* NOTE(review): both helpers receive nl by address and so may
		 * change the number of loci - confirm in gdl_dna_reader. */
		if (reader->remove_indel)
		{
			gdl_dna_reader_remove_indel (refseq, sites, nl);
		}
		else if (reader->collapse_indel)
		{
			gdl_dna_reader_collapse_indel (refseq, poly, sites, nl);
		}
	}
	else
	{
		gdl_dna_reader_error (reader, "There is no polymorphic sites", name, line);
	}
	goto cleanup;

fail:
	/* Only reached from inside the read loop, where line still holds the
	 * line being processed. */
	CLEAN(line)
cleanup:
	if (name)
	{
		gdl_string_free (name);
	}
	gdl_string_free (refseq);
	GDL_FREE (poly);
	
	return sites;
}

/*
 * Register one locus per site in reader->gview and remember the returned
 * locus in sites[i]->locus.
 *
 * Loci are named after their 1-based rank in sites, prefixed by
 * reader->base when it is set.
 *
 * reader : reader owning the gview
 * sites  : array of nl loci descriptors
 * nl     : number of entries in sites
 */
static void
gdl_dna_reader_fasta_add_locus (gdl_dna_reader * reader, gdl_dna_locus ** sites, size_t nl)
{
	size_t i;
	unsigned long rank;
	gdl_string * name;
	
	for (i = 0; i < nl; i++)
	{
		/* size_t has no portable match with "%d": convert explicitly to
		 * the type named by the conversion specifier. */
		rank = (unsigned long) (i + 1);
		if (reader->base)
		{
			name = gdl_string_sprintf ("%s%lu", reader->base, rank);
		}
		else
		{
			name = gdl_string_sprintf ("%lu", rank);
		}
		sites[i]->locus = gdl_gview_add_locus (reader->gview, name);
		gdl_string_free (name);
	}
}

/*
 * Register every sequence name stored in header as an accession of
 * reader->gview.
 *
 * The key of the iterator is read before the iterator is advanced for the
 * first time, so the body runs at least once.
 */
static void
gdl_dna_reader_fasta_add_accession (gdl_dna_reader * reader, gdl_hashtable * header)
{
	const gdl_string * key;
	gdl_hashtable_itr * cursor = gdl_hashtable_iterator (header);
	
	for (;;)
	{
		key = gdl_hashtable_iterator_key (cursor);
		gdl_gview_add_accession (reader->gview, key);
		if (!gdl_hashtable_iterator_next (cursor))
		{
			break;
		}
	}
	
	gdl_hashtable_iterator_free (cursor);
}

/*
 * Build a gdl_gpoint_haplo data point holding a single value: the index
 * of the given genotype with a weight of 1.0.
 *
 * Returns the newly allocated point; the only caller in this file
 * releases it with gdl_gdatapoint_free().
 */
static gdl_gdatapoint *
gdl_dna_reader_fasta_genotype_point (const gdl_genotype * allele)
{
	gdl_gdatapoint * dp;
	
	dp = gdl_gdatapoint_alloc (gdl_gpoint_haplo, 1);
	
	/* One value set containing exactly one value. */
	dp->values[0]            = gdl_gvalues_alloc (1);
	dp->values[0]->values[0] = gdl_gvalue_alloc ();
	
	dp->values[0]->values[0]->value = 1.0;
	dp->values[0]->values[0]->idx   = allele->idx;
	
	return dp;
}

/*
 * Store, for one accession, the content of every site buffer into
 * reader->gview.
 *
 * For each site whose buffer holds no ambiguous character, an allele is
 * created from the buffer, wrapped into a genotype, registered on the
 * locus and recorded as the data point of (accession, locus). Sites with
 * an ambiguous buffer are skipped.
 *
 * reader    : reader owning the gview
 * accession : accession being filled
 * sites     : array of nl loci whose buffer holds the accession's alleles
 * nl        : number of entries in sites
 *
 * Returns GDL_SUCCESS.
 */
static int
gdl_dna_reader_fasta_fill_accession (gdl_dna_reader * reader, gdl_accession * accession, gdl_dna_locus ** sites, size_t nl)
{
	size_t i;
	gdl_allele   * allele;
	gdl_genotype * genotype;
	gdl_gdatapoint * point;
	
	for (i = 0; i < nl; i++)
	{
		if (gdl_dna_reader_has_ambiguous (sites[i]->buffer))
		{
			// TODO resolve - if possible - ambiguous...
			continue;
		}
		allele   = gdl_allele_new (sites[i]->buffer);
		genotype = gdl_genotype_alloc ();
		gdl_genotype_add (genotype, allele, 1);
		/* NOTE(review): genotype is passed by address, so the call may
		 * replace it (e.g. by an already registered equivalent) -
		 * confirm in gdl_locus. */
		gdl_locus_add_genotype (sites[i]->locus, &genotype, 1);
		point = gdl_dna_reader_fasta_genotype_point (genotype);
		gdl_gview_set_gdatapoint (reader->gview, accession, sites[i]->locus, point);
		gdl_gdatapoint_free (point);
	}
	
	/* The function is declared int: flowing off the end without a value
	 * would leave the result undefined for any caller that reads it. */
	return GDL_SUCCESS;
}

/*
 * Read the sequences from the current stream position and record, for
 * every accession, the characters found at the polymorphic positions.
 *
 * Accessions (from header) and loci (from sites) are first registered in
 * reader->gview and one buffer of npoly characters is allocated per
 * site. Sequence lines are then scanned in order: each position listed in
 * sites[i]->poly is copied, upper-cased, into sites[i]->buffer, and the
 * buffers are flushed into the gview whenever a new header (or EOF) closes
 * the current accession.
 *
 * reader : reader owning the stream and the gview
 * header : sequence names, as built by gdl_dna_reader_fasta_check_header()
 * sites  : array of nl loci
 * nl     : number of entries in sites
 *
 * Returns GDL_SUCCESS.
 */
static int
gdl_dna_reader_fasta_fill (gdl_dna_reader * reader, gdl_hashtable * header, gdl_dna_locus ** sites, size_t nl)
{
	int ch;
	size_t i, n;
	size_t j = 0, jj = 0, ns = 0;
	gdl_string    * name;
	gdl_string    * line = NULL;
	gdl_accession * accession = NULL;
	
	gdl_dna_reader_fasta_add_accession (reader, header);
	gdl_dna_reader_fasta_add_locus (reader, sites, nl);
	
	for (i = 0; i < nl; i++)
	{
		sites[i]->buffer = gdl_string_alloc (sites[i]->npoly);
	}
	/* i == nl from here until the first header resets it, so sequence
	 * lines that come before any header never enter the site loop. */
	
	ch = gdl_getline (&line, &n, reader->stream);
	while (ch != EOF)
	{
		if (line[0] == '>')
		{
			if (accession)
			{
				/* Flush the accession that this header terminates. */
				gdl_dna_reader_fasta_fill_accession (reader, accession, sites, nl);
			}
			name      = gdl_dna_reader_fasta_get_name (line, n);
			accession = gdl_gview_search_accession (reader->gview, name);
			/* The name is only needed for the lookup. */
			gdl_string_free (name);
			i = j = jj = ns = 0;
		}
		else
		{
			/* The current line covers sequence positions [ns-n, ns). */
			ns += n;
			while (i < nl)
			{
				for (j = jj; j < sites[i]->npoly && sites[i]->poly[j] < ns; j++)
				{
					/* toupper() needs an unsigned char value. */
					sites[i]->buffer[j] = toupper ((unsigned char) line[sites[i]->poly[j]-(ns-n)]);
				}
				if (j < sites[i]->npoly)
				{
					/* The rest of this site lies on a later line: resume
					 * from position j when that line is read. */
					jj = j;
					break;
				}
				i++;
				jj = 0;
			}
		}
		CLEAN(line)
		ch = gdl_getline (&line, &n, reader->stream);
	}
	
	/* Flush the last accession, if the stream held any header at all. */
	if (accession)
	{
		gdl_dna_reader_fasta_fill_accession (reader, accession, sites, nl);
	}
	
	return GDL_SUCCESS;	
}

/*
 * Parse a FASTA stream in three passes, each starting from the position
 * the stream had on entry:
 *   1. index the sequence names (check_header),
 *   2. validate the sequences and locate polymorphic sites (check_sites),
 *   3. fill a newly allocated gview with the alleles (fill).
 *
 * On success reader->gview, reader->gmap and reader->ltype are set.
 *
 * Returns GDL_SUCCESS, or GDL_EINVAL when the header or site check fails.
 */
static int
gdl_dna_reader_fasta_parse (gdl_dna_reader * reader)
{
	size_t i, na, nl, length;
	gdl_hashtable * header;
	gdl_dna_locus ** sites;
	long offset;
	
	offset = ftell (reader->stream);
	
	header = gdl_dna_reader_fasta_check_header (reader);
	
	if (!header)
	{
		return GDL_EINVAL;	
	}
	
	na = gdl_hashtable_size (header);
	
	fseek (reader->stream, offset, SEEK_SET);
	
	sites = gdl_dna_reader_fasta_check_sites (reader, &nl, &length);
	
	if (!sites)
	{
		/* The name index is owned here and must not outlive a failure. */
		gdl_hashtable_free (header);
		return GDL_EINVAL;	
	}
	
	fseek (reader->stream, offset, SEEK_SET);
	
	reader->gview = gdl_gview_alloc (gdl_gview_standard);
	
	gdl_gview_init (reader->gview, na, nl, 1);
	
	gdl_dna_reader_fasta_fill (reader, header, sites, nl);
	
	reader->gmap  = gdl_dna_reader_create_map (sites, nl, length, reader->reverse);
	
	reader->ltype = gdl_dna_reader_create_locus_type (sites, nl);
	
	gdl_hashtable_free (header);
	for (i = 0; i < nl; i++)
	{
		gdl_dna_locus_free (sites[i]);
	}
	GDL_FREE (sites);
	
	return GDL_SUCCESS;
}

#undef CLEAN

/*
 * Descriptor of the FASTA reader: two identifying strings followed by the
 * parse entry point defined above.
 * NOTE(review): the strings look like a type name and a short format
 * label - confirm against the gdl_dna_reader_type declaration.
 */
static const gdl_dna_reader_type _gdl_dna_reader_fasta =
{
	"gdl_dna_reader_fasta",
	"fasta",
	&gdl_dna_reader_fasta_parse
};

/* Public handle on the FASTA reader descriptor. */
const gdl_dna_reader_type * gdl_dna_reader_fasta = &_gdl_dna_reader_fasta;
