/*  
 *  dna/reader.c
 * 
 *  $Author: baptiste $, $Date: 2008-05-13 15:33:52 $, $Version$
 *
 *  Libgdl : a C library for statistical genetics
 * 
 *  Copyright (C) 2003-2006  Jean-Baptiste Veyrieras, INRA, France.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA * 
 */

#include <ctype.h>
 
#include <gdl/gdl_common.h>
#include <gdl/gdl_errno.h>
#include <gdl/gdl_io.h>
#include <gdl/gdl_string.h>
#include <gdl/gdl_gentity.h>
#include <gdl/gdl_locus_type.h>
#include <gdl/gdl_gview.h>
#include <gdl/gdl_gmap.h>
#include <gdl/gdl_dna_reader.h>

/**
 * Creates a DNA reader configured with the given reader type and options.
 *
 * The structure is obtained from GDL_CALLOC; the arguments are then stored
 * verbatim.  `type` and `base` are kept by pointer (they are not copied
 * here), so they have to stay valid for as long as the reader is used.
 *
 * @param type            reader implementation; its `read` callback is
 *                        invoked by gdl_dna_reader_parse()
 * @param base            stored in the reader's `base` field
 * @param remove_indel    stored in the reader's `remove_indel` flag
 * @param collapse_indel  stored in the reader's `collapse_indel` flag
 * @param reverse         stored in the reader's `reverse` flag
 * @return the newly allocated reader
 */
gdl_dna_reader *
gdl_dna_reader_alloc (const gdl_dna_reader_type * type, const gdl_string * base, gdl_boolean remove_indel, gdl_boolean collapse_indel, gdl_boolean reverse)
{
	gdl_dna_reader * reader = GDL_CALLOC (gdl_dna_reader, 1);

	/* Option flags first, then the borrowed pointers. */
	reader->reverse        = reverse;
	reader->collapse_indel = collapse_indel;
	reader->remove_indel   = remove_indel;
	reader->base           = base;
	reader->type           = type;

	return reader;
}

/**
 * Releases a reader together with the strings it owns.
 *
 * Only the `filename` and `error` strings and the reader structure itself
 * are released here; the stream and the gview/gmap/ltype members are left
 * untouched.  Passing NULL is a no-op.
 */
void
gdl_dna_reader_free (gdl_dna_reader * reader)
{
	if (!reader)
	{
		return;
	}

	gdl_string_free (reader->error);
	gdl_string_free (reader->filename);
	GDL_FREE (reader);
}

/**
 * Opens `filename` for reading and attaches the stream to the reader.
 *
 * On success the reader keeps its own clone of the file name and any
 * previously recorded error message is discarded.
 *
 * @return GDL_SUCCESS when the file was opened; GDL_EINVAL when `reader`
 *         or `filename` is NULL or when gdl_fileopen() yields no stream.
 */
int
gdl_dna_reader_open (gdl_dna_reader * reader, 
                          const gdl_string * filename)
{
	if (!reader || !filename)
	{
		return GDL_EINVAL;
	}

	reader->stream = gdl_fileopen (filename, "r");
	if (!reader->stream)
	{
		return GDL_EINVAL;
	}

	/* Replace the remembered file name with a private copy of the new one. */
	gdl_string_free (reader->filename);
	reader->filename = gdl_string_clone (filename);

	/* A fresh file starts with a clean error state. */
	gdl_string_free (reader->error);
	reader->error = NULL;

	return GDL_SUCCESS;
}

/**
 * Runs the `read` callback of the reader's type on the reader.
 *
 * @return GDL_EINVAL when `reader` is NULL, otherwise whatever the
 *         callback returns.
 */
int
gdl_dna_reader_parse (gdl_dna_reader * reader)
{
	return reader ? reader->type->read (reader) : GDL_EINVAL;
}

/**
 * Accessor for the reader's `gview` member.
 * `reader` is dereferenced unconditionally and therefore must not be NULL.
 */
gdl_gview *
gdl_dna_reader_get_data (const gdl_dna_reader * reader)
{
	gdl_gview * view = reader->gview;

	return view;
}

/**
 * Accessor for the reader's `gmap` member.
 * `reader` is dereferenced unconditionally and therefore must not be NULL.
 */
gdl_gmap *
gdl_dna_reader_get_map (const gdl_dna_reader * reader)
{
	gdl_gmap * map = reader->gmap;

	return map;
}

/**
 * Accessor for the reader's `ltype` member (the locus type registry).
 * `reader` is dereferenced unconditionally and therefore must not be NULL.
 */
gdl_locus_type_registry *
gdl_dna_reader_get_locus_type (const gdl_dna_reader * reader)
{
	gdl_locus_type_registry * registry = reader->ltype;

	return registry;
}

/**
 * Closes the stream currently attached to the reader.
 *
 * The stream pointer is cleared after closing so that a repeated call
 * reports GDL_EINVAL instead of closing the same stream a second time.
 *
 * @return GDL_SUCCESS when a stream was closed; GDL_EINVAL when `reader`
 *         is NULL or no stream is attached.
 */
int
gdl_dna_reader_close (gdl_dna_reader * reader)
{
	if (!reader || !reader->stream)
	{
		return GDL_EINVAL;
	}

	gdl_fileclose (reader->filename, reader->stream);
	/* Do not keep a pointer to a closed stream around. */
	reader->stream = NULL;

	return GDL_SUCCESS;
}

/**
 * Records a printf-style formatted error message in the reader.
 *
 * Any previously stored message is released and replaced by the string
 * produced by gdl_string_vsprintf().  Note that the old message is freed
 * before the new one is formatted, so the variadic arguments must not
 * point into the reader's current error string.
 *
 * @return GDL_EINVAL when `reader` is NULL, GDL_SUCCESS otherwise.
 */
int
gdl_dna_reader_error (gdl_dna_reader * reader, char * format, ...)
{
	va_list args;

	if (!reader)
	{
		return GDL_EINVAL;
	}

	va_start (args, format);
	gdl_string_free (reader->error);
	reader->error = gdl_string_vsprintf (format, args);
	va_end (args);

	return GDL_SUCCESS;
}

//	 IUPAC DNA Code
//  
//   A = adenine           
//   C = cytosine            
//   G = guanine             
//   T = thymine           
//   U = uracil
//   R = G A (purine)        
//   Y = T C (pyrimidine)    
//   K = G T (keto)    
//   M = A C (amino)
//   S = G C 
//   W = A T 
//   B = G T C
//   D = G A T
//   H = A C T
//   V = G C A
//   N = A G C T (any)

/**
 * Tells whether `base` is an accepted sequence symbol.
 *
 * Accepted symbols (case-insensitive) are the IUPAC nucleotide codes
 * A C G T U R Y K M S W B D H V N and the gap character '-'.
 *
 * The value is converted to `unsigned char` before being handed to
 * toupper(): callers may pass a plain `char`, which can be negative, and
 * toupper() is only defined for EOF and values representable as
 * `unsigned char`.
 *
 * @return gdl_true for an accepted symbol, gdl_false otherwise.
 */
gdl_boolean
gdl_dna_reader_is_valid (const int base)
{
	switch (toupper ((unsigned char) base))
	{
		case 'A':
		case 'C':
		case 'G':
		case 'T':
		case 'U':
		case 'R':
		case 'Y':
		case 'K':
		case 'M':
		case 'S':
		case 'W':
		case 'B':
		case 'D':
		case 'H':
		case 'V':
		case 'N':
		case '-':
			return gdl_true;
		default:
			return gdl_false;
	}
}

/**
 * Tells whether `base` is anything other than an unambiguous symbol.
 *
 * Only A, C, G, T (case-insensitive) and the gap character '-' are
 * considered unambiguous.  Every other value -- including characters that
 * are not nucleotide codes at all -- yields gdl_true.
 *
 * The value is converted to `unsigned char` before being handed to
 * toupper(): callers may pass a plain `char`, which can be negative, and
 * toupper() is only defined for EOF and values representable as
 * `unsigned char`.
 */
gdl_boolean
gdl_dna_reader_is_ambiguous (const int base)
{
	switch (toupper ((unsigned char) base))
	{
		case 'A':
		case 'C':
		case 'G':
		case 'T':
		case '-':
			return gdl_false;
		default:
			return gdl_true;
	}
}

/**
 * Scans a NUL-terminated sequence for ambiguous symbols.
 *
 * @param seq  sequence to inspect; it is read up to its terminating NUL
 *             and must not be NULL
 * @return gdl_true as soon as one character is reported ambiguous by
 *         gdl_dna_reader_is_ambiguous(), gdl_false if none is.
 */
gdl_boolean
gdl_dna_reader_has_ambiguous (const gdl_string * seq)
{
	const gdl_string * cursor;

	for (cursor = seq; *cursor != '\0'; cursor++)
	{
		if (gdl_dna_reader_is_ambiguous (*cursor))
		{
			return gdl_true;
		}
	}

	return gdl_false;
}

/**
 * Tells whether `base` is the gap character '-'.
 *
 * The comparison is done directly on the value: case folding cannot turn
 * any character into '-' (it is punctuation, not a letter), and skipping
 * toupper() keeps the function well defined when a negative plain `char`
 * is passed in.
 *
 * @return gdl_true for '-', gdl_false for anything else.
 */
gdl_boolean
gdl_dna_reader_is_gap (const int base)
{
	if (base == '-')
	{
		return gdl_true;
	}
	return gdl_false;
}

/**
 * Drops every site whose span contains a gap in the reference sequence.
 *
 * For each site, refseq[from..to] (inclusive) is scanned; if any position
 * is a gap the site is released with gdl_dna_locus_free().  Surviving
 * sites are packed to the front of `sites` in their original order, the
 * vacated tail slots are set to NULL and `*nl` is updated to the number of
 * survivors.
 *
 * The array is compacted in a single pass rather than shifting the whole
 * tail after each removal, so the cost is linear in the number of sites
 * and no index ever has to step backwards.
 *
 * @param refseq  reference sequence, indexed by the sites' from/to
 * @param sites   array of `*nl` site pointers, edited in place
 * @param nl      in: number of sites; out: number of sites kept
 */
void
gdl_dna_reader_remove_indel (const gdl_string * refseq, gdl_dna_locus ** sites, size_t * nl)
{
	const size_t total = *nl;
	size_t i, j, last, kept = 0;

	for (i = 0; i < total; i++)
	{
		last = sites[i]->to;

		/* Stop at the first gap inside the site's span, if any. */
		for (j = sites[i]->from; j <= last; j++)
		{
			if (gdl_dna_reader_is_gap (refseq[j]))
			{
				break;
			}
		}

		if (j <= last)
		{
			/* The scan stopped early: the span overlaps a gap. */
			gdl_dna_locus_free (sites[i]);
		}
		else
		{
			sites[kept++] = sites[i];
		}
	}

	/* Clear the slots that no longer hold a live site. */
	for (i = kept; i < total; i++)
	{
		sites[i] = NULL;
	}

	*nl = kept;
}

/**
 * Merges neighbouring sites that are separated only by gaps in the
 * reference sequence.
 *
 * For each pair (site i, site i+1) a stretch of `refseq` is scanned: it
 * starts at the next site's `from` when site i is already flagged as an
 * indel, otherwise at site i's `to`, and ends at the next site's `to`.
 * When every scanned position is a gap, site i is folded into site i+1:
 * the latter inherits site i's `from`, is flagged as an indel, has its
 * polymorphic positions recomputed from `poly`, and site i is freed and
 * removed from the array.  The merged site is then compared with its new
 * successor.
 *
 * An empty or single-element array is left untouched (the loop condition
 * is written as `i + 1 < n` so that `n == 0` cannot underflow).
 *
 * @param refseq  reference sequence, indexed by the sites' from/to
 * @param poly    per-position counts forwarded to gdl_dna_locus_poly()
 * @param sites   array of `*nl` site pointers, edited in place; vacated
 *                tail slots are set to NULL
 * @param nl      in: number of sites; out: number of sites remaining
 */
void
gdl_dna_reader_collapse_indel (const gdl_string * refseq, const size_t * poly, gdl_dna_locus ** sites, size_t * nl)
{
	size_t i, j, s, e, n;

	n = *nl;
	i = 0;

	while (i + 1 < n)
	{
		if (sites[i]->indel)
		{
			s = sites[i+1]->from;
		}
		else
		{
			s = sites[i]->to;
		}
		e = sites[i+1]->to;

		/* Look for the first non-gap position in refseq[s..e]. */
		for (j = s; j <= e; j++)
		{
			if (!gdl_dna_reader_is_gap (refseq[j]))
			{
				break;
			}
		}

		if (j <= e)
		{
			/* A real base separates the two sites: keep both. */
			i++;
			continue;
		}

		/* Only gaps in between: absorb site i into site i+1. */
		sites[i+1]->from  = sites[i]->from;
		sites[i+1]->indel = gdl_true;
		gdl_dna_locus_poly (sites[i+1], poly);

		gdl_dna_locus_free (sites[i]);

		/* Close the hole left by site i. */
		for (j = i; j < n-1; j++)
		{
			sites[j] = sites[j+1];
		}
		sites[n-1] = NULL;
		n--;
		/* i is deliberately not advanced: the merged site now sits at
		   index i and must be compared with its new successor. */
	}

	*nl = n;
}

/**
 * Creates a site covering the single sequence position `position`.
 *
 * The new site has from == to == position, a one-element `poly` array
 * holding that position, and its `indel` flag set according to whether
 * `base` is the gap character.
 *
 * @return the new site; release it with gdl_dna_locus_free().
 */
gdl_dna_locus *
gdl_dna_locus_alloc (size_t position, const char base)
{
	gdl_dna_locus * site   = GDL_CALLOC (gdl_dna_locus, 1);
	size_t        * single = GDL_MALLOC (size_t, 1);

	single[0] = position;

	site->poly  = single;
	site->npoly = 1;
	site->to    = position;
	site->from  = position;
	site->indel = gdl_dna_reader_is_gap (base);

	return site;
}

/**
 * Releases a site along with its `buffer` string and its `poly` array.
 * Passing NULL is a no-op.
 */
void
gdl_dna_locus_free (gdl_dna_locus * d)
{
	if (!d)
	{
		return;
	}

	gdl_string_free (d->buffer);
	GDL_FREE (d->poly);
	GDL_FREE (d);
}

/**
 * Rebuilds the list of polymorphic positions of a multi-position site.
 *
 * Nothing happens for a site with from == to.  Otherwise the current
 * `poly` array is discarded and every position p in [from, to] is kept
 * when poly[p] reaches a threshold: at least 2 for an indel site, at
 * least 1 for any other site.  If no position qualifies, the list falls
 * back to the single entry `from`.
 *
 * @param d     site to update (`poly` and `npoly` are rewritten)
 * @param poly  per-position counts, indexed by sequence position
 */
void
gdl_dna_locus_poly (gdl_dna_locus * d, const size_t * poly)
{
	size_t pos, filled, min_count;

	if (d->from == d->to)
	{
		return;
	}

	if (d->poly)
	{
		GDL_FREE (d->poly);
	}

	/* poly[] is unsigned, so "> 1" is ">= 2" and "non-zero" is ">= 1". */
	min_count = d->indel ? 2 : 1;

	/* First pass: count the qualifying positions. */
	d->npoly = 0;
	for (pos = d->from; pos <= d->to; pos++)
	{
		if (poly[pos] >= min_count)
		{
			d->npoly += 1;
		}
	}

	if (d->npoly == 0)
	{
		/* Nothing qualified: keep the start of the span as sole entry. */
		d->npoly   = 1;
		d->poly    = GDL_MALLOC (size_t, 1);
		d->poly[0] = d->from;
		return;
	}

	/* Second pass: record the qualifying positions in increasing order. */
	d->poly = GDL_MALLOC (size_t, d->npoly);
	filled  = 0;
	for (pos = d->from; pos <= d->to; pos++)
	{
		if (poly[pos] >= min_count)
		{
			d->poly[filled] = pos;
			filled++;
		}
	}
}

/**
 * Builds a map made of one genome ("1") holding one chromosome ("1") that
 * lists a clone of every site's locus.
 *
 * A placeholder locus named "-" is pushed on the chromosome first.  Each
 * site is then added with a distance value computed as follows:
 *   - forward order: the first site gets its own `from`; reverse order:
 *     the first site added (the last of the array) gets `length - from`;
 *   - a site that is an indel and whose previously added neighbour is also
 *     an indel gets 0 and does not become the new reference site;
 *   - any other site gets the difference between its `from` and the
 *     `from` of the current reference site, and becomes the reference.
 *
 * In reverse order the sites are visited from index nl-1 down to index 0
 * inclusive, so that the map holds the same nl loci as in forward order.
 *
 * @param sites    array of `nl` sites; each `locus` member is cloned
 * @param nl       number of sites
 * @param length   sequence length, used for the first distance in
 *                 reverse order
 * @param reverse  non-zero to add the sites from last to first
 * @return the newly allocated map
 */
gdl_gmap *
gdl_dna_reader_create_map (gdl_dna_locus ** sites, size_t nl, size_t length, gdl_boolean reverse)
{
	size_t i, ii;
	gdl_gdistance  * position;
	gdl_genome     * genome;
	gdl_chromosome * chrom;
	gdl_locus      * locus;
	gdl_gmap       * map;
	
	map    = gdl_gmap_alloc ();
	genome = gdl_genome_new ("1");
	chrom  = gdl_chromosome_new ("1");
	
	position = gdl_gdistance_alloc (gdl_gdistance_base);
	locus    = gdl_locus_new ("-");
	gdl_chromosome_push (chrom, locus, 1);
	
	if (reverse)
	{
		size_t k;
		
		/* ii tracks the last site that served as distance reference. */
		ii = nl ? nl - 1 : 0;
		/* Count down with k = i + 1 so that the unsigned index can reach
		   0 without wrapping around. */
		for (k = nl; k > 0; k--)
		{
			i = k - 1;
			if (i < nl-1)
			{
				if (sites[i]->indel && sites[i+1]->indel)
				{
					position->value = 0;
				}
				else
				{
					position->value=(double)(sites[ii]->from-sites[i]->from);
					ii=i;
				}
			}
			else
			{
				position->value=(double)(length-sites[i]->from);
			}
			locus  = gdl_entity_clone (sites[i]->locus);
			gdl_chromosome_add (chrom, locus, position, 1);
		}
	}
	else
	{
		for (ii = i = 0; i < nl; i++)
		{
			if (i)
			{
				if (sites[i]->indel && sites[i-1]->indel)
				{
					position->value = 0;
				}
				else 
				{
					position->value=(double)(sites[i]->from-sites[ii]->from);
					ii=i;
				}
			}
			else
			{
				position->value=(double)sites[i]->from;
			}
			locus  = gdl_entity_clone (sites[i]->locus);
			gdl_chromosome_add (chrom, locus, position, 1);
		}
	}
	
	gdl_genome_add (genome, chrom);
	
	gdl_gmap_add (map, genome);
	
	/* The distance object was only a scratch value for the add calls. */
	gdl_gdistance_free (position);
	
	return map;
}

/**
 * Builds a locus type registry covering every site.
 *
 * Each site's locus is registered under gdl_locus_indel when the site is
 * flagged as an indel and under gdl_locus_dna otherwise.
 *
 * @param sites  array of `nl` sites
 * @param nl     number of sites
 * @return the newly allocated registry
 */
gdl_locus_type_registry *
gdl_dna_reader_create_locus_type (gdl_dna_locus ** sites, size_t nl)
{
	const gdl_locus_type * dna   = gdl_locus_dna;
	const gdl_locus_type * indel = gdl_locus_indel;
	gdl_locus_type_registry * registry = gdl_locus_type_registry_alloc ();
	size_t k;

	for (k = 0; k < nl; k++)
	{
		const gdl_locus_type * kind = sites[k]->indel ? indel : dna;

		gdl_locus_type_registry_add (registry, kind, sites[k]->locus);
	}

	return registry;
}
