/*  
 *  seq/reader.c
 * 
 *  $Author: baptiste $, $Date: 2008-05-13 15:33:41 $, $Version$
 *
 *  Libgdl : a C library for statistical genetics
 * 
 *  Copyright (C) 2003-2006  Jean-Baptiste Veyrieras, INRA, France.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA * 
 */

#include <gdl/gdl_common.h>
#include <gdl/gdl_errno.h>
#include <gdl/gdl_io.h>
#include <gdl/gdl_hash.h>
#include <gdl/gdl_string.h>
#include <gdl/gdl_seq.h>

/***********************************************************************
 * 
 * 
 *                      SEQ
 * 
 * 
 * *********************************************************************/

/*
 * Create a new sequence record.
 *
 * The record is obtained through GDL_CALLOC; the only fields set here are
 * the identifier (a clone of `id`) and the index `idx`. The `seq` field is
 * not assigned by this function.
 *
 * Returns the new record; release it with gdl_seq_free().
 */
gdl_seq *
gdl_seq_alloc (const gdl_string * id, const size_t idx)
{
	gdl_seq * record = GDL_CALLOC (gdl_seq, 1);

	record->id  = gdl_string_clone (id);
	record->idx = idx;

	return record;
}

/*
 * Release a sequence record together with its identifier and its
 * sequence string. A null pointer is accepted and ignored.
 */
void
gdl_seq_free (gdl_seq * s)
{
	if (!s)
		return;

	gdl_string_free (s->id);
	gdl_string_free (s->seq);
	GDL_FREE (s);
}

/*
 * Copy the inclusive range [from, to] of a sequence into a new string.
 *
 * seq  : the sequence record to read from.
 * from : zero-based index of the first character to copy.
 * to   : zero-based index of the last character to copy (inclusive).
 *
 * Returns a newly allocated string holding to-from+1 characters, or 0 when
 * the record has no sequence, when from > to, or when the range runs past
 * the end of the sequence. The caller owns the returned string.
 */
gdl_string *
gdl_seq_extract_string (const gdl_seq * seq, const size_t from, const size_t to)
{
	size_t size, len;
	gdl_string * sub;

	if (!seq || !seq->seq)
		return 0;

	size = strlen (seq->seq);

	/* from <= to < size also guarantees from < size; rejecting from > to
	 * prevents the unsigned subtraction below from wrapping around. */
	if (from > to || to >= size)
		return 0;

	len = to - from + 1;

	/* NOTE(review): relies on gdl_string_alloc (len) providing room for
	 * len characters plus a terminator, as elsewhere in this file. */
	sub = gdl_string_alloc (len);
	if (sub)
		memcpy (sub, seq->seq + from, sizeof(char)*len);

	return sub;
}

/*
 * Replace the sequence held by `seq` with its reverse complement.
 *
 * Each base is mapped through GDL_DNA_COMPLEMENT while the order is
 * reversed. The previous sequence string is freed and a freshly allocated
 * one is stored in its place. Nothing happens when `seq` is null or holds
 * no sequence.
 */
void
gdl_seq_reverse (gdl_seq * seq)
{
	size_t N, i;
	gdl_string * rev;

	if (!seq || !seq->seq)
		return;

	N   = strlen (seq->seq);
	rev = gdl_string_alloc (N);

	/* Index with size_t: an int counter would be truncated for sequences
	 * longer than INT_MAX. */
	for (i = 0; i < N; i++)
		rev[i] = GDL_DNA_COMPLEMENT (seq->seq[(N-1)-i]);

	gdl_string_free (seq->seq);
	seq->seq = rev;
}

/*
 * Build the reverse complement of a DNA string.
 *
 * The input string is consumed: it is freed before returning, and the
 * caller must use (and later free) the returned string instead.
 *
 * Returns the newly allocated reverse complement, or 0 when `str` is null.
 */
gdl_string * 
gdl_seq_reverse_str (gdl_string * str)
{
	size_t N, i;
	gdl_string * rev;

	if (!str)
		return 0;

	N   = strlen (str);
	rev = gdl_string_alloc (N);

	/* Index with size_t: an int counter would be truncated for strings
	 * longer than INT_MAX. */
	for (i = 0; i < N; i++)
		rev[i] = GDL_DNA_COMPLEMENT (str[(N-1)-i]);

	gdl_string_free (str);

	return rev;
}

/*
 * Translate a DNA string into an amino-acid string.
 *
 * str    : the nucleotide string; it is never modified nor freed here.
 * strand : when equal to '-', a reverse-complemented copy of `str` is
 *          translated instead of `str` itself.
 * phase  : number of leading characters skipped before the first codon.
 *
 * Returns a newly allocated string with one character per codon (as given
 * by GDL_DNA_TO_AA), owned by the caller. A warning is raised and 0 is
 * returned when `phase` exceeds the string length or when the remaining
 * length is not a multiple of 3.
 */
gdl_string *
gdl_seq_dna2aa_str (gdl_string * str, const unsigned char strand, const size_t phase)
{
	size_t i,j,N,A;
	gdl_string * seq = str;
	gdl_string * aa;
	/* Three bases plus the terminator expected by GDL_DNA_TO_AA. */
	gdl_string codon[4] = {0, 0, 0, 0};
	
	N  = strlen(seq);
	
	/* Without this guard N-phase would wrap around as an unsigned value. */
	if (phase > N)
	{
		GDL_WARNING ("[gdl_seq_dna2aa_str] Phase is larger than the sequence length (return null pointer)", GDL_EINVAL);
		
		return 0;
	}
	
	A  = (N-phase)/3;
	if (A*3 != N-phase)
	{
		GDL_WARNING ("[gdl_seq_dna2aa_str] Sequence length is not a multiple of 3 (return null pointer)", GDL_EINVAL);
		
		return 0;
	}
	
	if (strand == '-')
	{
		/* Work on a private copy: gdl_seq_reverse_str consumes its input. */
		seq = gdl_string_clone (str);
		seq = gdl_seq_reverse_str (seq);
	}
	
	aa = gdl_string_alloc (A);
	for(j=0,i=phase;i<N;i+=3,j++)
	{
		memcpy(codon, seq + i, sizeof(char)*3);
		aa[j] = GDL_DNA_TO_AA (codon);
	}
	
	/* Only the reverse-complemented copy belongs to this function. */
	if (seq != str)
		gdl_string_free (seq);
	
	return aa;
}

/***********************************************************************
 * 
 * 
 *                      SEQDB
 * 
 * 
 * *********************************************************************/
 
/*
 * Create a sequence database with room for `size` sequence pointers.
 *
 * Both the database and its slot array are obtained through GDL_CALLOC.
 * Release the result with gdl_seqdb_free().
 */
gdl_seqdb *
gdl_seqdb_alloc (const size_t size)
{
	gdl_seqdb * db = GDL_CALLOC (gdl_seqdb, 1);

	db->seqs = GDL_CALLOC (gdl_seq *, size);
	db->size = size;

	return db;
}

/*
 * Release a sequence database: the optional id-to-sequence index, every
 * stored sequence, the slot array and the database itself.
 * A null pointer is accepted and ignored.
 */
void
gdl_seqdb_free (gdl_seqdb * db)
{
	if (db)
	{
		size_t i;
		
		/* The index built by gdl_seqdb_id2seq_create belongs to the
		 * database; drop it here so it does not leak when the caller
		 * did not call gdl_seqdb_id2seq_clean first. */
		if (db->_id2seq)
		{
			gdl_hashtable_free (db->_id2seq);
			db->_id2seq = 0;
		}
		
		for(i = 0; i < db->size; i++)
			gdl_seq_free (db->seqs[i]);	
		GDL_FREE (db->seqs);
		GDL_FREE (db);
	}
}

/*
 * Rename sequences according to a lookup table.
 *
 * For every sequence whose current identifier is found in `refactor`, the
 * identifier is freed and replaced by a clone of the value stored in the
 * table. Sequences without an entry are left untouched.
 *
 * Returns the number of identifiers that were replaced.
 */
int
gdl_seqdb_refactor_seqids (gdl_seqdb * db, gdl_hashtable * refactor)
{
	size_t renamed = 0;
	size_t k;
	
	for (k = 0; k < db->size; k++)
	{
		gdl_seq * entry = db->seqs[k];
		const gdl_string * replacement = gdl_hashtable_lookup (refactor, entry->id);
		
		if (!replacement)
			continue;
		
		renamed++;
		gdl_string_free (entry->id);
		entry->id = gdl_string_clone (replacement);
	}
	
	return renamed;
}

/*
 * (Re)build the identifier-to-position index of a sequence database.
 *
 * Any index already attached to the database is freed first. Each sequence
 * identifier is then registered with a heap-allocated size_t holding the
 * position of that sequence in db->seqs.
 *
 * NOTE(review): the trailing 0 passed to gdl_hashtable_add and to
 * gdl_hashtable_alloc is kept as in the original; whether the table frees
 * the stored size_t values should be confirmed against the hashtable API.
 *
 * Returns GDL_SUCCESS.
 */
int
gdl_seqdb_id2seq_create (gdl_seqdb * db)
{
	size_t pos;
	
	if (db->_id2seq)
		gdl_hashtable_free (db->_id2seq);
	
	db->_id2seq = gdl_hashtable_alloc (gdl_hash_default, 0);
	
	for (pos = 0; pos < db->size; pos++)
	{
		size_t * slot = GDL_MALLOC (size_t, 1);
		
		*slot = pos;
		gdl_hashtable_add (db->_id2seq, db->seqs[pos]->id, slot, 0);
	}
	
	return GDL_SUCCESS;
}

/*
 * Find a sequence by identifier using the index built by
 * gdl_seqdb_id2seq_create().
 *
 * Returns the matching sequence, or 0 when no index is attached to the
 * database or when the identifier is not present in it.
 */
gdl_seq *
gdl_seqdb_id2seq_lookup (gdl_seqdb * db, const gdl_string * id)
{
	size_t * slot;
	
	if (!db->_id2seq)
		return 0;
	
	slot = gdl_hashtable_lookup (db->_id2seq, id);
	
	return slot ? db->seqs[*slot] : 0;
}

/*
 * Drop the identifier-to-position index of a sequence database, if any,
 * and reset the index pointer so that later lookups return 0.
 *
 * Returns GDL_SUCCESS.
 */
int
gdl_seqdb_id2seq_clean (gdl_seqdb * db)
{
	if (db->_id2seq)
	{
		gdl_hashtable_free (db->_id2seq);
		db->_id2seq=0;
	}
	
	/* The function is declared int: falling off the end would leave the
	 * result undefined for any caller that checks it. */
	return GDL_SUCCESS;
}

//	 IUPAC DNA Code
//  
//   A = adenine           
//   C = cytosine            
//   G = guanine             
//   T = thymine           
//   U = uracil
//   R = G A (purine)        
//   Y = T C (pyrimidine)    
//   K = G T (keto)    
//   M = A C (amino)
//   S = G C 
//   W = A T 
//   B = G T C
//   D = G A T
//   H = A C T
//   V = G C A
//   N = A G C T (any)

/*
 * Tell whether `base` is one of the IUPAC nucleotide codes listed above,
 * or the gap character '-'. The test is case-insensitive.
 *
 * Returns gdl_true for a recognised code, gdl_false otherwise.
 */
gdl_boolean
GDL_IUPAC_DNA_CODE (const char base)
{
	/* toupper() is only defined for unsigned char values and EOF, so a
	 * plain char that may be negative must be converted first. */
	switch(toupper((unsigned char) base))
	{
		case 'A':
		case 'C':
		case 'G':
		case 'T':
		case 'U':
		case 'R':
		case 'Y':
		case 'K':
		case 'M':
		case 'S':
		case 'W':
		case 'B':
		case 'D':
		case 'H':
		case 'V':
		case 'N':
		case '-':
			return gdl_true;
		default:
			return gdl_false;
	}
}

/*
 * Tell whether `base` is anything other than A, C, G, T (in either case)
 * or the gap character '-'.
 *
 * Returns gdl_false for those five characters and gdl_true for every other
 * character, including characters that are not IUPAC codes at all.
 */
gdl_boolean
GDL_DNA_CODE_AMBIGUOUS (const char base)
{
	/* toupper() is only defined for unsigned char values and EOF, so a
	 * plain char that may be negative must be converted first. */
	switch(toupper((unsigned char) base))
	{
		case 'A':
		case 'C':
		case 'G':
		case 'T':
		case '-':
			return gdl_false;
		default:
			return gdl_true;
	}
}

/*
 * Tell whether `base` is the gap character '-'.
 *
 * Returns gdl_true for a gap, gdl_false otherwise.
 */
gdl_boolean
GDL_DNA_CODE_GAP (const char base)
{
	/* toupper() is only defined for unsigned char values and EOF, so a
	 * plain char that may be negative must be converted first. */
	switch(toupper((unsigned char) base))
	{
		case '-':
			return gdl_true;
		default:
			return gdl_false;
	}
}

/*
 * Return the complement of a nucleotide code.
 *
 * The lookup is case-insensitive and the complement of a recognised code
 * is always returned in upper case. Besides A, C, G, T and N, the IUPAC
 * ambiguity codes are complemented set-wise (R<->Y, K<->M, B<->V, D<->H,
 * S and W are their own complement) and U is treated like T.
 *
 * The gap character and any unrecognised character are returned
 * unchanged, so the function always yields a defined value.
 */
char
GDL_DNA_COMPLEMENT (const char base)
{
	/* toupper() is only defined for unsigned char values and EOF, so a
	 * plain char that may be negative must be converted first. */
	switch(toupper((unsigned char) base))
	{
		case 'A':
			return 'T';
		case 'T':
		case 'U':
			return 'A';
		case 'G':
			return 'C';
		case 'C':
			return 'G';
		case 'N':
			return 'N';
		case 'R':
			return 'Y';
		case 'Y':
			return 'R';
		case 'K':
			return 'M';
		case 'M':
			return 'K';
		case 'S':
			return 'S';
		case 'W':
			return 'W';
		case 'B':
			return 'V';
		case 'V':
			return 'B';
		case 'D':
			return 'H';
		case 'H':
			return 'D';
		default:
			/* Gap and unknown characters have no complement. */
			return base;
	}
}

/*
 * Rank of a nucleotide in the T, C, A, G ordering used by the codon table
 * of GDL_DNA_TO_AA (U counts as T). Returns -1 for any other character.
 */
static int
gdl_dna_base_rank (const char base)
{
	switch(toupper((unsigned char) base))
	{
		case 'T':
		case 'U':
			return 0;
		case 'C':
			return 1;
		case 'A':
			return 2;
		case 'G':
			return 3;
		default:
			return -1;
	}
}

/*
 * Translate one codon into its one-letter amino-acid code using the
 * standard genetic code.
 *
 * codon : a NUL-terminated string of exactly three bases. Both DNA (T) and
 *         RNA (U) spellings are accepted, in upper or lower case.
 *
 * Returns the amino-acid letter, '*' for a stop codon, or '?' when the
 * string is not three characters long or contains a character other than
 * A, C, G, T or U.
 */
char
GDL_DNA_TO_AA (gdl_string * codon)
{
	/* Standard genetic code, indexed by 16*first + 4*second + third with
	 * bases ranked T=0, C=1, A=2, G=3. */
	static const char amino[] =
		"FFLLSSSSYY**CC*W"   /* T.. */
		"LLLLPPPPHHQQRRRR"   /* C.. */
		"IIIMTTTTNNKKSSRR"   /* A.. */
		"VVVVAAAADDEEGGGG";  /* G.. */
	int first, second, third;
	
	if (strlen(codon)!=3)
		return '?';
	
	first  = gdl_dna_base_rank (codon[0]);
	second = gdl_dna_base_rank (codon[1]);
	third  = gdl_dna_base_rank (codon[2]);
	
	if (first < 0 || second < 0 || third < 0)
		return '?';
	
	return amino[16*first + 4*second + third];
}
