/*  
 *  seq/fasta.c
 * 
 *  $Author: baptiste $, $Date: 2008-05-13 15:33:41 $, $Version$
 *
 *  Libgdl : a C library for statistical genetics
 * 
 *  Copyright (C) 2003-2006  Jean-Baptiste Veyrieras, INRA, France.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <stdio.h>
#include <ctype.h>
 
#include <gdl/gdl_common.h>
#include <gdl/gdl_errno.h>
#include <gdl/gdl_io.h>
#include <gdl/gdl_util.h>
#include <gdl/gdl_string.h>
#include <gdl/gdl_list.h>
#include <gdl/gdl_hash.h>
#include <gdl/gdl_seq_io.h>
#include <gdl/gdl_seq.h>

/*
 * Extract the sequence identifier from a FASTA header line.
 *
 * line : the header line.  The character at index 0 is skipped without
 *        being inspected (the caller in this file only passes lines whose
 *        first character is '>').
 * n    : number of characters of `line` to consider.
 *
 * Returns a buffer obtained from gdl_string_alloc() that holds the
 * characters from index 1 up to, but not including, the first whitespace
 * character (or index n, whichever comes first).  The caller in this file
 * releases it with gdl_string_free().
 *
 * NOTE(review): the result is only NUL-terminated if gdl_string_alloc()
 * reserves and zeroes a terminator byte -- presumably it does; confirm.
 */
static gdl_string *
gdl_seq_io_fasta_get_name (const gdl_string * line, size_t n)
{
	size_t end = 1;
	gdl_string * name;

	/* <ctype.h> classifiers are only defined for values representable as
	 * unsigned char (or EOF); a plain char may be negative for bytes
	 * >= 0x80, so convert before the call. */
	while (end < n && !isspace ((unsigned char) line[end]))
	{
		end++;
	}

	name = gdl_string_alloc (end - 1);
	if (name != NULL)
	{
		memcpy (name, line + 1, (end - 1) * sizeof (char));
	}

	return name;
}
//
//static gdl_hashtable *
//gdl_seq_io_fasta_check_header (gdl_seq_io * reader)
//{
//	int ch, nseq=0;
//	long fpos, * tmp;
//	size_t i, n;
//	gdl_string    * name;
//	gdl_string    * line   = NULL;
//	gdl_hashtable * header;
//		
//	header = gdl_hashtable_alloc (gdl_interface_long, 0);
//	
//	fpos = ftell (reader->stream);
//	ch   = gdl_getline (&line, &n, reader->stream);
//	while (ch != -1)
//	{
//		if (line[0] == '>')
//		{
//			name = gdl_seq_io_fasta_get_name (line, n);
//			if (!gdl_hashtable_lookup (header, name))
//			{
//				tmp  = GDL_MALLOC (long, 1);
//				*tmp = fpos;
//				gdl_hashtable_add (header, name, tmp, 1);
//				gdl_string_free (name);
//			}
//			else
//			{
//				gdl_seq_io_error (reader, "Sequence [ %s ] is not unique\n", name);
//				gdl_string_free (name);
//				gdl_hashtable_free (header);
//				CLEAN(line)
//				return NULL;
//			}
//		}
//		gdl_string_free (line);line=NULL;
//		fpos = ftell (reader->stream);
//		ch   = gdl_getline (&line, &n, reader->stream);
//	}
//	
//	return header;
//}
//
//static size_t
//gdl_seq_io_fasta_get_sequence_size (gdl_seq_io * reader)
//{
//	int ch;
//	long fpos;
//	size_t n, ns;
//	gdl_string * line = NULL;
//	
//	ns   = 0;
//	fpos = ftell (reader->stream);
//	ch   = gdl_getline (&line, &n, reader->stream);
//	while (ch != -1)
//	{
//		if (line[0] == '>')
//		{
//		   if (ns)
//		   {
//			CLEAN(line)
//			break;
//		   }
//		}
//		else
//		{
//			ns += n;
//		}
//		gdl_string_free (line);
//                line=0;
//		ch = gdl_getline (&line, &n, reader->stream);
//	}
//	fseek (reader->stream, fpos, SEEK_SET);
//	
//	return ns;
//}
//
//static int
//gdl_seq_io_fasta_fill (gdl_seq_io * reader, gdl_hashtable * header)
//{
//	int ch;
//	long fpos;
//	size_t seqidx=0,len,cur,n;
//	gdl_seq * seq;
//	gdl_string    * seqid = NULL;
//	gdl_string    * line = NULL;
//	
//	ch = gdl_getline (&line, &n, reader->stream);
//	while (ch != -1)
//	{
//		if (line[0] == '>')
//		{
//			seqid = gdl_seq_io_fasta_get_name (line, n);
//			len   = gdl_seq_io_fasta_get_sequence_size (reader);
//			seq   = gdl_seq_alloc (seqid, seqidx, 10);
//			// add to the database
//			reader->db->seqs[seqidx]=seq;
//			seqidx++;
//			cur=0;
//		}
//		else
//		{
//	  	   memcpy (&seq->seq[cur], line, sizeof(char)*strlen(line));
//		   cur += strlen(line);
//		}
//		gdl_string_free (line);line=0;
//		ch = gdl_getline (&line, &n, reader->stream);
//	}
//	
//	return GDL_SUCCESS;	
//}
//
//static int
//gdl_seq_io_fasta_parse (gdl_seq_io * reader)
//{
//	size_t i, na, nl, length;
//	gdl_hashtable * header;
//	long offset;
//	
//	offset = ftell (reader->stream);
//	
//	header = gdl_seq_io_fasta_check_header (reader);
//	
//	if (!header)
//	{
//		return GDL_EINVAL;	
//	}
//	
//	na = gdl_hashtable_size (header);
//	
//	fseek (reader->stream, offset, SEEK_SET);
//	
//	reader->db = gdl_seqdb_alloc (na);
//	
//	gdl_seq_io_fasta_fill (reader, header);
//	
//	return GDL_SUCCESS;
//}
//
//#undef CLEAN

/*
 * Parse a FASTA stream into reader->db.
 *
 * Every line whose first character is '>' starts a new record; its
 * identifier is taken with gdl_seq_io_fasta_get_name().  The lines that
 * follow, up to the next header or the end of the stream, are concatenated
 * into the record's sequence buffer.
 *
 * The sequence is read in two passes because its length is not known in
 * advance: pass 1 measures it, the stream is rewound with fseek() to the
 * position recorded right after the header, and pass 2 copies the lines
 * into a buffer of exactly that size.  The stream must therefore be
 * seekable.
 *
 * Lines that appear before the first header belong to no record and are
 * skipped.  A header that is not followed by any sequence line yields a
 * record whose seq pointer is NULL.
 *
 * reader->db is only assigned when at least one record was found.
 * Always returns GDL_SUCCESS.
 *
 * NOTE(review): gdl_getline() is assumed to store the line length in its
 * second argument and to allocate a fresh buffer when *line is NULL; this
 * is how the code below (and the header-name helper) uses it -- confirm.
 * NOTE(review): when pass 2 cannot re-read the data (seek or read failure)
 * parsing stops and whatever was collected so far is kept; consider
 * returning a proper error code from gdl_errno.h instead.
 */
static int
gdl_seq_io_fasta_parse (gdl_seq_io * reader)
{
	long offset = 0;	/* stream position just after the current header */
	size_t seqidx = 0, seqlen, nlines, cur, k, n;
	gdl_seq * seq;
	gdl_string    * seqid  = NULL;
	gdl_string    * seqdna = NULL;
	gdl_string    * line   = NULL;
	gdl_list * sequences  = gdl_list_alloc (gdl_list_default);

	while (gdl_getline (&line, &n, reader->stream) != -1)
	{
		if (line[0] == '>')
		{
			/* A new header closes the record collected so far. */
			if (seqid)
			{
				seq = gdl_seq_alloc (seqid, seqidx++);
				seq->seq = seqdna;
				gdl_list_push_back (sequences, seq, 0);
				gdl_string_free (seqid);
			}
			seqid  = gdl_seq_io_fasta_get_name (line, n);
			seqdna = NULL;
			gdl_string_free (line); line = NULL;
			offset = ftell (reader->stream);
		}
		else if (!seqid)
		{
			/* No header seen yet: there is no record to attach this line
			 * to and no valid rewind position, so ignore it. */
			gdl_string_free (line); line = NULL;
		}
		else
		{
			/* Pass 1: measure the record, starting with the line that
			 * was just read, until the next header or end of stream. */
			seqlen = 0;
			nlines = 0;
			do
			{
				if (line[0] == '>')
				{
					gdl_string_free (line); line = NULL;
					break;
				}
				seqlen += n;
				nlines++;
				gdl_string_free (line); line = NULL;
			}
			while (gdl_getline (&line, &n, reader->stream) != -1);
			line = NULL;

			seqdna = gdl_string_alloc (seqlen);
			if (!seqdna || fseek (reader->stream, offset, SEEK_SET) != 0)
			{
				goto done;
			}

			/* Pass 2: replay exactly the lines counted above.  Counting
			 * lines rather than bytes leaves the stream positioned right
			 * before the next header even when some lines contribute no
			 * characters, so this branch runs once per record. */
			cur = 0;
			for (k = 0; k < nlines; k++)
			{
				if (gdl_getline (&line, &n, reader->stream) == -1)
				{
					goto done;
				}
				/* Never write past the size measured in pass 1. */
				if (n > seqlen - cur)
				{
					n = seqlen - cur;
				}
				memcpy (seqdna + cur, line, sizeof (char) * n);
				gdl_string_free (line); line = NULL;
				cur += n;
			}
		}
	}

done:
	/* Flush the last record; no further header will close it. */
	if (seqid)
	{
		seq = gdl_seq_alloc (seqid, seqidx++);
		seq->seq = seqdna;
		gdl_list_push_back (sequences, seq, 0);
		/* Released here exactly as for every earlier record above. */
		gdl_string_free (seqid);
	}

	/* Move the collected records from the temporary list into the
	 * database array, preserving file order. */
	if (gdl_list_size (sequences))
	{
		gdl_list_itr * itr = gdl_list_iterator_front (sequences);
		reader->db = gdl_seqdb_alloc (gdl_list_size (sequences));
		seqidx     = 0;
		do
		{
			reader->db->seqs[seqidx++] = gdl_list_iterator_value (itr);
		}
		while (gdl_list_iterator_next (itr));
		gdl_list_iterator_free (itr);
	}
	gdl_list_free (sequences);

	return GDL_SUCCESS;
}

static int
gdl_seq_io_fasta_dump_str (FILE * stream, const gdl_seq_io * io, const gdl_string * str)
{
	size_t j,k,len;
	
	j=0;
	len=strlen(str);
	while(j < len)
	{
		for(k = 0; k < 70 && j < len; j++, k++)
			fprintf (stream, "%c", str[j]);
		fprintf (stream, "\n");
	};
}

static int
gdl_seq_io_fasta_dump (FILE * stream, const gdl_seq_io * io, const gdl_seqdb * db)
{
	size_t i;
	
	for(i = 0; i < db->size; i++)
	{
		fprintf (stream, ">%s\n", db->seqs[i]->id);
		gdl_seq_io_fasta_dump_str (stream, io, db->seqs[i]->seq);
	}
}

/*
 * Descriptor of the FASTA backend: two string entries followed by the
 * parse, dump and dump_str routines defined in this file.
 * NOTE(review): the meaning of the two strings (presumably a type name and
 * a short format tag) is fixed by gdl_seq_io_type, declared elsewhere --
 * confirm against that declaration.
 */
static const gdl_seq_io_type _gdl_seq_io_fasta =
{
	"gdl_seq_io_fasta",
	"fasta",
	gdl_seq_io_fasta_parse,
	gdl_seq_io_fasta_dump,
	gdl_seq_io_fasta_dump_str
};

/* Public handle through which other translation units reach the
 * descriptor above. */
const gdl_seq_io_type * gdl_seq_io_fasta = &_gdl_seq_io_fasta;
