/*  
 *  gpca/tag.c
 *  
 *  $Author: baptiste $, $Date: 2008-05-13 15:33:42 $, $Version$
 *
 *  Libgdl : a C library for statistical genetics
 * 
 *  Copyright (C) 2003-2006  Jean-Baptiste Veyrieras, INRA, France.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA * 
 */

#include <math.h>

#include <gdl/gdl_common.h>
#include <gdl/gdl_errno.h>
#include <gdl/gdl_string.h>
#include <gdl/gdl_gentity.h>
#include <gdl/gdl_hash.h>
#include <gdl/gdl_vector.h>
#include <gdl/gdl_matrix.h>
#include <gdl/gdl_blas.h>
#include <gdl/gdl_cblas.h>
#include <gdl/gdl_linalg.h>
#include <gdl/gdl_pca.h>
#include <gdl/gdl_gmatrix.h>
#include <gdl/gdl_gview_wrapper.h>
#include <gdl/gdl_hview.h>
#include <gdl/gdl_gpca.h>
#include <gdl/gdl_gpca_tag.h>

typedef struct
{
	const gdl_entity_type * type;
	gdl_mask * gmask;
	gdl_gpca_workspace * gpca;
	gdl_gview_wrapper  * gwrap;
	gdl_hview          * hview;
} gdl_gpca_tag_group;

struct _gdl_gpca_tag_workspace
{
	gdl_gpca_workspace * gpca;
	gdl_hashtable      * buffer;
	gdl_gpca_tag_group ** groups;
	gdl_matrix          * Z;
	size_t             naxe;
	const gdl_gmatrix  * data;
	const gdl_gview    * gview;
	const gdl_hview    * hview;
	const gdl_gpca_workspace_type * type;
	const gdl_entity_type         * mode;
	const gdl_mask                * mask;
	gdl_mask * tag;
	FILE * logger;
};

static gdl_gpca_tag_group *
gdl_gpca_tag_group_alloc (const gdl_entity_type * T, gdl_entity_mask * mask, size_t owner)
{
   gdl_gpca_tag_group * g;
   
   g = GDL_CALLOC (gdl_gpca_tag_group, 1);
   
   g->type  = T;
   
   g->gmask = gdl_mask_alloc ();
   
   if (T == GDL_LOCUS)
   {
   	gdl_mask_set (g->gmask, GDL_ACCESSION, mask, owner);
   }
   else if (T == GDL_ACCESSION)
   {
   	gdl_mask_set (g->gmask, GDL_LOCUS, mask, owner);
   }
   
   return g;
}

size_t
gdl_gpca_tag_group_size (const gdl_gpca_tag_group * g)
{
	return gdl_mask_get_size (g->gmask, g->type);
}

static void
gdl_gpca_tag_group_add (gdl_gpca_tag_group * g, const gdl_mask * gmask, size_t i)
{
	if (g->type == GDL_LOCUS)
	{
		// This assumes that groups have reasonable size...
		// since it takes O(n) to add an index.
		size_t j, idx, n;
		
		n   = gdl_mask_get_size (g->gmask, GDL_LOCUS);
		idx = gdl_mask_get_idx (gmask, GDL_LOCUS, i);
		
		for (j = 0; j < n; j++)
		{
			if (gdl_mask_get_idx (g->gmask, GDL_LOCUS, j) == idx)
			{
				break;
			}
		}
		if (j == n)
		{
			gdl_mask_add_idx (g->gmask, GDL_LOCUS, idx);
		}
	}
	else if (g->type == GDL_ACCESSION)
	{
		gdl_mask_add_idx (g->gmask, GDL_ACCESSION, gdl_mask_get_idx (gmask, GDL_ACCESSION, i));
	}	
}

static gdl_gpca_workspace *
gdl_gpca_tag_group_workspace_alloc (gdl_gpca_tag_group * g, const gdl_gpca_tag_workspace * w)
{
	if (w->gview)
	{
		g->gwrap = gdl_gview_wrapper_alloc (gdl_gview_wrapper_global, w->gview, g->gmask);
		g->gpca  = gdl_gpca_workspace_alloc (w->type, g->gwrap);
	}
	else if (w->hview)
	{
		g->hview = gdl_hview_extract (w->hview, g->gmask);
		//g->gpca  = gdl_gpca_workspace_halloc (w->type, g->hview);
	}
	else
	{
		GDL_ERROR_VAL ("Unexpected null pointer : Internal Error", GDL_FAILURE, 0);	
	}
	
	return g->gpca;
}

static void
gdl_gpca_tag_group_workspace_free (gdl_gpca_tag_group * g)
{
	if (g)
	{
		gdl_gpca_workspace_free (g->gpca);g->gpca=NULL;
		gdl_gview_wrapper_free (g->gwrap);g->gwrap=NULL;
		gdl_hview_free (g->hview);g->hview=NULL;
	}	
}

static void
gdl_gpca_tag_group_free (gdl_gpca_tag_group * g)
{
	if (g)
	{
		gdl_gpca_tag_group_workspace_free (g);
		gdl_mask_free (g->gmask);
		GDL_FREE (g);
	}	
}

const gdl_entity_type *
gdl_gpca_tag_workspace_mode (const gdl_gpca_workspace_type * T)
{
	if (   T == gdl_gpca_locus_allele_covariance
	    || T == gdl_gpca_locus_allele_correlation)
	{
		return GDL_LOCUS;
	}
	else if (   T == gdl_gpca_accession_allele_covariance
	         || T == gdl_gpca_accession_allele_correlation
	         || T == gdl_gpca_accession_allele_euclidean_distance)
	{
		return GDL_ACCESSION;
	}
	
	return NULL;
}

gdl_gpca_tag_workspace *
gdl_gpca_tag_workspace_alloc (const gdl_gpca_workspace_type * T, const gdl_gview_wrapper * gwrap)
{
	gdl_gpca_tag_workspace * t;
	
	t = GDL_CALLOC (gdl_gpca_tag_workspace, 1);
	
	t->type  = T;
	
	t->mode = gdl_gpca_tag_workspace_mode (T);
	
	if (t->mode == 0)
	{
   	GDL_FREE (t);
   	GDL_ERROR_VAL ("Invalid gdl_gpca_workspace_type in gdl_gpca_tag_workspace_galloc()",
   	               GDL_EINVAL,
   	               0);
   }
	
	t->gpca  = gdl_gpca_workspace_alloc (T, gwrap);
	
	t->data  = gdl_gpca_workspace_data (t->gpca);
	
	t->gview = gdl_gview_wrapper_gview (gwrap);
	
	t->mask  = gdl_gview_wrapper_gmask (gwrap);
	
	return t;
}

static 
gdl_gpca_workspace_clean (gdl_gpca_tag_workspace * m)
{
	if (m)
	{
		gdl_mask_free (m->tag);
		gdl_matrix_free (m->Z);
		gdl_hashtable_free (m->buffer);
		if (m->groups)
		{
			size_t i;
			for (i = 0; i < m->naxe; i++)
			{
				gdl_gpca_tag_group_free (m->groups[i]);	
			}
			GDL_FREE (m->groups);
		}
		m->naxe   = 0;
		m->Z      = NULL;
		m->buffer = NULL;
		m->groups = NULL;
	}
}

void
gdl_gpca_tag_workspace_free (gdl_gpca_tag_workspace * m)
{
	if (m)
	{
		gdl_gpca_workspace_clean (m);
		gdl_gpca_workspace_free (m->gpca);
		GDL_FREE (m);	
	}	
}

static int
gdl_gpca_tag_groups_alloc (gdl_gpca_tag_workspace * w, const gdl_entity_type * type)
{
	size_t i;
	gdl_entity_mask   * mask;
	const gdl_mask   * gmask;
	
	gmask = gdl_gmatrix_get_mask (w->data);
	
	if (w->mode == GDL_LOCUS)
	{
		mask = gdl_mask_get_clone (gmask, GDL_ACCESSION);
	}
	else
	{
		mask = gdl_mask_get_clone (gmask, GDL_LOCUS);
	}
	
	w->groups = GDL_MALLOC (gdl_gpca_tag_group *, w->naxe);
	
	for (i = 0; i < w->naxe; i++)
	{
		w->groups[i] = gdl_gpca_tag_group_alloc (type, mask, (!i) ? 1: 0);
	}
	
	return GDL_SUCCESS;
}

static void
gdl_gpca_tag_locus_groups_init (gdl_gpca_tag_workspace * w, const double loading_threshold)
{
	size_t i, j, k, l, nl, nc;
	double x;
	const gdl_mask   * gmask;
	
	gmask = gdl_gmatrix_get_mask (w->data);
	
	nl = gdl_gmatrix_locus_size (w->data);
	for (k = i = 0; i < nl; i++)
	{
		nc = gdl_gmatrix_locus_column_size (w->data, i);
		for (j = 0; j < nc; j++, k++)
		{
			for (l = 0; l < w->naxe; l++)
			{
				 x = fabs (gdl_matrix_get (w->Z, k, l));
				 if (x >= loading_threshold)
				 {
				 	 gdl_gpca_tag_group_add (w->groups[l], gmask, i);
				 }
			}
		}
	}
}

static void
gdl_gpca_tag_accession_groups_init (gdl_gpca_tag_workspace * w, const double loading_threshold)
{
	size_t i, j, na;
	double x;
	const gdl_mask * gmask;
	
	gmask = gdl_gmatrix_get_mask (w->data);
	
	na = gdl_gmatrix_row_size (w->data);
	
	for (i = 0; i < na; i++)
	{
		for (j = 0; j < w->naxe; j++)
		{
			 x = fabs (gdl_matrix_get (w->Z, i, j));
			 if (x >= loading_threshold)
			 {
			 	 gdl_gpca_tag_group_add (w->groups[j], gmask, i);
			 }
		}
	}
}

static void
gdl_gpca_tag_groups_init (gdl_gpca_tag_workspace * w, const double loading_threshold)
{
	if (w->mode == GDL_LOCUS)
	{
		gdl_gpca_tag_groups_alloc (w, GDL_LOCUS);
		gdl_gpca_tag_locus_groups_init (w, loading_threshold);
		if (w->logger)
		{
			size_t i;
			for (i = 0; i < w->naxe; i++)
			{
				fprintf (w->logger, "Group [ %d ] contains %d locus\n", i+1, gdl_gpca_tag_group_size (w->groups[i]));
			}
		}
	}
	else if (w->mode == GDL_ACCESSION)
	{
		gdl_gpca_tag_groups_alloc (w, GDL_ACCESSION);
		gdl_gpca_tag_accession_groups_init (w, loading_threshold);
		if (w->logger)
		{
			size_t i;
			for (i = 0; i < w->naxe; i++)
			{
				fprintf (w->logger, "Group [ %d ] contains %d accessions\n", i+1, gdl_gpca_tag_group_size (w->groups[i]));
			}
		}
	}
}

static void
gdl_gpca_tag_select (gdl_gpca_tag_workspace * w, gdl_gpca_tag_group * group, size_t i)
{
	size_t idx;
	gdl_entity * entity;
	
	idx = gdl_mask_get_idx (group->gmask, group->type, i);
	
	if (group->type == GDL_LOCUS)
	{
		entity = gdl_gview_get_locus (w->gview, idx);
	}
	else if (group->type == GDL_ACCESSION)
	{
		entity = gdl_gview_get_accession (w->gview, idx);
	}
	
	if (w->logger)
	{
		fprintf (w->logger, "\tSelect %s [ %d ]\n", entity->name, i);
	}
	
	gdl_hashtable_add (w->buffer, gdl_entity_get_name (entity), entity, 0);
}

static void
gdl_gpca_tag_locus_group_select (gdl_gpca_tag_workspace * w,
                                    gdl_gpca_tag_group * group,
                                    const gdl_gpca_workspace * gpca,
                                    size_t naxe)
{
	size_t i, j, k, l, nl, nc, maxl;
	double x, max;
	const gdl_matrix  * E;
	const gdl_gmatrix * data;
	
	E       = gdl_gpca_workspace_rotation (gpca);
	data    = gdl_gpca_workspace_data (gpca);
	
	nl = gdl_gmatrix_locus_size (data);
	
	for (j = 0; j < naxe; j++)
	{
		for (max = 0, maxl = i = l = 0; l < nl; l++)
		{
			nc = gdl_gmatrix_locus_column_size (data, l);
			
			for (k = 0; k < nc; k++, i++)
			{
				x = fabs (gdl_matrix_get (E, i, j));
				if (x > max)
				{
					max  = x;
					maxl = l;
				}
			}
		}
		gdl_gpca_tag_select (w, group, maxl);
	}
}

static void
gdl_gpca_tag_accession_group_select (gdl_gpca_tag_workspace * w,
                                     gdl_gpca_tag_group * group,
                                     const gdl_gpca_workspace * gpca,
                                     size_t naxe)
{
	size_t i, j, na, maxi;
	double x, max;
	const gdl_matrix  * E;
	const gdl_gmatrix * data;
	
	E       = gdl_gpca_workspace_rotation (gpca);
	data    = gdl_gpca_workspace_data (gpca);
	
	na = gdl_gmatrix_row_size (data);
	
	for (j = 0; j < naxe; j++)
	{
		for (max = 0, maxi = i = 0; i < na; i++)
		{
			x = fabs (gdl_matrix_get (E, i, j));
			if (x > max)
			{
				max  = x;
				maxi = i;
			}
		}
		gdl_gpca_tag_select (w, group, maxi);
	}
}

static int
gdl_gpca_tag_groups_select (gdl_gpca_tag_workspace * w, const double scree_threshold)
{
	size_t i;
	
	for (i = 0; i < w->naxe; i++)
	{
		if (gdl_gpca_tag_group_size (w->groups[i]) > 1)
		{
			size_t naxe;
			gdl_gpca_workspace * gpca;
			
			gpca = gdl_gpca_tag_group_workspace_alloc (w->groups[i], w);
			
			if (w->logger)
			{
				fprintf (w->logger, "Perform PCA on group [ %d ] : wait...", i);
			}
			
			gdl_gpca_workspace_perform (gpca, 0);
			
			if (w->logger)
			{
				fprintf (w->logger, "\b\b\b\b\b\b\b[ OK ]\n");
			}
			
			naxe = gdl_gpca_workspace_scree_threshold (gpca, scree_threshold);
			
			if (w->groups[i]->type == GDL_LOCUS)
			{
				gdl_gpca_tag_locus_group_select (w, w->groups[i], gpca, naxe);
			}
			else if (w->groups[i]->type == GDL_ACCESSION)
			{
				gdl_gpca_tag_accession_group_select (w, w->groups[i], gpca, naxe);
			}
			
			gdl_gpca_tag_group_workspace_free (w->groups[i]);
		}
		else
		{
			if (w->logger)
			{
				fprintf (w->logger, "Group [ %d ] is a singleton\n", i);
			}
			gdl_gpca_tag_select (w, w->groups[i], 0);
		}
	}
}

static int
gdl_gpca_tag_varimax (gdl_gpca_tag_workspace * w)
{
	size_t i, j;
	const gdl_matrix * CE;
	gdl_matrix * E, * T;
		
	CE = gdl_gpca_workspace_rotation (w->gpca);
	E  = gdl_matrix_alloc (CE->size1, CE->size2);
	gdl_matrix_memcpy (E, CE);
	
	T    = gdl_matrix_alloc (E->size2, E->size2);
	w->Z = gdl_matrix_alloc (E->size1, E->size2);
	
	if (w->logger)
	{
		fprintf (w->logger, "Start Varimax : wait...");
	}
   gdl_linalg_varimax (E, T, w->Z, 1.e-5, 1000);
   if (w->logger)
	{
		fprintf (w->logger, "\b\b\b\b\b\b\b[ OK ]\n");
	}
//	
//	for (i = 0; i < w->Z->size1; i++)
//	{
//		printf ("Z[%d]", i);
//		for (j = 0; j < w->naxe; j++)
//		{
//			printf (" %1.2f", gdl_matrix_get (w->Z, i, j));
//		}
//		printf ("\n");	
//	}
	
	gdl_matrix_free (E);
	gdl_matrix_free (T);
}

static gdl_entity_mask *
gdl_gpca_tag_build_entity_mask (gdl_gpca_tag_workspace * w)
{
	gdl_entity_mask * mask = NULL;
	
	if (gdl_hashtable_size (w->buffer))
	{
		gdl_hashtable_itr * itr;
		
		//printf ("BUFFER SIZE %d\n", gdl_hashtable_size (w->buffer));
		
		mask = gdl_entity_mask_new (gdl_hashtable_size (w->buffer));
		
		itr  = gdl_hashtable_iterator (w->buffer);
		
		do
		{
			gdl_entity * e = (gdl_entity *) gdl_hashtable_iterator_value (itr);
			//printf ("TAG %d %s\n", e->idx, e->name);
			gdl_entity_mask_add (mask, e);
		}
		while (gdl_hashtable_iterator_next (itr));
		
		gdl_hashtable_iterator_free (itr);
	}
	
	return mask;	
}

static gdl_mask *
gdl_gpca_tag_build_gmask (gdl_gpca_tag_workspace * w)
{
	gdl_entity_mask * mask;
	gdl_mask       * tag = NULL;
	
	mask  = gdl_gpca_tag_build_entity_mask (w);
	
	if (mask)
	{
		tag = gdl_mask_alloc ();
		
		if (w->mode == GDL_LOCUS)
		{
			gdl_mask_set (tag, GDL_ACCESSION, gdl_mask_get_clone (w->mask, GDL_ACCESSION), 1);
			gdl_mask_set (tag, GDL_LOCUS, mask, 1);
		}
		else if (w->mode == GDL_ACCESSION)
		{
			gdl_mask_set (tag, GDL_LOCUS, gdl_mask_get_clone (w->mask, GDL_LOCUS), 1);
			gdl_mask_set (tag, GDL_ACCESSION, mask, 1);
		}
	}
	
	return tag;
}

int
gdl_gpca_tag_workspace_perform (gdl_gpca_tag_workspace * w, const double scree_threshold, const double loading_threshold)
{
	gdl_mask * tag;
	
	w->buffer = gdl_hashtable_alloc (gdl_hash_default, 0);
	
	if (w->logger)
	{
		fprintf (w->logger, "--\n");
		fprintf (w->logger, "GPCA Tag parameters\n");
		fprintf (w->logger, "\tscree threshold   = %1.2f\n", scree_threshold);
		fprintf (w->logger, "\tloading threshold = %1.2f\n", loading_threshold);
		fprintf (w->logger, "--\n");
	}
	
	gdl_gpca_workspace_perform (w->gpca, 0);
	
	w->naxe = gdl_gpca_workspace_scree_threshold (w->gpca, scree_threshold);
	
	if (w->logger)
	{
		fprintf (w->logger, "Retains %d PC axes\n", w->naxe);
	}
	
	gdl_gpca_tag_varimax (w);
	
	gdl_gpca_tag_groups_init (w, loading_threshold);
	
	gdl_gpca_tag_groups_select (w, scree_threshold);
	
	tag = gdl_gpca_tag_build_gmask (w);
	
	gdl_gpca_workspace_clean (w);
	
	w->tag = tag;
	
	return GDL_SUCCESS;
}

const gdl_mask *
gdl_gpca_tag_workspace_get (const gdl_gpca_tag_workspace * w)
{
	return w->tag;	
}

const gdl_gview *
gdl_gpca_tag_workspace_data (const gdl_gpca_tag_workspace * m)
{
	return m->gview;	
}

const gdl_gpca_workspace_type *
gdl_gpca_tag_workspace_type (const gdl_gpca_tag_workspace * m)
{
	return m->type;	
}

FILE *
gdl_gpca_tag_workspace_set_logger (gdl_gpca_tag_workspace * w, FILE * logger)
{
	FILE * old = w->logger;
	
	w->logger = logger;
	
	return old; 	
}
