/* 
 * multireg/multireg.c
 * 
 * $Author: baptiste $, $Date: 2008-05-13 15:33:53 $, $Version$
 *
 * Libgdl : a C library for statistical genetics
 * 
 * Copyright (C) 2003-2006  Jean-Baptiste Veyrieras, INRA, France.
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
#include <stdlib.h>

#include <gdl/gdl_common.h>
#include <gdl/gdl_math.h>
#include <gdl/gdl_vector.h>
#include <gdl/gdl_matrix.h>
#include <gdl/gdl_statistics.h>
#include <gdl/gdl_specfunc.h>
#include <gdl/gdl_rng.h>
#include <gdl/gdl_randist.h>
#include <gdl/gdl_multifit.h>
#include <gdl/gdl_multireg.h>

struct _gdl_multireg_linear_workspace
{
  size_t n; /* max number of observations */
  size_t p; /* max number of parameters */
  size_t df1;
  size_t df2;
  double rsquare;
  double fstat;
  double pval;
  double tss;
  double rss0;
  double rss1;
  double rss;
  gdl_multifit_linear_workspace * fit;
  gdl_vector * coef;
  gdl_matrix * cov;
  gdl_vector_view coefv;
  gdl_matrix_view covv;
};

gdl_multireg_linear_workspace *
gdl_multireg_linear_workspace_alloc (size_t n, size_t p)
{
	gdl_multireg_linear_workspace * w;
	
	w = GDL_CALLOC (gdl_multireg_linear_workspace, 1);
	
	w->n = n;
	w->p = p;
	w->coef  = gdl_vector_alloc (p);
	w->cov   = gdl_matrix_alloc (p, p);
	
	return w;
}

void
gdl_multireg_linear_workspace_free (gdl_multireg_linear_workspace * w)
{
	if (w)
	{
		gdl_vector_free (w->coef);
		gdl_matrix_free (w->cov);
		GDL_FREE (w);	
	}	
}

size_t
gdl_multireg_linear_workspace_size1 (const gdl_multireg_linear_workspace * w)
{
	return w->n;
}

size_t
gdl_multireg_linear_workspace_size2 (const gdl_multireg_linear_workspace * w)
{
	return w->p;
}


static void
gdl_multireg_prepare (gdl_multireg_linear_workspace * w, const gdl_matrix * X, const gdl_vector * y)
{
	w->coefv = gdl_vector_subvector (w->coef, 0, X->size2);
	w->covv  = gdl_matrix_submatrix (w->cov, 0, 0, X->size2, X->size2);
	
	w->fit = gdl_multifit_linear_alloc (X->size1, X->size2);
	
	if (y)
	{
		w->tss = gdl_stats_variance (y->data, y->stride, y->size) * (y->size - 1);
	}
}

int
gdl_multireg_linear_perform (gdl_multireg_linear_workspace * w, const gdl_matrix * X, const gdl_vector * y)
{
	if (y->size > w->n)
	{
		GDL_ERROR_VAL ("Vector size exceeds workspace dimension", GDL_FAILURE, GDL_FAILURE);	
	}
	if (X->size1 > w->n)
	{
		GDL_ERROR_VAL ("Matrix size1 exceeds workspace dimension", GDL_FAILURE, GDL_FAILURE);	
	}
	if (X->size2 > w->p)
	{
		GDL_ERROR_VAL ("Matrix size2 exceeds workspace dimension", GDL_FAILURE, GDL_FAILURE);	
	}
	
	gdl_multireg_prepare (w, X, y);
	
	gdl_multifit_linear (X, y, &((w->coefv).vector), &((w->covv).matrix), &(w->rss), w->fit);
	
	w->rss0 = w->tss;
	w->rss1 = w->rss;
   w->df1 = X->size1 - X->size2;
	w->df2 = X->size2 - 1;
	w->rsquare = (w->tss-w->rss)/w->tss;
	w->fstat   = (w->tss-w->rss)*(w->df1)/(w->rss*(w->df2));
	
	if (w->fstat > 0.0)
	{
		w->pval = gdl_sf_beta_inc (0.5*w->df1, 0.5*w->df2,  w->df1/(w->df1 + w->df2*w->fstat));
	}
	else
	{
		w->rsquare = w->fstat = 0.0;
		w->pval    = 1.0;
	}
	
	gdl_multifit_linear_free (w->fit);
	
	return GDL_SUCCESS;
}

int
gdl_multireg_linear_eval_permut (gdl_multireg_linear_workspace * w, const gdl_matrix * X0, const gdl_matrix * X1, const gdl_vector * y, const size_t boot, const gdl_rng * rng)
{
	if (y->size > w->n)
	{
		GDL_ERROR_VAL ("Vector size exceeds workspace dimension", GDL_FAILURE, GDL_FAILURE);	
	}
	if (X0->size1 > w->n)
	{
		GDL_ERROR_VAL ("Matrix X0 size1 exceeds workspace dimension", GDL_FAILURE, GDL_FAILURE);	
	}
	if (X1->size1 > w->n)
	{
		GDL_ERROR_VAL ("Matrix X1 size1 exceeds workspace dimension", GDL_FAILURE, GDL_FAILURE);	
	}
	if (X0->size2 + X1->size2 > w->p)
	{
		GDL_ERROR_VAL ("Matrix X0 + X1 size2 exceeds workspace dimension", GDL_FAILURE, GDL_FAILURE);	
	}
	size_t i, j;
	double yi, y_est, fboot;
	gdl_vector * epsilon;
	gdl_matrix * X;
	gdl_matrix_view V;
	
	gdl_multireg_prepare (w, X0, y);
	
	gdl_multifit_linear (X0, y, &((w->coefv).vector), &((w->covv).matrix), &(w->rss0), w->fit);
	
	// store the residuals
	epsilon = gdl_vector_alloc (X0->size1);
	for (i = 0; i < epsilon->size; i++)
   {
      yi = gdl_vector_get (y, i);
      gdl_vector_const_view row = gdl_matrix_const_row (X0, i);
      gdl_blas_ddot (&row.vector, &((w->coefv).vector), &y_est);
      gdl_vector_set (epsilon, i, yi - y_est);
   }
	gdl_multifit_linear_free (w->fit);
	// Merge X0 + X1
	X = gdl_matrix_alloc (y->size, X0->size2 + X1->size2);
	V = gdl_matrix_submatrix (X, 0, 0, X0->size1, X0->size2);
	gdl_matrix_memcpy (&(V.matrix), X0);
	V = gdl_matrix_submatrix (X, 0, X0->size2, X1->size1, X1->size2);
	gdl_matrix_memcpy (&(V.matrix), X1);
	
	gdl_multireg_prepare (w, X, 0);
	
	gdl_multifit_linear (X, y, &((w->coefv).vector), &((w->covv).matrix), &(w->rss1), w->fit);
  
   gdl_multifit_linear_free (w->fit);
   
   w->rss = w->rss1;
   w->df1 = X->size1 - X->size2;
	w->df2 = X->size2 - X0->size2;
	w->rsquare = (w->rss0-w->rss1)/w->tss;
	w->fstat   = (w->rss0-w->rss1)*(w->df1)/(w->rss1*(w->df2));
	
	w->pval = 0;
	for(i = 0; i < boot; i++)
	{
		gdl_ran_shuffle (rng, epsilon->data, epsilon->size, sizeof(double));
		gdl_multireg_prepare (w, X1, epsilon);
		gdl_multifit_linear (X1, epsilon, &((w->coefv).vector), &((w->covv).matrix), &(w->rss1), w->fit);
  		gdl_multifit_linear_free (w->fit);
  		fboot = (w->rss0-w->rss1)*(w->df1)/(w->rss1*(w->df2));
  		//printf ("%d F(boot) = %g > %g\n", i, fboot, w->fstat);
  		fflush (stdout);
  		if (fboot > w->fstat) w->pval+=1.;
  	}
  	w->pval /= boot;
	
	gdl_matrix_free (X);
	gdl_vector_free (epsilon);
	
	return GDL_SUCCESS;
}

int
gdl_multireg_linear_eval (gdl_multireg_linear_workspace * w, const gdl_matrix * X0, const gdl_matrix * X1, const gdl_vector * y)
{
	if (y->size > w->n)
	{
		GDL_ERROR_VAL ("Vector size exceeds workspace dimension", GDL_FAILURE, GDL_FAILURE);	
	}
	if (X0->size1 > w->n)
	{
		GDL_ERROR_VAL ("Matrix X0 size1 exceeds workspace dimension", GDL_FAILURE, GDL_FAILURE);	
	}
	if (X1->size1 > w->n)
	{
		GDL_ERROR_VAL ("Matrix X1 size1 exceeds workspace dimension", GDL_FAILURE, GDL_FAILURE);	
	}
	if (X0->size2 + X1->size2 > w->p)
	{
		GDL_ERROR_VAL ("Matrix X0 + X1 size2 exceeds workspace dimension", GDL_FAILURE, GDL_FAILURE);	
	}
	size_t i, j;
	gdl_matrix * X;
	gdl_matrix_view V;
	
	gdl_multireg_prepare (w, X0, y);
	
	gdl_multifit_linear (X0, y, &((w->coefv).vector), &((w->covv).matrix), &(w->rss0), w->fit);
	
	gdl_multifit_linear_free (w->fit);
	
	// Merge X0 + X1
	X = gdl_matrix_alloc (y->size, X0->size2 + X1->size2);
	V = gdl_matrix_submatrix (X, 0, 0, X0->size1, X0->size2);
	gdl_matrix_memcpy (&(V.matrix), X0);
	V = gdl_matrix_submatrix (X, 0, X0->size2, X1->size1, X1->size2);
	gdl_matrix_memcpy (&(V.matrix), X1);
	
	gdl_multireg_prepare (w, X, 0);
	
	gdl_multifit_linear (X, y, &((w->coefv).vector), &((w->covv).matrix), &(w->rss1), w->fit);
  
   gdl_multifit_linear_free (w->fit);
   
   w->rss = w->rss1;
   w->df1 = X->size1 - X->size2;
	w->df2 = X->size2 - X0->size2;
	w->rsquare = (w->rss0-w->rss1)/w->tss;
	w->fstat   = (w->rss0-w->rss1)*(w->df1)/(w->rss1*(w->df2));
	
	if (w->fstat > 0.0)
	{
		w->pval = gdl_sf_beta_inc (0.5*w->df1, 0.5*w->df2,  w->df1/(w->df1 + w->df2*w->fstat));
	}
	else
	{
		w->rsquare = w->fstat = 0.0;
		w->pval    = 1.0;
	}
	
	gdl_matrix_free (X);
	
	return GDL_SUCCESS;
}

const gdl_vector *
gdl_multireg_linear_coefficients (const gdl_multireg_linear_workspace * w)
{
	return &((w->coefv).vector);
}

const gdl_matrix *
gdl_multireg_linear_covariance (const gdl_multireg_linear_workspace * w)
{
	return &((w->covv).matrix);
}

double
gdl_multireg_linear_rss (const gdl_multireg_linear_workspace * w)
{
	return w->rss;
}

double
gdl_multireg_linear_tss (const gdl_multireg_linear_workspace * w)
{
	return w->tss;	
}

double
gdl_multireg_linear_rsquare (const gdl_multireg_linear_workspace * w)
{
	return (w->tss-w->rss)/w->tss;
}

size_t
gdl_multireg_linear_df (const gdl_multireg_linear_workspace * w)
{
	return w->df1;	
}

double
gdl_multireg_linear_eval_rsquare (const gdl_multireg_linear_workspace * w)
{
	return w->rsquare;	
}

double
gdl_multireg_linear_eval_rss0 (const gdl_multireg_linear_workspace * w)
{
	return w->rss0;	
}

double
gdl_multireg_linear_eval_rss1 (const gdl_multireg_linear_workspace * w)
{
	return w->rss1;	
}

size_t
gdl_multireg_linear_eval_df (const gdl_multireg_linear_workspace * w)
{
	return w->df2;	
}

double
gdl_multireg_linear_eval_fstat (const gdl_multireg_linear_workspace * w)
{
	return w->fstat;	
}

double
gdl_multireg_linear_eval_pvalue (const gdl_multireg_linear_workspace * w)
{
	return w->pval;
}
