#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

#include "external_functions.h"



/*
 * Likelihood value plus its first and second partial derivatives with
 * respect to the three model parameters (theta, cov1_size, s), as
 * accumulated by derivative_binomial_glmm().  The three letters after
 * "DL_" / "D2L_" name the parameter(s) the derivative is taken in.
 */
typedef struct  {
        /* accumulated likelihood */
        double  L;

        /* first derivatives of L */
        double  DL_theta;
        double  DL_cov1;
        double  DL_s;

        /* pure second derivatives of L */
        double  D2L_theta;
        double  D2L_cov1;
        double  D2L_s;

        /* mixed second derivatives of L */
        double  D2L_theta_cov1;
        double  D2L_theta_s;
        double  D2L_cov1_s;
} derivative_return;



/*
 * Accumulate the likelihood and its first/second partial derivatives over
 * the cubature points held by this process.
 *
 * For each cubature point c (weight pu_vec[c], random-effect row
 * u_index_vec[c] of u_vec) and each observation i the linear predictor is
 *     z = theta + cov1_size*cov1[i] + s*u
 * with p = exp(z)/(1+exp(z)) and per-observation likelihood
 * p^Y[i] * (1-p)^(1-Y[i]).
 *
 * Parameters:
 *   theta, cov1_size, s  current parameter values
 *   nobs                 number of observations
 *   Y, cov1              response and covariate, nobs entries each
 *                        (presumably Y is 0/1 -- TODO confirm against input data)
 *   ncube                number of cubature points to visit
 *   u_vec                random effects, read at u_vec[u_index*nobs + i]
 *   pu_vec               weight of each cubature point
 *   u_index_vec          row of u_vec used by each cubature point
 *
 * Returns the weighted sums over the visited cubature points; the values
 * are NOT divided by the likelihood (calculate_derivatives() does that).
 */
derivative_return derivative_binomial_glmm(double theta,double cov1_size,double s,
                                 	   int nobs,double *Y,double *cov1,
                                 	   int ncube,double *u_vec,double *pu_vec,int *u_index_vec)  
{
double          	u,pu;
int			u_index;
double          	z,ez,p;
double          	L;
double          	Dlz_theta,Dlz_cov1,Dlz_s;
double          	D2lz_theta,D2lz_cov1,D2lz_s;
double          	D2lz_theta_cov1,D2lz_theta_s;
double          	D2lz_cov1_s;
double          	prod_L;
double          	sum_Dlz_theta,sum_Dlz_cov1,sum_Dlz_s;
double          	sum_D2lz_theta,sum_D2lz_cov1,sum_D2lz_s;
double          	sum_D2lz_theta_cov1,sum_D2lz_theta_s;
double          	sum_D2lz_cov1_s;
double          	L_acc;
double          	DL_theta_acc,DL_cov1_acc,DL_s_acc;
double          	D2L_theta_acc,D2L_cov1_acc,D2L_s_acc;
double          	D2L_theta_cov1_acc,D2L_theta_s_acc;
double          	D2L_cov1_s_acc;
double			pL_pu;
derivative_return	ret_struct;
int             	i,c;


L_acc = 0;

DL_theta_acc = 0;
DL_cov1_acc = 0;
DL_s_acc = 0;

D2L_theta_acc = 0;
D2L_cov1_acc = 0;
D2L_s_acc = 0;

D2L_theta_cov1_acc = 0;
D2L_theta_s_acc = 0;

D2L_cov1_s_acc = 0;

for (c = 0; c < ncube; c++)  {
        pu = pu_vec[c];

        /* per-cubature-point product of likelihoods and sums of log-likelihood derivatives */
        prod_L = 1;

        sum_Dlz_theta = 0;
        sum_Dlz_cov1 = 0;
        sum_Dlz_s = 0;

        sum_D2lz_theta = 0;
        sum_D2lz_cov1 = 0;
        sum_D2lz_s = 0;

        sum_D2lz_theta_cov1 = 0;
        sum_D2lz_theta_s = 0;

        sum_D2lz_cov1_s = 0;

	u_index = u_index_vec[c];

        for (i = 0; i < nobs; i++)  {
		u = u_vec[(u_index*nobs)+i];

                z = theta + cov1_size*cov1[i] + s*u;

                /* exp(z) overflows to +infinity for large z and inf/inf would be NaN;
                   use the limiting value p = 1 there.  Finite exp(z) is handled
                   exactly as before. */
                ez = exp(z);
                p = isinf(ez) ? 1.0 : ez / (1 + ez);

                L = pow(p,Y[i]) * pow(1-p,1-Y[i]);

                prod_L = prod_L * L;

                /* d(log L_i)/dz = Y - p; chain rule: dz/dtheta = 1, dz/dcov1_size = cov1[i], dz/ds = u */
                Dlz_theta = Y[i] - p;
                Dlz_cov1 = cov1[i]*Dlz_theta;
                Dlz_s = u*Dlz_theta;

                /* d2(log L_i)/dz2 = -p(1-p) = p*p - p */
                D2lz_theta = p*p - p;

                D2lz_theta_cov1 = cov1[i]*D2lz_theta;
                D2lz_theta_s = u*D2lz_theta;

                D2lz_cov1 = cov1[i]*D2lz_theta_cov1;
                D2lz_s = u*D2lz_theta_s;
                D2lz_cov1_s = cov1[i]*D2lz_theta_s;

                sum_Dlz_theta = sum_Dlz_theta + Dlz_theta;
                sum_Dlz_cov1 = sum_Dlz_cov1 + Dlz_cov1;
                sum_Dlz_s = sum_Dlz_s + Dlz_s;

                sum_D2lz_theta = sum_D2lz_theta + D2lz_theta;
                sum_D2lz_cov1 = sum_D2lz_cov1 + D2lz_cov1;
                sum_D2lz_s = sum_D2lz_s + D2lz_s;

                sum_D2lz_theta_cov1 = sum_D2lz_theta_cov1 + D2lz_theta_cov1;
                sum_D2lz_theta_s = sum_D2lz_theta_s + D2lz_theta_s;

                sum_D2lz_cov1_s = sum_D2lz_cov1_s + D2lz_cov1_s;
                }

	/* weighted likelihood contribution of this cubature point */
	pL_pu = prod_L*pu;

        L_acc = L_acc + pL_pu;                              

        /* dL = L * d(log L) */
        DL_theta_acc = DL_theta_acc + sum_Dlz_theta*pL_pu;
        DL_cov1_acc = DL_cov1_acc + sum_Dlz_cov1*pL_pu;
        DL_s_acc = DL_s_acc + sum_Dlz_s*pL_pu;

        /* d2L = L * (d2(log L) + d(log L)*d(log L)) */
        D2L_theta_acc = D2L_theta_acc + (sum_D2lz_theta + sum_Dlz_theta*sum_Dlz_theta)*pL_pu;
        D2L_cov1_acc = D2L_cov1_acc + (sum_D2lz_cov1 + sum_Dlz_cov1*sum_Dlz_cov1)*pL_pu;
        D2L_s_acc = D2L_s_acc + (sum_D2lz_s + sum_Dlz_s*sum_Dlz_s)*pL_pu;

        D2L_theta_cov1_acc = D2L_theta_cov1_acc + (sum_D2lz_theta_cov1 + sum_Dlz_theta*sum_Dlz_cov1)*pL_pu;
        D2L_theta_s_acc = D2L_theta_s_acc + (sum_D2lz_theta_s + sum_Dlz_theta*sum_Dlz_s)*pL_pu;

        D2L_cov1_s_acc = D2L_cov1_s_acc + (sum_D2lz_cov1_s + sum_Dlz_cov1*sum_Dlz_s)*pL_pu;
        }


ret_struct.L = L_acc;

ret_struct.DL_theta = DL_theta_acc;
ret_struct.DL_cov1 = DL_cov1_acc;
ret_struct.DL_s = DL_s_acc;

ret_struct.D2L_theta = D2L_theta_acc;
ret_struct.D2L_cov1 = D2L_cov1_acc;
ret_struct.D2L_s = D2L_s_acc;

ret_struct.D2L_theta_cov1 = D2L_theta_cov1_acc;
ret_struct.D2L_theta_s = D2L_theta_s_acc;

ret_struct.D2L_cov1_s = D2L_cov1_s_acc;

return(ret_struct);
}



/*
 * Add the partial results of every other rank (1 .. totnodes-1) to d_ret
 * and return the sum.
 *
 * Each rank r is expected to deliver ten single-double messages tagged r,
 * in the field order listed in the table below (the worker branch of
 * main() sends them in that order).
 */
derivative_return gather_nodes(derivative_return d_ret,int totnodes)  
{
double		*field[10];
double		incoming;
MPI_Status	status;
int		src,k;

/* receive order == order of the worker's sends */
field[0] = &d_ret.L;
field[1] = &d_ret.DL_theta;
field[2] = &d_ret.DL_cov1;
field[3] = &d_ret.DL_s;
field[4] = &d_ret.D2L_theta;
field[5] = &d_ret.D2L_cov1;
field[6] = &d_ret.D2L_s;
field[7] = &d_ret.D2L_theta_cov1;
field[8] = &d_ret.D2L_theta_s;
field[9] = &d_ret.D2L_cov1_s;

for (src = 1; src < totnodes; src++)  {
	for (k = 0; k < 10; k++)  {
		MPI_Recv(&incoming,1,MPI_DOUBLE,src,src,MPI_COMM_WORLD,&status);
		*field[k] = *field[k] + incoming;
		}
	}

return(d_ret);
}



/*
 * Convert derivatives of the likelihood L into derivatives of ln L.
 *
 *   *L      receives d_ret.L
 *   DlnL    (3 entries: theta, cov1, s) receives dL/L
 *   D2lnL   (9 entries, row-major 3x3, symmetric) receives the NEGATED
 *           second derivatives of ln L:  -(d2L/L - (dL/L)(dL'/L))
 *
 * No check is made for d_ret.L == 0.
 */
void calculate_derivatives(derivative_return d_ret,double *L,double *DlnL,double *D2lnL)
{
double	lik;
double	g_theta,g_cov1,g_s;
double	h_theta_cov1,h_theta_s,h_cov1_s;

lik = d_ret.L;
*L = lik;

/* gradient of ln L */
g_theta = d_ret.DL_theta / lik;
g_cov1 = d_ret.DL_cov1 / lik;
g_s = d_ret.DL_s / lik;

DlnL[0] = g_theta;
DlnL[1] = g_cov1;
DlnL[2] = g_s;

/* diagonal of the negated Hessian of ln L */
D2lnL[0] = -((d_ret.D2L_theta / lik) - g_theta*g_theta);
D2lnL[4] = -((d_ret.D2L_cov1 / lik) - g_cov1*g_cov1);
D2lnL[8] = -((d_ret.D2L_s / lik) - g_s*g_s);

/* off-diagonal entries, mirrored across the diagonal */
h_theta_cov1 = (d_ret.D2L_theta_cov1 / lik) - g_theta*g_cov1;
h_theta_s = (d_ret.D2L_theta_s / lik) - g_theta*g_s;
h_cov1_s = (d_ret.D2L_cov1_s / lik) - g_cov1*g_s;

D2lnL[3] = -h_theta_cov1;
D2lnL[1] = D2lnL[3];
D2lnL[6] = -h_theta_s;
D2lnL[2] = D2lnL[6];
D2lnL[7] = -h_cov1_s;
D2lnL[5] = D2lnL[7];
}


/*
 * One update step: solve the symmetric 3x3 system  D2lnL * y = DlnL  via
 * the adjugate (cofactor) formula and return the parameters shifted by y.
 *
 *   theta, cov1_size, s      current parameters
 *   DlnL                     3-vector (right-hand side)
 *   D2lnL                    3x3 row-major matrix; only entries
 *                            [0],[1],[2],[4],[5],[8] (upper triangle) are read
 *   theta_star, cov1_size_star, s_star
 *                            receive the updated parameters
 *
 * If the determinant is exactly zero the parameters are returned unchanged.
 */
void calculate_parameters(double theta,double cov1_size,double s,
			  double *DlnL,double *D2lnL,
			  double *theta_star,double *cov1_size_star,double *s_star)
{
double	a11,a12,a13,a22,a23,a33;
double	g1,g2,g3;
double	cof11,cof12,cof13,cof22,cof23,cof33;
double	det;
double	step_theta,step_cov1,step_s;

a11 = D2lnL[0];
a12 = D2lnL[1];
a13 = D2lnL[2];
a22 = D2lnL[4];
a23 = D2lnL[5];
a33 = D2lnL[8];

g1 = DlnL[0];
g2 = DlnL[1];
g3 = DlnL[2];

/* cofactors of the symmetric matrix (adjugate is symmetric too) */
cof11 = a22*a33 - a23*a23;
cof12 = a13*a23 - a12*a33;
cof13 = a12*a23 - a13*a22;
cof22 = a11*a33 - a13*a13;
cof23 = a12*a13 - a11*a23;
cof33 = a11*a22 - a12*a12;

/* expansion along the first row */
det = a11*cof11 + a12*cof12 + a13*cof13;

if (det == 0)  {
	/* singular system: leave the parameters where they are */
	*theta_star = theta;
	*cov1_size_star = cov1_size;
	*s_star = s;
	return;
	}

step_theta = (g1*cof11 + g2*cof12 + g3*cof13) / det;
step_cov1 = (g1*cof12 + g2*cof22 + g3*cof23) / det;
step_s = (g1*cof13 + g2*cof23 + g3*cof33) / det;

*theta_star = theta + step_theta;
*cov1_size_star = cov1_size + step_cov1;
*s_star = s + step_s;
}



/*
 * Re-weight, cull, compact and renormalise this process's cubature points.
 * Must be called by every rank at the same time: rank 0 acts as collector
 * for two reductions, all other ranks send to / receive from rank 0.
 *
 * Steps:
 *   1. For each of the first ncube points set
 *          pu_vec[c] = (product of per-observation likelihoods) * pu_vec[c] / L
 *      and count the points whose new weight is > 0.
 *   2. Rank 0 sums the counts of all ranks, divides thresh by the total and
 *      sends the scaled threshold to every other rank.
 *   3. Weights below the scaled threshold are set to 0.
 *   4. Surviving weights are moved to the front of pu_vec; the matching
 *      u_index_vec entries are swapped along with them.
 *   5. Rank 0 sums the surviving weights of all ranks and distributes the
 *      total; every rank divides its surviving weights by it.
 *
 * Parameters:
 *   theta, cov1_size, s   parameter values used for the likelihood
 *   nobs, Y, cov1         observations (nobs entries each)
 *   ncube                 number of active cubature points on this rank
 *   u_vec                 random effects, read at u_vec[u_index*nobs + i]
 *   pu_vec, u_index_vec   weights / u_vec rows; both are modified in place
 *   L                     likelihood used as the divisor in step 1
 *   node, totnodes        this rank and the number of ranks
 *   thresh                culling threshold before scaling (step 2)
 *
 * Returns the number of points on this rank that survived the culling.
 */
int update_pu(double theta,double cov1_size,double s,
              int nobs,double *Y,double *cov1,
              int ncube,double *u_vec,double *pu_vec,int *u_index_vec,
              double L,
	      int node,int totnodes,double thresh)  
{
double          u,pu;
int		u_index;
double          z,ez,p;
double          L_u,L_prod,pu_0;
int		nonzero_cube;
double		pu_acc;
int             i,c,r,inttool;
double		dtool;
int		zero_index,nonzero_index;
MPI_Status	stat;


/* step 1: new weight of every cubature point */
nonzero_cube = 0;
for (c = 0; c < ncube; c++)  {
        pu = pu_vec[c];

	u_index = u_index_vec[c];

        L_prod = 1;

        for (i = 0; i < nobs; i++)  {
                u = u_vec[(u_index*nobs)+i];

                z = theta + cov1_size*cov1[i] + s*u;

                /* exp(z) overflows to +infinity for large z and inf/inf would be NaN;
                   use the limiting value p = 1 there.  Finite exp(z) is handled
                   exactly as before. */
                ez = exp(z);
                p = isinf(ez) ? 1.0 : ez / (1 + ez);

                L_u = pow(p,Y[i]) * pow(1-p,1-Y[i]);

                L_prod = L_prod * L_u;
                }

        pu_0 = (L_prod*pu) / L;                         

	if ((pu_0 < 0) || (pu_0 > 1))  {
		printf("Warning: pu_0= %le\n",pu_0);
		printf("Diagnostic: L_prod= %le pu= %le L_prod*pu= %le L= %le\n",L_prod,pu,L_prod*pu,L);
		}
		
	pu_vec[c] = pu_0;
	if (pu_0 > 0)  {
		nonzero_cube++;
		}
        }

/* step 2: scale the threshold by the global number of positive weights */
if (node == 0)  {
	for (r = 1; r < totnodes; r++)  {
	      	MPI_Recv(&inttool,1,MPI_INT,r,r,MPI_COMM_WORLD,&stat);
		nonzero_cube = nonzero_cube + inttool;
		}
	thresh = thresh / (double)(nonzero_cube);
	for (r = 1; r < totnodes; r++)  {
		MPI_Send(&thresh,1,MPI_DOUBLE,r,0,MPI_COMM_WORLD);
		}
	}
else	{
	MPI_Send(&nonzero_cube,1,MPI_INT,0,node,MPI_COMM_WORLD);
        MPI_Recv(&thresh,1,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&stat);
	}

/* step 3: cull small weights; from here on nonzero_cube is the LOCAL survivor count */
pu_acc = 0;
nonzero_cube = 0;
for (c = 0; c < ncube; c++)  {
	if (pu_vec[c] < thresh)  {
		pu_vec[c] = 0;
		}
	else	{
		pu_acc = pu_acc + pu_vec[c];
		nonzero_cube++;
		}
	}

/* step 4: compaction -- fill each zero slot with the next nonzero entry
   found further right, swapping the u_vec row indices along with it */
zero_index = 0;
nonzero_index = 0;
do	{
	while ((zero_index < ncube) && (pu_vec[zero_index] != 0))  {
		zero_index++;
		}
	if (zero_index < ncube)  {
		if (nonzero_index < zero_index)  nonzero_index = zero_index;
		while ((nonzero_index < ncube) && (pu_vec[nonzero_index] == 0))  {
			nonzero_index++;
			}
		if (nonzero_index < ncube)  {
			pu_vec[zero_index] = pu_vec[nonzero_index];
			pu_vec[nonzero_index] = 0;

			inttool = u_index_vec[zero_index];
			u_index_vec[zero_index] = u_index_vec[nonzero_index];
			u_index_vec[nonzero_index] = inttool;
			}
		else	{
			/* nothing nonzero remains to the right: done */
			zero_index = nonzero_index;
			}
		}
	} while (zero_index < ncube);	

/* step 5: global sum of surviving weights, then renormalise */
if (node == 0)  {
	for (r = 1; r < totnodes; r++)  {
		MPI_Recv(&dtool,1,MPI_DOUBLE,r,r,MPI_COMM_WORLD,&stat);
		pu_acc = pu_acc + dtool;
		}
	for (r = 1; r < totnodes; r++)  {
		MPI_Send(&pu_acc,1,MPI_DOUBLE,r,0,MPI_COMM_WORLD);
		}
	}
else	{
	MPI_Send(&pu_acc,1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);
	MPI_Recv(&pu_acc,1,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&stat);
	}

for (c = 0; c < nonzero_cube; c++)  {
	pu_vec[c] = pu_vec[c] / pu_acc;
	}

return(nonzero_cube);
}


/*
 * Write the first ncube cubature points to two text files.
 *
 *   pu_outputfile  one weight pu_vec[c] per line
 *   u_outputfile   one line per point holding the nobs values
 *                  u_vec[u_index_vec[c]*nobs + 0 .. nobs-1], space separated
 *   write_append   fopen() mode used for both files (callers in this file
 *                  pass "w" or "a")
 *
 * If either file cannot be opened a message is printed to stderr and
 * nothing is written.
 */
void write_pu(int ncube,int nobs,
	      double *pu_vec,double *u_vec,int *u_index_vec,
	      char *u_outputfile,char *pu_outputfile,char *write_append)
{
int	c,r,u_index;
FILE    *fptr_u,*fptr_pu;

fptr_u = fopen(u_outputfile,write_append);
if (fptr_u == NULL)  {
	fprintf(stderr,"write_pu: cannot open %s\n",u_outputfile);
	return;
	}
fptr_pu = fopen(pu_outputfile,write_append);
if (fptr_pu == NULL)  {
	fprintf(stderr,"write_pu: cannot open %s\n",pu_outputfile);
	fclose(fptr_u);
	return;
	}

for (c = 0; c < ncube; c++)  {
	u_index = u_index_vec[c];

	fprintf(fptr_pu,"%le\n",pu_vec[c]);

	for (r = 0; r < nobs; r++)  {
		fprintf(fptr_u,"%le ",u_vec[(u_index*nobs)+r]);
		}
	fprintf(fptr_u,"\n");
	}

fclose(fptr_u);
fclose(fptr_pu);
}



void main(int argc,char *argv[])
{
int			node,totnodes;
int			nobs,ngene,ncube_node_base;
double			*phenotypes,*cov1;
double			*u,pu_base,*pu;
int			*u_index_vec;
double			theta_base,cov1_size_base,s_lo,s_hi,s_step,s_curr;
double			cull_thresh;
double			difference_limit;
int                     *id,*u_id,haltcrit;
int			ncube_node;
double			theta,cov1_size,s;
derivative_return	d_ret;
double			L,DlnL_distance,DlnL[3],D2lnL[9];
double			theta_0,cov1_size_0,s_0;
derivative_return	d_ret_0;
double			L_0,DlnL_distance_0,DlnL_0[3],D2lnL_0[9];
double			difference;
double			theta_store,cov1_size_store,s_store;
double			L_store,DlnL_store[3],D2lnL_store[9];
int			ncube_node_store;
double			*pu_store;
int			*u_index_vec_store;
double			s_eval_code,stop_code,update_pu_code,pu_store_code,pu_write_code;
int			retcode;
double			D2lnLinv[9];
int			i,j,r;
char			c,fname[256],fname2[256];
FILE			*fptr;
double			dtool;
MPI_Status		stat;



MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&totnodes);
MPI_Comm_rank(MPI_COMM_WORLD,&node);

if (argc != 11)  {
        if (node == 0)  {
                printf("Required arguments: <pheno/cov file> <nobs> <ncube_node> <theta effect> <cov1 effect> <s effect lo> <s effect hi> <s effect step> <stopping crit> <point culling threshhold>\n");
                }
        MPI_Finalize();
        exit(0);
        }

sscanf(argv[2],"%i",&nobs);
id = (int *)malloc(nobs*sizeof(int));
phenotypes = (double *)malloc(nobs*sizeof(double));
cov1 = (double *)malloc(nobs*sizeof(double));
fptr = fopen(argv[1],"r");
for (i = 0; i < nobs; i++)  {
        fscanf(fptr,"%i %lf %lf",&(id[i]),&(phenotypes[i]),&(cov1[i]));
        }
fclose(fptr);

sscanf(argv[3],"%i",&ncube_node_base);
u_id = (int *)malloc(nobs*sizeof(int));
u = (double *)malloc(ncube_node_base*nobs*sizeof(double));
sprintf(fname,"u_matrix_%i.dat",node);
fptr = fopen(fname,"r");
for (i = 0; i < nobs; i++)  {
        if (fscanf(fptr,"%i",&(u_id[i])) == EOF)  {
                if (node == 0)  {
                        printf("Too small cubature file. Quitting.\n");
                        }
                MPI_Finalize();
                exit(0);
                }
        }
for (i = 0; i < ncube_node_base*nobs; i++)  {
        if (fscanf(fptr,"%lf",&(u[i])) == EOF)  {
                if (node == 0)  {
                        printf("Too small cubature file. Quitting.\n");
                        }
                MPI_Finalize();
                exit(0);
                }
        }
fclose(fptr);

u_index_vec = (int *)malloc(ncube_node_base*sizeof(int));
for (i = 0; i < ncube_node_base; i++)  {
	u_index_vec[i] = i;
	}
u_index_vec_store = (int *)malloc(ncube_node_base*sizeof(int));
pu = (double *)malloc(ncube_node_base*sizeof(double));
pu_store = (double *)malloc(ncube_node_base*sizeof(double));
pu_base = 1.0 / (double)(ncube_node_base*totnodes);

haltcrit = 0;
for (i = 0; i < nobs; i++)  {
        if (id[i] != u_id[i])  haltcrit = 1;
        }
if (haltcrit == 1)  {
        if (node == 0)  {
                printf("Nonmatching phenotype/covariate and cubature person IDs. Quitting.\n");
                }
        MPI_Finalize();
        exit(0);
        }

sscanf(argv[4],"%le",&theta_base);
sscanf(argv[5],"%le",&cov1_size_base);
	
sscanf(argv[6],"%le",&s_lo);
sscanf(argv[7],"%le",&s_hi);
sscanf(argv[8],"%le",&s_step);

sscanf(argv[9],"%le",&difference_limit);

sscanf(argv[10],"%le",&cull_thresh);


if (node == 0)  {
	s_eval_code = 1;
	s_curr = s_lo;
	while (s_eval_code == 1)  {
		for (r = 1; r < totnodes; r++)  {
			MPI_Send(&s_eval_code,1,MPI_DOUBLE,r,0,MPI_COMM_WORLD);
			}

		printf("s= %le evaluation.\n",s_curr);

		theta = theta_base;
		cov1_size = cov1_size_base;
		s = s_curr;
		for (i = 0; i < ncube_node_base; i++)  {
			pu[i] = pu_base;
			}
		ncube_node = ncube_node_base;

		stop_code = 0;
		do	{
			for (r = 1; r < totnodes; r++)  {
				MPI_Send(&stop_code,1,MPI_DOUBLE,r,0,MPI_COMM_WORLD);
				}

			for (r = 1; r < totnodes; r++)  {
				MPI_Send(&theta,1,MPI_DOUBLE,r,0,MPI_COMM_WORLD);
			        MPI_Send(&cov1_size,1,MPI_DOUBLE,r,0,MPI_COMM_WORLD);
			        MPI_Send(&s,1,MPI_DOUBLE,r,0,MPI_COMM_WORLD);
			        }
			d_ret = derivative_binomial_glmm(theta,cov1_size,s,
							 nobs,phenotypes,cov1,ncube_node,u,pu,u_index_vec);
			d_ret = gather_nodes(d_ret,totnodes);
			calculate_derivatives(d_ret,&L,DlnL,D2lnL);
			DlnL_distance = sqrt(pow(DlnL[0],2) + pow(DlnL[1],2) + pow(DlnL[2],2));
			printf("\t%le %le %le : %le %le\n",theta,cov1_size,s,log(L),DlnL_distance);

			calculate_parameters(theta,cov1_size,s,DlnL,D2lnL,&theta_0,&cov1_size_0,&s_0);
			for (r = 1; r < totnodes; r++)  {
				MPI_Send(&theta_0,1,MPI_DOUBLE,r,0,MPI_COMM_WORLD);
			        MPI_Send(&cov1_size_0,1,MPI_DOUBLE,r,0,MPI_COMM_WORLD);
			        MPI_Send(&s_0,1,MPI_DOUBLE,r,0,MPI_COMM_WORLD);
			        }
			d_ret_0 = derivative_binomial_glmm(theta_0,cov1_size_0,s_0,
							   nobs,phenotypes,cov1,ncube_node,u,pu,u_index_vec);
			d_ret_0 = gather_nodes(d_ret_0,totnodes);
			calculate_derivatives(d_ret_0,&L_0,DlnL_0,D2lnL_0);
			DlnL_distance_0 = sqrt(pow(DlnL_0[0],2) + pow(DlnL_0[1],2) + pow(DlnL_0[2],2));
			difference = DlnL_distance - DlnL_distance_0;
			printf("\t%le %le %le : %le %le %le\n",theta_0,cov1_size_0,s_0,log(L_0),DlnL_distance_0,difference);
	
			if (difference > difference_limit)  {
				theta = theta_0;
				cov1_size = cov1_size_0;
				s = s_0;

				L = L_0;

				DlnL_distance = DlnL_distance_0;

				DlnL[0]=DlnL_0[0]; DlnL[1]=DlnL_0[1]; DlnL[2]=DlnL_0[2];

				D2lnL[0]=D2lnL_0[0]; D2lnL[1]=D2lnL_0[1]; D2lnL[2]=D2lnL_0[2];
				D2lnL[3]=D2lnL_0[3]; D2lnL[4]=D2lnL_0[4]; D2lnL[5]=D2lnL_0[5];
				D2lnL[6]=D2lnL_0[6]; D2lnL[7]=D2lnL_0[7]; D2lnL[8]=D2lnL_0[8];

				update_pu_code = 1;
				for (r = 1; r < totnodes; r++)  {
				        MPI_Send(&update_pu_code,1,MPI_DOUBLE,r,0,MPI_COMM_WORLD);

				        MPI_Send(&theta_0,1,MPI_DOUBLE,r,0,MPI_COMM_WORLD);
				        MPI_Send(&cov1_size_0,1,MPI_DOUBLE,r,0,MPI_COMM_WORLD);
				        MPI_Send(&s_0,1,MPI_DOUBLE,r,0,MPI_COMM_WORLD);
				        MPI_Send(&L_0,1,MPI_DOUBLE,r,0,MPI_COMM_WORLD);
					}
				ncube_node = update_pu(theta_0,cov1_size_0,s_0,
				          	       nobs,phenotypes,cov1,
				          	       ncube_node,u,pu,u_index_vec,L_0,
		 			  	       node,totnodes,cull_thresh);

				stop_code = 0;
				}
			else	{
				update_pu_code = 0;
				for (r = 1; r < totnodes; r++)  {
				        MPI_Send(&update_pu_code,1,MPI_DOUBLE,r,0,MPI_COMM_WORLD);
					}

				stop_code = 1;
				}			
			} while (stop_code == 0);
		for (r = 1; r < totnodes; r++)  {
			MPI_Send(&stop_code,1,MPI_DOUBLE,r,0,MPI_COMM_WORLD);
			}

		if ((s_curr == s_lo) || (L > L_store))  {
			pu_store_code = 1;
			for (r = 1; r < totnodes; r++)  {
				MPI_Send(&pu_store_code,1,MPI_DOUBLE,r,0,MPI_COMM_WORLD);
				}

			theta_store = theta; cov1_size_store = cov1_size; s_store = s;			

			L_store = L;

			DlnL_store[0] = DlnL[0]; DlnL_store[1] = DlnL[1]; DlnL_store[2] = DlnL[2];

			D2lnL_store[0] = D2lnL[0]; D2lnL_store[1] = D2lnL[1]; D2lnL_store[2] = D2lnL[2];
			D2lnL_store[3] = D2lnL[3]; D2lnL_store[4] = D2lnL[4]; D2lnL_store[5] = D2lnL[5];
			D2lnL_store[6] = D2lnL[6]; D2lnL_store[7] = D2lnL[7]; D2lnL_store[8] = D2lnL[8];

			ncube_node_store = ncube_node;
			for (i = 0; i < ncube_node; i++)  {
				pu_store[i] = pu[i];
				u_index_vec_store[i] = u_index_vec[i];
				}
			}
		else	{
			pu_store_code = 0;
			for (r = 1; r < totnodes; r++)  {
				MPI_Send(&pu_store_code,1,MPI_DOUBLE,r,0,MPI_COMM_WORLD);
				}
			}

		printf("Curr Parameters: ");
		printf("theta= %le cov1_size= %le s= %le ",theta,cov1_size,s);
		printf("lnL= %le\n",log(L));
		printf("Best Parameters: ");
		printf("theta= %le cov1_size= %le s= %le ",theta_store,cov1_size_store,s_store);
		printf("lnL= %le\n",log(L_store));

		s_curr = s_curr + s_step;
		if (s_curr > s_hi) s_eval_code = 0;
		}
	for (r = 1; r < totnodes; r++)  {
		MPI_Send(&s_eval_code,1,MPI_DOUBLE,r,0,MPI_COMM_WORLD);
		}

	printf("Final Parameters: ");
	printf("theta= %le cov1_size= %le s= %le\n",theta_store,cov1_size_store,s_store);
	printf("lnL= %le\n",log(L_store));
	printf("DlnL= %le %le %le\n",DlnL_store[0],DlnL_store[1],DlnL_store[2]);
	printf("D2lnL=\n");
	printf("%le %le %le\n",D2lnL_store[0],D2lnL_store[1],D2lnL_store[2]);
	printf("%le %le %le\n",D2lnL_store[3],D2lnL_store[4],D2lnL_store[5]);
	printf("%le %le %le\n",D2lnL_store[6],D2lnL_store[7],D2lnL_store[8]);
        retcode = invert_matrix_3x3(D2lnL_store,D2lnLinv);
	printf("D2lnLinv=\n");
	printf("%le %le %le\n",D2lnLinv[0],D2lnLinv[1],D2lnLinv[2]);
	printf("%le %le %le\n",D2lnLinv[3],D2lnLinv[4],D2lnLinv[5]);
	printf("%le %le %le\n",D2lnLinv[6],D2lnLinv[7],D2lnLinv[8]);

	write_pu(ncube_node_store,nobs,pu_store,u,u_index_vec_store,"u_matrix_star.dat","pu_matrix_star.dat","w");
	for (r = 1; r < totnodes; r++)  {
		MPI_Send(&pu_write_code,1,MPI_DOUBLE,r,0,MPI_COMM_WORLD);
		MPI_Recv(&pu_write_code,1,MPI_DOUBLE,r,r,MPI_COMM_WORLD,&stat);
		}
	}
else	{
	MPI_Recv(&s_eval_code,1,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&stat);

	while (s_eval_code == 1)  {
		for (i = 0; i < ncube_node_base; i++)  {
			pu[i] = pu_base;
			}
		ncube_node = ncube_node_base;

	        MPI_Recv(&stop_code,1,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&stat);
		while (stop_code == 0)  {
		        MPI_Recv(&theta,1,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&stat);
		        MPI_Recv(&cov1_size,1,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&stat);
		        MPI_Recv(&s,1,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&stat);
			d_ret = derivative_binomial_glmm(theta,cov1_size,s,
			 			         nobs,phenotypes,cov1,ncube_node,u,pu,u_index_vec);
			MPI_Send(&(d_ret.L),1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);
		        MPI_Send(&(d_ret.DL_theta),1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);
		        MPI_Send(&(d_ret.DL_cov1),1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);
		        MPI_Send(&(d_ret.DL_s),1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);
		        MPI_Send(&(d_ret.D2L_theta),1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);
		        MPI_Send(&(d_ret.D2L_cov1),1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);
		        MPI_Send(&(d_ret.D2L_s),1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);
		        MPI_Send(&(d_ret.D2L_theta_cov1),1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);
		        MPI_Send(&(d_ret.D2L_theta_s),1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);
		        MPI_Send(&(d_ret.D2L_cov1_s),1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);

		        MPI_Recv(&theta,1,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&stat);
		        MPI_Recv(&cov1_size,1,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&stat);
		        MPI_Recv(&s,1,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&stat);
			d_ret = derivative_binomial_glmm(theta,cov1_size,s,
			 			         nobs,phenotypes,cov1,ncube_node,u,pu,u_index_vec);
			MPI_Send(&(d_ret.L),1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);
		        MPI_Send(&(d_ret.DL_theta),1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);
		        MPI_Send(&(d_ret.DL_cov1),1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);
		        MPI_Send(&(d_ret.DL_s),1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);
		        MPI_Send(&(d_ret.D2L_theta),1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);
		        MPI_Send(&(d_ret.D2L_cov1),1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);
		        MPI_Send(&(d_ret.D2L_s),1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);
		        MPI_Send(&(d_ret.D2L_theta_cov1),1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);
		        MPI_Send(&(d_ret.D2L_theta_s),1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);
		        MPI_Send(&(d_ret.D2L_cov1_s),1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);

		        MPI_Recv(&update_pu_code,1,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&stat);
			if (update_pu_code == 1)  {
			        MPI_Recv(&theta,1,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&stat);
			        MPI_Recv(&cov1_size,1,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&stat);
			        MPI_Recv(&s,1,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&stat);
			        MPI_Recv(&L,1,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&stat);

				ncube_node = update_pu(theta,cov1_size,s,
				          	       nobs,phenotypes,cov1,
				          	       ncube_node,u,pu,u_index_vec,L,
		 			               node,totnodes,cull_thresh);
				}

		        MPI_Recv(&stop_code,1,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&stat);
			}

	        MPI_Recv(&pu_store_code,1,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&stat);
		if (pu_store_code == 1)  {
			ncube_node_store = ncube_node;
			for (i = 0; i < ncube_node; i++)  {
				pu_store[i] = pu[i];
				u_index_vec_store[i] = u_index_vec[i];
				}
			}

		MPI_Recv(&s_eval_code,1,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&stat);
		}

	MPI_Recv(&pu_write_code,1,MPI_DOUBLE,0,0,MPI_COMM_WORLD,&stat);
	write_pu(ncube_node_store,nobs,pu_store,u,u_index_vec_store,"u_matrix_star.dat","pu_matrix_star.dat","a");
	MPI_Send(&pu_write_code,1,MPI_DOUBLE,0,node,MPI_COMM_WORLD);
	}

MPI_Finalize();
}
