
/*

Copyright 2010 Brian Caffrey, Tom Williams, Mario Fares.


This file is part of Clusterfunc.

    Clusterfunc is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Clusterfunc is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Clusterfunc.  If not, see <http://www.gnu.org/licenses/>.


*/


#include"stats.h"
#include"file_manip.h"
#include<math.h>
#include<gsl/gsl_randist.h>
#include<gsl/gsl_rng.h>
#include<gsl/gsl_cdf.h>
#include<fstream>

// Flags every cell of the fd/total matrices as over-represented (1),
// under-represented (-1) or not significant (0).
//
// The expected proportion is the mean of fd[i][j]/total[i][j] over all
// cells; cells whose total is zero add nothing to the sum but are still
// counted in the denominator.  Each cell is then scored with a two-term
// chi-square statistic (observed hits vs. expected hits, observed misses
// vs. expected misses) and compared with the 0.95 quantile of the
// chi-square distribution with one degree of freedom.
//
// fd     observed counts; the width of fd[0] is used for every row.
// total  per-cell totals, indexed with the same i/j as fd.
// species_tag_enrichment
//        output; one vector of flags is appended per row of fd, existing
//        content is left in place.
//
// An empty fd leaves the output untouched.
void Species_tag_enrichment(vector< vector<int> >& fd, vector< vector<int> >& total, vector< vector<int> >& species_tag_enrichment){

	// Nothing to classify, and fd[0] must not be read on an empty matrix.
	if(fd.empty()){
		return;
	}

	const std::size_t rows = fd.size();
	const std::size_t cols = fd[0].size();

	// Mean observed proportion across all cells.
	float species_tag_expected = 0.0;
	for(std::size_t i=0;i<rows;++i){
		for(std::size_t j=0;j<cols;++j){
			if(total[i][j]!=0)
				species_tag_expected+=(float)fd[i][j]/(float)total[i][j];
		}
	}
	species_tag_expected/=(float)(rows*cols);
	const float species_tag_not_expected=1-species_tag_expected;

	// The critical value does not depend on the cell, so look it up once.
	const double critical_value = gsl_cdf_chisq_Pinv(0.95, 1);

	for(std::size_t i=0;i<rows;++i){
		vector<int> row_flags;
		row_flags.reserve(cols);

		for(std::size_t j=0;j<cols;++j){
			const float expected_hits = species_tag_expected*total[i][j];

			// Cells without any observations keep a statistic of zero.
			float chi_sq=0.0;
			if(total[i][j]!=0){
				const float expected_misses = species_tag_not_expected*total[i][j];
				// Deviations are formed in float and squared in double, which is
				// what pow(float, 2) did in the previous formulation.
				const double hit_deviation  = fd[i][j]-expected_hits;
				const double miss_deviation = (total[i][j]-fd[i][j])-expected_misses;
				chi_sq+= hit_deviation*hit_deviation/expected_hits;
				chi_sq+= miss_deviation*miss_deviation/expected_misses;
			}

			// Significant cells are signed by the direction of the deviation;
			// an exact match with the expectation stays at 0.
			int flag=0;
			if(chi_sq >= critical_value){
				if((float)fd[i][j] > expected_hits){
					flag=1;
				}else if((float)fd[i][j] < expected_hits){
					flag=-1;
				}
			}
			row_flags.push_back(flag);
		}
		species_tag_enrichment.push_back(row_flags);
	}
}


// Counts the positions at which the two vectors hold different values.
// (Despite the name this is a mismatch count, not a correlation
// coefficient: 0 means the compared positions are identical.)
//
// Only the positions present in both vectors are compared, so a shorter
// spec_totalb is never read past its end; surplus elements of the longer
// vector are ignored.
//
// Returns the number of differing positions.
int Correlation(vector<int>& spec_totala, vector<int>& spec_totalb){

	const std::size_t len_a = spec_totala.size();
	const std::size_t len_b = spec_totalb.size();
	const std::size_t shared_len = (len_a < len_b) ? len_a : len_b;

	int diff=0;
	for(std::size_t i=0;i<shared_len;++i){
		if(spec_totala[i]!=spec_totalb[i]){
			++diff;
		}
	}
	return diff;
}



// Flags every column of the fd/total matrices as over-represented (1),
// under-represented (-1) or not significant (0) and appends one flag per
// column to tag_enrichment.
//
// For each column the fd and total entries are summed over all rows.  The
// expected proportion is the mean of fd_sum/total_sum over the columns
// (columns whose total is zero add nothing to the sum but still count in
// the denominator).  Each column is scored with a two-term chi-square
// statistic and compared with the 0.95 chi-square quantile.
//
// fd     observed counts; the width of fd[0] is used for every row.
// total  totals, indexed like fd.
// tag_enrichment
//        output; flags are appended, existing content is left in place.
//
// An empty matrix (no rows, or no columns) leaves the output untouched.
void Tag_enrichment(vector< vector<int> >& fd, vector< vector<int> >& total, vector<int>& tag_enrichment){

	// No columns means no flags; also keeps fd[0] from being read when
	// there are no rows at all.
	if(fd.empty() || fd[0].empty()){
		return;
	}

	const std::size_t rows = fd.size();
	const std::size_t cols = fd[0].size();

	// Column sums feed both the expectation and the per-column test, so
	// they are accumulated once.
	vector<int> fd_sums(cols, 0);
	vector<int> total_sums(cols, 0);
	for(std::size_t j=0;j<rows;++j){
		for(std::size_t i=0;i<cols;++i){
			fd_sums[i]+=fd[j][i];
			total_sums[i]+=total[j][i];
		}
	}

	float tag_expected=0.0;
	for(std::size_t i=0;i<cols;++i){
		if(total_sums[i]!=0){
			tag_expected+=(float)fd_sums[i]/(float)total_sums[i];
		}
	}
	tag_expected/=cols;
	const float tag_not_expected=1-tag_expected;

	// The critical value is the same for every column, so look it up once.
	// NOTE(review): the degrees of freedom are taken as (columns - 1) although
	// each statistic below sums two terms - confirm this is intended.
	const double critical_value = gsl_cdf_chisq_Pinv(0.95, (cols-1));

	tag_enrichment.reserve(tag_enrichment.size()+cols);
	for(std::size_t i=0;i<cols;++i){
		const int fd_tot = fd_sums[i];
		const int totals = total_sums[i];
		const float expected_hits = tag_expected*totals;

		// Columns without any observations keep a statistic of zero.
		float chi_sq_tag=0.0;
		if(totals!=0){
			const float expected_misses = tag_not_expected*totals;
			// Deviations are formed in float and squared in double, which is
			// what pow(float, 2) did in the previous formulation.
			const double hit_deviation  = fd_tot-expected_hits;
			const double miss_deviation = (totals-fd_tot)-expected_misses;
			chi_sq_tag+= hit_deviation*hit_deviation/expected_hits;
			chi_sq_tag+= miss_deviation*miss_deviation/expected_misses;
		}

		int flag=0;
		if(chi_sq_tag >= critical_value){
			if((float)fd_tot > expected_hits){
				flag=1;
			}else if((float)fd_tot <= expected_hits){
				flag=-1;
			}
		}
		tag_enrichment.push_back(flag);
	}

}




// Runs a per-tag chi-square test and appends one line per tag to
// "enrichment.txt" in the form "<tag name>\t<verdict>", where the verdict
// is "Over Enriched", "Under Enriched" or "Not Enriched".
//
// The expected hit proportion is the mean of tag_fd[i]/tag_total[i] over
// all tags (the expected miss proportion is averaged the same way).  Tags
// whose total is zero add nothing to either sum but still count in the
// denominator, and are always reported as "Not Enriched"; without that
// guard a single zero total would turn the averages into NaN/inf and every
// tag would be reported as not enriched.
//
// tag_fd      observed count per tag.
// tag_total   total per tag; needs at least tag_fd.size() elements.
// tag_map     not consulted; kept so existing callers keep compiling.
// tag_vector  tag names by index; needs at least tag_fd.size() elements.
void Tag_Enrichment(vector<int>& tag_fd, vector<int>& tag_total, map<string, int> tag_map, vector<string> tag_vector){
	(void)tag_map;

	const std::size_t tag_count = tag_fd.size();

	float average_fd=0.0, average_not=0.0;
	for(std::size_t i=0;i<tag_count;++i){
		if(tag_total[i]==0){
			continue;	// no observations: a proportion is undefined here
		}
		average_fd+=(float)tag_fd[i]/(float)tag_total[i];
		average_not+=(float)(tag_total[i] - tag_fd[i])/(float)tag_total[i];
	}
	average_fd/=tag_count;
	average_not/=tag_count;

	// The critical value is the same for every tag, so look it up once.
	const double critical_value = gsl_cdf_chisq_Pinv(0.95, 1);

	// The file is only ever appended to, so it is opened without the read
	// flag (out|app is a valid mode in every C++ standard).
	fstream file;
	file.open("enrichment.txt", fstream::out | fstream::app);

	for(std::size_t i=0;i<tag_count;++i){
		const float expected_hits = average_fd*tag_total[i];

		float chi_test_sq=0.0;
		if(tag_total[i]!=0){
			const float expected_misses = average_not*tag_total[i];
			// Deviations are formed in float and squared in double, which is
			// what pow(float, 2) did in the previous formulation.
			const double hit_deviation  = (float)tag_fd[i]-expected_hits;
			const double miss_deviation = (float)(tag_total[i] - tag_fd[i])-expected_misses;
			chi_test_sq += hit_deviation*hit_deviation/expected_hits;
			chi_test_sq += miss_deviation*miss_deviation/expected_misses;
		}

		// Significant tags are labelled by the direction of the deviation.
		const char* verdict = "\tNot Enriched";
		if(chi_test_sq >= critical_value){
			if((float)tag_fd[i] > expected_hits){
				verdict = "\tOver Enriched";
			}else if((float)tag_fd[i] < expected_hits){
				verdict = "\tUnder Enriched";
			}
		}

		// '\n' rather than endl: close() below flushes once for the whole run.
		file << tag_vector[i] << verdict << '\n';
	}
	file.close();
}



// Flags every row of the fd/total matrices as over-represented (1),
// under-represented (-1) or not significant (0) and appends one flag per
// row to species_enrichment.
//
// For each row the fd and total entries are summed over all columns.  The
// expected proportion is the mean of fd_sum/total_sum over the rows (rows
// whose total is zero add nothing to the sum but still count in the
// denominator).  Each row is scored with a two-term chi-square statistic
// and compared with the 0.95 chi-square quantile.
//
// fd     observed counts; the width of fd[0] is used for every row.
// total  totals, indexed like fd.
// species_enrichment
//        output; flags are appended, existing content is left in place.
//
// An empty fd leaves the output untouched.  Rows of a matrix without
// columns have no observations and are all flagged 0.
void Species_enrichment(vector< vector<int> >& fd, vector< vector<int> >& total, vector<int>& species_enrichment){

	// fd[0] must not be read when there are no rows.
	if(fd.empty()){
		return;
	}

	const std::size_t rows = fd.size();
	const std::size_t cols = fd[0].size();

	// With no columns every row total is zero, so every statistic is zero.
	// Returning here also avoids asking GSL for a quantile with a
	// wrapped-around (cols - 1) as degrees of freedom.
	if(cols==0){
		species_enrichment.insert(species_enrichment.end(), rows, 0);
		return;
	}

	// Row sums feed both the expectation and the per-row test, so they are
	// accumulated once.
	vector<int> fd_sums(rows, 0);
	vector<int> total_sums(rows, 0);
	for(std::size_t i=0;i<rows;++i){
		for(std::size_t j=0;j<cols;++j){
			fd_sums[i]+=fd[i][j];
			total_sums[i]+=total[i][j];
		}
	}

	float species_expected=0.0;
	for(std::size_t i=0;i<rows;++i){
		if(total_sums[i]!=0){
			species_expected+=(float)fd_sums[i]/(float)total_sums[i];
		}
	}
	species_expected/=rows;
	const float species_not_expected=1-species_expected;

	// The critical value is the same for every row, so look it up once.
	// NOTE(review): the degrees of freedom are taken as (columns - 1) although
	// the test runs per row and sums two terms - confirm this is intended.
	const double critical_value = gsl_cdf_chisq_Pinv(0.95, (cols-1));

	species_enrichment.reserve(species_enrichment.size()+rows);
	for(std::size_t i=0;i<rows;++i){
		const int fd_tot = fd_sums[i];
		const int totals = total_sums[i];
		const float expected_hits = species_expected*totals;

		// Rows without any observations keep a statistic of zero.
		float chi_sq_species=0.0;
		if(totals!=0){
			const float expected_misses = species_not_expected*totals;
			// Deviations are formed in float and squared in double, which is
			// what pow(float, 2) did in the previous formulation.
			const double hit_deviation  = fd_tot-expected_hits;
			const double miss_deviation = (totals-fd_tot)-expected_misses;
			chi_sq_species+= hit_deviation*hit_deviation/expected_hits;
			chi_sq_species+= miss_deviation*miss_deviation/expected_misses;
		}

		int flag=0;
		if(chi_sq_species >= critical_value){
			if((float)fd_tot > expected_hits){
				flag=1;
			}else if((float)fd_tot <= expected_hits){
				flag=-1;
			}
		}
		species_enrichment.push_back(flag);
	}
}




