/*

Copyright 2010 Brian Caffrey, Tom Williams, Mario Fares.


this file is part of Clusterfunc.

    Clusterfunc is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Clusterfunc is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Clusterfunc.  If not, see <http://www.gnu.org/licenses/>.


*/



#include "fd_tree.h"
#include "stats.h"



/* Adds one to the fdtotal cell for the given (Species, Tag) pair.
   Rows are indexed by species and columns by tag. When the matrix is still
   empty the pair is registered first and becomes cell (0, 0); otherwise the
   indices come from getCol()/getRow(), which append unseen names. */
void TagMatrix::increment(std::string Tag, std::string Species){
	int tagColumn = 0;
	int speciesRow = 0;

	if(fdtotal.empty()){
		// very first entry: create the 1x1 matrix
		AddTagSpec(Tag, Species);
	}else{
		tagColumn = getCol(Tag);
		speciesRow = getRow(Species);
	}

	fdtotal[speciesRow][tagColumn] += 1;
}

void TagMatrix::incrementtotal(std::string Tag, std::string Species){

	int col, row;
	if(fdtotal.size()==0){
		AddTagSpec(Tag, Species);
		row = 0;
		col = 0;
	}else{
		col = getCol(Tag);
		row = getRow(Species);
	}
	++speciestotal[row][col];

}

/* Registers a tag and a species together and appends a one-cell, zero-filled
   row to both count matrices. */
void TagMatrix::AddTagSpec(std::string tag_name, std::string Spec_name){

	// tag bookkeeping: name list plus name -> column index
	TagNames.push_back(tag_name);
	const int tagIndex = TagNames.size()-1;
	tag_map[tag_name] = tagIndex;

	// species bookkeeping: name list plus name -> row index
	SpeciesNames.push_back(Spec_name);
	const int speciesIndex = SpeciesNames.size()-1;
	species_map[Spec_name] = speciesIndex;

	// one new row holding a single zero in each matrix
	fdtotal.push_back(vector<int>(1, 0));
	speciestotal.push_back(vector<int>(1, 0));
}

/* Returns how many tag names are currently registered. */
int TagMatrix::getTagSize(){
	const int tagCount = static_cast<int>(TagNames.size());
	return tagCount;
}

int TagMatrix::getCol(std::string Tag){
	int i=0;
	map<string, int>::iterator it;

	it = tag_map.begin();

	while(it!=tag_map.end()){
		if(it->first == Tag){
			return tag_map[Tag];
		}
		++i;
		++it;
	}
	AddTag(Tag);
	return TagNames.size()-1;
}

int TagMatrix::getRow(std::string Species){
	int i=0;
	map<string, int>::iterator it;

	it = species_map.begin();
	while(it!=species_map.end()){
		if(it->first == Species){
			return species_map[Species];;
		}
		++i;
		++it;
	}
	AddSpecies(Species);
	return SpeciesNames.size()-1;
}

void TagMatrix::AddSpecies(std::string species_name){

	SpeciesNames.push_back(species_name);
	species_map[species_name] = SpeciesNames.size()-1;
	vector<int> temp(fdtotal[0].size(), 0);
	fdtotal.push_back(temp);
	speciestotal.push_back(temp);
}

/* Registers a new tag and widens every row of both count matrices by one
   zero cell. The number of rows walked is taken from speciestotal. */
void TagMatrix::AddTag(std::string tag_name){

	TagNames.push_back(tag_name);
	const int newColumn = TagNames.size()-1;
	tag_map[tag_name] = newColumn;

	unsigned int row = 0;
	while(row < speciestotal.size()){
		fdtotal[row].push_back(0);
		speciestotal[row].push_back(0);
		++row;
	}
}

void TagMatrix::enrichment(){

	ofstream matrix;
	matrix.open("matrix.txt");


	/*speciestotal matrix is a matrix of the total number of tests for a species*/
	/*fdtotal is a matrix of the total number of tests showing functional divergence*/

	int len = speciestotal[0].size();
/*	vector<int> temp(len, 0);
	for(unsigned int i =0;i<speciestotal.size();++i){
		species_tag_solution.push_back(temp);
	}
*/
//	species_enrichment.resize(speciestotal.size());
//	tag_enrichment.resize(speciestotal[0].size());






	/*-------------------Species with Tag Enrichment------------------*/

Species_tag_enrichment(fdtotal, speciestotal, species_tag_solution);
	/*------------------Tag Enrichment-------------------------------*/
//Tag_enrichment(fdtotal, speciestotal, tag_enrichment);
	/*-----------------Species Enrichment--------------------------*/

Species_enrichment(fdtotal, speciestotal, species_enrichment);
	/*-----------------Print Enrichments--------------------------*/


ofstream enrich;
        enrich.open("enrichment.txt");
        enrich << "Species Enrichment" << endl;

        for(int i=0;i<speciestotal.size();++i){
                enrich << SpeciesNames[i] << "\t";
                if(species_enrichment[i] == 0){
                        enrich << "Not enriched" << endl;
                }else if(species_enrichment[i] == 1){
                        enrich << "Over Enriched" << endl;
                }else{
                        enrich << "Under Enriched" << endl;
                }
        }

        enrich << "Tag Enrichment" << endl;

        /*for(unsigned int i=0;i<TagNames.size();++i){
                enrich << TagNames[i] << "\t";
                if(tag_enrichment[i] == 0){
                        enrich << "Not enriched" << endl;
                }else if (tag_enrichment[i]==1){
                        enrich << "Over Enriched" << endl;
                }else{
                        enrich << "Under Enriched" << endl;
                }

        }*/
        enrich.close();

 matrix << "*\t";
        for(unsigned int j=0;j<speciestotal[0].size();++j){
                matrix << TagNames[j] << "\t";
        }
        matrix << endl;
        for(unsigned int i=0;i<speciestotal.size();++i){
                //int l=getCol(SpeciesNames[i]);        
                matrix << SpeciesNames[i] << "\t";
                for(unsigned int j=0;j<speciestotal[i].size();++j){
                        matrix << fdtotal[i][j] << "/" << speciestotal[i][j] << "\t";
                }
                matrix << endl;
        }

        matrix.close();


}

/*=============================================================================*/
/*=============================================================================*/
/*=============================================================================*/

void TagMatrix::MakeTree(){
	DistanceMatrix *Dist;

	Dist = new DistanceMatrix(SpeciesNames);

	for(unsigned int i=0;i<speciestotal.size();++i){
		for(unsigned int j=i;j<speciestotal.size();++j){
			if(i==j){
				(*Dist)(i, j) = 0.0;		
			}else{
				(*Dist)(i, j)=(*Dist)(i,j) = ((double)Correlation(species_tag_solution[i], species_tag_solution[j]));
			}

		}
	}

	TreeTemplate<Node> * tree2 = NULL;
	AgglomerativeDistanceMethod * distMethod = NULL;


	BioNJ * bionj = new BioNJ();
	bionj->outputPositiveLengths(true);
	distMethod = bionj;


	distMethod->setDistanceMatrix(*Dist);
	distMethod->computeTree(true);
	delete Dist;
	tree2 = dynamic_cast<TreeTemplate<Node> *>(distMethod->getTree());

	ofstream tes;
	tes.open("func_div");
	Newick * newickReader = new Newick(false);
	newickReader->write(*tree2, tes);

	delete tree2;
	delete newickReader;
	delete bionj;
}

/*=============================================================================*/
/*=============================================================================*/
/*=============================================================================*/

/*
int Correlation(vector<int>& spec_totala, vector<int>& spec_totalb){

	int diff=0;
	for(unsigned int i=0;i<spec_totala.size();++i){
		if(spec_totala[i]!=spec_totalb[i]){
			++diff;
		}
	}
	return diff;

}
*/
/*=============================================================================*/
/*=============================================================================*/
/*=============================================================================*/

/* Number of sequences held after the most recent Read_fasta() call (set there
   to sequences.size()); create_input_tree() reads it to size its name list. */
int seq_number;

int main(int argc, char *argv[]){ 
	char pwd[1000], temp_folder[1000], mat_file[100], aminos[23], folder[1000], forced[1000], tree_file[1000], file[1000];
	int length, supress=0, opt, force =0, one =0;
	float threshold=0.00001;

	TagMatrix summary_matrix;


cerr << "   *=====================================================================*\n";
cerr << "   *                                                                     *\n";
cerr << "   * ClusterFunC: Clusters of Functional Categories(version 1.0)         *\n";
cerr << "   * Author: Brian E. Caffrey                                            *\n";
cerr << "   * Mathematical Model: Caffrey, Williams, Fares                        *\n";
cerr << "   * Evolutionary Genetics and Bioinformatics Laboratory                 *\n";
cerr << "   * Smurfit Institute of Genetics                                       *\n";
cerr << "   * University of Dublin, Trinity College                               *\n"; 
cerr << "   * Publication:Caffrey, Williams, Jiang, Toft, Hokamp, Fares (2010)    *\n";
cerr << "   *                                                                     *\n";
cerr << "   *=====================================================================*\n\n";




	if(argc<2){
		fprintf(stderr, "usage: -F folder or -f file (options: [-m matrix file][-a amino acid info][-t tree file]\n"); 
		exit(-1);
	}

	strcpy(mat_file, "blosum62.tab");

	while ((opt = getopt(argc, argv, "F:f:m:at:c:")) != -1) {

		switch(opt){

			case 'F':
				strcpy(folder, optarg);
				length = strlen(folder);
				if(folder[length-1]!='/'){
					strcat(folder, "/");
				}
				break;
			case 'f':
				strcpy(file, optarg);
				one = 1;
		//		supress = 1;
				break;
			case 'm':
				strcpy(mat_file, optarg);
				break;
			case 'a':
				supress = 1;
				break;
			case 't':
				force = 1;
				strcpy(forced, optarg);
				break;
			case 'c':
				threshold=atof(optarg);
				break;
			case ':':
				fprintf(stderr, "Error: Unknown option passed in: %c\n", optopt); // optarg defined in getopt.h 
				exit(1);
				break;
			case '?':
				fprintf(stderr, "Error: Unknown option passed in: %c\n", optopt);
				exit(1);
				break;
		}

	}






	Pwd(pwd);
		TreeTemplate<Node> *tree_for = NULL;	
	if(force == 1){
		strcat(tree_file, pwd);
		strcat(tree_file, "/");
		strcat(tree_file, forced);	
		Newick * NewickReader = new Newick(false); //No comment allowed!
		tree_for = NewickReader->read(tree_file);  
		delete NewickReader;
	}

	

	char infile[1000];
	vector<char> blosum_index;
	vector<int> blosum_matrix;

	FILE *matrix_file;
	matrix_file=fopen(mat_file, "r");
	if(matrix_file == NULL){
		fprintf(stderr, "Error: couldn't find file:%s\n", mat_file);
		exit(-1);
	}
	read_matrix(blosum_matrix, blosum_index, matrix_file);

	strcpy(aminos, "-ARNDCEQGHILKMFPSTWYV ");
	fclose(matrix_file);

	int first=0;
	fprintf(stderr, "Files processed:\n0%%\r"); 



		vector<int> tag_fd;
		vector<int> tag_total;
		map<string, int> tag_nums;
map<string, int> tag_map;
		vector< string> treestring;
		vector<string> tag_vector;
if(one == 1){

	char file1[1000];
	
		vector<string> Tag1;
		vector<FDSet> fd_clades;
		strcpy(infile, pwd);
//		strcat(infile, folder);
		strcat(infile, file);
		vector<string> seq_names;
		vector< string > sequences;
			TreeTemplate<Node> *tree = NULL;


		if(force==0){
			tree = create_input_tree(infile, seq_names, sequences, Tag1, first);
		}else{
	Read_fasta(seq_names, sequences, infile, Tag1);	

}
		first=1;
	vector< TreeTemplate<Node> > tree_vector;
		if(tree!=NULL||tree_for!=NULL){
			if(force !=1)
				fd_tree(tree, seq_names, sequences, fd_clades, tree_vector, treestring);
			else
				fd_tree(tree_for, seq_names, sequences, fd_clades, tree_vector, treestring);
		}

		if(tree!=NULL)
			delete tree;
//		if(tree_for!=NULL)
//			delete tree_for;


string mytemp(infile);	
		if(tree_vector.size()>0){
			fd_call(blosum_matrix, blosum_index, seq_names, sequences, fd_clades, tree_vector, summary_matrix, Tag1, file1, supress, mat_file, mytemp.c_str(), treestring, tag_fd, tag_total, tag_map, threshold, tag_vector);
		}
}else{
	vector<string> files=printDirectoryContent(folder, pwd);
	strcpy(temp_folder, pwd);
	int num_tags = 0;
	for(unsigned int i=0;i<files.size();++i){
		vector<string> Tag1;
		vector<FDSet> fd_clades;
		strcpy(infile, pwd);
		strcat(infile, folder);
		strcat(infile, files[i].c_str());

		vector<string> seq_names;
		vector< string > sequences;
			TreeTemplate<Node> *tree = NULL;
		if(force==0){
			tree = create_input_tree(infile, seq_names, sequences, Tag1, first);
		}else{
	Read_fasta(seq_names, sequences, infile, Tag1);	
}
		first=1;

		num_tags=Tag1.size();

		vector< TreeTemplate<Node> > tree_vector;
		if(tree!=NULL||tree_for!=NULL){
			if(force !=1)
				fd_tree(tree, seq_names, sequences, fd_clades, tree_vector, treestring);
			else
				fd_tree(tree_for, seq_names, sequences, fd_clades, tree_vector, treestring);
		}

		if(tree!=NULL)
			delete tree;
		
	//	if(tree_for!=NULL)
	//		delete tree_for;

		if(tree_vector.size()>0){
			fd_call(blosum_matrix, blosum_index, seq_names, sequences, fd_clades, tree_vector, summary_matrix, Tag1, files[i], supress, mat_file, files[i].c_str(), treestring, tag_fd, tag_total, tag_map, threshold, tag_vector);
		}
		float per = (float)(i+1)/(float)files.size();
		fprintf(stderr, "%.0f%%\r", per*100);
	}
	if(files.size()>1 && num_tags>0 && summary_matrix.fdtotal.size()>=1 && summary_matrix.fdtotal[0].size()>1){
		summary_matrix.enrichment();	//do enrichment test
		summary_matrix.MakeTree();	//make a tree of functional divergence
		Tag_Enrichment(tag_fd, tag_total, tag_map, tag_vector);
	}

}
	fprintf(stderr, "100%%\n\n"); 

}

/*=======================================================================================*/
/*=======================================================================================*/
/* visits every node of the input tree and records the qualifying subtrees	 */
/*=======================================================================================*/
/*=======================================================================================*/

/* Visits every node of `tree`, clones the subtree rooted there and, when that
   subtree has at least two sons, offers each of the first two son clades to
   make_setss() with the leaves of the other son as outgroup. A son clade is
   only offered when it has at least 8 leaves. make_setss() appends its results
   to fd_clades, tree_vector and treestring.

   A NULL tree is ignored. A Bio++ Exception prints a message to stdout and
   terminates the program with exit(-1). Always returns 0 otherwise. */
int fd_tree(TreeTemplate<Node> *tree, vector<string>& seq_names, vector< string >& sequences, vector<FDSet>& fd_clades, vector< TreeTemplate<Node> >& tree_vector, vector<string>& treestring){

	if(tree == NULL){
		return 0;
	}

	try {
		vector<Node *> nodes = tree->getNodes();

		for(unsigned int j = 0; j < nodes.size(); ++j)
		{
			Node *subtreeRoot = TreeTemplateTools::cloneSubtree<Node>(*nodes[j]);
			TreeTemplate<Node> subtree(*subtreeRoot); //Create a new (independent) tree from the subtree

			const vector<int> sons = subtree.getSonsId(subtree.getRootId());	//gets the ids of the son clades
			if(sons.size() < 2){
				continue;
			}

			Node *subtreeRoot1 = TreeTemplateTools::cloneSubtree<Node>(subtree, sons[0]);
			TreeTemplate<Node> trees1(*subtreeRoot1);
			Node *subtreeRoot2 = TreeTemplateTools::cloneSubtree<Node>(subtree, sons[1]);
			TreeTemplate<Node> trees2(*subtreeRoot2);

			vector<string> clade1_names = trees1.getLeavesNames();
			vector<string> clade2_names = trees2.getLeavesNames();

			// a son that yields no leaf names is represented by its own root name
			if(clade1_names.size()==0){
				clade1_names.push_back(trees1.getNodeName(trees1.getRootId()));
			}else if(clade2_names.size()==0){
				clade2_names.push_back(trees2.getNodeName(trees2.getRootId()));
			}

			// if the clade isnt big enough to have 2 clades of size 4 then dont do it
			if(clade2_names.size()>=8){
				make_setss(trees2, clade1_names, tree_vector, seq_names, sequences, fd_clades, subtree, treestring);
			}
			if(clade1_names.size()>=8){
				make_setss(trees1, clade2_names, tree_vector, seq_names, sequences, fd_clades, subtree, treestring);
			}
		}

	} catch (Exception&) {	// by reference: catching by value slices derived exceptions
		cout << "Error when reading tree." << endl;
		exit(-1);
	}

	return 0;
}

/*======================================================================*/
/*======================================================================*/
/*======================================================================*/

/* Splits `tree` at its root into its first two son clades and, when both have
   at least 4 leaves and `outgroup` is non-empty, records one FDSet:
     - the leaf names of clade 1, clade 2 and the outgroup
     - for every name found by compare_to_alignment(), the matching sequence
   On success it also appends `original` to tree_vector and its parenthesis
   string to treestring. Nothing is appended otherwise.

   NOTE(review): a name with no match in seq_names is still stored in the
   *_names list but contributes no sequence, so the two lists can differ in
   length - confirm that consumers of FDSet cope with this.

   Always returns 0. */
int make_setss(TreeTemplate<Node>& tree, vector<string>& outgroup, vector< TreeTemplate<Node> >& tree_vector, vector<string>& seq_names, vector< string >& sequences, vector<FDSet>& fd_clades, TreeTemplate<Node>& original, vector<string>& treestring){

	const vector<int> sons = tree.getSonsId(tree.getRootId());
	if(sons.size() < 2){
		return 0;	// cannot form two clades; sons[1] would be out of range
	}

	Node *subtreeRoot1 = TreeTemplateTools::cloneSubtree<Node>(tree, sons[0]);
	TreeTemplate<Node> trees1(*subtreeRoot1);
	Node *subtreeRoot2 = TreeTemplateTools::cloneSubtree<Node>(tree, sons[1]);
	TreeTemplate<Node> trees2(*subtreeRoot2);

	const vector<string> clade1_names = trees1.getLeavesNames();
	const vector<string> clade2_names = trees2.getLeavesNames();

	if(clade1_names.size()<4 || clade2_names.size()<4 || outgroup.size()<1){
		return 0;
	}

	FDSet temp_set;

	for(unsigned int i=0;i<clade1_names.size();++i){
		temp_set.clade1_names.push_back(clade1_names[i]);

		// compare_to_alignment() answers with a 1-based position, or -1
		const int t = compare_to_alignment(seq_names, clade1_names[i]);
		if(t>0 && static_cast<size_t>(t)<=sequences.size()){
			temp_set.clade1.push_back(sequences[t-1]);
		}
	}

	for(unsigned int i=0;i<clade2_names.size();++i){
		temp_set.clade2_names.push_back(clade2_names[i]);

		const int t = compare_to_alignment(seq_names, clade2_names[i]);
		if(t>0 && static_cast<size_t>(t)<=sequences.size()){
			temp_set.clade2.push_back(sequences[t-1]);
		}
	}

	for(unsigned int i=0;i<outgroup.size();++i){
		temp_set.outgroup_names.push_back(outgroup[i]);

		const int t = compare_to_alignment(seq_names, outgroup[i]);
		if(t>0 && static_cast<size_t>(t)<=sequences.size()){
			temp_set.outgroup.push_back(sequences[t-1]);
		}
	}

	fd_clades.push_back(temp_set);
	treestring.push_back(TreeTemplateTools::treeToParenthesis(original, false));
	tree_vector.push_back(original);

	return 0;
}


/*=======================================================================================*/
/*=======================================================================================*/
/*=======================================================================================*/

/* Reads the FASTA file `input_file` (filling seq_names, sequences and Tag1)
   and builds a BioNJ tree from the ScoreDist() distance matrix.

   When `first` is 0, stdout is redirected to /dev/null before reading.

   Returns a heap-allocated tree that the caller must delete, or NULL when
   fewer than 9 sequence names were read, when there are fewer names than
   sequences, or when ScoreDist() yields no matrix. */
TreeTemplate<Node> *create_input_tree(char *input_file, vector<string>& seq_names, vector< string >& sequences, vector<string>& Tag1, int first){
	if(first==0 && freopen("/dev/null", "w", stdout)==NULL){
		fprintf(stderr, "Warning: couldn't redirect stdout to /dev/null\n");
	}

	Read_fasta(seq_names, sequences, input_file, Tag1);

	if(seq_names.size()<9){
		return NULL;
	}

	// Read_fasta() leaves the sequence count in the global seq_number; never
	// copy more names than were actually read.
	if(seq_number<0 || static_cast<size_t>(seq_number)>seq_names.size()){
		return NULL;
	}
	vector<string> names(seq_names.begin(), seq_names.begin()+seq_number);

	DistanceMatrix *DS = ScoreDist(names, sequences);
	if(DS==NULL){
		return NULL;
	}

	BioNJ bionj;
	bionj.outputPositiveLengths(true);
	bionj.setDistanceMatrix(*DS);
	bionj.computeTree(true);
	delete DS;

	// the returned tree outlives bionj; ownership passes to the caller
	return dynamic_cast<TreeTemplate<Node> *>(bionj.getTree());
}

/*=======================================================================================*/
/*=======================================================================================*/
/*=======================================================================================*/

/* Finds the alignment entry that corresponds to a tree leaf name.

   Returns the 1-based position in `names` of the entry equal to `tree_name`.
   When no entry is equal, returns the position of the first entry that merely
   contains `tree_name`. Returns -1 when nothing matches.

   Exact matches take priority so that a name which is a prefix of another
   (e.g. "seq1" and "seq10") cannot be resolved to the wrong sequence. */
int compare_to_alignment(std::vector<std::string>& names, std::string tree_name){
	int first_partial = -1;

	for(unsigned int i=0;i<names.size();++i){
		if(names[i] == tree_name){
			return i+1;
		}
		if(first_partial == -1 && names[i].find(tree_name) != std::string::npos){
			first_partial = i+1;	// remember it, but keep looking for an exact match
		}
	}
	return first_partial;
}


/*======================================================================*/
/*======================================================================*/
/* Reads a FASTA file: lines before the first '>' header go into Tag1,
   header names into `names` and sequence data into `sequences`	*/
/*======================================================================*/
/*======================================================================*/
int Read_fasta(vector<string>& names, vector< string >& sequences, char *inname, vector<string>& Tag1){
	int first=0, i=0;
	FILE *input;
	char temp[1000];


	input=fopen(inname, "r");

	if(input == NULL){
		fprintf(stderr, "Error: couldn't find file named %s\n", inname);
		exit(-1);
	}

	while(1){
		fgets(temp, 1000, input);

		if(temp[strlen(temp)-1]=='\n'){
			temp[strlen(temp)-1]='\0';		//remove carriage returns
		}
		if(temp[0]=='>'){
			break;
		}
		Tag1.push_back(temp);
	}

	string mytemp(temp);
	mytemp.erase(0,1);
	names.push_back(mytemp);
	first=0;++i;
	while(fgets(temp, 1000, input)!=NULL){

		if(temp[strlen(temp)-1]=='\n'){
			temp[strlen(temp)-1]='\0';		//remove carriage returns
		}


		if(temp[0]=='>'){			//check if it is a sequence name
			string tempy(temp);
			tempy.erase(0, 1); // removes the '>' from the start of the name
			names.push_back(tempy);		// put the name into the vector 'names'


			first=0;			//specify that we have a new sequence
			++i;				//count the sequences
			if(first == 1){
				sequences[i-1].push_back('\0');
			}

		}else if(first == 0){			//need to start new sequence and copy string
			string tempor(temp);

			sequences.push_back(tempor);
			tempor.clear();
			first=1;
		}else{					//need to concatonate existing string with the next part of sequence
			sequences[i-1] += temp;
			temp[0]='\0';
		}	
	}
	seq_number=sequences.size();			//set the global variable seq_number
	fclose(input);
	return 0;
}


/*=============================================================================*/
/*=============================================================================*/
/*=============================================================================*/
/* Lists the regular files of directory `dir` whose names do not start with
   '.', in the order readdir() reports them. Subdirectories and other entry
   types are skipped.

   The process changes into `dir` while scanning and into `pwd` afterwards.
   Exits with -1 when `dir` cannot be opened or entered; failing to change
   into `pwd` only prints a warning. */
std::vector<std::string> printDirectoryContent(char *dir, char *pwd){
	DIR *dp = opendir(dir);
	if (dp == NULL) {
		fprintf(stderr, "Cannot open directory: %s\n", dir);
		exit(-1);
	}
	// entry names are relative to dir, so lstat() must run from inside it
	if (chdir(dir) != 0) {
		fprintf(stderr, "Cannot open directory: %s\n", dir);
		closedir(dp);
		exit(-1);
	}

	std::vector<std::string> files;
	struct dirent *entry;
	while ((entry = readdir(dp)) != NULL) {
		struct stat statbuf;
		if (lstat(entry->d_name, &statbuf) != 0) {
			continue;	// statbuf holds nothing valid for this entry
		}
		if (S_ISREG(statbuf.st_mode) && entry->d_name[0] != '.') {
			files.push_back(entry->d_name);
		}
	}
	closedir(dp);

	if (chdir(pwd) != 0) {
		fprintf(stderr, "Warning: cannot return to directory: %s\n", pwd);
	}
	return files;
}
