#include "BEDParser.h"

//Constructs the parser: forwards fileName to the Parser base constructor,
//then inspects the first line of m_File to decide whether it is a header.
//
//The first line is split on whitespace. If it has fewer than 3 tokens a
//string("Invalid BED file format") is thrown. If tokens 1 and 2 both convert
//with AtoI, the line is treated as data and the stream is rewound so that the
//first ParseLine() call returns it. Otherwise the line is treated as a header
//and stays consumed.
BEDParser::BEDParser(string fileName):Parser(fileName)
{
	//Read first header line
	string line;
	vector<string> vec;	
	getline(m_File, line,m_LineDelimiter);
	Tokenize(line,vec,"\r\t\n ");
	if(vec.size() < 3)
	{
		string err = "Invalid BED file format";
		throw err;
	}

	try
	{
		//Only the success/failure of the conversion matters here, the
		//values themselves are not needed.
		//NOTE(review): AtoI presumably throws a string on non-numeric input - confirm.
		AtoI(vec[1]);
		AtoI(vec[2]);

		//The first line is a data line: rewind so ParseLine() sees it.
		//If the file consists of that single line without a trailing
		//delimiter, getline has set eofbit; clear it so the stream is
		//usable again after the seek regardless of the library version.
		m_File.clear();
		m_File.seekg(0,ios::beg);
	}
	catch(string&)
	{
		//this is probably a header line - leave it consumed
	}
}

//Destructor: BEDParser performs no cleanup of its own here.
BEDParser::~BEDParser()
{
}

//Reads the next line from m_File and converts it into a Position.
//
//The line is split on whitespace and the columns are used as follows:
//  0 chromosome (upper-cased), 1 start, 2 end, 3 name (optional),
//  5 strand (optional, "+" or "-", anything else keeps the positive default),
//  6 coding start, 7 coding end, 9 number of exons,
//  10 comma separated exon lengths, 11 comma separated exon offsets.
//
//Lines with at most 6 columns produce a simple Position (no gene elements).
//Lines with 12 or more columns produce a Position carrying exons, the introns
//between them and the optional UTRs at both ends.
//
//Returns a Position allocated with new (nothing in this function deletes it,
//so the caller is presumably responsible for it), or NULL when the line read
//is empty or contains no tokens.
//
//Throws a string when the line has fewer than 3 columns, has between 7 and 11
//columns, or lists fewer exon lengths/offsets than the declared exon count.
Position* BEDParser::ParseLine()
{
	string line;
	vector<string> vec;
	vector<GeneElement*> geneElementsVec;

	getline(m_File, line,m_LineDelimiter);
	if(line.length() == 0)
		return NULL;
	Tokenize(line,vec,"\r\t\n ");

	//A line that holds only delimiter characters (for example a lone "\r")
	//is handled like an empty line instead of indexing into an empty vector.
	if(vec.empty())
		return NULL;

	int vectorSize = static_cast<int>(vec.size());
	if(vectorSize < 3)
	{
		//chrom, start and end are mandatory
		string err = "Invalid BED file format";
		throw err;
	}

	string chrom = ToUpper(vec[0]);
	int start = atoi(vec[1].c_str());
	int end = atoi(vec[2].c_str());
	string name = "";
	if(vectorSize > 3)
	{
		name = vec[3];
	}
	Strand strand = Strand_Positive;
	if(vectorSize > 5)
	{
		//we expect the strand to be + or - any other symbol is ignored
		if(vec[5]=="+")
		{
			strand = Strand_Positive;
		}
		else if(vec[5] == "-")
		{
			strand = Strand_Negative;
		}
	}
	//if we don't have the information about the gene's elements
	//we stop here by building simple Position object.
	//NOTE(review): start is passed unshifted here, while the 12-column path
	//below adds 1 to it - confirm this difference is intended.
	if(vectorSize <= 6)
	{
		Position* g = new Position(start,end,name,"",chrom,strand,line);
		return g;
	}

	if(vectorSize < 12)
	{
		string err = "Invalid BED file format - missing columns: 12 columns expected";
		throw err;
	}

	int numOfExons = atoi(vec[9].c_str());

	//split the exons length (vec[10])
	vector<string> vecExonsLength;
	Tokenize(vec[10],vecExonsLength,",");

	//split the exons offset (vec[11])
	vector<string> vecExonsOffset;
	Tokenize(vec[11],vecExonsOffset,",");

	//Validate the exon lists before anything is allocated, so a malformed
	//record neither reads past the end of the vectors nor leaks a UTR.
	if(static_cast<int>(vecExonsLength.size()) < numOfExons ||
	   static_cast<int>(vecExonsOffset.size()) < numOfExons)
	{
		string err = "Invalid BED file format - exon lengths/offsets do not match the exon count";
		throw err;
	}

	const bool isPositive = (strand == Strand_Positive);

	//NOTE(review): presumably converts the 0-based BED start to a 1-based coordinate - confirm.
	start = start + 1;
	int cdsStart = atoi(vec[6].c_str()) + 1;
	UTR* beginUTR = NULL;
	UTR* endUTR = NULL;

	if(cdsStart != start && cdsStart > 1)
	{
		//The UTR ends one position before the coding start.
		cdsStart--;
		//Leading region is UTR5 on the positive strand, UTR3 on the negative one.
		beginUTR = new UTR(start,cdsStart,isPositive ? 5 : 3);
	}

	int cdsEnd = atoi(vec[7].c_str());
	if(cdsEnd != end && cdsEnd > 0)
	{
		//The UTR starts one position after the coding end.
		cdsEnd++;
		//Trailing region is UTR3 on the positive strand, UTR5 on the negative one.
		endUTR = new UTR(cdsEnd,end,isPositive ? 3 : 5);
	}

	//ignore rest columns
	int lastElementEnd = start;
	for(int i=0;i<numOfExons;i++)
	{
		int exonStart = atoi(vecExonsOffset[i].c_str()) + start;
		int exonEnd = exonStart + atoi(vecExonsLength[i].c_str()) -1;
		//Elements are numbered in strand direction: ascending for the
		//positive strand, descending for the negative strand.
		int index = isPositive ? i+1 : numOfExons-i;
		int intronIndex = isPositive ? i : numOfExons-i;

		if(index == numOfExons)
		{
			//Last exon is marked with -1
			index = -1;
		}
		if(intronIndex == (numOfExons - 1))
		{
			//Last intron is marked with -1
			intronIndex = -1;
		}
		//NOTE(review): when exonStart == lastElementEnd + 1 this creates an
		//Intron whose end is one less than its start - confirm this is intended.
		if((exonStart - lastElementEnd) > 0)
		{
			//add intron between exons
			GeneElement* elem = new Intron(lastElementEnd + 1, exonStart - 1,intronIndex);
			geneElementsVec.push_back(elem);
		}
		lastElementEnd = exonEnd;

		GeneElement* elem = new Exon(exonStart ,exonEnd, index);
		geneElementsVec.push_back(elem);
	}
	Position* g = new Position(start,end,name,"",geneElementsVec,chrom,strand,beginUTR,endUTR);
	return g;
}
