/* ----------------------------------------------------------------------
 * Some common functions for the Genpak package
 * Copyright (C) 2000 January Weiner III
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
 * USA.
 ---------------------------------------------------------------------- */

#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <string.h>
#include <ctype.h>

#define MAXSTR 500
#define MAXNAZWA 100
#define MAXLEN 9 
#define QUIET 0
#define DEBUG 0
#define FALSE 0
#define TRUE 1

/* char to separate directories */
#ifndef DSEP
	#define DSEP ':'
#endif

/* default path variable name, can be defined at compile time */
#ifndef DVAR
	#define DVAR "GPPATH"
#endif

/* default search directory, can be defined at compile time */
#ifndef DDIR
	#define DDIR "/usr/lib/genpak"
#endif

/*
 * Node of the singly linked list in which warnings are collected.
 * Warn() appends to this list when html==TRUE instead of printing, and
 * PrintAllWarnings() later prints every collected message in one go.
 */

struct warnings_s
{
	char warn[200] ;            /* text of a single warning */
	struct warnings_s *next ;   /* following node; NULL at the tail */
};

/* getopt code: the argument ordering modes, numbered 0, 1 and 2 */
typedef enum GETOPT_ORDERING_T
{
    PERMUTE,
    RETURN_IN_ORDER,
    REQUIRE_ORDER
}
GETOPT_ORDERING_T;


/*
 * Warning list used by Warn() and PrintAllWarnings(): allwarnings is the
 * head of the list, currentwarning the node the next warning is written to.
 */
extern struct warnings_s *allwarnings ;
extern struct warnings_s *currentwarning ;

/*
 * three global variables needed by all programs:
 *   debug - when non-zero, extra diagnostic messages are issued via Warn()
 *   quiet - when non-zero (and debug is zero), Warn() and Error() print
 *           no message
 *   html  - when non-zero, Warn() collects warnings in the list above
 *           instead of printing them, and Error() writes to stdout
 */
extern int debug ;
extern int quiet ;
extern int html ;


/*
 * These are used for designating sequence types. stypes[] holds the
 * printable name of every type and is indexed by sequence_type, so its
 * entries must stay in the same order as the enumerators.
 */
typedef enum { UNKNOWN=0, DNA, RNA, PROTEIN, WRONG } sequence_type;
char stypes[5][10]={"unknown","dna","rna","protein","wrong"} ;

/*
 * conversion from one letter aminoacid code to three letter code.
 * The table is indexed by the character code of the one letter symbol and
 * each entry holds three letters plus the terminating '\0'. It is filled
 * by InitializeOne2Three().
 */
char one2three[128][4] ;

/*
 * Genetic code in one letter notation. tbl[x][y][z] is the residue coded
 * by the codon whose three bases have the indices x, y and z
 * (A=0, C=1, G=2, T/U=3); the character '0' marks a stop codon.
 */
typedef struct
{
	char tbl[4][4][4] ;
}
codont ;

/* structure type for sequences; also usable as a node of a linked list */
struct sekw_ss
{
	char name[200] ;          /* sequence name */
	char *sequ ;              /* the sequence itself, '\0' terminated */
	long leng ;               /* number of letters in sequ */
	struct sekw_ss *next ;    /* next sequence in a list */
	sequence_type type ;      /* DNA, RNA, PROTEIN, ... */
} ;

typedef struct sekw_ss sekw ;

/* structure for storing positions (start/end pairs) as linked lists */
struct position_ss
{
	long start ;
	long end ;
	struct position_ss *next ;   /* next position in the list */
} ;

typedef struct position_ss position_s ;

/*
 * Function prototypes for everything defined in this file.
 * Functions without parameters are declared with (void) so that the
 * compiler can check their calls.
 */

char Complement(char c) ;
int Compare(char c1, char c2) ;
sekw* CopySeqFrag(sekw * s, long start, long end) ;
sekw* Dna2Protein(sekw *inseq, codont* intable, long start, int pedantic) ;
void Error(char *message, ...) ;
position_s* FindSeq(sekw *query, sekw *subject) ;
int FormatSeq(FILE* out, sekw* s, int width) ;
sequence_type GetSeqType(sekw *s) ;
void InitializeOne2Three(void) ;
int IsStartCodon(sekw *s, long p) ;
codont* LoadStandardTable(void) ;
void PrintAllWarnings(FILE *out) ;
void Programming_Error(void) ;
sekw* ReadSeqFrag(FILE *in, long start, long end, int numseq) ;
sekw* ReadWholeSequence(FILE *in) ;
sekw* ReverseSequence(sekw* s) ;
FILE* TryOpen(const char *filename, const char *type) ;
double VarianceE(double sum, double sumofsquares, int number) ;
void Warn(char *message, ...) ;

/* sekw* ReadSeqWhole(FILE *in) ; */

/*
 * Checking whether sequence s has a start codon at position p.
 * p is a 0-based index into s->sequ and must not lie beyond the
 * terminating '\0' of the sequence.
 * Besides ATG the alternative start codons GTG and TTG are allowed; U is
 * accepted in place of T. Only upper case letters are recognised.
 * Returns TRUE if a start codon is found, FALSE otherwise.
 */

int IsStartCodon(sekw *s, long p) {

	const char *codon ;

	if(s == NULL || s->sequ == NULL || p < 0) return(FALSE) ;

	codon=s->sequ + p ;

	/*
	 * strchr() also matches the terminating '\0' of its search string, so
	 * the end of the sequence has to be tested explicitly; otherwise we
	 * would walk past the end of the buffer.
	 */
	if(codon[0] == '\0' || strchr("UATG",codon[0]) == NULL) return(FALSE) ;
	if(codon[1] == '\0' || strchr("UT",codon[1]) == NULL) return(FALSE) ;

	return(codon[2] == 'G' ? TRUE : FALSE) ;
}


/* 
 * This is a substitute for simple opening of a file.
 * The difference is, it also tries to open the file in the default
 * data directory and path.
 * TryOpen searches for the given filename *file_n in 1) current directory
 * 2) every directory of the DSEP separated path found in the environment
 * variable DVAR 3) default directory DDIR. On success, a file pointer to
 * the opened file is returned. If the file cannot be opened anywhere,
 * Error() is called, which terminates the program.
 * If mode contains 'w' and file_n can already be opened for reading,
 * Error() is called as well, so existing files are never overwritten.
 * Candidate paths that do not fit into FILENAME_MAX characters are
 * skipped instead of overflowing the path buffer.
 */

FILE *TryOpen(const char *file_n, const char *mode) {

	FILE *res ;
	const char *dir, *sep ;
	char fn[FILENAME_MAX + 1] ;
	size_t dirlen ;
	int n ;

	/* check whether file opened for writing already exists */
	if( strchr(mode,'w') && (res=fopen(file_n,"r")) ) {
		fclose(res) ;
		Error("Cowardly refusing to overwrite existing file") ;
	}

	/* first, the current directory */ 
	if( (res = fopen(file_n,mode) ) != NULL ) return res ;
	if(debug) Warn("File %s not in working directory",file_n) ;

	/* path env. variable found and not empty */
	if((dir = getenv(DVAR)) != NULL && dir[0] != '\0') {
		if(debug) Warn("Searching path %s",dir) ;

		/* 
		 * walking through the path in place; the environment string is
		 * neither copied nor modified
		 */
		for(;;) {
			sep = strchr(dir,DSEP) ;
			dirlen = (sep != NULL) ? (size_t) (sep - dir) : strlen(dir) ;

			if(dirlen < sizeof(fn)) {
				n = snprintf(fn,sizeof(fn),"%.*s/%s",(int) dirlen,dir,file_n) ;

				/* file found */
				if(n > 0 && (size_t) n < sizeof(fn) &&
				   (res = fopen(fn,mode)) != NULL) return res ;
			}

			/* there are n-1 separators in a path of n directories */
			if(sep == NULL) break ;
			dir = sep + 1 ;
		}
	} 
	if(debug) Warn("File %s not in GPPATH pathway",file_n) ;

	/* last chance: the built-in default directory */

	n = snprintf(fn,sizeof(fn),"%s/%s",DDIR,file_n) ;
	if(n > 0 && (size_t) n < sizeof(fn) &&
	   (res = fopen(fn,mode)) != NULL) return res ;

	/* report the name we were asked for, not just the last path tried */
	Error("Could not open file %s",file_n) ;

	return NULL ;
}


			

/* 
 * Here we copy a fragment of a sequence to another one 
 * Note that start is the no of the first base to be taken,
 * and end is the no of the last base to be taken (both counted from 1).
 * So the sequence length is end-start+1.
 * start==0 stands for the first and end==0 for the last base of s.
 * If start > end, the two are swapped and the fragment is returned as the
 * reverse complement (see Complement()).
 * Positions outside of the sequence are reported with a warning and
 * clipped to the sequence, so the returned length always matches the
 * letters actually copied.
 * Returns a newly allocated sequence carrying the name of s; the caller
 * has to free both the sequence string and the structure.
 */
sekw* CopySeqFrag(sekw * s, long start, long end) {

	sekw* out ;
	long i=0,dlug=0,srclen=0 ;
	int complement=FALSE ;
	char c;

	srclen=(long) strlen(s->sequ) ;

	if(start==0) start=1 ;
	if(end==0) end=srclen ;

	if(start > end) {
		complement=TRUE ;
		i=start ;
		start=end ;
		end=i ;
	}

	if(debug) Warn("CopySeqFrag from %li to %li, complement=%i",
		start,end,complement) ;

	/* clipping has to happen before the fragment length is computed */
	if(start < 1 || end > srclen) {
		Warn("Wrong paramenters for function CopySeqFrag") ;
		if(start < 1) start=1 ;
		if(end > srclen) end=srclen ;
	}

	dlug=end-start+1 ;
	if(dlug < 0) dlug=0 ; /* nothing left after clipping */

	out=(sekw*) calloc(1,sizeof(sekw)) ;
	if(out != NULL) out->sequ=(char*) calloc(dlug+1,sizeof(char)) ;

	if(out == NULL || out->sequ == NULL) {
		free(out) ;
		Error("Could not allocate memory in CopySeqFrag") ;
		return(NULL) ;
	}

	strcpy(out->name,s->name) ;

	/* copying char by char; the complement is read from the far end */
	for(i=0;i<dlug;i++) {
		if(complement) {
			c=Complement(s->sequ[end-1-i]) ;
		} else {
			c=s->sequ[start-1+i] ;
		}
		out->sequ[i]=c ;
	}

	out->sequ[dlug]='\0' ;
	out->leng=dlug ;
	return(out) ;

}



/* 
 * return the complementary Watson-Crick base pair 
 * The case of c is ignored and the result is always upper case. Apart
 * from A, C, G, T and U (whose complement is given as A) the ambiguity
 * codes M, K, Y, R, W, S, V, H, D, B and N are understood.
 * For any other character 0 is returned.
 */

char Complement(char c) {

	/* 
	 * toupper() is only defined for values representable as unsigned
	 * char (and EOF), so a possibly negative char must be converted first
	 */
	switch(toupper((unsigned char) c)) {
		case 'A': return('T') ;
		case 'C': return('G') ;
		case 'G': return('C') ;
		case 'T': return('A') ;
		case 'U': return('A') ;

		case 'M': return('K') ;
		case 'K': return('M') ;
		case 'Y': return('R') ;
		case 'R': return('Y') ;
		case 'W': return('W') ;
		case 'S': return('S') ;

		case 'V': return('B') ;
		case 'H': return('D') ;
		case 'D': return('H') ;
		case 'B': return('V') ;
		case 'N': return('N') ;

		default: break ;
	}

	return(0) ;

}


/* 
 * Standard error message 
 * First all warnings collected so far are printed, then the printf style
 * message, and finally the program is terminated with exit status 1.
 * The output goes to stderr, or to stdout if html is set. The message
 * itself is suppressed when quiet is set and debug is not.
 * This function does not return.
 */

void Error(char *message, ...) {

	va_list vl ;
	FILE* output ;

	output=html ? stdout : stderr ;

	PrintAllWarnings(output) ;

	if(!quiet || debug) {
		/* every va_start has to be paired with a va_end */
		va_start(vl,message) ;
		fprintf(output,"Ops :-(  ") ;
		vfprintf(output,message,vl) ;
		fprintf(output,". Exiting\n") ;
		va_end(vl) ;
	}
	exit(1) ;
}


/* Print all accumulated warnings of a program */

void PrintAllWarnings(FILE *out) {

	struct warnings_s *nextwarning=allwarnings ;
	struct warnings_s *oldwarning=allwarnings ;

	if(allwarnings!=NULL) {

		if(html) fprintf(out,"<PRE>") ;
		fprintf(out,"\n# Warnings issued during the execution of the program:\n") ;
		while(nextwarning != NULL) {
			fprintf(out,"# %s\n",nextwarning->warn) ;
			oldwarning=nextwarning ;
			nextwarning=nextwarning->next ;
			free(oldwarning) ;
		}
	}
	if(html) fprintf(out,"</PRE>") ;

}


/* Standard warning message */

void Warn(char *message, ...) {
	va_list vl ;
	FILE *output=stderr ;

	va_start(vl,message) ;

	if( (!quiet || debug) && !html ) {
		fprintf(output,"8^) ") ;
		vfprintf(output,message,vl) ;
		fprintf(output,"\n") ;
	}

	if(html) {

		if(allwarnings==NULL) {
			allwarnings=calloc(1,sizeof(*allwarnings) ) ;

			strcpy(allwarnings->warn,"-------- :\n") ;
			allwarnings->next=calloc(1,sizeof(*allwarnings)) ;
			currentwarning=allwarnings->next ;
			currentwarning->next=NULL ;
		}

		vsprintf(currentwarning->warn,message,vl) ;
		currentwarning->next=calloc(1,sizeof(*currentwarning) ) ;
		currentwarning=currentwarning->next ;
		currentwarning->next=NULL ;

	}
}


/* 
 * Last resort for situations which can only be caused by a bug in the
 * package itself: asks the user on stderr to send a bug report and
 * terminates the program with exit status 1.
 */

void Programming_Error() {
	static const char *const report[]={
		"You have found a bug in the Genpak package\n",
		"Please write down what you have been doing\n",
		"(which program, what parameters etc.) and mail the\n",
		"bug report to jw3@gyral.com. Thanks!\n"
	} ;
	size_t line ;

	for(line=0;line<sizeof(report)/sizeof(report[0]);line++)
		fputs(report[line],stderr) ;

	exit(1) ;
}




/* =======================================================================
 * Here the actual work of reading a sequence file is done.
 * *in - input file, start - start position, end - position, 
 + end==start==0 if the whole sequence should be read.
 * numseq - number of sequences to read. 0 to read all sequences.
 * (multiple sequence loading not implemented yet)
 * =======================================================================   */

sekw* ReadSeqFrag(FILE *in, long start, long end, int numseq)
{
	sekw *sekwencja ; /* The struct into which sequence is read */
	/* Number of characters read from the sequence file is dlug
	   size of buffer read, minus '\0', is bufsiz */
	long dlug=0; 
	long allocated=10000, licznik=1 ; /* this is a counter for reallocating memory for sequence */
	int whole=FALSE ; /* maybe we want the whole sequence */

	/* some temporary variables */
	int i=0, temp=0, check=TRUE ;

	if(start>end) { Programming_Error() ; }
	if((start==end) && (start==0)) { whole=TRUE ; start=1 ; }
	if(debug && whole) Warn("Reading the whole sequence") ;
	if(debug) Warn("Readseq: Reading sequence from %li to %li",start,end) ;

	if((sekwencja=(sekw*) malloc(sizeof(sekw)))==NULL)
		Programming_Error() ;
	

	strcpy(sekwencja->name,"Unknown\0") ;
	
/* checking whether fasta format or not */
	while( (temp=fgetc(in)) != '>') {
		if( (temp=fgetc(in)) == EOF) {
			if(debug) Warn("End of input") ;
			free(sekwencja) ;
			return(NULL) ;
		}
	}


/* reading the name of the sequence */
	while (i<MAXNAZWA) {
		if((temp=fgetc(in)) == EOF) {
			Warn("No sequence found") ;
			free(sekwencja->sequ) ;
			free(sekwencja) ;
			return(NULL) ;
		} else {
			if(temp=='\n') {
				sekwencja->name[i]='\0' ;
				i=MAXNAZWA ;
			} else {
			sekwencja->name[i]=(char)temp ;
			}
		}
		i++ ;
	}

	/* reading the actual sequence */
	/* First, allocating space for the beginning of the sequence */

	if(debug) Warn("Reading the sequence %s...\n",sekwencja->name) ;

	sekwencja->sequ=(char*) malloc((allocated + 1)*sizeof(char)) ;
	sekwencja->sequ[0]='\0' ;

	check=TRUE ; dlug=0 ; licznik=0 ;

	/* As long as we didn't got the whole sequence */
	while(check && (licznik<(end-start+1) || whole==TRUE)) {

		/*if end of file or a next sequence is reached */
		if((temp=fgetc(in))==EOF || temp=='>') {
			if (debug) Warn("Got to end of the file") ;
			

			/* EOF, but maybe the sequence is incomplete */
			if(whole != TRUE) 
				Error("Unexpected end of sequence %s",sekwencja->name) ;
			else check=FALSE ;

			/* ignoring blanks */
		} else if(temp!='\n' && temp!='\t' && temp!=' ') {
				dlug++ ;

				/* Have we started the reading? */
				if(dlug>=start) {
					sekwencja->sequ[licznik]=temp ;
					licznik++ ;
					if(licznik>allocated) {
						allocated += 10000 ;
						sekwencja->sequ=realloc(sekwencja->sequ,
									(sizeof(char)*(allocated+2))) ;
					}
				}
		}
	}

	if(temp=='>') ungetc(temp,in) ;

	sekwencja->sequ[licznik]='\0' ;
	sekwencja->leng=licznik ;

	if(debug) Warn("Sequence length %i",sekwencja->leng) ;

	if( (sekwencja->type=GetSeqType(sekwencja))==WRONG)
		Warn("Sequence %s is not DNA/RNA/Protein",sekwencja->name) ;
	
	if(debug) Warn("Sequence type %s",stypes[sekwencja->type]) ;

	if(sekwencja->leng == 0) {
		Warn("No sequence read") ;
		free(sekwencja->sequ) ;
		free(sekwencja) ;
		return(NULL) ;
	}


	if(debug) Warn("Sequence %s read succesfully",sekwencja->name) ;
	return(sekwencja) ;
}



/*
 * Reads the next complete sequence from the fasta file *in.
 * Everything in front of the next '>' is skipped. The rest of that line
 * becomes the sequence name; at most MAXNAZWA-1 characters of it are
 * kept. The sequence is read up to the next '>' (which is pushed back for
 * the following call) or up to the end of the file; newlines, tabs and
 * spaces are ignored.
 * Returns a newly allocated sequence with its type determined by
 * GetSeqType(), or NULL if there is no (further) sequence in the file.
 */
sekw* ReadWholeSequence(FILE *in) {

	sekw *sekwencja ; /* The struct into which sequence is read */
	char *grown ; /* result of enlarging the sequence buffer */
	/* allocated is the room for letters in sequ, not counting the '\0';
	   licznik is the number of letters stored so far */
	long allocated=1000000, licznik=0 ; 

	/* some temporary variables */
	int i=0, temp=0 ;

	if(debug) Warn("Reading the whole sequence") ;

	/* calloc, so that sequ, next and type have defined values */
	if((sekwencja=(sekw*) calloc(1,sizeof(sekw)))==NULL) {
		Error("Could not allocate memory for my sequence. Grr!") ;
		return(NULL) ;
	}
	
	strcpy(sekwencja->name,"Unknown") ;
	
	/* looking for the '>' which starts a fasta entry, one char at a time */
	while( (temp=fgetc(in)) != '>') {
		if(temp == EOF) {
			if(debug) Warn("End of input") ;
			free(sekwencja) ;
			return(NULL) ;
		}
	}

	/* 
	 * reading the name of the sequence: the whole header line is consumed,
	 * but only the first MAXNAZWA-1 characters are stored 
	 */
	while( (temp=fgetc(in)) != EOF && temp != '\n') {
		if(i < MAXNAZWA-1) {
			sekwencja->name[i]=(char)temp ;
			i++ ;
		}
	}
	sekwencja->name[i]='\0' ;

	if(temp == EOF) {
		Warn("No sequence found") ;
		free(sekwencja) ;
		return(NULL) ;
	}

	/* reading the actual sequence */
	/* First, allocating space for the beginning of the sequence */

	if(debug) Warn("Reading the sequence %s...\n",sekwencja->name) ;

	sekwencja->sequ=(char*) malloc((allocated + 1)*sizeof(char)) ;
	if(sekwencja->sequ == NULL) {
		free(sekwencja) ;
		Error("Could not allocate memory for my sequence. Grr!") ;
		return(NULL) ;
	}
	sekwencja->sequ[0]='\0' ;

	/* As long as we didn't got the whole sequence */
	while((temp=fgetc(in)) != EOF && temp!='>') {

		/* ignoring blanks */
		if(temp=='\n' || temp=='\t' || temp==' ') continue ;

		/* buffer full: enlarge it before storing the letter */
		if(licznik == allocated) {
			grown=realloc(sekwencja->sequ,
						(sizeof(char)*(allocated+100000+1))) ;
			if(grown == NULL) {
				free(sekwencja->sequ) ;
				free(sekwencja) ;
				Error("Could not allocate memory for my sequence. Grr!") ;
				return(NULL) ;
			}
			sekwencja->sequ=grown ;
			allocated += 100000 ;
		}

		sekwencja->sequ[licznik]=(char)temp ;
		licznik++ ;
	}

	/* the next entry has to find its '>' */
	if(temp=='>') ungetc(temp,in) ;

	sekwencja->sequ[licznik]='\0' ;
	sekwencja->leng=licznik ;

	if( (sekwencja->type=GetSeqType(sekwencja))==WRONG)
		Warn("Sequence %s is not DNA/RNA/Protein",sekwencja->name) ;

	if(sekwencja->leng == 0) {
		Warn("No sequence read") ;
		free(sekwencja->sequ) ;
		free(sekwencja) ;
		return(NULL) ;
	}

	sekwencja->next = NULL ;
	if(debug) Warn("Sequence %s read succesfully %li nt type %s",
		sekwencja->name, sekwencja->leng, stypes[sekwencja->type]) ;
	return(sekwencja) ;

}




/* 
 * Reverse the sequence provided. Note that the result is not merely
 * reversed: every letter is also replaced by its complement (see
 * Complement()), i.e. the reverse complement of s is returned.
 * Complement() yields 0 for letters it does not know, which ends the
 * resulting string early.
 * The new sequence is named after s with "(rev)" appended and has the
 * type of s. It is newly allocated; the caller has to free both the
 * sequence string and the structure.
 */

sekw* ReverseSequence(sekw* s) {

	sekw* sout;
	long d, i;

	d=(long) strlen(s->sequ) ;

	/* calloc, so that next is NULL and no field is left undefined */
	sout=(sekw*) calloc(1,sizeof(sekw)) ;
	if(sout != NULL) sout->sequ=(char*) malloc((d+1)*sizeof(char)) ;

	if(sout == NULL || sout->sequ == NULL) {
		free(sout) ;
		Error("Could not allocate memory in ReverseSequence") ;
		return(NULL) ;
	}

	/* making room for the suffix in overlong names */
	strcpy(sout->name,s->name) ;
	if(!(strlen(s->name) < MAXNAZWA)) {
		sout->name[MAXNAZWA-6]='\0' ;
	}
	strcat(sout->name,"(rev)") ;

	for(i=0;i<d;i++) {
		sout->sequ[(d-1-i)]=Complement(s->sequ[i]) ;
	}

	sout->sequ[d]='\0' ;
	sout->leng=d ;
	sout->type=s->type ;

	return(sout) ;
}




/* 
 * This function prints out a sequence nicely formatted in the fasta style 
 * out - the file to print to, s - the sequence, width - number of letters
 * per line; with width 0 (or a negative width) the whole sequence is
 * printed on a single line.
 * The header line is ">name (length unit)", where unit is "bases",
 * "residues" or "letters", depending on the sequence type.
 * Note that the name of s is modified: it is cut at the first '(' and
 * shortened to at most MAXNAZWA-21 characters.
 * Always returns EXIT_SUCCESS.
 */

int FormatSeq(FILE* out, sekw* s, int width) {
	long d,i=0,chunk=0 ; 
	char *p ;
	const char *typname ;

	if(strlen(s->name) >= (MAXNAZWA-21))
		s->name[MAXNAZWA-21]='\0' ;

	/* dropping a suffix such as "(rev)" appended by other functions */
	if((p=strchr(s->name,'('))!=NULL) 
		p[0]='\0' ;

	if(s->type == RNA || s->type==DNA) typname="bases" ;
	else if(s->type == PROTEIN) typname="residues" ;
	else typname="letters" ;
	
	d=(long) strlen(s->sequ) ;
	fprintf(out, ">%s (%li %s)\n", s->name, d, typname) ;

	/* a negative width would never advance through the sequence */
	if(width<=0) {
		fprintf(out,"%s\n",s->sequ) ;
	} else {
		for(i=0;i<d;i+=width) {
			chunk=(d-i < width) ? d-i : width ;
			fwrite(s->sequ+i,sizeof(char),(size_t) chunk,out) ;
			fputc('\n',out) ;
		}
	}

	return(EXIT_SUCCESS) ;

}



/* 
 * This function tries to determine the sequence type 
 * The letters of s are examined one by one, ignoring their case:
 *  - a letter which is neither a nucleotide (A, C, G, T, U) nor one of
 *    the amino acid codes (including '0' for stop) makes the sequence
 *    WRONG
 *  - the first letter which can only occur in proteins makes it PROTEIN,
 *    the first U makes it RNA
 *  - an RNA containing a protein letter or T, and a protein containing
 *    U, are WRONG
 *  - a sequence consisting of A, C, G and T only (or an empty one) is DNA
 * Examination stops as soon as the sequence is known to be WRONG.
 */

sequence_type GetSeqType(sekw *s) {
	size_t dlug, i=0 ;
	sequence_type type=UNKNOWN ;
	int c ;

	dlug=strlen(s->sequ) ;

	for(i=0;i<dlug && type!=WRONG;i++) {
		/* toupper() must not be fed a negative char */
		c=toupper((unsigned char) s->sequ[i]) ;

		/* it has to be either RNA, or DNA, or protein; Y is tyrosine */
		if(strchr("0ACDEFGHIKLMNPQRSTVWYU",c)==NULL) {
			if(debug) Warn("Sequence %s has strange characters",s->name) ;
			type=WRONG ;
		}

		/* if it's unknown, let's try to find out what it is */

		if(type==UNKNOWN) {
			if(strchr("0DEFHIKLMNPQRSVWY",c)!=NULL) {
				type=PROTEIN ;
			} else if(c=='U') type=RNA ;
		}

		/* if it's RNA, it can't be protein or T */
		if((type==RNA) && 
			 (strchr("0DEFHIKLMNPQRSVWYT",c)!=NULL)) {
			if(debug) Warn("I thought %s is RNA?!",s->name) ;
			type=WRONG ;
		}
		
		/* if it's a protein, it can't be U */
		if((type==PROTEIN) &&
			 (c=='U')) {
			if(debug) Warn("I thought %s is protein?!",s->name) ;
			type=WRONG ;
		}

	}
	if(type==UNKNOWN) type=DNA ;
	return(type) ;
}



/* 
 * Comparing two nucleotides isn't easy, because some sequences contain
 * "wildcards", like R signifying A or G etc.
 * Both letters are converted to upper case first. c1 selects the set of
 * letters it is compatible with, and TRUE is returned if c2 belongs to
 * that set, FALSE otherwise. N and X as c1 match everything. If c1 is not
 * a known nucleotide code, a warning is issued and 0 is returned.
 */

int Compare(char c1, char c2) {

	/* letters compatible with c1, for the codes handled by lookup */
	const char *matching=NULL ;

	c1=toupper(c1) ;
	c2=toupper(c2) ;

	switch(c1) {
		/* plain bases */
		case 'A': matching="AMRWVHDN" ; break ;
		case 'C': matching="CMSYVHBN" ; break ;
		case 'G': matching="GRSKVDBN" ; break ;
		case 'U':
		case 'T': matching="UTWYKHDBN" ; break ;

		/* two-base wildcards */
		case 'M': matching="AMRWVHDNCMSYVHBN" ; break ;
		case 'R': matching="AMRWVHDNGRSKVDBN" ; break ;
		case 'W': matching="UAMRWVHDNTWYKHDBN" ; break ;
		case 'S': matching="CMSYVHBNGRSKVDBN" ; break ;
		case 'Y': matching="UCMSYVHBNTWYKHDBN" ; break ;
		case 'K': matching="UGRSKVDBNTWYKHDBN" ; break ;

		/* three-base wildcards: everything but one base matches */
		case 'V': return((c2=='T' || c2=='U') ? FALSE : TRUE) ;
		case 'H': return((c2=='G') ? FALSE : TRUE) ;
		case 'D': return((c2=='C') ? FALSE : TRUE) ;
		case 'B': return((c2=='A') ? FALSE : TRUE) ;

		/* anything goes */
		case 'N':
		case 'X':
			return(TRUE) ;

		default:
			Warn("Wrong sequence characters :%c: and :%c: in comparison",c1,c2) ;
			return(0) ;
	}

	if(strchr(matching,c2)!=NULL) return(TRUE) ;
	return(FALSE) ;
}


/* 
 * converts DNA/RNA sequence to protein. Start is the number of the first
 * nucleotide to start translation with (counted from 1). Translation
 * proceeds codon by codon according to intable and stops at the first
 * stop codon, which is not included in the result, or when less than a
 * complete codon is left. The length of inseq is taken from inseq->leng.
 * Codons containing anything but A, C, G, T or U (in either case) are
 * translated to 'X'.
 * If pedantic==TRUE Dna2Protein will return a NULL pointer instead of a
 * sequence if the sequence is erronous, i.e. it does not start with a
 * start codon or does not stop with a stop codon.
 * If pedantic==FALSE and there is no complete codon at start, an empty
 * protein is returned.
 * Note that the name of inseq is modified: it is cut at the first '('.
 * The result is newly allocated and named after inseq with
 * " (translated)" appended; the caller has to free both the sequence
 * string and the structure.
 */

sekw* Dna2Protein(sekw *inseq, codont* intable, long start, int pedantic) {

	/* i is a counter for triplet acquirement, j denotes the position of
	 * the current codon in the sequence. Aaread denotes the number of aa
	 * residues written in the sequence outseq */

	long dlugosc,j=0,aaread=0 ;
	int i, coord[3] ;
	int in_range, valid, stopped=FALSE ;
	char t, *tmp;
	sekw *outseq ;

	if(debug) Warn("Dna2Protein: from %li, pedantic=%i",start,pedantic) ;

	if((tmp=strchr(inseq->name,'(')) != NULL) tmp[0]='\0' ;
	if(strlen(inseq->name)>=MAXNAZWA)
		inseq->name[MAXNAZWA-12]='\0' ; 

	dlugosc=inseq->leng ;
	if(dlugosc < 0) dlugosc=0 ;

	/* is there a complete codon at the start position at all? */
	in_range=(start >= 1 && start <= dlugosc-2) ;

	/* first triplet: is it a valid start codon? */
	if(pedantic && (!in_range || IsStartCodon(inseq,start-1) == FALSE)) {
		if(debug)
			Warn("D2P: %s does not start with a start codon!", inseq->name) ;
		return(NULL) ;
	}

	if(!in_range && debug)
		Warn("D2P: no complete codon at position %li of %s",
			start,inseq->name) ;

	/* Allocating enough memory for the outseq */
	outseq=(sekw*) calloc(1,sizeof(sekw)) ;
	if(outseq != NULL)
		outseq->sequ=(char*) calloc((size_t) (dlugosc/3 + 2),sizeof(char)) ;

	if(outseq == NULL || outseq->sequ == NULL) {
		free(outseq) ;
		Error("Could not allocate memory in Dna2Protein") ;
		return(NULL) ;
	}

	outseq->type=PROTEIN ;

	/* 
	 * the name is shortened if necessary, so that it stays below MAXNAZWA
	 * characters including the 13 characters of the suffix 
	 */
	snprintf(outseq->name,sizeof(outseq->name),"%.*s (translated)",
		MAXNAZWA-14,inseq->name) ;

	/* translating as long as a complete triplet is left */
	for(j=start-1;in_range && j <= dlugosc-3;j+=3) {

		/* converting the letters to coordinates of the codont matrix */
		valid=TRUE ;
		for(i=0;i<3;i++) {
			switch(toupper((unsigned char) inseq->sequ[j+i])) {
				case 'A': coord[i]=0 ; break ;
				case 'C': coord[i]=1 ; break ;
				case 'G': coord[i]=2 ; break ;
				case 'T':
				case 'U': coord[i]=3 ; break ;
				default: coord[i]=0 ; valid=FALSE ; break ;
			}
		}

		t=valid ? intable->tbl[coord[0]][coord[1]][coord[2]] : 'X' ;

		if(t == '0') {
			stopped=TRUE ;
			break ;
		}

		outseq->sequ[aaread]=t ;
		aaread++ ;
	}
	
	if(pedantic && !stopped) {
		if(debug)
			Warn("D2P: No stop codon found in %s",inseq->name) ;
		free(outseq->sequ) ;
		free(outseq) ;
		return(NULL) ;
	}

	outseq->sequ[aaread]='\0' ;
	outseq->leng=aaread ;

	if(debug) Warn("Dna2Prot: %s translated succesfully, %li residues",
		inseq->name,aaread) ;
	return(outseq) ;

}


/* 
 * The standard codon table is loaded. In Genpak programs, A=0, C=1, G=2
 * and T/U=3
 */

codont * LoadStandardTable() {

	codont* outtable ;

	outtable=(codont*) calloc(1,sizeof(codont)) ;

	outtable->tbl[0][0][0]='K' ;
	outtable->tbl[0][0][1]='N' ;
	outtable->tbl[0][0][2]='K' ;
	outtable->tbl[0][0][3]='N' ;
	outtable->tbl[0][1][0]='T' ;
	outtable->tbl[0][1][1]='T' ;
	outtable->tbl[0][1][2]='T' ;
	outtable->tbl[0][1][3]='T' ;
	outtable->tbl[0][2][0]='R' ;
	outtable->tbl[0][2][1]='S' ;
	outtable->tbl[0][2][2]='R' ;
	outtable->tbl[0][2][3]='S' ;
	outtable->tbl[0][3][0]='I' ;
	outtable->tbl[0][3][1]='I' ;
	outtable->tbl[0][3][2]='M' ;
	outtable->tbl[0][3][3]='I' ;
	outtable->tbl[1][0][0]='Q' ;
	outtable->tbl[1][0][1]='H' ;
	outtable->tbl[1][0][2]='Q' ;
	outtable->tbl[1][0][3]='H' ;
	outtable->tbl[1][1][0]='P' ;
	outtable->tbl[1][1][1]='P' ;
	outtable->tbl[1][1][2]='P' ;
	outtable->tbl[1][1][3]='P' ;
	outtable->tbl[1][2][0]='R' ;
	outtable->tbl[1][2][1]='R' ;
	outtable->tbl[1][2][2]='R' ;
	outtable->tbl[1][2][3]='R' ;
	outtable->tbl[1][3][0]='L' ;
	outtable->tbl[1][3][1]='L' ;
	outtable->tbl[1][3][2]='L' ;
	outtable->tbl[1][3][3]='L' ;
	outtable->tbl[2][0][0]='E' ;
	outtable->tbl[2][0][1]='D' ;
	outtable->tbl[2][0][2]='E' ;
	outtable->tbl[2][0][3]='D' ;
	outtable->tbl[2][1][0]='A' ;
	outtable->tbl[2][1][1]='A' ;
	outtable->tbl[2][1][2]='A' ;
	outtable->tbl[2][1][3]='A' ;
	outtable->tbl[2][2][0]='G' ;
	outtable->tbl[2][2][1]='G' ;
	outtable->tbl[2][2][2]='G' ;
	outtable->tbl[2][2][3]='G' ;
	outtable->tbl[2][3][0]='V' ;
	outtable->tbl[2][3][1]='V' ;
	outtable->tbl[2][3][2]='V' ;
	outtable->tbl[2][3][3]='V' ;
	outtable->tbl[3][0][0]='0' ;
	outtable->tbl[3][0][1]='T' ;
	outtable->tbl[3][0][2]='0' ;
	outtable->tbl[3][0][3]='T' ;
	outtable->tbl[3][1][0]='S' ;
	outtable->tbl[3][1][1]='S' ;
	outtable->tbl[3][1][2]='S' ;
	outtable->tbl[3][1][3]='S' ;
	outtable->tbl[3][2][0]='0' ;
	outtable->tbl[3][2][1]='C' ;
	outtable->tbl[3][2][2]='W' ;
	outtable->tbl[3][2][3]='C' ;
	outtable->tbl[3][3][0]='L' ;
	outtable->tbl[3][3][1]='F' ;
	outtable->tbl[3][3][2]='L' ;
	outtable->tbl[3][3][3]='F' ;
	return(outtable) ;

}


/* 
 * Initializing the table one2three, which stores conversions from one
 * letter to three letter amino acid codes. The stop symbol '0' is
 * converted to "STP". Entries for other characters are left untouched.
 */

void InitializeOne2Three() {

	static const struct {
		char one ;
		const char *three ;
	} codes[]={
		{'0',"STP"}, {'G',"Gly"}, {'A',"Ala"}, {'V',"Val"}, {'L',"Leu"},
		{'I',"Ile"}, {'P',"Pro"}, {'C',"Cys"}, {'M',"Met"}, {'F',"Phe"},
		{'W',"Trp"}, {'S',"Ser"}, {'T',"Thr"}, {'Y',"Tyr"}, {'N',"Asn"},
		{'Q',"Gln"}, {'K',"Lys"}, {'R',"Arg"}, {'H',"His"}, {'D',"Asp"},
		{'E',"Glu"}
	} ;
	size_t k ;

	for(k=0;k<sizeof(codes)/sizeof(codes[0]);k++)
		strcpy(one2three[(int) codes[k].one],codes[k].three) ;

}

/* 
 * Appends a new position to the list starting at *head, whose last node
 * is tail (ignored while the list is empty). Returns the new last node.
 */

static position_s *AppendPosition(position_s **head, position_s *tail,
		long start, long end) {

	position_s *node=calloc(1,sizeof(*node)) ;

	if(node == NULL) {
		Error("Could not allocate memory for a sequence position") ;
		return(tail) ;
	}

	node->start=start ;
	node->end=end ;
	node->next=NULL ;

	if(*head == NULL) *head=node ;
	else tail->next=node ;

	return(node) ;
}

/* 
 * Finds all occurencies of sequence "query" in sequence "subject" 
 * The letters are compared with Compare(), so wildcards in the query are
 * honoured. Both the query and its reverse complement (see
 * ReverseSequence()) are looked for at every position of the subject.
 * Positions are counted from 1. For a match of the query itself start is
 * the first and end the last matching position; for a match of the
 * reverse complement start and end are exchanged, i.e. start > end.
 * The leng fields of both sequences are updated from the sequence
 * strings.
 * Returns a newly allocated list of positions in the order of their
 * occurence, or NULL if nothing was found, the query is empty or the
 * query is longer than the subject.
 */

position_s *FindSeq(sekw *q, sekw *s) {

	position_s *result=NULL ;
	position_s *tail=NULL ; 
	long pos1, i ;
	sekw *q_rev ;

	s->leng=strlen(s->sequ) ;
	q->leng=strlen(q->sequ) ;

	/* an empty query would "match" at every single position */
	if(q->leng == 0 || q->leng > s->leng) return NULL ;

	if(debug) Warn("FindSeq: Starting search for %s",q->sequ) ;
	/* preparing the reverse needle */
	q_rev=ReverseSequence(q) ;
	if(q_rev == NULL) return NULL ;

	/* The main search loop */
	for(pos1=0;pos1<(s->leng-q->leng+1);pos1++) {
		
		/* searching forwards */
		i=0 ;
		while(i<q->leng && Compare(q->sequ[i],s->sequ[pos1+i])) i++ ;
		if(i==q->leng)
			tail=AppendPosition(&result,tail,pos1+1,pos1+q->leng) ;

		/* searching backwards */
		i=0 ;
		while(i<q->leng && Compare(q_rev->sequ[i],s->sequ[pos1+i])) i++ ;
		if(i==q->leng)
			tail=AppendPosition(&result,tail,pos1+q->leng,pos1+1) ;
	
	}

	/* the reverse needle owns its sequence string, too */
	free(q_rev->sequ) ;
	free(q_rev) ;
	return(result) ;

}



/*
 * Variance estimate computed from the sum of the values, the sum of their
 * squares and their number:
 *   (sumofsquares - sum*sum/number) / (number - 1)
 * At least two values are needed; otherwise a warning is issued and -1
 * is returned.
 */
double VarianceE(double sum, double sumofsquares, int number) {

	double squaredsum ;

	if(!(number >= 2)) { 
		Warn("Wrong parameter for function VarianceE()!") ;
		return -1 ;
	}

	squaredsum=sum*sum ;

	return (sumofsquares - squaredsum/number)/(number-1) ;

}

