/* ----------------------------------------------------------------------
 * Some common functions for the Genpak package
 * Copyright (C) 2000 January Weiner III
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
 * USA.
 ---------------------------------------------------------------------- */

#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <string.h>
#include <ctype.h>
#include "gp_getopt.h"

#define MAXSTR 500
#define MAXNAZWA 100
#define MAXLEN 9 
#define QUIET 0
#define DEBUG 0
#define FALSE 0
#define TRUE 1

/* char to separate directories */
#ifndef DSEP
	#define DSEP ':'
#endif

/* default path variable name, can be defined at compile time */
#ifndef DVAR
	#define DVAR "GPPATH"
#endif

/* default search directory, can be defined at compile time */
#ifndef DDIR
	#define DDIR "/usr/lib/genpak"
#endif


/*
 * Singly linked list node holding one warning message.
 *
 * When html is set, gp_warn() does not print a warning immediately but
 * stores the formatted text in a node of this type; gp_warn_print_all()
 * later prints every stored message and frees the nodes.
 */

struct warnings_s {
	char warn[200] ;           /* formatted message text (fixed-size buffer) */
	struct warnings_s *next ;  /* next node, or NULL at the end of the list */
};


/*
 * getopt code: types
 *
 * How option parsing treats non-option arguments (see the getopt
 * description further down in this file):
 *   PERMUTE         - reorder argv so that non-options end up last
 *   RETURN_IN_ORDER - report each non-option as the argument of an
 *                     option with character code 1
 *   REQUIRE_ORDER   - stop option processing at the first non-option
 */
typedef enum GETOPT_ORDERING_T {
    PERMUTE,
    RETURN_IN_ORDER,
    REQUIRE_ORDER
} GETOPT_ORDERING_T;


/*
 * Warning list shared with gp_warn() / gp_warn_print_all(): allwarnings is
 * the head of the list, currentwarning the node the next message is
 * written into. Both are defined outside this file.
 */
extern struct warnings_s *allwarnings ;
extern struct warnings_s *currentwarning ;

/* getopt code: globally-defined variables */
char *optarg = NULL;
/* 0 marks "not started yet": getopt_internal() sets it to 1 on first use */
int optind = 0;
int opterr = 1;
int optopt = '?';

/*
 * three global variables needed by all programs
 * (plus progname, which prefixes every message printed by gp_warn() and
 * gp_error()); all of them are defined outside this file
 */
extern int debug ;    /* non-zero: print diagnostic messages */
extern int quiet ;    /* non-zero: suppress messages unless debug is set */
extern int html ;     /* non-zero: collect warnings, report on stdout */
extern char* progname ;

/* These are used for designating sequence types */
typedef enum { UNKNOWN= 0, DNA, RNA, PROTEIN, WRONG } sequence_type;
/*
 * Printable names of the sequence types, indexed by sequence_type.
 * The entries must stay in the same order as the enum constants above.
 */
char stypes[5][10] = {"unknown","dna","rna","protein","wrong"} ;

/*
 * conversion from one letter aminoacid code to three letter code:
 * indexed by the character code of the one letter symbol, each entry
 * holds a three letter abbreviation plus the terminating NUL.
 * Filled in by gp_codon_init_conversion().
 */
char one2three[128][4] ;

/*
 * table for storing the genetic code in one letter notation:
 * tbl[first][second][third] is the amino acid for a codon, where each
 * index encodes a base as A = 0, C = 1, G = 2, T/U = 3 and the
 * character '0' marks a stop codon (see gp_codon_table_load_std()).
 */
typedef struct {
	char tbl[4][4][4] ;
} codont ;

/*
 * structure type for sequences; nodes can be chained into a doubly
 * linked list (see gp_seq_read_all())
 */
struct _sekw {
	char *name ;            /* heap-allocated sequence name */
	char *sequ ;            /* heap-allocated, NUL-terminated sequence */
	long leng ;             /* number of characters stored in sequ */
	struct _sekw *next ;    /* following sequence in a list, or NULL */
	struct _sekw *prev ;    /* preceding sequence in a list, or NULL */
	sequence_type type ; } ; /* as determined by gp_seq_get_type() */

typedef struct _sekw sekw ;


/*
 * structure for storing positions as linked lists.
 * gp_seq_find() fills it with 1-based, inclusive coordinates; a hit on
 * the reverse strand is stored with start > end.
 */
struct _position_s {
	long start ;
	long end ; 
	struct _position_s *next ; } ;  /* next hit, or NULL */

typedef struct _position_s position_s ;


/* Function prototypes */

char              gp_nucl_complement(char c) ;

void              gp_warn(char *message, ...) ;
void              gp_warn_print_all(FILE *out) ;
void              gp_error(char *message, ...) ;
void              gp_error_fatal() ;

void              gp_codon_init_conversion() ;
int               gp_codon_isstart(sekw *s, long p) ;
codont*           gp_codon_table_load_std() ;

void              gp_seq_free(sekw *s) ;
int               gp_seq_print_fasta(FILE* out, sekw* s, int width) ;
sekw*             gp_seq_read_fragment(FILE *in, long start, long end, int numseq) ;
sekw*             gp_seq_read(FILE *in) ;
sekw*             gp_seq_reverse(sekw* s) ;
sekw*             gp_seq_copy_frag(sekw * s, long start, long end) ;
position_s*       gp_seq_find(sekw *query, sekw *subject) ;
sequence_type     gp_seq_get_type(sekw *s) ;
sekw*             gp_seq_dna_to_protein(sekw *inseq, 
                         codont* intable, 
												 long start, 
												 int pedantic) ;

FILE*             gp_file_open(const char *filename, const char *type) ;
double            gp_variance_e(double sum, double sumofsquares, int number) ;
char*             gp_strdup(char *s) ;


/*
 * Release the strings owned by a sekw structure.
 *
 * Frees s->name and s->sequ and resets both pointers to NULL, so that a
 * repeated call (or a later free of the same members) is harmless.
 * The structure itself is NOT freed here: callers that obtained it from
 * the heap remain responsible for releasing it.
 * A NULL argument is ignored.
 */
void gp_seq_free(sekw *s) {
	if(s == NULL) return ;

	/* free(NULL) is a no-op, no need to test the members first */
	free(s->name) ;
	s->name = NULL ;
	free(s->sequ) ;
	s->sequ = NULL ;
}


/*
 * equivalent to non-ANSI strdup() function
 *
 * Returns a newly allocated copy of the string s which the caller has to
 * free(), or NULL if s is NULL or no memory is available.
 */
char *gp_strdup(char *s) {
	char *res ;
	size_t len ;

	if(s == NULL) return NULL ;

	len = strlen(s) + 1 ;  /* include the terminating NUL */
	res = malloc(len) ;
	if(res == NULL) return NULL ;

	memcpy(res, s, len) ;
	return res ;
}


/* 
 * Checking whether sequence s has a start codon at position p 
 * (p is a 0-based index into s->sequ).
 * Alternative start codons GTG and TTG are allowed; U is accepted in
 * place of T, so the accepted pattern is [UATG][UT]G in upper case.
 *
 * Returns TRUE if a start codon begins at p, FALSE otherwise (also for a
 * NULL sequence, a negative p, or a codon cut short by the end of the
 * string). p must not point beyond the terminating NUL of s->sequ.
 */

int gp_codon_isstart(sekw *s, long p) {

	const char *codon ;

	if(s == NULL || s->sequ == NULL || p < 0) return(FALSE) ;

	codon = s->sequ + p ;

	/*
	 * strchr() also matches the terminating NUL of its search set, so an
	 * end-of-string at codon[0] or codon[1] would pass the tests below
	 * and make us read past the end of the sequence.
	 */
	if(codon[0] == '\0' || codon[1] == '\0') return(FALSE) ;

	/* allow start codons: ATG, GTG, TTG */
	if(strchr("UATG",codon[0]) != NULL &&
	   strchr("UT",codon[1]) != NULL &&
	   codon[2] == 'G') return(TRUE) ;

	return(FALSE) ;
}


/* 
 * This is a substitute for simple opening of a file.
 * The difference is, it also tries to open the file in the default
 * data directory and path.
 *
 * The file file_n is searched for in 1) the current directory, 2) every
 * directory of the DSEP-separated path found in the environment variable
 * DVAR, 3) the default directory DDIR. On success, a file pointer to the
 * opened file is returned. On failure gp_error() is called, which
 * terminates the program.
 *
 * If mode contains 'w' and file_n can already be opened for reading,
 * gp_error() is called instead of overwriting the file.
 *
 * Candidate names that do not fit into FILENAME_MAX characters are
 * skipped instead of overflowing the name buffer; an over-long search
 * path is ignored with a warning.
 */

FILE *gp_file_open(const char *file_n, const char *mode) {

	FILE *res ;
	char *p, *q, fn[FILENAME_MAX + 1], path[FILENAME_MAX + 1] ;
	int n ;

	/* check whether file opened for writing already exists */
	if( strchr(mode,'w') && (res = fopen(file_n,"r")) ) {
		fclose(res) ;
		gp_error("Cowardly refusing to overwrite existing file") ;
	}

	/* first, the current directory */ 
	if( (res = fopen(file_n,mode) ) != NULL ) return res ;
	if(debug) gp_warn("File %s not in working directory",file_n) ;

	/* path env. variable found and not empty */
	p = getenv(DVAR) ;
	if(p != NULL && p[0] != '\0') {
		if(debug) gp_warn("Searching path %s",p) ;

		if(strlen(p) >= sizeof(path)) {
			/* the environment is not under our control: never copy blindly */
			gp_warn("Search path in %s is too long, skipping it",DVAR) ;
		} else {
			/* work on a private copy, the separators get overwritten */
			strcpy(path,p) ;

			/* q walks over the components, p finds the next separator */
			for(q = path ; q != NULL ; q = (p != NULL) ? p + 1 : NULL) {
				p = strchr(q,DSEP) ;
				if(p != NULL) p[0] = '\0' ;

				n = snprintf(fn,sizeof(fn),"%s/%s",q,file_n) ;
				if(n < 0 || (size_t) n >= sizeof(fn)) continue ; /* too long */

				/* file found */
				if( (res = fopen(fn,mode)) != NULL) return res ;
			}
		}
	} 
	if(debug) gp_warn("File %s not in GPPATH pathway",file_n) ;

	/* last chance: the built-in default directory */
	n = snprintf(fn,sizeof(fn),"%s/%s",DDIR,file_n) ;
	if(n >= 0 && (size_t) n < sizeof(fn) &&
	   (res = fopen(fn,mode)) != NULL) return res ;

	gp_error("Could not open file %s",fn) ;

	return NULL ;
}
			

/* 
 * Here we copy a fragment of a sequence to another one 
 * Note that start is the no of the first base to be taken,
 * and end is the no of the last base to be taken (both 1-based). So the
 * sequence length is end-start+1
 *
 * start == 0 means "from the first base", end == 0 means "to the last
 * base". If start > end the two are swapped and the complemented
 * fragment is returned in reverse order (see gp_nucl_complement()).
 * Coordinates outside the sequence are clamped with a warning.
 *
 * Returns a newly allocated sekw carrying a copy of the name and the
 * type of s; the caller owns it. Allocation failures end the program
 * through gp_error().
 */
sekw* gp_seq_copy_frag(sekw * s, long start, long end) {

	sekw* out ;
	long i, dlug, len ;
	int complement = FALSE ;

	len = (long) strlen(s->sequ) ;

	if(start == 0) start = 1 ;
	if(end == 0) end = len ;

	if(start > end) {
		complement = TRUE ;
		i = start ;
		start = end ;
		end = i ;
	}

	if(debug) gp_warn("gp_seq_copy_frag from %li to %li, complement = %i",
		start,end,complement) ;

	/* clamp BEFORE the length is computed, otherwise leng is too large */
	if(start < 1 || end > len) {
		gp_warn("Wrong paramenters for function gp_seq_copy_frag") ;
		if(start < 1) start = 1 ;
		if(end > len) end = len ;
	}

	dlug = end - start + 1 ;
	if(dlug < 0) dlug = 0 ;  /* fragment lies completely outside */

	/* calloc leaves next and prev as NULL */
	out = calloc(1, sizeof(*out)) ;
	if(out == NULL)
		gp_error("gp_seq_copy_frag: could not allocate enough memory") ;

	out->sequ = calloc(dlug + 1, sizeof(*out->sequ)) ;
	if(out->sequ == NULL)
		gp_error("gp_seq_copy_frag: could not allocate enough memory") ;

	out->name = gp_strdup(s->name) ;

	/* copying char by char; the complement is read back to front */
	for(i = 0 ; i < dlug ; i++) {
		if(complement) {
			out->sequ[i] = gp_nucl_complement(s->sequ[end - 1 - i]) ;
		} else {
			out->sequ[i] = s->sequ[start - 1 + i] ;
		}
	}

	out->sequ[dlug] = '\0' ;
	out->leng = dlug ;
	out->type = s->type ;
	return(out) ;
}



/*
 * return the complementary Watson-Crick base pair
 *
 * c may be given in upper or lower case; the result is always upper
 * case. Ambiguity codes (M, K, Y, R, W, S, V, H, D, B, N) are mapped to
 * the code describing the complementary set of bases. U is complemented
 * to A. Any other character yields 0.
 */

char gp_nucl_complement(char c) {

	/* toupper() is only defined for unsigned char values and EOF */
	switch(toupper((unsigned char) c)) {
		case 'A': return('T') ;
		case 'C': return('G') ;
		case 'G': return('C') ;
		case 'T': return('A') ;
		case 'U': return('A') ;

		/* two-base ambiguity codes */
		case 'M': return('K') ;
		case 'K': return('M') ;
		case 'Y': return('R') ;
		case 'R': return('Y') ;
		case 'W': return('W') ;
		case 'S': return('S') ;

		/* three- and four-base ambiguity codes */
		case 'V': return('B') ;
		case 'H': return('D') ;
		case 'D': return('H') ;
		case 'B': return('V') ;
		case 'N': return('N') ;

		default: break ;
	}

	return(0) ;
}


/*
 * Standard error message: report a fatal, printf-style formatted error
 * and terminate the program with exit status 1.
 *
 * All warnings collected so far are printed first. Output goes to
 * stderr, or to stdout when html is set. The message itself is
 * suppressed when quiet is set and debug is not.
 */

void gp_error(char *message, ...) {

	va_list args ;
	FILE *sink = html ? stdout : stderr ;

	gp_warn_print_all(sink) ;
	va_start(args,message) ;

	if(debug || !quiet) {
		fprintf(sink,"%s: fatal: ", progname) ;
		vfprintf(sink,message,args) ;
		fputs(". Exiting\n",sink) ;
	}

	exit(1) ;
}


/* Print all accumulated warnings of a program */

void gp_warn_print_all(FILE *out) {

	struct warnings_s *nextwarning = allwarnings ;
	struct warnings_s *oldwarning = allwarnings ;

	if(allwarnings != NULL) {

		if(html) fprintf(out,"<PRE>") ;
		fprintf(out,"\n# Warnings issued during the execution of the program:\n") ;
		while(nextwarning != NULL) {
			fprintf(out,"# %s\n",nextwarning->warn) ;
			oldwarning = nextwarning ;
			nextwarning = nextwarning->next ;
			free(oldwarning) ;
		}
	}
	if(html) fprintf(out,"</PRE>") ;

}


/* Standard warning message */

void gp_warn(char *message, ...) {
	va_list vl ;
	FILE *output = stderr ;

	va_start(vl,message) ;

	if( (!quiet || debug) && !html ) {
		fprintf(output,"%s: ", progname) ;
		vfprintf(output,message,vl) ;
		fprintf(output,"\n") ;
	}

	if(html) {

		if(allwarnings == NULL) {
			allwarnings = calloc(1,sizeof(*allwarnings) ) ;

			strcpy(allwarnings->warn,"-------- :\n") ;
			allwarnings->next = calloc(1,sizeof(*allwarnings)) ;
			currentwarning = allwarnings->next ;
			currentwarning->next = NULL ;
		}

		vsprintf(currentwarning->warn,message,vl) ;
		currentwarning->next = calloc(1,sizeof(*currentwarning) ) ;
		currentwarning = currentwarning->next ;
		currentwarning->next = NULL ;

	}
}


/*
 * Seldom used: reports an internal inconsistency (a bug in the package
 * itself) on stderr, asks the user for a bug report and terminates the
 * program with exit status 1.
 */

void gp_error_fatal() {
	static const char *const report[] = {
		"You have found a bug in the Genpak package\n",
		"Please write down what you have been doing\n",
		"(which program, what parameters etc.) and mail the\n",
		"bug report to january@bioinformatics.org. Thanks!\n"
	} ;
	size_t line ;

	for(line = 0 ; line < sizeof(report)/sizeof(report[0]) ; line++)
		fputs(report[line],stderr) ;

	exit(1) ;
}




/* =======================================================================
 * Here the actual work of reading a sequence file is done.
 *
 * in      - input stream positioned in a fasta file
 * s_start - number of the first residue to read (1-based)
 * s_end   - number of the last residue to read;
 *           s_start == s_end == 0 if the whole sequence should be read.
 *           If s_start > s_end the fragment between the two positions is
 *           read and returned reversed through gp_seq_reverse().
 * numseq  - number of sequences to read. 0 to read all sequences.
 *           (multiple sequence loading not implemented yet, the value is
 *           ignored)
 *
 * Everything up to the next '>' is skipped, the rest of that line becomes
 * the sequence name, and the residues that follow are collected with
 * white space removed. A '>' that starts the next record is pushed back
 * onto the stream.
 *
 * Returns a newly allocated sekw owned by the caller, or NULL if no
 * header line or no residues were found. Reaching the end of the record
 * before s_end, or running out of memory, ends the program through
 * gp_error().
 * =======================================================================   */

sekw* gp_seq_read_fragment(FILE *in, long s_start, long s_end, int numseq)
{
	sekw *res, *tmp ; /* The struct into which sequence is read */
	/* dlug counts the residues seen so far, licznik those stored */
	long start, end, dlug = 0, licznik = 0 ;
	/* this is for keeping track of how much memory has been allocated */
	long allocated = 10000 ;
	int whole = FALSE, rev ;

	/* some temporary variables */
	int temp = 0, check = TRUE ;
	char bufor[BUFSIZ], *tc, *grown ;

	(void) numseq ; /* not implemented yet, see above */

	if(s_start > s_end) {
		rev = TRUE ;
		start = s_end ;
		end = s_start ;
	} else {
		rev = FALSE ;
		start = s_start ;
		end = s_end ;
	}

	if((start == end) && (start == 0)) { 
		whole = TRUE ; 
		start = 1 ; 
	}

	if(debug && whole) gp_warn("Reading the whole sequence") ;
	if(debug) gp_warn("Readseq: Reading sequence from %li to %li",start,end) ;

	/*
	 * skipping everything in front of the fasta header; every character
	 * is tested exactly once, so a '>' can not be stepped over
	 */
	do {
		if( (temp = fgetc(in)) == EOF) return(NULL) ;
	} while(temp != '>') ;
	
	/* reading sequence name */
	if( fgets(bufor, BUFSIZ, in) == NULL ) return(NULL) ;

	/* allocating memory for the sequence; next and prev start as NULL */
	if( (res = calloc(1, sizeof(*res))) == NULL)
		gp_error("gp_sequence_read: could not allocate enough memory") ;

	/* discarding end of line */
	if( (tc = strchr(bufor, '\n')) ) *tc = '\0' ;
	/* reading the name of the sequence */
	if( strlen(bufor) > 0) res->name = gp_strdup(bufor) ;
	else res->name = gp_strdup("Unknown\0") ;

	/* reading the actual sequence */
	if(debug) gp_warn("gp_sequence_read: reading the sequence %s\n",res->name) ;

	/* First, allocating space for the beginning of the sequence */
	if( (res->sequ = malloc(allocated + 1)) == NULL)
		gp_error("gp_sequence_read: could not allocate enough memory") ;
	*res->sequ = '\0' ;
	tc = res->sequ ;

	if(debug) gp_warn("Reading the sequence %s...\n",res->name) ;

	/* As long as we didn't got the whole sequence */
	while(check && (whole == TRUE || licznik < (end - start + 1))) {

		temp = fgetc(in) ;

		/*if end of file or a next sequence is reached */
		if(temp == EOF || temp == '>') {
			if (debug) gp_warn("Got to end of the file") ;
			
			/* EOF, but maybe the sequence is incomplete */
			if(whole != TRUE) gp_error("Unexpected end of sequence %s", res->name) ;
			else check = FALSE ;

			/* ignoring blanks */
		} else if( !isspace(temp) ) {
			dlug++ ;

			/* Have we started the reading? */
			if(dlug >= start) {
				*tc = temp ;
				licznik++ ;
				tc++ ;

				/* buffer full: grow it, keeping the old block on failure */
				if(licznik >= allocated) {
					allocated += 10000 ;
					grown = realloc(res->sequ, allocated + 1) ;
					if(grown == NULL)
						gp_error("gp_sequence_read: could not allocate enough memory") ;
					res->sequ = grown ;
					tc = res->sequ + licznik ;
				}
			}
		}
	}

	/* the header of the next record belongs to the next call */
	if(temp == '>') ungetc(temp, in) ;

	*tc = '\0' ;
	res->leng = licznik ;

	if(debug) gp_warn("Sequence length %li %lu",res->leng,
		(unsigned long) strlen(res->sequ)) ;

	if( (res->type = gp_seq_get_type(res)) == WRONG)
		gp_warn("Sequence %s is not DNA/RNA/Protein", res->name) ;
	
	if(debug) gp_warn("Sequence type %s",stypes[res->type]) ;

	if(res->leng == 0) {
		gp_warn("No sequence read") ;
		/* release the strings AND the structure itself */
		free(res->name) ;
		free(res->sequ) ;
		free(res) ;
		return(NULL) ;
	}

	if(rev) {
		if(debug) gp_warn("reversing the sequence") ;
		tmp = res ;
		res = gp_seq_reverse(tmp) ;
		if(res != NULL) res->type = tmp->type ;
		free(tmp->name) ;
		free(tmp->sequ) ;
		free(tmp) ;
		if(res == NULL) return(NULL) ;
	}

	if(debug) gp_warn("Sequence %s read succesfully",res->name) ;
	return(res) ;
}



/*
 * reads whole sequence
 *
 * Reads the next fasta record from in: everything up to the next '>' is
 * skipped, the rest of that line becomes the sequence name and all
 * following non-blank characters up to the next '>' or the end of the
 * file form the sequence. A '>' that starts the next record is pushed
 * back onto the stream, so the function can be called repeatedly.
 *
 * Returns a newly allocated sekw owned by the caller (next and prev are
 * NULL), or NULL if no further header line or no residues were found.
 * Running out of memory ends the program through gp_error().
 */
sekw* gp_seq_read(FILE *in) {
	sekw *res ; /* The struct into which sequence is read */
	long allocated = 1000000, licznik = 0 ; 
	char bufor[BUFSIZ], *tc, *grown ;
	int temp = 0 ;

	if(debug) gp_warn("gp_seq_read: Reading the whole sequence") ;

	/*
	 * skipping everything in front of the fasta header; every character
	 * is tested exactly once, so a '>' can not be stepped over
	 */
	do {
		if( (temp = fgetc(in)) == EOF) return(NULL) ;
	} while(temp != '>') ;
	
	/* reading sequence name */
	if( fgets(bufor, BUFSIZ, in) == NULL ) return(NULL) ;

	/* allocating memory for the sequence */
	if( (res = calloc(1, sizeof(*res))) == NULL)
		gp_error("gp_sequence_read: could not allocate enough memory") ;

	res->next = NULL ;
	res->prev = NULL ;
	res->leng = 0 ;

	/* discarding end of line */
	if( (tc = strchr(bufor, '\n')) ) *tc = '\0' ;
	/* reading the name of the sequence */
	if( strlen(bufor) > 0) res->name = gp_strdup(bufor) ;
	else res->name = gp_strdup("Unknown\0") ;

	/* reading the actual sequence */
	if(debug) gp_warn("gp_sequence_read: reading the sequence %s\n",res->name) ;

	if( (res->sequ = malloc(allocated + 1)) == NULL)
		gp_error("gp_sequence_read: could not allocate enough memory") ;
	*res->sequ = '\0' ;
	tc = res->sequ ;

	/* As long as we didn't got the whole sequence */
	while( (temp = fgetc(in)) != EOF && temp != '>') {

		/* ignoring blanks */
		if(isspace(temp)) continue ;

		*tc = temp ;
		tc++ ;
		licznik++ ;

		/* buffer full: grow it, keeping the old block on failure */
		if(licznik >= allocated) {
			allocated += 100000 ;
			grown = realloc(res->sequ, allocated + 1) ;
			if(grown == NULL)
				gp_error("gp_sequence_read: could not allocate enough memory") ;
			res->sequ = grown ;
			tc = res->sequ + licznik ;
		}
	}

	/* the header of the next record belongs to the next call */
	if(temp == '>') ungetc(temp,in) ;

	*tc = '\0' ;
	if(debug) gp_warn("tc-sequ = %li, licznik = %li",
		(long) (tc - res->sequ), licznik) ;
	res->leng = licznik ;

	if(debug) gp_warn("Sequence length %li", res->leng) ;

	if( (res->type = gp_seq_get_type(res)) == WRONG)
		gp_warn("Sequence %s is not DNA/RNA/Protein",res->name) ;
	
	if(debug) gp_warn("Sequence type %s",stypes[res->type]) ;

	if(res->leng == 0) {
		gp_warn("No sequence read") ;
		/* release the strings AND the structure itself */
		free(res->name) ;
		free(res->sequ) ;
		free(res) ;
		return(NULL) ;
	}

	if(debug) gp_warn("Sequence %s read succesfully",res->name) ;
	return(res) ;

}


/*
 * Read all sequences found on the input stream into a doubly linked list
 * of sequences. gp_seq_read() is called until it returns NULL; the
 * records are chained in the order in which they were read.
 * Returns the first sequence of the list, or NULL if nothing was read.
 */
sekw* gp_seq_read_all(FILE *in) {
	sekw *head = NULL ;
	sekw *tail = NULL ;
	sekw *item ;

	for(item = gp_seq_read(in) ; item != NULL ; item = gp_seq_read(in)) {
		if(head == NULL) {
			/* first record becomes the head of the list */
			item->prev = NULL ;
			head = item ;
		} else {
			/* append behind the current tail */
			item->prev = tail ;
			tail->next = item ;
		}
		tail = item ;
	}

	return head ;
}


/*
 * Reverse the sequence provided. Note that every character is also
 * passed through gp_nucl_complement(), i.e. the result is the reverse
 * complement; characters without a complement become 0 in the output.
 *
 * The first s->leng characters of s->sequ are used. Returns a newly
 * allocated sekw (owned by the caller) with a copy of the name, the type
 * of s and next/prev set to NULL. Running out of memory ends the program
 * through gp_error().
 */

sekw* gp_seq_reverse(sekw* s) {
	sekw* res;
	long d, i;
	
	/* calloc leaves next and prev as NULL */
	res = calloc(1, sizeof(*res)) ;
	if(res == NULL)
		gp_error("gp_seq_reverse: could not allocate enough memory") ;

	d = s->leng ;
	res->sequ = malloc(d + 1) ;
	if(res->sequ == NULL)
		gp_error("gp_seq_reverse: could not allocate enough memory") ;

	res->name = gp_strdup(s->name) ;
	
	/* character i of the input ends up at the mirrored position */
	for(i = 0 ; i < d ; i++) {
		res->sequ[(d - 1 - i)] = gp_nucl_complement(s->sequ[i]) ;
	}

	res->sequ[d] = '\0' ;
	res->leng = d ;
	res->type = s->type ;

	return(res) ;
}




/*
 * This function prints out a sequence nicely formatted in the fasta style
 *
 * out   - stream to print to
 * s     - sequence to print
 * width - number of residues per line; 0 (or a negative value) prints
 *         the whole sequence on a single line
 *
 * The header line has the form ">name (length unit)". Of the name at
 * most MAXNAZWA-21 characters are printed, and only the part in front of
 * the first '('. The sequence itself is left untouched.
 * Always returns EXIT_SUCCESS.
 */

int gp_seq_print_fasta(FILE* out, sekw* s, int width) {
	size_t namelen, d, i, chunk ;
	const char *paren, *unit ;

	/* work out which part of the name to print, without modifying it */
	namelen = strlen(s->name) ;
	if(namelen >= (MAXNAZWA-21)) namelen = MAXNAZWA-21 ;

	paren = memchr(s->name,'(',namelen) ;
	if(paren != NULL) namelen = (size_t) (paren - s->name) ;

	if(s->type == RNA || s->type == DNA) unit = "bases" ;
	else if(s->type == PROTEIN) unit = "residues" ;
	else unit = "letters" ;
	
	d = strlen(s->sequ) ;
	fprintf(out, ">%.*s (%lu %s)\n", (int) namelen, s->name,
		(unsigned long) d, unit) ;

	if(width <= 0) {
		/* a negative width would never advance, so treat it like 0 */
		fprintf(out,"%s\n",s->sequ) ;
	} else {
		/* one fwrite per output line instead of one call per residue */
		for(i = 0 ; i < d ; i += chunk) {
			chunk = d - i ;
			if(chunk > (size_t) width) chunk = (size_t) width ;
			fwrite(s->sequ + i, 1, chunk, out) ;
			fputc('\n', out) ;
		}
	}

	return(EXIT_SUCCESS) ;

}



/*
 * This function tries to determine the sequence type
 *
 * The sequence is scanned character by character, ignoring case:
 *  - a character outside "0ABCDEFGHIKLMNPQRSTVYWU" makes it WRONG
 *  - while the type is still unknown, one of "0DEFHIKLMNPQRSVW" makes it
 *    PROTEIN and a 'U' makes it RNA
 *  - a protein-only letter or 'T' in RNA, or a 'U' in a protein, makes
 *    it WRONG
 * A sequence that is still unknown at the end is reported as DNA, which
 * includes the empty sequence. Diagnostics are printed only when debug
 * is set.
 */

sequence_type gp_seq_get_type(sekw *s) {
	size_t dlug, i ;
	sequence_type type = UNKNOWN ;
	int c ;

	dlug = strlen(s->sequ) ;

	for(i = 0 ; i < dlug ; i++) {
		/* toupper() is only defined for unsigned char values and EOF */
		c = toupper((unsigned char) s->sequ[i]) ;

		/* it has to be either RNA, or DNA, or protein */
		if(strchr("0ABCDEFGHIKLMNPQRSTVYWU",c) == NULL) {
			if(debug) gp_warn("Sequence %s has strange characters",s->name) ;
			type = WRONG ;
		}

		/* if it's unknown, let's try to find out what it is */
		if(type == UNKNOWN) {
			if(strchr("0DEFHIKLMNPQRSVW",c) != NULL) type = PROTEIN ;
			else if(c == 'U') type = RNA ;
		}

		/* if it's RNA, it can't be protein or T */
		if((type == RNA) && 
			 (strchr("0DEFHIKLMNPQRSVWT",c) != NULL)) {
			if(debug) gp_warn("I thought %s is RNA?!",s->name) ;
			type = WRONG ;
		}
		
		/* if it's a protein, it can't be U */
		if((type == PROTEIN) &&
			 (c == 'U')) {
			if(debug) gp_warn("I thought %s is protein?!",s->name) ;
			type = WRONG ;
		}

	}
	if(type == UNKNOWN) type = DNA ;
	return(type) ;
}



/* 
 * Comparing two nucleotides isn't easy, because some sequences contain
 * "wildcards", like R signifying A or G etc.
 *
 * Both characters are converted to upper case. For the plain bases and
 * the two-base codes, c1 selects the list of characters it is compatible
 * with and c2 is looked up in that list. The three-base codes V, H, D
 * and B accept everything except the base(s) they exclude; N and X
 * accept everything. Any other c1 produces a warning and returns 0.
 * Returns TRUE if c2 is compatible with c1, FALSE otherwise.
 */

int Compare(char c1, char c2) {

	/* characters c2 may take for the selected c1 */
	const char *partners = NULL ;

	c1= toupper(c1) ;
	c2= toupper(c2) ;

	switch(c1) {
		/* plain bases */
		case 'A': partners = "AMRWVHDN" ; break ;
		case 'C': partners = "CMSYVHBN" ; break ;
		case 'G': partners = "GRSKVDBN" ; break ;
		case 'U':
		case 'T': partners = "UTWYKHDBN" ; break ;

		/* two-base codes: union of the lists of both bases */
		case 'M': partners = "AMRWVHDNCMSYVHBN" ; break ;
		case 'R': partners = "AMRWVHDNGRSKVDBN" ; break ;
		case 'W': partners = "UAMRWVHDNTWYKHDBN" ; break ;
		case 'S': partners = "CMSYVHBNGRSKVDBN" ; break ;
		case 'Y': partners = "UCMSYVHBNTWYKHDBN" ; break ;
		case 'K': partners = "UGRSKVDBNTWYKHDBN" ; break ;

		/* three-base codes: everything but the excluded base */
		case 'V': return((c2 == 'T' || c2 == 'U') ? FALSE : TRUE) ;
		case 'H': return((c2 == 'G') ? FALSE : TRUE) ;
		case 'D': return((c2 == 'C') ? FALSE : TRUE) ;
		case 'B': return((c2 == 'A') ? FALSE : TRUE) ;

		/* full wildcards */
		case 'N':
		case 'X':
			return(TRUE) ;

		default:
			gp_warn("Wrong sequence characters :%c: and :%c: in comparison",c1,c2) ;
			return(0) ;
	}

	return((strchr(partners,c2) != NULL) ? TRUE : FALSE) ;
}


/* 
 * converts DNA/RNA sequence to protein.
 *
 * inseq    - nucleotide sequence; inseq->leng characters are used
 * intable  - codon table, see gp_codon_table_load_std()
 * start    - number (1-based) of the first nucleotide to start
 *            translation with; values below 1 are treated as 1
 * pedantic - if TRUE, NULL is returned instead of a sequence if the
 *            sequence is erronous, i.e. it does not start with a start
 *            codon or does not stop with a stop codon
 *
 * Translation proceeds codon by codon until a stop codon ('0' in the
 * table) is found or fewer than three nucleotides are left. Lower case
 * letters are accepted. A codon containing anything but A, C, G, T or U
 * cannot be looked up in the table and is translated to 'X'.
 *
 * The result is a newly allocated sekw of type PROTEIN owned by the
 * caller. Its name is the part of inseq's name in front of the first
 * '(' (shortened if necessary) followed by " (translated)". inseq is not
 * modified. Running out of memory ends the program through gp_error().
 */

sekw* gp_seq_dna_to_protein(sekw *inseq, codont* intable, long start, int pedantic) {

	/* j denotes the current position in the sequence. Aaread denotes the
	 * number of aa residues written in the sequence outseq */

	long dlugosc, j, aaread = 0 ;
	size_t baselen ;
	char name[MAXNAZWA] ;
	char t ;
	const char *paren ;
	sekw *outseq ;

	/* 
	 * The Conv matrix converts the nucleic acid letter to 
	 * apriopriate coordinate for the codont matrix; 99 marks letters
	 * which have no coordinate
	 */
	int Conv[128], coord[3], i, c, valid, stop_found = FALSE ;
	for(i = 0;i<128;i++) Conv[i] = 99 ;
	Conv['A'] = 0 ; Conv['C'] = 1; Conv['G'] = 2 ; Conv['T'] = 3 ; Conv['U'] = 3 ;

	if(debug) gp_warn("Dna2Protein: from %li, pedantic = %i",start,pedantic) ;

	if(start < 1) start = 1 ;
	dlugosc = inseq->leng ;
	j = start - 1 ;

	/* first triplet: is it a complete and valid start codon? */
	if(pedantic &&
	   (j + 3 > dlugosc || gp_codon_isstart(inseq,j) == FALSE)) {
		if(debug)
			gp_warn("D2P: %s does not start with a start codon!", inseq->name) ;
		return(NULL) ;
	}

	/*
	 * Building the name in a local buffer: the base name is cut so that
	 * the 13 characters of the suffix and the NUL always fit
	 */
	paren = strchr(inseq->name,'(') ;
	baselen = (paren != NULL) ? (size_t) (paren - inseq->name)
	                          : strlen(inseq->name) ;
	if(baselen > MAXNAZWA - 14) baselen = MAXNAZWA - 14 ;
	snprintf(name,sizeof(name),"%.*s (translated)",(int) baselen,inseq->name) ;

	/* Allocating enough memory for the outseq */
	outseq = calloc(1,sizeof(*outseq)) ;
	if(outseq == NULL)
		gp_error("gp_seq_dna_to_protein: could not allocate enough memory") ;

	outseq->sequ = calloc((dlugosc > 0 ? dlugosc : 0)/3 + 2,sizeof(char)) ;
	if(outseq->sequ == NULL)
		gp_error("gp_seq_dna_to_protein: could not allocate enough memory") ;

	outseq->type = PROTEIN ;
	outseq->name = gp_strdup(name) ;

	/* translating complete triplets only, never reading past the end */
	while(j + 3 <= dlugosc) {
		valid = TRUE ;
		for(i = 0;i<3;i++) {
			c = (unsigned char) inseq->sequ[j + i] ;
			coord[i] = (c < 128) ? Conv[toupper(c)] : 99 ;
			if(coord[i] > 3) valid = FALSE ;
		}
		j += 3 ;

		/* only valid coordinates may be used as table indices */
		t = valid ? intable->tbl[coord[0]][coord[1]][coord[2]] : 'X' ;

		if(t == '0') {
			stop_found = TRUE ;
			break ;
		}

		outseq->sequ[aaread] = t ;
		aaread++ ;
	}
	
	if(pedantic && stop_found == FALSE) {
		if(debug)
			gp_warn("D2P: No stop codon found in %s",inseq->name) ;
		free(outseq->name) ;
		free(outseq->sequ) ;
		free(outseq) ;
		return(NULL) ;
	}

	outseq->sequ[aaread] = '\0' ;
	outseq->leng = aaread ;

	if(debug) gp_warn("Dna2Prot: %s translated succesfully, %li residues",
		inseq->name,aaread) ;
	return(outseq) ;

}


/* 
 * The standard codon table is loaded. In Genpak programs, A= 0, C= 1, G= 2
 * and T/U= 3; the entry tbl[x][y][z] holds the one letter code of the
 * amino acid encoded by the codon xyz, and '0' stands for a stop codon.
 *
 * Returns a newly allocated table which the caller has to free().
 * Running out of memory ends the program through gp_error().
 */

codont * gp_codon_table_load_std() {

	/*
	 * The 64 codons in table order: the first base changes slowest, the
	 * third fastest, each running through A, C, G, T. One group of four
	 * letters therefore shares the first two bases:
	 *   AA AC AG AT   CA CC CG CT   GA GC GG GT   TA TC TG TT
	 * TAC and TAT code for tyrosine (Y), not threonine (T).
	 */
	static const char std_code[] =
		"KNKN" "TTTT" "RSRS" "IIMI"
		"QHQH" "PPPP" "RRRR" "LLLL"
		"EDED" "AAAA" "GGGG" "VVVV"
		"0Y0Y" "SSSS" "0CWC" "LFLF" ;

	codont* outtable ;
	int first, second, third ;

	outtable = calloc(1,sizeof(*outtable)) ;
	if(outtable == NULL)
		gp_error("gp_codon_table_load_std: could not allocate enough memory") ;

	for(first = 0 ; first < 4 ; first++)
		for(second = 0 ; second < 4 ; second++)
			for(third = 0 ; third < 4 ; third++)
				outtable->tbl[first][second][third] =
					std_code[16*first + 4*second + third] ;

	return(outtable) ;

}


/* 
 * Initializing the table one2three, which stores conversions from one
 * letter to three letter amino acid codes. The table is indexed by the
 * one letter code; '0' (the stop symbol) is mapped to "STP". Entries for
 * all other characters are not touched.
 */

void gp_codon_init_conversion() {

	static const struct {
		char letter ;        /* one letter code, index into one2three */
		const char *abbrev ; /* three letter code */
	} names[] = {
		{'0',"STP"}, {'G',"Gly"}, {'A',"Ala"}, {'V',"Val"}, {'L',"Leu"},
		{'I',"Ile"}, {'P',"Pro"}, {'C',"Cys"}, {'M',"Met"}, {'F',"Phe"},
		{'W',"Trp"}, {'S',"Ser"}, {'T',"Thr"}, {'Y',"Tyr"}, {'N',"Asn"},
		{'Q',"Gln"}, {'K',"Lys"}, {'R',"Arg"}, {'H',"His"}, {'D',"Asp"},
		{'E',"Glu"}
	} ;
	size_t k ;

	for(k = 0 ; k < sizeof(names)/sizeof(names[0]) ; k++)
		strcpy(one2three[(int) names[k].letter],names[k].abbrev) ;

}

/*
 * Helper of gp_seq_find(): appends a hit with the given coordinates to
 * the list starting at *head, whose last node is tail (ignored while the
 * list is empty), and returns the new last node.
 */
static position_s *gp_seq_find_append(position_s **head, position_s *tail,
                                      long start, long end) {
	position_s *node = calloc(1,sizeof(*node)) ;

	if(node == NULL)
		gp_error("gp_seq_find: could not allocate enough memory") ;

	node->start = start ;
	node->end = end ;
	node->next = NULL ;

	if(*head == NULL) *head = node ;
	else tail->next = node ;

	return node ;
}


/*
 * Finds all occurencies of sequence "query" in sequence "subject"
 *
 * Both strands are searched: at every position of the subject the query
 * and its reverse complement (gp_seq_reverse()) are compared with
 * Compare(), so ambiguity codes are honoured. The leng members of both
 * sequences are refreshed from the actual string lengths.
 *
 * Returns a list of hits in 1-based, inclusive coordinates which the
 * caller has to free, or NULL if there is no hit or the query is longer
 * than the subject. Forward hits have start <= end; hits of the reverse
 * complement are stored with start > end. A position matching on both
 * strands is reported twice.
 */

position_s *gp_seq_find(sekw *q, sekw *s) {

	position_s *result = NULL ;
	position_s *temp = NULL ;
	long pos1, i, last ;
	sekw *q_rev ;

	s->leng = strlen(s->sequ) ;
	q->leng = strlen(q->sequ) ;

	if(q->leng > s->leng) return NULL ;

	if(debug) gp_warn("FindSeq: Starting search for %s",q->sequ) ;
	/* preparing the reverse needle */
	q_rev = gp_seq_reverse(q) ;
	if(q_rev == NULL) return NULL ;

	last = s->leng - q->leng ;  /* last position the query still fits at */

	/* The main search loop */
	for(pos1 = 0 ; pos1 <= last ; pos1++) {
		
		/* searching forwards */
		i = 0 ;
		while(i<q->leng && Compare(q->sequ[i],s->sequ[pos1+i])) i++ ;
		if(i == q->leng)
			temp = gp_seq_find_append(&result,temp,pos1+1,pos1+q->leng) ;

		/* searching backwards */
		i = 0 ;
		while(i<q->leng && Compare(q_rev->sequ[i],s->sequ[pos1+i])) i++ ;
		if(i == q->leng)
			temp = gp_seq_find_append(&result,temp,pos1+q->leng,pos1+1) ;
	
	}

	/* the reverse needle owns its strings, release them as well */
	free(q_rev->name) ;
	free(q_rev->sequ) ;
	free(q_rev) ;
	return(result) ;

}



/*
 * Estimates the variance of a sample from its running totals:
 *   sum          - sum of the values
 *   sumofsquares - sum of the squared values
 *   number       - number of values
 * The estimate is (sumofsquares - sum*sum/number) / (number - 1).
 * At least two values are needed; otherwise a warning is issued and -1
 * is returned.
 */
double gp_variance_e(double sum, double sumofsquares, int number) {

	/* a single value (or none) has no sample variance */
	if(number < 2) { 
		gp_warn("Wrong parameter for function gp_variance_e()!") ;
		return -1 ;
	}

	return ((sumofsquares - (sum*sum)/number)/(number-1)) ;

}


/* 
 * The following code is taken from getopt.c written by Gregory Pietsch,
 * modified to suit the needs of my package. I have removed the
 * "getopt_long" functions and changed the name of the getopt() to
 * gp_getopt() [January Weiner]
 */

/****************************************************************************

getopt.c - Read command line options

AUTHOR: Gregory Pietsch
CREATED Fri Jan 10 21:13:05 1997

DESCRIPTION:

The getopt() function parses the command line arguments.  Its arguments argc
and argv are the argument count and array as passed to the main() function
on program invocation.  The argument optstring is a list of available option
characters.  If such a character is followed by a colon (`:'), the option
takes an argument, which is placed in optarg.  If such a character is
followed by two colons, the option takes an optional argument, which is
placed in optarg.  If the option does not take an argument, optarg is NULL.

The external variable optind is the index of the next array element of argv
to be processed; it communicates from one call to the next which element to
process.

The getopt_long() function works like getopt() except that it also accepts
long options started by two dashes `--'.  If these take values, it is either
in the form

--arg = value

 or

--arg value

It takes the additional arguments longopts which is a pointer to the first
element of an array of type GETOPT_LONG_OPTION_T.  The last element of the
array has to be filled with NULL for the name field.

The longind pointer points to the index of the current long option relative
to longopts if it is non-NULL.

The getopt() function returns the option character if the option was found
successfully, `:' if there was a missing parameter for one of the options,
`?' for an unknown option character, and EOF for the end of the option list.

The getopt_long() function's return value is described in the header file.

The function getopt_long_only() is identical to getopt_long(), except that a
plus sign `+' can introduce long options as well as `--'.

The following describes how to deal with options that follow non-option
argv-elements.

If the caller did not specify anything, the default is REQUIRE_ORDER if the
environment variable POSIXLY_CORRECT is defined, PERMUTE otherwise.

REQUIRE_ORDER means don't recognize them as options; stop option processing
when the first non-option is seen.  This is what Unix does.  This mode of
operation is selected by either setting the environment variable
POSIXLY_CORRECT, or using `+' as the first character of the optstring
parameter.

PERMUTE is the default.  We permute the contents of ARGV as we scan, so that
eventually all the non-options are at the end.  This allows options to be
given in any order, even with programs that were not written to expect this.

RETURN_IN_ORDER is an option available to programs that were written to
expect options and other argv-elements in any order and that care about the
ordering of the two.  We describe each non-option argv-element as if it were
the argument of an option with character code 1.  Using `-' as the first
character of the optstring parameter selects this mode of operation.

The special argument `--' forces an end of option-scanning regardless of the
value of ordering.  In the case of RETURN_IN_ORDER, only `--' can cause
getopt() and friends to return EOF with optind != argc.

COPYRIGHT NOTICE AND DISCLAIMER:

Copyright (C) 1997 Gregory Pietsch

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write:

The Free Software Foundation, Inc.
675 Massachusetts Avenue
Cambridge, Massachusetts  02139  USA

Here's my address, should one need to contact me by paper mail:

Gregory Pietsch
211 Pingree Avenue
Ewing, New Jersey  08618  USA

****************************************************************************/

#ifndef GETOPT_H
#include "getopt.h"
#endif

/* macros */

/* functions */

/*
 * reverse_argv_elements:  flips, in place, the order of the first num
 * pointers stored at argv; fewer than two elements need no work
 */
static void reverse_argv_elements(char **argv, size_t num)
{
    char **front;
    char **back;
    char *held;

    if (num < 2)
	return;

    /* walk inwards from both ends, exchanging as we go */
    front = argv;
    back = argv + (num - 1);
    while (front < back) {
	held = *front;
	*front = *back;
	*back = held;
	front++;
	back--;
    }
}

/*
 * permute: exchanges two adjacent runs of argv-elements -- len1 entries
 * followed by len2 entries -- so the second run ends up first.  The order
 * inside each run is kept.
 */
static void permute(char **argv, size_t len1, size_t len2)
{
    const size_t total = len1 + len2;

    /*
     * Rotation by three reversals: flipping the whole span puts the runs in
     * their final places but backwards; flipping each run once more
     * restores its internal order.
     */
    reverse_argv_elements(argv, total);
    reverse_argv_elements(argv, len2);
    reverse_argv_elements(argv + len2, len1);
}

/*
 * is_option: is this argv-element an option or the end of the option list?
 *
 * Returns 1 for a NULL element (end of the list) and for an element that
 * starts with `-' -- or with `+' when only is nonzero -- and has at least
 * one more character.  Returns 0 otherwise.
 *
 * A bare `-' (or bare `+') carries no option letter and is therefore an
 * ordinary argument, commonly used to mean standard input.  Treating it as
 * an option would make the caller look up the string terminator as an
 * option letter and read past the end of the element.
 */
static int is_option(char *argv_element, int only)
{
    char lead;

    if (argv_element == NULL)
	return 1;

    lead = argv_element[0];
    if (lead != '-' && !(only && lead == '+'))
	return 0;

    /* the introducer must be followed by something to name an option */
    return argv_element[1] != '\0';
}

/*
 * getopt_internal:  the function that does all the dirty work
 *
 * Examines argv[optind] and returns the next option found there.
 *
 *   argc, argv  argument vector being scanned; in PERMUTE mode argv is
 *               reordered in place so that non-options end up after the
 *               options
 *   shortopts   short option letters; `x:' takes a required argument and
 *               `x::' an optional one; a leading `-' selects
 *               RETURN_IN_ORDER, a leading `+' selects REQUIRE_ORDER
 *   longopts    table of long options terminated by an entry whose name
 *               is NULL, or NULL when there are none
 *   longind     if not NULL, receives the index of the matched long option
 *   only        if nonzero, `+' introduces an option as well as `-'
 *
 * Returns the option character; for a long option its val, or 0 after
 * storing val through a non-NULL flag; 1 for a non-option in
 * RETURN_IN_ORDER mode; '?' for an invalid, unrecognized or ambiguous
 * option (or unusable parameters); ':' when a required argument is
 * missing; EOF when there is nothing left to scan.  optarg, optind and
 * optopt are updated as a side effect; diagnostics are written to stderr
 * only when opterr is nonzero.
 */
static int getopt_internal(int argc, char **argv, char *shortopts,
		 GETOPT_LONG_OPTION_T * longopts, int *longind, int only)
{
    GETOPT_ORDERING_T ordering = PERMUTE;
    /* index of the next character to examine inside argv[optind] */
    static size_t optwhere = 0;
    /*
     * These two are static because a bundle such as `-ab' is handed out
     * over several calls: the non-options skipped in front of it can only
     * be moved once its last letter has been returned, so the bookkeeping
     * has to survive from the call that skipped them.
     */
    static size_t permute_from = 0;
    static int num_nonopts = 0;
    int optindex = 0;
    size_t match_chars = 0;
    char *possible_arg = NULL;
    int longopt_match = -1;
    int second_match = -1;
    int double_dash = 0;
    int has_arg = -1;
    char *cp = NULL;
    char opt_ch = '\0';
    int arg_next = 0;
    int retval = 0;

    /* first, deal with silly parameters and easy stuff */
    if (argc == 0 || argv == NULL || (shortopts == NULL && longopts == NULL))
	return (optopt = '?');

    /*
     * First-time (or rescan) setup has to come before argv[optind] is
     * looked at, otherwise the checks below inspect argv[0].  optwhere is
     * still 0 if the caller set optind itself before the first call.
     */
    if (optind == 0) {
	optind = 1;
	optwhere = 1;
    }
    else if (optwhere == 0)
	optwhere = 1;

    if (optind >= argc || argv[optind] == NULL)
	return EOF;
    if (strcmp(argv[optind], "--") == 0) {
	/* no more options */
	optind++;
	return EOF;
    }

    /* define ordering */
    if (shortopts != NULL && (*shortopts == '-' || *shortopts == '+')) {
	ordering = (*shortopts == '-') ? RETURN_IN_ORDER : REQUIRE_ORDER;
	shortopts++;
    }
    else
	ordering = (getenv("POSIXLY_CORRECT") != NULL) ? REQUIRE_ORDER :
	    PERMUTE;

    /*
     * based on ordering, find our next option, if we're at the beginning of
     * one
     */
    if (optwhere == 1) {
	num_nonopts = 0;
	switch (ordering) {
	case PERMUTE:
	    permute_from = (size_t) optind;
	    /* bounded by argc so an argv without a NULL terminator is safe */
	    while (optind < argc && !is_option(argv[optind], only)) {
		optind++;
		num_nonopts++;
	    }
	    if (optind >= argc || argv[optind] == NULL) {
		/* no more options */
		optind = (int) permute_from;
		num_nonopts = 0;
		return EOF;
	    }
	    else if (strcmp(argv[optind], "--") == 0) {
		/* no more options, but have to get `--' out of the way */
		permute(argv + permute_from, (size_t) num_nonopts, 1);
		optind = (int) (permute_from + 1);
		num_nonopts = 0;
		return EOF;
	    }
	    break;
	case RETURN_IN_ORDER:
	    if (!is_option(argv[optind], only)) {
		/* non-option */
		optarg = argv[optind++];
		return (optopt = 1);
	    }
	    break;
	case REQUIRE_ORDER:
	    if (!is_option(argv[optind], only))
		return EOF;
	    break;
	}
    }
    /* we've got an option, so parse it */

    /* first, is it a long option? */
    if (longopts != NULL && optwhere == 1) {
	double_dash = (strncmp(argv[optind], "--", 2) == 0);
	if (double_dash || (only && argv[optind][0] == '+')) {
	    /* handle long options */
	    if (double_dash)
		optwhere = 2;
	    possible_arg = strchr(argv[optind] + optwhere, '=');
	    if (possible_arg == NULL) {
		/* no = , so next argv might be arg */
		match_chars = strlen(argv[optind]);
		possible_arg = argv[optind] + match_chars;
		match_chars = match_chars - optwhere;
	    }
	    else
		match_chars = (size_t) (possible_arg - argv[optind])
		    - optwhere;
	    for (optindex = 0; longopts[optindex].name != NULL; optindex++) {
		/* strncmp stops at the end of a name shorter than the text */
		if (strncmp(argv[optind] + optwhere,
			    longopts[optindex].name,
			    match_chars) != 0)
		    continue;
		if (match_chars == strlen(longopts[optindex].name)) {
		    /* an exact match beats any abbreviation */
		    longopt_match = optindex;
		    second_match = -1;
		    break;
		}
		if (longopt_match < 0)
		    longopt_match = optindex;
		else if (second_match < 0)
		    second_match = optindex;
	    }
	    if (second_match >= 0) {
		/* we have ambiguous options */
		if (opterr)
		    fprintf(stderr, "%s: option `%s' is ambiguous "
			    "(could be `--%s' or `--%s')\n",
			    argv[0],
			    argv[optind],
			    longopts[longopt_match].name,
			    longopts[second_match].name);
		optwhere = 1;
		retval = (optopt = '?');
		goto consume;
	    }
	    if (longopt_match >= 0)
		has_arg = longopts[longopt_match].has_arg;
	}
    }

    if (longopt_match < 0) {
	/*
	 * A `--name' that matched nothing is an error of its own and is not
	 * re-read as a bundle of short options; the same goes for anything
	 * unmatched when there are no short options to try.
	 */
	if (double_dash || shortopts == NULL) {
	    if (opterr)
		fprintf(stderr, "%s: unrecognized option `%s'\n",
			argv[0], argv[optind]);
	    optwhere = 1;
	    retval = (optopt = '?');
	    goto consume;
	}

	/* if we didn't find a long option, is it a short option? */
	opt_ch = argv[optind][optwhere];
	/* neither the terminator nor the `:' marker is an option letter */
	if (opt_ch != '\0' && opt_ch != ':')
	    cp = strchr(shortopts, opt_ch);
	if (cp == NULL) {
	    /* couldn't find option in shortopts --
	     * if we're using RETURN_IN_ORDER, treat it like a
	     * non-option
	     */
	    if (optwhere == 1 && ordering == RETURN_IN_ORDER) {
		optarg = argv[optind++];
		return (optopt = 1);
	    }
	    if (opterr) {
		if (opt_ch == '\0')
		    fprintf(stderr, "%s: invalid option -- `%s'\n",
			    argv[0], argv[optind]);
		else
		    fprintf(stderr, "%s: invalid option -- `-%c'\n",
			    argv[0], opt_ch);
	    }
	    if (opt_ch != '\0')
		optwhere++;
	    if (argv[optind][optwhere] == '\0')
		optwhere = 1;
	    retval = (optopt = '?');
	    goto consume;
	}
	has_arg = ((cp[1] == ':')
		   ? ((cp[2] == ':') ? OPTIONAL_ARG : REQUIRED_ARG)
		   : NO_ARG);
	possible_arg = argv[optind] + optwhere + 1;
	optopt = *cp;
    }

    /* get argument and reset optwhere */
    switch (has_arg) {
    case OPTIONAL_ARG:
	if (*possible_arg == '=')
	    possible_arg++;
	optarg = (*possible_arg != '\0') ? possible_arg : NULL;
	/* with or without an argument, this element is used up */
	optwhere = 1;
	break;
    case REQUIRED_ARG:
	if (*possible_arg == '=')
	    possible_arg++;
	if (*possible_arg != '\0') {
	    optarg = possible_arg;
	    optwhere = 1;
	}
	else if (optind + 1 >= argc) {
	    if (opterr) {
		fprintf(stderr, "%s: argument required for option `",
			argv[0]);
		if (longopt_match >= 0)
		    fprintf(stderr, "--%s'\n", longopts[longopt_match].name);
		else
		    fprintf(stderr, "-%c'\n", *cp);
	    }
	    optwhere = 1;
	    retval = (optopt = ':');
	    goto consume;
	}
	else {
	    optarg = argv[optind + 1];
	    arg_next = 1;
	    optwhere = 1;
	}
	break;
    case NO_ARG:
    default:
	if (longopt_match < 0) {
	    optwhere++;
	    if (argv[optind][optwhere] == '\0')
		optwhere = 1;
	}
	else
	    optwhere = 1;
	optarg = NULL;
	break;
    }

    /* work out what to return */
    if (longopt_match >= 0) {
	if (longind != NULL)
	    *longind = longopt_match;
	if (longopts[longopt_match].flag != NULL) {
	    *(longopts[longopt_match].flag) = longopts[longopt_match].val;
	    retval = 0;
	}
	else
	    retval = longopts[longopt_match].val;
    }
    else
	retval = optopt;

  consume:
    /*
     * Once an element is used up (optwhere back at 1) step past it and its
     * separate argument, if any.  In PERMUTE mode they are first moved in
     * front of the non-options that were skipped to reach them; error
     * returns come through here too so those non-options are not lost.
     */
    if (optwhere == 1) {
	if (ordering == PERMUTE && num_nonopts != 0) {
	    permute(argv + permute_from, (size_t) num_nonopts,
		    (size_t) (1 + arg_next));
	    optind = (int) (permute_from + 1 + (size_t) arg_next);
	}
	else
	    optind = optind + 1 + arg_next;
	num_nonopts = 0;
    }

    /* finally return */
    return retval;
}

/*
 * gp_getopt: short-option front end to getopt_internal.  No long option
 * table and no long index are supplied, and the `only' flag is off.
 * Returns whatever getopt_internal returns for optstring.
 */
int gp_getopt(int argc, char **argv, char *optstring)
{
    int opt_code;

    opt_code = getopt_internal(argc, argv, optstring, NULL, NULL, 0);
    return opt_code;
}
/* end of file GETOPT.C */
