/* ----------------------------------------------------------------------
 * digest - restriction site analysis
 * Copyright (C) 2000 January Weiner III
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
 * USA.
 ---------------------------------------------------------------------- */

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>

#include "genpak.h"
#include "gp_getopt.h"

#define VERSION "0.1"
#define PROGNAME "gp_digest"
#ifndef ENZYMEFILE
#define ENZYMEFILE "enzyme.enz"
#endif

typedef struct {
	char name[11] ;
	char sequ[101] ;
	int leftend;
	int rightend;
	int temp ;
	char heat ;
	char buf[10] ;
} enzym ;

/*
 * Output format, selected by the first letter of the -o argument:
 *   POSITIONS ('p') - one cut position per line
 *   LENGTHS   ('l') - one fragment length per line
 *   FRAGMENTS ('f') - start, end and length of each fragment
 *   SEQUENCES ('s') - each fragment printed in FASTA format
 *   ASCII     ('a') - one-line character drawing of the sites (default)
 *   NUMONLY   ('n') - only the number of sites found
 */
typedef enum { 
	POSITIONS= 0,LENGTHS,FRAGMENTS,SEQUENCES,ASCII,NUMONLY 
} output_type ;

/* Name the program was started with; main() sets it to argv[0] and it is
 * used in the version, usage and help messages. */
char *progname ;

/*
 * Forward declarations of the functions defined in this file.
 */

/* Scan one sequence for sites of enzyme `e` and report them in format `o`. */
int FindRestrictionSites(FILE*out,sekw* s, enzym *e, output_type o) ;
/* Look up enzyme `se` in the enzyme file `in`; NULL when it is not found. */
enzym* LoadEnzyme(FILE* in, const char se[10]) ;
/* Emit one fragment / site in the requested output format. */
int PrintSite(FILE* out, sekw* s, long pos1, long pos2, enzym* e, output_type o) ;
/* Print a human readable description of an enzyme. */
int PrintEnzyme(FILE* out, enzym* e) ;
/* Print the usage message and exit.  Declared here because main() calls it
 * before its definition; redundant but harmless if genpak.h declares it too. */
void Help() ;

/*
 * Program entry point.
 *
 * Command line: [options] enzyme [ input file ] [ output file ]
 *
 * Parses the options, loads the named enzyme from the enzyme file (the one
 * given with -e, otherwise ENZYMEFILE) and calls FindRestrictionSites() for
 * every sequence returned by gp_seq_read_fragment().  Input defaults to
 * standard input and output to standard output when no file names are given.
 *
 * Returns 0 on completion.
 *
 * NOTE(review): the error paths rely on gp_error() terminating the program;
 * its definition is not visible in this file -- confirm.
 */
int main(int argc, char *argv[])
{
	extern int optind ;
	extern char *optarg ;
	FILE *in, *out, *enzfile = NULL ;
	sekw *inseq ;
	enzym *inenz ;
	char searche[11] ;   /* enzyme name: up to 10 characters plus NUL */
	int c,show_info = FALSE,altenzfile = FALSE;
	output_type outtype = ASCII ;
	char message[100] ;
	int errflg = 0 ;

	progname = argv[0] ;

	while ((c = gp_getopt(argc, argv, "o:ie:qvdh")) != EOF)
		switch(c) {
		case 'e':
			gp_warn("Loading enzymes from file %s",optarg) ;
			enzfile = gp_file_open(optarg,"r") ;
			altenzfile = TRUE ;
			break ;
		case 'o':
			/* Only the first letter of the argument is significant.  An
			 * empty argument ends up in the default branch as well. */
			switch(optarg[0]) {
			case 'a': outtype = ASCII ;     break ;
			case 'p': outtype = POSITIONS ; break ;
			case 's': outtype = SEQUENCES ; break ;
			case 'f': outtype = FRAGMENTS ; break ;
			case 'l': outtype = LENGTHS ;   break ;
			case 'n': outtype = NUMONLY ;   break ;
			default:
				gp_error("Wrong type for option '-o'") ;
				break ;
			}
			if(debug) gp_warn("Output type %i",outtype) ;
			break ;
		case 'i':
			show_info = TRUE ;
			break ;
		case 'q':
			quiet = TRUE ;
			break ;
		case 'v':
			fprintf(stderr,"%s version %s\n",progname,VERSION) ;
			exit(0) ;
			break ;
		case 'd':
			debug = TRUE ;
			gp_warn("Running in debug mode") ;
			break ;
		case 'h':
			Help() ;
			break ;
		default:
			errflg++ ;
			break;
		}

	/* NOTE(review): `message` is handed to gp_error() as its first argument;
	 * if gp_error() interprets that as a printf format, a '%' in the program
	 * or file name would be misread -- confirm its signature. */
	if(errflg) {
		snprintf(message,sizeof message,"Type '%s -h' for help",progname) ;
		gp_error(message) ;
	}

	/* Read the enzyme name to digest the sequences with */
	if(optind >= argc) {
		gp_error("You must give me at least the enzyme name") ;
	} else {
		if(strlen(argv[optind]) > 10) gp_error("Enzyme name too long") ;
		/* bounded copy, so searche cannot overflow even if gp_error returns */
		snprintf(searche,sizeof searche,"%s",argv[optind]) ;
	}

	optind++;

	/* open the file pointer to read the sequences
	 * from: standard input or a file provided? */
	if(optind >= argc) {
		in = stdin ;
	} else {
		in = fopen(argv[optind],"r") ;
		if(in == NULL) {
			/* the file name comes from the command line and may be long */
			snprintf(message,sizeof message,
				"Failed to open file %s for reading",argv[optind]) ;
			gp_error(message) ;
		}
	}

	/* opening the file pointer to write the output:
	 * standard output or file provided? */
	optind++ ;

	if(optind >= argc) {
		out = stdout ;
	} else {
		out = gp_file_open(argv[optind],"wb") ;
	}

	/* Fall back to the default enzyme file only when -e was not given;
	 * otherwise keep the file that was opened while parsing the options. */
	if(altenzfile == FALSE)
		enzfile = gp_file_open(ENZYMEFILE,"r") ;

	if(debug) gp_warn("Searching for enzyme %s",searche) ;

	inenz = LoadEnzyme(enzfile,searche) ;
	fclose(enzfile) ;   /* the enzyme file is not needed any more */

	if(inenz == NULL)
		gp_error("Could not load the enzyme from the enzyme file") ;

	if(debug) gp_warn("Enzyme found") ;
	if(show_info == TRUE) PrintEnzyme(out,inenz) ;

	/* NOTE(review): the sequences returned by gp_seq_read_fragment() are not
	 * released here; the matching release routine is not visible in this
	 * file. */
	while((inseq = gp_seq_read_fragment(in,0,0,0)) != NULL) {
		FindRestrictionSites(out,inseq,inenz,outtype) ;
	}

	free(inenz) ;   /* allocated with calloc() in LoadEnzyme() */
	fclose(out) ;
	fclose(in) ;
	return(0);
}


/*
 * Find restriction sites for a given sequence.
 *
 * Walks over s->sequ and, at every offset, compares it character by
 * character (using Compare()) with the recognition sequence e->sequ.
 * PrintSite() is called for each match and once more at the end for the
 * trailing fragment.
 *
 *   out    - stream the results are written to
 *   s      - sequence to be digested
 *   e      - enzyme whose recognition sequence is searched for
 *   output - output format; with NUMONLY the number of sites is written to
 *            `out`, otherwise it is reported through gp_warn()
 *
 * Always returns 0.
 */
int FindRestrictionSites(FILE* out,sekw* s, enzym* e, output_type output) {

	long i,j ;
	long numsites = 0,last_start = 1 ;
	long dl_s = strlen(s->sequ) ;   /* length of the sequence */
	long dl_e = strlen(e->sequ) ;   /* length of the recognition sequence */

	/* The last offset at which the site still fits is dl_s-dl_e, hence `<=`.
	 * An empty recognition sequence would match at every offset, so it is
	 * treated as matching nowhere. */
	for(i = 0;dl_e > 0 && i <= (dl_s-dl_e);i++) {

		j = 0;
		while(j<dl_e && Compare(s->sequ[i+j],e->sequ[j]) ) j++;

		/* Have we found the restriction site? */
		if(j == dl_e) {
			if(debug)
				gp_warn("digest: fragment from %li to %li",last_start,i+e->rightend) ;

			PrintSite(out,s,last_start,i,e,output) ;
			last_start = i ;
			numsites++;
		}

	}

	if(debug)
		gp_warn("digest: last fragment: %li to %li",last_start,dl_s) ;

	/* what remains after the last site, up to the end of the sequence */
	PrintSite(out,s,last_start,dl_s,e,output) ;

	if(output == NUMONLY) fprintf(out,"%li\n",numsites) ;
	else gp_warn("%li restriction sites found",numsites) ;

	return(0) ;

}



/*
 * Write one fragment of the digest in the requested output format.
 *
 *   out  - stream to write to
 *   s    - the sequence being digested
 *   pos1 - start of the fragment; the value 1 marks the start of the
 *          sequence, any other value is shifted by e->leftend
 *   pos2 - end of the fragment; a value equal to the sequence length marks
 *          the end of the sequence, any other value is shifted by
 *          e->leftend (left cut) and e->rightend (right cut)
 *   e    - the enzyme, used for its leftend / rightend offsets
 *   o    - output format:
 *            POSITIONS - the left cut position
 *            LENGTHS   - left cut position minus fragment start
 *            FRAGMENTS - start, right cut position and that length
 *            SEQUENCES - the fragment itself, in FASTA format
 *            ASCII     - a piece of a one-line drawing, scaled so that the
 *                        whole sequence spans about 70 characters
 *            NUMONLY   - nothing
 *
 * Always returns 0.
 */
int PrintSite(FILE* out, sekw*s, long pos1, 
	long pos2, enzym* e, output_type o) {

	long l = e->leftend ;
	long r = e->rightend ;
	long length = strlen(s->sequ) ;
	float width = 70 ;
	float step = length/width ;   /* sequence positions per drawn character */
	long p1,p2l,p2r,i,j ;
	sekw* f ;

	if(pos1 == 1) p1 = 1 ;
	else p1 = pos1+l ;

	if(pos2 == length) {
		p2l = pos2 ;
		p2r = pos2 ;
	} else {
		p2l = pos2+l ;
		p2r = pos2+r ;
	}

	switch(o) {

	case POSITIONS:
		fprintf(out,"%li\n",p2l) ;
		break ;

	case LENGTHS:
		fprintf(out,"%li\n",(p2l-p1)) ;
		break ;

	case FRAGMENTS:
		fprintf(out,"%li %li %li\n",p1,p2r,(p2l-p1)) ;
		break ;

	case SEQUENCES:
		/* NOTE(review): the copy returned by gp_seq_copy_frag() is not
		 * released; the matching release routine is not visible here. */
		f = gp_seq_copy_frag(s,p1+1,p2r) ;
		gp_seq_print_fasta(out,f,width) ;
		break ;

	case ASCII:
		if(pos1 == 1) fprintf(out,"|") ;

		/* An empty sequence gives step == 0; dividing by it and converting
		 * the result to long would be undefined, so draw nothing then. */
		if(step > 0) {
			i = (p1)/step ;
			j = (p2l)/step ;
		} else {
			i = 0 ;
			j = 0 ;
		}

		while(i+1<j) {
			fprintf(out,"-") ;
			i++ ;
		}

		if(pos2 >= length) fprintf(out,"|\n") ;
		else if(i<j) fprintf(out,"+") ;
		break ;

	default:
		/* NUMONLY: nothing is printed per site */
		break ;
	}

	return(0) ;

}




/*
 * Load the appropriate enzyme from an enzyme file.
 *
 * Reads `in` line by line.  Lines starting with '#' and empty lines are
 * skipped; every other line is parsed as
 *
 *     name sequence [temperature [heat [buffer]]]
 *
 * and the first line whose name equals `se` (compared without regard to
 * case) is used.  In the sequence, '>' and '<' mark the two cut positions:
 * they are removed from the stored sequence and their offsets are kept in
 * leftend and rightend (both stay 0 when the marker is absent).
 *
 *   in - enzyme file, open for reading
 *   se - name of the enzyme to look for
 *
 * Returns a newly allocated enzym that the caller releases with free(), or
 * NULL when the enzyme was not found or memory could not be allocated.
 */
enzym *LoadEnzyme(FILE* in, const char se[10]) {

	char bufor[BUFSIZ+1] ;
	char sequence[101] ;
	int found = FALSE ;
	int lineno = 0 ;
	int nfields,i,j,len ;
	enzym *e ;

	e = calloc(1,sizeof *e);
	if(e == NULL) return(NULL) ;

	/* reading enzyme file until file is finished or enzyme is found */
	while(found == FALSE && fgets(bufor,BUFSIZ,in) != NULL) {

		lineno++ ;

		if(bufor[0] == '#' || bufor[0] == '\n') continue ;

		/* Reset the optional fields first, so that a line which omits them
		 * does not inherit the values of a line parsed earlier. */
		e->temp = 0 ;
		e->heat = '?' ;
		e->buf[0] = '\0' ;

		/* The widths leave room for the terminating NUL: name[11],
		 * sequence[101], buf[10]. */
		nfields = sscanf(bufor,"%10s %100s %i %c %9s",
			e->name, sequence, &e->temp, &e->heat, e->buf) ;

		if(nfields < 2)
			gp_warn("Could not read enzyme file line %i",lineno) ;
		else if(strcasecmp(e->name,se) == 0)
			found = TRUE ;

	}

	/* the enzyme is not in the file */
	if(found == FALSE) {
		free(e) ;
		return(NULL) ;
	}

	/* Strip the cut markers and remember where they were. */
	len = strlen(sequence) ;
	for(i = 0,j = 0;i<len;i++) {

		switch(sequence[i]) {

			case '>':
				e->leftend = j ;
				break ;
			case '<':
				e->rightend = j ;
				break ;
			default:
				e->sequ[j] = sequence[i] ;
				j++;
				break ;

		}
	}

	e->sequ[j] = '\0';

	return(e) ;

}

/*
 * Print a description of enzyme `e` to `out`: its name, the recognition
 * sequence, both strands with a blank at the respective cut position (the
 * second strand passed through gp_nucl_complement()), and whichever of
 * temperature, heat behaviour and buffer are set.
 *
 * Always returns 0.
 */
int PrintEnzyme(FILE* out, enzym* e) {

	const size_t site_len = strlen(e->sequ) ;
	int k ;

	fprintf(out,"Enzyme: %s\n",e->name) ;
	fprintf(out,"Recognized sequence: %s\n",e->sequ) ;

	/* first strand, split at the left cut position */
	fputs("5'-",out) ;
	k = 0 ;
	while(k < e->leftend) {
		fputc(e->sequ[k],out) ;
		k++ ;
	}
	fputc(' ',out) ;
	while((size_t) k < site_len) {
		fputc(e->sequ[k],out) ;
		k++ ;
	}
	fputs("-3'\n",out) ;

	/* complementary strand, split at the right cut position */
	fputs("3'-",out) ;
	k = 0 ;
	while(k < e->rightend) {
		fputc(gp_nucl_complement(e->sequ[k]),out) ;
		k++ ;
	}
	fputc(' ',out) ;
	while((size_t) k < site_len) {
		fputc(gp_nucl_complement(e->sequ[k]),out) ;
		k++ ;
	}
	fputs("-5'\n",out) ;

	/* optional properties: printed only when a value is present */
	if(e->temp != 0)
		fprintf(out,"Optimal temperature: %i\n",e->temp) ;

	if(e->heat == '*')
		fputs("Can be heat deactivated\n",out) ;
	else if(e->heat != '?')
		fputs("Heat stable\n",out) ;

	if(e->buf[0] != '\0')
		fprintf(out,"Preferred buffer: %s\n",e->buf) ;

	return(0) ;

}



/*
 * Standard message: print the usage summary to standard output and
 * terminate the program with exit status 0.  Lists every option that
 * main() accepts.
 */
void Help()
{
	printf("\n");
	printf("%s version %s - restriction site analysis",PROGNAME,VERSION);
	printf("\n");
	printf("  Usage:\n");
	printf("     %s [options] enzyme [ input file ] [ output file ]\n",progname);
	printf("\n");
	printf("  Options:\n");
	printf("     -e file  : use alternate enzyme file\n");
	printf("     -o type  : output type:\n");
	printf("        -o a[scii] - simple ascii graphics (default)\n");
	printf("        -o p[osition] - print restriction sites positions\n");
	printf("        -o s[equences] - print restriction fragment sequences\n");
	printf("        -o f[ragments] - print restriction fragment positions\n");
	printf("        -o l[engths] - print restriction fragment lengths\n");
	printf("        -o n[umber] - print the total number of restriction sites\n");
	printf("     -i       : print information about the enzyme\n");
	printf("     -d       : debug mode, print debugging messages\n");
	printf("     -v       : print version information & exit\n");
	printf("     -h       : print this help screen & exit\n");
	printf("     -q       : quiet, suppress error messages\n\n");
	exit(0);
}


			
