/* ----------------------------------------------------------------------
 * qs -- quick sequence search
 * Copyright (C) 2000 January Weiner III
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
 * USA.
 ---------------------------------------------------------------------- */

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include "genpak.h"
#include "gp_getopt.h"

/* Version string printed by the -v option and in the help banner */
#define VERSION "0.2"
/* Program name printed in the help banner */
#define PROGNAME "gp_qs"
/* NOTE(review): VARIA is not referenced in the visible part of this file;
 * confirm whether it is still needed */
#define VARIA 2

/* Name the program was invoked with; main() sets it from argv[0] */
char *progname ;


/* Run-time settings collected from the command line by main() and handed
 * to SearchNeedle() */
typedef struct {
	/* stream the database sequences are read from (file or standard input) */
	gzFile *in ;
	/* stream the query sequences are read from; only set when -i is given */
	gzFile *query ;
	/* value given with -n: number of positions skipped at each end of the
	 * query when an exact match fails; FALSE (the default) = exact only */
	int degenerate ;
	/* TRUE when -i was given, i.e. queries come from standard input */
	int needle_on_stdin ;
	/* TRUE when the query itself is searched for; cleared by -m */
	int plus ;
	/* TRUE when the reversed query is searched for; cleared by -p */
	int minus ;
	/* stream the hits are written to (file or standard output) */
	FILE *out ; } opt_s ;

/* Forward declarations: both functions are called from main() before their
 * definitions further down in this file. */
int SearchNeedle(sekw* s, sekw* needle, opt_s opt) ;
void Help(void) ;

/*
 * Program entry point: parses the command line, sets up the query, database
 * and output streams, runs SearchNeedle() for every query/database pair and
 * closes the streams.
 *
 * Without -i the first positional argument is the query sequence itself and
 * the database is read from the next argument or, if there is none, from
 * standard input. With -i the queries are read from standard input and only
 * the first sequence of the database file is searched.
 *
 * The code relies on gp_error() not returning (it is used as a guard before
 * argv[optind] is dereferenced) -- NOTE(review): confirm against genpak.h.
 *
 * Returns EXIT_SUCCESS.
 */
int main(int argc, char *argv[]) {
	extern int optind ;
	extern char *optarg ;
	sekw *inseq = NULL, *needle = NULL ;
	int c, nmatch = 0 ;
	opt_s opt ;

	opt.in = NULL ;
	opt.query = NULL ;
	opt.out = NULL ;
	opt.plus = TRUE ;
	opt.minus = TRUE ;
	opt.degenerate = FALSE ;
	opt.needle_on_stdin = FALSE ;

	allwarnings = NULL ;
	progname = argv[0] ;
	textdomain("genpak") ;

	/* 'H' has to be listed here, otherwise the -H case below can never
	 * be reached */
	while ((c = gp_getopt(argc, argv, "n:impqvdhH")) != EOF)
		switch(c) {
		case 'n':
			/* a negative count would make SearchNeedle() index in front
			 * of the query, so it is rejected like an unreadable value */
			if(sscanf(optarg,"%i",&opt.degenerate) != 1 || opt.degenerate < 0)
				gp_error(_("Could not read parameter for option %s"), "-n") ;
			gp_warn(_("Allowing not exact binding, %i"),opt.degenerate) ;
			break ;
		case 'i':
			gp_warn(_("Reading query sequences from standard input")) ;
			opt.needle_on_stdin = TRUE ;
			break ;
		case 'm':
			gp_warn(_("Searching only on the minus strang")) ;
			opt.plus = FALSE ;
			break ;
		case 'p':
			gp_warn(_("Searching only on the plus strang")) ;
			opt.minus = FALSE ;
			break ;
		case 'q':
			quiet = TRUE ;
			break ;
		case 'v':
			fprintf(stderr,_("%s version %s\n"),progname,VERSION) ;
			exit(EXIT_SUCCESS) ;
			break ;
		case 'H':
			html = TRUE ;
			break ;
		case 'd':
			debug = TRUE ;
			gp_warn(_("Running in debug mode")) ;
			break ;
		case 'h':
			Help() ;
			break ;
		default:
			gp_error(_("Type '%s -h' for help"),progname) ;
			break;
		}

	if(optind >= argc) gp_error(_("Not enough parameters given")) ;

	/* 
	 * With -i the query sequences are read from standard input and the
	 * database sequence from a file.
	 * Otherwise the query is taken from the command line and the
	 * database sequences come either from a file or from standard input.
	 */

	if(opt.needle_on_stdin == FALSE) {
		/* build a sequence record around the command line argument */
		needle = calloc(1, sizeof(*needle)) ;
		if(needle == NULL) gp_error(_("Could not allocate memory")) ;
		needle->sequ = calloc(strlen(argv[optind]) + 1, sizeof(*needle->sequ)) ;
		if(needle->sequ == NULL) gp_error(_("Could not allocate memory")) ;
		strcpy(needle->sequ,argv[optind]) ;
		needle->name = gp_strdup("Unknown") ;
		needle->leng = strlen(needle->sequ) ;
		optind++ ;
		if(debug) gp_warn(_("Query: %s"),needle->sequ) ;
	} else {
		opt.query = stdin ;
		gp_warn(_("Reading query from stdandard input")) ;
	}

	/* open the stream the database sequences are read from:
	 * standard input or a file given on the command line */
	if(optind >= argc) {
		if(opt.needle_on_stdin == TRUE) 
			gp_error(_("Cannot read both query and database from standard input!")) ;
		opt.in = stdin ;
	} else {
		if(debug) gp_warn(_("Reading database from file %s"),argv[optind]) ;
		opt.in = gp_gzOpen(argv[optind],"r") ;
		optind++ ;
	}

	/* open the stream the output is written to:
	 * standard output or a file given on the command line */

	if(optind >= argc) {
		gp_warn(_("Writing to standard output")) ;
		opt.out = stdout ;
	} else {
		opt.out = gp_file_open(argv[optind],"wb") ;
		gp_warn(_("Writing to file %s"),argv[optind]) ;
	}

	if(opt.needle_on_stdin == TRUE) {

		/* one database sequence, many queries */
		if((inseq = gp_seq_read(opt.in)) == NULL) {
			gp_error(_("Could not read database file from input file")) ;
		}
		
		while((needle = gp_seq_read(opt.query)) != NULL) {
			if(debug) gp_warn("qs: %s vs. %s",needle->name,inseq->name) ;
			nmatch += SearchNeedle(inseq,needle,opt) ;
			gp_seq_free(needle) ;
		}

		gp_seq_free(inseq) ;

	} else {

		/* one query, many database sequences */
		while((inseq = gp_seq_read(opt.in)) != NULL) {
			if(debug) gp_warn("looking up in %s", inseq->name) ;
			nmatch += SearchNeedle(inseq, needle, opt) ;
			gp_seq_free(inseq) ;
		}

	}

	/* nmatch is the total over all searches performed above */
	if(debug) gp_warn("%i matches found",nmatch) ;
	if(html) gp_warn_print_all(opt.out) ;
	fclose(opt.out) ;
	gzclose(opt.in) ;
	return EXIT_SUCCESS;
}


/*
 * Masks both ends of a sequence for a degenerate search: the first and the
 * last deg characters of n->sequ are overwritten in place with 'N'.
 * For example, with deg = 2, CACACACACA becomes NNCACACANN.
 *
 * gp_error() is called when deg exceeds the length of the sequence.
 * Always returns EXIT_SUCCESS.
 */
int degenerate_sequence(sekw *n, int deg) {
	int len = strlen(n->sequ) ;
	int front = 0 ;
	int back = len - 1 ;

	if(deg > len) {
		gp_error(_("Value %i to large"), deg) ;
	}

	/* walk inwards from both ends at the same time */
	while(front < deg) {
		n->sequ[front] = 'N' ;
		n->sequ[back] = 'N' ;
		front++ ;
		back-- ;
	}

	return EXIT_SUCCESS ;
}

/* Returns non-zero when pat[i] and text[i] agree according to Compare()
 * for every i in [from, to) */
static int qs_match_range(const char *pat, const char *text, long from, long to) {
	long i ;

	for(i = from; i < to && Compare(pat[i], text[i]); i++) ;
	return i == to ;
}

/*
 * Searches for sequence n in sequence s and returns the number of matches.
 *
 * Every window of s that is as long as n is tested, including the last one
 * that ends at the end of s. For each hit one line is written to opt.out:
 *
 *     <from> <to> <E|N> <label>
 *
 * Positions are 1-based. Hits of n itself (opt.plus) are written with
 * from < to, hits of the reversed query returned by gp_seq_reverse()
 * (opt.minus) with from > to. 'E' marks a match over the full query
 * length. 'N' marks an inexact match: when the full comparison fails and
 * opt.degenerate is positive, the comparison is repeated with
 * opt.degenerate positions skipped at each end of the query, and the
 * reported range covers only the compared core. The label is built from at
 * most 10 characters of each sequence name: "<query> v. <database>".
 *
 * An empty query, and a value of opt.degenerate that is negative or leaves
 * no core to compare, produce no hits instead of meaningless ones.
 */
int SearchNeedle(sekw* s, sekw* n, opt_s opt) {
	long dls, dln, pos1 ;
	long varia = opt.degenerate ;
	int res = 0, inexact ;
	char name[31] ;
	sekw *rn ;

	/* the precisions bound the output to 10 + 4 + 10 characters plus the
	 * terminator, which always fits into name[] */
	sprintf(name, "%.10s v. %.10s", n->name, s->name) ;

	dls = strlen(s->sequ) ;
	dln = strlen(n->sequ) ;

	/* an empty query would "match" at every position */
	if(dln == 0) return 0 ;

	/* the inexact comparison needs at least one position between the
	 * skipped ends, and a non-negative number of skipped positions */
	inexact = (varia > 0 && dln > 2 * varia) ;

	/* preparing the reverse needle */
	rn = gp_seq_reverse(n) ;

	if(debug) gp_warn("reversed seq: rn->sequ is %s", rn->sequ) ;

	/* The main search loop; "<=" so that the window ending at the last
	 * character of s is tested as well */
	for(pos1 = 0; pos1 <= (dls - dln); pos1++) {
		const char *window = s->sequ + pos1 ;

		if(opt.plus) {
			if(qs_match_range(n->sequ, window, 0, dln)) {
				fprintf(opt.out,"%li %li E %s\n", pos1+1,pos1+dln,name) ;
				res++ ;
			} else if(inexact && qs_match_range(n->sequ, window, varia, dln - varia)) {
				fprintf(opt.out,"%li %li N %s\n", pos1+varia+1,pos1+dln-varia,name) ;
				res++ ;
			}
		}

		if(opt.minus) {
			if(qs_match_range(rn->sequ, window, 0, dln)) {
				fprintf(opt.out,"%li %li E %s\n", pos1+dln,pos1+1,name) ;
				res++ ;
			} else if(inexact && qs_match_range(rn->sequ, window, varia, dln - varia)) {
				fprintf(opt.out,"%li %li N %s\n", pos1+dln-varia,pos1+varia+1,name) ;
				res++ ;
			}
		}
	}

	gp_seq_free(rn) ;
	return res ;
}

/*
 * Standard message: prints the usage text, covering the options handled in
 * main(), to standard output and terminates the program with EXIT_SUCCESS.
 */

void Help(void)
{
printf(_("\n"
	"%s version %s - quick sequence search"
	"\n"
	"  Usage:\n"
	"     %s [options] sequence [ database file ] [ output file ]\n"
	"     %s -i [options] database file [ output file ]\n"
	"\n"
	"  Options:\n"
	"     -i       : query sequences are read from standard input\n"
	"     -n <N>   : allow not exact binding; if the whole query does not\n"
	"                match, ignore N nucleotides at each of its ends\n"
	"     -m       : search only on the minus strand\n"
	"     -p       : search only on the plus strand\n"
	"     -d       : run in debug mode\n"
	"     -v       : print version information & exit\n"
	"     -h       : print this help screen & exit\n"
	"     -q       : quiet; suppress error messages\n\n"
	""),PROGNAME,VERSION,progname,progname) ;
	exit(EXIT_SUCCESS);
}

