/* ----------------------------------------------------------------------
 * gc - program for determining the GC contents of a sequence.
 * Copyright (C) 2000 January Weiner III
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
 * USA.
 ---------------------------------------------------------------------- */

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

#include "genpak.h"
#include "gp_getopt.h"

#define PROGNAME "gp_gc"
#define VERSION "0.3"

/*
 * Run-time settings collected from the command line and handed to the
 * reporting code.
 */
typedef struct {
	FILE *in ;        /* stream the sequences are read from */
	FILE *out ;       /* stream the results are written to */
	int onlymean ;    /* set by -m: print only the mean GC% of all sequences */
	int total ;       /* set by -t: print the pooled GC% of all sequences */
	int distro ;      /* window size given with -D; FALSE (0) when not requested */
} opt_s ;

/* Returns the number of G and C residues found in sequence s. */
long ComputateGC(sekw *s) ;
/* Writes the windowed GC percentages of sequence s to options.out. */
int GCDistro(sekw *s, opt_s options) ;

/*
 * Program entry point.
 *
 * Parses the command-line options, opens the input stream (first
 * non-option argument, otherwise stdin) and the output stream (second
 * non-option argument, otherwise stdout), then reads sequences one by one
 * and reports, depending on the options:
 *   - the GC percentage of every sequence (default),
 *   - a windowed GC distribution per sequence (-D window),
 *   - only the mean GC percentage over all sequences (-m),
 *   - the pooled GC percentage of all sequences (-t).
 *
 * Returns EXIT_SUCCESS when the input has been processed.
 */
int main(int argc, char *argv[])
{
	extern int optind ;
	sekw* inseq ;
	double squares = 0.0, giece = 0.0, mean = 0.0 ;
	long currgc = 0, totalgc = 0, currnt = 0, totalnt = 0;
	int c,nseq = 0;
	opt_s options ;

	options.onlymean = FALSE;
	options.total = FALSE;
	options.distro = FALSE ;
	progname = argv[0] ;
	allwarnings = NULL ;
	textdomain("genpak") ;

	while ((c = gp_getopt(argc, argv, "D:tHmvhqd")) != EOF)
		switch(c) {
		case 'D':
			/* the window must be a positive integer: it is used both as a
			 * loop bound and as a divisor when the distribution is made
			 * NOTE(review): this relies on gp_error() not returning, as the
			 * surrounding code already does -- confirm */
			if(sscanf(optarg,"%i",&options.distro) != 1 || options.distro < 1)
				gp_error(_("Wrong argument for option -D")) ;
			gp_warn(_("Making sequence GC distribution, window %i"),options.distro) ;
			break ;
		case 't':
			gp_warn(_("Will calculate the total GC of all sequences")) ;
			options.total = TRUE ;
			break ;
		case 'm':
			gp_warn(_("Will show only the mean gc of all sequences")) ;
			options.onlymean = TRUE ;
			break ;
		case 'H':
			html = TRUE ;
			break ;
		case 'q':
			quiet = TRUE ;
			break ;
		case 'v':
			fprintf(stderr,_("%s version %s\n"),progname,VERSION) ;
			exit(0) ;
			break ;
		case 'd':
			debug = TRUE ;
			gp_warn(_("Running in debug mode")) ;
			break ;
		case 'h':
			Help() ;
			break ;
		default:
			gp_error(_("Type '%s -h' for help"),progname) ;
			break;
		}

	/* open the file pointer to read the sequences 
	 * from: standard input or a file provided? */
	if(optind >= argc) options.in = stdin ;
	else options.in = gp_gzOpen(argv[optind],"r") ;

	/* opening the file pointer to write the output: 
	 * standard output or file provided? */
	optind++ ;

	if(optind >= argc) options.out = stdout ;
	else options.out = gp_file_open(argv[optind],"wb") ;

	/* main loop, reading each sequence that is on input stream */
	while((inseq = gp_seq_read(options.in)) != NULL) {

		currgc = ComputateGC(inseq) ;
		currnt = inseq->leng ;
		if(options.total) {
			totalgc += currgc ;
			totalnt += currnt ;
		} 

		/* an empty sequence is reported as 0 % instead of dividing by zero */
		if(currnt > 0)
			giece = 100 * ( (double) currgc) / ( (double) currnt)  ;
		else
			giece = 0.0 ;
		nseq++ ;

		if(options.distro) GCDistro(inseq, options);

		if(options.onlymean) {
			/* running sum and sum of squares; turned into mean and
			 * deviation once all sequences have been read */
			mean += giece ;
			squares += giece*giece ;
		} else {
			if(!options.total && !options.distro) 
				fprintf(options.out,"%.2f %%  %s\n", giece, inseq->name) ;
		}
		gp_seq_free(inseq) ;
	}

	/* printing the mean GC content of the sequences */
	if(options.onlymean) {
		if(nseq > 0) {
			double variance = 0.0 ;

			/* the sample variance needs at least two values; rounding can
			 * push it marginally below zero, so clamp before the root */
			if(nseq > 1) {
				variance = (squares - (mean*mean)/nseq)/(nseq-1) ;
				if(variance < 0.0) variance = 0.0 ;
			}

			mean = mean / (double) nseq ;
			fprintf(options.out,_("%.2f %% = mean\n"),mean) ;
			/* NOTE(review): the value labelled "SE" is the sample standard
			 * deviation; label and layout are kept as they were */
			if(nseq > 1) 
				fprintf(options.out,", %.2f = SE\n",sqrt(variance)) ;
		} else {
			gp_warn(_("No sequences read, cannot compute the mean")) ;
		}
	}

	/* if we are displaying the total GC contents of all sequences */
	if(options.total) {
		/* no nucleotides at all: report 0 rather than dividing by zero */
		if(totalnt > 0)
			giece = 100 * ( (double) totalgc) / ( (double) totalnt)  ;
		else
			giece = 0.0 ;
		fprintf(options.out,"%.2f\n",giece) ;
	}

	if(html) gp_warn_print_all(options.out) ;
	fclose(options.out) ;
	fclose(options.in) ;
	return(EXIT_SUCCESS);
}


/*
 * Computing and printing GC distribution.
 *
 * Writes one line to o.out: the sequence name, then, for every window of
 * o.distro residues (window start advanced by one residue at a time), the
 * percentage of G and C in that window, each value followed by a tab.
 *
 *   s - sequence to analyse (uses s->name, s->sequ and s->leng)
 *   o - options; o.out is the output stream, o.distro the window size
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the window size is not
 * positive (only the name and the line terminator are written then).
 */
int GCDistro(sekw* s, opt_s o) {
	long start, k ;
	long ng, nc ;
	double gc ;
	int ch ;

	fprintf(o.out,"%s\t", s->name) ;

	/* a non-positive window cannot be evaluated (it is the divisor below) */
	if(o.distro < 1) {
		fprintf(o.out,"\n") ;
		return EXIT_FAILURE ;
	}

	/* "<=" so that the last complete window of the sequence is included */
	for(start = 0; start + o.distro <= s->leng; start++) {

		/* the counters are restarted for every window; otherwise counts
		 * from earlier windows would leak into the percentage */
		ng = 0 ;
		nc = 0 ;

		for(k = 0; k < o.distro; k++) {
			/* toupper() requires a value representable as unsigned char */
			ch = toupper((unsigned char) s->sequ[start+k]) ;
			if(ch == 'G') ng++ ;
			else if(ch == 'C') nc++ ;
		}

		gc = 100.0*(ng+nc)/o.distro ;
		/* the casts make the arguments match the %i conversions */
		if(debug) gp_warn(_("position %i, [G][C] = %i,%i gc = %f window %i"),
			(int) start,(int) ng,(int) nc,gc,o.distro) ;
		fprintf(o.out,"%2.1f\t",gc) ;
	}

	fprintf(o.out,"\n") ;
	return EXIT_SUCCESS ;
}

/*
 * Simple function returning the gc contents of a sequence.
 *
 *   s - sequence whose NUL-terminated residue string s->sequ is scanned
 *
 * Returns the number of residues that are G or C, in either letter case.
 * All other characters are ignored.
 */
long ComputateGC(sekw *s) {
	long gc = 0 ;
	const char *p ;

	/* Count directly instead of incrementing a table indexed by the
	 * character: such a table would be indexed out of bounds by bytes
	 * outside 0..127 */
	for(p = s->sequ; *p != '\0'; p++) {
		/* toupper() requires a value representable as unsigned char */
		switch(toupper((unsigned char) *p)) {
		case 'C':
		case 'G':
			gc++ ;
			break ;
		default:
			break ;
		}
	}

	/* the cast makes the argument match the %i conversion */
	if(debug) gp_warn(_("ComputateGC: GC= %i"),(int) gc) ;
	return(gc) ;
}

/*
 * Standard message: prints the program banner and the usage summary to
 * standard output, then terminates the program with EXIT_SUCCESS.
 */
void Help()
{
	/* banner line: program name and version */
	fprintf(stdout,
		_("\n%s version %s - determine the gc contents of sequence(s)"),
		PROGNAME, VERSION) ;

	/* usage and option summary; the only argument is the invocation name */
	fprintf(stdout,
		_("\n  Usage:\n"
		"     %s [options] [ input file ] [ output file ]\n"
		"\n  Options:\n"
		"     -t       : will calculate the total GC%% of all read sequences\n"
		"     -m       : will show only mean GC%% of all read sequences\n"
		"     -D value : for each sequence, a distribution of GC with window\n"
		"                size equal to value is printed.\n"
		"     -H       : run in HTML mode\n"
		"     -q       : run in quiet mode\n"
		"     -v       : print version information & exit\n"
		"     -h       : print this help screen & exit\n"
		"\n"),
		progname) ;

	exit(EXIT_SUCCESS) ;
}
