PHP Labware source code viewer / Internal utilities | 22 Sep, 2025
Root | Help
./other/pileup2var.awk
#!/usr/bin/awk -f

#  pileup2var.awk
#  16 November 2015 version
#  By Santosh Patnaik
#  Ref. X632
#  
#  DESCRIPTION:
#  
#  pileup2var.awk is a command-line awk program to parse samtools pileup data for genome-location-specific single-nucleotide variation (change) to identify candidate positions at which RNA editing, mutation, etc. occurs.
#  
#  USAGE:
#  
#  [piped input | ] awk -f pileup2var.awk [-W posix] [-v option1=value1 option2=value2 ...] [input file]
#  
#  The W option should be included if the system's awk accepts it. The v option with the option1=value1... string is optional, in which case default values for option1, etc. will be used; see OPTIONS.
#  
#  Example: awk -f pileup2var.awk -W posix -v refBaseTypes=CG varBaseTypes=TA pileupData.txt
#  
#  REQUIREMENT
#  
#  A reasonably modern version of awk. Tested with GNU awk 4.0.2 on Mac OS X 10.6.8, Apple-supplied awk on Mac OS X 10.6.8, and awk 3.1.5 on Unix.
#  
#  INPUT:
#  
#  Input data, in a file or piped in, should be in samtools pileup format in which each line consists tab-delimited values for chromosome, 1-based coordinate, reference base, the number of reads covering the site, read bases, and base qualities. The last three types of values are repeated for each additional sample. Such data is generated with the samtools mpileup or pileup commands. The data may have been filtered by samtools for base-call quality, etc.
#  
#  OUTPUT:
#  
#  The primary output, in tab-delimited format and using one line per position, is to stdout, which the user can pipe to another command or save to a file. A summary/log file is generated as per the logFile option value. When option outputOnlyCounts is used, positions are parsed and various base-type count values are output for the samples for positions for which all conditions set by option values refBaseTypes, minSampleCalls and minGroupAvgCalls are satisfied. When option outputOnlyCounts is not used, the positions are examined for variation as per conditions set by various option values, and the output includes only those positions that satisfy the conditions.
#  
#  Output has tab-delimited values, each per sample in order of appearance in the pileup, and as colon-separated base-calls or base counts for all (as per samtools), A/C/G/T, non-reference, reference and N (unknown) base-types (disregarding casing), and a, c, g, t, A, C, G and T base-types (i.e., case/strand-specific for each the four base-types) for a chromosome position. This set of values is preceded by a set of tab-delimited values with information on chromosome, position (1-based coordinate), and reference base information. One line is used per chromosome position.
#  
#  When option outputOnlyCounts is not used, the reference base value is followed by a set of tab-delimited values with an annotation for a specific variation type (e.g., AG implying a candidate A>G variation position), and for this variation type, the variation rate minimum, maximum and average across all samples, and first and second group average variation rates and their ratio (NA used if there is no second group or if the rate for first group is 0). More than one line may be used per chromosome position if more than one variation-type has been observed for it.
#  
#  OPTIONS:
#  
#  Option values are provided without quotes in format noted in USAGE. Any user-provided value will be parsed by the program and may get converted so that it fits the type of value that the option accepts (noted below). When an option is not specified by the user, its default value will be used (noted below). Here, base-type refers to A, C, G, etc. Base-calls refer to counts of only A, C, G or T base-types. A reference base-call is one in which the base-type of a sample matches the reference base-type; a mis-match exists in case of a variant base-call. Variation rates are specific to one variation type; e.g., for A>G variation, it is the ratio of base-calls for G and the sum of those for A and G.
#  
#  Unless noted otherwise, all conditions set by the options, through either user-specified or default option values, have to be met for any chromosome position. The order in which the conditions are tested is noted in the log file.
#  
#  firstGroupSize – The first this many samples covered in the input data will be considered as belonging to one sample-group. All other samples will belong to a separate group. Type: integer >0. Default: Total samples.
#  logFile – Name of log file to write to. The file will be created. Type: absolute or relative file-path. Default: log_pileup2var.awk.txt in directory that awk is invoked in.
#  maxSampleOtherCalls – Maximum sum of base-calls for base-types other than reference and variant for a specific base-type variation for all samples. E.g., A+G in case of C>T variation. The condition set by this is tested only if the condition set by the minSampleRefAndVarFrac is not satisfied. Type: integer. Default: 1.
#  minGroupAvgCalls – Minimum group-wide average base-calls for all sample-groups. Type: integer or number in decimal. Default: 5.
#  minGroupAvgVarCalls – Minimum group-wide average variant base-calls for ≥1 sample-groups. Type: integer or number in decimal. Default: 1.
#  minGroupAvgVarRate – Minimum group-wide average variation rate for a specific base-type variation such as A>G for ≥1 sample-groups. Type: In range 0-1. Default: 0.01.
#  minSampleCalls – Minimum base-calls for all samples. Type: integer. Default: 5.
#  minSampleRefAndVarFrac – Minimum ratio of sum of reference and variant base-calls for a specific base-type variation to all base-calls for all samples. E.g., (C+T)/(A+C+G+T) in case of C>T variation. Type: In range 0-1. Default: 0.98.
#  minSampleVarCalls – Minimum variant base-calls for all samples of a sample-group for ≥1 sample-groups. Type: integer. Default: 1.
#  minSampleVarRate – Minimum variation rate for a specific base-type variation such as A>G for all samples of a sample-group for ≥1 sample-groups. Type: decimal. Default: 0.01.
#  outputOnlyCounts – Generate only base-call count values for positions that satisfy conditionalities imposed by option values other than those related to variation. Type: 0 or 1. Default: 0.
#  refBaseTypes – Reference base-type should be one of the characters in this string. Case-insensitive. Type: string with A, C, G and T characters. Default: ACGT.
#  varBaseTypes – Variant base-type should be one of the characters in this string. Case-insensitive. Type: string with A, C, G and T characters. Default: ACGT.
#
#  DEVELOPER NOTES:
#  
#  Initialization for the run – parsing/setting option values, getting start time, setting counters, etc. – is done in the BEGIN block. The conditional-for-first-line block is used to get sample/group-sizes and check for possible issues with input data or option values. Line-by-line pileup parsing, analyses of the parsed data, and printing of results is done in the middle block. Summary/log-writing is done in the END block.
#  
#  Variables with prefix count are used to track counts; with prefix mark to mark events such as errors; with prefix array for 1D arrays. The variable o is the 2D array that holds base-call counts for the samples.
#  
#  Variables have global scope in awk and persist between records. Use split command to effectively delete arrays since the delete command may be unavailable.
#  
#  Use of else-if statement is avoided through use of next/continue calls. Conditions likely to be encountered more frequently should be tested first for speed.
#  
#  Not directly using "/dev/stderr" to output error/status messages because of possible permission issue. Not using " | cat 1>&2" because process may print output after printing of the awk program's main output has started.

############ Initialize
BEGIN {
	# To measure run-time
	if(("date" | getline timeStart) <= 0){
        timeStart = "Could not get time";
    }
    close("date");
    
	## Tab as column-delimiter in input and output	
	FS = "\t";
	OFS = "";
	
	## Read/set options. Cast to integer with '+ 0'.	
	firstGroupSize = firstGroupSize ? int(firstGroupSize) : 0;
	logFile = logFile == "" ? "log_pileup2var.awk.txt" : logFile;
	maxSampleOtherCalls = maxSampleOtherCalls == "" ? 1 : int(maxSampleOtherCalls);
	minGroupAvgCalls = minGroupAvgCalls  == "" ? 5 : minGroupAvgCalls + 0;
	minGroupAvgVarCalls = minGroupAvgVarCalls == "" ? 1 : minGroupAvgVarCalls + 0;
	minGroupAvgVarRate = minGroupAvgVarRate  == "" ? 0.01 : minGroupAvgVarRate + 0;
	minSampleCalls = minSampleCalls == "" ? 5 : int(minSampleCalls);
	minSampleRefAndVarFrac = minSampleRefAndVarFrac == "" ? 0.98 : minSampleRefAndVarFrac + 0;
	minSampleVarCalls = minSampleVarCalls == "" ? 1 : int(minSampleVarCalls);
	minSampleVarRate = minSampleVarRate == "" ? 0.01 : minSampleVarRate + 0;
	outputOnlyCounts = outputOnlyCounts ? 1 : 0;
	refBaseTypes = refBaseTypes == "" ? "ACGT" : toupper(refBaseTypes);
	varBaseTypes = varBaseTypes == "" ? "ACGT" : toupper(varBaseTypes);

	## Set counters
	countHighOtherCalls = countIgnoreVarType = countInsuffCalls = countInsuffGroupAvgCalls = countInsuffGroupAvgVarCallsOrRate = countInsuffVarCallsOrRate = countNoRefBaseCall = countNotRefBase = countNoVarBaseCall = countPositive = countPositiveAndMulti = 0;	
}
############ Get sample & group sizes from first line; show status/error messages on terminal
NR == 1{
	markError = "";
	if(NF){
		sampleSize = (NF - 3)/3;
		if((sampleSize < 1) || (sampleSize != int(sampleSize))){
			markError = markError "Input does not appear to have correct data-type. The first line should be pileup data for at least one sample  ...";
		}
		else{
			firstGroupSize = ((firstGroupSize == 0) || (firstGroupSize > sampleSize)) ? sampleSize : firstGroupSize;
			secondGroupSize = (sampleSize > firstGroupSize) ? (sampleSize - firstGroupSize) : 0;
		}
	}
	else{
		markError = markError "Input does not appear to have correct data-type. The first line should be pileup data for at least one sample  ...";
	}
	
	if((refBaseTypes ~ "[^ACGT]") || (varBaseTypes ~ "[^ACGT]")){
		markError = "refBaseTypes or varBaseTypes option value was incorrectly specified ...";
		
	}
	
	## Exit if issues
	if(markError != ""){
		print "[pileup2var.awk] Exiting because of error(s): " markError > "/dev/tty";
        exit 1;
	}
		
	print "[pileup2var.awk] Input data has " sampleSize " total sample(s) in groups of sizes " firstGroupSize " and " secondGroupSize > "/dev/tty";
	print "[pileup2var.awk] Using these option values which are default values or values interpreted from those provided: firstGroupSize = " firstGroupSize ", logFile = " logFile ", maxSampleOtherCalls = " maxSampleOtherCalls ", minGroupAvgCalls = " minGroupAvgCalls ", minGroupAvgVarCalls = " minGroupAvgVarCalls ", minGroupAvgVarRate = " minGroupAvgVarRate ", minSampleCalls = " minSampleCalls ", minSampleRefAndVarFrac = " minSampleRefAndVarFrac ", minSampleVarCalls = " minSampleVarCalls ", minSampleVarRate = " minSampleVarRate ", outputOnlyCounts = " outputOnlyCounts ", refBaseTypes = " refBaseTypes ", varBaseTypes = " varBaseTypes > "/dev/tty";
	if(outputOnlyCounts){
		print "[pileup2var.awk] Only base-call counts will be output." > "/dev/tty";
	}
}
############ Process pileup by line
{
	# Reference base
	R = toupper($3);
	if(index(refBaseTypes, R) < 1){
		++countNotRefBase;
		next;
	}
	
	# Possible variant bases
	V = "ACGT";
	gsub(R, "", V);
	split(V, arrayVar, "");
	
	# Track sample IDs, etc.
	markSampleWithRefBaseSeen = markSampleWithVarBaseSeen = sID = 0;
	
	for(i = 4; i < NF; i = i + 3){
	
		$i = $i == "" ? 0 : $i;
		if($i < minSampleCalls){
			++countInsuffCalls;
			next;	
		}

		#### Parse the string of base-calls
		b = i + 1;
		
		# Remove read quality info
		gsub("\\^.", "", $b);
		
		# Remove indel info
		if($b ~ "[1-9]"){	
			# Curly braces for compatibility with gawk 3.1.5
    		while(match($b, "[0-9]{1,}") > 0) {
				n = substr($b, RSTART, RLENGTH);
				sub("." n "[a-zA-Z]{" n "}", "", $b);
			}
    	}
    	
    	w = gsub(",", "", $b);
    	W = gsub("\\.", "", $b);
		
		#### Save base-calls in a 2D array
		# a, c, g, t, A, C, G, T : calls of lower & upper cased base-types
		# ACGT : sum of above
		# all : all calls as output in the pileup data by samtools
		# N : calls of lower/upper-cased N
		# R : calls of lower/upper cased reference base-type
		
		++sID;
		
		o[sID, "all"] = $i;
		o[sID, "R"] = w + W;
		o[sID, "N"] = gsub("[nN]", "", $b);
		
		o[sID, "a"] = o[sID, "c"] = o[sID, "g"] = o[sID, "t"] = o[sID, "A"] = o[sID, "C"] = o[sID, "G"] = o[sID, "T"] = 0;
		
		if(R == "A"){o[sID, "a"] = w; o[sID, "A"] = W;}
    	else if(R == "C"){o[sID, "c"] = w; o[sID, "C"] = W;}
    	else if(R == "G"){o[sID, "g"] = w; o[sID, "G"] = W;}
    	else {o[sID, "t"] = w; o[sID, "T"] = W;}
    	
    	if(match(toupper($b), "[" V "]")){
    		++markSampleWithVarBaseSeen;
    		for(j in arrayVar){
    			x = arrayVar[j];
    			o[sID, x] = gsub(x, "", $b);
    			o[sID, tolower(x)] = gsub(tolower(x), "", $b);
    		}  		
    	}
    	
    	o[sID, "ACGT"] = o[sID, "a"] + o[sID, "c"] + o[sID, "g"] + o[sID, "t"] + o[sID, "A"] + o[sID, "C"] + o[sID, "G"] + o[sID, "T"];
    	
    	if(o[sID, "ACGT"] < minSampleCalls){
			++countInsuffCalls;
			next;	
		}
		
		if(o[sID, "R"]){
    		++markSampleWithRefBaseSeen;
    	}
    }
    
    ## Check if call group-averages are OK for both groups
    c = 0;
    for(i = 1; i <= firstGroupSize; ++i){
    	c += o[i, "ACGT"];
	}
	if((c/firstGroupSize) < minGroupAvgCalls){
		++countInsuffGroupAvgCalls;
		next;
	}
	else if(secondGroupSize){
		c = 0;
    	for(i = firstGroupSize + 1; i <= sampleSize; ++i){
    		c += o[i, "ACGT"];
		}
		if((c/secondGroupSize) < minGroupAvgCalls){
			++countInsuffGroupAvgCalls;
			next;
		}
	}
	
	#### Output only base-call count data if specified
	
	if(outputOnlyCounts){
		c = "";
		# Output has tab-delimited values, each per sample in order of appearance in the pileup, and as colon-separated counts for all (as per samtools), A/C/G/T, non-reference, reference and N (unknown) base-types (disregarding casing), and a, c, g, t, A, C, G and T base-types (i.e., case/strand-specific for each the four base-types)
		for(i = 1; i <= sampleSize; ++i){
			c = c "\t" o[i, "all"] ":" o[i, "ACGT"] ":" (o[i, "ACGT"] - o[i, "R"]) ":" o[i, "R"] ":" o[i, "N"] ":" o[i, "a"] ":" o[i, "c"] ":" o[i, "g"] ":" o[i, "t"] ":" o[i, "A"] ":" o[i, "C"] ":" o[i, "G"] ":" o[i, "T"];
		}
		# Output with chromosome, position and reference base info.
		print $1 "\t" $2 "\t" $3 c;
		next;
	}
	
	#### Variation evaluation

	if(markSampleWithRefBaseSeen == 0){
		++countNoRefBaseCall;
		next;
	}
	
	# Only examine variation types of interest
	varTypeSel = 0;
	split("", arrayVarSel);
	for(i in arrayVar){
		x = arrayVar[i];
		if(index(varBaseTypes, x)){
			arrayVarSel[x] = 1;
			++varTypeSel;
		}
	}
	if(varTypeSel < 1){
		++countIgnoreVarType;
		next;
	}
		
	# No variant in any sample
	if(markSampleWithVarBaseSeen == 0){
		++countNoVarBaseCall;
		next;	
	}
	
	# Samples with variants are fewer than either group-size when at least one variant base-call is needed for each sample of at least one group
	if(minSampleVarCalls && (((markSampleWithVarBaseSeen < firstGroupSize) && secondGroupSize == 0) || ((markSampleWithVarBaseSeen < firstGroupSize) && secondGroupSize && (markSampleWithVarBaseSeen < secondGroupSize)))){
		++countInsuffVarCallsOrRate;
		next;	
	}

	#### Detailed check for variations
		
	## Check variant calls & rates per sample & group; identify candidate variant bases from all possible variant bases of interest
	
	markSampleIssue = markGroupIssue = varTypeCand = 0;
	split("", arrayVarCand);
	for(j in arrayVarSel){
	    s1 = s2 = sr1 = sr2 = g1 = g2 = gr1 = gr2 = 0;
	    
	    # Group 1
	    for(i = 1; i <= firstGroupSize; ++i){
	    	v = o[i, j] + o[i, tolower(j)];
	    	vr = v ? v/(v + o[i, "R"]) : 0;
	    	s1 = (v >= minSampleVarCalls) ? s1 + 1 : s1;
	    	sr1 = (vr >= minSampleVarRate) ? sr1 + 1 : sr1;
	    	g1 += v;
	    	gr1 += vr;
	    }
	    
	    # Check 2nd group only if needed
	    if((s1 < firstGroupSize) && (sr1 < firstGroupSize) && ((g1/firstGroupSize) < minGroupAvgVarCalls) && ((gr1/firstGroupSize) < minGroupAvgVarRate) && secondGroupSize){
	    	# Logic like group 1 but faster checks possible since group 1 has been assessed
			for(i = firstGroupSize + 1; i <= sampleSize; ++i){
				v = o[i, j] + o[i, tolower(j)];
				if(v < minSampleVarCalls){
					continue;
				}
				++s2;
				vr = v ? v/(v + o[i, "R"]) : 0;
	    		if(vr < minSampleVarRate){
	    			continue;
	    		}
	    		++sr2;
	    		g2 += v;
	    		gr2 += vr;
	    	}
	    	if((s2 < secondGroupSize) || (sr2 < secondGroupSize)){
	    		++markSampleIssue;
	    		continue;
	    	}
	    	if(((g2/secondGroupSize) < minGroupAvgVarCalls) || ((gr2/secondGroupSize) < minGroupAvgVarRate)){
	    		++markGroupIssue;
	    		continue;
	    	}
	    	arrayVarCand[j] = 1;
	    	++varTypeCand;
	    	continue;
	    }
	    if((s1 < firstGroupSize) || (sr1 < firstGroupSize)){
	    	++markSampleIssue;
	    	continue;
	    }
	    if(((g1/firstGroupSize) < minGroupAvgVarCalls) || ((gr1/firstGroupSize) < minGroupAvgVarRate)){
	    	++markGroupIssue;
	    	continue;
	    }
	    arrayVarCand[j] = 1;
	    ++varTypeCand;
	}
	if(varTypeCand < 1){
		if(markSampleIssue){
			++countInsuffVarCallsOrRate;
		}
		else if(markGroupIssue){
			++countInsuffGroupAvgVarCallsOrRate;
		}
		next;
	}
	
	## Check if non-ref/variant calls or rates/levels are not too high in any sample. Also obtain various rates/levels. This could be done above but for speed.
	
	varTypeFin = 0;
	split("", arrayVarFin);
	for(j in arrayVarCand){
		markSampleIssue2 = 0;
		for(i = 1; i <= sampleSize; ++i){
	    	rv = o[i, j] + o[i, tolower(j)] + o[i, "R"];
	    	rvr = rv/o[i, "ACGT"];
	    	if((rvr < minSampleRefAndVarFrac) && ((o[i, "ACGT"] - rv) > maxSampleOtherCalls)){
	    		markSampleIssue2 = 1;
	    		break;
	    	}
	    }
	    if(markSampleIssue2 == 0){
	    	arrayVarFin[j] = 1;
	    	++varTypeFin;
	    }
	}
	if(varTypeFin < 1){
		++countHighOtherCalls;
		next;
	}
	
	#### Output

	++countPositive;
	if(varTypeFin > 1){
		++countPositiveAndMulti;
	}
	
	# Like when option outputOnlyCounts is set, output has tab-delimited values, each per sample in order of appearance in the pileup, and as colon-separated counts for all (as per samtools), A/C/G/T, non-reference, reference and N (unknown) base-types (disregarding casing), and a, c, g, t, A, C, G and T base-types (i.e., case/strand-specific for each the four base-types). 
	# Additionally, the values are preceded by variation rate minimum, maximum and average across all samples, and group average variation rates and their second-to-first-group ratio (NA used if there is no second group or if the rate for first group is 0).
	
	for(j in arrayVarFin){
		## Get rates/levels and their stats
		gr1 = gr2 = max = 0; min = 1;
		for(i = 1; i <= firstGroupSize; ++i){
	    	v = o[i, j] + o[i, tolower(j)];
	    	vr = v ? v/(v + o[i, "R"]) : 0;
	    	min = vr < min ? vr : min;
	    	max = vr > max ? vr : max;
	    	gr1 += vr;
	    }
	    if(secondGroupSize){
	    	for(i = firstGroupSize + 1; i <= sampleSize; ++i){
	    		v = o[i, j] + o[i, tolower(j)];
	    		vr = v ? v/(v + o[i, "R"]) : 0;
	    		min = vr < min ? vr : min;
	    		max = vr > max ? vr : max;
	    		gr2 += vr;
	    	}    	
	    }
	    mean = (gr1 + gr2)/sampleSize;
	    gr1 = gr1/firstGroupSize;
	    gr2 = secondGroupSize ? (gr2/secondGroupSize) : "NA";
	    
	    ## Output
	    c = "";
		for(i = 1; i <= sampleSize; ++i){
			c = c "\t" o[i, "all"] ":" o[i, "ACGT"] ":" (o[i, "ACGT"] - o[i, "R"]) ":" o[i, "R"] ":" o[i, "N"] ":" o[i, "a"] ":" o[i, "c"] ":" o[i, "g"] ":" o[i, "t"] ":" o[i, "A"] ":" o[i, "C"] ":" o[i, "G"] ":" o[i, "T"];
		}
		# Output with chromosome, position, reference base, variation-type, and the stats.
		print $1 "\t" $2 "\t" $3 "\t" R j "\t" min "\t" max "\t" mean "\t" gr1 "\t" gr2 "\t" (gr2 ? (gr1/gr2) : "NA") c;
	}
}
############ Save log/summary in log file
END {
	print "# Program pileup2var.awk" (markError != "" ? " did not complete run" : " completed run on " sampleSize " total sample(s) of groups of sizes " firstGroupSize " and " secondGroupSize) > logFile;
	print "# Run started at system time: ", timeStart >> logFile;
	if(("date" | getline timeEnd) <= 0){
        timeEnd = "Could not get time";
    }
    print "# Run ended at system time: ", timeEnd >> logFile;
    close("date");
    
    if(markError != ""){
    	print "# " markError >> logFile;
    }
    else{
    	if(outputOnlyCounts){
    		print "# Only base-call counts were output" >> logFile;
    		print "# Output has tab-delimited values, each per sample in order of appearance in the pileup, and as colon-separated counts for all (as per samtools), A/C/G/T, non-reference, reference and N (unknown) base-types (disregarding casing), and a, c, g, t, A, C, G and T base-types (i.e., case/strand-specific for each the four base-types) for a chromosome position. This set of values is preceded by a set of tab-delimited values with information on chromosome, position (1-based coordinate), and reference base information." (outputOnlyCounts ? " The reference base value is followed by a set of tab-delimited values with an annotation for a specific variation type (e.g., AG implying a candidate A>G variation position), and for this variation type, the variation rate minimum, maximum and average across all samples, and first and second group average variation rates and their ratio (NA used if there is no second group or if the rate for first group is 0). More than one line may be used per chromosome position if more than one variation-type has been observed for it." : " There is no information with result of variation analysis since such analysis was not asked for. One line is used per chromosome position.") >> logFile;
    	}
		print "# Used these option values, which are default values or values interpreted from those provided: firstGroupSize = " firstGroupSize ", logFile = " logFile ", maxSampleOtherCalls = " maxSampleOtherCalls ", minGroupAvgCalls = " minGroupAvgCalls ", minGroupAvgVarCalls = " minGroupAvgVarCalls ", minGroupAvgVarRate = " minGroupAvgVarRate ", minSampleCalls = " minSampleCalls ", minSampleRefAndVarFrac = " minSampleRefAndVarFrac ", minSampleVarCalls = " minSampleVarCalls ", minSampleVarRate = " minSampleVarRate ", outputOnlyCounts = " outputOnlyCounts ", refBaseTypes = " refBaseTypes ", varBaseTypes = " varBaseTypes >> logFile;
		print "# Position(s) skipped because of various reasons noted below, conditionality of earlier reasons being checked first." >> logFile;
		print "EXAMINED total position(s) (lines in the pileup input)\t", NR >> logFile;
		print "[pileup2var.awk] Ending... examined " NR " total position(s) (lines in the pileup input)" > "/dev/tty";
		
		# Following should be in the same order in which the conditions were checked
		print "Reference base-type not of interest \t" countNotRefBase >> logFile;
		print "Insufficient A/C/G/T base-calls for samples \t" countInsuffCalls >> logFile;
		print "Insufficient average A/C/G/T base-calls for sample-groups \t" countInsuffGroupAvgCalls >> logFile;
		if(outputOnlyCounts == 0){
			print "No reference-type A/C/G/T base-call in any sample\t" countNoRefBaseCall >> logFile;
			print "No variant A/C/G/T base-call in any sample\t" countNoVarBaseCall >> logFile;
			print "Variant base-type not of interest\t" countIgnoreVarType >> logFile;
			print "Insufficient variant A/C/G/T base-calls or variation rate for samples\t" countInsuffVarCallsOrRate >> logFile;
			print "Insufficient average variant A/C/G/T base-calls or variation rate for sample-groups\t" countInsuffGroupAvgVarCallsOrRate >> logFile;
			print "Too many A/C/G/T base-calls for other than reference or variant base-types for samples\t" countHighOtherCalls >> logFile;
			print "IDENTIFIED possible variant positions\t" countPositive >> logFile;
			print "Possible variant positions with >1 variation-type\t" countPositiveAndMulti >> logFile;
			print "[pileup2var.awk] Identified " countPositive " possible variant position(s), with >1 variation-type seen for " countPositiveAndMulti " position(s)" > "/dev/tty";
		}
	}
}
Presented with Sourceer
PHP Labware home | visitors since Sept 2017