#!/usr/bin/perl
# blast2many - program to split a Blast output file from multiple
# queries into separate output files, one for each query.
#
# See display_more_help() below for more details.
#
# Written by: James D. White, University of Oklahoma, Advanced Center for
#   Genome Technology
#
# Date Written: Nov 10, 2004
#
# 2006.08.04 JDW - Save file header and trailer info to duplicate around
#                  split files.  Needed to take care of new NCBI blastall
#                  output that does not duplicate this info between
#                  queries.  Also now handles Blast output files > 2GB
#                  in length, but individual query outputs should be < 2GB.
# 2006.06.02 JDW - Get rid of bogus doubled T?BLAST[NPX] header on
#                  consecutive lines
# 2005.08.24 JDW - Fixed usage info.  Added '-f flag to read filenames
#                  for files to be created.
#

#
$::Date_Last_Modified = "August 4, 2006";

use strict;

($::my_name) = $0 =~ m"[\\/]?([^\\/]+)$";
$::my_name ||= 'blast2many';

$::VERSION = '2.0';


$::USAGE = <<USAGE;

USAGE:  $::my_name [-d dir_name] [-f file_of_files] [-l] [-o] [-q]
                   [-s suffix] [input_file]
            or
        $::my_name -h           <== for more info

where 'input_file' is the name of the blast output file to be split.
        'input_file' may be omitted to read from standard input.

USAGE

my($DIRECTORY, $FILE_OF_FILES, $SUFFIX) = ('', '', '');
my($LIST, $ONLY_HITS, $QUIET, $have_directory, $have_files,
   $have_suffix, $directory_ok) = (0) x 7;
my $process_algorithm = 1;

# get command line flags
our($opt_d, $opt_f, $opt_h, $opt_l, $opt_o, $opt_q, $opt_s);
use Getopt::Std;
die $::USAGE unless(getopts('d:f:hloqs:'));
display_more_help() if ($opt_h);
die "Too many arguments\n$::USAGE" if (@ARGV > 1);

if (defined($opt_d))
  {
  $DIRECTORY = $opt_d if ($opt_d ne '');
  $have_directory = 1;
  }
if (defined($opt_f) && $opt_f ne '')
  {
  $FILE_OF_FILES = $opt_f;
  $have_files = 1;
  }
$LIST = 1 if $opt_l;
$ONLY_HITS = 1 if $opt_o;
$QUIET = 1 if $opt_q;
if (defined($opt_s))
  {
  $SUFFIX = $opt_s if ($opt_s ne '');
  $SUFFIX = ".$SUFFIX" if ($SUFFIX ne '' && $SUFFIX !~ /^[\.\'\"]/);
  $SUFFIX =~ s/^'(.*)'$/$1/;
  $SUFFIX =~ s/^"(.*)"$/$1/;
  $have_suffix = 1;
  }
$process_algorithm = 0 if $have_directory && $have_suffix;

print STDERR "$::my_name Version $::VERSION, $::Date_Last_Modified\n"
  unless $QUIET;

# get 'input_file' and open it not STDIN
my $file = shift(@ARGV) || 'Standard Input';
# process file name from command line
my $infile = 'STDIN';
if ($file ne 'Standard Input')
  {
  # open the blast input file
  unless (defined(open(INFILE, "<$file")))
    {
    die "Cannot open input_file: '$file', $!\n";
    }
  $infile = 'INFILE';
  }

# get 'file_of_files' and open it if needed
if ($have_files)
  {
  unless (defined(open(FOF, "<$FILE_OF_FILES")))
    {
    die "Cannot open file_of_files: '$FILE_OF_FILES', $!\n";
    }
  }

# initialize counters
my($query_lines, $queries_per_header, $query_count, $query_hit_count,
   $query_nohit_count, $query_noid_count, $extra_filenames, $extra_queries,
   $files_written, $total_hits) = (0) x 10;
my($query_string, $line, $header_lines, $trailer_lines) = ('') x 2;
my @query_bodies = ();

# read blast query results and process them
while (defined($line = <$infile>))
  {
  if ($line =~ /^(T?BLAST[NPX])\s/)	# is it a blast query header?
    {
    my $algorithm = $1 || '';
    process_algorithm($algorithm) if $process_algorithm;
    if ($query_lines > 1)
      {
      process_queries();
      # reset for next query result
      $query_string = $header_lines = $trailer_lines = '';
      @query_bodies = ();
      }
    elsif ($query_lines == 1)	# get rid of bogus doubled T?BLAST[NPX] header
      {
      $query_string = '';
      }
    $query_lines = 0;
    $queries_per_header = 0;
    }
  elsif ($line =~ /^Query=\s*(\S+)/)
    {
    $queries_per_header++;
    if ($queries_per_header == 1)	# first (only?) query per blast header?
      {
      $header_lines = $query_string;	# save blast header
      @query_bodies = ();
      $trailer_lines = '';
      }
    else # ($queries_per_header > 1)	# second+? query per blast header?
      {
      push @query_bodies, $query_string; # save previous query
      }
    $query_string = '';
    }
  $query_string .= $line;
  $query_lines++;
  }
process_queries() if $query_string ne '';
if ($have_files)
  {
  while (defined ($_ = <FOF>))
    {
    $extra_filenames++  if $_ ne '';
    }
  close FOF;
  $have_files = 0;
  }
close(INFILE) unless ($infile eq 'STDIN');

unless ($QUIET)		# print statistics to STDERR
  {
  print STDERR "$query_count queries found in input_file '$file'\n";
  print STDERR "$query_hit_count queries had a total of $total_hits hits\n";
  print STDERR "$query_nohit_count queries had no hits\n"
    if $query_nohit_count;
  print STDERR "$query_noid_count queries had no valid query ids\n"
    if $query_noid_count;
  if ($FILE_OF_FILES ne '')
    {
    print "$extra_filenames unused filenames found in file '$FILE_OF_FILES'\n"
      if $extra_filenames;
    print "$extra_queries queries were found without names in file '$FILE_OF_FILES'\n"
      if $extra_queries;
    }
  print STDERR "$files_written files ", ($DIRECTORY ne '/dev/nul' ?
     "were created in directory '$DIRECTORY'\n" :
     "would have been created\n");
  }


###########################################################################
# process_algorithm - Get the algorithm name from the first query,
#   if needed, and get the directory name and suffix if not already
#   provided.
###########################################################################

sub process_algorithm
    {
    my($algorithm) = @_;
    $process_algorithm = 0;	# we only do this once

    # get algorithm name for default directory name
    $algorithm = "\L$algorithm";
    $DIRECTORY = $algorithm unless ($have_directory);

    # modify algorithm name for default suffix
    $algorithm =~ s/blast//;
    $SUFFIX = ".$algorithm" unless $have_suffix;

    } # end process_algorithm


###########################################################################
# process_queries() - Construct separate Blast search strings from joined
#   searches that do not have separate header and trailer info
###########################################################################

sub process_queries
    {
    if ($query_string =~ /(.+)(\n\s+Database:.*)\Z/s)
      {
      push @query_bodies, $1;	# save last query for this blast header
      $trailer_lines = $2;	# save blast trailer
      }
    else
      {
      push @query_bodies, $query_string; # save last query for this blast header
      $trailer_lines = '';	# there was no blast trailer
      }

    $header_lines =~ s/\n+\Z/\n\n/;
    $trailer_lines =~ s/\A\n+/\n\n/;

    my($query1);
    foreach my $body (@query_bodies)
      {
      $body =~ s/\A\n+//;
      $body =~ s/\n+\Z//;
      $query1 = $header_lines . $body . $trailer_lines;
      process_query(\$query1);
      }
    } # end process_queries


###########################################################################
# process_query($query_ref) - Get query id and hit information from query
###########################################################################

sub process_query
    {
    my($query_ref) = @_;
    my $query_id = '';
    my $hit_count = 0;
    my $filename;
    $query_count++;

    # get query id
    if ($$query_ref =~ /^Query=\s*(\S+)/m)
      {
      $query_id = $1;
      }
    else
      {
      $query_id = '** No query id **';
      $query_noid_count++;
      }

    # get hit count for query
    my @hit_ids = $$query_ref =~ /^>(\S+)/gm;
    $hit_count = scalar @hit_ids;
    if ($hit_count)
      {
      $query_hit_count++;
      }
    else
      {
      $query_nohit_count++;
      }
    $total_hits += $hit_count;

    # get a filename from 'file_of_files' if '-f' was specified
    $filename = get_filename();

    # write query result to file unless '-o' and no hits
    my $write_string = '';
    $write_string = write_query($query_id, $filename, $query_ref)
      unless (($ONLY_HITS && ! $hit_count) ||
        ($query_id eq '** No query id **'));

    # write query hit info to STDOUT if '-l'
    print STDOUT "$query_id\t# $hit_count hits$write_string\n" if $LIST;
    } # end process_query


###########################################################################
# write_query - Write query result to a file
###########################################################################

sub write_query
    {
    my($query_id, $filename, $query_ref) = @_;
    $files_written++;

    # test for dummy directory
    return  '' if $DIRECTORY eq '/dev/nul';

    create_dir() unless $directory_ok;	# create directory if needed

    # use query_id as default, if filename not available
    if ($filename eq '')
      {
      $extra_queries++ if ($FILE_OF_FILES ne '');
      $filename = $query_id;
      }

    # Open output file.  Note that $DIRECTORY and $SUFFIX already have a
    # trailing or leading '/' if needed.
    open(QUERY_OUT, ">${DIRECTORY}${filename}$SUFFIX") ||	# open file for output
      die "cannot create file '$filename', $!\n";
    print QUERY_OUT $$query_ref;	# write the query result
    close QUERY_OUT;			# close the file
    return " were written to file '${filename}$SUFFIX'";
    } # end write_query


###########################################################################
# get_filename - Return a filename from the 'file_of_files' or an empty
#   string.
###########################################################################

sub get_filename
    {
    my $filename = '';
    # read a filename from a file, if '-f' was specified
    if ($have_files)
      {
      $filename = <FOF>;
      if (defined $filename)
        {
        chomp $filename;
        $filename =~ s/\s+$//;
        $filename =~ s/^\s+//;
        }
      else
        {
        $have_files = 0;
        close FOF;
        $filename = '';
        }
      }
    return $filename;
    } # end get_filename


###########################################################################
# create_dir - Create the target directory, if necessary
###########################################################################

sub create_dir
    {
    $directory_ok = 1;			# we only come here once
    $DIRECTORY =~ s/^'(.*)'$/$1/;	# clean up directory name
    $DIRECTORY =~ s/^"(.*)"$/$1/;
    my $directory = $DIRECTORY;
    $directory =~ s#/$##;		# remove trailing / for mkdir

    # make sure directory does not exist before creating it
    if ($directory ne '' && ! -d $directory)
      {
      mkdir($directory) ||		# create directory
        die "cannot create directory '$directory', $!\n";
      }
    # add trailing / if needed as part of filename
    $DIRECTORY .= '/' unless ($DIRECTORY =~ m#/$# || $DIRECTORY eq '');
    } # end create_dir


###########################################################################
# display full help info
###########################################################################

sub display_more_help
    {
    print STDOUT <<HELP;

$::my_name - program to split a Blast output file from multiple queries
into separate output files in a directory, one output file for each
query result.


USAGE:  $::my_name [-d dir_name] [-f file_of_files] [-l] [-o] [-q]
                   [-s suffix] [input_file]
            or
        $::my_name -h           <== what you are reading


where

  'dir_name' is the output directory name.  See '-d'.

  'file_of_files' is the name of a file containing the names of the
	output files to be created.  See '-f'.

  'suffix' is a suffix to be added to the output filenames.  See '-s'.

  'input_file' is the name of the blast output file to be split.
        'input_file' may be omitted to read from standard input.


OPTIONS:

  -d dirname	Output directory name.  Specify the name of a
	directory into which the output files are to be written.
	The directory is assumed to be a subdirectory of the current
	directory, unless a full rooted path is given.  If '-d' is
	omitted, then the blast algorithm from the first query is
	used as the directory, but converted to lower case (blastn,
	blastp, blastx, tblastn, or tblastx).  The directory will be
	created, if necessary, but only the last level.  Use '-d .'
	or '-d ""' to write the files into the current directory.
	If input filenames from a 'file_of_files' (See '-f'.) contain
	a directory path, then you should specify '-d ""'.

  -f file_of_files	File of filenames.  Specify the name of a
	file from which the names of the output files are to be read.
	Filename should be entered each on a separate line, with each
	line corresponding to a query in the 'input_file'.  If '-o'
	is specified, then queries without hits still "use up" a
	filename, even though that file will not be created.  If the
	'-f' option is omitted, or if there are more queries than
	filenames, or if a blank line is read, then the query id is
	used as the output filename.  Extra filenames in the
	'file_of_files' will be ignored.  Leading and trailing white
	space will be removed, and a suffix may be added, depending
	upon the '-s' option.

  -h	Print full help information to STDOUT.

  -l	Listing mode.  Write a list of all query ids and hit status to
	STDOUT.  The default is not to list the individual query ids.
	The '-o' option does not affect this option -- all query ids
	are written to STDOUT if requested.

  -o	Only hits mode.  Only the query results with hits are to be
	written as files.  The queries with no hits are not written.
	The default is to write all query results as separate files.

  -q	Quiet mode.  Do not write any stats to STDERR.  The default is
	to write a summary to STDERR.

  -s suffix	Output filename suffix.  This is the suffix to be
	added to each query id or filename from the 'file_of_files'
	to form the name of the file created for each query results.
	If '-s' is omitted, then the suffix is the blast algorithm
	with the word blast removed and then converted to lower case.
	Note that the default suffix is derived from the first query
	result only.  If no suffix is desired, then specify '-s' with
	a null value ('' or "").  A period will be pre-pended to the
	suffix, unless the value is enclosed in quotes or already
	begins with a period.


NOTE:  The query result output files are named:

  <directory>/<filename>[<suffix>]
      or
  <directory>/<query_id>[<suffix>]

where <directory> is the name of the directory into which the file is
to be created (see -d), <query_id> is the query id from the "Query="
line in the result file, <filename> is a filename read from the
'file_of_files' (see -f), and <suffix> is the suffix to be appended to
the query id (see -s).

If the directory already exists, then existing files may be overwritten
without warning.  If the directory is specified as '/dev/nul', then the
files will not be created, but the stats and listing will be written to
STDERR and STDOUT, respectively, depending upon the '-q' and '-l'
options.


EXAMPLES:

For the following examples, assume file 'xyzzy' contains the following:

BLASTP ...
...
Query= query_id1
... (hit info) ...
BLASTP ...
...
Query= query_id2
... (no hit info) ...
BLASTP ...   
...
Query= query_id3
... (hit info) ...


and assume that 'fof' contains the following:

gene1
gene2


\$$::my_name zyzzy

creates the files blastp/query_id1.p, blastp/query_id2.p, and
blastp/query_id3.p.


\$$::my_name -o -l -d xyz -s "" xyzzy

creates the files xyz/query_id1 and xyz/query_id3.  The second
query result for query_id2, with no hits, is not written to a file
(-o).  The names of all three query ids are written to STDOUT, along
with the hit status of each (-l).


\$$::my_name -f fof -o -s out zyzzy

creates the files blastp/gene1.out and blastp/query_id3.out.  No file
is written for query_id2, because there were no hits (-o), but 'gene2'
is not carried forward for the third query, so query_id3 is used to
form the filename.


DATE LAST MODIFIED: $::Date_Last_Modified

HELP
    exit 1;
    } # end display_more_help
