#!/usr/bin/perl 
# 
# by Harry Mangalam; hjm@tacgi.com
# scut is free software; I explicitly place it in the public domain.  There are NO restrictions
# on its use, except I'd appreciate a note if you find it useful or find/fix a bug, or
# can offer a suggestion.  It would be rude to remove this header, but you can if you want.

# Version 1.12
# Changes: 
#   1.12   10.15.02 - fixed bad test for begin & end, final tab on output (stupid misuse of substr)
#   1.11   10.07.02 - add offset capability to slice out sections of a file for processing.
#                     --begin='regex|#' --end='regex|#'
#                     also, if scut is called with no args, should dump help
#   1.10   10.02.02 - added ability to use alphabetic/excel-type column IDs rather than 
#                     explicit numbers to make it easier to convert from spreadsheet
#                     notation to 0-based notation
#   1.06    5.30.02 - changed name to scut from the original 'mergem'
#                     for 'smarter cut', the util that performs scut work for you
#                     some typos fixed, some text clarified.
#           9.14.01 - added ability to process STDIN for smarter cut function
#                     no need to define input with '--f1'
#           7.28.00 - added columnar grabbing for single files (no keying required)
#                     like 'cut', but is column-based and can be both discontinuous and 
#                     out-of-order.
#           9.29.99 - added file for grabbing error output
#				    7.28.99 - added '--version' and '--nocase' 
#           7.27.99 - fixed mem leak from expanding hash table
#           7.25.99 - added '--sync'
use Getopt::Long;
# --version has not been requested until GetOptions says otherwise.
$ver = 0;

# Lookup table from spreadsheet-style column letters to 0-based column
# indices: 'A' => 0 ... 'Z' => 25, 'AA' => 26 ... 'AZ' => 51, 'BA' => 52 ...
# 'BZ' => 77, i.e. 78 columns in all.  Perl's string range operator walks
# A..Z, AA..AZ, BA..BZ in exactly that order, so a hash slice can assign all
# of the indices in one statement.  To support more columns, extend both
# sides of the slice assignment together.
%excel_ids = ();
@excel_ids{'A' .. 'BZ'} = (0 .. 77);

# Called without any command-line argument: there is nothing to work on, so
# point the user at the help text and stop.
die "\n\tUse 'scut --help' for help on usage.\n" unless defined $ARGV[0];
        
# Command-line option table: each Getopt::Long specification is paired with
# the global scalar that receives its value.
@option_spec = (
   "f1=s"     => \$f1,      # name of file 1
   "f2=s"     => \$f2,      # name of file 2
   "k1=i"     => \$k1,      # key column in file 1
   "k2=i"     => \$k2,      # key column in file 2
   "c1=s"     => \$c1,      # columns to print from file 1
   "c2=s"     => \$c2,      # columns to print from file 2
   "id1=s"    => \$id1,     # input delimiter for file 1
   "id2=s"    => \$id2,     # input delimiter for file 2
   "od=s"     => \$od,      # output delimiter
   "help!"    => \$help,    # dump usage and tips
   "noerr!"   => \$noerr,   # suppress the bulk of the STDERR chatter
   "version!" => \$ver,     # only report the version
   "begin=s"  => \$begin,   # start at this line number, or at the line matching this regex
   "end=s"    => \$end,     # stop at this line number, or at the line matching this regex
   "excl!"    => \$excl,    # when set, the begin/end lines themselves are left out;
                            #  the code was first written around an 'include' flag,
                            #  so this option is inverted into $incl further down
   "nocase!"  => \$nocase,  # compare merge keys without regard to case
   "sync!"    => \$sync,    # keep output lines in step with input lines
);
GetOptions(@option_spec);

# --version: report the version and stop.  The date shown is the one the
# change log at the top of this file gives for 1.12 (the string previously
# still carried the 1.11 date).
if ($ver == 1) {
   print "scut: Version 1.12 (10.15.02) - author: Harry Mangalam (hjm\@tacgi.com)\n";
   exit 0;
}

# Classify the --begin and --end values.  A value containing any non-digit
# character is treated as a regex to match against input lines; a value made
# only of digits is treated as a 1-based line number.
if (defined $begin) {
   $begin_flag = ($begin =~ /\D/) ? "regex" : "numeric";
} else {
   $begin      = 1;            # no --begin: start at the first line
   $begin_flag = "numeric";
}

if (defined $end) {
   $end_flag = ($end =~ /\D/) ? "regex" : "numeric";
} else {
   $end      = 10000000;       # no --end: stop at line ten million
   $end_flag = "numeric";
}

# Two line numbers can be sanity-checked up front; regexes cannot.
die "The --begin value has to be less than the --end value.\n"
   if ($begin_flag eq "numeric" && $end_flag eq "numeric" && ($end <= $begin));

#print "begin = $begin, flag = $begin_flag and  end = $end, flag =  $end_flag\n";

# The processing code is written in terms of an 'include the begin/end lines'
# flag, while the user-facing option is --excl; derive one from the other.
$incl = ($excl == 0) ? 1 : 0;

# Usage text printed by --help.  The heredoc is an interpolating one, so a
# backslash that must reach the user has to be doubled: the TAB hint under
# --id1 is written '\\t' so that it prints as the two characters \t (it used
# to print an actual TAB character, which is not what the user has to type).
$help_text = <<HELP;

 scut has 2 purposes:
 1) printing fields from lines that have one field that matches a field from
    another file (explained below).
    
 2) slicing out columns out of a file and (optionally) re-ordering them
    If you had a file, a line of which was:
     0   1  2   3    4   5   6       7      8  9      10 11  12     13 14
    "now is the time for all twisted wackos to wheeze on the snoots of coots"
    and you only wanted fields 3 5 7 and 8, but you wanted them in the order:
    5 8 7 3, you could specify this by --c1='5 8 7 3', and that line would be
    output as:
    "all to wackos time"
    This function is essentially a smarter 'cut', and only REQUIRES the input
    (as STDIN, not a file name) and the columns to print (--c1='# # # #').  
    If you want it to break on something other than whitespace, you have to 
    specify that as well.
    
Usage: scut [options, below] > output_file 
  --f1=[file1]    - the shorter or 'needle' file.  If using as a smarter cut,
                   use STDIN.
  --f2=[file2]    - the 'longer' or 'haystack' file 
  
  --k1=col#       - the key column from file1 (numbered from ZERO, not 1) 
                     i.e the number of the column (starting from 0) that  
                     has the key column name for file1 (see example below)
  --k2=col#       - the key column from file2 (ditto)
  
  --c1='# # ..'   - the numbers of the columns from file1 that you want
     or              printed out in the order in which you want them.  If
  --c1='A C F ..'    you DON'T want any columns from the file, just
                     enter it as '' (2 single quotes) or omit it
                     completely.  If you want the whole line, type 'all'
                     Notes: 
                     1) #s are split on whitespace, not commas.
     or              2) scut also supports Excel-style column specifiers 
  --c1='A C F ..'    (A B F AD BG etc) for up to 78 columns (->BZ)  If you want
                     more, add them to the %excel_ids hash above or create an 
                     algo that does it right.
                     
  --c2='# # ..'   - ditto for file2 
     or          
  --c2='A C F ..'
 
  --id1='...'     - the delimiter string for file1; defaults to whitespace 
                    (specify TAB = '\\t'), but can be a multicharacter string 
                    as well such as '_|_'

  --id2='...'     - ditto for file2 

  --od='...'      - the delimiter string for the output (defaults to TAB)

  --noerr         - stops most stderr from being generated (for large files,
                     most of the CPU is dedicated to processing the STDERR text
                     stream (thanks for stressing it, Peter), but if you need
                     this output, you'll just have to deal with it.
                     
  NB: the following 3 options: --begin, --end, --excl currently only work with 
  the single file version (as a smarter cut, not the merging functions).  
  Stay tuned for the 2 file version..
                     
  --begin=[#|regex] - specifies the line to START processing data at (for 
                      example, if the file has 2 format sections and you only 
                      want to process one of them).  The option can be either
                      an integer value to specify the line number, or a non-
                      repeating regular expression that unambiguously identifies 
                      the line.
                      
  --end=[#|regex] - as above, but specifies the line to STOP processing data at.
  
  --excl          - if added to the arguments, excludes the lines specified by 
                      --begin and --end (in case you need to exclude the 
                      defining header lines).
  
  --version       - gives the version of the software and dies.
  
  --nocase        - makes the merging key case INSENSITIVE.

  --sync          - whether you want the output sync'ed on file2.  The sync
                    will insert blank lines where there are comments as well.
  --help          - dumps these lines to stdout and dies.

 Notes:
 
 = there have to be the same number of columns in each line or it will get
 confused.  The matches are case-sensitive, unless you use the '--nocase'  
 option to turn it off.

 = scut sends its output to stdout, so if you want to catch the output in a
 file, use redirection '>' (see below) and if you want to catch the stderr
 you'll have to catch that as well ( >& out ).

 = scut ignores any line that starts with a '#', so you can document what
 the columns mean, add column numbering, etc, as long as those lines start
 with a '#'

 = scut always puts the matched key in the 1st column of the output

 = under Win/DOS execution, you will probably need to run it with the perl
   prefix i.e. perl scut [options] and will also have to enclose the option
   strings with DOUBLE QUOTES (\"opts\") instead of single quotes('opts').
HELP

# --help: dump the usage text to stdout and stop.
if ($help) {
   print $help_text;
   exit 0;
}

# Turn the --c1 / --c2 column lists into arrays of 0-based column indices.
#   @c1i / $Nc1i - indices (and their count) of the columns wanted from file 1
#   @c2i / $Nc2i - ditto for file 2
@c1i  = scut_cols_to_indices($c1, 'c1');
$Nc1i = @c1i;
@c2i  = scut_cols_to_indices($c2, 'c2');
$Nc2i = @c2i;

# scut_cols_to_indices($spec, $optname)
#   $spec    - whitespace-separated column specifiers as given on the command
#              line; each is either a number (kept as is) or an Excel-style
#              column ID looked up in %excel_ids; may be undef
#   $optname - option name ('c1' or 'c2'), used only in error messages
# Returns the list of 0-based column indices (empty for an undef/empty spec).
# Dies if an alphabetic specifier is longer than 2 characters or is not a key
# of %excel_ids.
# Both options go through this one routine; previously the --c2 loop
# upper-cased the --c1 array by mistake, so lower-case --c2 IDs were never
# converted.
# NOTE(review): the help text advertises --c1='all' for the whole line, but
# 'all' is 3 characters long and is therefore rejected here as too long;
# whole-line output does not appear to be implemented.
sub scut_cols_to_indices {
   my ($spec, $optname) = @_;
   return () if !defined $spec;

   # split ' ' (unlike /\s+/) ignores leading whitespace, so a spec such as
   # ' 3 5' does not produce an empty first specifier
   my @cols = split(' ', $spec);
   foreach my $col (@cols) {
      next if $col !~ /\D/;      # all digits: already a column index
      $col =~ tr/a-z/A-Z/;       # column IDs are stored upper-case
      if (length($col) > 2) {    # the lookup table has no keys longer than 2
         die "the column specifier in --$optname ($col) is too long\n";
      }
      if (!exists $excel_ids{$col}) {
         # an unknown ID used to turn silently into column 0
         die "the column specifier in --$optname ($col) is not a valid column ID\n";
      }
      $col = $excel_ids{$col};   # replace in place ($col aliases the array element)
   }
   return @cols;
}


#print STDERR "ERR: @c1i .. @c2i \n";

#print STDERR "ERR: $id1  $id2  $od\n";
# Fill in a default for every delimiter that was not given on the command
# line: input delimiters default to the pattern for a run of whitespace, the
# output delimiter defaults to a single TAB.
$id1 = '\s+' if $id1 eq "";
$id2 = '\s+' if $id2 eq "";
$od  = "\t"  if $od  eq "";

# Minus the length of the output delimiter (-1 for the default TAB): a
# negative length that substr() can use to drop one trailing delimiter.
$L_od = -length($od);

#print STDERR "ERR: $id1  $id2  $od\n";
if (!defined $f2) {
   # ------------------------------------------------------------------
   # Single-input mode (the "smarter cut"): no --f2 was given.  Lines are
   # read with <> and, for every non-comment line inside the --begin/--end
   # window, the columns listed in @c1i are printed joined by $od.
   # ------------------------------------------------------------------
   $lastline     = 0;  # becomes 1 when the --end line is reached and must still be printed
   $line_counter = 1;  # 1-based number of the line being examined
   $process      = 0;  # 0 until the --begin line is seen; afterwards it is
                       #  incremented once per line handled inside the window
   while (<>) {
      if ($process == 0) {
         # still looking for the start condition
         if ((($begin_flag eq "regex")   && ($_ =~ /$begin/)) ||
             (($begin_flag eq "numeric") && ($line_counter == $begin))) {
            $process = 1;
         }
      }
      elsif ((($end_flag eq "regex")   && ($_ =~ /$end/)) ||
             (($end_flag eq "numeric") && ($line_counter == $end))) {
         # this is the --end line
         if ($incl == 1) {
            $lastline = 1;   # handle it below, then stop
         } else {
            # --excl: the end line itself is not output.  Reaching --end is
            # normal completion, hence exit status 0.
            print STDERR "Total Lines Counted = $line_counter, Processed = $process\n"; 
            exit(0);
         }
      }

      if ($process >= 1) {   # inside the window
         chomp;              # chomp, not chop: a final line without a newline keeps its last character
         if ($_ !~ /^#/) {   # lines starting with '#' are comments and are skipped
            @L = split /$id1/;   # $id1 (the input delimiter) is used as a pattern
            $newcols = join($od, map { $L[$_] } @c1i);
            # With --excl the begin line ($process == 1) is suppressed;
            # everything else inside the window is printed.
            if (($incl == 1) || ($process > 1)) {
               print "$newcols\n";
            }
         }
         $process++;
      }

      $line_counter++;
      if ($lastline == 1) {
         print STDERR "Total Lines Counted = $line_counter, Processed = $process\n"; 
         exit(0);   # normal completion after the inclusive --end line
      }
   }

} else {
   # ------------------------------------------------------------------
   # Merge mode: file 1 is read into %C1, keyed on column $k1; then every
   # non-comment line of file 2 whose key column $k2 matches a stored key is
   # printed as: key, the saved file 1 columns (@c1i), the requested file 2
   # columns (@c2i), joined by $od.
   # ------------------------------------------------------------------
   my ($fh1, $fh2);

   # Three-argument open so that characters in the file name are never taken
   # as an open mode or a pipe.  The defined() test is needed because a
   # three-argument open of an undef name would open an anonymous temporary
   # file instead of failing.
   (defined $f1 && open($fh1, '<', $f1))
      or die "Can't open the first file: $f1 or STDIN!\n";
   $TotLineCnt = 0; $UnCommented = 0;
   $UniqIndexCnt = 0;
   while (<$fh1>) {
      chomp;
      $TotLineCnt++;
      next if /^#/;   # comment line
      $UnCommented++;
      @L = split /$id1/;
      # --nocase: keys are compared in upper case.  The key field is changed
      # in place, so it is also stored upper-cased if it is among @c1i.
      if ($nocase == 1) {
         $L[$k1] = uc($L[$k1]);
      }
      my $key = $L[$k1];
      if (exists $C1{$key}) {
         # Duplicate key: the first occurrence is kept whether or not the
         # message is suppressed (previously --noerr let the duplicate
         # overwrite the first entry and be counted as unique).
         if ($noerr != 1) {
            print STDERR "\nERR: Keyword \"$L[$k1]\", line ", $TotLineCnt," already seen: ", $_, "\n";
         }
      } else {
         $UniqIndexCnt++;
         # Per-key record: [0] = 1 (seen marker), [1] = the key itself,
         # [2..] = the file 1 columns to output later.
         $C1{$key} = [ 1, $key, map { $L[$_] } @c1i ];
      }
   }
   close($fh1);
   print STDERR "\nERR:Total Lines: ", $TotLineCnt, "  Uncommented Lines: ", $UnCommented, "  Lines with Unique Keys: ", $UniqIndexCnt, "\n\n";

   # now the second file
   open($fh2, '<', $f2) or die "Can't open the second file: $f2!\n";
   # The counters are restarted for file 2; they are maintained but not
   # reported anywhere in this block.
   $TotLineCnt = 0;  $UnCommented = 0;
   $UniqIndexCnt = 0;

   while (<$fh2>) {
      chomp;
      $TotLineCnt++;
      if ($_ !~ /^#/) {
         $UnCommented++;
         @L = split /$id2/;

         if ($nocase == 1) {
            $L[$k2] = uc($L[$k2]);
         }
         my $key = $L[$k2];

         if (!exists $C1{$key}) {
            # Not a match.  exists() is used so that the lookup does not
            # create an entry in %C1 that would then have to be deleted.
            if ($noerr != 1) {
               print STDERR "ERR:Keyword \"$L[$k2]\", line ", $TotLineCnt, " not a match: ", $_, "\n";
            }
            if ($sync == 1) {   # keep output lines in step with file 2
               print "\n";
            }
         } else {
            # A match: key and saved file 1 columns first, then the file 2
            # columns.  join() leaves no delimiter after the last field, in
            # line with the single-input mode output.
            $UniqIndexCnt++;
            my @saved = @{ $C1{$key} };
            shift @saved;   # drop the seen marker, leaving key + file 1 columns
            print join($od, @saved, map { $L[$_] } @c2i), "\n";
         }
      } elsif ($sync == 1) {
         print "\n";   # comment lines also get a blank line when syncing
      }
   }
   close($fh2);
}