#!/usr/bin/perl
#
#
$PGM = $0;                      # name of program
$PGM =~ s#.*/##;                # remove part up to last slash
@args = @ARGV;                  # arguments to program
$| = 1;                         # flush after all prints
$SIG{'INT'} = 'cleanup';        # interrupt handler

#Defining all the variables
my $filename = ""; 	#store the file name entered by the user
my @filedata = ();	#store the file data here
my $in_sequence = 0; 	#Have not seen the sequence yet
my $line = "";	#variable to store file line of file data, respectively.

##################
$usage = <<USAGE;		# usage message
  USAGE:
	$PGM  -i <feature file> -g <gene_file_name> -start <start> -end <end> 
	      [-up <no. of bases up>] [-down <no. of bases down>]
               
	      

	-i <feature file>	feature locations on genome in .psl format
	-g <gene file name>	Gene annotations
	-start <start>	Start of the search space (options are:up-k-bases, GeneStart, GeneEnd)
	-end <end>	End of search space (Options are:GeneStart, GeneEnd, down-k-bases)
	-up <no. of bases up>	Number of bases to search upstream of a gene start (default=zero bases)
	-down <no. of bases up>	Number of bases to search downstream of a gene end (default=zero bases) 
	-h 	Prints out the USAGE message.

	IMP. NOTE: This program outputs STDOUT 

        Copyright
        (2009) Queensland Institute of Medical Research
        All Rights Reserved.
        Author: Sonika Tyagi 
USAGE

# Defaults
$upBases = 0;
$downBases = 0;

$nargs = 8;			# number of required args
if ($#ARGV+1 < $nargs) { &print_usage("$usage", 1); }

# get input arguments
while ($#ARGV >= 0) {
  $_ = shift;
  if ($_ eq "-h") {				# help
    &print_usage("$usage", 0);
  } elsif ($_ eq "-i") {			# primer file name
    $primer_file_name = shift;
  } elsif ($_ eq "-g") {			# gene file 
    $gene_filename = shift;
  } elsif ($_ eq "-up") {			# number of bases upstream
    $upBases = shift;
  } elsif ($_ eq "-down") {			# number of bases downstream
    $downBases = shift;
  } elsif($_ eq "-start") {			#search within Gene start and end
    $searchStart = shift;
  } elsif($_ eq "-end") {			#search within Gene start and end
    $searchEnd = shift;
  } else {
    &print_usage("$usage", 1);
  }
} #ends while
#########################
#### Reading in the gene info (UCSC gene annotation)#########
open(GENE, $gene_filename) || die ("ERROR!!!!\n\nCould not find $gene_filename\n");
@gene_data = <GENE>;
close(GENE);
#initialize the variables
(@GeneStart, @GeneEnd, @mrnaAcc) = ();
(%gnameHash,%chrHash, %ExstHash, %ExenHash, %strandHash, %ExonNumber)=();
#geneName   name    chrom   strand  txStart txEnd   cdsStart     cdsEnd  exonCount   exonStarts    exonEnds
foreach $geneline (@gene_data) {
  chomp($geneline);
  @getdata = ();
  if($geneline !~ /^#/ and $geneline !~ /^$/ and $geneline !~ /chr.(.)?_/) { #also, ignoring the gene's '>' or '<' coord
    @getdata = split(/\t/,$geneline);
    push (@GeneStart, $getdata[4]);
    push (@GeneEnd, $getdata[5]);
    push (@cdsStart, $getdata[6]);
    push (@cdsEnd, $getdata[7]);
    push (@mrnaAcc, $getdata[1]);
    $strandHash{$getdata[1]} = $getdata[3]; #storing gene strand
    $gnameHash{$getdata[1]} = $getdata[0]; #storing gene names
    $chrHash{$getdata[1]} = $getdata[2]; #storing chromosome name 
    $ExstHash{$getdata[1]} = $getdata[9]; #Exon starts coord separated by comma
    $ExenHash{$getdata[1]} = $getdata[10]; #Exon ends coord separated by comma 
    $ExonNumber{$getdata[1]} = $getdata[8]; #total number of exons in the given gene
  } #end if line is a comment or empty line
}# end foreach gene line

#create references to arrays
$ref_gs= \@GeneStart; 
$ref_ge = \@GeneEnd;
$ref_cs= \@cdsStart; 
$ref_ce = \@cdsEnd;
$ref_accn = \@mrnaAcc;
#
####<............ Read in the primer location data .psl format ........>
($reftstart,$reftend,$refStrand,$refQname,$refTname)=readPrimers($primer_file_name);
### < done... reading primer data
search($ref_gs,$ref_ge,$reftstart,$reftend,$refStrand,$refQname,$refTname,$upBases,$downBases,$ref_cs,$ref_ce,$ref_accn,$searchStart,$searchEnd);

sub search{
  my ($ref_gs,$ref_ge,$reftstart,$reftend,$refStrand,$refQname,$refTname,$upBases,$downBases,$ref_cs, $ref_ce,$ref_accn,$searchStart,$searchEnd)=@_;
  @sort_gs=@$ref_gs;
  @sort_ge=@$ref_ge;
  @Tstart = @$reftstart;
  @Tend = @$reftend;
  @Accn = @$ref_accn;
  @Tstrand = @$refStrand;
  @qname = @$refQname;
  @tname = @$refTname;
  @cds_st = @$ref_cs;
  @cds_en = @$ref_ce;
  my ($i, $j) = 0;
  print "#Qname,Chr,FeatStart,FeatEnd,ChrStrand,GeneStart,GeneEnd,GeneStrand,GeneName,AccnNo,Map-on-gene(from-to)\n#\n";
  for ($i=0; $i<=$#Tstart; $i++){
    for ($j=0; $j<=$#sort_gs; $j++) {
      if($chrHash{$Accn[$j]} eq $tname[$i]){
        if($Tstart[$i] >= ($sort_gs[$j]-$upBases) and $Tstart[$i] <= ($sort_ge[$j]+$downBases)) {
          $locFrom = $locTo = $loc_up1 = $loc_up2 = $loc_down1 = $loc_down2 = $loc_from1 =$loc_from2=$loc_to1=$loc_to2= "NA";
###search spaces
	  if(($searchStart eq 'up-k-bases' and $searchEnd eq 'GeneEnd') or ($searchStart eq 'up-k-bases' and $searchEnd eq 'down-k-bases') or ($searchStart eq 'GeneStart' and $searchEnd eq 'GeneEnd') or ($searchStart eq 'GeneStart' and $searchEnd eq 'down-k-bases')){
	    ($locFrom,$locTo)=get_exon_match($ExstHash{$Accn[$j]}, $ExenHash{$Accn[$j]},$Tstart[$i],$Tend[$i],$sort_gs[$j], $sort_ge[$j],$cds_st[$j],$cds_en[$j]);
	  }
	  if(($searchStart eq 'up-k-bases' and $searchEnd eq 'GeneEnd') or ($searchStart eq 'up-k-bases' and $searchEnd eq 'down-k-bases') or ($searchStart eq 'up-k-bases' and $searchEnd eq 'GeneStart')){
  	    if($upBases >=100) {
	      $upstream = $sort_gs[$j] - $upBases; 
	      ($loc_up1,$loc_up2) = search_ends('up',$Tstart[$i],$Tend[$i],$upstream, $sort_gs[$j]);
              if($loc_up1 ne 'NA') {$loc_from1 = 'upstream'.$loc_up1;}
              if($loc_up2 ne 'NA') {$loc_to1 = 'upstream'.$loc_up2;}
	    } else {
 		  print STDERR "Please provide upstream and downstream length of at least 100 bases\n"; 
    	  	  &print_usage("$usage", 1);
	    }
	  }
	  if(($searchStart eq 'GeneEnd' and $searchEnd eq 'down-k-bases') or ($searchStart eq 'up-k-bases' and $searchEnd eq 'down-k-bases') or ($searchStart eq 'GeneStart' and $searchEnd eq 'down-k-bases')){
	    if($downBases >=100) {
	      $downstream = $sort_ge[$j] + $downBases;
	      ($loc_down1,$loc_down2) = search_ends('down',$Tstart[$i],$Tend[$i],$sort_ge[$j], $downstream);
              if($loc_down1 ne 'NA') {$loc_from2 = 'downstream'.$loc_down1;}
              if($loc_down2 ne 'NA') {$loc_to2 = 'downstream'.$loc_down2;}
	    } else {
 		  print STDERR "Please provide upstream and downstream length of at least 100 bases\n"; 
    	  	  &print_usage("$usage", 1);
	    }
	  }
#print "$loc_from1,$loc_from2,$locFrom TO $loc_to1, $loc_to2, $locTo\n";
    	  if($loc_from1 ne 'NA'){
	    $locFrom = $loc_from1;
	  }elsif($loc_from2 ne 'NA') {
	    $locTo = $loc_from2;
	  }
    	  if($loc_to1 ne 'NA'){
	    $locTo = $loc_to1;
	  }elsif($loc_to2 ne 'NA') {
	    $locFrom = $loc_to2;
	  }
      #####If feature is on -ve strand reverse the exon/intron counting #####################
 	if($strandHash{$Accn[$j]} eq '-' and $locFrom =~ /(Exon|Intron)/) {
          ($name,$exnum) = split(/_/,$locFrom);
          if($name =~ /Exon/){$new_exon_num = ($ExonNumber{$Accn[$j]} - $exnum )+1;}
	  elsif($name =~ /Intron/){$new_exon_num = $ExonNumber{$Accn[$j]} - $exnum;}
          $locFrom = $name.'_'.$new_exon_num;
        } #end if strand -ve
 	if($strandHash{$Accn[$j]} eq '-' and $locTo =~ /(Exon|Intron)/) {
          ($name,$exnum) = split(/_/,$locTo);
          if($name =~ /Exon/){$new_exon_num = ($ExonNumber{$Accn[$j]} - $exnum )+1;}
	  elsif($name =~ /Intron/){$new_exon_num = $ExonNumber{$Accn[$j]} - $exnum;}
          $locTo = $name.'_'.$new_exon_num;
        } #end if strand -ve
                        ########## Print the out put here.....>
   	  print  "$qname[$i],$tname[$i],$Tstart[$i],$Tend[$i],$Tstrand[$i],$sort_gs[$j],$sort_ge[$j],$strandHash{$Accn[$j]},$gnameHash{$Accn[$j]},$Accn[$j],$locFrom..$locTo\n";
        }#if
      } #if chr names are the same
    } #for j
  } #for i
} #end search
######<.......................................>
sub search_ends{
  my(
  $tag,
  $start_i,
  $end_i,
  $first,
  $last
  )=@_;
  my $bases1=$bases2="NA";
  if($start_i >= $first and $start_i <= $last){
    $bases1 = $start_i>$last?$start_i-$last:$last-$start_i;
  }
  if($end_i >= $first and $end_i <= $last){
    $bases2 = $end_i>$last?$end_i-$last:$last-$end_i;
  }
#print "$start_i,$end_i, $first,$last, bases1 = $bases1, bases2=$bases2\n";
return($bases1,$bases2);
}#end sub search_ends

sub get_exon_match{
  my(
  $exst_line,
  $exen_line,
  $start_i,
  $end_i,
  $gene_st,
  $gene_en,
  $cds_st,
  $cds_en
  )=@_;
  my (@ExonSt, @ExonEn,@tmp_cds) = ();
  my $loc_from = $loc_to = 'NA';
  $start_i = int($start_i);
  @ExonStart = split (/\,/, $exst_line); #Split exon coordinates
  @ExonEnd = split (/\,/, $exen_line); #Split exon coordinates
#### to get the start of match span
my $cnt1 = $cnt2 = 0;
  if($cds_st > $gene_st ) {
    if ($start_i >= $cds_en and $start_i <= $gene_en) {
      $loc_from = "3UTR"; #last;
#print "start=$start_i, $loc_from\n";
    } elsif ($start_i >= $gene_st and $start_i <= $cds_st) {
      $loc_from = "5UTR"; #last; 
#print "start=$start_i, $loc_from\n";
    } elsif ($start_i > $cds_st and $start_i <= $ExonEnd[0]) {
      $loc_from = "Exon_1"; #last; 
#print "start=$start_i, $loc_from\n";
    } else {
      for($c=1; $c<=$#ExonStart; $c++) {
	$cnt1 = $c - 1;
	$cnt2 = $c + 1;
        if($start_i >= $ExonStart[$c] and $start_i <= $ExonEnd[$c]) {
	  $loc_from = "Exon_".$cnt2; #last;
#print "start=$start_i, $loc_from\n";
        } elsif ($start_i >$ExonEnd[$cnt1] and $start_i < $ExonStart[$c]) {
	  $loc_from = "Intron_".$c;
#print "start=$start_i, $loc_from\n";
        }
      } #for cnt 
    }
  } elsif( $cds_st == $gene_st or $cds_st == $gene_en) {
      for($c=0; $c<$#ExonStart; $c++) {
	$cnt2 = $c + 1;
        if($start_i >= $ExonStart[$c] and $start_i <= $ExonEnd[$c]) {
	  $loc_from = "Exon_".$cnt2; #last;
#print "start=$start_i, $loc_from\n";
        } elsif ($start_i >$ExonEnd[$c] and $start_i < $ExonStart[$cnt2]) {
	  $loc_from = "Intron_".$cnt2;
#print "starti=$start_i, $loc_from\n";
      } #for cnt 
  }
}

#### to get the end of match span
my $cnt1 =  $cnt2 = 0;
  if($cds_st > $gene_st ) {
    if ($end_i >= $cds_en and $end_i <= $gene_en) {
      $loc_to = "3UTR"; #last;
#print "endi=$end_i, $loc_to\n";
    } elsif ($end_i >= $gene_st and $end_i <= $cds_st) {
      $loc_to = "5UTR"; #last; 
#print "endi=$end_i, $loc_to\n";
    } elsif ($end_i > $cds_st and $end_i <= $ExonEnd[0]) {
      $loc_to= "Exon_1"; #last; 
#print "endi=$end_i, $loc_to\n";
    } else {
      for($c=1; $c<=$#ExonStart; $c++) {
	$cnt1 = $c - 1;
	$cnt2 = $c + 1;
        if($end_i >= $ExonStart[$c] and $end_i <= $ExonEnd[$c]) {
	  $loc_to= "Exon_".$cnt2; #last;
#print "endi=$end_i, $loc_to\n";
        } elsif ($end_i >$ExonEnd[$cnt1] and $end_i < $ExonStart[$c]) {
	  $loc_to= "Intron_".$c;
#print "endi=$end_i, $loc_to\n";
        }
      } #for cnt 
    }
  } elsif( $cds_st == $gene_st or $cds_st == $gene_en) {
      for($c=0; $c<$#ExonStart; $c++) {
	$cnt2 = $c + 1;
        if($end_i >= $ExonStart[$c] and $end_i <= $ExonEnd[$c]) {
	  $loc_to = "Exon_".$cnt2; #last;
#print "endi=$end_i, $loc_to\n";
        } elsif ($end_i >$ExonEnd[$c] and $end_i < $ExonStart[$cnt2]) {
	  $loc_to= "Intron_".$cnt2;
#print "endi=$end_i, $loc_to\n";
        }
      } #for cnt 
  }
return ($loc_from, $loc_to);
} #end of the subroutine

#####################################
sub readPrimers {
  my $primerfile = shift;
  open(PM, $primerfile) || die ( "ERROR!!!\nCould not find $primerfile\n\n");
  my @filedata = <PM>;
  my(@tStart,@tEnd,@Strand,@Qname,@Tname)=();
  close(PM);
  $i=0;
  foreach $l (@filedata) {
    if($l =~ /^\d/) {
      ($match,$mism,$repm,$ncnt,$qins, $qinsbase, $tins, $tinsbase, $strand, $qname, $qsize,$qstart, $qend, $tname, $tsize, $tstart, $tend, $blkcnt, $blksize, $qstarts,$tstarts) = split(/\t/,$l);
    #  print "$match,$mism,$repm,$ncnt,$qins, $qinsbase, $tins, $tinsbase, $strand, $qname, $qsize,$qstart, $qend, $tname, $tsize, $tstart, $tend, $blkcnt, $blksize, $qstarts,$tstarts\n"; exit;
      push(@tStart,$tstart);
      push(@tEnd,$tend);
      push(@Strand, $strand);
      push(@Qname, $qname);
      push(@Tname, $tname); ##chromosome name in our case
      $hash{$tname}{$tStart[$i]}=$tstart; $i++;
    }
  }#end foreach l
return (\@tStart, \@tEnd,\@Strand,\@Qname,\@Tname);
}#end read primer subroutine
#
#
#
################################################################################
#
#       print_usage
#
#	Print the usage message and exit.
#
################################################################################
sub print_usage {
  my($usage, $status) = @_;
  if (-c STDOUT) {			# standard output is a terminal
    open(C, "| more");
    print C $usage;
    close C;
  } else {				# standard output not a terminal
    print STDERR $usage;
  }
  exit $status;
}
################################################################################
