#!/usr/bin/perl -w

package go2msig;

#use lib "/usr/lib/local-lib/lib/perl5";

use strict;
use DBI ;
use Getopt::Long;
use Text::CSV;
use GO::Parser;
use MLDBM qw( DB_File Storable);
use Fcntl;
use Text::Wrap;


# The logical flow of this program when generating genesets is:
# 1) Process command line switches
# 2) Extract the relevant GO ontology term information into an internal data structure
# 3) Extract the gene associations into an internal data structure, mapping via the usermap file and the geneinfo table if required
# 4) Propagate the gene associations up the GO hierarchy
# 5) Merge identical gene sets and filter by size cutoffs
# 6) Generate a matrix containing the final genesets and descriptions
# 7) Output in the required format.

# Alternative functions are:
# 1) Generating a list of species available in the associated go or ncbi database
# 2) Generating a cached version of the ontology to speed up searches where the ontology source is a slow database or large obo file

# Initialise package vars
my %termwarn=();
my $graph;
my %children=();#$children{parent GO id} is a list ref, the list contains GO ids of the child terms. Populated by the get_term_data routines.
my %altids=();#$altids{GO id} contains the preferred id for the term in question. Populated by the get_term_data routines.
my %obsids=();# $obsids{GO id} is true if the GO term is obsolete. Populated by the get_term_data routines.
my %termname=();# $termname{GO id} contains the human readable name for the GO term. Populated by the get_term_data routines.
my %goids=();# $goids{GO identifier}{geneid} exists if geneid is associated with GO identifier. Initial population by the make_term_hash routines.
my %termlevel=();# $termlevel{GO id} contains the distance of GO term from the root.
my %termalreadyvisited=();
my %seenalready=();
my @outputmatrix=();
my $debug=0;
my $start=time();
my $go_dbh;
my $ncbi_dbh;
my $sth_geneid2symbol;

# Process command line switches
# Default settings
# See option verification code for allowed contents
my $ontology="mf,bp,cc";
my $assocsource="ncbi";
my $termsource="godb";
my $query="geneset";
my $evidence="IDA,IPI,IMP,IGI,IEP,ISS,TAS,EXP"; 
my $taxid=0;
my $genesetformat="gmt";
my $genekey="symbol";
my $nochildgenes=0 ;
my $maxgenes=700;
my $mingenes=10;
my $help;
my $assocfile="";
my $obofile="";
my $gouser="";
my $gopass="";
my $cacheroot="";
my $goconnector="";
my $repress;
my $mapfile="";

# Process user command line options
&GetOptions("help"=>\$help, "ontology=s"=>\$ontology,"query=s"=>\$query,"assocsource=s"=>\$assocsource, "termsource=s"=>\$termsource,"taxid=n"=>\$taxid,"evidence=s"=>\$evidence, "mapfile=s"=>\$mapfile, "format=s"=>\$genesetformat, "geneid=s"=>\$genekey, "nochild"=>\$nochildgenes, "maxgenes=n"=>\$maxgenes, "mingenes=n"=>\$mingenes,"assocfile=s"=>\$assocfile,"repress"=>\$repress,"obofile=s"=>\$obofile,"godb=s"=>\$goconnector,"gouser=s"=>\$gouser,"gopass=s"=>\$gopass, "cachefile=s"=>\$cacheroot) || exit;
if (defined $help) {&DisplayHelp();exit();}

my @ontologies=split(/,/,$ontology);
my @evlist=split(/,/,$evidence);

# Check for absence of required options where possible up front. Additional checks have to get done within main logic.
&CheckSwitchList(\@evlist,('all'=>1,'exp'=>1,'ic'=>1,'ida'=>1,'iea'=>1,'iep'=>1,'igc'=>1,'igi'=>1,'imp'=>1,'ipi'=>1,'isa'=>1,'ism'=>1,
'iso'=>1,'iss'=>1,'nas'=>1, 'nd'=>1,'nr'=>1,'rca'=>1,'tas'=>1, '!exp'=>1,'!ic'=>1,'!ida'=>1,'!iea'=>1,'!iep'=>1,'!igc'=>1,'!igi'=>1,'!imp'=>1,'!ipi'=>1, '!isa'=>1,'!ism'=>1,'!iso'=>1,'!iss'=>1,'!nas'=>1,'!nd'=>1,'!nr'=>1,'!rca'=>1,'!tas'=>1));
&CheckSwitchList(\@ontologies,('mf'=>1,'cc'=>1,'bp'=>1));
&CheckSwitch(\$assocsource,('ncbi'=>1,'godb'=>1,'affy'=>1,'agilent'=>1,'test'=>1,'gaf'=>1));
&CheckSwitch(\$termsource,('godb'=>1,'obofile'=>1,'cache'=>1));
&CheckSwitch(\$query,('geneset'=>1,'species'=>1,'makecache'=>1));
&CheckSwitch(\$genesetformat,('gmt'=>1,'gmx'=>1));
&CheckSwitch(\$genekey,('symbol'=>1,'id'=>1));

# Process evidence codes into a form that can be used in queries
my $allflag=0;
my @evcodes=();
my @negevcodes=();
foreach my $evcode (@evlist) {
  if ($evcode=~/^!/) {$evcode=~s/^!//;push @negevcodes,$evcode;} else {
    push @evcodes,$evcode;
    if ($evcode eq "all") {$allflag=1;}
  }
}
if (scalar @evcodes && scalar @negevcodes) {die "Can't mix ! with non-! evidence codes\n";}
my $evquery="";
if ($allflag) {
#  $evquery="";
} else {
  if (scalar @evcodes) {$evquery=uc("IN (\"".join("\",\"",@evcodes)."\")");} else {$evquery=uc("NOT IN (\"".join("\",\"",@negevcodes)."\")");}
}

# Connect with the required databases
# Good connector for external database at EBI is:
# -goconnector 'dbi:mysql:go_latest:mysql.ebi.ac.uk:4085;mysql_compression=1' -gouser 'go_select' -gopass 'amigo'
if ($assocsource eq "godb" || ($termsource eq "godb" && $query ne "species")) {&connect_to_go;} # essential, so errors will terminate execution
if ($assocsource eq "ncbi") {&connect_to_ncbi;} # errors will terminate execution

# Deal with the different types of query: species and makecache can be dealt with immediately
if ($query eq "species" && $assocsource eq "ncbi") {&getncbispecies;exit;}
if ($query eq "species" && $assocsource eq "godb") {&getgospecies;exit;}
if ($query eq "species") { die "Can only return species from ncbi or go databases\n";}
if ($query eq "makecache") {&make_term_cache;exit}


# At this point we know we're making a geneset
# Now we populate the primary data structures representing the GO ontology
if (!scalar @ontologies) {die "No ontologies listed to build geneset for\n";}
if ((($assocsource eq "godb") or ($assocsource eq "ncbi") or ($assocsource eq "gaf")) && $taxid==0) { die "No taxon supplied by user.\n";}

&get_term_data ;
my %roots=(mf=>'GO:0003674',bp=>'GO:0008150',cc=>'GO:0005575');
$termname{"GO:0003674"}="molecular_function";
$termname{"GO:0008150"}="biological_process";
$termname{"GO:0005575"}="cellular_component";
&debugtime("Got ontology data");

if ($assocsource eq "godb" || $assocsource eq "affy" || $assocsource eq "test" || $assocsource eq "agilent") {$genekey="id";} # for go database, affy sheet, or testfile mapping to symbol doesn't exist

# Read in mapping file if user has specified one
# The first column in this contains identifers that correspond to those in the gene association source
# The second column contains what we map these TO
# uses a hash of hashes so duplicate entries are not a problem
# currently this file is read in with case preserved
# the mapping code gets the ids out of the NCBI/GO database also with case preserved
# once mapping (and potentially entrez ID to symbol translation) is complete they are put into the data structure in upper case form

my %usermap=();my $line="";
if ($mapfile) {
  open (MAPFILE,$mapfile) || die "Couldn't open user map file $mapfile\n";
  while (defined ($line=<MAPFILE>)) {
    if ($line=~/(.*)\t(.*)/) {
      $usermap{$1}{$2}=1;
    } else { die "Can't parse user map file $mapfile. Should be tab separat
ed identifiers\n;"}
  }
  close MAPFILE;
}

# Here we obtain the gene association data, and carry out any relevant ID mapping
&debugtime("Time before make hash");
if ($assocsource eq "ncbi") {&make_term_hash_from_gene2go();} elsif ($assocsource eq "godb") {&make_term_hash_from_go();} elsif ($assocsource eq "affy") {&make_term_hash_from_affy();} elsif ($assocsource eq "gaf") {&make_term_hash_from_gofile();} elsif ($assocsource eq "agilent") {&make_term_hash_from_agilent();} else {&make_term_hash_from_testfile();}
&debugtime("Time after make hash");

# Here we propogate the gene associations up the tree so that we have both direct and indirect associations represented.
# Level calculation also occurs at this point
foreach $ontology (@ontologies) {
  &propagate($roots{$ontology},1); # This will set the already visited hash
} # end of ontologies loop
&debugtime("Time after propagate");

# Look for non sibling terms which are identical, and also filter for the size cutoffs.
&create_prematrix;

# Reorganise for output
&create_output_matrix;

# Generate msigdb formatted files in either gmx or gmt format
if ($genesetformat eq "gmx") {
  my $maxsetsize=0;
  foreach my $setref (@outputmatrix) {
    if (scalar(@$setref)>$maxsetsize) {$maxsetsize=scalar(@$setref);}
  }
  for (my $y=0;$y<$maxsetsize;$y++) {
    foreach my $setref (@outputmatrix) {
      if (defined ($setref->[$y])) {print $setref->[$y];}
      print "\t";
    }
    print "\n";
  }

} elsif ($genesetformat eq "gmt") {
# gmt format has gene sets in rows
  foreach my $setref (sort bygoid @outputmatrix) {
    print join("\t",@$setref)."\n";
  }

} else { die "Software error";}
exit();

sub bygoid {
  return $a->[0] cmp $b->[0];
}

#============================================================================================================
# get_term_data routines
# Code in this section extracts the basic ontology term relationship data for processing
# Each source type leads to the same set of hashes so that subsequent code is independent of source
# We put into hashes data about the term's status (obsolete or not), name, alternative ids, and children
# This may be used directly, or stored in a tied hash to speed up subsequent invocations of the program

# Get term data in order to build genesets
sub get_term_data {
  if ($termsource eq "godb") {&get_term_data_sql;return;}
  if ($termsource eq "obofile") {&get_term_data_obo;return;}
  if ($termsource eq "cache") {&get_term_data_cached;return;}
  die "Software error\n";
}

# Get term data and cache this to disk (to speed up geneset building with slow term data sources)
sub make_term_cache {
  &debugtime("Making cache for term data");
  if (!defined $cacheroot || $cacheroot eq "") { die "Need to specify cache files root name with -cachefile\n";}
  if ($termsource eq "cache") { die "Can't make cache from another cache\n";}
  &check_for_file("$cacheroot.children.cache");
  &check_for_file("$cacheroot.altids.cache");
  &check_for_file("$cacheroot.obsids.cache");
  &check_for_file("$cacheroot.termname.cache");
  tie %children,'MLDBM',"$cacheroot.children.cache";
  tie %altids,'MLDBM',"$cacheroot.altids.cache";
  tie %obsids,'MLDBM',"$cacheroot.obsids.cache";
  tie %termname,'MLDBM',"$cacheroot.termname.cache";
  if ($termsource eq "godb") {&get_term_data_sql;return;}
  if ($termsource eq "obofile") {&get_term_data_obo;return;}
  die "Software error\n";
}

sub check_for_file {
  my ($file)=@_;
  if (-e $file) { die "Cache files for $cacheroot already exist, please remove manually before rebuilding cache\n";}
}


# Get term data from a GO mysql database, either local or remote
# Optimised to work as fast as possible with a remote system
sub get_term_data_sql {
  &debugtime("Getting term data from go database");
  my $sth_obsolete=$go_dbh->prepare('select acc from term where is_obsolete=1');
 # my $sth_altid=$go_dbh->prepare('select term.acc as termacc, acc_synonym as synonymacc from term,term_synonym where synonym_type_id=25 and term.id=term_synonym.term_id');

  my $sth_altid=$go_dbh->prepare('select term.acc as termacc, acc_synonym as synonymacc from term,term_synonym where synonym_type_id=(select id from term where acc="alt_id") and term.id=term_synonym.term_id');

  my $sth_relns=$go_dbh->prepare('select term_a.acc as parent, term_b.acc as child, term_b.name from term term_a, term term_b, term2term where term1_id=term_a.id AND term2_id=term_b.id AND term_a.acc like "GO%" AND term_b.acc like "GO%"');
# Could do this in separate steps but it turned out to be slower
# my $sth_relns=$go_dbh->prepare('select term_a.acc as parent, term_b.acc as child from term term_a, term term_b, term2term where term1_id=term_a.id AND term2_id=term_b.id AND term_a.acc like "GO%" AND term_b.acc like "GO%"');
# my $sth_terms=$go_dbh->prepare('select acc,name from term');

  $sth_relns->execute();
  while (my $row=$sth_relns->fetchrow_hashref) {
    my $parent=$row->{"parent"};my $child=$row->{"child"};
    $termname{$child}=$row->{"name"};
#   if (exists $children{$parent}) { push @{$children{$parent}},$child ;} else { $children{$parent}=[$child];}
#   if (exists $children{$parent}) { $children{$parent}=[@{$children{$parent}},$child] ;} else { $children{$parent}=[$child];}
#   $temp is needed due to limitations with MLDBM
    if (exists $children{$parent}) { my $temp=$children{$parent};push @$temp,$child;$children{$parent}=$temp ;} else { $children{$parent}=[$child];}
  }

  $sth_altid->execute();
  while (my $row=$sth_altid->fetchrow_hashref) {
    $altids{$row->{"synonymacc"}}=$row->{"termacc"};
  }

  $sth_obsolete->execute();
  while (my $row=$sth_obsolete->fetchrow_hashref) {
    $obsids{$row->{"acc"}}=1;
  }

}


# Get term data from an OBO file
# We parse it into a graph object first
sub get_term_data_obo {
  &debugtime("Getting term data from obo file");
  my $parser = new GO::Parser({handler=>'obj'}); # create parser object
# $parser->litemode(1); #would speed up, but we don't get alt_id tags
  unless (-e $obofile) { die "Can't locate obo file $obofile\n";}
  $parser->parse($obofile); # parse file -> objects
  &debugtime("Parsed");

   $graph=$parser->handler->graph;
   my $terms = $graph->get_all_terms; 
   foreach my $term (@$terms) {
     if ($term->is_obsolete) {$obsids{$term->acc}=1;}
     else {
       $termname{$term->acc}=$term->name;
       my $rels = $graph->get_child_relationships($term->acc);
       foreach my $rel (@$rels) {
         if ($term->namespace ne $graph->get_term($rel->subject_acc)->namespace) { warn "parent ".$term->acc." and child ".$rel->subject_acc." are in different ontologies\n";}
#        if (exists $children{$term->acc}) { push @{$children{$term->acc}},$rel->subject_acc ;} else { $children{$term->acc}=[$rel->subject_acc];}
# @temp needed due to MLDBM limitations
if (exists $children{$term->acc}) { my $temp=$children{$term->acc};push @$temp,$rel->subject_acc;$children{$term->acc}=$temp ;} else { $children{$term->acc}=[$rel->subject_acc];}
       }
       my $synlist=$term->alt_id_list;
       foreach my $syn (@$synlist) {
         $altids{$syn}=$term->acc;
       }
     }
   }
}


# Get term data from a previously generated cache
sub get_term_data_cached {
  if ($debug) { warn "Getting term data from cache\n";}
  &fetch_cached(\%children,"$cacheroot.children.cache");
  &fetch_cached(\%altids,"$cacheroot.altids.cache");
  &fetch_cached(\%obsids,"$cacheroot.obsids.cache");
  &fetch_cached(\%termname,"$cacheroot.termname.cache");
}

# When using the tied hash as the source of the ontology data the process of building the geneset is
# significantly faster if the tied hash has been fully retrieved and copied into a new hash at the
# start rather than remaining tied throughout the code excution (even if opened readonly)
sub fetch_cached {
  my ($hashref,$filename)=@_;
  my %temphash=();
  tie %temphash,'MLDBM',$filename,O_RDONLY or die "Couldn't retrieve cachefile $filename\n";
  %{$hashref}=%temphash;
  untie %temphash;
  return;
}

#============================================================================================================
# Propagate child gene associations upwards to parent
#
# It turns out that starting at the root and following the links means that we don't hit any obsolete terms,
# even though some of these are still in the go and ncbi databases and are present in the goids hash
#
# This routine also works out how far terms are from the top of the go tree. Calculating this on its own is quick
# so there is no need to cache the level (as done by previous incarnations of this program).
# Hence the level calculation and gene propagation have been combined.
#
# In some instances we may get to a term a second time by a different route which is shorter. In that case
# we have to re-descend the tree to correct the levels of child terms. However there is no need to
# propagate gene associations as all terms below will have been previously completed.

sub propagate {
#  my ($termacc)=@_;
  my ($termacc,$level)=@_; #level is current distance from top
  if (!exists $termalreadyvisited{$termacc}) {
    $termlevel{$termacc}=$level ; #record distance from root
# Here the term is not done. So we have geneids attached to this term which are only direct ones
    if (exists $children{$termacc}) {
 # This iterates through a list of children, we get a list of genes back for each
      foreach my $rel (@{$children{$termacc}}) {
        my @childgenelist=&propagate($rel,$level+1);
        if (!$nochildgenes) { # add to hash if we are propagating genes
          foreach my $gene (@childgenelist) {
            $goids{$termacc}{$gene}=1;
          }
        }
      }
    }
  } else {
# We have already visited this term, so the genelist of this (and lower terms) is correct, but the level might need updating
    if ($level < $termlevel{$termacc}) { # re-entered with a shorter route from top
      $termlevel{$termacc}=$level;
      if (exists $children{$termacc}) {
        foreach my $rel (@{$children{$termacc}}) {
          &propagate($rel,$level+1);
        }
      }
    }
  }
  $termalreadyvisited{$termacc}=1;
  if (!exists $termlevel{$termacc}) { die "program error: termacc doesn't exist $termacc\n";}
  if (!exists $goids{$termacc}) { return ();} # seems not actually needed but wouldn't rely on the behaviour described below
  return keys(%{$goids{$termacc}}); # returns empty without erroring even in goids undefined (as in no genes case)
}

#============================================================================================================
# Code used to do final processing and formatting before output

# This routine is used to detect identical genesets and then compiles a hash used by the output generating code
# For identical genesets the description is a concatenation of the terms that apply to this geneset,
# in order of most general (closest to root term by shortest route) to least general.
# The link placed in the output spreadsheet is to the most general term.

sub create_prematrix {
  foreach my $termacc (sort by_level_then_id (keys %termalreadyvisited)) {
    my @genes=keys(%{$goids{$termacc}});
    if ((scalar @genes >=$mingenes) && (scalar @genes <=$maxgenes)) {

#generate a key for these genes
      my $key=join("",sort @genes)."\n";
  #    print "key for first set $key\n";die;
       if (!exists $seenalready{$key}) {
         $seenalready{$key}{"acc"}=$termacc;#record highest level term with this gene list
        # $seenalready{$key}{"names"}=&get_term_name($termacc)."($termacc)";
$seenalready{$key}{"names"}=&get_term_name($termacc)."(".$termlevel{$termacc}.")";
       } else {
       #  $seenalready{$key}{"names"}.="\\\\".&get_term_name($termacc)."($termacc)";# append next name
$seenalready{$key}{"names"}.="&".&get_term_name($termacc)."(".$termlevel{$termacc}.")";# append next name
       }
    }
}
}

sub by_level_then_id {
if (!defined($a)) { die "a is undefined in by level or id!\n";}
  if ($termlevel{$a} != $termlevel{$b}) { return $termlevel{$a} <=> $termlevel{$b};}
  return $a cmp $b;
}

#Put into a 'matrix' so that we can easily output as is, or transposed
sub create_output_matrix {
foreach my $key (keys %seenalready) {
  push @outputmatrix,[$seenalready{$key}{"names"},&get_term_desc($seenalready{$key}{"acc"}),sort keys(%{$goids{$seenalready{$key}{"acc"}}})];
}
}

# Goes into second column of geneset file - we need a url returned here for correct formatting of the GSEA output
sub get_term_desc {
  my ($termid)=@_;
 # return "Genes annotated by $termid";
 # return $apph->acc2name_h->{$termid};
  return "http://amigo.geneontology.org/cgi-bin/amigo/term-details.cgi?term=$termid";
}

# When using a db getting term then acc is very slow, acc2name much faster
# However currently we don't need to do this because we have the name in a hash
sub get_term_name {
  my ($termid)=@_;
  my $name=$termname{$termid};
  $name=~s/ /_/g; # required for correct formatting of GSEA output
  return $name;
}

#============================================================================================================
# make_term_hash routines
#
# These acquire the gene products associated with GO terms and populate the goids hash which has the form 
# $goids{GO term accession}{gene id/symbol}=1 if the gene product is associated to the term


# Get gene associations from a GO association file
# The direct approach below is over 50* faster than using go-perl to parse - using go-perl would preclude
# some of the larger GO association files.

sub make_term_hash_from_gofile {
  my ($version, $gene);

  &debugtime("Before assocfile parse");

  open (FILE,$assocfile) || die "Couldn't open GAF file '$assocfile'\n";

  while (defined(my $line=<FILE>)) {
    chomp $line;
    if ($line=~/^!gaf-version: 2.0\s*/ || $line=~/^!gaf-version: 1.0\s*/) {$version=$line;}
    if ($line!~/^!/){
      if (!defined $version) { die "Not a GAF file\n";}
      my @items=split(/\t/,$line);
      if ($items[12]=~/^taxon:(\d+)/i) {
        if ($1==$taxid) {
          if ($items[3]!~/^NOT$|\|NOT$|^NOT\||\|NOT\|/i) {
            if ($genekey eq "symbol") {$gene=$items[2];} else {$gene=$items[1];}
            if ($allflag || ((scalar @evcodes) && is_in_list(lc($items[6]),\@evcodes)) || ((scalar @negevcodes) && !is_in_list(lc($items[6]),\@negevcodes))) {
              &map_and_set_term($items[4],$gene);
#print "setting gene $gene go term".$items[4]." evidence".$items[6]."\n";
            }
          } #else { warn "NOT term detected $line\n"}
        }
      } #else { warn "Couldn't get taxon from GAF file\n$line\n"}
    }
  }

  &debugtime("Processed GAF");

}

# Get gene associations from a local mysql database populated with the NCBI gene2go table
sub make_term_hash_from_gene2go {
  if ($evquery) {$evquery="and evidence_qualifier $evquery";}
# my $sth=$ncbi_dbh->prepare("select * from gene2go where tax_id=? and go_ontology=? $evquery and go_term != \"NOT\"");
  my $sth=$ncbi_dbh->prepare("select go_id,gene_id from gene2go where tax_id=?  $evquery and go_term != \"NOT\"");

  $sth->execute($taxid);
  &debugtime("Finished query for gene2go term hash");
# Array ref is faster here than hash ref, noticeable for typical data sets
  while (my $row=$sth->fetchrow_arrayref) {
#   &map_and_set_term($row->{"go_id"},$row->{"gene_id"});
    &map_and_set_term($row->[0],$row->[1]);
  }
}

# Get gene associations from a mysql installation of the GO database
sub make_term_hash_from_go {
  if ($evquery) {$evquery="and evidence.code $evquery";}
  my $sth=$go_dbh->prepare("select distinct term.acc, gene_product.symbol from term,association,gene_product,species,evidence where term.id=association.term_id AND gene_product.id=association.gene_product_id and gene_product.species_id=species.id AND evidence.association_id=association.id $evquery and association.is_not=0 and species.ncbi_taxa_id=?");
  $sth->execute($taxid);
  while (my $row=$sth->fetchrow_hashref) {
    &map_and_set_term($row->{"acc"},$row->{"symbol"});
  }
}

# uses existing msigdb file to get associations, this was used as part of the debugging process
# and does not really have any application for real world use.
sub make_term_hash_from_testfile {
  if ($assocfile eq "") {die "Need to provide name of an annotation file with '-assocfile' switch\n";}
  open (FILE,$assocfile) || die "Couldn't open test file '$assocfile'\n";
  while (defined ($line=<FILE>)) {
    chomp $line;
    my @vars=split('\t',$line);
    if ($vars[1]=~/(GO:\d+)/) {
      my $term=$1;
      shift @vars;shift @vars;
      foreach my $gene (@vars) {
        &map_and_set_term($term,$gene);
      }
    } else {die "Couldn't read go term\n";}
  }
close FILE;
}

# Get associations from an Affymetrix annotation file
# Affy files via GEO have slight differences c.f. those direct from Affymetrix, including being tsv not csv.
my ($fmf,$fbp,$fcc,$fhead,$fgo);
sub make_term_hash_from_affy {
  my ($row,@row,$sv);
  if ($assocfile eq "") {die "Need to provide name of Affymetrix annotation file with '-assocfile' switch\n";}
  my $csv = Text::CSV->new ({binary=>1}) or die "Cannot use CSV: ".Text::CSV->error_diag ();
  my $tsv = Text::CSV->new ({binary=>1,'sep_char'=>"\t"}) or die "Cannot use CSV: ".Text::CSV->error_diag ();
  open my $fh, "<:encoding(utf8)", $assocfile or die "Couldn't open Affymetrix file '$assocfile'\n";
  while (defined ($line=<$fh>)) {
    if ($csv->parse($line)) {
      if (&affy_headingrow($csv->fields())) { $sv=$csv;last;}
    }
    if ($tsv->parse($line)) {
      if (&affy_headingrow($tsv->fields())) { $sv=$tsv;last;}
    }
  } 
  if (!defined $line) { die "Couldn't get headings\n";}

  $sv->column_names($sv->fields());
  while ($row=$sv->getline_hr($fh)) {
   my $termstring=$row->{$fmf}.$row->{$fbp}.$row->{$fcc};
   while (defined($termstring) && $termstring=~s/(\d\d\d\d\d\d\d)//) {
     &map_and_set_term("GO:$1",$row->{$fhead});
   }
  }
  close $fh;
}

# Uses regexp to allow some flexibility in column headings
# Also works with Affy annoation files as returned by GEO
sub affy_headingrow {
  my (@row)=@_;
  if ((lc($row[0]) ne "id") && (lc($row[0]) ne "probe set id")) {return 0;}
  $fhead=$row[0];
  foreach my $head (@row) {
    if ($head=~/molecular function/i) {$fmf=$head;}
    if ($head=~/biological process/i) {$fbp=$head;}
    if ($head=~/cellular component/i) {$fcc=$head;}
  }
  if (defined $fmf&&$fbp&&$fcc) {return 1;} else {return 0;}
}

# Get associations from an Agilent annotation file
sub make_term_hash_from_agilent {
  my $row;
  if ($assocfile eq "") {die "Need to provide name of Agilent annotation file with '-assocfile' switch\n";}
  my $csv = Text::CSV->new ({binary=>1,'sep_char'=>"\t"}) or die "Cannot use CSV: ".Text::CSV->error_diag ();
  open my $fh, "<:encoding(utf8)", $assocfile or die "Couldn't open Agilent file '$assocfile'\n";
  do {$row=$csv->getline($fh)} until (!defined $row || &agilent_headingrow($row));
  if (!defined $row) {die "Couldn't find appropriate column headings in Agilent sheet\n";}
  $csv->column_names(@$row);
  while ($row=$csv->getline_hr($fh)) {
   my $termstring=$row->{$fgo};
   while (defined($termstring) && $termstring=~s/(GO:\d\d\d\d\d\d\d)//) {
     &map_and_set_term($1,$row->{$fhead});
   }
  }
  close $fh;
}

# Uses regexp to allow some flexibility in column headings
sub agilent_headingrow {
  my ($row)=@_;
  if ((lc($row->[0]) ne "probeid") && (lc($row->[0]) ne "id")) {return 0;}
  $fhead=$row->[0];
  foreach my $head (@$row) {
    if (($head=~/^go$/i) || ($head=~/^go_id$/i)) {$fgo=$head;return 1;}
  }
  return 0;
}

# User map file can contain multiple synonyms for the gene identifiers obtained from the primary
# source (i.e. NCBI, GO, Affymetrix or Agilent). In that case the gene identifier gets expanded
# into each of the possible synonyms. Similarly, more than one gene identifier can be mapped
# to the same synonym, in which case they effectively get 'collapsed' to that synonym by virtue
# of the hash based representation of the gene associations.
# If the 'repress' flag is set and the gene id is not in the usermap then do not register the association.

sub map_and_set_term {
  my ($acc,$symbol)=@_;
#if ($symbol=~/available/i) { die "Program Error\n";}
  if (($genekey eq "symbol") && ($assocsource eq "ncbi")) {$symbol=(&geneid2symbol($symbol))};
  if (!exists $usermap{$symbol}) {
    if ($mapfile && defined $repress) {
     return;
    }
    &set_term($acc,$symbol);return;
  }
  foreach my $aliassymbol (keys %{$usermap{$symbol}}) {
    &set_term($acc,$aliassymbol);
  }
}

# $acc is the GO term acession. $symbol is gene symbol or Entrez ID.
# If this GO term ID is a synonym for a preferred term in the GO ontology, replace it with the preferred version of the term.
# Warn the user this is occuring (but only the first time for each id/symbol)
# Similarly warn if the association data includes obsolete terms, or contains terms not detected in the GO ontology
sub set_term {
  my ($acc,$symbol)=@_;

   if (exists $altids{$acc}) {
     if (!exists $termwarn{$acc}) {warn "replacing term $acc with preferred version ".$altids{$acc}."\n";$termwarn{$acc}=1;}# warn first time
     $acc=$altids{$acc}; # replace all occurences
   }
   elsif (exists $obsids{$acc} && !exists $termwarn{$acc}) { warn "gene associations include obsolete term $acc\n";$termwarn{$acc}=1;}
   elsif (!exists $termname{$acc} && !exists $termwarn{$acc}) { warn "gene associations include non existent term $acc\n";$termwarn{$acc}=1;}

   $goids{$acc}{uc($symbol)}=1;# GSEA expects symbols in genesets to be uppercase
}

sub is_in_list {
  my ($checkitem,$listref)=@_;
  foreach my $item (@$listref) {
    if ($item eq $checkitem) { return 1;}
  }
  return 0;
}

# convert a geneid to a genesymbol
sub geneid2symbol {
  my ($geneid)=@_;
  $sth_geneid2symbol->execute($geneid);
  my @row=$sth_geneid2symbol->fetchrow_array;
  if (!defined $row[0]) {warn "no symbol for $geneid\n";return "geneid_".$geneid;}
  return $row[0];
}


#============================================================================================================
# Database connection routines

# Try to connect to the specified GO database
# If no database specified then use defaults which will be for a local install
sub connect_to_go {
  if ($goconnector eq "") {
    $go_dbh = DBI->connect("dbi:mysql:mygo","gouser","amigo") or  die "Couldn't connect to default local go database\n";
    return 0;
  }
  $go_dbh = DBI->connect($goconnector,$gouser,$gopass) or  die "Couldn't connect to $goconnector as $gouser\n";
}

# Connect to local database populated with NCBI gene2go and geneinfo tables
# Also setup the geneid to symbol query
sub connect_to_ncbi {
 $ncbi_dbh = DBI->connect("dbi:mysql:bioannotation","gouser","amigo") or  die "Couldn't connect to local NCBI annotation database\n";
 $sth_geneid2symbol=$ncbi_dbh->prepare('select symbol from geneinfo where gene_id=?');
}

#============================================================================================================
# These routines are used to display the available species (i.e. with gene associations) in the database selected

# List the species present in the NCBI gene2go table
# Tries to translate ncbi taxa into names using go species data, but if a GO database not available, just display the taxon id
sub getncbispecies {
  if (!defined($go_dbh) || $go_dbh eq "") {
   eval {
     &connect_to_go;
   }
  }
  my $sth=$ncbi_dbh->prepare("select distinct tax_id from gene2go");
  $sth->execute();
  my @names=();
  while (my @row=$sth->fetchrow_array) {
    push @names,[$row[0],&gettaxonname($row[0])];
  }
  foreach my $name (sort {($a->[1])." ".$a->[2] cmp ($b->[1]." ".$b->[2])} @names) {
    print $name->[0]."\t".$name->[1]." ".$name->[2]."\n";
  }
}

# Get genus and species name for a given taxon from the go db if there is one
# Because we may use external GO database, need this to be a separate query, not a join
sub gettaxonname {
  my ($taxon)=@_;
  if (!defined($go_dbh) || $go_dbh eq "")  { return ("unknown","taxon");}
  my $sth_taxonquery=$go_dbh->prepare("select genus, species from species where ncbi_taxa_id=?");
  $sth_taxonquery->execute($taxon);
  my @row=$sth_taxonquery->fetchrow_array;
  if (!defined $row[0]) {$row[0]="";}
  if (!defined $row[1]) {$row[1]="";}
  return ($row[0],$row[1]);
}

# List the species present in the GO database
sub getgospecies {
  my $sth=$go_dbh->prepare("select distinct ncbi_taxa_id,genus,species from species, gene_product where species.id=gene_product.species_id");
  $sth->execute();
  &debugtime("Finished SQL query for GO db species retrieval");
  my @names=();
  while (my @row=$sth->fetchrow_array) {
    if (!defined $row[1]) {$row[1]="";}
    if (!defined $row[2]) {$row[2]="";}
    push @names,\@row;
  }

  foreach my $name (sort {($a->[1])." ".$a->[2] cmp ($b->[1]." ".$b->[2])} @names) {
    print $name->[0]."\t".$name->[1]." ".$name->[2]."\n";
  }

}

#============================================================================================================
# Input processing,debug and help routines

sub debugtime {
  my ($text)=@_;
  if ($debug) { warn "debug: ".$text." ".(time()-$start)." seconds\n";}
}

sub DisplayHelp {

  print wrap("","",<<ENDHELP);

VERSION 1.0 - 02/08/2012

For full details see: http://www.bioinformatics.org/go2msig/usage_and_install_guide.html

COMMAND LINE OPTIONS

-ontology [list of ontologies]: Takes a comma separated list of ontologies, possible values are 'cc', 'mf' and 'bp' for the cellular compartment, molecular function or biological process ontologies respectively. Default is 'cc,mf,bp'.

-assocsource ['ncbi'|'godb'|'affy'|'agilent'|'gaf'] : Specify source of gene association data. The options are a local mysql install of the ncbi gene2go table, a mysql install of the GO database (local or remote), an Affymetrix array annotation file, an Agilent array annotation file, or a GO gene annotation file. Default is 'ncbi'.

-assocfile [filename] The file containing the mapping between GO terms and probesets or genes. This is used with the -assocsource 'affy', 'agilent' or 'gaf' options.

-query ['geneset'|'species'|'makecache'] : 'geneset' will generate a geneset in msigdb formats. 'species' will return a list of species that have associated GO annotations in the database being searched. 'makecache' will generate a cache of the GO ontology from whichever termsource is selected, providing a dramatic speed up of future searches if using a slow database server or large OBO file as the term source. Default is 'geneset'.

-cachefile [filename] : Root of the filename to use for the cached file. Four files are generated, filename.termnames.cache, filename.children.cache, filename.obsids.cache and filename.altids.cache.

-termsource ['godb','obofile','cache'] : Specifies the source of the GO ontology hierarchy. Primary sources are a GO database, or an OBO file. If a cache has previously been generated using the -q makecache switch then 'cache' can be specified. Default is 'godb'.

-obofile [filename] : Name of the OBO file if using -termsource obofile.

-godb [database connector] : Connection string for the GO database if one is being used. This is used in conjunction with the -gouser and -gopass switches. The example switches for connection to the EBI implementation would be: -godb 'dbi:mysql:go_latest:mysql.ebi.ac.uk:4085;mysql_compression=1' -gouser 'go_select' -gopass 'amigo'. If not set on the command line go2msig defaults to using the standard local install of the GO db as described in the installation instructions.

-gouser [username] : Username for the GO mysql database.

-gopass [password] : Password for the GO mysql database.

-evidence [list of evidences codes] : takes a comma separated list of evidence codes which are searched for. This is ignored when using Affymetrix or Agilent annotation files as the association source. Can be 'all' for all codes. Can be negated by prefixing codes with !. Full list of codes is EXP, IC, IDA, IEA, IEP, IGC, IGI, IMP, IPI, ISA, ISM, ISO, ISS, NAS, ND, NR, RCA, TAS. Default is 'IDA, IPI, IMP, IGI, IEP, ISS, TAS, EXP'.

-taxid [ncbi taxon id] : The ncbi taxon number of the species/strain for which the geneset is being built.

-format ['gmt'|'gmx'] : Selects gene matrix format (gmx) or gene matrix transposed (gmt) format for the output. See the GSEA data format documentation for an explanation of these. Default is 'gmt'.

-maxgenes [maximum number of genes] : Genesets where the number of genes is greater than this value are excluded. Default value is 700.

-mingenes [minimum number of genes] : Genesets where the number of genes is less than this value are excluded. Default value is 10.

-nochild : If this option is set then only genes directly associated with GO terms are included in the geneset. If the option is not set then genes associated with child terms of the GO term in question will also be included in the set. Default is unset.

-mapfile: [mapfile name]: Optional file which contains tab separated key value pairs for mapping the identifiers (derived from the originating NCBI or GO database, or chip annotation file) in the final genesets to the value defined in the user supplied map file. The NCBI/GO value is used as the key. If no key exists and the -repress switch is NOT set, the existing value will be output. It the same key exists more than once in the mapping file with different values the original identifier will be expanded into each of the relevant values.

-repress: By default if an original gene identifier does not have an entry in the mapfile it is left untranslated. If the repress flag is used it will instead be removed from output. This can be used to extract single species gene sets from affymetrix arrays that contain probes for multiple species. Default is unset.

-geneid ['id'|'symbol'] : When obtaining associations from the ncbi gene2go table, output either the gene id (direct from the gene2go table), or translate the gene id to the gene symbol (using the geneinfo table). Alternatively if obtaining gene associations from an OBO file, use the symbol column, or the id column as the source of the gene identifier. Default is 'symbol'.

-help : display this message

EXAMPLES

Query NCBI for list of all species with go annotations:
go2msig -assocs ncbi -q species

Make gene sets for all 3 ontologies, using a local GO database for the term source and association source, taxon 101510, all evidence codes:
go2msig -assocs godb -ont mf,cc,bp -tax 101510 -e ALL

Make gene sets for molecular function ontology for taxon 9606 using a local install of the NCBI tables as the association source, a local install of the GO db as the term souce, all evidence codes except IEA:
go2msig -assocs ncbi -ont mf -tax 9606 -e '!IEA'

As above but using an obo file (gene_ontology_edit.obo.2008-02-01 in this case) as the term source:
go2msig -assocs ncbi -termsource obofile -obofile gene_ontology_edit.obo.2008-02-01 -ont mf -tax 9606 -e '!IEA'

ENDHELP
}

# Check that an input option contains only allowed values, and make the text lower case
sub CheckSwitch {
  my ($switchref,%allowed)=@_;
  $$switchref=lc($$switchref);
  if (exists $allowed{$$switchref}) { return 1;} else { die "Illegal option value '".$$switchref."'\n";}
}

# Check that a list generated from an input switch contains only allowed values without repetition, make all letters lower case
# remove spaces on the start and end of the values
sub CheckSwitchList {
  my ($switchlistref,%allowed)=@_;
  my %visited=();
  for (my $i=0;$i < (scalar @$switchlistref); $i++) {
    my $value=lc($switchlistref->[$i]);
    $value=~s/^ *//;$value=~s/ *$//;
    $switchlistref->[$i]=$value;
    if (exists $visited{$value}) { die "Option value $value appears more than once\n";}
    if (!exists $allowed{$value}) { die "Illegal option value '$value'\n";}
    $visited{$value}=1;
  }
}




