DSC script benchmark.dsc


# This DSC can be executed from: https://github.com/gaow/dr-tree/tree/master/analysis
# To run the entire benchmark:
# dsc exec benchmark.dsc -j8
# To run part of the benchmark, here are a few highlights
# dsc exec benchmark.dsc -j8 -s "SimTree * SimBM * SklearnDR * PlotPY"
# dsc exec benchmark.dsc -j8 -s "SimTree * SimBM * MST * PlotR[1]"
# dsc exec benchmark.dsc -j8 -s "SimTree * SimBM * (RDR[2], RDR[3]) * PlotR[2]"
# dsc exec benchmark.dsc -j8 -s "SimTree * SimBM * (RDR[4] * PlotR[3], RDR[5] * PlotR[4])"
# dsc exec benchmark.dsc -j8 -s "ToyTreeBM * (RDR[4] * PlotR[3], RDR[5] * PlotR[4])"
# To run the benchmark without calculating the population covariance:
# dsc exec benchmark.dsc -j8 -s "(SimTree * PlotTree * SimBM, ToyTreeBM) * (SklearnDR * PlotPY, MST * PlotR[1], (RDR[2], RDR[3]) * PlotR[2], (RDR[1], RDR[4], RDR[5], RDR[6]) * PlotR[3], RDR[7] * PlotR[4], (RDR[8], RDR[9]) * PlotR[5], (RDR[1], RDR[2], RDR[3], RDR[4], RDR[5], RDR[6], RDR[7]) * MST * PlotR[1])"

###
# Simulate bifurcated tree structure with data points in between nodes
###
SimTree:
  exec: BiTree.R
  seed: 31415
  params:
    n: 2, 8
    t: 30
    m: 100
    unify_branch: 1
  return: tree = R(res$data),
          nodes = R(res$all_nodes),
          X_labels = R(res$labels)

###
# Plot tree structure
###
PlotTree:
  exec: TreePlot.R
  params:
    tree: $tree
    treeplot: File(pdf)
  return: treeplot

###
# Calculate covariance matrix for the tree structure
###
CalcVcv:
  exec: VcvTree.R
  seed: 31415
  params:
    tree: $tree
    positions: $nodes
  return: CX

###
# Generate samples from covariance
###
Vcv2BM:
  exec: VcvBM.R
  seed: 31415
  params:
    CX: $CX
    X_labels: $X_labels
    mu: 0
    sigma: 0.1
    k: 100
    scalar: 1
    type_I_noise: 0, Asis(sigma)
    type_II_noise: 0, Asis(k * 2)
  return: X

###
# Simulate BM directly from tree structure
###
SimBM:
  exec: IntermBM.R
  seed: 31415
  params:
    tree: $tree
    all_nodes: $nodes
    Nsim: 100
    diffusion: 1
    precision: 1000
    type_I_noise: 0, Asis(diffusion)
    type_II_noise: 0, Asis(Nsim * 2)
  return: X

###
# Simulate simple tree structure and sample Brownian motion on it
# - Linear edge
# - Y-shaped tree
###
ToyTreeBM:
  exec: LinearBM.R, YShapedBM.R
  seed: 31415
  params:
    m: 100
    t_scale: 1
    Nsim: 100
    diffusion: 1
    precision: 1000
    type_I_noise: 0, Asis(sqrt(diffusion/precision))
    type_II_noise: 0, Asis(Nsim * 2)
  return: X

###
# Apply different methods from sklearn package
# - global methods, local methods (manifold learning)
###
SklearnDR:
  exec: Combo(manifold.py, $(manifold_methods))
  seed: 31415
  params:
    data: $X
    label: (I10, I12, I11, I1, I2, I13, I3, I4, I14, I5, I6, I15, I7, I8)
    n_comp: 2
    n_neighbors: 50
  return: projection, properties

###
# Generate 2D plots for Python based methods
###
PlotPY:
  exec: PlotDR.py
  params:
    data: $projection
    properties: $properties
    vizplot: File(pdf)
  return: vizplot

###
# Apply methods in R packages:
# - minimimum spanning tree via Ape package
# - sparse PCA via PMA package
# - sparse factor analysis via sfa wrapped in R
###
MST:
  exec: ApeMST.R
  seed: 31415
  params:
    X: $X
    label: (I10, I12, I11, I1, I2, I13, I3, I4, I14, I5, I6, I15, I7, I8)
  return: projection, properties, X

RDR(MST):
  exec: FlashK.R, PmaPCA.R, PCA.R, SFA.R, PMD.R, PosPMD.R, CountCluster.R, NeighborJoining.R, FastME.R
  params:
    exec[2]:
      sumabsv: 1, Asis(NULL)
      K: 5
      orth: 1, 0
    exec[4]:
      K: Asis(NULL), 9
      model: -mn
      rng: 31415
    exec[1,5,6]:
      K: Asis(NULL), 9
    exec[7]:
      K: Asis(NULL), 9
      tol: 1

###
# Generate 2D plots for R based methods 
###
PlotR:
  exec: PlotMST.R, PlotPCA.R, PlotFA.R, PlotCountCluster.R, PlotApeTree.R
  seed: 31415
  params:
    X: $projection
    properties: $properties
    vizplot: File(pdf)
    exec[1]:
      igraph_layout: $(igraph_layouts)
  return: vizplot

DSC:
  run: ((SimTree * PlotTree) * (CalcVcv * Vcv2BM, SimBM), ToyTreeBM) *
       (SklearnDR * PlotPY,
       MST * PlotR[1],
       (RDR[2], RDR[3]) * PlotR[2],
       (RDR[1], RDR[4], RDR[5], RDR[6]) * PlotR[3],
       RDR[7] * PlotR[4],
       (RDR[8], RDR[9]) * PlotR[5],
       (RDR[1], RDR[2], RDR[3], RDR[4], RDR[5], RDR[6], RDR[7]) * MST * PlotR[1])
  R_libs: liamrevell/phytools, ape, adephylo, MASS, igraph, PMA, ggplot2,
          reshape2, maptpx, kkdey/CountClust, stephens999/ashr (1.0.0+)
  exec_path: output/src/simulate, output/src/analyze, output/src/plot
  lib_path: output/src/simulate, output/src/analyze, output/src
  output: output/20160628
  parameters:
    manifold_methods: LLE_standard, LLE_ltsa, LLE_hessian, LLE_modified,
                      isomap, mds, spectral_embedding, t_sne, pca
    igraph_layouts: layout_as_star, layout_as_tree, layout_in_circle,
                    layout_nicely, layout_on_grid, layout_on_sphere, layout_randomly,
                    layout_with_dh, layout_with_drl, layout_with_fr, layout_with_gem,
                    layout_with_graphopt, layout_with_kk, layout_with_lgl,
                    layout_with_mds
                    # unusable: layout_as_bipartite, layout_with_sugiyama
SimTree

## Generate tree structure
## 1. Starts off with a bifurcate tree of
## terminal nodes n and time t 
## and the probability of speciation b = (log(n) - log(2)) / t
## The output will be a tree object:
# > names(tree) # an object of class "phylo"
# [1] "edge""Nnode"       "tip.label"   "edge.length"
## 2. Want m points in between each pair of nodes
## For each non-root node, generate m data points on the branch via
## i + runif(m) where i is the ID of a node
## and the position are upstream with respect to i.

simulate_bitree <- function(n, t, m, unify_branch = FALSE) {
  tmp_n <- n
  if (n == 2) n <- 3
  b <- (log(n) - log(2)) / t
  k <- n * 2 - 1
  tree <- phytools::pbtree(b = b, n = n, t = t, type = "discrete")
  if (unify_branch) ## Reset all branch length to 1
    tree$edge.length <- rep(1, length(tree$edge.length))
  ## Adjust tip label
  tree$tip.label <- sapply(strsplit(tree$tip.label, split='t', fixed=T), function(x) x[2])
  if (tmp_n == 2) {
    ## A hack to simulate tree with only 2 tips
    tree$edge <- matrix(c(3,3,1,2), 2, 2)
    tree$Nnode <- 1 
    tree$tip.label <- c("1", "2")
    tree$edge.length <- tree$edge.length[3:4]
  }
  ## Generate positions in between nodes
  n_nodes <- tree$Nnode + length(tree$tip.label)
  positions <- vector()
  for (i in 1:n_nodes) {
    if (i != (length(tree$tip.label) + 1)) ## exclude the root here
      positions <- append(positions, i + runif(m)) 
  }
  ## Get proper node labelling
  group <- as.integer(positions)
  X_labels <- vector()
  for (i in 1:length(group)) {
    if (group[i] <= length(tree$tip.label)) {
      X_labels[i] <- tree$tip.label[group[i]]
    } else {
      X_labels[i] <- as.character(group[i])
    }
  }
  return(list(data = tree, all_nodes = positions, labels = X_labels))
}
res <- simulate_bitree(n, t, m, unify_branch)

PlotTree

plot_tree <- function(tree, filename, w = 8, h = 6) {
  pdf(filename, w, h)
  ape::plot.phylo(tree, show.tip.label=F,
             show.node.label = F, edge.color="#5B9279", edge.width=4)
  ape::nodelabels(bg = "#FFFFFF", frame = 'circle')
  # labeling is tricky here ... need to specify tip.label!
  ape::tiplabels(tree$tip.label, frame = 'circle', bg = "#F0C808")
  dev.off()
}
plot_tree(tree, treeplot)

CalcVcv

# Calculate variance/covariance matrix of data points along a tree

## 1 Create a matrix of pairwise elements of input data points
## then each entry of these matrix is covariance between a pair of data points, (i, j)
## 2. For each entry (i, j) calculation covariance:
## 2.1 when int(i) == int(j): the covariance is `adephylo::distRoot(tree, int(i)) - (max(i, j) - int(i))`
## 2.2 when i != j: `adephylo::distRoot(tree, ape::mrca(tree, full = T)[int(i), int(j)]) - foo(i - int(i), j - int(j)))`
## where foo returns: if i on j's path to MRCA then is i - int(i); if j on i's path to MRCA then is j - int(j); else zero
## It works flawlessly except for being way too slow
## \input: tree, positions
## This script works with DSC. Please load tree and positions data when you run it independently

suppressMessages(require(adephylo, quietly = T, warn.conflicts = F))
suppressMessages(require(ape, quietly = T, warn.conflicts = F))
is_subset <- function(x, y) any(apply(embed(y,length(y)-length(x)+1),2,identical,x))
get_covij <- function(tree, i, j) {
if (as.integer(i) != as.integer(j)) {
  covij <- as.vector(distRoot(tree, mrca(tree, full = T)[as.integer(i), as.integer(j)]))[1]
  path_i <- nodepath(tree, from = as.integer(i), to = length(tree$tip.label) + 1)
  path_j <- nodepath(tree, from = as.integer(j), to = length(tree$tip.label) + 1)
  ## j contains i or i contains j
  if (length(path_j) > length(path_i)) {
    covij <- covij - (i - as.integer(i)) *  is_subset(path_i, path_j)
  } else if (length(path_i) > length(path_j)) {
    covij <- covij - (j - as.integer(j)) * is_subset(path_j, path_i)
  } else {
    if (identical(path_i, path_j)) print('ERROR')
  }
} else {
  covij <- distRoot(tree, as.integer(i)) - (max(i, j) - as.integer(i))
}
return(covij)
}

CX <- matrix(NA, nrow = length(positions), ncol = length(positions))
for (ii in 1:length(positions)) {
  for (jj in 1:length(positions)) {
    if (ii >= jj) CX[ii, jj] <- get_covij(tree, positions[ii], positions[jj])
  }
}
CX[upper.tri(CX)] <- 0
CX <- CX + t(CX) - diag(diag(CX))

Vcv2BM

## Each row will be one realization from the tree,
## row count will be number of nodes on the tree
## nrow == length(X_labels)
## Each column will be on sample (replicate)
suppressMessages(require(MASS, quietly = T))
dat <- t(mvrnorm(k, rep(mu, length(X_labels)), CX * sigma * scalar))
if (type_I_noise != 0) {
  dat <- dat + matrix(rnorm(nrow(dat) * ncol(dat), sd = type_I_noise * scalar),
                      nrow = nrow(dat), ncol = ncol(dat))
}
if (type_II_noise != 0) {
  dat <- cbind(dat, t(mvrnorm(type_II_noise, rep(mu, nrow(dat)),
                              diag(nrow(dat)) * sigma * scalar)))
}
X <- list()
for (item in unique(X_labels)) {
  X[[paste0('I', item)]] <- dat[which(X_labels == item), ]
}

SimBM

## Generate Brownian motion on a tree and sample points from the path
## [Kushal Dey and Gao Wang]
## tree: A tree structure generated by the simulate_bitree() function res$data
## all_nodes: in-between nodes data points generated by simulate_bitree() 
## Nsim: Number of Brownian motions to generate/simulate
## diffusion: A diffusion rate parameter for the diffusion Brownian process. Default=1
## precision: The time-scale for each branch of the tree produced (if the branch length is t,
## it blows it up to t*precision and generate Brownian motion with scaled variance
## under that new scale)

## plot: if TRUE, plots the paths for each simulation,defaults to FALSE

## Samples points from Brownian path for each non-root branch
## based on tree$all_nodes() function and then records these time
## points and the corresponding value of the Tree Diffusion process
## at that point for each simulation run. This whole process is repeated
## Nsim number of times to generate Nsim Brownian processes on trees
suppressMessages(require(MASS, quietly = T))
IntermBM <- function(tree, all_nodes, Nsim, diffusion = 1, precision = 1000, plot = FALSE)
{
  Y <- list()
  ll <- floor(tree$edge.length * precision) + 1
  for (m in 1:Nsim) {
    ## create a fine grid (w.r.t precision) of discrete points to approximate continuous Brownian motion
    ## G[[i]] is finer than wnat we need (Y[[i]])
    G <- lapply(tree$edge.length,
                function(x) return(c(0, cumsum(rnorm(n = floor(x * precision), sd = sqrt(diffusion/precision))))))
    if(plot) {
      H <- nodeHeights(tree)
      plot(H[1, 1], G[[1]][1], ylim = range(G), xlim = range(H), xlab = "time", ylab = "phenotype")
      for (i in 1:length(G))
        lines(H[i, 1]:H[i, 2], G[[i]][seq(from=1, to=length(G[[i]]), length.out=length(H[i,1]:H[i,2]))])
      yy <- sapply(1:length(tree$tip.label), function(x, y) which(x == y), y = tree$edge[,2])
      yy <- sapply(yy, function(x, y) y[[x]][length(y[[x]])], y = G)
      text(x = max(H), y = yy, tree$tip.label)
    }
    ## simulate data points
    for (i in 1:nrow(tree$edge)) {
      ## find parent
      pp <- which(tree$edge[, 2] == tree$edge[i, 1])
      ## add parent's value to itself
      if (length(pp) > 0)
        ## non-root
        G[[i]] <- G[[i]] + G[[pp]][ll[pp]]
      else
        ## root
        G[[i]] <- G[[i]] + G[[1]][1]
      ## find all data points between its parent and itself
      curr_index <- which(floor(all_nodes) == tree$edge[i, 2])
      pos <- ceiling((1-all_nodes[curr_index]%%1) * length(G[[i]])) ## position: 1 minus the factional part
      ## Properly determine labels
      if (tree$edge[i, 2] <= length(tree$tip.label)) 
        label = tree$tip.label[tree$edge[i, 2]]
      else
        label = as.character(tree$edge[i, 2])
      ## Y[[label]] is a subset of G[[i]]
      if (m == 1)
        Y[[paste0('I', label)]] <- G[[i]][pos]
      else
        Y[[paste0('I', label)]] <- cbind(Y[[paste0('I', label)]], G[[i]][pos])
    }
  }
  return(Y)
}

AddNoise <- function(Y, noise1, noise2, sigma) {
  for (i in names(Y)) {
    if (noise1 != 0) {
      Y[[i]] <- Y[[i]] + matrix(rnorm(nrow(Y[[i]]) * ncol(Y[[i]]), sd = noise1),
                      nrow = nrow(Y[[i]]), ncol = ncol(Y[[i]]))
    }
    if (noise2 != 0) {
      Y[[i]] <- cbind(Y[[i]], t(mvrnorm(noise2, rep(0, nrow(Y[[i]])),
                              diag(nrow(Y[[i]])) * sigma)))
    }
  }
  return(Y)
}

X <- IntermBM(tree, all_nodes, Nsim, diffusion, precision)
X <- AddNoise(X, type_I_noise, type_II_noise, sqrt(diffusion/precision))

ToyTreeBM

## Simulate Brownian motion along a linear tree branch
## params:
##   m: 100
##   t_scale: 1
##   Nsim: 1000
##   difussion: 100
##   precision: 1000
## return: X
## The returned data is ordered by rows (data from one end node to the other)
suppressMessages(require(MASS, quietly = T))
brown <- matrix(0, t_scale*precision, Nsim)

for (m in 1:Nsim) {
  brown[,m] <- c(0, cumsum(rnorm(n = (t_scale*precision-1), sd = sqrt(diffusion/precision))))
}

points_of_interest <- floor(seq(1, dim(brown)[1], length.out = m))
dat <- brown[points_of_interest,]
if (type_I_noise != 0) {
  dat <- dat + matrix(rnorm(nrow(dat) * ncol(dat), sd = type_I_noise),
                      nrow = nrow(dat), ncol = ncol(dat))
}
if (type_II_noise != 0) {
  dat <- cbind(dat, t(mvrnorm(type_II_noise, rep(0, nrow(dat)),
                              diag(nrow(dat)) * sqrt(diffusion/precision))))
}

X <- list(I1 = dat)


## Simulate Brownian motion along a linear tree branch
## params:
##   m: 100
##   t_scale: 1
##   Nsim: 1000
##   difussion: 100
##   precision: 1000
##   type_I_noise: 0, Asis(sqrt(diffusion/precision))
##   type_II_noise: 0, Asis(Nsim * 2)
## return: X
## The returned data is ordered by rows (data from one end node to the other)

AddNoise <- function(dat, type_I_noise, type_II_noise, diffusion, precision) {
if (type_I_noise != 0) {
  dat <- dat + matrix(rnorm(nrow(dat) * ncol(dat), sd = type_I_noise),
                      nrow = nrow(dat), ncol = ncol(dat))
}
if (type_II_noise != 0) {
  dat <- cbind(dat, t(mvrnorm(type_II_noise, rep(0, nrow(dat)),
                              diag(nrow(dat)) * sqrt(diffusion/precision))))
}
return(dat)
}

suppressMessages(require(MASS, quietly = T))
brown1 <- matrix(0, t_scale*precision, Nsim)
brown2 <- matrix(0, t_scale*precision, Nsim)
brown3 <- matrix(0, t_scale*precision, Nsim)

for (m in 1:Nsim) {
  brown1[,m] <- c(0, cumsum(rnorm(n = (t_scale*precision-1), sd = sqrt(diffusion/precision))))
  brown2[,m] <-(tail(brown1[,m],1) + 0.01) + c(0, cumsum(rnorm(n = (t_scale*precision-1), sd = sqrt(diffusion/precision))))
  brown3[,m] <-(tail(brown1[,m],1) - 0.01) + c(0, cumsum(rnorm(n = (t_scale*precision-1), sd = sqrt(diffusion/precision))))
}
## brown <- rbind(brown1, brown2, brown3)
brown1_sampled <- brown1[floor(seq(1, dim(brown1)[1], length.out=m)),]
brown2_sampled <- brown2[floor(seq(1, dim(brown2)[1], length.out=m)),]
brown3_sampled <- brown3[floor(seq(1, dim(brown3)[1], length.out=m)),]

X <- list(I10 = AddNoise(brown1_sampled,  type_I_noise, type_II_noise, diffusion, precision),
          I11 = AddNoise(brown2_sampled,  type_I_noise, type_II_noise, diffusion, precision),
          I12 = AddNoise(brown3_sampled,  type_I_noise, type_II_noise, diffusion, precision))

RDR

## Iteratively calling FLASH for K factors
## Because FLASH is a rank one method, each time we call FLASH, remove the variations explained and use residuals for the next round 
options(warn=-1)
source("flash.R")
flashpool <- function(data,K,
                      tol=1e-5, maxiter_r1 = 50,
                 partype = c("constant","known","Bayes_var","var_col"),
                 sigmae2_true = NA,
                 factor_value = NA,fix_factor = FALSE,
                 nonnegative = FALSE,
                 objtype = c("margin_lik","lowerbound_lik"),
                 ash_para = list(),
                 fl_list=list())
{
  res <- data
  loadings <- numeric()
  factors <- numeric()
  for(k in 1:K){
    out1 <- flash(res,
                tol=tol, 
                maxiter_r1=maxiter_r1,
                partype=partype,
                sigmae2_true = sigmae2_true,
                factor_value = factor_value,
                fix_factor = fix_factor,
                nonnegative = nonnegative,
                objtype = objtype,
                ash_para = ash_para,
                fl_list = fl_list)
    
      res <- res - out1$l%*%t(out1$f)
      loadings <- cbind(loadings, out1$l)
      factors  <- cbind(factors,  out1$f)
  }
  
  ll <- list(u=loadings,
             v=factors,
             residuals=res)
  return(ll)
}
source('utils.R')
Y <- merge_nodes(X, label)
if (is.null(K)) K <- length(Y$label) + 1
projection <- flashpool(Y$dat, K=K)
properties <- list(method = paste('Flash', 'K =', K), samples = Y$sample_labels, legend_text = Y$label)
X <- list(dat = projection$u, sample_labels = Y$sample_labels, label = Y$label)


## sparse PCA on data matrix
## http://gidir.com/thesis/work/WittenTibsHastiePMD2008.pdf
source('utils.R')
suppressMessages(library(PMA))
Y <- merge_nodes(X, label)
if (is.null(sumabsv)) sumabsv <- SPC.cv(Y$dat)$bestsumabsv1se
projection <- SPC(Y$dat, sumabsv = sumabsv, K = K, orth = orth)
properties <- list(method = paste('SPC', 'sumabsv =', sumabsv, 'K =', K, 'orth = ', orth), samples = Y$sample_labels, legend_text = Y$label)
X <- list(dat = projection$u, sample_labels = Y$sample_labels, label = Y$label)


## PCA on data matrix
source('utils.R')
Y <- merge_nodes(X, label)
projection <- prcomp(Y$dat)
projection$u <- projection$x
projection <- projection[projection != "x"]
properties <- list(method = 'PCA', samples = Y$sample_labels, legend_text = Y$label)
X <- list(dat = projection$u, sample_labels = Y$sample_labels, label = Y$label)


## SFA on data matrix
## B ENGELHARDT AND M STEPHENS
## Input: X, label, K, model
## See SFA documentation for details
source('utils.R')
if (Sys.info()['sysname'] == "Darwin") {cmd <- "sfa_mac"
} else {cmd <- "sfa_linux"}
Y <- merge_nodes(X, label)
iter <- 1000
input <- paste0('/tmp/', Sys.getpid(), '.sfa')
output <- paste0('/tmp/', Sys.getpid())
write.table(Y$dat, input, sep = ' ', row.names = F, col.names = F)
## Oracle K
if (is.null(K)) K <- length(Y$label) + 1
cmd <- paste(cmd, '-gen', input, '-g', nrow(Y$dat), '-n', ncol(Y$dat), '-k', K,
             '-iter', iter, '-r', rng, model, '-o', output, '>', paste0('/tmp/', Sys.getpid(), '.sfa.log'))
system(cmd)
projection <- list()
## matrix of loadings
projection$u <- as.matrix(read.table(paste0(output, '_lambda.out')))
## matrix of factors
projection$v <- as.matrix(read.table(paste0(output, '_F.out')))
properties <- list(method = paste('SFA', 'K =', K, 'model = ', model), samples = Y$sample_labels, legend_text = Y$label)
X <- list(dat = projection$u, sample_labels = Y$sample_labels, label = Y$label)


## Penalized Matrix Decomposition on data matrix
## Finds factors u and v that summarize the data matrix well. u and v will both be sparse, and v can optionally also be smooth
suppressMessages(library(PMA))
source('utils.R')
Y <- merge_nodes(X, label)
if (is.null(K)) K <- length(Y$label) + 1
projection <- PMD(Y$dat, center=TRUE, sumabs=1, K=K)
properties <- list(method = paste('PMD', 'K =', K), samples = Y$sample_labels, legend_text = Y$label)
X <- list(dat = projection$u, sample_labels = Y$sample_labels, label = Y$label)


## Penalized Matrix Decomposition on positive data matrix
## with positive constraints on loading / factors
## Finds factors u and v that summarize the data matrix well. u and v will both be sparse, and v can optionally also be smooth
suppressMessages(library(PMA))
source('utils.R')
Y <- merge_nodes(X, label)
if (is.null(K)) K <- length(Y$label) + 1
projection <- PMD(exp(Y$dat / 100) * 100, center=TRUE, upos = TRUE, vpos = TRUE, sumabs=1, K=K)
properties <- list(method = paste('Positive PMD', 'K =', K), samples = Y$sample_labels, legend_text = Y$label)
X <- list(dat = projection$u, sample_labels = Y$sample_labels, label = Y$label)


## CountCluster on data matrix
source('utils.R')
if ('dat' %in% names(X)) {stop('Error input: cannot work iteratively on X')}
Y <- merge_nodes(X, label)
dat <- Y$dat
## Convert Gaussian data to counts
dat[dat > 10] <- 10
dat <- floor(exp(dat))
if (is.null(K)) K <- length(Y$label) + 1
## Use Taddy's topics model implementation to create clusters
projection <- maptpx::topics(dat, K = K, tol = tol)
properties <- list(method = paste('CountClust', 'K = ', K, 'tol = ', tol),
                   samples = Y$sample_labels, legend_text = Y$label)
X <- list(dat = projection$omega, sample_labels = Y$sample_labels, label = Y$label)


## neighbor-joining tree estimation of Saitou and Nei (1987).
source('utils.R')
Y <- merge_nodes(X, label)
projection <- ape::nj(dist(Y$dat))
properties <- list(method = 'Neighbor-joining', samples = Y$sample_labels, legend_text = Y$label)
X <- list()


## Desper, R. and Gascuel, O. (2002) Fast and accurate phylogeny
## reconstruction algorithms based on the minimum-evolution
## principle.  _Journal of Computational Biology_, *9(5)*, 687-705.
source('utils.R')
Y <- merge_nodes(X, label)
projection <- ape::fastme.ols(dist(Y$dat), nni = TRUE)
properties <- list(method = 'Fast minimum-evolution (OLS)',
                   samples = Y$sample_labels, legend_text = Y$label)
X <- list()

MST

## MST on distance matrix
source('utils.R')
Y <- merge_nodes(X, label)
## use cor here for scale invariance
## projection <- ape::mst(1 - abs(cor(t(Y$dat))))
projection <- ape::mst(dist(Y$dat))
properties <- list(method = 'MST', samples = Y$sample_labels, legend_text = Y$label)
## Give it an empty list because MST cannot be applied to itself
X <- list()

PlotR

## Plot MST
## FIXME: Not sure which is the best way to visualize the MST matrix
## Here use igraph::graph.adjacency to view
source('plotconf.R')
suppressMessages(library(igraph))
fill <- rep(COLORS, ceiling(length(properties$legend_text)/length(COLORS)))[1:length(properties$legend_text)]
pdf(vizplot)
plot(graph.adjacency(X),
     edge.arrow.mode = 0,
     vertex.size = PSIZE,
     vertex.label = NA,
     vertex.color = fill[match(properties$samples, properties$legend_text)],
     vertex.frame.color = fill[match(properties$samples, properties$legend_text)],
     layout = eval(parse(text = igraph_layout))
     )
legend("right", legend = properties$legend_text, fill = fill, bty = "n", border = fill, cex = 0.75)
title(main = paste(c(properties$method, igraph_layout)))
dev.off()


## Plot PCA 
source('plotconf.R')
fill <- rep(COLORS, ceiling(length(properties$legend_text)/length(COLORS)))[1:length(properties$legend_text)]
pdf(vizplot)
plot(X$u[,1], X$u[,2], pch = 20,
     col = fill[match(properties$samples, properties$legend_text)]
     )
legend("bottomleft", legend = properties$legend_text, fill = fill, bty = "n", border = fill, cex = 0.75)
title(main = properties$method)
dev.off()


## Plot SFA result
source("plotconf.R")
suppressMessages(library(reshape2))
suppressMessages(library(ggplot2))
lmat = as.data.frame(t(X$u))
colnames(lmat) = as.character(seq(1:ncol(lmat)))
## lmat = abs(lmat)
lmat$Factors = as.factor(seq_len(nrow(lmat)))
lmat <- melt(lmat, id.vars = "Factors")
lmat$labels = factor(properties$samples[lmat$variable], levels = properties$legend_text)
uf = unique(lmat$Factors)
fill <- rep(COLORS, ceiling(length(uf)/length(COLORS)))[1:length(uf)]
tmp1 <- subset(lmat, value >= 0)
tmp2 <- subset(lmat, value < 0)
pdf(vizplot)
ggplot() + 
    geom_bar(data = tmp1, aes(x = reorder(variable, value), y = value, fill = Factors), stat = "identity") +
    geom_bar(data = tmp2, aes(x = reorder(variable, value), y = value, fill = Factors), stat = "identity") +
    facet_grid(~labels, scales = "free", space = "free") +
    xlab("\nSamples") +
    ylab("Loading\n") + 
    ggtitle(properties$method) +
    scale_fill_manual(values = fill) + 
    theme(axis.text.x = element_blank(),
          axis.text.y = element_blank(),
          axis.ticks = element_blank())
dev.off()


## Plot CountCluster 
source('plotconf.R')
suppressMessages(library(CountClust))
fill <- rep(COLORS, ceiling(X$K/length(COLORS)))[1:X$K]
annotation <- data.frame(
  sample_id = paste0("X", c(1:nrow(X$omega))),
  tissue_label = factor(properties$samples,
                        levels = properties$legend_text))
pdf(vizplot)
StructureGGplot(omega = X$omega,
                annotation = annotation,
                palette = fill,
                yaxis_label = "External nodes of tree",
                order_sample = TRUE,
                axis_tick = list(axis_ticks_length = .1,
                                 axis_ticks_lwd_y = .1,
                                 axis_ticks_lwd_x = .1,
                                 axis_label_size = 7,
                                 axis_label_face = "bold"),
                figure_title = properties$method)
dev.off()


## Plot results from tree reconstruction algorithms
source('plotconf.R')
suppressMessages(library(ape))
fill <- rep(COLORS, ceiling(length(properties$legend_text)/length(COLORS)))[1:length(properties$legend_text)]
pdf(vizplot)
plot(X, type="phylogram",
     show.tip.label = FALSE,
     cex=1)
tiplabels(pch = 19, col = fill[match(properties$samples, properties$legend_text)])
legend("right", legend = properties$legend_text, fill = fill, bty = "n", border = fill, cex = 0.75)
title(main = properties$method)
dev.off()

SklearnDR

import sys
import numpy as np
from sklearn import manifold
from sklearn.decomposition import PCA
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
from plotconf import *

class DMR:
    '''
    Run various off the shelf dimensionality reduction methods
    and make figures
    '''
    def __init__(self, data, label, **kwargs):
        if label is None:
            label = list(data.keys())
        self.properties = {'range': [], 'label': label, 'color': [], 'method': None}
        self.data = None
        nb_samples = 0
        for idx, k in enumerate(label):
            if k not in data:
                continue
            self.properties['range'].append((nb_samples, nb_samples + data[k].shape[0]))
            self.properties['color'].append(COLORS[idx % len(COLORS)])
            if self.data is None:
                self.data = data[k]
            else:
                self.data = np.vstack((self.data, data[k]))
            nb_samples += data[k].shape[0]
        self.data_new = None
        self.params = kwargs

    def LLE_standard(self):
        self.properties['method'] = "LLE"
        self.data_new = manifold.LocallyLinearEmbedding(n_neighbors = self.params["n_neighbors"],
                                               n_components = self.params["n_components"],
                                               eigen_solver='auto',
                                               method = 'standard').fit_transform(self.data)

    def LLE_ltsa(self):
        self.properties['method'] = "LTSA"
        self.data_new = manifold.LocallyLinearEmbedding(n_neighbors = self.params["n_neighbors"],
                                               n_components = self.params["n_components"],
                                               eigen_solver='auto',
                                               method = 'ltsa').fit_transform(self.data)

    def LLE_hessian(self):
        self.properties['method'] = "Hessian LLE"
        self.data_new = manifold.LocallyLinearEmbedding(n_neighbors = self.params["n_neighbors"],
                                               n_components = self.params["n_components"],
                                               eigen_solver='auto',
                                               method = 'hessian').fit_transform(self.data)

    def LLE_modified(self):
        self.properties['method'] = "Modified LLE"
        self.data_new = manifold.LocallyLinearEmbedding(n_neighbors = self.params["n_neighbors"],
                                               n_components = self.params["n_components"],
                                               eigen_solver='auto',
                                               method = 'modified').fit_transform(self.data)

    def isomap(self):
        self.properties['method'] = "Isomap"
        self.data_new = manifold.Isomap(n_neighbors = self.params["n_neighbors"],
                               n_components = self.params["n_components"]).fit_transform(self.data)

    def mds(self):
        self.properties['method'] = "MDS"
        self.data_new = manifold.MDS(n_components = self.params["n_components"],
                                     max_iter=100, n_init=1).fit_transform(self.data)

    def spectral_embedding(self):
        self.properties['method'] = "Spectral Embedding"
        self.data_new = manifold.SpectralEmbedding(n_neighbors = self.params["n_neighbors"],
                                          n_components = self.params["n_components"]).fit_transform(self.data)

    def t_sne(self):
        self.properties['method'] = "t-SNE"
        self.data_new = manifold.TSNE(n_components = self.params["n_components"],
                                      init='pca', random_state=0).fit_transform(self.data)

    def pca(self):
        self.properties['method'] = "PCA"
        self.data_new = PCA(n_components = self.params["n_components"]).fit_transform(self.data)

    # def mst(self):
    #     self.properties['method'] = "MST"
    # FIXME: input is graph matrix not distance matrix
    #     self.data_new = minimum_spanning_tree(csr_matrix(1 - np.absolute(np.corrcoef(self.data)))).toarray()

dmr = DMR(data, label, n_neighbors = n_neighbors, n_components = n_comp)
try:
    getattr(dmr, sys.argv[1])()
    properties, projection = dmr.properties, dmr.data_new
except Exception as e:
    properties = projection = []
    sys.stderr.write('{}'.format(e))

PlotPY

# Gao Wang (c) 2016
from plotconf import *
def plot_dr_dot(data, properties, fname):
    if len(data) == 0:
        os.system('cp ~/Dropbox/.blank.pdf {}'.format(fname))
        return
    ax = plt.subplot(111)
    for x, y, z in zip(properties['range'], properties['color'], list(properties['label'])):
        plt.scatter(data[x.item(0):x.item(1), 0], data[x.item(0):x.item(1), 1],
                    color = y, label = z, s = 40)
    plt.title(properties['method'][0] + '\n')
    plt.legend(loc = "center left", bbox_to_anchor=(0.9, 0.5))
    ax.xaxis.set_major_formatter(NullFormatter())
    ax.yaxis.set_major_formatter(NullFormatter())
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    plt.axis('tight')
    plt.savefig(fname)
    plt.close()

plot_dr_dot(data, properties, vizplot)