vcv-bm.dsc | DSC2


# This DSC can be executed from: https://github.com/gaow/dr-tree/tree/master/analysis
# To run the entire benchmark:
# dsc exec vcv-bm.dsc -j8
# To run part of the benchmark, here are a few highlights
# dsc exec vcv-bm.dsc -j8 -s "SimTree * SimBM * SklearnDR * PlotPY"
# dsc exec vcv-bm.dsc -j8 -s "SimTree * SimBM * RDR[1] * PlotR[1]"
# dsc exec vcv-bm.dsc -j8 -s "SimTree * SimBM * (RDR[2], RDR[3]) * PlotR[2]"
# dsc exec vcv-bm.dsc -j8 -s "SimTree * SimBM * (RDR[4] * PlotR[3], RDR[5] * PlotR[4])"
###
# Simulate bifurcated tree structure with data points in between nodes
###
SimTree:
  exec: BiTree.R
  seed: 31415
  params:
    n: 8
    t: 30
    m: 100
    unify_branch: 1
  return: tree = R(res$data),
          nodes = R(res$all_nodes),
          X_labels = R(res$labels)

###
# Plot tree structure
###
PlotTree:
  exec: TreePlot.R
  params:
    tree: $tree
    treeplot: File(pdf)
  return: treeplot

###
# Calculate covariance matrix for the tree structure
###
CalcVcv:
  exec: VcvTree.R
  seed: 31415
  params:
    tree: $tree
    positions: $nodes
  return: CX

###
# Generate samples from covariance
###
Vcv2BM:
  exec: VcvBM.R
  seed: 31415
  params:
    CX: $CX
    X_labels: $X_labels
    mu: 0
    sigma: 0.1
    k: 100
    scalar: 1
  return: X

###
# Simulate BM directly from tree structure
###
SimBM:
  exec: IntermBM.R
  seed: 31415
  params:
    tree: $tree
    all_nodes: $nodes
    Nsim: 100
    diffusion: 1
    precision: 1000
  return: X

###
# Apply different methods from sklearn package
# - global methods, local methods (manifold learning)
###
SklearnDR:
  exec: Combo(manifold.py, $(manifold_methods))
  seed: 31415
  params:
    data: $X
    label: (I10, I12, I11, I1, I2, I13, I3, I4, I14, I5, I6, I15, I7, I8)
    n_comp: 2
    n_neighbors: 50
  return: projection, properties

###
# Generate 2D plots for Python based methods
###
PlotPY:
  exec: PlotDR.py
  params:
    data: $projection
    properties: $properties
    vizplot: File(pdf)
  return: vizplot

###
# Apply methods in R packages:
# - minimimum spanning tree via Ape package
# - sparse PCA via PMA package
# - sparse factor analysis via sfa wrapped in R
###
RDR:
  exec: ApeMST.R, PmaPCA.R, PCA.R, SFA.R, CountCluster.R
  seed: 31415
  params:
    X: $X
    label: (I10, I12, I11, I1, I2, I13, I3, I4, I14, I5, I6, I15, I7, I8)
    exec[2]:
      sumabsv: 1, Asis(NULL)
      K: 5
      orth: 1, 0
    exec[4]:
      K: Asis(NULL)
      model: -mn
      rng: 31415
    exec[5]:
      K: Asis(NULL)
      tol: 1
  return: projection, properties

###
# Generate 2D plots for R based methods 
###
PlotR:
  exec: PlotMST.R, PlotPCA.R, PlotSFA.R, PlotCountCluster.R
  seed: 31415
  params:
    X: $projection
    properties: $properties
    vizplot: File(pdf)
    exec[1]:
      igraph_layout: $(igraph_layouts)
  return: vizplot

DSC:
  run: (SimTree * PlotTree) * (CalcVcv * Vcv2BM, SimBM) *
       (SklearnDR * PlotPY, RDR[1] * PlotR[1], (RDR[2], RDR[3]) * PlotR[2], RDR[4] * PlotR[3],
       RDR[5] * PlotR[4])
  R_libs: liamrevell/phytools, ape, adephylo, MASS, igraph, PMA, ggplot2,
          reshape2, maptpx, kkdey/CountClust
  exec_path: output/src/simulate, output/src/analyze
  lib_path: output/src/simulate, output/src/analyze, output/src
  output: output/20160621
  parameters:
    manifold_methods: LLE_standard, LLE_ltsa, LLE_hessian, LLE_modified,
                      isomap, mds, spectral_embedding, t_sne, pca
    igraph_layouts: layout_as_star, layout_as_tree, layout_in_circle,
                    layout_nicely, layout_on_grid, layout_on_sphere, layout_randomly,
                    layout_with_dh, layout_with_drl, layout_with_fr, layout_with_gem,
                    layout_with_graphopt, layout_with_kk, layout_with_lgl,
                    layout_with_mds
                    # unusable: layout_as_bipartite, layout_with_sugiyama

SimTree

BiTree.R


## Generate tree structure
## 1. Starts off with a bifurcate tree of
## terminal nodes n and time t 
## and the probability of speciation b = (log(n) - log(2)) / t
## The output will be a tree object:
# > names(tree) # an object of class "phylo"
# [1] "edge""Nnode"       "tip.label"   "edge.length"
## 2. Want m points in between each pair of nodes
## For each non-root node, generate m data points on the branch via
## i + runif(m) where i is the ID of a node
## and the position are upstream with respect to i.

simulate_bitree <- function(n, t, m, unify_branch = FALSE) {
  b <- (log(n) - log(2)) / t
  k <- n * 2 - 1
  tree <- phytools::pbtree(b = b, n = n, t = t, type = "discrete")
  if (unify_branch) ## Reset all branch length to 1
    tree$edge.length <- rep(1, length(tree$edge.length))
  ## Adjust tip label
  tree$tip.label <- sapply(strsplit(tree$tip.label, split='t', fixed=T), function(x) x[2])
  ## Generate positions in between nodes
  n_nodes <- tree$Nnode + length(tree$tip.label)
  positions <- vector()
  for (i in 1:n_nodes) {
    if (i != (length(tree$tip.label) + 1)) ## exclude the root here
      positions <- append(positions, i + runif(m)) 
  }
  ## Get proper node labelling
  group <- as.integer(positions)
  X_labels <- vector()
  for (i in 1:length(group)) {
    if (group[i] <= length(tree$tip.label)) {
      X_labels[i] <- tree$tip.label[group[i]]
    } else {
      X_labels[i] <- as.character(group[i])
    }
  }
  return(list(data = tree, all_nodes = positions, labels = X_labels))
}
res <- simulate_bitree(n, t, m, unify_branch)

SimBM

IntermBM.R


## Generate Brownian motion on a tree and sample points from the path
## [Kushal Dey and Gao Wang]
## tree: A tree structure generated by the simulate_bitree() function res$data
## all_nodes: in-between nodes data points generated by simulate_bitree() 
## Nsim: Number of Brownian motions to generate/simulate
## diffusion: A diffusion rate parameter for the diffusion Brownian process. Default=1
## precision: The time-scale for each branch of the tree produced (if the branch length is t,
## it blows it up to t*precision and generate Brownian motion with scaled variance
## under that new scale)

## plot: if TRUE, plots the paths for each simulation,defaults to FALSE

## Samples points from Brownian path for each non-root branch
## based on tree$all_nodes() function and then records these time
## points and the corresponding value of the Tree Diffusion process
## at that point for each simulation run. This whole process is repeated
## Nsim number of times to generate Nsim Brownian processes on trees

IntermBM <- function(tree, all_nodes, Nsim, diffusion = 1, precision = 1000, plot = FALSE)
{
  Y <- list()
  ll <- floor(tree$edge.length * precision) + 1
  for (m in 1:Nsim) {
    ## create a fine grid (w.r.t precision) of discrete points to approximate continuous Brownian motion
    ## G[[i]] is finer than wnat we need (Y[[i]])
    G <- lapply(tree$edge.length,
                function(x) return(c(0, cumsum(rnorm(n = floor(x * precision), sd = sqrt(diffusion/precision))))))
    if(plot) {
      H <- nodeHeights(tree)
      plot(H[1, 1], G[[1]][1], ylim = range(G), xlim = range(H), xlab = "time", ylab = "phenotype")
      for (i in 1:length(G))
        lines(H[i, 1]:H[i, 2], G[[i]][seq(from=1, to=length(G[[i]]), length.out=length(H[i,1]:H[i,2]))])
      yy <- sapply(1:length(tree$tip.label), function(x, y) which(x == y), y = tree$edge[,2])
      yy <- sapply(yy, function(x, y) y[[x]][length(y[[x]])], y = G)
      text(x = max(H), y = yy, tree$tip.label)
    }
    ## simulate data points
    for (i in 1:nrow(tree$edge)) {
      ## find parent
      pp <- which(tree$edge[, 2] == tree$edge[i, 1])
      ## add parent's value to itself
      if (length(pp) > 0)
        ## non-root
        G[[i]] <- G[[i]] + G[[pp]][ll[pp]]
      else
        ## root
        G[[i]] <- G[[i]] + G[[1]][1]
      ## find all data points between its parent and itself
      curr_index <- which(floor(all_nodes) == tree$edge[i, 2])
      pos <- ceiling((1-all_nodes[curr_index]%%1) * length(G[[i]])) ## position: 1 minus the factional part
      ## Properly determine labels
      if (tree$edge[i, 2] <= length(tree$tip.label)) 
        label = tree$tip.label[tree$edge[i, 2]]
      else
        label = as.character(tree$edge[i, 2])
      ## Y[[label]] is a subset of G[[i]]
      if (m == 1)
        Y[[paste0('I', label)]] <- G[[i]][pos]
      else
        Y[[paste0('I', label)]] <- cbind(Y[[paste0('I', label)]], G[[i]][pos])
    }
  }
  return(Y)
}

X <- IntermBM(tree, all_nodes, Nsim, diffusion, precision)

RDR

ApeMST.R
PmaPCA.R
PCA.R
SFA.R
CountCluster.R


## Merge data across nodes and perform MST on distance matrix
dat <- NULL
sample_labels <- vector()
for (i in label) {
  if (is.null(dat)) {dat <- X[[i]]
  } else {dat <- rbind(dat, X[[i]])}
  sample_labels <- append(sample_labels, rep(i, nrow(X[[i]])))
}
## use cor here for scale invariance
## projection <- ape::mst(1 - abs(cor(t(dat))))
projection <- ape::mst(dist(dat))
properties <- list(method = 'MST', samples = sample_labels, legend_text = label)


## Merge data across nodes and perform sparse PCA on data matrix
## http://gidir.com/thesis/work/WittenTibsHastiePMD2008.pdf
suppressMessages(library(PMA))
dat <- NULL
sample_labels <- vector()
for (i in label) {
  if (is.null(dat)) {dat <- X[[i]]
  } else {dat <- rbind(dat, X[[i]])}
  sample_labels <- append(sample_labels, rep(i, nrow(X[[i]])))
}
if (is.null(sumabsv)) sumabsv <- SPC.cv(dat)$bestsumabsv1se
projection <- SPC(dat, sumabsv = sumabsv, K = K, orth = orth)
properties <- list(method = paste('SPC', 'sumabsv =', sumabsv, 'K =', K, 'orth = ', orth), samples = sample_labels, legend_text = label)


## Merge data across nodes and perform PCA on data matrix
dat <- NULL
sample_labels <- vector()
for (i in label) {
  if (is.null(dat)) {dat <- X[[i]]
  } else {dat <- rbind(dat, X[[i]])}
  sample_labels <- append(sample_labels, rep(i, nrow(X[[i]])))
}
projection <- prcomp(dat)
projection$u <- projection$x
projection <- projection[projection != "x"]
properties <- list(method = 'PCA', samples = sample_labels, legend_text = label)


## Merge data across nodes and perform SFA on data matrix
## B ENGELHARDT AND M STEPHENS
## Input: X, label, K, model
## See SFA documentation for details
if (Sys.info()['sysname'] == "Darwin") {cmd <- "sfa_mac"
} else {cmd <- "sfa_linux"}
dat <- NULL
sample_labels <- vector()
for (i in label) {
  if (is.null(dat)) {dat <- X[[i]]
  } else {dat <- rbind(dat, X[[i]])}
  sample_labels <- append(sample_labels, rep(i, nrow(X[[i]])))
}
iter <- 1000
input <- paste0('/tmp/', Sys.getpid(), '.sfa')
output <- paste0('/tmp/', Sys.getpid())
write.table(dat, input, sep = ' ', row.names = F, col.names = F)
## Oracle K
if (is.null(K)) K <- length(label) + 1
cmd <- paste(cmd, '-gen', input, '-g', nrow(dat), '-n', ncol(dat), '-k', K,
             '-iter', iter, '-r', rng, model, '-o', output, '>', paste0('/tmp/', Sys.getpid(), '.sfa.log'))
system(cmd)
projection <- list()
## matrix of loadings
projection$lmat <- as.matrix(read.table(paste0(output, '_lambda.out')))
## matrix of factors
projection$fmat <- as.matrix(read.table(paste0(output, '_F.out')))
properties <- list(method = paste('SFA', 'K =', K, 'model = ', model), samples = sample_labels, legend_text = label)


## Merge data across nodes and perform CountCluster on data matrix
dat <- NULL
sample_labels <- vector()
for (i in label) {
  if (is.null(dat)) {dat <- X[[i]]
  } else {dat <- rbind(dat, X[[i]])}
  sample_labels <- append(sample_labels, rep(i, nrow(X[[i]])))
}
## Convert Gaussian data to counts
dat[dat > 10] <- 10
dat <- floor(exp(dat))
if (is.null(K)) K <- length(label)
## Use Taddy's topics model implementation to create clusters
projection <- maptpx::topics(dat, K = K, tol = tol)
properties <- list(method = paste('CountClust', 'K = ', K, 'tol = ', tol),
                   samples = sample_labels, legend_text = label)

PlotR

PlotMST.R
PlotPCA.R
PlotSFA.R
PlotCountCluster.R


## Plot MST
## FIXME: Not sure which is the best way to visualize the MST matrix
## Here use igraph::graph.adjacency to view
source('plotconf.R')
suppressMessages(library(igraph))
fill <- rep(COLORS, ceiling(length(properties$legend_text)/length(COLORS)))[1:length(properties$legend_text)]
pdf(vizplot)
plot(graph.adjacency(X),
     edge.arrow.mode = 0,
     vertex.size = PSIZE,
     vertex.label = NA,
     vertex.color = fill[match(properties$samples, properties$legend_text)],
     vertex.frame.color = fill[match(properties$samples, properties$legend_text)],
     layout = eval(parse(text = igraph_layout))
     )
legend("right", legend = properties$legend_text, fill = fill, bty = "n", border = fill, cex = 0.75)
title(main = paste(c(properties$method, igraph_layout)))
dev.off()


## Plot PCA 
source('plotconf.R')
fill <- rep(COLORS, ceiling(length(properties$legend_text)/length(COLORS)))[1:length(properties$legend_text)]
pdf(vizplot)
plot(X$u[,1], X$u[,2], pch = 20,
     col = fill[match(properties$samples, properties$legend_text)]
     )
legend("bottomleft", legend = properties$legend_text, fill = fill, bty = "n", border = fill, cex = 0.75)
title(main = properties$method)
dev.off()


## Plot SFA result
source("plotconf.R")
suppressMessages(library(reshape2))
suppressMessages(library(ggplot2))
lmat = as.data.frame(t(X$lmat))
colnames(lmat) = as.character(seq(1:ncol(lmat)))
## lmat = abs(lmat)
lmat$Factors = as.factor(seq_len(nrow(lmat)))
lmat <- melt(lmat, id.vars = "Factors")
lmat$labels = factor(properties$samples[lmat$variable], levels = properties$legend_text)
uf = unique(lmat$Factors)
fill <- rep(COLORS, ceiling(length(uf)/length(COLORS)))[1:length(uf)]
tmp1 <- subset(lmat, value >= 0)
tmp2 <- subset(lmat, value < 0)
pdf(vizplot)
ggplot() + 
    geom_bar(data = tmp1, aes(x = reorder(variable, value), y = value, fill = Factors), stat = "identity") +
    geom_bar(data = tmp2, aes(x = reorder(variable, value), y = value, fill = Factors), stat = "identity") +
    facet_grid(~labels, scales = "free", space = "free") +
    xlab("\nSamples") +
    ylab("Loading\n") + 
    ggtitle(properties$method) +
    scale_fill_manual(values = fill) + 
    theme(axis.text.x = element_blank(),
          axis.text.y = element_blank(),
          axis.ticks = element_blank())
dev.off()


## Plot CountCluster 
source('plotconf.R')
suppressMessages(library(CountClust))
fill <- rep(COLORS, ceiling(length(properties$legend_text)/length(COLORS)))[1:length(properties$legend_text)]
annotation <- data.frame(
  sample_id = paste0("X", c(1:nrow(X$omega))),
  tissue_label = factor(properties$samples,
                        levels = properties$legend_text))
pdf(vizplot)
StructureGGplot(omega = X$omega,
                annotation = annotation,
                palette = fill,
                yaxis_label = "External nodes of tree",
                order_sample = TRUE,
                axis_tick = list(axis_ticks_length = .1,
                                 axis_ticks_lwd_y = .1,
                                 axis_ticks_lwd_x = .1,
                                 axis_label_size = 7,
                                 axis_label_face = "bold"),
                figure_title = properties$method)
dev.off()

DSC script vcv-bm.dsc