DSC script vcv-bm.dsc


# This DSC can be executed from: https://github.com/gaow/dr-tree/tree/master/analysis
# To run the entire benchmark:
# dsc exec vcv-bm.dsc -j8
# To run part of the benchmark, here are a few highlights
# dsc exec vcv-bm.dsc -j8 -s "(SimTree * SimBM * ApeMST * PlotMST)"
###
# Simulate bifurcated tree structure with data points in between nodes
###
SimTree:
  exec: BiTree.R
  seed: 31415
  params:
    n: 8
    t: 30
    m: 100
    unify_branch: 1
  return: tree = R(res$data),
          nodes = R(res$all_nodes),
          X_labels = R(res$labels)

###
# Plot tree structure
###
PlotTree:
  exec: TreePlot.R
  params:
    tree: $tree
    treeplot: File(pdf)
  return: treeplot

###
# Calculate covariance matrix for the tree structure
###
CalcVcv:
  exec: VcvTree.R
  seed: 31415
  params:
    tree: $tree
    positions: $nodes
  return: CX

###
# Generate samples from covariance
###
Vcv2BM:
  exec: VcvBM.R
  seed: 31415
  params:
    CX: $CX
    X_labels: $X_labels
    mu: 0
    sigma: 0.1
    k: 100
    scalar: 1
  return: X

###
# Simulate BM directly from tree structure
###
SimBM:
  exec: IntermBM.R
  seed: 31415
  params:
    tree: $tree
    all_nodes: $nodes
    Nsim: 100
    diffusion: 1
    precision: 1000
  return: X

###
# Apply different manifold learning methods
###
ManifoldDR:
  exec: manifold.py $(manifold_methods)
  seed: 31415
  params:
    data: $X
    label: (I10, I12, I11, I1, I2, I13, I3, I4, I14, I5, I6, I15, I7, I8)
    n_comp: 2
    n_neighbors: 50
  return: projection, properties

###
# Generate 2D plots for manifold methods
###
PlotDR:
  exec: PlotDR.py
  params:
    data: $projection
    properties: $properties
    vizplot: File(pdf)
  return: vizplot

###
# Apply minimimum spanning tree via Ape package
###
ApeMST:
  exec: ApeMST.R
  seed: 31415
  params:
    X: $X
    label: (I10, I12, I11, I1, I2, I13, I3, I4, I14, I5, I6, I15, I7, I8)
  return: projection, properties

###
# Generate 2D plots for mst
###
PlotMST:
  exec: PlotMST.R
  seed: 31415
  params:
    mst: $projection
    properties: $properties
    vizplot: File(pdf)
  return: vizplot

DSC:
  run: (SimTree * PlotTree) * (CalcVcv * Vcv2BM, SimBM) *
       (ManifoldDR * PlotDR, ApeMST * PlotMST)
  R_libs: liamrevell/phytools, ape, adephylo, MASS
  exec_path: output/src/simulate, output/src/analyses
  lib_path: output/src/simulate, output/src/analyses, output/src
  output: output/20160614
  parameters:
    manifold_methods: LLE_standard, LLE_ltsa, LLE_hessian, LLE_modified,
                      isomap, mds, spectral_embedding, t_sne, pca
SimTree

## Generate tree structure
## 1. Starts off with a bifurcate tree of
## terminal nodes n and time t 
## and the probability of speciation b = (log(n) - log(2)) / t
## The output will be a tree object:
# > names(tree) # an object of class "phylo"
# [1] "edge""Nnode"       "tip.label"   "edge.length"
## 2. Want m points in between each pair of nodes
## For each non-root node, generate m data points on the branch via
## i + runif(m) where i is the ID of a node
## and the position are upstream with respect to i.

simulate_bitree <- function(n, t, m, unify_branch = FALSE) {
  b <- (log(n) - log(2)) / t
  k <- n * 2 - 1
  tree <- phytools::pbtree(b = b, n = n, t = t, type = "discrete")
  if (unify_branch) ## Reset all branch length to 1
    tree$edge.length <- rep(1, length(tree$edge.length))
  ## Adjust tip label
  tree$tip.label <- sapply(strsplit(tree$tip.label, split='t', fixed=T), function(x) x[2])
  ## Generate positions in between nodes
  n_nodes <- tree$Nnode + length(tree$tip.label)
  positions <- vector()
  for (i in 1:n_nodes) {
    if (i != (length(tree$tip.label) + 1)) ## exclude the root here
      positions <- append(positions, i + runif(m)) 
  }
  ## Get proper node labelling
  group <- as.integer(positions)
  X_labels <- vector()
  for (i in 1:length(group)) {
    if (group[i] <= length(tree$tip.label)) {
      X_labels[i] <- tree$tip.label[group[i]]
    } else {
      X_labels[i] <- as.character(group[i])
    }
  }
  return(list(data = tree, all_nodes = positions, labels = X_labels))
}
res <- simulate_bitree(n, t, m, unify_branch)

PlotTree

plot_tree <- function(tree, filename, w = 8, h = 6) {
  pdf(filename, w, h)
  ape::plot.phylo(tree, show.tip.label=F,
             show.node.label = F, edge.color="#5B9279", edge.width=4)
  ape::nodelabels(bg = "#FFFFFF", frame = 'circle')
  # labeling is tricky here ... need to specify tip.label!
  ape::tiplabels(tree$tip.label, frame = 'circle', bg = "#F0C808")
  dev.off()
}
plot_tree(tree, treeplot)

CalcVcv

# Calculate variance/covariance matrix of data points along a tree

## 1 Create a matrix of pairwise elements of input data points
## then each entry of these matrix is covariance between a pair of data points, (i, j)
## 2. For each entry (i, j) calculation covariance:
## 2.1 when int(i) == int(j): the covariance is `adephylo::distRoot(tree, int(i)) - (max(i, j) - int(i))`
## 2.2 when i != j: `adephylo::distRoot(tree, ape::mrca(tree, full = T)[int(i), int(j)]) - foo(i - int(i), j - int(j)))`
## where foo returns: if i on j's path to MRCA then is i - int(i); if j on i's path to MRCA then is j - int(j); else zero
## It works flawlessly except for being way too slow
## \input: tree, positions
## This script works with DSC. Please load tree and positions data when you run it independently

suppressMessages(require(adephylo, quietly = T, warn.conflicts = F))
suppressMessages(require(ape, quietly = T, warn.conflicts = F))
is_subset <- function(x, y) any(apply(embed(y,length(y)-length(x)+1),2,identical,x))
get_covij <- function(tree, i, j) {
if (as.integer(i) != as.integer(j)) {
  covij <- as.vector(distRoot(tree, mrca(tree, full = T)[as.integer(i), as.integer(j)]))[1]
  path_i <- nodepath(tree, from = as.integer(i), to = length(tree$tip.label) + 1)
  path_j <- nodepath(tree, from = as.integer(j), to = length(tree$tip.label) + 1)
  ## j contains i or i contains j
  if (length(path_j) > length(path_i)) {
    covij <- covij - (i - as.integer(i)) *  is_subset(path_i, path_j)
  } else if (length(path_i) > length(path_j)) {
    covij <- covij - (j - as.integer(j)) * is_subset(path_j, path_i)
  } else {
    if (identical(path_i, path_j)) print('ERROR')
  }
} else {
  covij <- distRoot(tree, as.integer(i)) - (max(i, j) - as.integer(i))
}
return(covij)
}

CX <- matrix(NA, nrow = length(positions), ncol = length(positions))
for (ii in 1:length(positions)) {
  for (jj in 1:length(positions)) {
    if (ii >= jj) CX[ii, jj] <- get_covij(tree, positions[ii], positions[jj])
  }
}
CX[upper.tri(CX)] <- 0
CX <- CX + t(CX) - diag(diag(CX))

Vcv2BM

suppressMessages(require(MASS, quietly = T))
## Each row will be one realization from the tree,
## row length will be number of nodes on the tree
## nrow == length(X_labels)
## Each column will be on sample (replicate)
dat <- t(mvrnorm(k, rep(mu, length(X_labels)), CX * sigma * scalar))
X <- list()
for (item in unique(X_labels)) {
  X[[paste0('I', item)]] <- dat[which(X_labels == item), ]
}

SimBM

################ Generate Brownian motion tree option ############
## By Kushal Dey
## Generate Brownian motion on a tree and sample points from the path
## tree: A tree structure generated by the simulate_bitree() function res$data
## all_nodes: in-between nodes data points generated by simulate_bitree() 
## Nsim: Number of Brownian motions to generate/simulate
## diffusion: A diffusion rate parameter for the diffusion Brownian process. Default=1
## precision: The time-scale for each branch of the tree produced (if the branch length is t,
## it blows it up to t*precision and generate Brownian motion with scaled variance
## under that new scale)

## plot: if TRUE, plots the paths for each simulation,defaults to FALSE

## Samples points from Brownian path for each non-root branch
## based on tree$all_nodes() function and then records these time
## points and the corresponding value of the Tree Diffusion process
## at that point for each simulation run. This whole process is repeated
## Nsim number of times to generate Nsim Brownian processes on trees

IntermBM <- function(tree, all_nodes, Nsim, diffusion = 1, precision = 1000, plot = FALSE)
{
  Y <- list()
  ll <- floor(tree$edge.length * precision) + 1
  for (m in 1:Nsim) {
    ## create a fine grid (w.r.t precision) of discrete points to approximate continuous Brownian motion
    ## G[[i]] is finer than wnat we need (Y[[i]])
    G <- lapply(tree$edge.length,
                function(x) return(c(0, cumsum(rnorm(n = floor(x * precision), sd = sqrt(diffusion/precision))))))
    if(plot) {
      H <- nodeHeights(tree)
      plot(H[1, 1], G[[1]][1], ylim = range(G), xlim = range(H), xlab = "time", ylab = "phenotype")
      for (i in 1:length(G))
        lines(H[i, 1]:H[i, 2], G[[i]][seq(from=1, to=length(G[[i]]), length.out=length(H[i,1]:H[i,2]))])
      yy <- sapply(1:length(tree$tip.label), function(x, y) which(x == y), y = tree$edge[,2])
      yy <- sapply(yy, function(x, y) y[[x]][length(y[[x]])], y = G)
      text(x = max(H), y = yy, tree$tip.label)
    }
    ## simulate data points
    for (i in 1:nrow(tree$edge)) {
      ## find parent
      pp <- which(tree$edge[, 2] == tree$edge[i, 1])
      ## add parent's value to itself
      if (length(pp) > 0)
        ## non-root
        G[[i]] <- G[[i]] + G[[pp]][ll[pp]]
      else
        ## root
        G[[i]] <- G[[i]] + G[[1]][1]
      ## find all data points between its parent and itself
      curr_index <- which(floor(all_nodes) == tree$edge[i, 2])
      pos <- ceiling((1-all_nodes[curr_index]%%1) * length(G[[i]])) ## position: 1 minus the factional part
      ## Properly determine labels
      if (tree$edge[i, 2] <= length(tree$tip.label)) 
        label = tree$tip.label[tree$edge[i, 2]]
      else
        label = as.character(tree$edge[i, 2])
      ## Y[[label]] is a subset of G[[i]]
      if (m == 1)
        Y[[paste0('I', label)]] <- G[[i]][pos]
      else
        Y[[paste0('I', label)]] <- cbind(Y[[paste0('I', label)]], G[[i]][pos])
    }
  }
  return(Y)
}

X <- IntermBM(tree, all_nodes, Nsim, diffusion, precision)

ManifoldDR

import sys
import numpy as np
from sklearn import manifold
from sklearn.decomposition import PCA
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
from plotconf import *

class DMR:
    '''
    Run various off the shelf dimensionality reduction methods
    and make figures
    '''
    def __init__(self, data, label, **kwargs):
        if label is None:
            label = list(data.keys())
        self.properties = {'range': [], 'label': label, 'color': [], 'method': None}
        self.data = None
        nb_samples = 0
        for idx, k in enumerate(label):
            self.properties['range'].append((nb_samples, nb_samples + data[k].shape[0]))
            self.properties['color'].append(COLORS[idx % len(COLORS)])
            if self.data is None:
                self.data = data[k]
            else:
                self.data = np.vstack((self.data, data[k]))
            nb_samples += data[k].shape[0]
        self.data_new = None
        self.params = kwargs

    def LLE_standard(self):
        self.properties['method'] = "LLE"
        self.data_new = manifold.LocallyLinearEmbedding(n_neighbors = self.params["n_neighbors"],
                                               n_components = self.params["n_components"],
                                               eigen_solver='auto',
                                               method = 'standard').fit_transform(self.data)

    def LLE_ltsa(self):
        self.properties['method'] = "LTSA"
        self.data_new = manifold.LocallyLinearEmbedding(n_neighbors = self.params["n_neighbors"],
                                               n_components = self.params["n_components"],
                                               eigen_solver='auto',
                                               method = 'ltsa').fit_transform(self.data)

    def LLE_hessian(self):
        self.properties['method'] = "Hessian LLE"
        self.data_new = manifold.LocallyLinearEmbedding(n_neighbors = self.params["n_neighbors"],
                                               n_components = self.params["n_components"],
                                               eigen_solver='auto',
                                               method = 'hessian').fit_transform(self.data)

    def LLE_modified(self):
        self.properties['method'] = "Modified LLE"
        self.data_new = manifold.LocallyLinearEmbedding(n_neighbors = self.params["n_neighbors"],
                                               n_components = self.params["n_components"],
                                               eigen_solver='auto',
                                               method = 'modified').fit_transform(self.data)

    def isomap(self):
        self.properties['method'] = "Isomap"
        self.data_new = manifold.Isomap(n_neighbors = self.params["n_neighbors"],
                               n_components = self.params["n_components"]).fit_transform(self.data)

    def mds(self):
        self.properties['method'] = "MDS"
        self.data_new = manifold.MDS(n_components = self.params["n_components"],
                                     max_iter=100, n_init=1).fit_transform(self.data)

    def spectral_embedding(self):
        self.properties['method'] = "Spectral Embedding"
        self.data_new = manifold.SpectralEmbedding(n_neighbors = self.params["n_neighbors"],
                                          n_components = self.params["n_components"]).fit_transform(self.data)

    def t_sne(self):
        self.properties['method'] = "t-SNE"
        self.data_new = manifold.TSNE(n_components = self.params["n_components"],
                                      init='pca', random_state=0).fit_transform(self.data)

    def pca(self):
        self.properties['method'] = "PCA"
        self.data_new = PCA(n_components = self.params["n_components"]).fit_transform(self.data)

    # def mst(self):
    #     self.properties['method'] = "MST"
    # FIXME: input is graph matrix not distance matrix
    #     self.data_new = minimum_spanning_tree(csr_matrix(1 - np.absolute(np.corrcoef(self.data)))).toarray()

dmr = DMR(data, label, n_neighbors = n_neighbors, n_components = n_comp)
try:
    getattr(dmr, sys.argv[1])()
    properties, projection = dmr.properties, dmr.data_new
except Exception as e:
    properties = projection = []
    sys.stderr.write('{}'.format(e))

PlotDR

# Gao Wang (c) 2016
from plotconf import *
def plot_dr_dot(data, properties, fname):
    if len(data) == 0:
        os.system('cp ~/Dropbox/.blank.pdf {}'.format(fname))
        return
    ax = plt.subplot(111)
    for x, y, z in zip(properties['range'], properties['color'], list(properties['label'])):
        plt.scatter(data[x.item(0):x.item(1), 0], data[x.item(0):x.item(1), 1],
                    color = y, label = z, s = 40)
    plt.title(properties['method'][0] + '\n')
    plt.legend(loc = "center left", bbox_to_anchor=(0.9, 0.5))
    ax.xaxis.set_major_formatter(NullFormatter())
    ax.yaxis.set_major_formatter(NullFormatter())
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    plt.axis('tight')
    plt.savefig(fname)
    plt.close()

plot_dr_dot(data, properties, vizplot)

ApeMST

## Merge data across nodes and perform MST on distance matrix (1 - abs(cor(X)))
dat <- NULL
sample_labels <- vector()
for (i in label) {
  if (is.null(dat)) dat <- X[[i]]
  else dat <- rbind(dat, X[[i]])
  sample_labels <- append(sample_labels, rep(i, nrow(X[[i]])))
}
## use cor here for scale invariance
## projection <- ape::mst(1 - abs(cor(t(dat))))
projection <- ape::mst(dist(dat))
properties <- list(method = 'MST', samples = sample_labels, legend_text = label)

PlotMST

## Plot MST
## FIXME: Not sure which is the best way to visualize the MST matrix
## Here use igraph::graph.adjacency to view
source('plotconf.R')
suppressMessages(library(igraph))
fill <- rep(COLORS, ceiling(length(properties$legend_text)/length(COLORS)))[1:length(properties$legend_text)]
pdf(vizplot)
plot(graph.adjacency(mst),
     edge.arrow.mode = 0,
     vertex.size = PSIZE,
     vertex.label = NA,
     vertex.color = fill[match(properties$samples, properties$legend_text)],
     vertex.frame.color = fill[match(properties$samples, properties$legend_text)],
     layout=igraph::layout_with_fr
)
legend("right", legend = properties$legend_text, fill = fill, bty = "n", border = fill, cex = 0.75)
title(main = properties$method)
dev.off()