Last updated: 2016-10-28

Code version: 4ef2b53c021976f186b791795ba9dcfa0c105217

Objective

This page contains codes for reproducing figures reported in the paper that are related the analysis of cell-to-cell variation in transcriptional gene expression.

Set up

library("data.table")
library("dplyr")
library("limma")
library("edgeR")
library("ggplot2")
library("grid")
theme_set(theme_bw(base_size = 12))
source("functions.R")
library("Humanzee")
library("cowplot")
library("MASS")
library("matrixStats")
source("../code/plotting-functions.R")

We import molecule counts before standardizing and transformation and also log2-transformed counts after batch-correction. Biological variation analysis of the individuals is performed on the batch-corrected and log2-transformed counts.

# Import filtered annotations
anno_filter <- read.table("../data/annotation-filter.txt", 
                      header = TRUE,
                      stringsAsFactors = FALSE)

# Import filtered molecule counts
molecules_filter <- read.table("../data/molecules-filter.txt",
                               header = TRUE, stringsAsFactors = FALSE)
stopifnot(NROW(anno_filter) == NCOL(molecules_filter))

# Import final processed molecule counts of endogeneous genes
molecules_final <- read.table("../data/molecules-final.txt", 
                             header = TRUE, stringsAsFactors = FALSE)
stopifnot(NROW(anno_filter) == NCOL(molecules_final))

# Import gene symbols
gene_symbols <- read.table(file = "../data/gene-info.txt", sep = "\t",
                           header = TRUE, stringsAsFactors = FALSE, quote = "")

# Import cell-cycle gene list
cell_cycle_genes <- read.table("../data/cellcyclegenes.txt",
                               header = TRUE, sep = "\t",
                               stringsAsFactors = FALSE)

# Import pluripotency gene list

pluripotency_genes <- read.table("../data/pluripotency-genes.txt",
                               header = TRUE, sep = "\t",
                               stringsAsFactors = FALSE)$To

Load gene CVs computed over all cells (link1) and CV computed only over the expressed cells (link2).

load("../data/cv-all-cells.rda")
load("../data/cv-expressed-cells.rda")
#load("../data/sig-mean-expressed.rda")

## permutation results including the detected/expressed cells
load("../data/permuted-pval-expressed-set1.rda")

## load observed statistics (MAD)
load("../data/mad-expressed.rda")

Compute a matrix of 0’s and 1’s labeling non-detected and detected cells, respectively.

molecules_expressed <- molecules_filter
molecules_expressed[which(molecules_filter > 0 , arr.ind = TRUE)] <- 1
molecules_expressed <- as.matrix((molecules_expressed))

Take the gene subset included in the final data.

genes_included <- Reduce(intersect,
                         list(rownames(molecules_final),
                             rownames(expressed_cv$NA19098),
                             names(perm_pval_set1)) )

molecules_filter_subset <- molecules_filter[
  which(rownames(molecules_filter) %in% genes_included), ]

molecules_final_subset <- molecules_final[
  which(rownames(molecules_final) %in% genes_included), ]

molecules_expressed_subset <- molecules_expressed[
  which(rownames(molecules_expressed) %in% genes_included), ]

molecules_final_expressed_subset <- molecules_final_subset*molecules_expressed_subset
molecules_final_expressed_subset[which(molecules_expressed_subset == 0, 
                                       arr.ind = TRUE)] <- NA

permuted_pval_subset <- perm_pval_set1[which(names(perm_pval_set1) %in% genes_included)]

expressed_cv_subset <- lapply(expressed_cv, function(x)
  x[which(rownames(x) %in% genes_included), ] )
names(expressed_cv_subset) <- names(expressed_cv)

expressed_dm_subset <- expressed_dm[which(rownames(expressed_dm) %in% genes_included), , ] 

dim(molecules_final_subset)
[1] 13043   564
dim(molecules_expressed_subset)
[1] 13043   564
all.equal(rownames(expressed_cv_subset$NA19098), 
          rownames(molecules_final_expressed_subset) )
[1] TRUE
all.equal(names(permuted_pval_subset), 
          rownames(molecules_expressed_subset) )
[1] TRUE

Compute drop-out rates.

drop_out <- lapply(unique(anno_filter$individual), function(ind) {
  temp_df <- molecules_filter_subset[,anno_filter$individual == ind]
  zero_count <- rowMeans(temp_df == 0)
  return(zero_count)
})  
names(drop_out) <- unique(anno_filter$individual)
drop_out$all <- rowMeans(as.matrix(molecules_filter_subset) == 0)


summary(drop_out$NA19098)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00000 0.02113 0.21130 0.32060 0.59860 0.99300 
summary(drop_out$NA19101)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00000 0.04975 0.29350 0.36500 0.66170 0.99500 
summary(drop_out$NA19239)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00000 0.02715 0.25340 0.35060 0.65610 0.99550 
summary(drop_out$all)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00000 0.03723 0.26060 0.34820 0.64540 0.93090 

Statistical test for individual difference in dropout rate: logistic regression

if(!file.exists("../data/sig-dropout.rda")) {
library(lmtest)
  drop_out_significance <- sapply(1:NROW(molecules_expressed_subset), function(i) {
  fit <- glm(molecules_expressed_subset[i,] ~ as.factor(anno_filter$individual),
             family = "binomial")
  lrtest(fit)$P[2]  
})
  save(drop_out_significance, file = "../data/sig-dropout.rda")
} else {
  load("../data/sig-dropout.rda")
}
quantile(drop_out_significance, prob = seq(0,1,.1))[2] < 1e-5
 10% 
TRUE 
sum(drop_out_significance < quantile(drop_out_significance, prob = seq(0,1,.1))[2])
[1] 1214

Main figure 5: Cell-to-cell variation in gene expression

Expressed cells

# mean gene expression
genes <- rownames(expressed_cv_subset[[1]])
venn_mean_rank <- gplots::venn( 
  list(NA19098 = genes[ which(rank(expressed_cv_subset[[1]]$expr_mean) > length(genes) - 1000 ) ],
       NA19101 = genes[ which(rank(expressed_cv_subset[[2]]$expr_mean) > length(genes) - 1000 ) ],
       NA19239 = genes[ which(rank(expressed_cv_subset[[3]]$expr_mean) > length(genes) - 1000 ) ] ) )

# adjusted CV
venn_cv_rank <- gplots::venn( 
  list(NA19098 = genes[ which(rank(expressed_dm_subset[[1]]) > length(genes) - 1000 ) ],
       NA19101 = genes[ which(rank(expressed_dm_subset[[2]]) > length(genes) - 1000 ) ],
       NA19239 = genes[ which(rank(expressed_dm_subset[[3]]) > length(genes) - 1000 ) ] ) )

All cells

Venn diagram of gene-specific mean and adjusted CV for when all cells are included. For more results for when including all cells, see here.

load("../data/cv-all-cells.rda")
genes <- rownames(ENSG_cv[[1]])
library(gplots)
venn_cv_rank <- gplots::venn(
  list(NA19098 = genes[ which( rank(ENSG_cv_adj$NA19098$log10cv2_adj) 
                               > length(genes) - 1000 ) ],
       NA19101 = genes[ which( rank(ENSG_cv_adj$NA19101$log10cv2_adj) 
                               > length(genes) - 1000 ) ],
       NA19239 = genes[ which( rank(ENSG_cv_adj$NA19239$log10cv2_adj) 
                               > length(genes) - 1000 ) ] ))

gplots::venn( 
  list(NA19098 = genes[ which(rank(ENSG_cv[[1]]$mean) > length(genes) - 1000 ) ],
       NA19101 = genes[ which(rank(ENSG_cv[[2]]$mean) > length(genes) - 1000 ) ],
       NA19239 = genes[ which(rank(ENSG_cv[[3]]$mean) > length(genes) - 1000 ) ] ) )