Last updated: 2015-12-11
Code version: 884b335605607bd2cf5c9a1ada1e345466388866
We computed the adjusted CV for each individual and observed that genes with extreme CVs are unique to each individual, while genes with extreme means are likey to be shared across individuals. Moreover, the extreme genes are not more likely to be cell cycle genes than the non-extreme genes; in fact, we can see from the scatter plots that cell cycle genes tend to be small in our adjusted CV than the non-cell-cycle genes.
library("data.table")
library("dplyr")
library("limma")
library("edgeR")
library("ggplot2")
library("grid")
theme_set(theme_bw(base_size = 12))
source("functions.R")
Input annotation of only QC-filtered single cells. Remove NA19098.r2
anno_qc <- read.table("../data/annotation-filter.txt", header = TRUE,
stringsAsFactors = FALSE)
is_include <- anno_qc$batch != "NA19098.r2"
anno_qc_filter <- anno_qc[which(is_include), ]
Import endogeneous gene molecule counts that are QC-filtered, CPM-normalized, ERCC-normalized, and also processed to remove unwanted variation from batch effet. ERCC genes are removed from this file.
molecules_ENSG <- read.table("../data/molecules-final.txt", header = TRUE, stringsAsFactors = FALSE)
molecules_ENSG <- molecules_ENSG[ , is_include]
Input moleclule counts before log2 CPM transformation. This file is used to compute percent zero-count cells per sample.
molecules_sparse <- read.table("../data/molecules-filter.txt", header = TRUE, stringsAsFactors = FALSE)
molecules_sparse <- molecules_sparse[grep("ENSG", rownames(molecules_sparse)), ]
stopifnot( all.equal(rownames(molecules_ENSG), rownames(molecules_sparse)) )
We compute squared CV across cells for each individual and then for each individual CV profile, account for mean dependency by computing distance with respect to the data-wide coefficient variation on the log10 scale.
source("../code/cv-functions.r")
ENSG_cv <- compute_cv(log2counts = molecules_ENSG,
grouping_vector = anno_qc_filter$individual)
ENSG_cv_adj <- normalize_cv(group_cv = ENSG_cv,
log2counts = molecules_ENSG,
anno = anno_qc_filter)
*Some plotting
Mark genes with black dots if log10(cv^2) > .5.
Distance to the data-wise CV (adjusted CV): small CV values are more variable in terms their distance to the data-wise CV value; hence we see a large distance to the data-wise CV for the small CVs than the large CVs.
high_cv <- subset(do.call(rbind, ENSG_cv_adj), log10(cv^2) > .5)
ggplot(do.call(rbind, ENSG_cv_adj),
aes(x = log10(mean), y = log10(cv^2))) +
geom_point(aes(col = group)) + facet_wrap( ~ group) +
ggtitle("CV-mean relationship") +
geom_point(data = high_cv, colour = "black")
ggplot(do.call(rbind, ENSG_cv_adj),
aes(x = log10(mean), y = log10cv2_adj)) +
geom_point(aes(col = group)) + facet_wrap( ~ group) +
ggtitle("Adjusted CV and mean relationship") +
geom_point(data = high_cv, colour = "black")
ggplot(do.call(rbind, ENSG_cv_adj),
aes(x = log10(cv^2), y = log10cv2_adj)) +
geom_point(aes(col = group)) + facet_wrap( ~ group) +
ggtitle("CV before versus after adjustment") +
geom_point(data = high_cv, colour = "black")
extreme_genes_50 <- lapply(ENSG_cv_adj, function(xx) {
low_mean <- rank(xx$mean) < 50
high_mean <- rank(xx$mean) > (length(xx$mean) - 50)
low_cv <- rank(xx$log10cv2_adj) < 50
high_cv <- rank(xx$log10cv2_adj) > (length(xx$log10cv2_adj) - 50)
res <- data.frame(high_mean = high_mean,
low_mean = low_mean,
high_cv = high_cv,
low_cv = low_cv)
rownames(res) <- rownames(ENSG_cv_adj[[1]])
res
})
*Top 50 in adjusted CV
genes <- rownames(extreme_genes_50[[1]])
library(gplots)
venn( list(genes[extreme_genes_50[[1]]$high_cv],
genes[extreme_genes_50[[2]]$high_cv],
genes[extreme_genes_50[[3]]$high_cv] ) )
*Bottom 50 in adjusted CV
genes <- rownames(extreme_genes_50[[1]])
library(gplots)
venn( list(genes[extreme_genes_50[[1]]$low_cv],
genes[extreme_genes_50[[2]]$low_cv],
genes[extreme_genes_50[[3]]$low_cv] ) )
*Top 50 in mean
genes <- rownames(extreme_genes_50[[1]])
library(gplots)
venn( list(genes[extreme_genes_50[[1]]$high_mean],
genes[extreme_genes_50[[2]]$high_mean],
genes[extreme_genes_50[[3]]$high_mean] ) )
*Bottom 50 in mean
genes <- rownames(extreme_genes_50[[1]])
library(gplots)
venn( list(genes[extreme_genes_50[[1]]$low_mean],
genes[extreme_genes_50[[2]]$low_mean],
genes[extreme_genes_50[[3]]$low_mean] ) )
extreme_genes_200 <- lapply(ENSG_cv_adj, function(xx) {
low_mean <- rank(xx$mean) < 200
high_mean <- rank(xx$mean) > (length(xx$mean) - 200)
low_cv <- rank(xx$log10cv2_adj) < 200
high_cv <- rank(xx$log10cv2_adj) > (length(xx$log10cv2_adj) - 200)
res <- data.frame(high_mean = high_mean,
low_mean = low_mean,
high_cv = high_cv,
low_cv = low_cv)
rownames(res) <- rownames(ENSG_cv_adj[[1]])
res
})
*Top 200 in adjusted CV
genes <- rownames(extreme_genes_200[[1]])
library(gplots)
venn( list(genes[extreme_genes_200[[1]]$high_cv],
genes[extreme_genes_200[[2]]$high_cv],
genes[extreme_genes_200[[3]]$high_cv] ) )
*Bottom 200 in adjusted CV
genes <- rownames(extreme_genes_200[[1]])
library(gplots)
venn( list(genes[extreme_genes_200[[1]]$low_cv],
genes[extreme_genes_200[[2]]$low_cv],
genes[extreme_genes_200[[3]]$low_cv] ) )
*Top 200 in mean
genes <- rownames(extreme_genes_200[[1]])
library(gplots)
venn( list(genes[extreme_genes_200[[1]]$high_mean],
genes[extreme_genes_200[[2]]$high_mean],
genes[extreme_genes_200[[3]]$high_mean] ) )
*Extreme genes as in top/bottom 500
extreme_genes_500 <- lapply(ENSG_cv_adj, function(xx) {
low_mean <- rank(xx$mean) < 500
high_mean <- rank(xx$mean) > (length(xx$mean) - 500)
low_cv <- rank(xx$log10cv2_adj) < 500
high_cv <- rank(xx$log10cv2_adj) > (length(xx$log10cv2_adj) - 500)
res <- data.frame(high_mean = high_mean,
low_mean = low_mean,
high_cv = high_cv,
low_cv = low_cv)
rownames(res) <- rownames(ENSG_cv_adj[[1]])
res
})
*Top 500 in adjusted CV
genes <- rownames(extreme_genes_500[[1]])
library(gplots)
venn( list(genes[extreme_genes_500[[1]]$high_cv],
genes[extreme_genes_500[[2]]$high_cv],
genes[extreme_genes_500[[3]]$high_cv] ) )
*Bottom 200 in adjusted CV
genes <- rownames(extreme_genes_500[[1]])
library(gplots)
venn( list(genes[extreme_genes_500[[1]]$low_cv],
genes[extreme_genes_500[[2]]$low_cv],
genes[extreme_genes_500[[3]]$low_cv] ) )
*Top 200 in mean
genes <- rownames(extreme_genes_500[[1]])
library(gplots)
venn( list(genes[extreme_genes_500[[1]]$high_mean],
genes[extreme_genes_500[[2]]$high_mean],
genes[extreme_genes_500[[3]]$high_mean] ) )
*Bottom 200 in mean
genes <- rownames(extreme_genes_500[[1]])
library(gplots)
venn( list(genes[extreme_genes_500[[1]]$low_mean],
genes[extreme_genes_500[[2]]$low_mean],
genes[extreme_genes_500[[3]]$low_mean] ) )
*Top 50 in adjusted CV
ggplot(do.call(rbind, ENSG_cv_adj),
aes(x = log10(mean), y = log10cv2_adj)) +
geom_point(aes(col = group), cex = .8) + facet_wrap( ~ group) +
ggtitle("Adj-CV top 50, NA19098") +
geom_point(data = subset(do.call(rbind, ENSG_cv_adj), extreme_genes_50[[1]]$high_cv),
colour = "grey20")
ggplot(do.call(rbind, ENSG_cv_adj),
aes(x = log10(mean), y = log10cv2_adj)) +
geom_point(aes(col = group), cex = .8) + facet_wrap( ~ group) +
ggtitle("Adj-CV top 50, NA19101") +
geom_point(data = subset(do.call(rbind, ENSG_cv_adj), extreme_genes_50[[2]]$high_cv),
colour = "grey20")
ggplot(do.call(rbind, ENSG_cv_adj),
aes(x = log10(mean), y = log10cv2_adj)) +
geom_point(aes(col = group), cex = .8) + facet_wrap( ~ group) +
ggtitle("Adj-CV top 50, NA19239") +
geom_point(data = subset(do.call(rbind, ENSG_cv_adj), extreme_genes_50[[3]]$high_cv),
colour = "grey20")
*Top 200 in adjusted CV
ggplot(do.call(rbind, ENSG_cv_adj),
aes(x = log10(mean), y = log10cv2_adj)) +
geom_point(aes(col = group), cex = .8) + facet_wrap( ~ group) +
ggtitle("Adj-CV top 50, NA19098") +
geom_point(data = subset(do.call(rbind, ENSG_cv_adj), extreme_genes_200[[1]]$high_cv),
colour = "grey20")
ggplot(do.call(rbind, ENSG_cv_adj),
aes(x = log10(mean), y = log10cv2_adj)) +
geom_point(aes(col = group), cex = .8) + facet_wrap( ~ group) +
ggtitle("Adj-CV top 50, NA19101") +
geom_point(data = subset(do.call(rbind, ENSG_cv_adj), extreme_genes_200[[2]]$high_cv),
colour = "grey20")
ggplot(do.call(rbind, ENSG_cv_adj),
aes(x = log10(mean), y = log10cv2_adj)) +
geom_point(aes(col = group), cex = .8) + facet_wrap( ~ group) +
ggtitle("Adj-CV top 50, NA19239") +
geom_point(data = subset(do.call(rbind, ENSG_cv_adj), extreme_genes_200[[3]]$high_cv),
colour = "grey20")
*Bottom 50 in adjusted CV
ggplot(do.call(rbind, ENSG_cv_adj),
aes(x = log10(mean), y = log10cv2_adj)) +
geom_point(aes(col = group), cex = .8) + facet_wrap( ~ group) +
ggtitle("Adj-CV top 50, NA19098") +
geom_point(data = subset(do.call(rbind, ENSG_cv_adj), extreme_genes_50[[1]]$low_cv),
colour = "grey20")
ggplot(do.call(rbind, ENSG_cv_adj),
aes(x = log10(mean), y = log10cv2_adj)) +
geom_point(aes(col = group), cex = .8) + facet_wrap( ~ group) +
ggtitle("Adj-CV top 50, NA19101") +
geom_point(data = subset(do.call(rbind, ENSG_cv_adj), extreme_genes_50[[2]]$low_cv),
colour = "grey20")
ggplot(do.call(rbind, ENSG_cv_adj),
aes(x = log10(mean), y = log10cv2_adj)) +
geom_point(aes(col = group), cex = .8) + facet_wrap( ~ group) +
ggtitle("Adj-CV top 50, NA19239") +
geom_point(data = subset(do.call(rbind, ENSG_cv_adj), extreme_genes_50[[3]]$low_cv),
colour = "grey20")
*Bottom 200 in adjusted CV
ggplot(do.call(rbind, ENSG_cv_adj),
aes(x = log10(mean), y = log10cv2_adj)) +
geom_point(aes(col = group), cex = .8) + facet_wrap( ~ group) +
ggtitle("Adj-CV top 50, NA19098") +
geom_point(data = subset(do.call(rbind, ENSG_cv_adj), extreme_genes_200[[1]]$low_cv),
colour = "grey20")
ggplot(do.call(rbind, ENSG_cv_adj),
aes(x = log10(mean), y = log10cv2_adj)) +
geom_point(aes(col = group), cex = .8) + facet_wrap( ~ group) +
ggtitle("Adj-CV top 50, NA19101") +
geom_point(data = subset(do.call(rbind, ENSG_cv_adj), extreme_genes_200[[2]]$low_cv),
colour = "grey20")
ggplot(do.call(rbind, ENSG_cv_adj),
aes(x = log10(mean), y = log10cv2_adj)) +
geom_point(aes(col = group), cex = .8) + facet_wrap( ~ group) +
ggtitle("Adj-CV top 50, NA19239") +
geom_point(data = subset(do.call(rbind, ENSG_cv_adj), extreme_genes_200[[3]]$low_cv),
colour = "grey20")
*Import cell-cycle gene list
cellcycle_genes <- read.table("../data/cellcyclegenes.txt", sep = "\t",
header = TRUE, stringsAsFactors = FALSE)
str(cellcycle_genes)
'data.frame': 151 obs. of 5 variables:
$ G1.S: chr "ENSG00000102977" "ENSG00000119640" "ENSG00000154734" "ENSG00000088448" ...
$ S : chr "ENSG00000114770" "ENSG00000144827" "ENSG00000180071" "ENSG00000105011" ...
$ G2.M: chr "ENSG00000011426" "ENSG00000065000" "ENSG00000213390" "ENSG00000122644" ...
$ M : chr "ENSG00000135541" "ENSG00000135334" "ENSG00000154945" "ENSG00000011426" ...
$ M.G1: chr "ENSG00000173744" "ENSG00000160216" "ENSG00000170776" "ENSG00000218624" ...
genes <- rownames(extreme_genes_50[[1]])
ii_cellcycle_genes <- lapply(1:3, function(per_individual) {
genes %in% unlist(cellcycle_genes)
})
names(ii_cellcycle_genes) <- names(extreme_genes_50)
ii_cellcycle_genes <- do.call(c, ii_cellcycle_genes)
ggplot(do.call(rbind, ENSG_cv_adj),
aes(x = log10(mean), y = log10cv2_adj)) +
geom_point(aes(col = group), cex = 1.2) + facet_wrap( ~ group) +
ggtitle("Cellcycle genes") +
geom_point(data = subset(do.call(rbind, ENSG_cv_adj),
ii_cellcycle_genes),
colour = "grey20", cex = 1.2)
*Import pluripotent gene list
pluripotent_genes <- read.table("../data/pluripotency-genes.txt", sep = "\t",
header = TRUE, stringsAsFactors = FALSE)
str(pluripotent_genes)
'data.frame': 27 obs. of 4 variables:
$ From : chr "GDF3" "ZFP42" "NR6A1" "SOX2" ...
$ To : chr "ENSG00000184344" "ENSG00000179059" "ENSG00000148200" "ENSG00000181449" ...
$ Species : chr "Homo sapiens" "Homo sapiens" "Homo sapiens" "Homo sapiens" ...
$ Gene.Name: chr "growth differentiation factor 3" "zinc finger protein 42 homolog (mouse)" "nuclear receptor subfamily 6, group A, member 1" "SRY (sex determining region Y)-box 2" ...
genes <- rownames(extreme_genes_50[[1]])
ii_pluripotent_genes <- lapply(1:3, function(per_individual) {
genes %in% unlist(pluripotent_genes$To)
})
names(ii_pluripotent_genes) <- names(extreme_genes_50)
ii_pluripotent_genes <- do.call(c, ii_pluripotent_genes)
ggplot(do.call(rbind, ENSG_cv_adj),
aes(x = log10(mean), y = log10cv2_adj)) +
geom_point(aes(col = group), cex = 1.2) + facet_wrap( ~ group) +
ggtitle("Pluripotent genes") +
geom_point(data = subset(do.call(rbind, ENSG_cv_adj),
ii_pluripotent_genes),
colour = "grey20", cex = 1.2)
sessionInfo()
R version 3.2.1 (2015-06-18)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.10.5 (Yosemite)
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] gplots_2.17.0 zoo_1.7-12 ggplot2_1.0.1 edgeR_3.10.5
[5] limma_3.24.15 dplyr_0.4.3 data.table_1.9.6 knitr_1.11
loaded via a namespace (and not attached):
[1] Rcpp_0.12.2 magrittr_1.5 MASS_7.3-45
[4] munsell_0.4.2 lattice_0.20-33 colorspace_1.2-6
[7] R6_2.1.1 stringr_1.0.0 plyr_1.8.3
[10] caTools_1.17.1 tools_3.2.1 parallel_3.2.1
[13] gtable_0.1.2 KernSmooth_2.23-15 DBI_0.3.1
[16] gtools_3.5.0 htmltools_0.2.6 yaml_2.1.13
[19] assertthat_0.1 digest_0.6.8 reshape2_1.4.1
[22] formatR_1.2.1 bitops_1.0-6 evaluate_0.8
[25] rmarkdown_0.8.1 labeling_0.3 gdata_2.17.0
[28] stringi_1.0-1 scales_0.3.0 chron_2.3-47
[31] proto_0.3-10