Last updated: 2016-11-08
Code version: bd286a36f14d3b332285cdc7e62258b1f616bb14
The subsampled files were created using the pipeline described here, which is identical to the pipeline used to process the full data files.
library("dplyr")
library("tidyr")
library("ggplot2")
library("cowplot")
theme_set(theme_bw(base_size = 12))
theme_update(panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_blank(),
legend.key = element_blank(),
plot.title = element_text(size = rel(1)))
d <- read.table("../data/subsampling-results.txt",
header = TRUE, sep = "\t", stringsAsFactors = FALSE)
str(d)
'data.frame': 6240 obs. of 25 variables:
$ type : chr "reads" "molecules" "reads" "molecules" ...
$ depth : int 50000 50000 50000 50000 50000 50000 250000 250000 250000 250000 ...
$ gene_subset : chr "all" "all" "lower" "lower" ...
$ seed : int 1 1 1 1 1 1 1 1 1 1 ...
$ subsampled_cells: int 5 5 5 5 5 5 5 5 5 5 ...
$ individual : chr "NA19098" "NA19098" "NA19098" "NA19098" ...
$ replicate : logi NA NA NA NA NA NA ...
$ lower_q : num 0 0 0 0 0.5 0.5 0 0 0 0 ...
$ upper_q : num 1 1 0.5 0.5 1 1 1 1 0.5 0.5 ...
$ available_ensg : int 12192 12192 12192 12192 12192 12192 12192 12192 12192 12192 ...
$ used_ensg : int 12192 12192 6096 6096 6096 6096 12192 12192 6096 6096 ...
$ available_ercc : int 43 43 43 43 43 43 43 43 43 43 ...
$ used_ercc : int 43 43 22 22 22 22 43 43 22 22 ...
$ potential_cells : int 142 142 142 142 142 142 142 142 142 142 ...
$ available_cells : int 142 142 142 142 142 142 142 142 142 142 ...
$ pearson_ensg : num 0.774 0.79 0.368 0.372 0.727 ...
$ pearson_ercc : num 0.893 0.907 0.456 0.406 0.914 ...
$ spearman_ensg : num 0.772 0.79 0.405 0.411 0.703 ...
$ spearman_ercc : num 0.854 0.853 0.456 0.383 0.955 ...
$ detected_ensg : int 7972 7972 2413 2413 5559 5559 9493 9493 3550 3550 ...
$ detected_ercc : int 29 29 10 10 20 20 35 35 14 14 ...
$ mean_counts_ensg: num 22471 12126 1292 738 21180 ...
$ mean_counts_ercc: num 568.8 269 8.2 3.2 561.4 ...
$ var_pearson : num 0.804 0.846 0.4 0.455 0.76 ...
$ var_spearman : num 0.762 0.792 0.451 0.467 0.693 ...
d_grouped <- d %>%
group_by(type, depth, gene_subset, subsampled_cells,
individual, potential_cells, available_cells,
lower_q, upper_q, available_ensg, used_ensg,
available_ercc, used_ercc) %>%
summarize(mean_detected = mean(detected_ensg),
sem_detected = sd(detected_ensg) / sqrt(length(detected_ensg)),
mean_bulk = mean(pearson_ensg),
sem_bulk = sd(pearson_ensg) / sqrt(length(pearson_ensg)),
mean_var = mean(var_pearson),
sem_var = sd(var_pearson) / sqrt(length(var_pearson)))
d_filter <- d_grouped %>% filter(individual == "NA19239",
type == "molecules",
gene_subset %in% c("lower", "upper"))
d_filter$gene_subset <- factor(d_filter$gene_subset,
levels = c("lower", "upper"),
labels = c("Lower 50% of expressed genes",
"Upper 50% of expressed genes"))
plot_bulk_title <- "Correlation of gene expression levels \n between single cells and the corresponding bulk samples"
plot_bulk_title <- "Correlation of gene expression levels between single cells and the corresponding bulk samples"
plot_bulk <- ggplot(d_filter,
aes(x = subsampled_cells, y = mean_bulk,
color = as.factor(depth))) +
geom_line() +
geom_errorbar(aes(ymin = mean_bulk - sem_bulk,
ymax = mean_bulk + sem_bulk),
width = 1) +
facet_wrap(~gene_subset) +
scale_color_grey(start = 0.8, end = 0.2, name = "Sequencing depth") +
theme(legend.position = "none") +
labs(x = "Number of subsampled cells",
y = "Pearson's r (+/- SEM)",
title = paste(strwrap(plot_bulk_title, width = 80), collapse = "\n"))
plot_bulk
plot_detected <- ggplot(d_filter,
aes(x = subsampled_cells, y = mean_detected,
color = as.factor(depth))) +
geom_line() +
geom_errorbar(aes(ymin = mean_detected - sem_detected,
ymax = mean_detected + sem_detected),
width = 1) +
facet_wrap(~gene_subset) +
scale_color_grey(start = 0.8, end = 0.2, name = "Sequencing depth",
labels = format(unique(d$depth), big.mark = ",",
scientifc = FALSE, trim = TRUE)) +
theme(legend.position = c(0.75, 0.35),
legend.key.size = grid::unit(0.2, "in")) +
labs(x = "Number of subsampled cells",
y = "Number of genes detected",
title = "Number of genes detected as expressed in single cell samples \n at different sequence coverage")
plot_detected
plot_var <- ggplot(d_filter,
aes(x = subsampled_cells, y = mean_var,
color = as.factor(depth))) +
geom_line() +
geom_errorbar(aes(ymin = mean_var - sem_var,
ymax = mean_var + sem_var),
width = 1) +
facet_wrap(~gene_subset) +
scale_color_grey(start = 0.8, end = 0.2, name = "Sequencing depth") +
theme(legend.position = "none") +
labs(x = "Number of subsampled cells",
y = "Pearson's r (+/- SEM)",
title = "Correlation of the variation in gene expression levels \n between subsets of single cells and the entire single cell dataset")
plot_var
plot_final <- plot_grid(plot_bulk, plot_detected, plot_var,
ncol = 1, labels = letters[1:3], label_size = 12)
plot_final
# png("../paper/figure/fig-subsample.png", width = 8, height = 12, units = "in", res = 300)
tiff("../paper/figure/fig-subsample.tiff",
width = 8, height = 12, units = "in", res = 300, compression = "zip")
plot_final
dev.off()
png
2
molecules_per_depth <- d %>%
filter(gene_subset == "all", type == "molecules")
plot_mol_depth_ensg <- ggplot(molecules_per_depth,
aes(x = as.factor(depth),
y = mean_counts_ensg)) +
geom_boxplot() +
facet_wrap(~individual) +
scale_y_continuous(breaks = seq(0, 1e5, by = 1e4)) +
theme(axis.text.x = element_text(angle = 90))
plot_mol_depth_ensg
plot_mol_depth_ercc <- plot_mol_depth_ensg %+% aes(y = mean_counts_ercc)
plot_mol_depth_ercc
sessionInfo()
R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] cowplot_0.3.1 ggplot2_1.0.1 tidyr_0.2.0 dplyr_0.4.2 knitr_1.10.5
loaded via a namespace (and not attached):
[1] Rcpp_0.12.4 magrittr_1.5 MASS_7.3-40 munsell_0.4.3
[5] colorspace_1.2-6 R6_2.1.1 stringr_1.0.0 httr_0.6.1
[9] plyr_1.8.3 tools_3.2.0 parallel_3.2.0 grid_3.2.0
[13] gtable_0.1.2 DBI_0.3.1 htmltools_0.2.6 lazyeval_0.1.10
[17] yaml_2.1.13 assertthat_0.1 digest_0.6.8 reshape2_1.4.1
[21] formatR_1.2 bitops_1.0-6 RCurl_1.95-4.6 evaluate_0.7
[25] rmarkdown_0.6.1 labeling_0.3 stringi_1.0-1 scales_0.4.0
[29] proto_0.3-10