Last updated: 2016-11-08
Code version: bd286a36f14d3b332285cdc7e62258b1f616bb14
library("biomaRt")
library("data.table")
library("dplyr")
library("limma")
library("edgeR")
library("ggplot2")
library("grid")
library("cowplot")
theme_set(theme_bw(base_size = 16))
theme_update(panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_blank(),
legend.key = element_blank(),
plot.title = element_text(size = rel(1)))
source("functions.R")
Input ERCC molecule counts calculated in capture efficiency.
ercc <- read.table("../data/expected-ercc-molecules.txt", header = TRUE,
stringsAsFactors = FALSE)
head(ercc)
id conc_mix1 ercc_molecules_well
1 ERCC-00130 30000.000 4877.93485
2 ERCC-00004 7500.000 1219.48371
3 ERCC-00136 1875.000 304.87093
4 ERCC-00108 937.500 152.43546
5 ERCC-00116 468.750 76.21773
6 ERCC-00092 234.375 38.10887
Input annotation.
anno <- read.table("../data/annotation.txt", header = TRUE,
stringsAsFactors = FALSE)
head(anno)
individual replicate well batch sample_id
1 NA19098 r1 A01 NA19098.r1 NA19098.r1.A01
2 NA19098 r1 A02 NA19098.r1 NA19098.r1.A02
3 NA19098 r1 A03 NA19098.r1 NA19098.r1.A03
4 NA19098 r1 A04 NA19098.r1 NA19098.r1.A04
5 NA19098 r1 A05 NA19098.r1 NA19098.r1.A05
6 NA19098 r1 A06 NA19098.r1 NA19098.r1.A06
Input read counts and filter for quality cells.
reads <- read.table("../data/reads.txt", header = TRUE,
stringsAsFactors = FALSE)
quality_single_cells <- scan("../data/quality-single-cells.txt", what = "character")
reads <- reads[, colnames(reads) %in% quality_single_cells]
Input read counts in high quality cells for filtered genes
reads_filter <- read.table("../data/reads-filter.txt", header = TRUE,
stringsAsFactors = FALSE)
Input molecule counts and filter for quality cell.
molecules <- read.table("../data/molecules.txt", header = TRUE,
stringsAsFactors = FALSE)
molecules <- molecules[, colnames(molecules) %in% quality_single_cells]
Input molecule counts in high quality cells for filtered genes
molecules_filter <- read.table("../data/molecules-filter.txt", header = TRUE,
stringsAsFactors = FALSE)
Compare the means of each gene obtained via the different methods.
## calculate mean
reads_mean <- apply(reads, 1, mean)
molecules_mean <- apply(molecules, 1, mean)
distribution <- data.frame(reads_mean, molecules_mean)
reads_filter_mean <- apply(reads_filter, 1, mean)
molecules_filter_mean <- apply(molecules_filter, 1, mean)
distribution_filter <- data.frame(reads_filter_mean, molecules_filter_mean)
## correlation between reads and molecules
cor(distribution)
reads_mean molecules_mean
reads_mean 1.0000000 0.9277703
molecules_mean 0.9277703 1.0000000
cor(distribution_filter)
reads_filter_mean molecules_filter_mean
reads_filter_mean 1.0000000 0.9408234
molecules_filter_mean 0.9408234 1.0000000
## select ERCC
distribution$type <- ifelse(grepl("ERCC", rownames(distribution)), "ERCC", "gene")
distribution_filter$type <- ifelse(grepl("ERCC", rownames(distribution_filter)), "ERCC", "gene")
## color palette
cbPalette <- c("#0000FF", "#999999", "#990033", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#009E73")
## plots
plot_mean_log <- ggplot(distribution, aes(x = molecules_mean, y = reads_mean, col = type)) +
geom_point(size = 3, alpha = 0.5) +
scale_colour_manual(values=cbPalette) +
labs(x = "Mean molecule-counts across samples",
y = "Mean read-counts across samples",
title = "Amplification bias is stronger \n in endogenous genes than in ERCC spike-ins") +
scale_x_log10() +
scale_y_log10()
plot_mean_log
plot_mean_filter_log <- ggplot(distribution_filter, aes(x = molecules_filter_mean, y = reads_filter_mean, col = type)) +
geom_point(size = 3, alpha = 0.5) +
scale_colour_manual(values=cbPalette) +
labs(x = "Mean molecule-counts across samples",
y = "Mean read-counts",
title = "Read-counts and molecule-counts of each gene (log scale)") +
scale_x_log10() +
scale_y_log10()
plot_mean_filter_log
plot_mean <- ggplot(distribution, aes(x = molecules_mean, y = reads_mean, col = type)) +
geom_point(size = 3, alpha = 0.5) +
scale_colour_manual(values=cbPalette) +
labs(x = "Mean molecule-counts across samples",
y = "Mean read-counts across samples",
title = "Amplification bias is stronger \n in endogenous genes than in ERCC spike-ins") +
geom_smooth(method = "lm")
plot_mean
Look at the distribution of fold change to mean. As being reported by others, the lowly expressed genes show divergent read and molecule counts
## calculate fold change to mean
distribution$fold_change_read <- log2(reads_mean/mean(reads_mean))
distribution$fold_change_molecule <- log2(molecules_mean/mean(molecules_mean))
plot_distribution <- ggplot(distribution, aes(x = fold_change_molecule, y = fold_change_read, col = type)) +
geom_point(size = 3, alpha = 0.5) +
scale_colour_manual(values=cbPalette) +
stat_function(fun= function(x) {x}, col= "#56B4E9") +
labs(x = "log2 fold change to mean (molecule)", y = "log2 fold change to mean (reads)")
plot_distribution
Use only those 50 ERCC genes with at least 1 expected molecule per well.
ercc_list <- list()
for (spike in ercc$id[ercc$ercc_molecules_well >= 1]) {
if (spike %in% rownames(molecules)) {
ercc_list$id <- c(ercc_list$id, spike)
ercc_list$observed_mean <- c(ercc_list$observed_mean,
mean(as.numeric(molecules[spike, ])))
ercc_list$observed_sem <- c(ercc_list$observed_sem,
sd(as.numeric(molecules[spike, ])) /
sqrt(ncol(molecules)))
ercc_list$expected <- c(ercc_list$expected,
ercc$ercc_molecules_well[ercc$id == spike])
}
}
ercc_plot <- as.data.frame(ercc_list, stringsAsFactors = FALSE)
str(ercc_plot)
'data.frame': 50 obs. of 4 variables:
$ id : chr "ERCC-00130" "ERCC-00004" "ERCC-00136" "ERCC-00108" ...
$ observed_mean: num 298.1879 57.2784 20.1933 13.5833 0.0603 ...
$ observed_sem : num 2.8448 0.6346 0.2841 0.2141 0.0106 ...
$ expected : num 4877.9 1219.5 304.9 152.4 76.2 ...
cor(ercc_plot$observed_mean, ercc_plot$expected)
[1] 0.9916157
Use molecule filter file.
ercc_list_filter <- list()
for (spike in ercc$id[ercc$ercc_molecules_well >= 0]) {
if (spike %in% rownames(molecules_filter)) {
ercc_list_filter$id <- c(ercc_list_filter$id, spike)
ercc_list_filter$observed_mean <- c(ercc_list_filter$observed_mean,
mean(as.numeric(molecules_filter[spike, ])))
ercc_list_filter$observed_sem <- c(ercc_list_filter$observed_sem,
sd(as.numeric(molecules_filter[spike, ])) /
sqrt(ncol(molecules_filter)))
ercc_list_filter$expected <- c(ercc_list_filter$expected,
ercc$ercc_molecules_well[ercc$id == spike])
}
}
ercc_filter_plot <- as.data.frame(ercc_list_filter, stringsAsFactors = FALSE)
str(ercc_filter_plot)
'data.frame': 48 obs. of 4 variables:
$ id : chr "ERCC-00130" "ERCC-00004" "ERCC-00136" "ERCC-00108" ...
$ observed_mean: num 298.19 57.28 20.19 13.58 2.15 ...
$ observed_sem : num 2.8448 0.6346 0.2841 0.2141 0.0674 ...
$ expected : num 4877.9 1219.5 304.9 152.4 38.1 ...
cor(ercc_filter_plot$observed_mean, ercc_filter_plot$expected)
[1] 0.9916498
p_efficiency <- ggplot(ercc_plot, aes(x = expected, y = observed_mean, label = id)) +
geom_point(col = "red") +
geom_errorbar(aes(ymin = observed_mean - observed_sem,
ymax = observed_mean + observed_sem), width = 0) +
labs(x = "Expected ERCC molecules",
y = "Observed ERCC molecules\n(mean +/- SEM)",
title = "ERCC capture efficiency")
p_efficiency_plot <- p_efficiency + scale_x_log10() +
scale_y_log10() +
labs(x = "Expected ERCC molecules",
y = "Observed ERCC molecules\n(mean +/- SEM)",
title = "ERCC capture efficiency \n (48 ERCC spike-in controls)")
p_efficiency_plot + geom_text(hjust = 0, nudge_x = 0.05, size = 4)
p_efficiency_filter_plot <- ggplot(ercc_filter_plot, aes(x = expected, y = observed_mean)) +
geom_point(col = "red") +
geom_errorbar(aes(ymin = observed_mean - observed_sem,
ymax = observed_mean + observed_sem), width = 0) +
scale_x_log10() + scale_y_log10() +
labs(x = "Expected ERCC molecules",
y = "Observed ERCC molecules\n(mean +/- SEM)",
title = "ERCC capture efficiency")
p_efficiency_filter_plot
ercc_index <- grep("ERCC", rownames(molecules_filter))
length(ercc_index)
[1] 48
efficiency <- numeric(length = ncol(molecules_filter))
total_ercc_molecules <- sum(ercc_filter_plot$expected)
for (i in 1:ncol(molecules_filter)) {
efficiency[i] <- sum(molecules_filter[ercc_index, i]) / total_ercc_molecules
}
summary(efficiency)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.03978 0.05459 0.06140 0.06580 0.07621 0.12150
plot_grid(plot_mean_log + theme(legend.position = c(.85,.25)) + labs (col = ""),
p_efficiency_plot + theme(legend.position = "none"),
labels = letters[6:7])
sessionInfo()
R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] cowplot_0.3.1 ggplot2_1.0.1 edgeR_3.10.2 limma_3.24.9
[5] dplyr_0.4.2 data.table_1.9.4 biomaRt_2.24.0 knitr_1.10.5
loaded via a namespace (and not attached):
[1] Rcpp_0.12.4 formatR_1.2 GenomeInfoDb_1.4.0
[4] plyr_1.8.3 bitops_1.0-6 tools_3.2.0
[7] digest_0.6.8 RSQLite_1.0.0 evaluate_0.7
[10] gtable_0.1.2 DBI_0.3.1 yaml_2.1.13
[13] parallel_3.2.0 proto_0.3-10 httr_0.6.1
[16] stringr_1.0.0 S4Vectors_0.6.0 IRanges_2.2.4
[19] stats4_3.2.0 Biobase_2.28.0 R6_2.1.1
[22] AnnotationDbi_1.30.1 XML_3.98-1.2 rmarkdown_0.6.1
[25] reshape2_1.4.1 magrittr_1.5 scales_0.4.0
[28] htmltools_0.2.6 BiocGenerics_0.14.0 MASS_7.3-40
[31] assertthat_0.1 colorspace_1.2-6 labeling_0.3
[34] stringi_1.0-1 RCurl_1.95-4.6 munsell_0.4.3
[37] chron_2.3-45