Last updated: 2015-12-11
Code version: ba8f659f8899d05d9a13b626bda016c7d2389a16
Explore the relationship between coefficient of variation and sparsity.
As expected, a large coefficient of variation is associated with a large number of zero-count cells in a gene, an observation that is consistent with an important feature of the count data where the coefficient of variation is large for genes with low expression levels.
library("data.table")
library("dplyr")
library("limma")
library("edgeR")
library("ggplot2")
library("grid")
theme_set(theme_bw(base_size = 12))
source("functions.R")
## Knitr settings
library(knitr)
opts_knit$set(progress = TRUE, verbose = TRUE)
opts_chunk$set(message = FALSE, warning = FALSE, eval = TRUE,
echo = TRUE)
Input annotation of only QC-filtered single cells
anno_qc <- read.table("../data/annotation-filter.txt", header = TRUE,
stringsAsFactors = FALSE)
head(anno_qc)
individual replicate well batch sample_id
1 NA19098 r1 A01 NA19098.r1 NA19098.r1.A01
2 NA19098 r1 A02 NA19098.r1 NA19098.r1.A02
3 NA19098 r1 A04 NA19098.r1 NA19098.r1.A04
4 NA19098 r1 A05 NA19098.r1 NA19098.r1.A05
5 NA19098 r1 A06 NA19098.r1 NA19098.r1.A06
6 NA19098 r1 A07 NA19098.r1 NA19098.r1.A07
Input molecule counts that are filtered, transformed, and also processed to remove unwanted variation from batch effet. ERCC genes are also removed.
molecules_qc <- read.table("../data/molecules-final.txt", header = TRUE, stringsAsFactors = FALSE)
Compute per batch coefficient of variation.
Include only genes with positive coefficient of variation. Some genes in this data may have zero coefficient of variation, because we include gene with more than 0 count across all cells.
molecules_cv_batch <-
lapply(1:length(unique(anno_qc$batch)), function(per_batch) {
molecules_per_batch <- molecules_qc[ , unique(anno_qc$batch) == unique(anno_qc$batch)[per_batch] ]
mean_per_gene <- apply(molecules_per_batch, 1, mean, na.rm = TRUE)
sd_per_gene <- apply(molecules_per_batch, 1, sd, na.rm = TRUE)
cv_per_gene <- data.frame(mean = mean_per_gene,
sd = sd_per_gene,
cv = sd_per_gene/mean_per_gene)
rownames(cv_per_gene) <- rownames(molecules_qc)
cv_per_gene <- cv_per_gene[rowSums(is.na(cv_per_gene)) == 0, ]
cv_per_gene$batch <- unique(anno_qc$batch)[per_batch]
return(cv_per_gene)
})
names(molecules_cv_batch) <- unique(anno_qc$batch)
dim(molecules_cv_batch[[1]])
[1] 10564 4
Import molecule count file before normlization.
molecules_filter <- read.table("../data/molecules-filter.txt")
select_nonERCC <- grep("ERCC", rownames(molecules_filter), invert = TRUE)
molecules_ENSG <- molecules_filter[select_nonERCC, ]
stopifnot(dim(molecules_ENSG) == dim(molecules_qc))
Compute the proportion of zero count cells in each gene.
per_batch = 2
molecules_sparse_batch <-
lapply(1:length(unique(anno_qc$batch)), function(per_batch) {
sparse_per_batch <- molecules_ENSG[ , unique(anno_qc$batch) == unique(anno_qc$batch)[per_batch] ]
sparse_per_gene <- apply(sparse_per_batch, 1, function(x) {
mean(x == 0)
})
return(sparse_per_gene)
})
names(molecules_sparse_batch) <- unique(anno_qc$batch)
str(molecules_sparse_batch[[1]])
Named num [1:10564] 0.7042 0.3239 0.2676 0.0423 0.2535 ...
- attr(*, "names")= chr [1:10564] "ENSG00000237683" "ENSG00000188976" "ENSG00000187608" "ENSG00000188157" ...
str(molecules_sparse_batch[[2]])
Named num [1:10564] 0.7324 0.2113 0.1831 0.0141 0.2958 ...
- attr(*, "names")= chr [1:10564] "ENSG00000237683" "ENSG00000188976" "ENSG00000187608" "ENSG00000188157" ...
Frequency of zero-count cell proportions.
par(mfrow = c(3,3), mar = c(4,2,2,3))
for (per_batch in 1:length(unique(anno_qc$batch))) {
hist( molecules_sparse_batch[[per_batch]],
main = unique(anno_qc$batch)[per_batch], breaks = 100,
ylim = c(0, 3000), xlim = c(0, 1),
xlab = "No. of zero-count cells")
}
require(broman)
require(scales)
crayons <- brocolors("crayon")
par(mfrow = c(3,3), mar = c(4,2,2,3))
for (per_batch in 1:length(unique(anno_qc$batch))) {
df_xy <- data.frame(x = molecules_sparse_batch[[per_batch]],
y = molecules_cv_batch[[per_batch]]$cv)
plot(df_xy,
xlab = "Proportion of zero-count cells",
ylab = "CV",
xlim = c(0, 1), ylim = c(0, 1),
main = unique(anno_qc$batch)[per_batch],
cex = .6, pch = 16, col = alpha(crayons["Outer Space"], .8))
loess_fit <- loess(y ~ x, df_xy)
x_seq <- with(df_xy, seq(min(x), max(x), by = .01))
lines(x_seq,
predict(loess_fit, x_seq),
col = crayons["Outrageous Orange"],
lwd = 1)
}
sessionInfo()
R version 3.2.1 (2015-06-18)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.10.5 (Yosemite)
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] scales_0.3.0 broman_0.59-5 ggplot2_1.0.1 edgeR_3.10.5
[5] limma_3.24.15 dplyr_0.4.3 data.table_1.9.6 knitr_1.11
loaded via a namespace (and not attached):
[1] Rcpp_0.12.2 magrittr_1.5 MASS_7.3-45 munsell_0.4.2
[5] colorspace_1.2-6 R6_2.1.1 stringr_1.0.0 plyr_1.8.3
[9] tools_3.2.1 parallel_3.2.1 gtable_0.1.2 DBI_0.3.1
[13] htmltools_0.2.6 yaml_2.1.13 assertthat_0.1 digest_0.6.8
[17] reshape2_1.4.1 formatR_1.2.1 evaluate_0.8 rmarkdown_0.8.1
[21] stringi_1.0-1 chron_2.3-47 proto_0.3-10