Last updated: 2016-02-27
Code version: e913fc0a3f954acf436955c670b4cdf1d052c075
library("dplyr")
library("doMC")
registerDoMC(7)
library("ggplot2")
theme_set(theme_bw(base_size = 12))
source("functions.R")
This file tranforms the standardized molecule counts by modeling the ERCC counts using a Poisson generalized linear model (glm).
Creates the following file:
molecules-cpm-trans.txt - Molecules in high quality single cells after linear transformation with ERCC
Input filtered annotation.
anno_filter <- read.table("../data/annotation-filter.txt", header = TRUE,
stringsAsFactors = FALSE)
head(anno_filter)
individual replicate well batch sample_id
1 NA19098 r1 A01 NA19098.r1 NA19098.r1.A01
2 NA19098 r1 A02 NA19098.r1 NA19098.r1.A02
3 NA19098 r1 A04 NA19098.r1 NA19098.r1.A04
4 NA19098 r1 A05 NA19098.r1 NA19098.r1.A05
5 NA19098 r1 A06 NA19098.r1 NA19098.r1.A06
6 NA19098 r1 A07 NA19098.r1 NA19098.r1.A07
Input filtered molecule counts.
molecules_filter <- read.table("../data/molecules-filter.txt", header = TRUE,
stringsAsFactors = FALSE)
stopifnot(ncol(molecules_filter) == nrow(anno_filter),
colnames(molecules_filter) == anno_filter$sample_id)
Input standardized molecule counts.
molecules_cpm <- read.table("../data/molecules-cpm.txt", header = TRUE,
stringsAsFactors = FALSE)
stopifnot(ncol(molecules_cpm) == nrow(anno_filter),
colnames(molecules_cpm) == anno_filter$sample_id)
Input expected ERCC molecule counts (calculated in another analysis). Filter to only include those ERCC controls that passed the expression filter.
spike <- read.table("../data/expected-ercc-molecules.txt", header = TRUE,
sep = "\t", stringsAsFactors = FALSE)
spike_input <- spike$ercc_molecules_well[spike$id %in% rownames(molecules_filter)]
names(spike_input) <- spike$id[spike$id %in% rownames(molecules_filter)]
spike_input <- spike_input[order(names(spike_input))]
stopifnot(names(spike_input) == grep("ERCC", rownames(molecules_filter), value = TRUE))
pois_glm_params <- lapply(1:3,function(g) numeric(ncol(molecules_cpm)))
names(pois_glm_params) = c("intercept","nu","theta")
for (i in 1:ncol(molecules_cpm)) {
fit <- glm(molecules_filter[names(spike_input), i] ~ log(spike_input), family="poisson")
pois_glm_params$intercept[i] <- fit$coefficients[1]
pois_glm_params$nu[i] <- fit$coefficients[2]
}
molecules_cpm_trans <- sweep( sweep( molecules_cpm, 2, pois_glm_params$intercept, "-"), 2, pois_glm_params$nu, "/" )
Write to file.
write.table(round(molecules_cpm_trans, digits = 6), "../data/molecules-cpm-trans.txt", quote = FALSE,
sep = "\t", col.names = NA)
pca_molecules_cpm_trans <- run_pca(molecules_cpm_trans)
pca_molecules_cpm_trans_plot <- plot_pca(pca_molecules_cpm_trans$PCs, explained = pca_molecules_cpm_trans$explained,
metadata = anno_filter, color = "individual",
shape = "replicate") +
labs(title = "Molecules (Poisson transformation with ERCC) for single cells")
pca_molecules_cpm_trans_plot
Collect together all the normalized expression matrices for analysis
cpm_mats <- list( raw=molecules_cpm, molecules_cpm_trans=molecules_cpm_trans )
We need a nested ANOVA since the replicates are not common across the individuals.
my_nested_anova=function(temp) {
global_mean=mean(temp$y)
ind_means=temp %>% group_by(individual) %>% summarize(m=mean(y)) %>% as.data.frame
rownames(ind_means) = ind_means$individual
temp$ind_means=ind_means[ temp$individual, "m" ]
batch_means=temp %>% group_by(batch) %>% summarise(m=mean(y)) %>% as.data.frame
rownames(batch_means) = batch_means$batch
temp$batch_means=batch_means[ temp$batch, "m" ]
c(ssa=sum( (temp$ind_means - global_mean)^2 ),
ssb=sum( (temp$batch_means - temp$ind_means)^2 ),
sse=sum( (temp$y - temp$batch_means)^2 ),
sst=sum( (temp$y - global_mean)^2 ))
}
Run ANOVAs per gene for each matrix and calculate variance components
anovas <- lapply(cpm_mats, function(x) {
foreach(i=1:nrow(x)) %dopar% my_nested_anova(data.frame(y=as.numeric(x[i,]), batch=anno_filter$batch, individual=anno_filter$individual))
})
variance_components <- lapply( as.list(names(anovas)), function(name) {
ss=do.call(rbind,anovas[[name]])[,1:3]
colnames(ss)=c("individual","batch","residual")
data.frame(sweep(ss,1,rowSums(ss),"/"), method=name)
} )
names(variance_components)=names(cpm_mats)
batch_over_explained <- lapply( as.list(names(anovas)), function(name) {
ss=do.call(rbind,anovas[[name]])[,1:2]
colnames(ss)=c("individual","batch")
data.frame( prop_batch=ss[,"batch"] / rowSums(ss), method=name)
} )
names(batch_over_explained) = names(cpm_mats)
Plot proportions explained by batch vs. individual
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
ggplot( do.call(rbind,batch_over_explained), aes(prop_batch,col=method)) + geom_density(alpha=0.2, position="identity") + xlab("variance_batch / (variance_batch + variance_individual)") + theme(legend.position=c(.8,.8)) + scale_colour_manual(values=cbPalette)+ scale_fill_manual(values=cbPalette)
Plot overall percent variance explained
ggplot( do.call(rbind,variance_components), aes(1-residual,col=method)) + geom_density() + xlab("proportion variance explained") + xlim(0,.5)+ scale_colour_manual(values=cbPalette) + theme(legend.position=c(.8,.8))
Warning: Removed 64 rows containing non-finite values (stat_density).
Warning: Removed 140 rows containing non-finite values (stat_density).
sessionInfo()
R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] testit_0.4 ggplot2_1.0.1 doMC_1.3.4 iterators_1.0.7
[5] foreach_1.4.2 dplyr_0.4.2 knitr_1.10.5
loaded via a namespace (and not attached):
[1] Rcpp_0.12.0 magrittr_1.5 MASS_7.3-40 munsell_0.4.2
[5] colorspace_1.2-6 R6_2.1.1 plyr_1.8.3 stringr_1.0.0
[9] httr_0.6.1 tools_3.2.0 grid_3.2.0 gtable_0.1.2
[13] DBI_0.3.1 htmltools_0.2.6 yaml_2.1.13 assertthat_0.1
[17] digest_0.6.8 reshape2_1.4.1 formatR_1.2 bitops_1.0-6
[21] codetools_0.2-11 RCurl_1.95-4.6 evaluate_0.7 rmarkdown_0.6.1
[25] labeling_0.3 stringi_0.4-1 compiler_3.2.0 scales_0.2.4
[29] proto_0.3-10