Last updated: 2015-09-14
Code version: 11a61e12229c07d54ae50b9a5db91ff78b59c4e4
source("functions.R")
library("limma")
library("edgeR")
library(ggplot2)
theme_set(theme_bw(base_size = 16))
Input annotation
anno <- read.table("../data/annotation.txt", header = TRUE,
stringsAsFactors = FALSE)
Input molecule counts
molecules <- read.table("../data/molecules.txt", header = TRUE,
stringsAsFactors = FALSE)
Input read count
reads <- read.table("../data/reads.txt", header = TRUE,
stringsAsFactors = FALSE)
Remove batch 2 of individual 19098.
molecules_no <- molecules[, !(anno$individual == 19098 & anno$batch == 2)]
reads_single <- reads[, !(anno$individual == 19098 & anno$batch == 2)]
anno_no <- anno[!(anno$individual == 19098 & anno$batch == 2), ]
stopifnot(ncol(molecules_no) == nrow(anno_no))
Remove bulk samples.
molecules_single <- molecules_no[, anno_no$well != "bulk"]
anno_single <- anno_no[anno_no$well != "bulk", ]
stopifnot(ncol(molecules_single) == nrow(anno_single))
Remove genes with zero count in the single cells
expressed_single <- rowSums(molecules_single) > 0
molecules_single <- molecules_single[which(expressed_single), ]
reads_single <- reads_single[expressed_single, ]
require(matrixStats)
Loading required package: matrixStats
matrixStats v0.14.0 (2015-02-13) successfully loaded. See ?matrixStats for help.
number_nonzero_cells <- colSums(molecules_single != 0)
number_genes <- dim(molecules_single)[1]
molecules_prop_genes_detected <-
data.frame(prop = number_nonzero_cells/number_genes,
individual = anno_single$individual,
individual.batch = paste(anno_single$individual,
anno_single$batch, sep = "."))
ggplot(molecules_prop_genes_detected,
aes(y = prop, x = as.factor(individual.batch))) +
geom_boxplot( aes(fill = as.factor(individual)) ) +
labs(x = "Individual.batch ID",
y = "Proportion of detected genes",
title = "Proportion of detected genes")
Principal component analysis on log2 transformed values. We avoid log of 0’s by add 1’s. In addition, our PCA analysis requires that every gene needs to be present in at least one of the cells.
molecules_single_log2_pca <- run_pca( log2( molecules_single + 1 ) )
qplot(y = molecules_single_log2_pca$PCs[,1],
x = molecules_prop_genes_detected$prop,
shape = as.factor(anno_single$batch),
colour = as.factor(anno_single$individual),
xlab = "Proportion of genes detected",
ylab = "First principal component score")
qplot(y = molecules_single_log2_pca$PCs[,2],
x = molecules_prop_genes_detected$prop,
shape = as.factor(anno_single$batch),
colour = as.factor(anno_single$individual),
xlab = "Proportion of genes detected",
ylab = "Second principal component score")
Compute median of gene expression of non-zero measurements.
require(matrixStats)
molecules_single_log2_cpm <- molecules_single
molecules_single_log2_cpm[molecules_single_log2_cpm==0] <- NA
libsize <- colSums(molecules_single_log2_cpm, na.rm = TRUE)
molecules_single_log2_cpm <- log2( 10^6 * t(t(molecules_single_log2_cpm)/libsize) )
molecules_median_expression <-
apply(molecules_single_log2_cpm, 2,
function(per_cell) { median(per_cell[per_cell!=0],
na.rm = TRUE) })
molecules_df <- data.frame(medians = molecules_median_expression,
prop = molecules_prop_genes_detected$prop,
individual = anno_single$individual,
batch = anno_single$batch,
individual.batch = paste(anno_single$individual,
anno_single$batch, sep = "."))
ggplot(molecules_df, aes(y = medians, x = prop) ) +
geom_point( aes(shape = as.factor(batch),
colour = as.factor(individual) ) ) +
stat_smooth(method = "loess") +
labs( xlab = "Proportion of genes detected",
ylab = "Median expression of non-zero cells (log2 CPM)")
Input list of quality single cells
quality_single_cells <- scan("../data/quality-single-cells.txt", what = "character")
Keep only the single cells that pass the QC filters. This also removes the bulk samples
reads_single <- reads_single[, colnames(reads_single) %in% quality_single_cells]
molecules_single <- molecules_single[, colnames(molecules_single) %in% quality_single_cells]
anno_single <- anno_single[anno_single$sample_id %in% quality_single_cells, ]
stopifnot(ncol(molecules_single) == nrow(anno_single),
colnames(molecules_single) == anno_single$sample_id)
Remove genes with greater than or equal to 1,024 molecules in at least one of the cells
overexpressed_genes <- rownames(molecules_single)[ apply(molecules_single, 1,
function(x) any(x >= 1024))]
molecules_single <- molecules_single[!(rownames(molecules_single) %in% overexpressed_genes), ]
Remove genes with zero count in the single cells
molecules_single <- molecules_single[which(rowSums(molecules_single) > 0), ]
reads_single <- reads_single[rowSums(reads_single) > 0, ]
The observations below were the same before removing genes with greater than or equal to 1,024 molecules in at least one of the cells.
require(matrixStats)
number_nonzero_cells <- colSums(molecules_single != 0)
number_genes <- dim(molecules_single)[1]
molecules_prop_genes_detected <-
data.frame(prop = number_nonzero_cells/number_genes,
individual = anno_single$individual,
individual.batch = paste(anno_single$individual,
anno_single$batch, sep = "."))
ggplot(molecules_prop_genes_detected,
aes(y = prop, x = as.factor(individual.batch))) +
geom_boxplot( aes(fill = as.factor(individual)) ) +
labs(x = "Individual.batch ID",
y = "Proportion of detected genes",
title = "Proportion of detected genes")
Principal component analysis on log2 transformed values. We avoid log of 0’s by add 1’s. In addition, our PCA analysis requires that every gene needs to be present in at least one of the cells.
molecules_single_log2_pca <- run_pca( log2( molecules_single + 1 ) )
qplot(y = molecules_single_log2_pca$PCs[,1],
x = molecules_prop_genes_detected$prop,
shape = as.factor(anno_single$batch),
colour = as.factor(anno_single$individual),
xlab = "Proportion of genes detected",
ylab = "First principal component score")
qplot(y = molecules_single_log2_pca$PCs[,2],
x = molecules_prop_genes_detected$prop,
shape = as.factor(anno_single$batch),
colour = as.factor(anno_single$individual),
xlab = "Proportion of genes detected",
ylab = "Second principal component score")
Compute median of gene expression of non-zero measurements.
require(matrixStats)
molecules_single_log2_cpm <- molecules_single
molecules_single_log2_cpm[molecules_single_log2_cpm==0] <- NA
libsize <- colSums(molecules_single_log2_cpm, na.rm = TRUE)
molecules_single_log2_cpm <- log2( 10^6 * t(t(molecules_single_log2_cpm)/libsize) )
molecules_median_expression <-
apply(molecules_single_log2_cpm, 2,
function(per_cell) { median(per_cell[per_cell!=0],
na.rm = TRUE) })
molecules_df <- data.frame(medians = molecules_median_expression,
prop = molecules_prop_genes_detected$prop,
individual = anno_single$individual,
batch = anno_single$batch,
individual.batch = paste(anno_single$individual,
anno_single$batch, sep = "."))
ggplot(molecules_df, aes(y = medians, x = prop) ) +
geom_point( aes(shape = as.factor(batch),
colour = as.factor(individual) ) ) +
stat_smooth(method = "loess") +
labs( xlab = "Proportion of genes detected",
ylab = "Median expression of non-zero cells (log2 CPM)")
require(matrixStats)
number_nonzero_cells <- colSums(reads_single != 0)
number_genes <- dim(reads_single)[1]
reads_prop_genes_detected <-
data.frame(prop = number_nonzero_cells/number_genes,
individual = anno_single$individual,
individual.batch = paste(anno_single$individual,
anno_single$batch, sep = "."))
ggplot(reads_prop_genes_detected,
aes(y = prop, x = as.factor(individual.batch))) +
geom_boxplot( aes(fill = as.factor(individual)) ) +
labs(x = "Individual.batch ID",
y = "Proportion of detected genes",
title = "Proportion of detected genes in read count data")
Principal component analysis on log2 transformed values. We avoid log of 0’s by add 1’s. In addition, our PCA analysis requires that every gene needs to be present in at least one of the cells.
reads_log2_pca <- run_pca( log2( reads_single + 1 ) )
qplot(y = reads_log2_pca$PCs[,1],
x = reads_prop_genes_detected$prop,
shape = as.factor(anno_single$batch),
colour = as.factor(anno_single$individual),
xlab = "Proportion of genes detected",
ylab = "First principal component score")
qplot(y = reads_log2_pca$PCs[,2],
x = reads_prop_genes_detected$prop,
shape = as.factor(anno_single$batch),
colour = as.factor(anno_single$individual),
xlab = "Proportion of genes detected",
ylab = "Second principal component score")
Compute median of gene expression of non-zero measurements.
require(matrixStats)
reads_single[reads_single==0] <- NA
reads_libsize <- colSums(reads_single, na.rm = TRUE)
reads_cpm <- log2( 10^6 * t(t(reads_single)/reads_libsize) )
reads_median_expression <-
apply(reads_cpm, 2,
function(per_cell) { median(per_cell[per_cell!=0],
na.rm = TRUE) })
reads_df <- data.frame(medians = reads_median_expression,
prop = reads_prop_genes_detected$prop,
individual = anno_single$individual,
batch = anno_single$batch,
individual.batch = paste(anno_single$individual,
anno_single$batch, sep = "."))
ggplot(reads_df, aes(y = medians, x = prop) ) +
geom_point( aes(shape = as.factor(batch),
colour = as.factor(individual) ) ) +
stat_smooth(method = "loess") +
labs( xlab = "Proportion of genes detected",
ylab = "Median expression of non-zero cells (log2 CPM)")
sessionInfo()
R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] testit_0.4 matrixStats_0.14.0 ggplot2_1.0.1
[4] edgeR_3.10.2 limma_3.24.9 knitr_1.10.5
loaded via a namespace (and not attached):
[1] Rcpp_0.12.0 magrittr_1.5 MASS_7.3-40 munsell_0.4.2
[5] colorspace_1.2-6 stringr_1.0.0 plyr_1.8.3 tools_3.2.0
[9] grid_3.2.0 gtable_0.1.2 htmltools_0.2.6 yaml_2.1.13
[13] digest_0.6.8 reshape2_1.4.1 formatR_1.2 evaluate_0.7
[17] rmarkdown_0.6.1 labeling_0.3 stringi_0.4-1 scales_0.2.4
[21] proto_0.3-10