Last updated: 2015-10-22
Code version: 3b9f77ca4ecbf3c03d571d6c62c92a181dc81982
library("knitr")
opts_chunk$set(fig.align = "center", fig.width = 10)
Load packages.
library("edgeR")
## Loading required package: limma
library("ggplot2")
library("ggdendro")
library("cowplot")
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:ggplot2':
##
## ggsave
theme_set(theme_bw(base_size = 12))
Import the counts matrix.
counts <- read.table("../../data/mouse/counts-matrix.txt", header = TRUE)
dim(counts)
## [1] 22013 6
colnames(counts) <- sub("mESc.2i.RNA.", "", colnames(counts))
head(counts)
## DMSO.REP1 DMSO.REP2 DMSO.REP3 RA.REP1 RA.REP2 RA.REP3
## ENSMUSG00000051951 0 3 2 0 0 4
## ENSMUSG00000025900 1 0 1 0 1 0
## ENSMUSG00000025902 14 7 17 14 3 9
## ENSMUSG00000033845 3312 2890 2922 2981 772 1627
## ENSMUSG00000025903 1837 1789 1459 1348 409 881
## ENSMUSG00000104217 0 0 0 0 0 0
Create an annotation matrix.
anno <- strsplit(colnames(counts), split = "\\.")
anno <- data.frame(treatment = sapply(anno, function(x) x[1]),
replicate = sapply(anno, function(x) x[2]))
write.table(anno, file = "../../data/mouse/annotation.txt", quote = FALSE,
sep = "\t", row.names = FALSE)
anno
## treatment replicate
## 1 DMSO REP1
## 2 DMSO REP2
## 3 DMSO REP3
## 4 RA REP1
## 5 RA REP2
## 6 RA REP3
Remove genes with zero counts across all samples.
zeros <- rowSums(counts) == 0
counts <- counts[!zeros, ]
There were 3083 genes with no observed counts across the 6 samples.
Next remove lowly expressed genes. Use TMM-normalized log2 counts per million (cpm).
normalized_lib_sizes <- calcNormFactors(counts)
log_cpm <- cpm(counts, log = TRUE,
lib.size = colSums(counts) * normalized_lib_sizes)
There is a bimodal distribution of expression values. A good cutoff to remove lowly expressed genes is a log2 cpm < 0 in all samples (red line).
op <- par(mfcol = c(1, 3))
plot(density(log_cpm[, 1]), main = "Distribution of log2 cpm")
apply(log_cpm[, -1], 2, function(x) lines(density(x)))
## NULL
abline(v = 0, col = "red")
boxplot(log_cpm, las = 3, col = ifelse(anno$treatment == "RA", "blue", "red"),
names = paste0(anno$treatment, "\n", anno$replicate))
abline(h = 0, col = "red")
plot(rowMeans(log_cpm[, anno$treatment == "DMSO"]),
rowMeans(log_cpm[, anno$treatment == "RA"]),
main = "Mean log2 cpm", xlab = "DMSO", ylab = "RA")
abline(v = 0, h = 0, col = "red")
par(op)
Remove genes with mean log2 cpm < 0 in both the DMSO and RA-treated samples.
lowly_expressed <- apply(log_cpm, 1, function(x) {
mean(x[anno$treatment == "DMSO"]) < 0 & mean(x[anno$treatment == "RA"]) < 0
})
counts <- counts[!lowly_expressed, ]
Using this cutoff removes 5192 lowly expressed genes for a total of 13738 expressed genes, a reasonable number for a single cell type.
Now recalculate the log2 cpm with only the expressed genes.
normalized_lib_sizes <- calcNormFactors(counts)
log_cpm <- cpm(counts, log = TRUE,
lib.size = colSums(counts) * normalized_lib_sizes)
The lowly expressed genes have been removed, but still allowing for genes to be robustly expressed in only one of the two treatment conditions.
op <- par(mfcol = c(1, 3))
plot(density(log_cpm[, 1]), main = "Distribution of log2 cpm")
apply(log_cpm[, -1], 2, function(x) lines(density(x)))
## NULL
abline(v = 0, col = "red")
boxplot(log_cpm, las = 3, col = ifelse(anno$treatment == "RA", "blue", "red"),
names = paste0(anno$treatment, "\n", anno$replicate))
abline(h = 0, col = "red")
plot(rowMeans(log_cpm[, anno$treatment == "DMSO"]),
rowMeans(log_cpm[, anno$treatment == "RA"]),
main = "Mean log2 cpm", xlab = "DMSO", ylab = "RA")
abline(v = 0, h = 0, col = "red")
par(op)
Save the filtered counts.
write.table(counts, file = "../../data/mouse/counts-matrix-filtered.txt",
quote = FALSE, sep = "\t", col.names = NA)
Perform principal components analysis (PCA). See Zhang et al. 2009 for reference.
pca <- prcomp(t(log_cpm), retx = TRUE, center = TRUE, scale. = TRUE)
variances <- pca$sdev^2
explained <- variances / sum(variances)
barplot(explained * 100, main = "Variance explained by each PC", xlab = "PC",
ylab = "Percent variance explained")
pca_df <- cbind(anno, pca$x)
As expected, the first two PCs explain most of the variation in the data. The first PC separates the samples by treatment. The second PC separates the RA-treated samples. The hierarchical clustering (complete linkage of Euclidean distances) results agree with the PCA.
pca_plot <- ggplot(pca_df, aes(x = PC1, y = PC2, color = treatment,
shape = replicate)) +
geom_point() +
labs(x = sprintf("PC%d (%.2f%%)", 1, round(explained[1] * 100, 2)),
y = sprintf("PC%d (%.2f%%)", 2, round(explained[2] * 100, 2)),
title = "PCA")
hclust_result <- hclust(dist(t(log_cpm)))
dendro_plot <- ggdendrogram(hclust_result) +
labs(title = "Hierarchical clustering")
plot_grid(pca_plot, dendro_plot, labels = LETTERS[1:2])
sessionInfo()
## R version 3.2.0 (2015-04-16)
## Platform: x86_64-unknown-linux-gnu (64-bit)
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] cowplot_0.3.1 ggdendro_0.1-17 ggplot2_1.0.1 edgeR_3.10.2
## [5] limma_3.24.9 knitr_1.10.5
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.0 magrittr_1.5 MASS_7.3-40 munsell_0.4.2
## [5] colorspace_1.2-6 stringr_1.0.0 httr_0.6.1 plyr_1.8.3
## [9] tools_3.2.0 grid_3.2.0 gtable_0.1.2 htmltools_0.2.6
## [13] yaml_2.1.13 digest_0.6.8 reshape2_1.4.1 formatR_1.2
## [17] bitops_1.0-6 RCurl_1.95-4.6 evaluate_0.7 rmarkdown_0.6.1
## [21] labeling_0.3 stringi_0.4-1 scales_0.2.4 proto_0.3-10