Exploring the data

Setup
Filter genes
PCA
Session information

Last updated: 2015-10-22

Code version: 3b9f77ca4ecbf3c03d571d6c62c92a181dc81982

library("knitr")
opts_chunk$set(fig.align = "center", fig.width = 10)

Setup

Load packages.

library("edgeR")

## Loading required package: limma

library("ggplot2")
library("ggdendro")
library("cowplot")

## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:ggplot2':
## 
##     ggsave

theme_set(theme_bw(base_size = 12))

Import the counts matrix.

counts <- read.table("../../data/mouse/counts-matrix.txt", header = TRUE)
dim(counts)

## [1] 22013     6

colnames(counts) <- sub("mESc.2i.RNA.", "", colnames(counts))
head(counts)

##                    DMSO.REP1 DMSO.REP2 DMSO.REP3 RA.REP1 RA.REP2 RA.REP3
## ENSMUSG00000051951         0         3         2       0       0       4
## ENSMUSG00000025900         1         0         1       0       1       0
## ENSMUSG00000025902        14         7        17      14       3       9
## ENSMUSG00000033845      3312      2890      2922    2981     772    1627
## ENSMUSG00000025903      1837      1789      1459    1348     409     881
## ENSMUSG00000104217         0         0         0       0       0       0

Create an annotation matrix.

anno <- strsplit(colnames(counts), split = "\\.")
anno <- data.frame(treatment = sapply(anno, function(x) x[1]),
                   replicate = sapply(anno, function(x) x[2]))
write.table(anno, file = "../../data/mouse/annotation.txt", quote = FALSE,
            sep = "\t", row.names = FALSE)
anno

##   treatment replicate
## 1      DMSO      REP1
## 2      DMSO      REP2
## 3      DMSO      REP3
## 4        RA      REP1
## 5        RA      REP2
## 6        RA      REP3

Filter genes

Remove genes with zero counts across all samples.

zeros <- rowSums(counts) == 0
counts <- counts[!zeros, ]

There were 3083 genes with no observed counts across the 6 samples.

Next remove lowly expressed genes. Use TMM-normalized log₂ counts per million (cpm).

normalized_lib_sizes <- calcNormFactors(counts)
log_cpm <- cpm(counts, log = TRUE,
               lib.size = colSums(counts) * normalized_lib_sizes)

There is a bimodal distribution of expression values. A good cutoff to remove lowly expressed genes is a log₂ cpm < 0 in all samples (red line).

op <- par(mfcol = c(1, 3))
plot(density(log_cpm[, 1]), main = "Distribution of log2 cpm")
apply(log_cpm[, -1], 2, function(x) lines(density(x)))

## NULL

abline(v = 0, col = "red")
boxplot(log_cpm, las = 3, col = ifelse(anno$treatment == "RA", "blue", "red"),
        names = paste0(anno$treatment, "\n", anno$replicate))
abline(h = 0, col = "red")
plot(rowMeans(log_cpm[, anno$treatment == "DMSO"]),
     rowMeans(log_cpm[, anno$treatment == "RA"]),
     main = "Mean log2 cpm", xlab = "DMSO", ylab = "RA")
abline(v = 0, h = 0, col = "red")

par(op)

Remove genes with mean log₂ cpm < 0 in both the DMSO and RA-treated samples.

lowly_expressed <- apply(log_cpm, 1, function(x) {
  mean(x[anno$treatment == "DMSO"]) < 0 & mean(x[anno$treatment == "RA"]) < 0
})
counts <- counts[!lowly_expressed, ]

Using this cutoff removes 5192 lowly expressed genes for a total of 13738 expressed genes, a reasonable number for a single cell type.

Now recalculate the log₂ cpm with only the expressed genes.

normalized_lib_sizes <- calcNormFactors(counts)
log_cpm <- cpm(counts, log = TRUE,
                lib.size = colSums(counts) * normalized_lib_sizes)

The lowly expressed genes have been removed, but still allowing for genes to be robustly expressed in only one of the two treatment conditions.

op <- par(mfcol = c(1, 3))
plot(density(log_cpm[, 1]), main = "Distribution of log2 cpm")
apply(log_cpm[, -1], 2, function(x) lines(density(x)))

## NULL

abline(v = 0, col = "red")
boxplot(log_cpm, las = 3, col = ifelse(anno$treatment == "RA", "blue", "red"),
        names = paste0(anno$treatment, "\n", anno$replicate))
abline(h = 0, col = "red")
plot(rowMeans(log_cpm[, anno$treatment == "DMSO"]),
     rowMeans(log_cpm[, anno$treatment == "RA"]),
     main = "Mean log2 cpm", xlab = "DMSO", ylab = "RA")
abline(v = 0, h = 0, col = "red")

par(op)

Save the filtered counts.

write.table(counts, file = "../../data/mouse/counts-matrix-filtered.txt",
            quote = FALSE, sep = "\t", col.names = NA)

PCA

Perform principal components analysis (PCA). See Zhang et al. 2009 for reference.

pca <- prcomp(t(log_cpm), retx = TRUE, center = TRUE, scale. = TRUE)
variances <- pca$sdev^2
explained <- variances / sum(variances)
barplot(explained * 100, main = "Variance explained by each PC", xlab = "PC",
     ylab = "Percent variance explained")

pca_df <- cbind(anno, pca$x)

As expected, the first two PCs explain most of the variation in the data. The first PC separates the samples by treatment. The second PC separates the RA-treated samples. The hierarchical clustering (complete linkage of Euclidean distances) results agree with the PCA.

pca_plot <- ggplot(pca_df, aes(x = PC1, y = PC2, color = treatment,
                               shape = replicate)) +
  geom_point() +
  labs(x = sprintf("PC%d (%.2f%%)", 1, round(explained[1] * 100, 2)),
       y = sprintf("PC%d (%.2f%%)", 2, round(explained[2] * 100, 2)),
       title = "PCA")
hclust_result <- hclust(dist(t(log_cpm)))
dendro_plot <- ggdendrogram(hclust_result) +
  labs(title = "Hierarchical clustering")
plot_grid(pca_plot, dendro_plot, labels = LETTERS[1:2])

Session information

sessionInfo()

## R version 3.2.0 (2015-04-16)
## Platform: x86_64-unknown-linux-gnu (64-bit)
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] cowplot_0.3.1   ggdendro_0.1-17 ggplot2_1.0.1   edgeR_3.10.2   
## [5] limma_3.24.9    knitr_1.10.5   
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.0      magrittr_1.5     MASS_7.3-40      munsell_0.4.2   
##  [5] colorspace_1.2-6 stringr_1.0.0    httr_0.6.1       plyr_1.8.3      
##  [9] tools_3.2.0      grid_3.2.0       gtable_0.1.2     htmltools_0.2.6 
## [13] yaml_2.1.13      digest_0.6.8     reshape2_1.4.1   formatR_1.2     
## [17] bitops_1.0-6     RCurl_1.95-4.6   evaluate_0.7     rmarkdown_0.6.1 
## [21] labeling_0.3     stringi_0.4-1    scales_0.2.4     proto_0.3-10

Exploring the data

John Blischak

Setup

Filter genes

PCA

Session information