Last updated: 2015-06-11
Code version: 44338a0a5f347c0c7a565729f2a86161eb3229d3
I performed PCA on the ERCC spike-ins to assess technical variability and a subset (~1000) of genes with a high coefficient of variation to assess the biological variability.
Conclusions:
type
or well
)The script gather-gene-counts.py compiles all the gene counts and extracts the relevant variables from the filename. The result is the file gene-counts.txt. Each row is a given fastq file (i.e. the sequences obtained for one from one lane of one flow cell), and each column is a variable (including all the genes).
library("data.table")
library("edgeR")
library("ggplot2")
library("gplots")
theme_set(theme_bw(base_size = 16))
counts <- fread("/mnt/gluster/data/internal_supp/singleCellSeq/gene-counts.txt")
Read 0.0% of 12402 rows
Read 80.6% of 12402 rows
Read 12402 rows and 20427 (of 20427) columns from 0.524 GB file in 00:00:30
Remove the combined samples. Only want to keep the per lane counts.
counts <- counts[!is.na(lane)]
dim(counts)
[1] 10656 20427
head(counts[, 1:11, with = FALSE])
individual batch well index lane flow_cell sickle rmdup
1: 19098 1 A08 GAGCTCCA L003 C6WURACXX quality-trimmed reads
2: 19098 1 B07 TCAGTGTG L006 C723YACXX not-quality-trimmed reads
3: 19098 1 A06 GTCTGTAT L003 C6WURACXX quality-trimmed reads
4: 19098 1 B09 AAAGGTCC L002 C6WYKACXX not-quality-trimmed reads
5: 19098 1 C06 TCGGGTGA L002 C6WYKACXX not-quality-trimmed reads
6: 19098 1 F06 TCGCTAGT L003 C6WURACXX quality-trimmed reads
ENSG00000186092 ENSG00000237683 ENSG00000235249
1: 0 0 0
2: 0 0 0
3: 0 0 0
4: 0 0 0
5: 0 0 0
6: 0 0 0
table(counts$lane, counts$flow_cell, counts$sickle, counts$rmdup)
, , = not-quality-trimmed, = molecules
C6WURACXX C6WYKACXX C723YACXX C72JMACXX
L001 96 18 96 96
L002 96 96 96 96
L003 96 96 96 96
L004 96 96 96 96
L005 96 96 18 96
L006 96 96 96 96
L007 96 96 96 0
L008 18 96 96 18
, , = quality-trimmed, = molecules
C6WURACXX C6WYKACXX C723YACXX C72JMACXX
L001 96 18 96 96
L002 96 96 96 96
L003 96 96 96 96
L004 96 96 96 96
L005 96 96 18 96
L006 96 96 96 96
L007 96 96 96 0
L008 18 96 96 18
, , = not-quality-trimmed, = reads
C6WURACXX C6WYKACXX C723YACXX C72JMACXX
L001 96 18 96 96
L002 96 96 96 96
L003 96 96 96 96
L004 96 96 96 96
L005 96 96 18 96
L006 96 96 96 96
L007 96 96 96 0
L008 18 96 96 18
, , = quality-trimmed, = reads
C6WURACXX C6WYKACXX C723YACXX C72JMACXX
L001 96 18 96 96
L002 96 96 96 96
L003 96 96 96 96
L004 96 96 96 96
L005 96 96 18 96
L006 96 96 96 96
L007 96 96 96 0
L008 18 96 96 18
calc_pca <- function(dt, num_pcs = 1:6) {
# Calculate pca.
# dt - data.table
# num_pcs: Number of PCs to return
require("edgeR")
stopifnot(class(dt)[1] == "data.table")
dt <- dt[, colSums(dt) > 0, with = FALSE]
dt <- t(dt)
cpm_mat <- cpm(dt, log = TRUE)
pca <- prcomp(t(cpm_mat), scale. = TRUE)
# return(as.list(pca$x))
return(pca$x[, num_pcs])
}
calc_pca_by_subset <- function(counts, anno, num_pcs = 1:6) {
# Calculate pca for 4 subsets:
# quality-trimmed reads
# quality-trimmed molecules
# not-quality-trimmed reads
# not-quality-trimmed molecules
# counts: data.table of counts with rows = samples and columns = genes
# anno: annotation file with rows = samples and columns = categorical variables
# num_pcs: Number of PCs to return
# Returns data.frame with PCs plus annotation
stopifnot(class(counts)[1] == "data.table",
class(anno)[1] == "data.table")
pca <- data.frame()
for (rmdup_status in c("molecules", "reads")) {
for (sickle_status in c("not-quality-trimmed", "quality-trimmed")) {
dat <- counts[anno$rmdup == rmdup_status & anno$sickle == sickle_status, ]
# print(dat[, .N])
result <- calc_pca(dat, num_pcs = num_pcs)
result_df <- data.frame(anno[rmdup == rmdup_status & sickle == sickle_status, ], result,
stringsAsFactors = FALSE)
pca <- rbind(pca, result_df)
}
}
return(pca)
}
correlate_pcs <- function(pca) {
# Plot heatmaps of adjusted r-squared values from simple linear regression between PCs
# and categorical variables.
# pca: data.frame of PCs plus annotation
factors <- colnames(pca)[c(1:6, 9)]
pcs <- colnames(pca)[10:ncol(pca)]
for (rmdup_status in c("molecules", "reads")) {
for (sickle_status in c("not-quality-trimmed", "quality-trimmed")) {
pca_corr <- matrix(NA, nrow = length(factors), ncol = length(pcs),
dimnames = list(factors, pcs))
for (fac in factors) {
for (pc in pcs) {
result_lm <- lm(pca[pca$rmdup == rmdup_status &
pca$sickle == sickle_status, pc] ~
as.factor(pca[pca$rmdup == rmdup_status &
pca$sickle == sickle_status, fac]))
result_r2 <- summary(result_lm)$adj.r.squared
# result_df <- data.frame(fac, pc, result_r2, stringsAsFactors = FALSE)
pca_corr[fac, pc] <- result_r2
}
}
heatmap.2(pca_corr, trace = "none", main = paste(rmdup_status, sickle_status))
}
}
}
Using only the ERCC control genes for PCA. Should isolate purely technical effects.
counts_ercc <- counts[, grep("ERCC", colnames(counts)), with = FALSE]
anno <- counts[, 1:8, with = FALSE]
anno$type <- ifelse(anno$well == "bulk", "bulk", "single")
pca_ercc <- calc_pca_by_subset(counts_ercc, anno)
ggplot(pca_ercc, aes(x = PC1, y = PC2, col = as.factor(type))) +
geom_point() +
facet_grid(sickle ~ rmdup)
ggplot(pca_ercc, aes(x = PC1, y = PC2, col = as.factor(individual), shape = as.factor(batch))) +
geom_point(size = 3, alpha = 0.5) +
facet_grid(sickle ~ rmdup)
ggplot(pca_ercc, aes(x = PC1, y = PC2, col = as.factor(batch))) +
geom_point(size = 3, alpha = 0.5) +
facet_grid(sickle ~ rmdup)
correlate_pcs(pca_ercc)
It is too computationally intensive to perform PCA on thousands of genes. Instead limiting to genes with a high coefficient of variation for the non-quality-trimmed reads.
cov <- counts[, lapply(.SD, function(x) sd(x) / (mean(x) + 1)),
by = .(rmdup, sickle),
.SDcols = grep("ENSG", colnames(counts))]
cov_reads_not_qual <- cov[rmdup == "reads" & sickle == "not-quality-trimmed",
grep("ENSG", colnames(cov)), with = FALSE]
cov_vec <- as.numeric(cov_reads_not_qual)
names(cov_vec) <- colnames(cov_reads_not_qual)
var_genes <- names(cov_vec)[cov_vec > 3]
counts_cov <- counts[, var_genes, with = FALSE]
pca_cov <- calc_pca_by_subset(counts_cov, anno)
ggplot(pca_cov, aes(x = PC1, y = PC2, col = as.factor(type))) +
geom_point() +
facet_grid(sickle ~ rmdup)
ggplot(pca_cov, aes(x = PC1, y = PC2, col = as.factor(individual))) +
geom_point() +
facet_grid(sickle ~ rmdup)
correlate_pcs(pca_cov)
sessionInfo()
R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] gplots_2.17.0 ggplot2_1.0.1 edgeR_3.10.2 limma_3.24.9
[5] data.table_1.9.4 knitr_1.10.5
loaded via a namespace (and not attached):
[1] Rcpp_0.11.6 magrittr_1.5 MASS_7.3-40
[4] munsell_0.4.2 colorspace_1.2-6 stringr_1.0.0
[7] plyr_1.8.2 caTools_1.17.1 tools_3.2.0
[10] grid_3.2.0 gtable_0.1.2 KernSmooth_2.23-14
[13] gtools_3.5.0 htmltools_0.2.6 yaml_2.1.13
[16] digest_0.6.8 reshape2_1.4.1 formatR_1.2
[19] bitops_1.0-6 evaluate_0.7 rmarkdown_0.6.1
[22] labeling_0.3 gdata_2.16.1 stringi_0.4-1
[25] scales_0.2.4 chron_2.3-45 proto_0.3-10