Last updated: 2015-08-18
Code version: 60a212f941df6f91b772339aebd682435732d8f4
This is the quality control at single cell level of 19239 LCLs. 96 cells were collected on a C1 prep. I only purchased enough Tn5 from Epicentre for the generation of transposomes with 24 different barcodes. As a results, the 96 cells were divided into four groups, each have 24 cells, for library preps (4 batches) and for sequencing (4 lanes). Additionally, 4 individual cells collected from a different C1 prep were sujected to library preps using either the Epicentre Tn5 (A9E1 and B2E2) or the home-made Tn5 (B4H1 and D2H2).
library("dplyr")
library("ggplot2")
theme_set(theme_bw(base_size = 16))
library("edgeR")
counts <- read.table("/mnt/gluster/data/internal_supp/singleCellSeq/lcl/gene-counts-lcl.txt",
header = TRUE, sep = "\t", stringsAsFactors = FALSE)
anno <- counts %>%
filter(rmdup == "molecules") %>%
select(individual:well) %>%
arrange(well)
anno <- mutate(anno, sample_id = paste(paste0("NA", individual),
batch, well, sep = "."))
anno <- mutate(anno, full_lane = well %in% c("A9E1", "B2E2", "B4H1", "D2H2"))
stopifnot(sum(anno$full_lane) == 4)
write.table(anno, "../data/annotation-lcl.txt", quote = FALSE, sep = "\t",
row.names = FALSE)
head(anno)
individual batch well sample_id full_lane
1 19239 1 A01 NA19239.1.A01 FALSE
2 19239 1 A02 NA19239.1.A02 FALSE
3 19239 1 A03 NA19239.1.A03 FALSE
4 19239 1 A04 NA19239.1.A04 FALSE
5 19239 1 A05 NA19239.1.A05 FALSE
6 19239 1 A06 NA19239.1.A06 FALSE
molecules <- counts %>%
arrange(well) %>%
filter(rmdup == "molecules") %>%
select(-(individual:rmdup)) %>%
t
dim(molecules)
[1] 20419 100
colnames(molecules) <- anno$sample_id
# Fix ERCC names. A data table can have dashes in column names, but data frame
# converts to period. Since the iPSC data was read with fread from the
# data.table package in sum-counts-per-sample.Rmd, this was not a problem
# before.
rownames(molecules) <- sub("\\.", "-", rownames(molecules))
write.table(molecules, "../data/molecules-lcl.txt", quote = FALSE, sep = "\t",
col.names = NA)
molecules[1:10, 1:5]
NA19239.1.A01 NA19239.1.A02 NA19239.1.A03 NA19239.1.A04
ENSG00000186092 0 0 0 0
ENSG00000237683 0 0 0 0
ENSG00000235249 0 0 0 0
ENSG00000185097 0 0 0 0
ENSG00000269831 0 0 0 0
ENSG00000269308 0 0 0 0
ENSG00000187634 0 0 0 0
ENSG00000268179 0 0 0 0
ENSG00000188976 0 0 0 0
ENSG00000187961 0 0 0 0
NA19239.1.A05
ENSG00000186092 0
ENSG00000237683 0
ENSG00000235249 0
ENSG00000185097 0
ENSG00000269831 0
ENSG00000269308 0
ENSG00000187634 0
ENSG00000268179 0
ENSG00000188976 0
ENSG00000187961 0
reads <- counts %>%
arrange(well) %>%
filter(rmdup == "reads") %>%
select(-(individual:rmdup)) %>%
t
dim(reads)
[1] 20419 100
colnames(reads) <- anno$sample_id
rownames(reads) <- sub("\\.", "-", rownames(reads))
write.table(reads, "../data/reads-lcl.txt", quote = FALSE, sep = "\t",
col.names = NA)
reads[1:10, 1:5]
NA19239.1.A01 NA19239.1.A02 NA19239.1.A03 NA19239.1.A04
ENSG00000186092 0 0 0 0
ENSG00000237683 0 0 0 0
ENSG00000235249 0 0 0 0
ENSG00000185097 0 0 0 0
ENSG00000269831 0 0 0 0
ENSG00000269308 0 0 0 0
ENSG00000187634 0 0 0 0
ENSG00000268179 0 0 0 0
ENSG00000188976 0 0 0 0
ENSG00000187961 0 0 0 0
NA19239.1.A05
ENSG00000186092 0
ENSG00000237683 0
ENSG00000235249 0
ENSG00000185097 0
ENSG00000269831 0
ENSG00000269308 0
ENSG00000187634 0
ENSG00000268179 0
ENSG00000188976 0
ENSG00000187961 0
Summary counts from featureCounts. Created with gather-summary-counts.py. These data were collected from the summary files of the full combined samples.
summary_counts <- read.table("../data/summary-counts-lcl.txt", header = TRUE,
stringsAsFactors = FALSE)
Currently this file only contains data from sickle-trimmed reads, so the code below simply ensures this and then removes the column.
summary_per_sample <- summary_counts %>%
filter(sickle == "quality-trimmed") %>%
select(-sickle) %>%
arrange(individual, batch, well, rmdup)
stopifnot(summary_per_sample$well[c(TRUE, FALSE)] == anno$well)
Input single cell observational quality control data.
# File needs to be created
qc <- read.csv("../data/qc-lcl.csv", header = TRUE,
stringsAsFactors = FALSE)
head(qc)
X ll_name cell.number.location cell.num
1 1 A01 1_C03 1
2 2 A02 1_C02 1
3 3 A03 0_C01 0
4 4 A04 1_C49 1
5 5 A05 1_C50 1
6 6 A06 1_C51 1
Looking at the unmapped ratio and ERCC ratio of each cell based on number of reads.
# reads per sample
summary_per_sample_reads <- summary_per_sample %>% filter(rmdup == "reads")
# create unmapped ratios
summary_per_sample_reads$unmapped.ratios <- summary_per_sample_reads[,9]/apply(summary_per_sample_reads[,5:13],1,sum)
# create total mapped reads
summary_per_sample_reads$total.mapped <- apply(summary_per_sample_reads[,5:8],1,sum)
# plot
ggplot(summary_per_sample_reads, aes(x = total.mapped, y = unmapped.ratios, col = as.factor(individual), shape = as.factor(batch), label = well)) + geom_point(size = 3, alpha = 0.5) + geom_text()
# plot the sum of reads and 'Assigned'
plot(apply(reads,2,sum),summary_per_sample_reads[,5])
# total ERCC reads
summary_per_sample_reads$total.ERCC <- apply(reads[grep("ERCC", rownames(reads)), ],2,sum)
plot(summary_per_sample_reads$total.ERCC)
# creat ERCC ratios
summary_per_sample_reads$ERCC.ratios <- apply(reads[grep("ERCC", rownames(reads)), ],2,sum)/apply(summary_per_sample_reads[,5:8],1,sum)
# plot
ggplot(summary_per_sample_reads, aes(x = total.mapped, y = ERCC.ratios, col = as.factor(individual), shape = as.factor(batch), label = well)) + geom_point(size = 3, alpha = 0.5) + geom_text(angle = 45)
summary_per_sample_reads$total_ERCC_molecule <- apply(molecules[grep("ERCC", rownames(molecules)), ],2,sum)
summary_per_sample_reads$total_gene_molecule <- apply(molecules[grep("ENSG", rownames(molecules)), ],2,sum)
ggplot(summary_per_sample_reads, aes(x = total.mapped, y = total_gene_molecule, col = as.factor(individual), shape = as.factor(batch), label = well)) + geom_point(size = 3, alpha = 0.5) + xlab("Total mapped reads") + ylab("Total gene molecule") + geom_smooth()
ggplot(summary_per_sample_reads, aes(x = total.mapped, y = total_ERCC_molecule, col = as.factor(individual), shape = as.factor(batch), label = well)) + geom_point(size = 3, alpha = 0.5) + xlab("Total mapped reads") + ylab("Total ERCC molecule") + geom_smooth()
Looking at only the multiplexed single cell libraries (96 samples total, 24 each in lanes 1-4):
# remove the 4 individual cells
summary_per_sample_reads_single <- summary_per_sample_reads[!anno$full_lane, ]
# many plots
ggplot(summary_per_sample_reads_single, aes(x = total.mapped, y = unmapped.ratios, col = as.factor(individual), shape = as.factor(batch))) + geom_point(size = 3, alpha = 0.5) + xlab("Number of mapped reads") + ylab("Umapped reads ratio") + theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5))
ggplot(summary_per_sample_reads_single, aes(x = total.mapped, y = ERCC.ratios, col = as.factor(individual), shape = as.factor(batch))) + geom_point(size = 3, alpha = 0.5) + xlab("Number of mapped reads") + ylab("Spike-in reads ratio") + theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5))
The cell number of each capture site is recorded after the cell corting step and before the cells got lysed.
#add cell number per well by merging qc file
summary_per_sample_reads_single_qc <- summary_per_sample_reads_single
summary_per_sample_reads_single_qc$cell_number <- qc$cell.num
ggplot(summary_per_sample_reads_single_qc, aes(x = total.mapped, y = unmapped.ratios, col = as.factor(individual), label = as.character(cell_number))) + geom_text(fontface=3)
ggplot(summary_per_sample_reads_single_qc, aes(x = total.mapped, y = ERCC.ratios, col = as.factor(individual), label = as.character(cell_number))) + geom_text(fontface=3)
Based on the observation that these is a dinstint cell population with more than 2 million reads, we used it as a cutoff.
#qc filter only keep cells with more than 2 million reads
summary_per_sample_reads_single_qc$qc_filter <- summary_per_sample_reads_single_qc$cell_number == 1 & summary_per_sample_reads_single_qc$total.mapped > 2 * 10^6
ggplot(summary_per_sample_reads_single_qc, aes(x = total.mapped, y = unmapped.ratios, col = qc_filter, label = as.character(cell_number))) + geom_text(fontface=3) + facet_grid(individual ~ batch) + theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5)) + xlab("Number of mapped reads") + ylab("Umapped reads ratio")
ggplot(summary_per_sample_reads_single_qc, aes(x = total.mapped, y = ERCC.ratios, col = qc_filter, label = as.character(cell_number))) + geom_text(fontface=3) + facet_grid(individual ~ batch) + theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5)) + xlab("Number of mapped reads") + ylab("Spike-in reads ratio")
Number of genes deteced in LCLS are smaller than iPSCs!!!
## remove genes with no read
expressed <- rowSums(reads) > 0
reads <- reads[expressed, ]
dim(reads)
[1] 13721 100
## number of expressed gene in each cell
reads_single <- reads[, anno$full_lane == "FALSE"]
reads_single_gene_number <- colSums(reads_single > 1)
summary_per_sample_reads_single_qc$reads_single_gene_number <- reads_single_gene_number
ggplot(summary_per_sample_reads_single_qc, aes(x = total.mapped, y = reads_single_gene_number, col = qc_filter, label = as.character(cell_number))) + geom_text(fontface=3) + facet_grid(individual ~ batch) + theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5)) + xlab("Number of mapped reads") + ylab("Number of genes")
ggplot(summary_per_sample_reads_single_qc, aes(x = reads_single_gene_number, y = ERCC.ratios, col = qc_filter, label = as.character(cell_number))) + geom_text(fontface=3) + facet_grid(individual ~ batch) + theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5)) + xlab("Number of genes") + ylab("Spike-in reads ratio")
What are the numbers of detected genes of the 4 heavily-sequenced individual cells?
## the 4 individual cells
reads_4 <- reads[, anno$full_lane == "TRUE"]
reads_4_gene_number <- colSums(reads_4 > 1)
reads_4_gene_number
NA19239.1.A9E1 NA19239.1.B2E2 NA19239.1.B4H1 NA19239.1.D2H2
5896 6911 6025 5783
## create a list of mitochondrial genes (13 protein-coding genes)
## MT-ATP6, MT-CYB, MT-ND1, MT-ND4, MT-ND4L, MT-ND5, MT-ND6, MT-CO2, MT-CO1, MT-ND2, MT-ATP8, MT-CO3, MT-ND3
mtgene <- c("ENSG00000198899", "ENSG00000198727", "ENSG00000198888", "ENSG00000198886", "ENSG00000212907", "ENSG00000198786", "ENSG00000198695", "ENSG00000198712", "ENSG00000198804", "ENSG00000198763","ENSG00000228253", "ENSG00000198938", "ENSG00000198840")
## reads of mt genes in single cells
mt_reads <- reads_single[mtgene,]
dim(mt_reads)
[1] 13 96
## mt ratio of single cell
mt_reads_total <- apply(mt_reads, 2, sum)
summary_per_sample_reads_single_qc$mt_reads_total <- mt_reads_total
summary_per_sample_reads_single_qc$mt_reads_ratio <- summary_per_sample_reads_single_qc$mt_reads_total/summary_per_sample_reads_single_qc$total.mapped
ggplot(summary_per_sample_reads_single_qc, aes(x = total.mapped, y = mt_reads_ratio, col = qc_filter, label = as.character(cell_number))) + geom_text(fontface=3) + facet_grid(individual ~ batch) + theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5)) + xlab("Number of mapped reads") + ylab("Mitochondrial ratio")
ggplot(summary_per_sample_reads_single_qc, aes(x = mt_reads_ratio, y = ERCC.ratios, col = qc_filter, label = as.character(cell_number))) + geom_text(fontface=3) + facet_grid(individual ~ batch) + theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5)) + xlab("Mitochondrial ratio") + ylab("Spike-in reads ratio")
ggplot(summary_per_sample_reads_single_qc, aes(x = reads_single_gene_number, y = mt_reads_ratio, col = qc_filter, label = as.character(cell_number))) + geom_text(fontface=3) + facet_grid(individual ~ batch) + theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5)) + xlab("Number of genes") + ylab("Mitochondrial ratio")
Calculate the counts per million for the singe cells.
reads_cells <- reads_single
reads_cells_cpm <- cpm(reads_cells)
Select the most variable genes.
reads_cells_cpm_log_var <- log(apply(reads_cells_cpm, 1, var))
hist(reads_cells_cpm_log_var)
sum(reads_cells_cpm_log_var > 8)
[1] 6473
Using the 6473 most variable genes, perform PCA.
reads_cells_cpm <- reads_cells_cpm[reads_cells_cpm_log_var > 8, ]
pca_reads_cells <- prcomp(t(reads_cells_cpm), retx = TRUE, scale. = TRUE,
center = TRUE)
plot(pca_reads_cells)
pca_reads_cells$perc_explained <- pca_reads_cells$sdev^2 / sum(pca_reads_cells$sdev^2) * 100
plot(pca_reads_cells$perc_explained)
The first PC accounts for 5% of the variance and the second PC 3%.
stopifnot(colnames(reads_cells) ==
paste(paste0("NA", summary_per_sample_reads_single_qc$individual),
summary_per_sample_reads_single_qc$batch,
summary_per_sample_reads_single_qc$well, sep = "."))
pca_reads_cells_anno <- cbind(summary_per_sample_reads_single_qc, pca_reads_cells$x)
ggplot(pca_reads_cells_anno, aes(x = PC1, y = PC2, col = as.factor(individual),
shape = as.factor(batch))) +
geom_point()
Using various simple filtering cutoffs.
pca_reads_cells_anno$cell_filter <- pca_reads_cells_anno$cell_number == 1
ggplot(pca_reads_cells_anno, aes(x = PC1, y = PC2, col = cell_filter)) +
geom_point()
pca_reads_cells_anno$total_cutoff <- pca_reads_cells_anno$total.mapped > 1.5 * 10^6
ggplot(pca_reads_cells_anno, aes(x = PC1, y = PC2, col = total_cutoff)) +
geom_point()
pca_reads_cells_anno$unmapped_cutoff <- pca_reads_cells_anno$unmapped.ratios < 0.4
ggplot(pca_reads_cells_anno, aes(x = PC1, y = PC2, col = unmapped_cutoff)) +
geom_point()
pca_reads_cells_anno$ERCC_cutoff <- pca_reads_cells_anno$ERCC.ratios < 0.05
ggplot(pca_reads_cells_anno, aes(x = PC1, y = PC2, col = ERCC_cutoff)) +
geom_point()
pca_reads_cells_anno$gene_filter <- pca_reads_cells_anno$reads_single_gene_number > 3000
ggplot(pca_reads_cells_anno, aes(x = PC1, y = PC2, col = gene_filter)) +
geom_point()
pca_reads_cells_anno$mt_filter <- pca_reads_cells_anno$mt_reads_ratio < 0.15
ggplot(pca_reads_cells_anno, aes(x = PC1, y = PC2, col = mt_filter)) +
geom_point()
The two cutoffs, total reads and cell number, largely overlap.
table(pca_reads_cells_anno$cell_filter, pca_reads_cells_anno$total_cutoff,
dnn = c("Num cells == 1", "Total reads > 1.5e6"))
Total reads > 1.5e6
Num cells == 1 FALSE TRUE
FALSE 8 4
TRUE 5 79
Add the third cutoff, 3000 genes
pca_reads_cells_anno$qc_filter <- pca_reads_cells_anno$total_cutoff &
pca_reads_cells_anno$gene_filter &
pca_reads_cells_anno$cell_filter
ggplot(pca_reads_cells_anno, aes(x = PC1, y = PC2, col = qc_filter,
label = as.character(cell_number))) +
geom_text(fontface=3)
Apply all the cutoffs
pca_reads_cells_anno$qc_filter_all <- pca_reads_cells_anno$cell_filter &
pca_reads_cells_anno$total_cutoff &
pca_reads_cells_anno$unmapped_cutoff &
pca_reads_cells_anno$ERCC_cutoff &
pca_reads_cells_anno$gene_filter &
pca_reads_cells_anno$mt_filter
ggplot(pca_reads_cells_anno, aes(x = PC1, y = PC2, col = qc_filter_all,
label = as.character(cell_number))) +
geom_text(fontface=3)
How many cells do we keep from each individual and batch using this filter?
table(pca_reads_cells_anno[pca_reads_cells_anno$qc_filter,
c("individual", "batch")])
batch
individual 1
19239 76
table(pca_reads_cells_anno[pca_reads_cells_anno$qc_filter_all,
c("individual", "batch")])
batch
individual 1
19239 75
Output list of single cells to keep.
stopifnot(nrow(pca_reads_cells_anno) == nrow(anno[anno$full_lane == "FALSE", ]))
quality_single_cells <- anno %>%
filter(full_lane == "FALSE") %>%
filter(pca_reads_cells_anno$qc_filter_all) %>%
select(sample_id)
stopifnot(!grepl("TRUE", quality_single_cells$sample_id))
write.table(quality_single_cells,
file = "../data/quality-single-cells-lcl.txt", quote = FALSE,
sep = "\t", row.names = FALSE, col.names = FALSE)
sessionInfo()
R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] edgeR_3.10.2 limma_3.24.9 ggplot2_1.0.1 dplyr_0.4.1 knitr_1.10.5
loaded via a namespace (and not attached):
[1] Rcpp_0.11.6 magrittr_1.5 MASS_7.3-40 munsell_0.4.2
[5] colorspace_1.2-6 stringr_1.0.0 plyr_1.8.2 tools_3.2.0
[9] parallel_3.2.0 grid_3.2.0 gtable_0.1.2 DBI_0.3.1
[13] htmltools_0.2.6 lazyeval_0.1.10 yaml_2.1.13 assertthat_0.1
[17] digest_0.6.8 reshape2_1.4.1 formatR_1.2 evaluate_0.7
[21] rmarkdown_0.6.1 labeling_0.3 stringi_0.4-1 scales_0.2.4
[25] proto_0.3-10