Quality control for LCLs

Input
Total mapped reads, unmapped ratios, and ERCC ratios
The total molecule number of ERCC and endogenous genes
Cell number
Number of genes detected
Reads mapped to mitochondrial genes
PCA
Cutoffs
Session information

Last updated: 2015-08-18

Code version: 60a212f941df6f91b772339aebd682435732d8f4

This is the quality control at single cell level of 19239 LCLs. 96 cells were collected on a C1 prep. I only purchased enough Tn5 from Epicentre for the generation of transposomes with 24 different barcodes. As a results, the 96 cells were divided into four groups, each have 24 cells, for library preps (4 batches) and for sequencing (4 lanes). Additionally, 4 individual cells collected from a different C1 prep were sujected to library preps using either the Epicentre Tn5 (A9E1 and B2E2) or the home-made Tn5 (B4H1 and D2H2).

Input

library("dplyr")
library("ggplot2")
theme_set(theme_bw(base_size = 16))
library("edgeR")

counts <- read.table("/mnt/gluster/data/internal_supp/singleCellSeq/lcl/gene-counts-lcl.txt",
                     header = TRUE, sep = "\t", stringsAsFactors = FALSE)

anno <- counts %>%
  filter(rmdup == "molecules") %>%
  select(individual:well) %>%
  arrange(well)
anno <- mutate(anno, sample_id = paste(paste0("NA", individual),
                                       batch, well, sep = "."))
anno <- mutate(anno, full_lane = well %in% c("A9E1", "B2E2", "B4H1", "D2H2"))
stopifnot(sum(anno$full_lane) == 4)
write.table(anno, "../data/annotation-lcl.txt", quote = FALSE, sep = "\t",
            row.names = FALSE)
head(anno)

  individual batch well     sample_id full_lane
1      19239     1  A01 NA19239.1.A01     FALSE
2      19239     1  A02 NA19239.1.A02     FALSE
3      19239     1  A03 NA19239.1.A03     FALSE
4      19239     1  A04 NA19239.1.A04     FALSE
5      19239     1  A05 NA19239.1.A05     FALSE
6      19239     1  A06 NA19239.1.A06     FALSE

molecules <- counts %>%
  arrange(well) %>%
  filter(rmdup == "molecules") %>%
  select(-(individual:rmdup)) %>%
  t
dim(molecules)

[1] 20419   100

colnames(molecules) <- anno$sample_id
# Fix ERCC names. A data table can have dashes in column names, but data frame
# converts to period. Since the iPSC data was read with fread from the
# data.table package in sum-counts-per-sample.Rmd, this was not a problem
# before.
rownames(molecules) <- sub("\\.", "-", rownames(molecules))
write.table(molecules, "../data/molecules-lcl.txt", quote = FALSE, sep = "\t",
            col.names = NA)
molecules[1:10, 1:5]

                NA19239.1.A01 NA19239.1.A02 NA19239.1.A03 NA19239.1.A04
ENSG00000186092             0             0             0             0
ENSG00000237683             0             0             0             0
ENSG00000235249             0             0             0             0
ENSG00000185097             0             0             0             0
ENSG00000269831             0             0             0             0
ENSG00000269308             0             0             0             0
ENSG00000187634             0             0             0             0
ENSG00000268179             0             0             0             0
ENSG00000188976             0             0             0             0
ENSG00000187961             0             0             0             0
                NA19239.1.A05
ENSG00000186092             0
ENSG00000237683             0
ENSG00000235249             0
ENSG00000185097             0
ENSG00000269831             0
ENSG00000269308             0
ENSG00000187634             0
ENSG00000268179             0
ENSG00000188976             0
ENSG00000187961             0

reads <- counts %>%
  arrange(well) %>%
  filter(rmdup == "reads") %>%
  select(-(individual:rmdup)) %>%
  t
dim(reads)

[1] 20419   100

colnames(reads) <- anno$sample_id
rownames(reads) <- sub("\\.", "-", rownames(reads))
write.table(reads, "../data/reads-lcl.txt", quote = FALSE, sep = "\t",
            col.names = NA)
reads[1:10, 1:5]

                NA19239.1.A01 NA19239.1.A02 NA19239.1.A03 NA19239.1.A04
ENSG00000186092             0             0             0             0
ENSG00000237683             0             0             0             0
ENSG00000235249             0             0             0             0
ENSG00000185097             0             0             0             0
ENSG00000269831             0             0             0             0
ENSG00000269308             0             0             0             0
ENSG00000187634             0             0             0             0
ENSG00000268179             0             0             0             0
ENSG00000188976             0             0             0             0
ENSG00000187961             0             0             0             0
                NA19239.1.A05
ENSG00000186092             0
ENSG00000237683             0
ENSG00000235249             0
ENSG00000185097             0
ENSG00000269831             0
ENSG00000269308             0
ENSG00000187634             0
ENSG00000268179             0
ENSG00000188976             0
ENSG00000187961             0

Summary counts from featureCounts. Created with gather-summary-counts.py. These data were collected from the summary files of the full combined samples.

summary_counts <- read.table("../data/summary-counts-lcl.txt", header = TRUE,
                             stringsAsFactors = FALSE)

Currently this file only contains data from sickle-trimmed reads, so the code below simply ensures this and then removes the column.

summary_per_sample <- summary_counts %>%
  filter(sickle == "quality-trimmed") %>%
  select(-sickle) %>%
  arrange(individual, batch, well, rmdup)
stopifnot(summary_per_sample$well[c(TRUE, FALSE)] == anno$well)

Input single cell observational quality control data.

# File needs to be created
qc <- read.csv("../data/qc-lcl.csv", header = TRUE,
                 stringsAsFactors = FALSE)
head(qc)

  X ll_name cell.number.location cell.num
1 1     A01                1_C03        1
2 2     A02                1_C02        1
3 3     A03                0_C01        0
4 4     A04                1_C49        1
5 5     A05                1_C50        1
6 6     A06                1_C51        1

Total mapped reads, unmapped ratios, and ERCC ratios

Looking at the unmapped ratio and ERCC ratio of each cell based on number of reads.

# reads per sample
summary_per_sample_reads <- summary_per_sample %>% filter(rmdup == "reads")

# create unmapped ratios
summary_per_sample_reads$unmapped.ratios <- summary_per_sample_reads[,9]/apply(summary_per_sample_reads[,5:13],1,sum)

# create total mapped reads
summary_per_sample_reads$total.mapped <- apply(summary_per_sample_reads[,5:8],1,sum)

# plot
ggplot(summary_per_sample_reads, aes(x = total.mapped, y = unmapped.ratios, col = as.factor(individual), shape = as.factor(batch), label = well)) + geom_point(size = 3, alpha = 0.5) + geom_text()

# plot the sum of reads and 'Assigned'
plot(apply(reads,2,sum),summary_per_sample_reads[,5])

# total ERCC reads 
summary_per_sample_reads$total.ERCC <- apply(reads[grep("ERCC", rownames(reads)), ],2,sum)

plot(summary_per_sample_reads$total.ERCC)

# creat ERCC ratios
summary_per_sample_reads$ERCC.ratios <- apply(reads[grep("ERCC", rownames(reads)), ],2,sum)/apply(summary_per_sample_reads[,5:8],1,sum)

# plot
ggplot(summary_per_sample_reads, aes(x = total.mapped, y = ERCC.ratios, col = as.factor(individual), shape = as.factor(batch), label = well)) + geom_point(size = 3, alpha = 0.5) + geom_text(angle = 45)

The total molecule number of ERCC and endogenous genes

summary_per_sample_reads$total_ERCC_molecule <- apply(molecules[grep("ERCC", rownames(molecules)), ],2,sum)

summary_per_sample_reads$total_gene_molecule <- apply(molecules[grep("ENSG", rownames(molecules)), ],2,sum)

ggplot(summary_per_sample_reads, aes(x = total.mapped, y = total_gene_molecule, col = as.factor(individual), shape = as.factor(batch), label = well)) + geom_point(size = 3, alpha = 0.5) + xlab("Total mapped reads") + ylab("Total gene molecule") + geom_smooth()

ggplot(summary_per_sample_reads, aes(x = total.mapped, y = total_ERCC_molecule, col = as.factor(individual), shape = as.factor(batch), label = well)) + geom_point(size = 3, alpha = 0.5) + xlab("Total mapped reads") + ylab("Total ERCC molecule") + geom_smooth()

Looking at only the multiplexed single cell libraries (96 samples total, 24 each in lanes 1-4):

# remove the 4 individual cells
summary_per_sample_reads_single <- summary_per_sample_reads[!anno$full_lane, ]

# many plots
ggplot(summary_per_sample_reads_single, aes(x = total.mapped, y = unmapped.ratios, col = as.factor(individual), shape = as.factor(batch))) + geom_point(size = 3, alpha = 0.5) + xlab("Number of mapped reads") + ylab("Umapped reads ratio") + theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5))

ggplot(summary_per_sample_reads_single, aes(x = total.mapped, y = ERCC.ratios, col = as.factor(individual), shape = as.factor(batch))) + geom_point(size = 3, alpha = 0.5) + xlab("Number of mapped reads") + ylab("Spike-in reads ratio") + theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5))

Cell number

The cell number of each capture site is recorded after the cell corting step and before the cells got lysed.

#add cell number per well by merging qc file
summary_per_sample_reads_single_qc <- summary_per_sample_reads_single
summary_per_sample_reads_single_qc$cell_number <- qc$cell.num

ggplot(summary_per_sample_reads_single_qc, aes(x = total.mapped, y = unmapped.ratios, col = as.factor(individual), label = as.character(cell_number))) + geom_text(fontface=3)

ggplot(summary_per_sample_reads_single_qc, aes(x = total.mapped, y = ERCC.ratios, col = as.factor(individual), label = as.character(cell_number))) + geom_text(fontface=3)

Based on the observation that these is a dinstint cell population with more than 2 million reads, we used it as a cutoff.

#qc filter only keep cells with more than 2 million reads
summary_per_sample_reads_single_qc$qc_filter <- summary_per_sample_reads_single_qc$cell_number == 1 & summary_per_sample_reads_single_qc$total.mapped > 2 * 10^6

ggplot(summary_per_sample_reads_single_qc, aes(x = total.mapped, y = unmapped.ratios, col = qc_filter, label = as.character(cell_number))) + geom_text(fontface=3) + facet_grid(individual ~ batch) + theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5)) + xlab("Number of mapped reads") + ylab("Umapped reads ratio")

ggplot(summary_per_sample_reads_single_qc, aes(x = total.mapped, y = ERCC.ratios, col = qc_filter, label = as.character(cell_number))) + geom_text(fontface=3) + facet_grid(individual ~ batch) + theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5)) + xlab("Number of mapped reads") + ylab("Spike-in reads ratio")

Number of genes detected

Number of genes deteced in LCLS are smaller than iPSCs!!!

## remove genes with no read
expressed <- rowSums(reads) > 0
reads <- reads[expressed, ]
dim(reads)

[1] 13721   100

## number of expressed gene in each cell
reads_single <- reads[, anno$full_lane == "FALSE"]
reads_single_gene_number <- colSums(reads_single > 1)
summary_per_sample_reads_single_qc$reads_single_gene_number <- reads_single_gene_number

ggplot(summary_per_sample_reads_single_qc, aes(x = total.mapped, y = reads_single_gene_number, col = qc_filter, label = as.character(cell_number))) + geom_text(fontface=3) + facet_grid(individual ~ batch) + theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5)) + xlab("Number of mapped reads") + ylab("Number of genes")

ggplot(summary_per_sample_reads_single_qc, aes(x = reads_single_gene_number, y = ERCC.ratios, col = qc_filter, label = as.character(cell_number))) + geom_text(fontface=3) + facet_grid(individual ~ batch) + theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5)) + xlab("Number of genes") + ylab("Spike-in reads ratio")

What are the numbers of detected genes of the 4 heavily-sequenced individual cells?

## the 4 individual cells
reads_4 <- reads[, anno$full_lane == "TRUE"]
reads_4_gene_number <- colSums(reads_4 > 1)
reads_4_gene_number

NA19239.1.A9E1 NA19239.1.B2E2 NA19239.1.B4H1 NA19239.1.D2H2 
          5896           6911           6025           5783

Reads mapped to mitochondrial genes

## create a list of mitochondrial genes (13 protein-coding genes)
## MT-ATP6, MT-CYB, MT-ND1, MT-ND4, MT-ND4L, MT-ND5, MT-ND6, MT-CO2, MT-CO1, MT-ND2, MT-ATP8, MT-CO3, MT-ND3
mtgene <- c("ENSG00000198899", "ENSG00000198727", "ENSG00000198888", "ENSG00000198886", "ENSG00000212907", "ENSG00000198786", "ENSG00000198695", "ENSG00000198712", "ENSG00000198804", "ENSG00000198763","ENSG00000228253", "ENSG00000198938", "ENSG00000198840")

## reads of mt genes in single cells
mt_reads <- reads_single[mtgene,]
dim(mt_reads)

[1] 13 96

## mt ratio of single cell
mt_reads_total <- apply(mt_reads, 2, sum)
summary_per_sample_reads_single_qc$mt_reads_total <- mt_reads_total
summary_per_sample_reads_single_qc$mt_reads_ratio <- summary_per_sample_reads_single_qc$mt_reads_total/summary_per_sample_reads_single_qc$total.mapped

ggplot(summary_per_sample_reads_single_qc, aes(x = total.mapped, y = mt_reads_ratio, col = qc_filter, label = as.character(cell_number))) + geom_text(fontface=3) + facet_grid(individual ~ batch) + theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5)) + xlab("Number of mapped reads") + ylab("Mitochondrial ratio")

ggplot(summary_per_sample_reads_single_qc, aes(x = mt_reads_ratio, y = ERCC.ratios, col = qc_filter, label = as.character(cell_number))) + geom_text(fontface=3) + facet_grid(individual ~ batch) + theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5)) + xlab("Mitochondrial ratio") + ylab("Spike-in reads ratio")

ggplot(summary_per_sample_reads_single_qc, aes(x = reads_single_gene_number, y = mt_reads_ratio, col = qc_filter, label = as.character(cell_number))) + geom_text(fontface=3) + facet_grid(individual ~ batch) + theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5)) + xlab("Number of genes") + ylab("Mitochondrial ratio")

PCA

Calculate the counts per million for the singe cells.

reads_cells <- reads_single
reads_cells_cpm <- cpm(reads_cells)

Select the most variable genes.

reads_cells_cpm_log_var <- log(apply(reads_cells_cpm, 1, var))
hist(reads_cells_cpm_log_var)

sum(reads_cells_cpm_log_var > 8)

[1] 6473

Using the 6473 most variable genes, perform PCA.

reads_cells_cpm <- reads_cells_cpm[reads_cells_cpm_log_var > 8, ]
pca_reads_cells <- prcomp(t(reads_cells_cpm), retx = TRUE, scale. = TRUE,
                          center = TRUE)

plot(pca_reads_cells)

pca_reads_cells$perc_explained <- pca_reads_cells$sdev^2 / sum(pca_reads_cells$sdev^2) * 100
plot(pca_reads_cells$perc_explained)

The first PC accounts for 5% of the variance and the second PC 3%.

stopifnot(colnames(reads_cells) == 
            paste(paste0("NA", summary_per_sample_reads_single_qc$individual),
                   summary_per_sample_reads_single_qc$batch,
                   summary_per_sample_reads_single_qc$well, sep = "."))
pca_reads_cells_anno <- cbind(summary_per_sample_reads_single_qc, pca_reads_cells$x)

ggplot(pca_reads_cells_anno, aes(x = PC1, y = PC2, col = as.factor(individual),
                                 shape = as.factor(batch))) +
  geom_point()

Cutoffs

Using various simple filtering cutoffs.

pca_reads_cells_anno$cell_filter <- pca_reads_cells_anno$cell_number == 1
ggplot(pca_reads_cells_anno, aes(x = PC1, y = PC2, col = cell_filter)) +
  geom_point()

pca_reads_cells_anno$total_cutoff <- pca_reads_cells_anno$total.mapped > 1.5 * 10^6
ggplot(pca_reads_cells_anno, aes(x = PC1, y = PC2, col = total_cutoff)) +
  geom_point()

pca_reads_cells_anno$unmapped_cutoff <- pca_reads_cells_anno$unmapped.ratios < 0.4
ggplot(pca_reads_cells_anno, aes(x = PC1, y = PC2, col = unmapped_cutoff)) +
  geom_point()

pca_reads_cells_anno$ERCC_cutoff <- pca_reads_cells_anno$ERCC.ratios < 0.05
ggplot(pca_reads_cells_anno, aes(x = PC1, y = PC2, col = ERCC_cutoff)) +
  geom_point()

pca_reads_cells_anno$gene_filter <- pca_reads_cells_anno$reads_single_gene_number  > 3000
ggplot(pca_reads_cells_anno, aes(x = PC1, y = PC2, col = gene_filter)) +
  geom_point()

pca_reads_cells_anno$mt_filter <- pca_reads_cells_anno$mt_reads_ratio < 0.15
ggplot(pca_reads_cells_anno, aes(x = PC1, y = PC2, col = mt_filter)) +
  geom_point()

The two cutoffs, total reads and cell number, largely overlap.

table(pca_reads_cells_anno$cell_filter, pca_reads_cells_anno$total_cutoff,
      dnn = c("Num cells == 1", "Total reads > 1.5e6"))

              Total reads > 1.5e6
Num cells == 1 FALSE TRUE
         FALSE     8    4
         TRUE      5   79

Add the third cutoff, 3000 genes

pca_reads_cells_anno$qc_filter <- pca_reads_cells_anno$total_cutoff &
                                  pca_reads_cells_anno$gene_filter &
                                  pca_reads_cells_anno$cell_filter
  
ggplot(pca_reads_cells_anno, aes(x = PC1, y = PC2, col = qc_filter,
                                 label = as.character(cell_number))) +
  geom_text(fontface=3)

Apply all the cutoffs

pca_reads_cells_anno$qc_filter_all <- pca_reads_cells_anno$cell_filter &
                                      pca_reads_cells_anno$total_cutoff &
                                      pca_reads_cells_anno$unmapped_cutoff &
                                      pca_reads_cells_anno$ERCC_cutoff &
                                      pca_reads_cells_anno$gene_filter &
                                      pca_reads_cells_anno$mt_filter
  
ggplot(pca_reads_cells_anno, aes(x = PC1, y = PC2, col = qc_filter_all,
                                 label = as.character(cell_number))) +
  geom_text(fontface=3)

How many cells do we keep from each individual and batch using this filter?

table(pca_reads_cells_anno[pca_reads_cells_anno$qc_filter,
                           c("individual", "batch")])

          batch
individual  1
     19239 76

table(pca_reads_cells_anno[pca_reads_cells_anno$qc_filter_all,
                           c("individual", "batch")])

          batch
individual  1
     19239 75

Output list of single cells to keep.

stopifnot(nrow(pca_reads_cells_anno) == nrow(anno[anno$full_lane == "FALSE", ]))
quality_single_cells <- anno %>%
  filter(full_lane == "FALSE") %>%
  filter(pca_reads_cells_anno$qc_filter_all) %>%
  select(sample_id)
stopifnot(!grepl("TRUE", quality_single_cells$sample_id))
write.table(quality_single_cells,
            file = "../data/quality-single-cells-lcl.txt", quote = FALSE,
            sep = "\t", row.names = FALSE, col.names = FALSE)

Session information

sessionInfo()

R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] edgeR_3.10.2  limma_3.24.9  ggplot2_1.0.1 dplyr_0.4.1   knitr_1.10.5 

loaded via a namespace (and not attached):
 [1] Rcpp_0.11.6      magrittr_1.5     MASS_7.3-40      munsell_0.4.2   
 [5] colorspace_1.2-6 stringr_1.0.0    plyr_1.8.2       tools_3.2.0     
 [9] parallel_3.2.0   grid_3.2.0       gtable_0.1.2     DBI_0.3.1       
[13] htmltools_0.2.6  lazyeval_0.1.10  yaml_2.1.13      assertthat_0.1  
[17] digest_0.6.8     reshape2_1.4.1   formatR_1.2      evaluate_0.7    
[21] rmarkdown_0.6.1  labeling_0.3     stringi_0.4-1    scales_0.2.4    
[25] proto_0.3-10