Last updated: 2015-09-11

Code version: 37f57b1c3b81f30e213726dfcc3da532197ac224

We standardize the molecule counts to account for differences in sequencing depth. This is necessary because the sequencing depth affects the total molecule counts.

However, according to our study design. Each cell within the same C1 patch, independent of total number of transcripts, should have equal amount of ERCC molecule. As a result, cpm (no log transforamtion) will introduce bias to ERCC, where cells with more molecules will end up with fewer ERCC molecules. Therefore, cpm should not be performed on ERCC genes, but only on endogenous genes.

Input

Input annotation.

anno <- read.table("../data/annotation.txt", header = TRUE,
                   stringsAsFactors = FALSE)
head(anno)
  individual batch well     sample_id
1      19098     1  A01 NA19098.1.A01
2      19098     1  A02 NA19098.1.A02
3      19098     1  A03 NA19098.1.A03
4      19098     1  A04 NA19098.1.A04
5      19098     1  A05 NA19098.1.A05
6      19098     1  A06 NA19098.1.A06

Input molecule counts.

molecules <- read.table("../data/molecules.txt", header = TRUE,
                    stringsAsFactors = FALSE)

Input read counts.

reads <- read.table("../data/reads.txt", header = TRUE,
                    stringsAsFactors = FALSE)

Input list of quality single cells.

quality_single_cells <- scan("../data/quality-single-cells.txt",
                             what = "character")

Input ERCC concentration

ercc <- read.table("../data/ercc-info.txt", header = TRUE, sep = "\t",
                   stringsAsFactors = FALSE)
colnames(ercc) <- c("num", "id", "subgroup", "conc_mix1", "conc_mix2",
                    "expected_fc", "log2_mix1_mix2")
head(ercc)
  num         id subgroup conc_mix1  conc_mix2 expected_fc log2_mix1_mix2
1   1 ERCC-00130        A 30000.000 7500.00000           4              2
2   2 ERCC-00004        A  7500.000 1875.00000           4              2
3   3 ERCC-00136        A  1875.000  468.75000           4              2
4   4 ERCC-00108        A   937.500  234.37500           4              2
5   5 ERCC-00116        A   468.750  117.18750           4              2
6   6 ERCC-00092        A   234.375   58.59375           4              2
stopifnot(nrow(ercc) == 92)

ercc <- ercc[order(ercc$id), ]

Prepare single cell molecule data

Keep only the single cells that passed the QC filters and the bulk samples.

molecules <- molecules[, grepl("bulk", colnames(molecules)) |
                         colnames(molecules) %in% quality_single_cells]
anno <- anno[anno$well == "bulk" | anno$sample_id %in% quality_single_cells, ]
stopifnot(ncol(molecules) == nrow(anno),
          colnames(molecules) == anno$sample_id)

reads <- reads[, grepl("bulk", colnames(reads)) |
                         colnames(reads) %in% quality_single_cells]
stopifnot(ncol(reads) == nrow(anno),
          colnames(reads) == anno$sample_id)

Also remove batch 2 of individual 19098.

molecules <- molecules[, !(anno$individual == 19098 & anno$batch == 2)]
reads <- reads[, !(anno$individual == 19098 & anno$batch == 2)]
anno <- anno[!(anno$individual == 19098 & anno$batch == 2), ]
stopifnot(ncol(molecules) == nrow(anno))

Remove genes with zero read counts in the single cells or bulk samples.

expressed <- rowSums(molecules[, anno$well == "bulk"]) > 0 &
             rowSums(molecules[, anno$well != "bulk"]) > 0
molecules <- molecules[expressed, ]
dim(molecules)
[1] 17084   571
expressed <- rowSums(reads[, anno$well == "bulk"]) > 0 &
             rowSums(reads[, anno$well != "bulk"]) > 0
reads <- reads[expressed, ]
dim(reads)
[1] 17093   571

Split the bulk and single samples.

molecules_bulk <- molecules[, anno$well == "bulk"]
molecules_single <- molecules[, anno$well != "bulk"]
reads_bulk <- reads[, anno$well == "bulk"]
reads_single <- reads[, anno$well != "bulk"]
anno_single <- anno[anno$well != "bulk",]

How many genes have greater than or equal to 1,024 molecules in at least one of the cells?

overexpressed_genes <- rownames(molecules_single)[apply(molecules_single, 1,
                                                        function(x) any(x >= 1024))]

15 have greater than or equal to 1,024 molecules. Remove them.

molecules_single <- molecules_single[!(rownames(molecules_single) %in% overexpressed_genes), ]
reads_single <- reads_single[!(rownames(reads_single) %in% overexpressed_genes), ]

Correct for collision probability. See Grun et al. 2014 for details.

molecules_single_collision <- -1024 * log(1 - molecules_single / 1024)

Standardization without log transformation (in order to calculate CV for gene expression noise)

molecules_single_cpm <- cpm(molecules_single_collision, log = FALSE)
reads_single_cpm <- cpm(reads_single, log = FALSE)

total ERCC vs total molecule and total reads (counts)

Number of total molecule correlates with total read number.

anno_single$total_reads <- apply(reads_single, 2, sum)
anno_single$total_molecules <- apply(molecules_single_collision, 2, sum)

ggplot(anno_single, aes(x= total_reads, y= total_molecules, col=as.factor(individual), shape=as.factor(batch))) + geom_point() + labs(x = "total reads", y = "total molecule")

Look at ERCC and endogenous gene separately. * Total ERCC molecule counts is independent on total reads, total moleceuls, or total gene molecule counts * Total ERCC molecule counts show (surprisngly) individual effect, where 19098 have more and 19239 have fewer.

anno_single$total_ERCC_molecule <- apply(molecules_single_collision[grep("ERCC", rownames(molecules_single)), ],2,sum)

anno_single$total_gene_molecule <- apply(molecules_single_collision[grep("ENSG", rownames(molecules_single)), ],2,sum)

ggplot(anno_single, aes(x= total_reads, y= total_ERCC_molecule, col=as.factor(individual), shape=as.factor(batch))) + geom_point() + labs(x = "total reads", y = "total ERCC molecule")

ggplot(anno_single, aes(x= total_molecules, y= total_ERCC_molecule, col=as.factor(individual), shape=as.factor(batch))) + geom_point() + labs(x = "total molecules", y = "total ERCC molecule")

ggplot(anno_single, aes(x= total_gene_molecule, y= total_ERCC_molecule, col=as.factor(individual), shape=as.factor(batch))) + geom_point() + labs(x = "total endogenous gene molecules", y = "total ERCC molecule")

PC1 of ERCC expression is explained by total ERCC molecule counts, but not total reads or total molecules

## without cpm standardization
molecules_single_ERCC <- molecules_single_collision[grep("ERCC", rownames(molecules_single_collision)), ]

pca_ERCC <- run_pca(molecules_single_ERCC)
pca_ERCC_plot <- plot_pca(pca_ERCC$PCs, explained = pca_ERCC$explained,
         metadata = anno_single, color = "individual",
         shape = "batch", factors = c("individual", "batch"))

pca_ERCC_plot + labs(title="ERCC spike-in (no-cpm)")

## PC1 of ERCC expression is explained by total ERCC molecule number
pca_ERCC <- prcomp(t(molecules_single_ERCC), retx = TRUE, scale. = TRUE, center = TRUE)
pca_ERCC_anno <- cbind(anno_single, pca_ERCC$x)

ggplot(pca_ERCC_anno, aes(x = PC1, y = total_ERCC_molecule, col = as.factor(individual), shape = as.factor(batch))) +geom_point() + labs(title="ERCC spike-in (no-cpm)")

ggplot(pca_ERCC_anno, aes(x = PC1, y = total_molecules, col = as.factor(individual), shape = as.factor(batch))) +geom_point() + labs(title="ERCC spike-in (no-cpm)")

ggplot(pca_ERCC_anno, aes(x = PC1, y = total_reads, col = as.factor(individual), shape = as.factor(batch))) +geom_point() + labs(title="ERCC spike-in (no-cpm)")

cpm transformation based on total molecule counts (WRONG!)

After cpm, total ERCC molecule number is no longer independent on total molecule counts.

anno_single$total_molecules_cpm <- apply(molecules_single_cpm, 2, sum)
anno_single$total_ERCC_molecule_cpm <- apply(molecules_single_cpm[grep("ERCC", rownames(molecules_single)), ],2,sum)

anno_single$total_gene_molecule_cpm <- apply(molecules_single_cpm[grep("ENSG", rownames(molecules_single)), ],2,sum)

## before and after cpm
ggplot(anno_single, aes(x= total_ERCC_molecule_cpm, y= total_ERCC_molecule, col=as.factor(individual), shape=as.factor(batch))) + geom_point() + labs(x = "total ERCC molecule cpm (no log)", y = "total ERCC molecule", title ="wrong cpm")

ggplot(anno_single, aes(x= total_molecules, y= total_ERCC_molecule_cpm, col=as.factor(individual), shape=as.factor(batch))) + geom_point() + labs(x = "total molecule counts", y = "total ERCC molecule cpm", title ="wrong cpm")

cpm transformation based on ERCC (CORRECT!)

cpm of ERCC should be perfomed seperated.

## read counts of ERCC
reads_single_ERCC <- reads_single[grep("ERCC", rownames(reads_single)), ]

## sum of ERCC readds per cell
anno_single$total_ERCC_reads <- apply (reads_single_ERCC, 2, sum)

ggplot(anno_single, aes(x= total_ERCC_reads, y= total_ERCC_molecule, col=as.factor(individual), shape=as.factor(batch))) + geom_point() + labs(x = "total ERCC molecule reads (counts)", y = "total ERCC molecule (counts)")

## cpm of ERCC (seperate from endogenous genes)
molecules_single_ERCC_cpm <- cpm(molecules_single_ERCC, log = TRUE)

## pca
pca_ERCC_cpm <- run_pca(molecules_single_ERCC_cpm)
pca_ERCC_plot_cpm <- plot_pca(pca_ERCC_cpm$PCs, explained = pca_ERCC_cpm$explained,
         metadata = anno_single, color = "individual",
         shape = "batch", factors = c("individual", "batch"))

pca_ERCC_plot_cpm + labs(title="ERCC spike-in (cpm ERCC)")

# compared the two different kinds of cpm
anno_single$total_ERCC_cpm <- apply(molecules_single_ERCC_cpm, 2, sum)

ggplot(anno_single, aes(x= total_ERCC_molecule_cpm, y= total_ERCC_cpm, col=as.factor(individual), shape=as.factor(batch))) + geom_point() + labs(x = "total ERCC molecule cpm all molecules (no log)", y = "total ERCC cpm (log)", title ="correct cpm")

ggplot(anno_single, aes(x= total_molecules, y= total_ERCC_cpm, col=as.factor(individual), shape=as.factor(batch))) + geom_point() + labs(x = "total molecule counts", y = "total ERCC cpm (log)", title ="correct cpm")

Linear Shift Normalization

Try the linear shift normalization after cpm ERCC separately and then use the method linear shift normalization. There is error in the original r code. It shoud be Y = mX + b -> X = (Y - b) / m instead of Y = mX + b -> X = (Y + b) / m. After the shift, take exponential to get non log transformed value.

## prepare ERCC concentration information 
ercc_single <- ercc[ercc$id %in% rownames(molecules_single), ]
stopifnot(rownames(molecules_single_ERCC_cpm) == ercc_single$id)

## cpm log transformed endogenous genes
molecules_single_genes <- molecules_single_collision[grep("ENSG", rownames(molecules_single)), ]
molecules_single_genes_cpm <- cpm(molecules_single_genes, log = TRUE)

## linear shift 
single_norm <- molecules_single_genes_cpm
single_norm[, ] <- NA
for (i in 1:ncol(single_norm)) {
  single_fit <- lm(molecules_single_ERCC_cpm[, i] ~ log2(ercc_single$conc_mix1))
  # Y = mX + b -> X = (Y - b) / m
  single_norm[, i] <- (molecules_single_genes_cpm[, i] - single_fit$coefficients[1]) /
                       single_fit$coefficients[2]
}
stopifnot(!is.na(single_norm))


pca_single_norm <- run_pca(single_norm)
plot_pca(pca_single_norm$PCs, explained = pca_single_norm$explained,
         metadata = anno_single, color = "individual",
         shape = "batch", factors = c("individual", "batch")) + labs(title = "Linear shift normalization (log2 cpm)")

## exponential
single_norm_expt <- 2^(single_norm)
pca_single_norm_expt <- run_pca(single_norm_expt)
plot_pca(pca_single_norm_expt$PCs, explained = pca_single_norm_expt$explained,
         metadata = anno_single, color = "individual",
         shape = "batch", factors = c("individual", "batch")) + labs(title = "Linear shift normalization (expt cpm)")

Session information

sessionInfo()
R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] testit_0.4    ggplot2_1.0.1 edgeR_3.10.2  limma_3.24.9  knitr_1.10.5 

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.0      digest_0.6.8     MASS_7.3-40      grid_3.2.0      
 [5] plyr_1.8.3       gtable_0.1.2     formatR_1.2      magrittr_1.5    
 [9] scales_0.2.4     evaluate_0.7     stringi_0.4-1    reshape2_1.4.1  
[13] rmarkdown_0.6.1  labeling_0.3     proto_0.3-10     tools_3.2.0     
[17] stringr_1.0.0    munsell_0.4.2    yaml_2.1.13      colorspace_1.2-6
[21] htmltools_0.2.6