Last updated: 2015-08-11

Code version: 54b5090efa940ffc9920fdee95e8364552464e2b

Input

Input annotation.

anno <- read.table("../data/annotation.txt", header = TRUE,
                   stringsAsFactors = FALSE)
head(anno)
  individual batch well     sample_id
1      19098     1  A01 NA19098.1.A01
2      19098     1  A02 NA19098.1.A02
3      19098     1  A03 NA19098.1.A03
4      19098     1  A04 NA19098.1.A04
5      19098     1  A05 NA19098.1.A05
6      19098     1  A06 NA19098.1.A06

Input ERCC concentration information.

ercc <- read.table("../data/ercc-info.txt", header = TRUE, sep = "\t",
                   stringsAsFactors = FALSE)
colnames(ercc) <- c("num", "id", "subgroup", "conc_mix1", "conc_mix2",
                    "expected_fc", "log2_mix1_mix2")
head(ercc)
  num         id subgroup conc_mix1  conc_mix2 expected_fc log2_mix1_mix2
1   1 ERCC-00130        A 30000.000 7500.00000           4              2
2   2 ERCC-00004        A  7500.000 1875.00000           4              2
3   3 ERCC-00136        A  1875.000  468.75000           4              2
4   4 ERCC-00108        A   937.500  234.37500           4              2
5   5 ERCC-00116        A   468.750  117.18750           4              2
6   6 ERCC-00092        A   234.375   58.59375           4              2
stopifnot(nrow(ercc) == 92)

Input molecule counts.

molecules <- read.table("../data/molecules.txt", header = TRUE,
                    stringsAsFactors = FALSE)

Input list of quality single cells.

quality_single_cells <- scan("../data/quality-single-cells.txt",
                             what = "character")

Prepare single cell molecule data

Keep only the single cells that passed the QC filters. This also removes the bulk samples.

molecules_single <- molecules[, colnames(molecules) %in% quality_single_cells]
anno_single <- anno[anno$sample_id %in% quality_single_cells, ]
stopifnot(ncol(molecules_single) == nrow(anno_single),
          colnames(molecules_single) == anno_single$sample_id)

Also remove batch 2 of individual 19098.

molecules_single <- molecules_single[, !(anno_single$individual == 19098 & anno_single$batch == 2)]
anno_single <- anno_single[!(anno_single$individual == 19098 & anno_single$batch == 2), ]
stopifnot(ncol(molecules_single) == nrow(anno_single))

Remove genes with zero read counts in the single cells.

expressed_single <- rowSums(molecules_single) > 0
molecules_single <- molecules_single[expressed_single, ]
dim(molecules_single)
[1] 17478   563

How many genes have greater than or equal to 1,024 molecules in at least one of the cells?

overexpressed_genes <- rownames(molecules_single)[apply(molecules_single, 1,
                                                        function(x) any(x >= 1024))]

15 have greater than or equal to 1,024 molecules. Remove them.

molecules_single <- molecules_single[!(rownames(molecules_single) %in% overexpressed_genes), ]

Calculate the molecule counts of each ERCC in single cell

Accorindg to Pollen et al., there are total 28,000 ERCC molecule per cell when the ERCC stock is used in 1:20,000 dilution. In our study, I used 1:50,000 dilution. Therefore, we have total 28000/2.5= 11200 ERCC molecule per cell.

#### sort ercc file by gene ID
ercc <- ercc[order(ercc$id), ]

#### calculate the number of each ERCC based on the genes detected
ratio <- 11200/ sum(ercc[,"conc_mix1"])
ratio
[1] 0.1081968
ercc$counts <- ercc[,"conc_mix1"] * ratio
head(ercc)
   num         id subgroup    conc_mix1    conc_mix2 expected_fc
70  70 ERCC-00002        D 1.500000e+04 3.000000e+04        0.50
72  72 ERCC-00003        D 9.375000e+02 1.875000e+03        0.50
2    2 ERCC-00004        A 7.500000e+03 1.875000e+03        4.00
26  26 ERCC-00009        B 9.375000e+02 9.375000e+02        1.00
67  67 ERCC-00012        C 1.144409e-01 1.716614e-01        0.67
86  86 ERCC-00013        D 9.155273e-01 1.831055e+00        0.50
   log2_mix1_mix2       counts
70          -1.00 1.622953e+03
72          -1.00 1.014345e+02
2            2.00 8.114764e+02
26           0.00 1.014345e+02
67          -0.58 1.238215e-02
86          -1.00 9.905717e-02
#### try 28000 to see if integers were generated 
ratio_t <- 28000/ sum(ercc[,"conc_mix1"])
ercc$counts_t <- ercc[,"conc_mix1"] * ratio_t
head(ercc)
   num         id subgroup    conc_mix1    conc_mix2 expected_fc
70  70 ERCC-00002        D 1.500000e+04 3.000000e+04        0.50
72  72 ERCC-00003        D 9.375000e+02 1.875000e+03        0.50
2    2 ERCC-00004        A 7.500000e+03 1.875000e+03        4.00
26  26 ERCC-00009        B 9.375000e+02 9.375000e+02        1.00
67  67 ERCC-00012        C 1.144409e-01 1.716614e-01        0.67
86  86 ERCC-00013        D 9.155273e-01 1.831055e+00        0.50
   log2_mix1_mix2       counts     counts_t
70          -1.00 1.622953e+03 4.057382e+03
72          -1.00 1.014345e+02 2.535864e+02
2            2.00 8.114764e+02 2.028691e+03
26           0.00 1.014345e+02 2.535864e+02
67          -0.58 1.238215e-02 3.095537e-02
86          -1.00 9.905717e-02 2.476429e-01

Look at the correlation using the function that John created

correlate_ercc <- function(observed, expected, description = "") {
  # Plots the relationship between the observed ERCC data and the expected ERCC
  # concentrations.

  # Args:
  #  observed: vector of summary statistic of observed ERCC counts
  #  expected: vector of ERCC concentrations
  #  description: optional string to add to title
  plot(expected, observed)
  ercc_fit <- lm(observed ~ expected)
  abline(ercc_fit, col = "red")
  title(sprintf("%s\nY ~ %.2fX + %.2f ; R-squared: %.2f", description,
                ercc_fit$coefficients[2], ercc_fit$coefficients[1],
                summary(ercc_fit)$r.squared))
}

#### number of ERCC detected in single cells
ercc_rows_molecules_single <- grep("ERCC", rownames(molecules_single))
length(ercc_rows_molecules_single)
[1] 81
#### keep only the detected ERCC in the ercc list
ercc_molecules_single <- ercc[ercc$id %in% rownames(molecules_single), ]
stopifnot(rownames(molecules_single[ercc_rows_molecules_single, ]) ==
          ercc_molecules_single$id)

#### what's the correlation of counts?
correlate_ercc(rowMeans(molecules_single[ercc_rows_molecules_single, ]), ercc_molecules_single$counts,
               description = "single cell molecules")

The collision and the standardization

Correct for the collision probability.

First correct and look for the correlation. See Grun et al. 2014 for details.

molecules_single_collision <- -1024 * log(1 - molecules_single / 1024)

correlate_ercc(rowMeans(molecules_single_collision[ercc_rows_molecules_single, ]), ercc_molecules_single$counts,
               description = "single cell molecules (after correction)")

Standardize molecule numbers.

molecules_single_cpm <- cpm(molecules_single_collision, log = TRUE)

correlate_ercc(rowMeans(molecules_single_cpm[ercc_rows_molecules_single, ]), log2(ercc_molecules_single$counts),
               description = "single cell log2 molecules per million")

Session information

sessionInfo()
R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] ggplot2_1.0.1 edgeR_3.10.2  limma_3.24.9  knitr_1.10.5 

loaded via a namespace (and not attached):
 [1] Rcpp_0.11.6      digest_0.6.8     MASS_7.3-40      grid_3.2.0      
 [5] plyr_1.8.2       gtable_0.1.2     formatR_1.2      magrittr_1.5    
 [9] scales_0.2.4     evaluate_0.7     stringi_0.4-1    reshape2_1.4.1  
[13] rmarkdown_0.6.1  proto_0.3-10     tools_3.2.0      stringr_1.0.0   
[17] munsell_0.4.2    yaml_2.1.13      colorspace_1.2-6 htmltools_0.2.6