Last updated: 2016-01-15

Code version: ab0cf1f917a4c115e9ee44403eb62e837e1795ae

Objective

Update the batch-corrected counts using data that were processed well-specific technical bias in molecule counts using ERCC molecule counts under a Poisson-based framework. See [link1] for details.

link1

Setup

source("functions.R")
require("limma")
Loading required package: limma
require("edgeR")
Loading required package: edgeR
require(ggplot2)
Loading required package: ggplot2
require(dplyr)
Loading required package: dplyr

Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
require(data.table)
Loading required package: data.table

Attaching package: 'data.table'

The following objects are masked from 'package:dplyr':

    between, last
theme_set(theme_bw(base_size = 12))

Poisson normalization

Import data: ERCC counts after filtering out low quality single cells are used to compute well-specific bias in ENSG molecule counts.

# Annotation for single cells included in downstream analysis
anno <- read.table("../data/annotation.txt",header=T,stringsAsFactors=F)
quality_single_cells <- scan("../data/quality-single-cells.txt",
                             what = "character")
anno_filter <- anno %>% filter(sample_id %in% quality_single_cells)

# Import expected ERCC counts
spike <- read.table("../data/expected-ercc-molecules.txt", header = TRUE,
                    sep = "\t", stringsAsFactors = FALSE)

# Import molecule counts of all genes (ERCC + ENSG) after filtering out low 
# quality single cells
molecules_filter <- read.table("../data/molecules-filter.txt", header = TRUE,
                               stringsAsFactors = FALSE)

# ERCC observed count
spike_input <- spike$ercc_molecules_well[spike$id %in% rownames(molecules_filter)]
names(spike_input) <- spike$id[spike$id %in% rownames(molecules_filter)]
spike_input <- spike_input[order(names(spike_input))]
tech <- grepl("ERCC", rownames(molecules_filter))
molecules_filter <- as.matrix(molecules_filter)
batches <- unique(anno_filter$batch)

# CPM corrected ENSG counts
molecules_cpm <- fread( "../data/molecules-cpm.txt", header = TRUE,
                         stringsAsFactors = FALSE)
setDF(molecules_cpm)
rownames(molecules_cpm) <- molecules_cpm$V1
molecules_cpm$V1 <- NULL 
molecules_cpm <- as.matrix(molecules_cpm)

Apply poisson regression to compute expected-to-observed count transformation paramters using ERCC count infomration.

pois_glm_params <- lapply(1:3,function(g) numeric(ncol(molecules_cpm)))
names(pois_glm_params) = c("intercept","nu","theta")
for (i in 1:ncol(molecules_cpm)) {
  fit <- glm(molecules_filter[names(spike_input), i] ~ log(spike_input), family="poisson")
  pois_glm_params$intercept[i] <- fit$coefficients[1]
  pois_glm_params$nu[i] <- fit$coefficients[2]
}
pois_glm_expression <- sweep( sweep( molecules_cpm, 2, pois_glm_params$intercept, "-"), 2, pois_glm_params$nu, "/" )

PCA

molecules_pois_transform <- run_pca(pois_glm_expression)
molecules_pois_transform_plot <- plot_pca(molecules_pois_transform$PCs, 
                                          explained = molecules_pois_transform$explained,
                                          metadata = anno_filter, 
                                          color = "individual",
                                          shape = "replicate") +
                                    labs(title = "After poisson normalization")

Remove unwanted variation

Load the Humanzee package

if (!require(Humanzee, quietly = TRUE)) {
  library(devtools)
  install_github("jhsiao999/Humanzee")
  library(Humanzee)
}

Create design matrix and compute a consensus correlation coefficient using limma’s duplicateCorrelation function.

block <- anno_filter$batch
design <- model.matrix(~ 1 + individual, data = anno_filter)

Compute correlation between replicates.

dup_corrs_file <- "../data/dup-corrs-poisson.rda"
if (file.exists(dup_corrs_file)) {
  load(dup_corrs_file)
} else{
  dup_corrs <- duplicateCorrelation(pois_glm_expression,
                                    design = design, block = block)
  save(dup_corrs, file = dup_corrs_file)
}
str(dup_corrs)
List of 3
 $ consensus.correlation: num 0.0469
 $ cor                  : num 0.0469
 $ atanh.correlations   : num [1:10564] 0.003212 0.136288 0.115906 0.000111 0.012663 ...

Fit a mixed model with the 8 batches being the random effect.

if (file.exists("../data/limma-crossed-poisson.rda")) {
  load("../data/limma-crossed-poisson.rda")
} else {
  gls_fit <- Humanzee::ruv_mixed_model(pois_glm_expression,
                      ndups = 1,
                      design = design, block = block,
                      correlation = dup_corrs$cons)
  save(gls_fit, file = "../data/limma-crossed-poisson.rda")
}

Compute expression levels after removing variation due to random effects.

molecules_final <- t( design %*% t(gls_fit$coef) ) + gls_fit$resid
colnames(molecules_final) <- anno_filter$sample_id
rownames(molecules_final) <- rownames(molecules_cpm)

Export final data

data_file <- "../data/molecules-final-pois.txt"
if (!file.exists(data_file)) {
  write.table(molecules_final, 
              data_file, quote = FALSE, sep = "\t",
              row.names = TRUE)
}
pca_final <- run_pca(molecules_final)
pca_final_plot <- plot_pca(pca_final$PCs, explained = pca_final$explained,
         metadata = anno_filter, color = "individual",
         shape = "replicate") +
  labs(title = "Batch-corrected")

PCA plots

theme_set(theme_bw(base_size = 8))
cowplot::plot_grid(
    molecules_pois_transform_plot,
    pca_final_plot,
  ncol = 2,
  labels = LETTERS[1:2])

Session information

sessionInfo()
R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] Humanzee_0.1.0   testit_0.4       data.table_1.9.4 dplyr_0.4.2     
[5] ggplot2_1.0.1    edgeR_3.10.2     limma_3.24.9     knitr_1.10.5    

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.0      magrittr_1.5     MASS_7.3-40      cowplot_0.3.1   
 [5] munsell_0.4.2    colorspace_1.2-6 R6_2.1.1         stringr_1.0.0   
 [9] httr_0.6.1       plyr_1.8.3       tools_3.2.0      parallel_3.2.0  
[13] grid_3.2.0       gtable_0.1.2     DBI_0.3.1        htmltools_0.2.6 
[17] lazyeval_0.1.10  assertthat_0.1   yaml_2.1.13      digest_0.6.8    
[21] reshape2_1.4.1   formatR_1.2      bitops_1.0-6     RCurl_1.95-4.6  
[25] evaluate_0.7     rmarkdown_0.6.1  labeling_0.3     stringi_0.4-1   
[29] scales_0.2.4     chron_2.3-45     proto_0.3-10