Ordering effect of the capture sites, all genes CPM normalized

Last updated: 2015-10-07

Code version: d0a60fe49ff16f1cb9cc8d75b3a4b657eeea95b6

Goal

We investigated potential ordering effect of wells on each place on gene expression across the 9 plates (batches). Leng et al. discussed ordering effect in single-cell RNA-seq experiments using Fluidigm C1 and implemented an algorithm that detects ordering effect of wells on gene expression in OEFinder.

OEFinder defaults DESeq normalization. Here we attempt to use CPM normalized data for OEFinder.

Even after CPM normalization, we still found that the genes identified as over-expressed by OEFinder to distribute across all levels of per gene expression (average across cells). 273 genes were identified as overexpressed after the data was CPM normalized.

Setup

source("functions.R")
library(edgeR)

Warning: package 'edgeR' was built under R version 3.2.2

Loading required package: limma

library(ggplot2)
theme_set(theme_bw(base_size = 16))

Prepare single cell data before filtering

Input annotation

anno <- read.table("../data/annotation.txt", header = TRUE,
                   stringsAsFactors = FALSE)
head(anno)

  individual batch well     sample_id
1      19098     1  A01 NA19098.1.A01
2      19098     1  A02 NA19098.1.A02
3      19098     1  A03 NA19098.1.A03
4      19098     1  A04 NA19098.1.A04
5      19098     1  A05 NA19098.1.A05
6      19098     1  A06 NA19098.1.A06

Input read counts.

reads <- read.table("../data/reads.txt", header = TRUE,
                    stringsAsFactors = FALSE)

Input molecule counts

molecules <- read.table("../data/molecules.txt", header = TRUE, stringsAsFactors = FALSE)

Remove bulk samples

single_samples <- anno$well != "bulk"
anno_single <- anno[ which(single_samples), ]
molecules_single <- molecules[ , which(single_samples)]
reads_single <- reads[ , which(single_samples)]
stopifnot(ncol(molecules_single) == nrow(anno_single),
          colnames(molecules_single) == anno_single$sample_id)

Remove ERCC genes.

ii_nonERCC <- grep("ERCC", rownames(molecules_single), invert = TRUE)

molecules_single_ENSG <- molecules_single[ii_nonERCC, ]

Remove genes with no counts and also overly expressed genes.

## remove gene with 0 counts
expressed_single_ENSG <- rowSums(molecules_single_ENSG) > 0
molecules_single_ENSG <- molecules_single_ENSG[expressed_single_ENSG, ]
dim(molecules_single_ENSG)

[1] 17687   864

## remove gene with molecule count larger than 1024 (15 if them)
overexpressed_genes <- rownames(molecules_single_ENSG)[apply(molecules_single_ENSG, 1,
                                                        function(x) any(x >= 1024))]
molecules_single_ENSG <- molecules_single_ENSG[!(rownames(molecules_single_ENSG) %in% overexpressed_genes), ]

## collision probability and cpm molecule counts
molecules_single_collision <- -1024 * log(1 - molecules_single_ENSG / 1024)
molecules_single_cpm <- cpm(molecules_single_collision, log = TRUE)

Output single cell CPM to txt files.

if (!file.exists("../data/molecules-single-cpm.txt")) {
  write.table(molecules_single_cpm, 
              file = "../data/molecules-single-cpm.txt",
              quote = FALSE,
              col.names = TRUE, row.names = TRUE)
}

Prepare capture site identification file. A txt file with one column of capture site ID (A, B, C, …, H).

require(stringr)

Loading required package: stringr

capture_site <- str_extract(anno_single$well, "[aA-zZ]+")
table(capture_site)

capture_site
  A   B   C   D   E   F   G   H 
108 108 108 108 108 108 108 108

Save capture_site to a txt file.

if (!file.exists("../data/capture-site.txt")) {
write.table(data.frame(site = capture_site), 
            file = "../data/capture-site.txt",
            quote = FALSE,
            col.names = FALSE, row.names = FALSE)
}

OEFinder

Upload molecules-single-cpm.txt and capture-site.txt to OEFinder Shiny GUI interface.

Output to singleCellSeq/data/OEFinder.

Run OEFinder

# Packages required to start OEFinder
library(shiny)
library(gdata)
library(shinyFiles)
library(EBSeq)

runGitHub("OEFinder", "lengning")

Load OEFinder outputted genes.

OE_cpm <- read.csv("../data/OEFinder/cpm-OEgenes.csv",
                     stringsAsFactors = FALSE,
                     quote = "\"", sep = ",", header = TRUE)
colnames(OE_cpm) <- c("genes", "pvalue")
head(OE_cpm)

            genes pvalue
1 ENSG00000175756      0
2 ENSG00000162585      0
3 ENSG00000116670      0
4 ENSG00000117318      0
5 ENSG00000214026      0
6 ENSG00000167996      0

str(OE_cpm)

'data.frame':   273 obs. of  2 variables:
 $ genes : chr  "ENSG00000175756" "ENSG00000162585" "ENSG00000116670" "ENSG00000117318" ...
 $ pvalue: num  0 0 0 0 0 0 0 0 0 0 ...

OE genes and mean expression

cutoffs <- seq(1001, nrow(molecules_single_cpm), by = 1000)
cutoffs <- c(cutoffs, nrow(molecules_single_cpm))
top_genes_count <- lapply(1:length(cutoffs), function(cut) {
                        per_cutoff <- cutoffs[cut]
                        cell_across_order <- order(rowSums(molecules_single_cpm), decreasing = TRUE)
                        top_genes <- rownames(molecules_single_cpm)[cell_across_order < per_cutoff]
                        sum(OE_cpm$genes %in% top_genes)
                        })
top_genes_count <- do.call(c, top_genes_count)

ggplot(data.frame(top_count = top_genes_count,
                  cutoffs = cutoffs), 
       aes(x = as.factor(cutoffs), y = top_count)) + geom_point() +
       labs(x = "Top X genes as in per gene mean expression", 
            y = "Number of OEFinder OE genes")

OE genes identified by OEFinder were not limited to the top 1000 genes. On the contrary, we found OE genes at all levels of gene expression (averaged acrosss cells).

OE genes and variability of expression

cutoffs <- seq(1001, nrow(molecules_single_cpm), by = 1000)
cutoffs <- c(cutoffs, nrow(molecules_single_cpm))
top_genes_cv <- 
  lapply(1:length(cutoffs), function(cut) {
        per_cutoff <- cutoffs[cut]
        per_gene_cv <- apply(molecules_single_cpm, 1, sd)/apply(molecules_single_cpm, 1, mean)
        cell_across_order <- order(per_gene_cv, decreasing = TRUE)
        top_genes <- rownames(molecules_single_cpm)[cell_across_order < per_cutoff]
        sum(OE_cpm$genes %in% top_genes)
        })
top_genes_cv <- do.call(c, top_genes_cv)

ggplot(data.frame(top_cv = top_genes_cv,
                  cutoffs = cutoffs), 
       aes(x = as.factor(cutoffs), y = top_cv)) + geom_point() +
       labs(x = "Top X genes as in per gene CV", 
            y = "Number of OEFinder OE genes")

OE genes by individual

Extract OE gene information.

is_OE <- which(rownames(molecules_single_cpm) %in% OE_cpm$genes)

OE_cpm <- molecules_single_cpm[ is_OE , ]

# OE_cpm <- molecules_single_cpm[ is_OE , anno_single$individual == "19098"]
# capture_site_ind <- capture_site[anno_single$individual == "19098"]
# batch_ind <- anno_single$batch[anno_single$individual == "19098"]

Compute CV of OE genes.

Top 5 CV

OE_cv <- apply(OE_cpm, 1, sd)/apply(OE_cpm, 1, mean)
is_top5_cv <- order(OE_cv, decreasing = TRUE)[1:5]

df_plot <- data.frame(cpm = c(t( OE_cpm[is_top5_cv, ]  )),
                  site = as.numeric(as.factor(capture_site)),
                  batch = as.factor(anno_single$batch),
                  individual = as.factor(anno_single$individual),
                  gene = rep(rownames(OE_cpm[is_top5_cv, ]), each = ncol(OE_cpm) ) )

#df_plot <- df_plot[df_plot$batch == 1, ]
ggplot(df_plot,
       aes(x = site, y = cpm)) + 
  geom_point() +
  facet_grid(gene ~ individual) +
  stat_smooth(method = loess, formula = y ~ x ) + 
  ggtitle("Top 5 CV among the OE genes")

Bottom 5 CV

OE_cv <- apply(OE_cpm, 1, sd)/apply(OE_cpm, 1, mean)
is_bottom5_cv <- order(OE_cv)[1:5]

df_plot <- data.frame(cpm = c(t( OE_cpm[is_bottom5_cv, ]  )),
                  site = as.numeric(as.factor(capture_site)),
                  batch = as.factor(anno_single$batch),
                  individual = as.factor(anno_single$individual),
                  gene = rep(rownames(OE_cpm[is_bottom5_cv, ]), each = ncol(OE_cpm) ) )

#df_plot <- df_plot[df_plot$batch == 1, ]
ggplot(df_plot,
       aes(x = site, y = cpm)) + 
  geom_point() +
  facet_grid(gene ~ individual) +
  stat_smooth(method = loess, formula = y ~ x ) + 
  ggtitle("Bottom 5 CV among the OE genes")

Coefficient of variation by individual.

OE_cv <- apply(OE_cpm, 1, sd)/apply(OE_cpm, 1, mean)
is_bottom5_cv <- order(OE_cv)[1:5]

df_plot <- data.frame(cpm = c(t( OE_cpm[is_bottom5_cv, ]  )),
                  site = as.numeric(as.factor(capture_site)),
                  batch = as.factor(anno_single$batch),
                  individual = as.factor(anno_single$individual),
                  gene = rep(rownames(OE_cpm[is_bottom5_cv, ]), each = ncol(OE_cpm) ) )

#df_plot <- df_plot[df_plot$batch == 1, ]
ggplot(df_plot,
       aes(x = site, y = cpm)) + 
  geom_point() +
  facet_grid(gene ~ individual) +
  stat_smooth(method = loess, formula = y ~ x ) + 
  ggtitle("Bottom 5 CV among the OE genes")

Gene annotation

library(mygene)

gene_query <- queryMany(OE_cpm$genes, scopes="ensembl.gene", 
                 fields=c("name", "summary"), species="human")

kable(as.data.frame(gene_query)[ , 1:4])

Session information

sessionInfo()

R version 3.2.1 (2015-06-18)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.10.4 (Yosemite)

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] stringr_1.0.0 ggplot2_1.0.1 edgeR_3.10.3  limma_3.24.15 knitr_1.11   

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.1      digest_0.6.8     MASS_7.3-44      grid_3.2.1      
 [5] plyr_1.8.3       gtable_0.1.2     formatR_1.2.1    magrittr_1.5    
 [9] scales_0.3.0     evaluate_0.8     stringi_0.5-5    reshape2_1.4.1  
[13] rmarkdown_0.8    labeling_0.3     proto_0.3-10     tools_3.2.1     
[17] munsell_0.4.2    yaml_2.1.13      colorspace_1.2-6 htmltools_0.2.6