Last updated: 2015-10-07
Code version: d0a60fe49ff16f1cb9cc8d75b3a4b657eeea95b6
We investigated potential ordering effect of wells on each place on gene expression across the 9 plates (batches). Leng et al. discussed ordering effect in single-cell RNA-seq experiments using Fluidigm C1 and implemented an algorithm that detects ordering effect of wells on gene expression in OEFinder.
OEFinder defaults DESeq normalization. Here we attempt to use CPM normalized data for OEFinder.
Even after CPM normalization, we still found that the genes identified as over-expressed by OEFinder to distribute across all levels of per gene expression (average across cells). 273 genes were identified as overexpressed after the data was CPM normalized.
source("functions.R")
library(edgeR)
Warning: package 'edgeR' was built under R version 3.2.2
Loading required package: limma
library(ggplot2)
theme_set(theme_bw(base_size = 16))
Input annotation
anno <- read.table("../data/annotation.txt", header = TRUE,
stringsAsFactors = FALSE)
head(anno)
individual batch well sample_id
1 19098 1 A01 NA19098.1.A01
2 19098 1 A02 NA19098.1.A02
3 19098 1 A03 NA19098.1.A03
4 19098 1 A04 NA19098.1.A04
5 19098 1 A05 NA19098.1.A05
6 19098 1 A06 NA19098.1.A06
Input read counts.
reads <- read.table("../data/reads.txt", header = TRUE,
stringsAsFactors = FALSE)
Input molecule counts
molecules <- read.table("../data/molecules.txt", header = TRUE, stringsAsFactors = FALSE)
Remove bulk samples
single_samples <- anno$well != "bulk"
anno_single <- anno[ which(single_samples), ]
molecules_single <- molecules[ , which(single_samples)]
reads_single <- reads[ , which(single_samples)]
stopifnot(ncol(molecules_single) == nrow(anno_single),
colnames(molecules_single) == anno_single$sample_id)
Remove ERCC genes.
ii_nonERCC <- grep("ERCC", rownames(molecules_single), invert = TRUE)
molecules_single_ENSG <- molecules_single[ii_nonERCC, ]
Remove genes with no counts and also overly expressed genes.
## remove gene with 0 counts
expressed_single_ENSG <- rowSums(molecules_single_ENSG) > 0
molecules_single_ENSG <- molecules_single_ENSG[expressed_single_ENSG, ]
dim(molecules_single_ENSG)
[1] 17687 864
## remove gene with molecule count larger than 1024 (15 if them)
overexpressed_genes <- rownames(molecules_single_ENSG)[apply(molecules_single_ENSG, 1,
function(x) any(x >= 1024))]
molecules_single_ENSG <- molecules_single_ENSG[!(rownames(molecules_single_ENSG) %in% overexpressed_genes), ]
## collision probability and cpm molecule counts
molecules_single_collision <- -1024 * log(1 - molecules_single_ENSG / 1024)
molecules_single_cpm <- cpm(molecules_single_collision, log = TRUE)
Output single cell CPM to txt files.
if (!file.exists("../data/molecules-single-cpm.txt")) {
write.table(molecules_single_cpm,
file = "../data/molecules-single-cpm.txt",
quote = FALSE,
col.names = TRUE, row.names = TRUE)
}
Prepare capture site identification file. A txt file with one column of capture site ID (A, B, C, …, H).
require(stringr)
Loading required package: stringr
capture_site <- str_extract(anno_single$well, "[aA-zZ]+")
table(capture_site)
capture_site
A B C D E F G H
108 108 108 108 108 108 108 108
Save capture_site to a txt file.
if (!file.exists("../data/capture-site.txt")) {
write.table(data.frame(site = capture_site),
file = "../data/capture-site.txt",
quote = FALSE,
col.names = FALSE, row.names = FALSE)
}
Upload molecules-single-cpm.txt and capture-site.txt to OEFinder Shiny GUI interface.
Output to singleCellSeq/data/OEFinder.
# Packages required to start OEFinder
library(shiny)
library(gdata)
library(shinyFiles)
library(EBSeq)
runGitHub("OEFinder", "lengning")
Load OEFinder outputted genes.
OE_cpm <- read.csv("../data/OEFinder/cpm-OEgenes.csv",
stringsAsFactors = FALSE,
quote = "\"", sep = ",", header = TRUE)
colnames(OE_cpm) <- c("genes", "pvalue")
head(OE_cpm)
genes pvalue
1 ENSG00000175756 0
2 ENSG00000162585 0
3 ENSG00000116670 0
4 ENSG00000117318 0
5 ENSG00000214026 0
6 ENSG00000167996 0
str(OE_cpm)
'data.frame': 273 obs. of 2 variables:
$ genes : chr "ENSG00000175756" "ENSG00000162585" "ENSG00000116670" "ENSG00000117318" ...
$ pvalue: num 0 0 0 0 0 0 0 0 0 0 ...
cutoffs <- seq(1001, nrow(molecules_single_cpm), by = 1000)
cutoffs <- c(cutoffs, nrow(molecules_single_cpm))
top_genes_count <- lapply(1:length(cutoffs), function(cut) {
per_cutoff <- cutoffs[cut]
cell_across_order <- order(rowSums(molecules_single_cpm), decreasing = TRUE)
top_genes <- rownames(molecules_single_cpm)[cell_across_order < per_cutoff]
sum(OE_cpm$genes %in% top_genes)
})
top_genes_count <- do.call(c, top_genes_count)
ggplot(data.frame(top_count = top_genes_count,
cutoffs = cutoffs),
aes(x = as.factor(cutoffs), y = top_count)) + geom_point() +
labs(x = "Top X genes as in per gene mean expression",
y = "Number of OEFinder OE genes")
OE genes identified by OEFinder were not limited to the top 1000 genes. On the contrary, we found OE genes at all levels of gene expression (averaged acrosss cells).
cutoffs <- seq(1001, nrow(molecules_single_cpm), by = 1000)
cutoffs <- c(cutoffs, nrow(molecules_single_cpm))
top_genes_cv <-
lapply(1:length(cutoffs), function(cut) {
per_cutoff <- cutoffs[cut]
per_gene_cv <- apply(molecules_single_cpm, 1, sd)/apply(molecules_single_cpm, 1, mean)
cell_across_order <- order(per_gene_cv, decreasing = TRUE)
top_genes <- rownames(molecules_single_cpm)[cell_across_order < per_cutoff]
sum(OE_cpm$genes %in% top_genes)
})
top_genes_cv <- do.call(c, top_genes_cv)
ggplot(data.frame(top_cv = top_genes_cv,
cutoffs = cutoffs),
aes(x = as.factor(cutoffs), y = top_cv)) + geom_point() +
labs(x = "Top X genes as in per gene CV",
y = "Number of OEFinder OE genes")
Extract OE gene information.
is_OE <- which(rownames(molecules_single_cpm) %in% OE_cpm$genes)
OE_cpm <- molecules_single_cpm[ is_OE , ]
# OE_cpm <- molecules_single_cpm[ is_OE , anno_single$individual == "19098"]
# capture_site_ind <- capture_site[anno_single$individual == "19098"]
# batch_ind <- anno_single$batch[anno_single$individual == "19098"]
Compute CV of OE genes.
OE_cv <- apply(OE_cpm, 1, sd)/apply(OE_cpm, 1, mean)
is_top5_cv <- order(OE_cv, decreasing = TRUE)[1:5]
df_plot <- data.frame(cpm = c(t( OE_cpm[is_top5_cv, ] )),
site = as.numeric(as.factor(capture_site)),
batch = as.factor(anno_single$batch),
individual = as.factor(anno_single$individual),
gene = rep(rownames(OE_cpm[is_top5_cv, ]), each = ncol(OE_cpm) ) )
#df_plot <- df_plot[df_plot$batch == 1, ]
ggplot(df_plot,
aes(x = site, y = cpm)) +
geom_point() +
facet_grid(gene ~ individual) +
stat_smooth(method = loess, formula = y ~ x ) +
ggtitle("Top 5 CV among the OE genes")
OE_cv <- apply(OE_cpm, 1, sd)/apply(OE_cpm, 1, mean)
is_bottom5_cv <- order(OE_cv)[1:5]
df_plot <- data.frame(cpm = c(t( OE_cpm[is_bottom5_cv, ] )),
site = as.numeric(as.factor(capture_site)),
batch = as.factor(anno_single$batch),
individual = as.factor(anno_single$individual),
gene = rep(rownames(OE_cpm[is_bottom5_cv, ]), each = ncol(OE_cpm) ) )
#df_plot <- df_plot[df_plot$batch == 1, ]
ggplot(df_plot,
aes(x = site, y = cpm)) +
geom_point() +
facet_grid(gene ~ individual) +
stat_smooth(method = loess, formula = y ~ x ) +
ggtitle("Bottom 5 CV among the OE genes")
Coefficient of variation by individual.
OE_cv <- apply(OE_cpm, 1, sd)/apply(OE_cpm, 1, mean)
is_bottom5_cv <- order(OE_cv)[1:5]
df_plot <- data.frame(cpm = c(t( OE_cpm[is_bottom5_cv, ] )),
site = as.numeric(as.factor(capture_site)),
batch = as.factor(anno_single$batch),
individual = as.factor(anno_single$individual),
gene = rep(rownames(OE_cpm[is_bottom5_cv, ]), each = ncol(OE_cpm) ) )
#df_plot <- df_plot[df_plot$batch == 1, ]
ggplot(df_plot,
aes(x = site, y = cpm)) +
geom_point() +
facet_grid(gene ~ individual) +
stat_smooth(method = loess, formula = y ~ x ) +
ggtitle("Bottom 5 CV among the OE genes")
library(mygene)
gene_query <- queryMany(OE_cpm$genes, scopes="ensembl.gene",
fields=c("name", "summary"), species="human")
kable(as.data.frame(gene_query)[ , 1:4])
sessionInfo()
R version 3.2.1 (2015-06-18)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.10.4 (Yosemite)
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] stringr_1.0.0 ggplot2_1.0.1 edgeR_3.10.3 limma_3.24.15 knitr_1.11
loaded via a namespace (and not attached):
[1] Rcpp_0.12.1 digest_0.6.8 MASS_7.3-44 grid_3.2.1
[5] plyr_1.8.3 gtable_0.1.2 formatR_1.2.1 magrittr_1.5
[9] scales_0.3.0 evaluate_0.8 stringi_0.5-5 reshape2_1.4.1
[13] rmarkdown_0.8 labeling_0.3 proto_0.3-10 tools_3.2.1
[17] munsell_0.4.2 yaml_2.1.13 colorspace_1.2-6 htmltools_0.2.6