HEAD
Last updated: 2015-10-07
Code version: d0a60fe49ff16f1cb9cc8d75b3a4b657eeea95b6
Last updated: 2015-09-28
Code version: 3f442a23f405d3564d9ab3197ec337df22fe9383
We investigated potential ordering effect of wells on each place on gene expression for each plate. Leng et al. discussed ordering effect in single-cell RNA-seq experiments using Fluidigm C1 and implemented an algorithm that detects ordering effect of wells on gene expression in OEFinder.
First, we applied OEFinder to each batch (within each individual) on the CPM-normalized data. The OEFinder returned a warning saying “essentially perfect fit: summary may be unreliable.”
We then applied OEFinder to one individual at a time, and the same error message returned…
source("functions.R")
library(edgeR)Warning: package 'edgeR' was built under R version 3.2.2Loading required package: limmalibrary(ggplot2)
theme_set(theme_bw(base_size = 16))Input annotation
anno <- read.table("../data/annotation.txt", header = TRUE,
                   stringsAsFactors = FALSE)
head(anno)  individual batch well     sample_id
1      19098     1  A01 NA19098.1.A01
2      19098     1  A02 NA19098.1.A02
3      19098     1  A03 NA19098.1.A03
4      19098     1  A04 NA19098.1.A04
5      19098     1  A05 NA19098.1.A05
6      19098     1  A06 NA19098.1.A06Input read counts.
reads <- read.table("../data/reads.txt", header = TRUE,
                    stringsAsFactors = FALSE)Input molecule counts
molecules <- read.table("../data/molecules.txt", header = TRUE, stringsAsFactors = FALSE)Remove bulk samples
single_samples <- anno$well != "bulk"
anno_single <- anno[ which(single_samples), ]
molecules_single <- molecules[ , which(single_samples)]
reads_single <- reads[ , which(single_samples)]
stopifnot(ncol(molecules_single) == nrow(anno_single),
          colnames(molecules_single) == anno_single$sample_id)Remove ERCC genes.
ii_nonERCC <- grep("ERCC", rownames(molecules_single), invert = TRUE)
molecules_single_ENSG <- molecules_single[ii_nonERCC, ]Remove genes with no counts and also overly expressed genes.
## remove gene with 0 counts
expressed_single_ENSG <- rowSums(molecules_single_ENSG) > 0
molecules_single_ENSG <- molecules_single[expressed_single_ENSG, ]
dim(molecules_single_ENSG)[1] 17769   864## remove gene with molecule count larger than 1024 (15 if them)
overexpressed_genes <- rownames(molecules_single_ENSG)[apply(molecules_single_ENSG, 1,
                                                        function(x) any(x >= 1024))]
molecules_single_ENSG <- molecules_single_ENSG[!(rownames(molecules_single_ENSG) %in% overexpressed_genes), ]
## collision probability and cpm molecule counts
molecules_single_collision <- -1024 * log(1 - molecules_single_ENSG / 1024)
molecules_single_cpm <- cpm(molecules_single_collision, log = TRUE)Output single cell CPM to txt files.
anno_single$unique_batch <- with(anno_single, paste(individual, batch, sep = "_"))
for (per_batch in 1:length(unique(anno_single$unique_batch))) {
  ii_batch <- anno_single$unique_batch == unique(anno_single$unique_batch)[per_batch]
  write.table(molecules_single_cpm[which(ii_batch), ], 
              file = paste("../data/molecules-single-cpm", "-", 
                           unique(anno_single$unique_batch)[per_batch], ".txt", sep = ""),
              quote = FALSE,
              col.names = TRUE, row.names = TRUE)
}Prepare capture site identification file. A txt file with one column of capture site ID (A, B, C, …, H).
require(stringr)
capture_site <- str_extract(anno_single$well, "[aA-zZ]+")
table(capture_site)
# Save capture_site to a txt file.
for (per_batch in 1:length(unique(anno_single$unique_batch))) {
  ii_batch <- anno_single$unique_batch == unique(anno_single$unique_batch)[per_batch]
  write.table(data.frame(site = capture_site[ii_batch]), 
              file = paste("../data/capture-site-",
                           unique(anno_single$unique_batch)[per_batch], ".txt", sep = ""),
              quote = FALSE,
              col.names = FALSE, row.names = FALSE)
}Output single cell CPM to txt files.
for (per_person in 1:length(unique(anno_single$individual))) {
  ii_person <- anno_single$individual == unique(anno_single$individual)[per_person]
  write.table(molecules_single_cpm[which(ii_person), ], 
              file = paste("../data/molecules-single-cpm", "-", 
                           unique(anno_single$individual)[per_person], ".txt", sep = ""),
              quote = FALSE,
              col.names = TRUE, row.names = TRUE)
}Prepare capture site identification file. A txt file with one column of capture site ID (A, B, C, …, H).
require(stringr)
capture_site <- str_extract(anno_single$well, "[aA-zZ]+")
table(capture_site)
# Save capture_site to a txt file.
for (per_person in 1:length(unique(anno_single$individual))) {
  ii_person <- anno_single$individual == unique(anno_single$individual)[per_person]
  write.table(data.frame(site = capture_site[ii_person]), 
              file = paste("../data/capture-site-",
                           unique(anno_single$individual)[per_person], ".txt", sep = ""),
              quote = FALSE,
              col.names = FALSE, row.names = FALSE)
}Upload molecules-single-cpm-(batch).txt and capture-site-(batch).txt to OEFinder Shiny GUI interface.
Output to singleCellSeq/data/OEFinder.
# Packages required to start OEFinder
library(shiny)
library(gdata)
library(shinyFiles)
library(EBSeq)
runGitHub("OEFinder", "lengning")sessionInfo()R version 3.2.1 (2015-06-18)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.10.4 (Yosemite)
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
=======
R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)
locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
>>>>>>> 62bc5a2ab71c7d07af0b00504bd53484166fd98b
attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     
other attached packages:
<<<<<<< HEAD
[1] ggplot2_1.0.1 edgeR_3.10.3  limma_3.24.15 knitr_1.11   
loaded via a namespace (and not attached):
 [1] Rcpp_0.12.1      digest_0.6.8     MASS_7.3-44      grid_3.2.1      
 [5] plyr_1.8.3       gtable_0.1.2     formatR_1.2.1    magrittr_1.5    
 [9] scales_0.3.0     evaluate_0.8     stringi_0.5-5    reshape2_1.4.1  
[13] rmarkdown_0.8    proto_0.3-10     tools_3.2.1      stringr_1.0.0   
[17] munsell_0.4.2    yaml_2.1.13      colorspace_1.2-6 htmltools_0.2.6