<<<<<<< HEAD Potential batch effect in the ordering effect of the capture sites ======= Potential batch effect in the ordering effect of the capture sites >>>>>>> 62bc5a2ab71c7d07af0b00504bd53484166fd98b <<<<<<< HEAD
<<<<<<< HEAD

Last updated: 2015-10-07

Code version: d0a60fe49ff16f1cb9cc8d75b3a4b657eeea95b6

Objective

=======

Last updated: 2015-09-28

Code version: 3f442a23f405d3564d9ab3197ec337df22fe9383

Goal

>>>>>>> 62bc5a2ab71c7d07af0b00504bd53484166fd98b

We investigated potential ordering effect of wells on each place on gene expression for each plate. Leng et al. discussed ordering effect in single-cell RNA-seq experiments using Fluidigm C1 and implemented an algorithm that detects ordering effect of wells on gene expression in OEFinder.

First, we applied OEFinder to each batch (within each individual) on the CPM-normalized data. The OEFinder returned a warning saying “essentially perfect fit: summary may be unreliable.”

We then applied OEFinder to one individual at a time, and the same error message returned…

Setup

source("functions.R")
library(edgeR)
<<<<<<< HEAD
Warning: package 'edgeR' was built under R version 3.2.2
======= >>>>>>> 62bc5a2ab71c7d07af0b00504bd53484166fd98b
Loading required package: limma
library(ggplot2)
theme_set(theme_bw(base_size = 16))

Prepare single cell data before filtering

Input annotation

anno <- read.table("../data/annotation.txt", header = TRUE,
                   stringsAsFactors = FALSE)
head(anno)
  individual batch well     sample_id
1      19098     1  A01 NA19098.1.A01
2      19098     1  A02 NA19098.1.A02
3      19098     1  A03 NA19098.1.A03
4      19098     1  A04 NA19098.1.A04
5      19098     1  A05 NA19098.1.A05
6      19098     1  A06 NA19098.1.A06

Input read counts.

reads <- read.table("../data/reads.txt", header = TRUE,
                    stringsAsFactors = FALSE)

Input molecule counts

molecules <- read.table("../data/molecules.txt", header = TRUE, stringsAsFactors = FALSE)

Remove bulk samples

single_samples <- anno$well != "bulk"
anno_single <- anno[ which(single_samples), ]
molecules_single <- molecules[ , which(single_samples)]
reads_single <- reads[ , which(single_samples)]
stopifnot(ncol(molecules_single) == nrow(anno_single),
          colnames(molecules_single) == anno_single$sample_id)

Remove ERCC genes.

ii_nonERCC <- grep("ERCC", rownames(molecules_single), invert = TRUE)

molecules_single_ENSG <- molecules_single[ii_nonERCC, ]

Remove genes with no counts and also overly expressed genes.

## remove gene with 0 counts
expressed_single_ENSG <- rowSums(molecules_single_ENSG) > 0
molecules_single_ENSG <- molecules_single[expressed_single_ENSG, ]
dim(molecules_single_ENSG)
[1] 17769   864
## remove gene with molecule count larger than 1024 (15 if them)
overexpressed_genes <- rownames(molecules_single_ENSG)[apply(molecules_single_ENSG, 1,
                                                        function(x) any(x >= 1024))]
molecules_single_ENSG <- molecules_single_ENSG[!(rownames(molecules_single_ENSG) %in% overexpressed_genes), ]

## collision probability and cpm molecule counts
molecules_single_collision <- -1024 * log(1 - molecules_single_ENSG / 1024)
molecules_single_cpm <- cpm(molecules_single_collision, log = TRUE)

Prepare OEFinder input

Per batch, per individual

Output single cell CPM to txt files.

anno_single$unique_batch <- with(anno_single, paste(individual, batch, sep = "_"))
for (per_batch in 1:length(unique(anno_single$unique_batch))) {
  ii_batch <- anno_single$unique_batch == unique(anno_single$unique_batch)[per_batch]
  write.table(molecules_single_cpm[which(ii_batch), ], 
              file = paste("../data/molecules-single-cpm", "-", 
                           unique(anno_single$unique_batch)[per_batch], ".txt", sep = ""),
              quote = FALSE,
              col.names = TRUE, row.names = TRUE)
}

Prepare capture site identification file. A txt file with one column of capture site ID (A, B, C, …, H).

require(stringr)
capture_site <- str_extract(anno_single$well, "[aA-zZ]+")
table(capture_site)

# Save capture_site to a txt file.
for (per_batch in 1:length(unique(anno_single$unique_batch))) {
  ii_batch <- anno_single$unique_batch == unique(anno_single$unique_batch)[per_batch]
  write.table(data.frame(site = capture_site[ii_batch]), 
              file = paste("../data/capture-site-",
                           unique(anno_single$unique_batch)[per_batch], ".txt", sep = ""),
              quote = FALSE,
              col.names = FALSE, row.names = FALSE)
}

Per individual

Output single cell CPM to txt files.

for (per_person in 1:length(unique(anno_single$individual))) {
  ii_person <- anno_single$individual == unique(anno_single$individual)[per_person]
  write.table(molecules_single_cpm[which(ii_person), ], 
              file = paste("../data/molecules-single-cpm", "-", 
                           unique(anno_single$individual)[per_person], ".txt", sep = ""),
              quote = FALSE,
              col.names = TRUE, row.names = TRUE)
}

Prepare capture site identification file. A txt file with one column of capture site ID (A, B, C, …, H).

require(stringr)
capture_site <- str_extract(anno_single$well, "[aA-zZ]+")
table(capture_site)

# Save capture_site to a txt file.
for (per_person in 1:length(unique(anno_single$individual))) {
  ii_person <- anno_single$individual == unique(anno_single$individual)[per_person]
  write.table(data.frame(site = capture_site[ii_person]), 
              file = paste("../data/capture-site-",
                           unique(anno_single$individual)[per_person], ".txt", sep = ""),
              quote = FALSE,
              col.names = FALSE, row.names = FALSE)
}

OEFinder

Upload molecules-single-cpm-(batch).txt and capture-site-(batch).txt to OEFinder Shiny GUI interface.

Output to singleCellSeq/data/OEFinder.

  • Run OEFinder
# Packages required to start OEFinder
library(shiny)
library(gdata)
library(shinyFiles)
library(EBSeq)

runGitHub("OEFinder", "lengning")

Session information

sessionInfo()
<<<<<<< HEAD
R version 3.2.1 (2015-06-18)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.10.4 (Yosemite)

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
=======
R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
>>>>>>> 62bc5a2ab71c7d07af0b00504bd53484166fd98b

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
<<<<<<< HEAD
[1] ggplot2_1.0.1 edgeR_3.10.3  limma_3.24.15 knitr_1.11   

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.1      digest_0.6.8     MASS_7.3-44      grid_3.2.1      
 [5] plyr_1.8.3       gtable_0.1.2     formatR_1.2.1    magrittr_1.5    
 [9] scales_0.3.0     evaluate_0.8     stringi_0.5-5    reshape2_1.4.1  
[13] rmarkdown_0.8    proto_0.3-10     tools_3.2.1      stringr_1.0.0   
[17] munsell_0.4.2    yaml_2.1.13      colorspace_1.2-6 htmltools_0.2.6 
======= [1] ggplot2_1.0.1 edgeR_3.10.2 limma_3.24.9 knitr_1.10.5 loaded via a namespace (and not attached): [1] Rcpp_0.12.0 magrittr_1.5 MASS_7.3-40 munsell_0.4.2 [5] colorspace_1.2-6 stringr_1.0.0 httr_0.6.1 plyr_1.8.3 [9] tools_3.2.0 grid_3.2.0 gtable_0.1.2 htmltools_0.2.6 [13] yaml_2.1.13 digest_0.6.8 reshape2_1.4.1 formatR_1.2 [17] bitops_1.0-6 RCurl_1.95-4.6 evaluate_0.7 rmarkdown_0.6.1 [21] stringi_0.4-1 scales_0.2.4 proto_0.3-10
>>>>>>> 62bc5a2ab71c7d07af0b00504bd53484166fd98b
<<<<<<< HEAD ======= >>>>>>> 62bc5a2ab71c7d07af0b00504bd53484166fd98b