<<<<<<< HEAD ======= >>>>>>> 62bc5a2ab71c7d07af0b00504bd53484166fd98b <<<<<<< HEAD
<<<<<<< HEAD ======= >>>>>>> 62bc5a2ab71c7d07af0b00504bd53484166fd98b <<<<<<< HEAD

Last updated: 2015-09-23

Code version: 89227f632516ff3ca6a451e84fde3fedee9c7b8a

=======

Last updated: 2015-09-28

Code version: 3f442a23f405d3564d9ab3197ec337df22fe9383

>>>>>>> 62bc5a2ab71c7d07af0b00504bd53484166fd98b

Comparing the conversion of reads to molecules for each cell. Used three different metrics:

Input

library("dplyr")
library("ggplot2")
theme_set(theme_bw(base_size = 16))
library("edgeR")
source("functions.R")
library("tidyr")

Input annotation.

anno <- read.table("../data/annotation.txt", header = TRUE,
                   stringsAsFactors = FALSE)
head(anno)
  individual batch well     sample_id
1      19098     1  A01 NA19098.1.A01
2      19098     1  A02 NA19098.1.A02
3      19098     1  A03 NA19098.1.A03
4      19098     1  A04 NA19098.1.A04
5      19098     1  A05 NA19098.1.A05
6      19098     1  A06 NA19098.1.A06

Input read counts.

reads <- read.table("../data/reads.txt", header = TRUE,
                    stringsAsFactors = FALSE)

Input molecule counts.

molecules <- read.table("../data/molecules.txt", header = TRUE,
                    stringsAsFactors = FALSE)

Input list of quality single cells.

quality_single_cells <- scan("../data/quality-single-cells.txt",
                             what = "character")

Keep only the single cells that passed the QC filters.

reads <- reads[, colnames(reads) %in% quality_single_cells]
molecules <- molecules[, colnames(molecules) %in% quality_single_cells]
anno <- anno[anno$sample_id %in% quality_single_cells, ]
stopifnot(dim(reads) == dim(molecules),
          nrow(anno) == ncol(reads))

Distribution of fold change to mean

Look at the distribution of fold change to mean. As being reported by others, the lowly expressed genes show divergent read and molecule counts

## calculate mean
reads_mean     <- apply(reads, 1, mean)
molecules_mean <- apply(molecules, 1, mean)
distribution <- data.frame(reads_mean, molecules_mean)

## calculate fold change to mean
distribution$fold_change_read     <- log2(reads_mean/mean(reads_mean))
distribution$fold_change_molecule <- log2(molecules_mean/mean(molecules_mean))

## select ERCC
distribution$ERCC <- grepl("ERCC", rownames(distribution))

## color palette
cbPalette <- c("#999999", "#0000FF", "#990033", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#009E73")

ggplot(distribution, aes(x = fold_change_molecule, y = fold_change_read, col = ERCC)) + geom_point(size = 3, alpha = 0.5) + scale_colour_manual(values=cbPalette) + stat_function(fun= function(x) {x}, col= "#56B4E9") + labs(x = "log2 fold change to mean (molecule)", y =  "log2 fold change to mean (reads)")
<<<<<<< HEAD

=======

>>>>>>> 62bc5a2ab71c7d07af0b00504bd53484166fd98b

Transformation

In addition to comparing the raw counts of reads and molecules, we compare the log2 counts and the log counts per million.

For the log counts, I add a pseudocount of 1.

reads_log <- log2(reads + 1)
molecules_log <- log2(molecules + 1)

standardized by cmp and log-transform

reads_cpm <- cpm(reads, log = FALSE)
molecules_cpm <- cpm(molecules, log = FALSE)

Calculate cpm for the reads data using TMM-normalization.

norm_factors_reads <- calcNormFactors(reads, method = "TMM")
reads_tmm <- cpm(reads, lib.size = colSums(reads) * norm_factors_reads,
                 log = TRUE)

And for the molecules.

norm_factors_mol <- calcNormFactors(molecules, method = "TMM")
molecules_tmm <- cpm(molecules, lib.size = colSums(molecules) * norm_factors_mol,
                     log = TRUE)

conversion in each single cell

Counts

<<<<<<< HEAD

Compare the counts. (1) all genes. (2) only ERCC. (3) without ERCC, only endogenous genes

All genes

## linear regression per cell and make a table with intercept, slope, and r-squared
regression_table <- as.data.frame(do.call(rbind,lapply(names(reads),function(x){
  fit.temp <- lm(molecules[,x]~reads[,x])
    c(x,fit.temp$coefficients,summary(fit.temp)$adj.r.squared)
})))
names(regression_table) <- c("sample_id","Intercept","slope","r2")
regression_table$Intercept <- as.numeric(as.character(regression_table$Intercept))
regression_table$slope <- as.numeric(as.character(regression_table$slope))
regression_table$r2 <- as.numeric(as.character(regression_table$r2))
plot(regression_table$r2)

anno_regression <- merge(anno,regression_table,by="sample_id")

ggplot(anno_regression,aes(x=Intercept,y=slope,col=as.factor(individual),shape=as.factor(batch))) + geom_point() + labs(x = "intercept", y = "slope", title = "read-molecule conversion (counts)")