Last updated: 2016-01-15
Code version: ab0cf1f917a4c115e9ee44403eb62e837e1795ae
library("dplyr")
library("ggplot2")
library("cowplot")
theme_set(theme_bw(base_size = 14))
Run for each single cell for each sequencing depth. The analysis is performed by detect-genes.R.
cd $ssd/lcl/full-lane
mkdir -p genes-detected
mkdir -p ~/log/detect-genes.R
CELLS=1
SEED=1
for WELL in A9E1 B2E2 B4H1 D2H2
do
for NUM in 200000 1000000 10000000 20000000 30000000 40000000 50000000
do
for GENE in ENSG ERCC
do
# Molecules
CMD="detect-genes.R $CELLS $SEED molecule-counts-$NUM.txt --min_count=1 --min_cells=1 --wells=$WELL --gene=$GENE"
DEST="genes-detected/molecule-$WELL-$CELLS-$SEED-$NUM-$GENE.txt"
echo "$CMD > $DEST" | qsub -l h_vmem=2g -cwd -V -N detect-molecule-$WELL-$CELLS-$SEED-$NUM-$GENE -j y -o ~/log/detect-genes.R -l 'hostname=!bigmem01'
sleep .01s
# Reads
CMD="detect-genes.R $CELLS $SEED read-counts-$NUM.txt --min_count=10 --min_cells=1 --wells=$WELL --gene=$GENE"
DEST="genes-detected/read-$WELL-$CELLS-$SEED-$NUM-$GENE.txt"
echo "$CMD > $DEST" | qsub -l h_vmem=2g -cwd -V -N detect-read-$WELL-$CELLS-$SEED-$NUM-$GENE -j y -o ~/log/detect-genes.R -l 'hostname=!bigmem01'
sleep .01s
done
done
done
Convert to one file using Python. Run from $ssd/lcl/full-lane
.
import os
import glob
files = glob.glob("genes-detected/*txt")
out = open("genes-detected.txt", "w")
out.write("type\twell\tdepth\tfeatures\tnum_cells\tseed\tgenes\tmean_counts\n")
for fname in files:
fname_parts = os.path.basename(fname).rstrip(".txt").split("-")
type = fname_parts[0]
well = fname_parts[1]
depth = fname_parts[4]
features = fname_parts[5]
f = open(fname, "r")
out.write(type + "\t" + well + "\t" + depth + "\t" + features + "\t" + f.read())
f.close()
out.close()
genes_data <- read.table("/mnt/gluster/data/internal_supp/singleCellSeq/lcl/full-lane/genes-detected.txt",
header = TRUE, sep = "\t", stringsAsFactors = FALSE)
genes_data$depth <- as.factor(genes_data$depth)
levels(genes_data$depth) <- c("0.2", "1", "10", "20", "30", "40", "50")
For the analysis of read counts, a gene was detected if it had greater than 10 reads. For the analysis of molecule counts, a gene was detected if it had greater than 1 molecule.
genes <- ggplot(genes_data[genes_data$features == "ENSG", ],
aes(x = as.factor(depth), y = genes)) +
geom_boxplot() +
geom_point(aes(color = as.factor(depth), shape = well)) +
facet_wrap(~type) +
labs(x = "Sequencing depth (millions)",
y = "Number of genes detected",
color = "Depth",
title = "Subsample: Number of genes detected")
genes
ercc <- genes %+% genes_data[genes_data$features == "ERCC", ]
ercc
Explore the effect of subsampling sequencing depth on the total count. Only includes counts of genes which had the minumum count (10 reads; 1 molecule).
p <- ggplot(genes_data[genes_data$features == "ENSG" &
genes_data$type == "read", ],
aes(x = as.factor(depth), y = mean_counts)) +
geom_boxplot() +
geom_point(aes(color = as.factor(depth), shape = well)) +
labs(x = "Sequencing depth (millions)",
y = "Total count",
color = "Depth",
title = "Subsample: Endogenous genes, reads")
reads_gene <- p
reads_gene
molecule_gene <- p %+% genes_data[genes_data$features == "ENSG" &
genes_data$type == "molecule", ] +
labs(y = "Total count",
title = "Subsample: Endogenous genes, molecules")
molecule_gene
read_ercc <- p %+% genes_data[genes_data$features == "ERCC" &
genes_data$type == "read", ] +
labs(y = "Total count",
title = "Subsample: ERCC, reads")
read_ercc
molecule_ercc <- p %+% genes_data[genes_data$features == "ERCC" &
genes_data$type == "molecule", ] +
labs(y = "Total count",
title = "Subsample: ERCC, molecules")
molecule_ercc
The cDNA concentrations are listed as followed:
theme_set(theme_bw(base_size = 12))
plot_grid(genes + theme(legend.position = c(.4,.35)),
ercc + labs(title = "Subsample: Number of ERCC detected") + theme(legend.position = "none"),
labels = LETTERS[1:2])
plot_grid(reads_gene + labs(title = "Genes-reads") + theme(legend.position = "none"),
molecule_gene + labs(title = "Genes-molecules") + theme(legend.position = "none"),
read_ercc + labs(title = "ERCC-reads") + theme(legend.position = "none"),
molecule_ercc + labs(title = "ERCC-molecules") + theme(legend.position = "none"),
nrow = 1,
labels = LETTERS[3:6])
sessionInfo()
R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] cowplot_0.3.1 ggplot2_1.0.1 dplyr_0.4.2 knitr_1.10.5
loaded via a namespace (and not attached):
[1] Rcpp_0.12.0 magrittr_1.5 MASS_7.3-40 munsell_0.4.2
[5] colorspace_1.2-6 R6_2.1.1 stringr_1.0.0 httr_0.6.1
[9] plyr_1.8.3 tools_3.2.0 parallel_3.2.0 grid_3.2.0
[13] gtable_0.1.2 DBI_0.3.1 htmltools_0.2.6 yaml_2.1.13
[17] assertthat_0.1 digest_0.6.8 reshape2_1.4.1 formatR_1.2
[21] bitops_1.0-6 RCurl_1.95-4.6 evaluate_0.7 rmarkdown_0.6.1
[25] labeling_0.3 stringi_0.4-1 scales_0.2.4 proto_0.3-10