Last updated: 2016-04-24
Code version: 74f30e4e4b6c6fea3667336da91b01fb5603564f
During the processing pipeline, the number of reads in a file are counted and saved in a separate text files. The script gather-total-counts.py compiles all these counts and extracts the relevant variables from the filename.
library("dplyr")
library("ggplot2")
theme_set(theme_bw(base_size = 16))
theme_update(panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_blank())
counts <- read.table("../data/total-counts.txt", header = TRUE, sep = "\t",
stringsAsFactors = TRUE)
head(counts)
stage sequences combined individual replicate well index lane
1 raw reads NA NA19098 r1 A01 ATTAGACG L002
2 raw reads NA NA19098 r1 A01 ATTAGACG L003
3 raw reads NA NA19098 r1 A01 ATTAGACG L006
4 raw reads NA NA19098 r1 A02 ACGAGTCT L002
5 raw reads NA NA19098 r1 A02 ACGAGTCT L003
6 raw reads NA NA19098 r1 A02 ACGAGTCT L006
flow_cell counts
1 C6WYKACXX 1986825
2 C6WURACXX 1785837
3 C723YACXX 1739622
4 C6WYKACXX 2019671
5 C6WURACXX 1809239
6 C723YACXX 1762255
str(counts)
'data.frame': 20232 obs. of 10 variables:
$ stage : Factor w/ 5 levels "mapped to exons",..: 4 4 4 4 4 4 4 4 4 4 ...
$ sequences : Factor w/ 2 levels "molecules","reads": 2 2 2 2 2 2 2 2 2 2 ...
$ combined : logi NA NA NA NA NA NA ...
$ individual: Factor w/ 3 levels "NA19098","NA19101",..: 1 1 1 1 1 1 1 1 1 1 ...
$ replicate : Factor w/ 3 levels "r1","r2","r3": 1 1 1 1 1 1 1 1 1 1 ...
$ well : Factor w/ 97 levels "A01","A02","A03",..: 1 1 1 2 2 2 3 3 3 4 ...
$ index : Factor w/ 96 levels "AAAGGTCC","AACCCATG",..: 23 23 23 13 13 13 85 85 85 68 ...
$ lane : Factor w/ 8 levels "L001","L002",..: 2 3 6 2 3 6 2 3 6 2 ...
$ flow_cell : Factor w/ 4 levels "C6WURACXX","C6WYKACXX",..: 2 1 3 2 1 3 2 1 3 2 ...
$ counts : int 1986825 1785837 1739622 2019671 1809239 1762255 1191181 1056347 989324 1601993 ...
# Order the processing steps
counts$stage <- factor(counts$stage,
levels = c("raw", "valid UMI", "quality trimmed",
"mapped to genome", "mapped to exons"))
# Make new variable to separate bulk and single cell samples
counts$type <- ifelse(counts$well == "bulk", "bulk", "single")
counts$type <- factor(counts$type, levels = c("bulk", "single"))
# Scale to thousands and millions of counts
counts$counts_thousands <- counts$counts / 10^3
counts$counts_mil <- counts$counts / 10^6
summary(counts)
stage sequences combined individual
raw :2664 molecules: 6912 Mode :logical NA19098:6744
valid UMI :2664 reads :13320 FALSE:7848 NA19101:6744
quality trimmed :2664 TRUE :1728 NA19239:6744
mapped to genome:6120 NA's :10656
mapped to exons :6120
replicate well index lane
r1:6744 bulk : 360 AAGTCTTC: 209 L002 :2688
r2:6744 A01 : 207 ACACAACT: 209 L003 :2688
r3:6744 A02 : 207 ACAGCGAA: 209 L004 :2688
A03 : 207 AGTCCACC: 209 L006 :2688
A04 : 207 CACGGTGT: 209 L001 :2106
A05 : 207 (Other) :17459 (Other):5646
(Other):18837 NA's : 1728 NA's :1728
flow_cell counts type counts_thousands
C6WURACXX:4794 Min. : 2495 bulk : 360 Min. : 2.495
C6WYKACXX:4794 1st Qu.: 84396 single:19872 1st Qu.: 84.396
C723YACXX:4794 Median : 822508 Median : 822.509
C72JMACXX:4122 Mean : 1002821 Mean : 1002.821
NA's :1728 3rd Qu.: 1578909 3rd Qu.: 1578.909
Max. :15126607 Max. :15126.607
counts_mil
Min. : 0.002495
1st Qu.: 0.084396
Median : 0.822508
Mean : 1.002821
3rd Qu.: 1.578909
Max. :15.126607
counts_per_lane <- counts %>% filter(!combined | is.na(combined),
sequences == "reads")
stopifnot(table(counts_per_lane$stage) == 2664)
Plot the number of reads per lane at each processing stage faceted by bulk versus single cell sequencing.
ggplot(counts_per_lane,
aes(x = stage, y = counts_mil)) +
geom_boxplot() +
facet_wrap(~type) +
scale_y_continuous(breaks = seq(0, 16, 2)) +
labs(x = "Processing stage", y = "Number of reads (millions)",
title = "Total number of reads per lane at each stage") +
theme(axis.text.x = element_text(angle = 45, hjust = 0.9, vjust = 0.75))
Plot the number of raw reads per lane for each replicate of each individual faceted by bulk versus single cell sequencing.
ggplot(counts_per_lane[counts$stage == "raw", ],
aes(x = individual, y = counts_mil, fill = replicate)) +
geom_boxplot() +
facet_wrap(~type) +
scale_y_continuous(breaks = seq(0, 16, 2)) +
labs(x = "Individual", y = "Number of raw reads (millions)",
title = "Total number of raw reads per lane for each replicate") +
theme(axis.text.x = element_text(angle = 45, hjust = 0.9, vjust = 0.75))
Plot the number of raw reads per lane for each lane of each flow cell faceted by bulk versus single cell sequencing.
ggplot(counts_per_lane[counts$stage == "raw", ],
aes(x = flow_cell, y = counts_mil, fill = lane)) +
geom_boxplot() +
facet_wrap(~type) +
scale_y_continuous(breaks = seq(0, 16, 2)) +
labs(x = "Flow cell", y = "Number of raw reads (millions)",
title = "Total number of raw reads per lane for each lane") +
theme(axis.text.x = element_text(angle = 45, hjust = 0.9, vjust = 0.75))
Plot the number of raw reads per lane for each index faceted by bulk versus single cell sequencing.
ggplot(counts_per_lane[counts$stage == "raw", ],
aes(x = index, y = counts_mil)) +
geom_boxplot() +
facet_wrap(~type) +
scale_y_continuous(breaks = seq(0, 16, 2)) +
labs(x = "Index", y = "Number of raw reads (millions)",
title = "Total number of raw reads per lane for each index") +
theme(axis.text.x = element_blank())
counts_per_sample <- counts_per_lane %>%
group_by(stage, individual, replicate, well, type) %>%
summarize(counts_mil = sum(counts) / 10^6)
stopifnot(table(counts_per_sample$stage) == 864 + 9)
Plot the number of reads per sample at each processing stage faceted by bulk versus single cell sequencing.
ggplot(counts_per_sample,
aes(x = stage, y = counts_mil)) +
geom_boxplot() +
facet_wrap(~type) +
scale_y_continuous(breaks = seq(0, 100, 10)) +
labs(x = "Processing stage", y = "Number of reads (millions)",
title = "Total number of reads per sample at each stage") +
theme(axis.text.x = element_text(angle = 45, hjust = 0.9, vjust = 0.75))
Plot the number of raw reads per sample for each replicate of each individual faceted by bulk versus single cell sequencing.
ggplot(counts_per_sample[counts$stage == "raw", ],
aes(x = individual, y = counts_mil, fill = replicate)) +
geom_boxplot() +
facet_wrap(~type) +
scale_y_continuous(breaks = seq(0, 100, 10)) +
labs(x = "Individual", y = "Number of raw reads (millions)",
title = "Total number of raw reads per sample for each replicate") +
theme(axis.text.x = element_text(angle = 45, hjust = 0.9, vjust = 0.75))
molecules_per_lane <- counts %>% filter(!combined | is.na(combined),
sequences == "molecules",
stage %in% c("mapped to genome",
"mapped to exons"))
molecules_per_lane <- droplevels(molecules_per_lane)
stopifnot(table(molecules_per_lane$stage) == 96 * 3 * 3 * 3)
Plot the number of molecules per lane at each processing stage.
ggplot(molecules_per_lane,
aes(x = stage, y = counts_thousands)) +
geom_boxplot() +
scale_y_continuous(breaks = seq(0, 300, 50)) +
labs(x = "Processing stage", y = "Number of molecules (thousands)",
title = "Total number of molecules per lane at each stage") +
theme(axis.text.x = element_text(angle = 45, hjust = 0.9, vjust = 0.75))
Plot the number of mapped molecules per lane for each replicate of each individual.
ggplot(molecules_per_lane[molecules_per_lane$stage == "mapped to genome", ],
aes(x = individual, y = counts_thousands, fill = replicate)) +
geom_boxplot() +
scale_y_continuous(breaks = seq(0, 300, 50)) +
labs(x = "Individual", y = "Number of mapped molecules (thousands)",
title = "Total number of mapped molecules per lane for each replicate") +
theme(axis.text.x = element_text(angle = 45, hjust = 0.9, vjust = 0.75))
Plot the number of mapped molecules per lane for each lane of each flow cell faceted by bulk versus single cell sequencing.
ggplot(molecules_per_lane[molecules_per_lane$stage == "mapped to genome", ],
aes(x = flow_cell, y = counts_thousands, fill = lane)) +
geom_boxplot() +
scale_y_continuous(breaks = seq(0, 300, 50)) +
labs(x = "Flow cell", y = "Number of mapped molecules (thousands)",
title = "Total number of mapped molecules per lane for each lane") +
theme(axis.text.x = element_text(angle = 45, hjust = 0.9, vjust = 0.75))
Plot the number of mapped molecules per lane for each index faceted by bulk versus single cell sequencing.
ggplot(molecules_per_lane[molecules_per_lane$stage == "mapped to genome", ],
aes(x = index, y = counts_thousands)) +
geom_boxplot() +
scale_y_continuous(breaks = seq(0, 500, 50)) +
labs(x = "Index", y = "Number of mapped molecules (thousands)",
title = "Total number of mapped molecules per lane for each index") +
theme(axis.text.x = element_blank())
molecules_per_sample <- counts %>% filter(combined == TRUE,
stage %in% c("mapped to genome",
"mapped to exons"))
molecules_per_sample <- droplevels(molecules_per_sample)
stopifnot(molecules_per_sample$sequences == "molecules",
table(molecules_per_sample$stage) == 864)
Plot the number of molecules per sample at each processing stage.
ggplot(molecules_per_sample,
aes(x = stage, y = counts_thousands)) +
geom_boxplot() +
scale_y_continuous(breaks = seq(0, 400, 50)) +
labs(x = "Processing stage", y = "Number of molecules (thousands)",
title = "Total number of molecules per sample at each stage") +
theme(axis.text.x = element_text(angle = 45, hjust = 0.9, vjust = 0.75))
Plot the number of mapped molecules per sample for each replicate of each individual.
ggplot(molecules_per_sample[molecules_per_sample$stage == "mapped to genome", ],
aes(x = individual, y = counts_thousands, fill = replicate)) +
geom_boxplot() +
scale_y_continuous(breaks = seq(0, 400, 50)) +
labs(x = "Individual", y = "Number of mapped molecules (thousands)",
title = "Total number of mapped molecules per sample for each replicate") +
theme(axis.text.x = element_text(angle = 45, hjust = 0.9, vjust = 0.75))
These summary statistics include low quality cells that are removed before downstream analyses.
depth_stats <- counts %>%
filter(stage == "raw", well != "bulk") %>%
group_by(individual, replicate, well) %>%
summarize(counts_per_sample = sum(counts)) %>%
ungroup() %>%
summarize(mean = mean(counts_per_sample), sd = sd(counts_per_sample),
min = min(counts_per_sample), max = max(counts_per_sample))
We obtained an average of 6.28 +/- 2.11 million raw sequencing reads per sample (range 0.361-11.2 million reads).
exons_reads_stats <- counts %>%
filter(stage == "mapped to exons", well != "bulk", sequences == "reads") %>%
group_by(individual, replicate, well) %>%
summarize(counts_per_sample = sum(counts)) %>%
ungroup() %>%
summarize(mean = mean(counts_per_sample), sd = sd(counts_per_sample),
min = min(counts_per_sample), max = max(counts_per_sample))
We obtained an average of 2.1 +/- 0.879 million sequencing reads per sample that mapped to protein-coding exons (range 0.0552-4.33 million reads).
exons_molecules_stats <- counts %>%
filter(stage == "mapped to exons", well != "bulk", sequences == "molecules",
combined) %>%
group_by(individual, replicate, well) %>%
summarize(counts_per_sample = sum(counts)) %>%
ungroup() %>%
summarize(mean = mean(counts_per_sample), sd = sd(counts_per_sample),
min = min(counts_per_sample), max = max(counts_per_sample))
We obtained an average of 68.1 +/- 28.2 thousand molecules per sample that mapped to protein-coding exons (range 6.03-235 thousand molecules).
sessionInfo()
R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggplot2_1.0.1 dplyr_0.4.2 knitr_1.10.5
loaded via a namespace (and not attached):
[1] Rcpp_0.12.0 magrittr_1.5 MASS_7.3-40 munsell_0.4.2
[5] colorspace_1.2-6 R6_2.1.1 stringr_1.0.0 httr_0.6.1
[9] plyr_1.8.3 tools_3.2.0 parallel_3.2.0 grid_3.2.0
[13] gtable_0.1.2 DBI_0.3.1 htmltools_0.2.6 lazyeval_0.1.10
[17] yaml_2.1.13 assertthat_0.1 digest_0.6.8 reshape2_1.4.1
[21] formatR_1.2 bitops_1.0-6 RCurl_1.95-4.6 evaluate_0.7
[25] rmarkdown_0.6.1 stringi_0.4-1 scales_0.2.4 proto_0.3-10