Last updated: 2018-07-01
Code version: 5eab055
This analysis explores the total counts of reads and molecules mapped to each source (human, fly, worm, and ERCC).
library("cowplot")
library("dplyr")
library("ggplot2")
library("tidyr")
theme_set(theme_cowplot())
source("../code/functions.R")
library("Biobase")
Import data.
eset <- readRDS("../data/eset.rds")
anno <- pData(eset)
Remove samples with bad cell number or TRA-1-60.
eset_qc <- eset[, anno$cell_number == 1 & anno$tra1.60]
anno_qc <- pData(eset_qc)
Extract the columns relevant for this analysis.
totals <- anno_qc %>% select(experiment:well, raw:mol_hs)
totals_long <- totals %>%
gather(key = "type", value = "count", raw:mol_hs) %>%
filter(type %in% c("raw", "umi", "mapped", "molecules")) %>%
mutate(type = factor(type, levels = c("raw", "umi", "mapped", "molecules"),
labels = c("Raw reads", "Reads with UMI",
"Reads mapped\nto genome",
"Molecules mapped\nto genome")))
head(totals_long)
experiment well type count
1 02192018 A01 Raw reads 4755105
2 02192018 A02 Raw reads 1955271
3 02192018 A03 Raw reads 4876011
4 02192018 A04 Raw reads 6770341
5 02192018 A05 Raw reads 8782029
6 02192018 A06 Raw reads 7043643
ggplot(totals_long, aes(x = type, y = count)) +
geom_boxplot() +
labs(x = "Classification", y = "Number of sequences",
title = "Processing of raw reads")
Category | mean | sd | % of raw (mean) |
---|---|---|---|
Raw reads | 5.023019510^{6} | 2.253564210^{6} | 1.00 |
Reads with UMI | 3.309334110^{6} | 1.518695810^{6} | 0.65 |
Reads mapped to genome | 2.33680210^{6} | 1.119883110^{6} | 0.46 |
Molecules mapped to genome | 1.183215110^{5} | 5.3484410^{4} | 0.03 |
mapped <- totals_long %>%
filter(type == "Reads mapped\nto genome")
ggplot(mapped, aes(x = experiment, y = count)) +
geom_boxplot() +
labs(x = "C1 chip", y = "Number of mapped reads",
title = "Mapped reads per C1 chip") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
mapped_sum <- mapped %>%
group_by(experiment) %>%
summarize(total = sum(count))
ggplot(mapped_sum, aes(x = experiment, y = total)) +
geom_bar(stat= "identity") +
labs(x = "C1 chip", y = "Total number of mapped reads",
title = "Total mapped reads per C1 chip") +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
Sorted by total number of mapped reads (in millions).
mapped_sum %>%
arrange(total) %>%
mutate(total = total / 10^6) %>%
knitr::kable()
experiment | total |
---|---|
12042017 | 159.2927 |
10022017 | 172.9970 |
12112017 | 174.3949 |
10062017 | 174.9899 |
11272017 | 177.3012 |
11152017 | 180.4016 |
11202017 | 180.6296 |
02242018 | 181.0613 |
09272017 | 181.2822 |
02282018 | 181.2910 |
10172017 | 181.3652 |
12132017 | 182.2084 |
12062017 | 182.9707 |
03012018 | 183.6841 |
12082017 | 184.4322 |
03172017 | 184.7017 |
11162017 | 185.1417 |
03162017 | 185.2525 |
09262017 | 185.5391 |
02262018 | 185.6744 |
02202018 | 186.4867 |
11302017 | 186.8379 |
11142017 | 187.3253 |
11172017 | 189.3749 |
02212018 | 190.5448 |
11292017 | 191.7120 |
11022017 | 191.8474 |
08222017 | 192.2871 |
08162017 | 192.5859 |
11102017 | 193.6744 |
04202017 | 194.0535 |
12122017 | 195.1131 |
10092017 | 195.7672 |
09252017 | 195.9887 |
11032017 | 196.2268 |
08112017 | 197.4141 |
11212017 | 197.6497 |
10302017 | 198.0889 |
10052017 | 200.7256 |
03072018 | 201.6887 |
11282017 | 202.5115 |
04072017 | 202.9047 |
11092017 | 204.0547 |
08182017 | 204.4532 |
04142017 | 204.9052 |
08102017 | 205.5988 |
12142017 | 205.7184 |
10042017 | 207.0857 |
10102017 | 208.4722 |
08152017 | 208.7694 |
12052017 | 209.0949 |
03312017 | 210.1042 |
08282017 | 210.1087 |
02222018 | 211.0495 |
11082017 | 211.7339 |
04132017 | 212.3782 |
08142017 | 215.3691 |
04052017 | 215.4493 |
08212017 | 217.3511 |
03302017 | 219.8773 |
08302017 | 219.9493 |
02192018 | 226.0620 |
03052018 | 227.0483 |
11072017 | 230.4075 |
02272018 | 233.8705 |
10112017 | 235.2866 |
11132017 | 238.0722 |
08242017 | 239.1506 |
12072017 | 239.5364 |
10162017 | 241.1949 |
08312017 | 241.8290 |
03062018 | 241.9799 |
11062017 | 244.1415 |
03232017 | 245.7061 |
10132017 | 248.5827 |
12012017 | 257.0711 |
08232017 | 258.9153 |
08292017 | 260.7327 |
10122017 | 269.9484 |
convert_all <- ggplot(totals, aes(x = mapped, y = molecules)) +
geom_point() +
labs(x = "Number of reads mapped to any genome",
y = "Number of molecules mapped to any genome",
title = "Conversion of reads to molecules")
convert_hs <- ggplot(totals, aes(x = reads_hs, y = mol_hs)) +
geom_point() +
labs(x = "Number of reads mapped to human genome",
y = "Number of molecules mapped to human genome",
title = "Conversion of reads to molecules for human")
convert_dm <- ggplot(totals, aes(x = reads_dm, y = mol_dm)) +
geom_point() +
labs(x = "Number of reads mapped to fly genome",
y = "Number of molecules mapped to fly genome",
title = "Conversion of reads to molecules for fly")
convert_ce <- ggplot(totals, aes(x = reads_ce, y = mol_ce)) +
geom_point() +
labs(x = "Number of reads mapped to worm genome",
y = "Number of molecules mapped to worm genome",
title = "Conversion of reads to molecules for worm")
plot_grid(convert_all, convert_hs, convert_dm, convert_ce,
labels = letters[1:4])
p_reads_dm <- ggplot(anno_qc, aes(x = as.factor(fly), y = reads_dm / mapped * 100)) +
geom_boxplot() +
labs(x = "Fly spike-in concentration",
y = "Percentage of mapped reads that mapped to fly genome")
p_mol_dm <- ggplot(anno_qc, aes(x = as.factor(fly), y = mol_dm / molecules * 100)) +
geom_boxplot() +
labs(x = "Fly spike-in concentration",
y = "Percentage of mapped molecules that mapped to fly genome")
plot_grid(p_reads_dm, p_mol_dm, labels = letters[1:2])
p_reads_dm_counts <- ggplot(anno_qc, aes(x = as.factor(fly), y = reads_dm)) +
geom_boxplot() +
labs(x = "Fly spike-in concentration",
y = "Number of reads that mapped to fly genome")
p_mol_dm_counts <- ggplot(anno_qc, aes(x = as.factor(fly), y = mol_dm)) +
geom_boxplot() +
labs(x = "Fly spike-in concentration",
y = "Number of molecules that mapped to fly genome")
plot_grid(p_reads_dm_counts, p_mol_dm_counts, labels = letters[1:2])
totals_percent <- anno_qc %>%
mutate(sample = paste(experiment, well, sep = "-"),
sample = factor(sample, levels = sample[order(reads_dm / raw)])) %>%
arrange(reads_dm / raw) %>%
group_by(experiment, sample, fly, worm, ERCC) %>%
summarize(p_reads_ce = reads_ce / raw * 100,
p_reads_dm = reads_dm / raw * 100,
p_reads_ercc = reads_ercc / raw * 100,
p_reads_hs = reads_hs / raw * 100,
p_unmapped = unmapped / raw * 100,
p_no_umi = (raw - unmapped - mapped) / raw * 100) %>%
gather(key = "type", value = "percent", p_reads_ce:p_no_umi) %>%
ungroup() %>%
mutate(type = factor(type, levels = c("p_no_umi", "p_unmapped", "p_reads_hs",
"p_reads_ercc", "p_reads_ce", "p_reads_dm"),
labels = c("No UMI", "Unmapped", "H. sapiens", "ERCC",
"C. elegans", "D. melanogaster")),
fly = factor(fly, levels = c(5000, 50000), labels = c("5 pg fly spike-on",
"50 pg fly spike-on")))
ggplot(totals_percent, aes(x = sample, y = percent, fill = type)) +
geom_col() +
facet_wrap(~fly) +
labs(x = "Sample", y = "Percent", title = "Read classification per sample") +
theme(legend.title = element_blank())
p_reads_ce <- ggplot(anno_qc, aes(x = as.factor(worm), y = reads_ce / mapped * 100)) +
geom_boxplot() +
labs(x = "Worm spike-in concentration",
y = "Percentage of mapped reads that mapped to worm genome")
p_mol_ce <- ggplot(anno_qc, aes(x = as.factor(worm), y = mol_ce / molecules * 100)) +
geom_boxplot() +
labs(x = "Worm spike-in concentration",
y = "Percentage of mapped molecules that mapped to worm genome")
plot_grid(p_reads_ce, p_mol_ce, labels = letters[1:2])
p_reads_ce_counts <- ggplot(anno_qc, aes(x = as.factor(worm), y = reads_ce)) +
geom_boxplot() +
labs(x = "Worm spike-in concentration",
y = "Number of mapped reads that mapped to worm genome")
p_mol_ce_counts <- ggplot(anno_qc, aes(x = as.factor(worm), y = mol_ce)) +
geom_boxplot() +
labs(x = "Worm spike-in concentration",
y = "Number of mapped molecules that mapped to worm genome")
plot_grid(p_reads_ce_counts, p_mol_ce_counts, labels = letters[1:2])
p_reads_ercc <- ggplot(anno_qc, aes(x = as.factor(ERCC), y = reads_ercc / mapped * 100)) +
geom_boxplot() +
labs(x = "ERCC spike-in concentration",
y = "Percentage of mapped reads that mapped to ERCCs")
p_mol_ercc <- ggplot(anno_qc, aes(x = as.factor(ERCC), y = mol_ercc / molecules * 100)) +
geom_boxplot() +
labs(x = "ERCC spike-in concentration",
y = "Percentage of mapped molecules that mapped to ERCCs")
plot_grid(p_reads_ercc, p_mol_ercc, labels = letters[1:2])
p_reads_ercc_counts <- ggplot(anno_qc, aes(x = as.factor(ERCC), y = reads_ercc)) +
geom_boxplot() +
labs(x = "ERCC spike-in concentration",
y = "Number of reads that mapped to ERCCs")
p_mol_ercc_counts <- ggplot(anno_qc, aes(x = as.factor(ERCC), y = mol_ercc)) +
geom_boxplot() +
labs(x = "ERCC spike-in concentration",
y = "Number of molecules that mapped to ERCCs")
plot_grid(p_reads_ercc_counts, p_mol_ercc_counts, labels = letters[1:2])
sessionInfo()
R version 3.4.1 (2017-06-30)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Scientific Linux 7.4 (Nitrogen)
Matrix products: default
BLAS: /project2/gilad/jdblischak/miniconda3/envs/scqtl/lib/R/lib/libRblas.so
LAPACK: /project2/gilad/jdblischak/miniconda3/envs/scqtl/lib/R/lib/libRlapack.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel methods stats graphics grDevices utils datasets
[8] base
other attached packages:
[1] bindrcpp_0.2 Biobase_2.38.0 BiocGenerics_0.24.0
[4] tidyr_0.7.1 dplyr_0.7.4 cowplot_0.9.1
[7] ggplot2_2.2.1
loaded via a namespace (and not attached):
[1] Rcpp_0.12.13 compiler_3.4.1 git2r_0.19.0 plyr_1.8.4
[5] highr_0.6 bindr_0.1 tools_3.4.1 digest_0.6.12
[9] evaluate_0.10.1 tibble_1.3.3 gtable_0.2.0 pkgconfig_2.0.1
[13] rlang_0.1.2 yaml_2.1.14 stringr_1.2.0 knitr_1.20
[17] rprojroot_1.2 grid_3.4.1 tidyselect_0.2.3 glue_1.1.1
[21] R6_2.2.0 rmarkdown_1.8 purrr_0.2.2 magrittr_1.5
[25] backports_1.0.5 scales_0.5.0 htmltools_0.3.6 assertthat_0.1
[29] colorspace_1.3-2 labeling_0.3 stringi_1.1.2 lazyeval_0.2.0
[33] munsell_0.4.3
This R Markdown site was created with workflowr