Analysis of totals counts

Last updated: 2018-07-01

Code version: 5eab055

This analysis explores the total counts of reads and molecules mapped to each source (human, fly, worm, and ERCC).

Setup

library("cowplot")
library("dplyr")
library("ggplot2")
library("tidyr")
theme_set(theme_cowplot())
source("../code/functions.R")
library("Biobase")

Import data.

eset <- readRDS("../data/eset.rds")
anno <- pData(eset)

Remove samples with bad cell number or TRA-1-60.

eset_qc <- eset[, anno$cell_number == 1 & anno$tra1.60]
anno_qc <- pData(eset_qc)

Extract the columns relevant for this analysis.

totals <- anno_qc %>% select(experiment:well, raw:mol_hs)

What percentage of reads are mapped to the genome?

totals_long <- totals %>%
  gather(key = "type", value = "count", raw:mol_hs) %>%
  filter(type %in% c("raw", "umi", "mapped", "molecules")) %>%
  mutate(type = factor(type, levels = c("raw", "umi", "mapped", "molecules"),
                       labels = c("Raw reads", "Reads with UMI",
                                  "Reads mapped\nto genome",
                                  "Molecules mapped\nto genome")))
head(totals_long)

  experiment well      type   count
1   02192018  A01 Raw reads 4755105
2   02192018  A02 Raw reads 1955271
3   02192018  A03 Raw reads 4876011
4   02192018  A04 Raw reads 6770341
5   02192018  A05 Raw reads 8782029
6   02192018  A06 Raw reads 7043643

ggplot(totals_long, aes(x = type, y = count)) +
  geom_boxplot() +
  labs(x = "Classification", y = "Number of sequences",
       title = "Processing of raw reads")

Category	mean	sd	% of raw (mean)
Raw reads	5.023019510^{6}	2.253564210^{6}	1.00
Reads with UMI	3.309334110^{6}	1.518695810^{6}	0.65
Reads mapped to genome	2.33680210^{6}	1.119883110^{6}	0.46
Molecules mapped to genome	1.183215110^{5}	5.3484410^{4}	0.03

How does the number of mapped reads vary by C1 chip?

mapped <- totals_long %>%
  filter(type == "Reads mapped\nto genome")
ggplot(mapped, aes(x = experiment, y = count)) +
  geom_boxplot() +
  labs(x = "C1 chip", y = "Number of mapped reads",
       title = "Mapped reads per C1 chip") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))

mapped_sum <- mapped %>%
  group_by(experiment) %>%
  summarize(total = sum(count))
ggplot(mapped_sum, aes(x = experiment, y = total)) +
  geom_bar(stat= "identity") +
  labs(x = "C1 chip", y = "Total number of mapped reads",
       title = "Total mapped reads per C1 chip") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))

Sorted by total number of mapped reads (in millions).

mapped_sum %>%
  arrange(total) %>%
  mutate(total = total / 10^6) %>%
  knitr::kable()

experiment	total
12042017	159.2927
10022017	172.9970
12112017	174.3949
10062017	174.9899
11272017	177.3012
11152017	180.4016
11202017	180.6296
02242018	181.0613
09272017	181.2822
02282018	181.2910
10172017	181.3652
12132017	182.2084
12062017	182.9707
03012018	183.6841
12082017	184.4322
03172017	184.7017
11162017	185.1417
03162017	185.2525
09262017	185.5391
02262018	185.6744
02202018	186.4867
11302017	186.8379
11142017	187.3253
11172017	189.3749
02212018	190.5448
11292017	191.7120
11022017	191.8474
08222017	192.2871
08162017	192.5859
11102017	193.6744
04202017	194.0535
12122017	195.1131
10092017	195.7672
09252017	195.9887
11032017	196.2268
08112017	197.4141
11212017	197.6497
10302017	198.0889
10052017	200.7256
03072018	201.6887
11282017	202.5115
04072017	202.9047
11092017	204.0547
08182017	204.4532
04142017	204.9052
08102017	205.5988
12142017	205.7184
10042017	207.0857
10102017	208.4722
08152017	208.7694
12052017	209.0949
03312017	210.1042
08282017	210.1087
02222018	211.0495
11082017	211.7339
04132017	212.3782
08142017	215.3691
04052017	215.4493
08212017	217.3511
03302017	219.8773
08302017	219.9493
02192018	226.0620
03052018	227.0483
11072017	230.4075
02272018	233.8705
10112017	235.2866
11132017	238.0722
08242017	239.1506
12072017	239.5364
10162017	241.1949
08312017	241.8290
03062018	241.9799
11062017	244.1415
03232017	245.7061
10132017	248.5827
12012017	257.0711
08232017	258.9153
08292017	260.7327
10122017	269.9484

What is the conversion of reads to molecules?

convert_all <- ggplot(totals, aes(x = mapped, y = molecules)) +
  geom_point() +
  labs(x = "Number of reads mapped to any genome",
       y = "Number of molecules mapped to any genome",
       title = "Conversion of reads to molecules")
convert_hs <- ggplot(totals, aes(x = reads_hs, y = mol_hs)) +
  geom_point() +
  labs(x = "Number of reads mapped to human genome",
       y = "Number of molecules mapped to human genome",
       title = "Conversion of reads to molecules for human")
convert_dm <- ggplot(totals, aes(x = reads_dm, y = mol_dm)) +
  geom_point() +
  labs(x = "Number of reads mapped to fly genome",
       y = "Number of molecules mapped to fly genome",
       title = "Conversion of reads to molecules for fly")
convert_ce <- ggplot(totals, aes(x = reads_ce, y = mol_ce)) +
  geom_point() +
  labs(x = "Number of reads mapped to worm genome",
       y = "Number of molecules mapped to worm genome",
       title = "Conversion of reads to molecules for worm")

plot_grid(convert_all, convert_hs, convert_dm, convert_ce,
          labels = letters[1:4])

Drosophila spike-in

p_reads_dm <- ggplot(anno_qc, aes(x = as.factor(fly), y = reads_dm / mapped * 100)) +
  geom_boxplot() +
  labs(x = "Fly spike-in concentration",
       y = "Percentage of mapped reads that mapped to fly genome")
p_mol_dm <- ggplot(anno_qc, aes(x = as.factor(fly), y = mol_dm / molecules * 100)) +
  geom_boxplot() +
  labs(x = "Fly spike-in concentration",
       y = "Percentage of mapped molecules that mapped to fly genome")
plot_grid(p_reads_dm, p_mol_dm, labels = letters[1:2])

p_reads_dm_counts <- ggplot(anno_qc, aes(x = as.factor(fly), y = reads_dm)) +
  geom_boxplot() +
  labs(x = "Fly spike-in concentration",
       y = "Number of reads that mapped to fly genome")
p_mol_dm_counts <- ggplot(anno_qc, aes(x = as.factor(fly), y = mol_dm)) +
  geom_boxplot() +
  labs(x = "Fly spike-in concentration",
       y = "Number of molecules that mapped to fly genome")
plot_grid(p_reads_dm_counts, p_mol_dm_counts, labels = letters[1:2])

totals_percent <- anno_qc %>%
  mutate(sample = paste(experiment, well, sep = "-"),
         sample = factor(sample, levels = sample[order(reads_dm / raw)])) %>%
  arrange(reads_dm / raw) %>%
  group_by(experiment, sample, fly, worm, ERCC) %>%
  summarize(p_reads_ce = reads_ce / raw * 100,
            p_reads_dm = reads_dm / raw * 100,
            p_reads_ercc = reads_ercc / raw * 100,
            p_reads_hs = reads_hs / raw * 100,
            p_unmapped = unmapped / raw * 100,
            p_no_umi = (raw - unmapped - mapped) / raw * 100) %>%
  gather(key = "type", value = "percent", p_reads_ce:p_no_umi) %>%
  ungroup() %>%
  mutate(type = factor(type, levels = c("p_no_umi", "p_unmapped", "p_reads_hs",
                                        "p_reads_ercc", "p_reads_ce", "p_reads_dm"),
                       labels = c("No UMI", "Unmapped", "H. sapiens", "ERCC",
                                  "C. elegans", "D. melanogaster")),
         fly = factor(fly, levels = c(5000, 50000), labels = c("5 pg fly spike-on",
                                                               "50 pg fly spike-on")))

ggplot(totals_percent, aes(x = sample, y = percent, fill = type)) +
  geom_col() +
  facet_wrap(~fly) +
  labs(x = "Sample", y = "Percent", title = "Read classification per sample") +
  theme(legend.title = element_blank())

C. elegans spike-in

p_reads_ce <- ggplot(anno_qc, aes(x = as.factor(worm), y = reads_ce / mapped * 100)) +
  geom_boxplot() +
  labs(x = "Worm spike-in concentration",
       y = "Percentage of mapped reads that mapped to worm genome")
p_mol_ce <- ggplot(anno_qc, aes(x = as.factor(worm), y = mol_ce / molecules * 100)) +
  geom_boxplot() +
  labs(x = "Worm spike-in concentration",
       y = "Percentage of mapped molecules that mapped to worm genome")
plot_grid(p_reads_ce, p_mol_ce, labels = letters[1:2])

p_reads_ce_counts <- ggplot(anno_qc, aes(x = as.factor(worm), y = reads_ce)) +
  geom_boxplot() +
  labs(x = "Worm spike-in concentration",
       y = "Number of mapped reads that mapped to worm genome")
p_mol_ce_counts <- ggplot(anno_qc, aes(x = as.factor(worm), y = mol_ce)) +
  geom_boxplot() +
  labs(x = "Worm spike-in concentration",
       y = "Number of mapped molecules that mapped to worm genome")
plot_grid(p_reads_ce_counts, p_mol_ce_counts, labels = letters[1:2])

ERCC spike-in

p_reads_ercc <- ggplot(anno_qc, aes(x = as.factor(ERCC), y = reads_ercc / mapped * 100)) +
  geom_boxplot() +
  labs(x = "ERCC spike-in concentration",
       y = "Percentage of mapped reads that mapped to ERCCs")
p_mol_ercc <- ggplot(anno_qc, aes(x = as.factor(ERCC), y = mol_ercc / molecules * 100)) +
  geom_boxplot() +
  labs(x = "ERCC spike-in concentration",
       y = "Percentage of mapped molecules that mapped to ERCCs")
plot_grid(p_reads_ercc, p_mol_ercc, labels = letters[1:2])

p_reads_ercc_counts <- ggplot(anno_qc, aes(x = as.factor(ERCC), y = reads_ercc)) +
  geom_boxplot() +
  labs(x = "ERCC spike-in concentration",
       y = "Number of reads that mapped to ERCCs")
p_mol_ercc_counts <- ggplot(anno_qc, aes(x = as.factor(ERCC), y = mol_ercc)) +
  geom_boxplot() +
  labs(x = "ERCC spike-in concentration",
       y = "Number of molecules that mapped to ERCCs")
plot_grid(p_reads_ercc_counts, p_mol_ercc_counts, labels = letters[1:2])

Session information

sessionInfo()

R version 3.4.1 (2017-06-30)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Scientific Linux 7.4 (Nitrogen)

Matrix products: default
BLAS: /project2/gilad/jdblischak/miniconda3/envs/scqtl/lib/R/lib/libRblas.so
LAPACK: /project2/gilad/jdblischak/miniconda3/envs/scqtl/lib/R/lib/libRlapack.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  methods   stats     graphics  grDevices utils     datasets 
[8] base     

other attached packages:
[1] bindrcpp_0.2        Biobase_2.38.0      BiocGenerics_0.24.0
[4] tidyr_0.7.1         dplyr_0.7.4         cowplot_0.9.1      
[7] ggplot2_2.2.1      

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.13     compiler_3.4.1   git2r_0.19.0     plyr_1.8.4      
 [5] highr_0.6        bindr_0.1        tools_3.4.1      digest_0.6.12   
 [9] evaluate_0.10.1  tibble_1.3.3     gtable_0.2.0     pkgconfig_2.0.1 
[13] rlang_0.1.2      yaml_2.1.14      stringr_1.2.0    knitr_1.20      
[17] rprojroot_1.2    grid_3.4.1       tidyselect_0.2.3 glue_1.1.1      
[21] R6_2.2.0         rmarkdown_1.8    purrr_0.2.2      magrittr_1.5    
[25] backports_1.0.5  scales_0.5.0     htmltools_0.3.6  assertthat_0.1  
[29] colorspace_1.3-2 labeling_0.3     stringi_1.1.2    lazyeval_0.2.0  
[33] munsell_0.4.3

This R Markdown site was created with workflowr