Subsampling replicates for supplementary figure

Input
Figures
Session information

Last updated: 2016-11-08

Code version: bd286a36f14d3b332285cdc7e62258b1f616bb14

The reviewers asked to see the results of the subsampling for each batch. The data was created with subsample-pipeline-rep.py. The analysis below was modified from the original analysis for Figure 2.

Input

library("dplyr")
library("tidyr")
library("ggplot2")
library("cowplot")
theme_set(theme_bw(base_size = 12))
theme_update(panel.grid.minor.x = element_blank(),
             panel.grid.minor.y = element_blank(),
             panel.grid.major.x = element_blank(),
             panel.grid.major.y = element_blank(),
             legend.key = element_blank())

d <- read.table("../data/subsampling-results-rep.txt",
                header = TRUE, sep = "\t", stringsAsFactors = FALSE)
str(d)

'data.frame':   12660 obs. of  25 variables:
 $ type            : chr  "reads" "reads" "molecules" "molecules" ...
 $ depth           : int  50000 50000 50000 50000 50000 50000 50000 50000 50000 50000 ...
 $ gene_subset     : chr  "all" "all" "all" "all" ...
 $ seed            : int  1 1 1 1 2 2 2 2 3 3 ...
 $ subsampled_cells: int  5 5 5 5 5 5 5 5 5 5 ...
 $ individual      : chr  "NA19098" "NA19098" "NA19098" "NA19098" ...
 $ replicate       : chr  "r1" "r3" "r1" "r3" ...
 $ lower_q         : num  0 0 0 0 0 0 0 0 0 0 ...
 $ upper_q         : num  1 1 1 1 1 1 1 1 1 1 ...
 $ available_ensg  : int  12192 12192 12192 12192 12192 12192 12192 12192 12192 12192 ...
 $ used_ensg       : int  12192 12192 12192 12192 12192 12192 12192 12192 12192 12192 ...
 $ available_ercc  : int  43 43 43 43 43 43 43 43 43 43 ...
 $ used_ercc       : int  43 43 43 43 43 43 43 43 43 43 ...
 $ potential_cells : int  85 57 85 57 85 57 85 57 85 57 ...
 $ available_cells : int  85 57 85 57 85 57 85 57 85 57 ...
 $ pearson_ensg    : num  0.815 0.778 0.825 0.794 0.79 ...
 $ pearson_ercc    : num  0.911 0.934 0.918 0.935 0.916 ...
 $ spearman_ensg   : num  0.826 0.782 0.836 0.799 0.796 ...
 $ spearman_ercc   : num  0.913 0.902 0.912 0.908 0.907 ...
 $ detected_ensg   : int  8470 8322 8470 8322 8200 8409 8200 8409 8366 8283 ...
 $ detected_ercc   : int  25 27 25 27 29 28 29 28 25 24 ...
 $ mean_counts_ensg: num  24282 23583 15906 13533 22905 ...
 $ mean_counts_ercc: num  284 401 191 226 454 ...
 $ var_pearson     : num  0.823 0.807 0.847 0.843 0.812 ...
 $ var_spearman    : num  0.791 0.772 0.812 0.799 0.771 ...

d_grouped <- d %>%
  group_by(type, depth, gene_subset, subsampled_cells,
           individual, replicate, potential_cells, available_cells,
           lower_q, upper_q, available_ensg, used_ensg,
           available_ercc, used_ercc) %>%
  summarize(mean_detected = mean(detected_ensg),
            sem_detected = sd(detected_ensg) / sqrt(length(detected_ensg)),
            mean_bulk = mean(pearson_ensg),
            sem_bulk = sd(pearson_ensg) / sqrt(length(pearson_ensg)),
            mean_var = mean(var_pearson),
            sem_var = sd(var_pearson) / sqrt(length(var_pearson)))

d_filter <- d_grouped %>% filter(type == "molecules",
                                 gene_subset %in% c("lower")) %>%
  mutate(batch = paste(individual, replicate, sep = "-"))
d_filter$depth2 <- factor(d_filter$depth,
                          labels = format(unique(d_filter$depth), big.mark = ",",
                                          scientifc = FALSE, trim = TRUE))

Figures

plot_base <- ggplot(d_filter,
                 aes(x = subsampled_cells,
                     color = individual, shape = replicate)) +
  geom_point(alpha = 0.5) +
  geom_line(alpha = 0.5) +
  facet_wrap(~depth2, ncol = 1) +
  scale_x_continuous(breaks = unique(d_filter$subsampled_cells)) +
  scale_color_discrete(name = "Individual") +
  scale_shape(name = "Replicate") +
  theme(legend.position = "none", axis.text.x = element_text(size = rel(0.75)))

plot_bulk <- plot_base %+% aes(y = mean_bulk) +
  theme(legend.position = c(0.75, 0.875),
        legend.key.size = grid::unit(0.25, "in")) +
  guides(shape = FALSE) +
  labs(x = "Number of subsampled cells",
       y = "Pearson's r",
       title = "Correlation with bulk")
plot_bulk

plot_detected <- plot_base %+% aes(y = mean_detected) +
  geom_errorbar(aes(ymin = mean_detected - sem_detected,
                    ymax = mean_detected + sem_detected),
                width = 1, alpha = 0.5) +
  theme(legend.position = c(0.75, 0.875),
        legend.key.size = grid::unit(0.25, "in")) +
  guides(color = FALSE) +
  labs(x = "Number of subsampled cells",
       y = "Number of genes detected",
       title = "Genes detected")
plot_detected

plot_var <- plot_base %+% aes(y = mean_var) +
  geom_errorbar(aes(ymin = mean_var - sem_var,
                    ymax = mean_var + sem_var),
                width = 1, alpha = 0.5) +
  labs(x = "Number of subsampled cells",
       y = "Pearson's r",
       title = "Cell-to-cell variance")
plot_var

plot_final <- plot_grid(plot_bulk, plot_detected, plot_var,
                        ncol = 3, labels = letters[1:3], label_size = 12)
plot_final

tiff("../paper/figure/fig-subsample-rep.tiff",
     width = 6 * 1.5, height = 8 * 1.5,
     units = "in", res = 300, compression = "none")
plot_final
dev.off()

png 
  2

Session information

sessionInfo()

R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] cowplot_0.3.1 ggplot2_1.0.1 tidyr_0.2.0   dplyr_0.4.2   knitr_1.10.5 

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.4      magrittr_1.5     MASS_7.3-40      munsell_0.4.3   
 [5] colorspace_1.2-6 R6_2.1.1         stringr_1.0.0    httr_0.6.1      
 [9] plyr_1.8.3       tools_3.2.0      parallel_3.2.0   grid_3.2.0      
[13] gtable_0.1.2     DBI_0.3.1        htmltools_0.2.6  lazyeval_0.1.10 
[17] yaml_2.1.13      assertthat_0.1   digest_0.6.8     reshape2_1.4.1  
[21] formatR_1.2      bitops_1.0-6     RCurl_1.95-4.6   evaluate_0.7    
[25] rmarkdown_0.6.1  labeling_0.3     stringi_1.0-1    scales_0.4.0    
[29] proto_0.3-10

Subsampling replicates for supplementary figure

2016-10-27

Input

Figures

Session information