Last updated: 2018-01-29
Code version: f6b7f76
Batch 1 only
library("cowplot")
library("edgeR")
library("ggplot2")
library("variancePartition")
theme_set(theme_cowplot())
source("../code/functions.R")
library("Biobase") # has to be loaded last to use `combine`
Import data.
eset <- readRDS("../data/eset.rds")
eset
ExpressionSet (storageMode: lockedEnvironment)
assayData: 54792 features, 4992 samples
element names: exprs
protocolData: none
phenoData
sampleNames: 03162017-A01 03162017-A02 ... 12142017-H12 (4992
total)
varLabels: experiment well ... valid_id (40 total)
varMetadata: labelDescription
featureData
featureNames: ENSG00000000003 ENSG00000000005 ... WBGene00235374
(54792 total)
fvarLabels: chr start ... source (6 total)
fvarMetadata: labelDescription
experimentData: use 'experimentData(object)'
Annotation:
Limit this analysis to batch 1 since it deals with the spike-ins.
eset <- eset[, eset$batch == "b1"]
dim(eset)
Features Samples
54792 960
Remove samples with bad cell number, lack of TRA-1-60 signal, or an invalid individual assignment.
eset_quality <- eset[, eset$cell_number == 1 & eset$tra1.60 & eset$valid_id]
dim(eset_quality)
Features Samples
54792 814
Separate by source.
eset_ce <- eset_quality[fData(eset_quality)$source == "C. elegans", ]
head(featureNames(eset_ce))
[1] "WBGene00000001" "WBGene00000002" "WBGene00000003" "WBGene00000004"
[5] "WBGene00000005" "WBGene00000006"
eset_dm <- eset_quality[fData(eset_quality)$source == "D. melanogaster", ]
head(featureNames(eset_dm))
[1] "FBgn0000008" "FBgn0000014" "FBgn0000015" "FBgn0000017" "FBgn0000018"
[6] "FBgn0000022"
eset_ercc <- eset_quality[fData(eset_quality)$source == "ERCC",
eset_quality$ERCC != "Not added"]
head(featureNames(eset_ercc))
[1] "ERCC-00002" "ERCC-00003" "ERCC-00004" "ERCC-00009" "ERCC-00012"
[6] "ERCC-00013"
eset_hs <- eset_quality[fData(eset_quality)$source == "H. sapiens", ]
head(featureNames(eset_hs))
[1] "ENSG00000000003" "ENSG00000000005" "ENSG00000000419" "ENSG00000000457"
[5] "ENSG00000000460" "ENSG00000000938"
Define a function for partitioning the variance.
# https://bioconductor.org/packages/3.4/bioc/vignettes/variancePartition/inst/doc/variancePartition.R
calc_partition <- function(x, info, cores = 1) {
stopifnot(c("experiment", "chip_id") %in% colnames(info))
library("doParallel")
cl <- makeCluster(cores)
registerDoParallel(cl)
form <- ~ (1|experiment) + (1|chip_id)
varPart <- fitExtractVarPartModel(x, form, info)
vp <- sortCols(varPart)
return(vp)
}
Remove zeros.
eset_ercc_clean <- eset_ercc[rowSums(exprs(eset_ercc)) != 0, ]
dim(eset_ercc_clean)
Features Samples
87 294
Only keep genes which are observed in at least 50% of the samples.
eset_ercc_clean <- eset_ercc_clean[apply(exprs(eset_ercc_clean), 1, present), ]
dim(eset_ercc_clean)
Features Samples
31 294
mol_ercc_cpm <- cpm(exprs(eset_ercc_clean), log = TRUE)
part_ercc <- calc_partition(mol_ercc_cpm, pData(eset_ercc_clean))
Loading required package: iterators
Projected run time: ~ 0.6 min
plotPercentBars(part_ercc)
plotVarPart(part_ercc, main = sprintf("ERCC (n = %d)", nrow(part_ercc)))
Remove zeros.
eset_dm_clean <- eset_dm[rowSums(exprs(eset_dm)) != 0, ]
dim(eset_dm_clean)
Features Samples
11509 814
Only keep genes which are observed in at least 50% of the samples.
eset_dm_clean <- eset_dm_clean[apply(exprs(eset_dm_clean), 1, present), ]
dim(eset_dm_clean)
Features Samples
328 814
Convert to log2 counts per million.
mol_dm_cpm <- cpm(exprs(eset_dm_clean), log = TRUE)
part_dm <- calc_partition(mol_dm_cpm, pData(eset_dm_clean))
Projected run time: ~ 0.7 min
plotPercentBars(part_dm)
plotVarPart(part_dm, main = sprintf("Fly (n = %d)", nrow(part_dm)))
Select only samples that received 5 pg.
eset_dm_5pg <- eset_dm[, eset_dm$fly == 5000]
dim(eset_dm_5pg)
Features Samples
13832 325
Remove zeros.
eset_dm_5pg_clean <- eset_dm_5pg[rowSums(exprs(eset_dm_5pg)) != 0, ]
dim(eset_dm_5pg_clean)
Features Samples
9786 325
Only keep genes which are observed in at least 50% of the samples.
eset_dm_5pg_clean <- eset_dm_5pg_clean[apply(exprs(eset_dm_5pg_clean), 1, present), ]
dim(eset_dm_5pg_clean)
Features Samples
157 325
Convert to log2 counts per million.
mol_dm_5pg_cpm <- cpm(exprs(eset_dm_5pg_clean), log = TRUE)
part_dm_5pg <- calc_partition(mol_dm_5pg_cpm, pData(eset_dm_5pg_clean))
Projected run time: ~ 0.2 min
plotPercentBars(part_dm_5pg)
Warning: closing unused connection 7 (<-localhost:11466)
Warning: closing unused connection 6 (<-localhost:11466)
plotVarPart(part_dm_5pg, main = sprintf("Fly (5 pg; n = %d)", nrow(part_dm_5pg)))
Select only samples that received 50 pg.
eset_dm_50pg <- eset_dm[, eset_dm$fly == 50000]
dim(eset_dm_50pg)
Features Samples
13832 489
Remove zeros.
eset_dm_50pg_clean <- eset_dm_50pg[rowSums(exprs(eset_dm_50pg)) != 0, ]
dim(eset_dm_50pg_clean)
Features Samples
11038 489
Only keep genes which are observed in at least 50% of the samples.
eset_dm_50pg_clean <- eset_dm_50pg_clean[apply(exprs(eset_dm_50pg_clean), 1, present), ]
dim(eset_dm_50pg_clean)
Features Samples
525 489
Convert to log2 counts per million.
mol_dm_50pg_cpm <- cpm(exprs(eset_dm_50pg_clean), log = TRUE)
part_dm_50pg <- calc_partition(mol_dm_50pg_cpm, pData(eset_dm_50pg_clean))
Projected run time: ~ 0.7 min
plotPercentBars(part_dm_50pg)
plotVarPart(part_dm_50pg, main = sprintf("Fly (50 pg; n = %d)", nrow(part_dm_50pg)))
Remove zeros.
eset_hs_clean <- eset_hs[rowSums(exprs(eset_hs)) != 0, ]
dim(eset_hs_clean)
Features Samples
18868 814
Only keep genes which are observed in at least 50% of the samples.
eset_hs_clean <- eset_hs_clean[apply(exprs(eset_hs_clean), 1, present), ]
dim(eset_hs_clean)
Features Samples
6681 814
This takes too long on the full data set. Subsample the genes to get the idea.
n_genes <- nrow(eset_hs_clean)
eset_hs_clean_sub <- eset_hs_clean[sample(1:n_genes, size = floor(n_genes * 0.05)), ]
dim(eset_hs_clean_sub)
Features Samples
334 814
Convert to log2 counts per million.
mol_hs_cpm <- cpm(exprs(eset_hs_clean_sub), log = TRUE)
part_hs <- calc_partition(mol_hs_cpm, pData(eset_hs_clean_sub))
Projected run time: ~ 0.6 min
plotPercentBars(part_hs)
plotVarPart(part_hs, main = sprintf("Human (n = %d)", nrow(part_hs)))
x_labels <- scale_x_discrete(labels = c("C1 chip", "Individual", "Residuals"))
x_title <- labs(x = "Model coefficient")
plot_grid(plotVarPart(part_ercc, main = sprintf("ERCC (n = %d)", nrow(part_ercc))) +
x_labels + x_title,
plotVarPart(part_hs, main = sprintf("Human (n = %d)", nrow(part_hs))) +
x_labels + x_title,
labels = LETTERS[1:2])
Warning in "factor" %in% attrib[["class", exact = TRUE]]: closing unused
connection 6 (<-localhost:11466)
sessionInfo()
R version 3.4.1 (2017-06-30)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Scientific Linux 7.2 (Nitrogen)
Matrix products: default
BLAS: /project2/gilad/jdblischak/miniconda3/envs/scqtl/lib/R/lib/libRblas.so
LAPACK: /project2/gilad/jdblischak/miniconda3/envs/scqtl/lib/R/lib/libRlapack.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] parallel methods stats graphics grDevices utils datasets
[8] base
other attached packages:
[1] doParallel_1.0.10 iterators_1.0.8
[3] variancePartition_1.4.2 Biobase_2.38.0
[5] BiocGenerics_0.24.0 foreach_1.4.3
[7] edgeR_3.20.1 limma_3.34.1
[9] cowplot_0.9.1 ggplot2_2.2.1
loaded via a namespace (and not attached):
[1] Rcpp_0.12.13 nloptr_1.0.4 compiler_3.4.1
[4] git2r_0.19.0 plyr_1.8.4 bitops_1.0-6
[7] tools_3.4.1 lme4_1.1-13 digest_0.6.12
[10] nlme_3.1-131 evaluate_0.10.1 tibble_1.3.3
[13] gtable_0.2.0 lattice_0.20-34 rlang_0.1.2
[16] Matrix_1.2-7.1 yaml_2.1.14 stringr_1.2.0
[19] knitr_1.16 caTools_1.17.1 gtools_3.5.0
[22] locfit_1.5-9.1 rprojroot_1.2 grid_3.4.1
[25] rmarkdown_1.6 minqa_1.2.4 gdata_2.17.0
[28] reshape2_1.4.2 magrittr_1.5 MASS_7.3-45
[31] splines_3.4.1 backports_1.0.5 scales_0.5.0
[34] colorRamps_2.3 gplots_3.0.1 codetools_0.2-15
[37] htmltools_0.3.6 pbkrtest_0.4-7 colorspace_1.3-2
[40] labeling_0.3 KernSmooth_2.23-15 stringi_1.1.2
[43] lazyeval_0.2.0 munsell_0.4.3
This R Markdown site was created with workflowr