Last updated: 2015-12-11
Code version: 7a25b2aef5c8329d4b5394cce595bbfc2b5b0c3e
We rank each individual’s adjusted CV (Deviation-from-the-Median; DM) and compare genes’ ranks in adjusted CV across individuals.
Method: Compute CV for each individual and adjust the CV to remove its dependency on mean gene expression. We will follow [Kolodziejczyk et al., 2015] to compute the adjusted coefficient of variation (referred to as Deviation-from-the-Median in [Kolodziejczyk et al., 2015] and also in our related documentations).
library("data.table")
library("dplyr")
library("limma")
library("edgeR")
library("ggplot2")
library("grid")
theme_set(theme_bw(base_size = 12))
source("functions.R")
Input annotation of only QC-filtered single cells. Remove NA19098.r2
anno_qc <- read.table("../data/annotation-filter.txt", header = TRUE,
stringsAsFactors = FALSE)
is_include <- anno_qc$batch != "NA19098.r2"
anno_qc_filter <- anno_qc[which(is_include), ]
Import endogeneous gene molecule counts that are QC-filtered, CPM-normalized, ERCC-normalized, and also processed to remove unwanted variation from batch effet. ERCC genes are removed from this file.
molecules_ENSG <- read.table("../data/molecules-final.txt", header = TRUE, stringsAsFactors = FALSE)
molecules_ENSG <- molecules_ENSG[ , is_include]
Input moleclule counts before log2 CPM transformation. This file is used to compute percent zero-count cells per sample.
molecules_sparse <- read.table("../data/molecules-filter.txt", header = TRUE, stringsAsFactors = FALSE)
molecules_sparse <- molecules_sparse[grep("ENSG", rownames(molecules_sparse)), ]
stopifnot( all.equal(rownames(molecules_ENSG), rownames(molecules_sparse)) )
We compute squared CV across cells for each individual and then for each individual CV profile, account for mean dependency by computing distance with respect to the data-wide coefficient variation on the log10 scale.
source("../code/cv-functions.r")
ENSG_cv <- compute_cv(log2counts = molecules_ENSG,
grouping_vector = anno_qc_filter$individual)
ENSG_cv_adj <- normalize_cv(group_cv = ENSG_cv,
log2counts = molecules_ENSG,
anno = anno_qc_filter)
*Top/bottom 50
extreme_genes_50 <- lapply(ENSG_cv_adj, function(xx) {
low_mean <- rank(xx$mean) < 50
high_mean <- rank(xx$mean) > (length(xx$mean) - 50)
low_cv <- rank(xx$log10cv2_adj) < 50
high_cv <- rank(xx$log10cv2_adj) > (length(xx$log10cv2_adj) - 50)
res <- data.frame(high_mean = high_mean,
low_mean = low_mean,
high_cv = high_cv,
low_cv = low_cv)
rownames(res) <- rownames(ENSG_cv_adj[[1]])
res
})
*Top/bottom 200
extreme_genes_200 <- lapply(ENSG_cv_adj, function(xx) {
low_mean <- rank(xx$mean) < 200
high_mean <- rank(xx$mean) > (length(xx$mean) - 200)
low_cv <- rank(xx$log10cv2_adj) < 200
high_cv <- rank(xx$log10cv2_adj) > (length(xx$log10cv2_adj) - 200)
res <- data.frame(high_mean = high_mean,
low_mean = low_mean,
high_cv = high_cv,
low_cv = low_cv)
rownames(res) <- rownames(ENSG_cv_adj[[1]])
res
})
*Output gene symbols.
gene_info <- read.table("../data/gene-info.txt", header = TRUE, sep = "\t",
stringsAsFactors = FALSE, quote = "")
str(gene_info)
'data.frame': 10483 obs. of 5 variables:
$ ensembl_gene_id : chr "ENSG00000000003" "ENSG00000000005" "ENSG00000000419" "ENSG00000000457" ...
$ chromosome_name : chr "X" "X" "20" "1" ...
$ external_gene_name: chr "TSPAN6" "TNMD" "DPM1" "SCYL3" ...
$ transcript_count : int 3 2 7 5 10 4 12 2 13 8 ...
$ description : chr "tetraspanin 6 [Source:HGNC Symbol;Acc:11858]" "tenomodulin [Source:HGNC Symbol;Acc:17757]" "dolichyl-phosphate mannosyltransferase polypeptide 1, catalytic subunit [Source:HGNC Symbol;Acc:3005]" "SCY1-like 3 (S. cerevisiae) [Source:HGNC Symbol;Acc:19285]" ...
Gene symbols ordered by CV
adj_cv_order <- with(gene_info, data.frame(
NA19098 = external_gene_name[
order(ENSG_cv_adj$NA19098$log10cv2_adj, decreasing = TRUE)],
NA19101 = external_gene_name[
order(ENSG_cv_adj$NA19101$log10cv2_adj, decreasing = TRUE)],
NA19239 = external_gene_name[
order(ENSG_cv_adj$NA19239$log10cv2_adj, decreasing = TRUE)],
stringsAsFactors = FALSE) )
head(adj_cv_order)
NA19098 NA19101 NA19239
1 HSPA4 GNPNAT1 VTI1A
2 TTC26 ADCY1 GNPNAT1
3 GNPNAT1 ADNP2 ADCY1
4 ADCY1 TTC26 ZNF350
5 KIAA1033 CBY1 ADNP2
6 ADNP2 TAF1D PITPNM1
if(!file.exists("../data/gene-order-high-to-low-cv.txt")) {
write.table(adj_cv_order, file = "../data/gene-order-high-to-low-cv.txt",
sep = "\t",
col.names = TRUE, row.names = FALSE, quote = FALSE)
}
require(Humanzee)
if (file.exists("rda/cv-adjusted-profile/go-cv-NA19098.rda")) {
load("rda/cv-adjusted-profile/go-cv-NA19098.rda")
} else {
go_cv_high <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[1]]$high_cv],
pval_cutoff = 1, ontology=c("BP") )
go_cv_low <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[1]]$low_cv],
pval_cutoff = 1, ontology=c("BP") )
go_mean_high <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[1]]$high_mean],
pval_cutoff = 1, ontology=c("BP") )
go_mean_low <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[1]]$low_mean],
pval_cutoff = 1, ontology=c("BP") )
save(ENSG_cv_adj, extreme_genes_50,
go_cv_high, go_cv_low, go_mean_high, go_mean_low,
file = "rda/cv-adjusted-profile/go-cv-NA19098.rda")
}
if (file.exists("rda/cv-adjusted-profile/go-cv-NA19101.rda")) {
load("rda/cv-adjusted-profile/go-cv-NA19101.rda")
} else {
go_cv_high <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[2]]$high_cv],
pval_cutoff = 1, ontology=c("BP") )
go_cv_low <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[2]]$low_cv],
pval_cutoff = 1, ontology=c("BP") )
go_mean_high <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[2]]$high_mean],
pval_cutoff = 1, ontology=c("BP") )
go_mean_low <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[2]]$low_mean],
pval_cutoff = 1, ontology=c("BP") )
save(ENSG_cv_adj, extreme_genes_50,
go_cv_high, go_cv_low, go_mean_high, go_mean_low,
file = "rda/cv-adjusted-profile/go-cv-NA19101.rda")
}
if (file.exists("rda/cv-adjusted-profile/go-cv-NA19239.rda")) {
load("rda/cv-adjusted-profile/go-cv-NA19239.rda")
} else {
go_cv_high <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[3]]$high_cv],
pval_cutoff = 1, ontology=c("BP") )
go_cv_low <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[3]]$low_cv],
pval_cutoff = 1, ontology=c("BP") )
go_mean_high <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[3]]$high_mean],
pval_cutoff = 1, ontology=c("BP") )
go_mean_low <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[3]]$low_mean],
pval_cutoff = 1, ontology=c("BP") )
save(ENSG_cv_adj, extreme_genes_50,
go_cv_low, go_cv_high,
go_mean_high, go_mean_low, file = "rda/cv-adjusted-profile/go-cv-NA19239.rda")
}
load(file = "rda/cv-adjusted-profile/go-cv-NA19098.rda")
goterms_cv_high <- summary(go_cv_high$GO$BP, pvalue = .05)
goterms_cv_high <- data.frame(ID = goterms_cv_high[[1]],
Pvalue = goterms_cv_high[[2]],
Terms = goterms_cv_high[[7]])
goterms_cv_high <- goterms_cv_high[order(goterms_cv_high$Pvalue), ]
head(goterms_cv_high)
goterms_cv_low <- summary(go_cv_low$GO$BP, pvalue = .05)
goterms_cv_low <- data.frame(ID = goterms_cv_low[[1]],
Pvalue = goterms_cv_low[[2]],
Terms = goterms_cv_low[[7]])
goterms_cv_low <- goterms_cv_high[order(goterms_cv_low$Pvalue), ]
head(goterms_cv_low)
load(file = "rda/cv-adjusted-profile/go-cv-NA19101.rda")
goterms_cv_high <- summary(go_cv_high$GO$BP, pvalue = .05)
goterms_cv_high <- data.frame(ID = goterms_cv_high[[1]],
Pvalue = goterms_cv_high[[2]],
Terms = goterms_cv_high[[7]])
goterms_cv_high <- goterms_cv_high[order(goterms_cv_high$Pvalue), ]
head(goterms_cv_high)
goterms_cv_low <- summary(go_cv_low$GO$BP, pvalue = .05)
goterms_cv_low <- data.frame(ID = goterms_cv_low[[1]],
Pvalue = goterms_cv_low[[2]],
Terms = goterms_cv_low[[7]])
goterms_cv_low <- goterms_cv_high[order(goterms_cv_low$Pvalue), ]
head(goterms_cv_low)
load(file = "rda/cv-adjusted-profile/go-cv-NA19239.rda")
goterms_cv_high <- summary(go_cv_high$GO$BP, pvalue = .05)
goterms_cv_high <- data.frame(ID = goterms_cv_high[[1]],
Pvalue = goterms_cv_high[[2]],
Terms = goterms_cv_high[[7]])
goterms_cv_high <- goterms_cv_high[order(goterms_cv_high$Pvalue), ]
head(goterms_cv_high)
goterms_cv_low <- summary(go_cv_low$GO$BP, pvalue = .05)
goterms_cv_low <- data.frame(ID = goterms_cv_low[[1]],
Pvalue = goterms_cv_low[[2]],
Terms = goterms_cv_low[[7]])
goterms_cv_low <- goterms_cv_high[order(goterms_cv_low$Pvalue), ]
head(goterms_cv_low)
library("biomaRt")
ensembl <- useMart(host = "grch37.ensembl.org",
biomart = "ENSEMBL_MART_ENSEMBL",
dataset = "hsapiens_gene_ensembl")
info <- getBM(attributes = c("ensembl_gene_id", "chromosome_name",
"external_gene_name", "transcript_count",
"description"),
filters = "ensembl_gene_id",
values = rownames(molecules_ENSG[sig_cv, ]),
mart = ensembl)
write.table(extreme_genes_50$NA19098$high_cv, quote = FALSE)
sessionInfo()
R version 3.2.1 (2015-06-18)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.10.5 (Yosemite)
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] zoo_1.7-12 ggplot2_1.0.1 edgeR_3.10.5 limma_3.24.15
[5] dplyr_0.4.3 data.table_1.9.6 knitr_1.11
loaded via a namespace (and not attached):
[1] Rcpp_0.12.2 magrittr_1.5 MASS_7.3-45 munsell_0.4.2
[5] lattice_0.20-33 colorspace_1.2-6 R6_2.1.1 stringr_1.0.0
[9] plyr_1.8.3 tools_3.2.1 parallel_3.2.1 gtable_0.1.2
[13] DBI_0.3.1 htmltools_0.2.6 yaml_2.1.13 assertthat_0.1
[17] digest_0.6.8 reshape2_1.4.1 formatR_1.2.1 evaluate_0.8
[21] rmarkdown_0.8.1 stringi_1.0-1 scales_0.3.0 chron_2.3-47
[25] proto_0.3-10