Profile individual adjusted CVs

Objective
Set up
Prepare data
Compute normalized CV
Extreme genes
Annotation
- GOstats
- Extract GO terms
Export files
Session information

Last updated: 2015-12-11

Code version: 7a25b2aef5c8329d4b5394cce595bbfc2b5b0c3e

Objective

We rank each individual’s adjusted CV (Deviation-from-the-Median; DM) and compare genes’ ranks in adjusted CV across individuals.

Method: Compute CV for each individual and adjust the CV to remove its dependency on mean gene expression. We will follow [Kolodziejczyk et al., 2015] to compute the adjusted coefficient of variation (referred to as Deviation-from-the-Median in [Kolodziejczyk et al., 2015] and also in our related documentations).

Kolodziejczyk et al., 2015

Set up

library("data.table")
library("dplyr")
library("limma")
library("edgeR")
library("ggplot2")
library("grid")
theme_set(theme_bw(base_size = 12))
source("functions.R")

Prepare data

Input annotation of only QC-filtered single cells. Remove NA19098.r2

anno_qc <- read.table("../data/annotation-filter.txt", header = TRUE,
                   stringsAsFactors = FALSE)
is_include <- anno_qc$batch != "NA19098.r2"
anno_qc_filter <- anno_qc[which(is_include), ]

Import endogeneous gene molecule counts that are QC-filtered, CPM-normalized, ERCC-normalized, and also processed to remove unwanted variation from batch effet. ERCC genes are removed from this file.

molecules_ENSG <- read.table("../data/molecules-final.txt", header = TRUE, stringsAsFactors = FALSE)
molecules_ENSG <- molecules_ENSG[ , is_include]

Input moleclule counts before log2 CPM transformation. This file is used to compute percent zero-count cells per sample.

molecules_sparse <- read.table("../data/molecules-filter.txt", header = TRUE, stringsAsFactors = FALSE)

molecules_sparse <- molecules_sparse[grep("ENSG", rownames(molecules_sparse)), ]
stopifnot( all.equal(rownames(molecules_ENSG), rownames(molecules_sparse)) )

Compute normalized CV

We compute squared CV across cells for each individual and then for each individual CV profile, account for mean dependency by computing distance with respect to the data-wide coefficient variation on the log10 scale.

source("../code/cv-functions.r")

ENSG_cv <- compute_cv(log2counts = molecules_ENSG,
                      grouping_vector = anno_qc_filter$individual)

ENSG_cv_adj <- normalize_cv(group_cv = ENSG_cv, 
                            log2counts = molecules_ENSG, 
                            anno = anno_qc_filter)

Extreme genes

*Top/bottom 50

extreme_genes_50 <- lapply(ENSG_cv_adj, function(xx) {
  low_mean <- rank(xx$mean) < 50
  high_mean <- rank(xx$mean) > (length(xx$mean) - 50)
  low_cv <-  rank(xx$log10cv2_adj) < 50
  high_cv <- rank(xx$log10cv2_adj) > (length(xx$log10cv2_adj) - 50)
  res <- data.frame(high_mean = high_mean,
                    low_mean = low_mean,
                    high_cv = high_cv,
                    low_cv = low_cv)
  rownames(res) <- rownames(ENSG_cv_adj[[1]])
  res
})

*Top/bottom 200

extreme_genes_200 <- lapply(ENSG_cv_adj, function(xx) {
  low_mean <- rank(xx$mean) < 200
  high_mean <- rank(xx$mean) > (length(xx$mean) - 200)
  low_cv <-  rank(xx$log10cv2_adj) < 200
  high_cv <- rank(xx$log10cv2_adj) > (length(xx$log10cv2_adj) - 200)
  res <- data.frame(high_mean = high_mean,
                    low_mean = low_mean,
                    high_cv = high_cv,
                    low_cv = low_cv)
  rownames(res) <- rownames(ENSG_cv_adj[[1]])
  res
})

*Output gene symbols.

gene_info <- read.table("../data/gene-info.txt", header = TRUE, sep = "\t", 
                        stringsAsFactors = FALSE, quote = "")
str(gene_info)

'data.frame':   10483 obs. of  5 variables:
 $ ensembl_gene_id   : chr  "ENSG00000000003" "ENSG00000000005" "ENSG00000000419" "ENSG00000000457" ...
 $ chromosome_name   : chr  "X" "X" "20" "1" ...
 $ external_gene_name: chr  "TSPAN6" "TNMD" "DPM1" "SCYL3" ...
 $ transcript_count  : int  3 2 7 5 10 4 12 2 13 8 ...
 $ description       : chr  "tetraspanin 6 [Source:HGNC Symbol;Acc:11858]" "tenomodulin [Source:HGNC Symbol;Acc:17757]" "dolichyl-phosphate mannosyltransferase polypeptide 1, catalytic subunit [Source:HGNC Symbol;Acc:3005]" "SCY1-like 3 (S. cerevisiae) [Source:HGNC Symbol;Acc:19285]" ...

Gene symbols ordered by CV

adj_cv_order <- with(gene_info, data.frame(
                          NA19098 = external_gene_name[
                            order(ENSG_cv_adj$NA19098$log10cv2_adj, decreasing = TRUE)],
                          NA19101 = external_gene_name[
                            order(ENSG_cv_adj$NA19101$log10cv2_adj, decreasing = TRUE)],
                          NA19239 = external_gene_name[
                            order(ENSG_cv_adj$NA19239$log10cv2_adj, decreasing = TRUE)], 
                          stringsAsFactors = FALSE) )

head(adj_cv_order)

   NA19098 NA19101 NA19239
1    HSPA4 GNPNAT1   VTI1A
2    TTC26   ADCY1 GNPNAT1
3  GNPNAT1   ADNP2   ADCY1
4    ADCY1   TTC26  ZNF350
5 KIAA1033    CBY1   ADNP2
6    ADNP2   TAF1D PITPNM1

if(!file.exists("../data/gene-order-high-to-low-cv.txt")) {
write.table(adj_cv_order, file = "../data/gene-order-high-to-low-cv.txt",
            sep = "\t",
            col.names = TRUE, row.names = FALSE, quote = FALSE)
}

Annotation

GOstats

require(Humanzee)
if (file.exists("rda/cv-adjusted-profile/go-cv-NA19098.rda")) {
  load("rda/cv-adjusted-profile/go-cv-NA19098.rda")  
} else {
go_cv_high <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
                      my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[1]]$high_cv],
                      pval_cutoff = 1, ontology=c("BP") )
go_cv_low <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
                      my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[1]]$low_cv],
                      pval_cutoff = 1, ontology=c("BP") )

go_mean_high <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
                      my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[1]]$high_mean],
                      pval_cutoff = 1, ontology=c("BP") )
go_mean_low <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
                      my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[1]]$low_mean],
                      pval_cutoff = 1, ontology=c("BP") )

save(ENSG_cv_adj, extreme_genes_50,
     go_cv_high, go_cv_low, go_mean_high, go_mean_low,
     file = "rda/cv-adjusted-profile/go-cv-NA19098.rda")
}

if (file.exists("rda/cv-adjusted-profile/go-cv-NA19101.rda")) {
  load("rda/cv-adjusted-profile/go-cv-NA19101.rda")  
} else {
go_cv_high <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
                      my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[2]]$high_cv],
                      pval_cutoff = 1, ontology=c("BP") )
go_cv_low <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
                      my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[2]]$low_cv],
                      pval_cutoff = 1, ontology=c("BP") )

go_mean_high <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
                      my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[2]]$high_mean],
                      pval_cutoff = 1, ontology=c("BP") )
go_mean_low <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
                      my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[2]]$low_mean],
                      pval_cutoff = 1, ontology=c("BP") )
save(ENSG_cv_adj, extreme_genes_50,
     go_cv_high, go_cv_low, go_mean_high, go_mean_low,
     file = "rda/cv-adjusted-profile/go-cv-NA19101.rda")
}


if (file.exists("rda/cv-adjusted-profile/go-cv-NA19239.rda")) {
  load("rda/cv-adjusted-profile/go-cv-NA19239.rda")  
} else {
go_cv_high <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
                      my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[3]]$high_cv],
                      pval_cutoff = 1, ontology=c("BP") )
go_cv_low <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
                      my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[3]]$low_cv],
                      pval_cutoff = 1, ontology=c("BP") )

go_mean_high <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
                      my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[3]]$high_mean],
                      pval_cutoff = 1, ontology=c("BP") )
go_mean_low <- GOtest(my_ensembl_gene_universe = rownames(molecules_ENSG),
                      my_ensembl_gene_test = rownames(molecules_ENSG)[extreme_genes_50[[3]]$low_mean],
                      pval_cutoff = 1, ontology=c("BP") )
save(ENSG_cv_adj, extreme_genes_50,
     go_cv_low, go_cv_high,
     go_mean_high, go_mean_low, file = "rda/cv-adjusted-profile/go-cv-NA19239.rda")
}

Extract GO terms

NA19098

load(file = "rda/cv-adjusted-profile/go-cv-NA19098.rda")
goterms_cv_high <- summary(go_cv_high$GO$BP, pvalue = .05)
goterms_cv_high <- data.frame(ID = goterms_cv_high[[1]],
                             Pvalue = goterms_cv_high[[2]],
                             Terms = goterms_cv_high[[7]])
goterms_cv_high <- goterms_cv_high[order(goterms_cv_high$Pvalue), ]
head(goterms_cv_high)

goterms_cv_low <- summary(go_cv_low$GO$BP, pvalue = .05)
goterms_cv_low <- data.frame(ID = goterms_cv_low[[1]],
                             Pvalue = goterms_cv_low[[2]],
                             Terms = goterms_cv_low[[7]])
goterms_cv_low <- goterms_cv_high[order(goterms_cv_low$Pvalue), ]
head(goterms_cv_low)

NA19101

load(file = "rda/cv-adjusted-profile/go-cv-NA19101.rda")
goterms_cv_high <- summary(go_cv_high$GO$BP, pvalue = .05)
goterms_cv_high <- data.frame(ID = goterms_cv_high[[1]],
                             Pvalue = goterms_cv_high[[2]],
                             Terms = goterms_cv_high[[7]])
goterms_cv_high <- goterms_cv_high[order(goterms_cv_high$Pvalue), ]
head(goterms_cv_high)

goterms_cv_low <- summary(go_cv_low$GO$BP, pvalue = .05)
goterms_cv_low <- data.frame(ID = goterms_cv_low[[1]],
                             Pvalue = goterms_cv_low[[2]],
                             Terms = goterms_cv_low[[7]])
goterms_cv_low <- goterms_cv_high[order(goterms_cv_low$Pvalue), ]
head(goterms_cv_low)

NA19239

load(file = "rda/cv-adjusted-profile/go-cv-NA19239.rda")
goterms_cv_high <- summary(go_cv_high$GO$BP, pvalue = .05)
goterms_cv_high <- data.frame(ID = goterms_cv_high[[1]],
                             Pvalue = goterms_cv_high[[2]],
                             Terms = goterms_cv_high[[7]])
goterms_cv_high <- goterms_cv_high[order(goterms_cv_high$Pvalue), ]
head(goterms_cv_high)

goterms_cv_low <- summary(go_cv_low$GO$BP, pvalue = .05)
goterms_cv_low <- data.frame(ID = goterms_cv_low[[1]],
                             Pvalue = goterms_cv_low[[2]],
                             Terms = goterms_cv_low[[7]])
goterms_cv_low <- goterms_cv_high[order(goterms_cv_low$Pvalue), ]
head(goterms_cv_low)

Export files

library("biomaRt")
ensembl <- useMart(host = "grch37.ensembl.org",
                   biomart = "ENSEMBL_MART_ENSEMBL",
                   dataset = "hsapiens_gene_ensembl")

info <- getBM(attributes = c("ensembl_gene_id", "chromosome_name",
                                                   "external_gene_name", "transcript_count",
                                                   "description"),
                                    filters = "ensembl_gene_id",
                                    values = rownames(molecules_ENSG[sig_cv, ]),
                                    mart = ensembl)

write.table(extreme_genes_50$NA19098$high_cv, quote = FALSE)

Session information

sessionInfo()

R version 3.2.1 (2015-06-18)
Platform: x86_64-apple-darwin13.4.0 (64-bit)
Running under: OS X 10.10.5 (Yosemite)

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
[1] zoo_1.7-12       ggplot2_1.0.1    edgeR_3.10.5     limma_3.24.15   
[5] dplyr_0.4.3      data.table_1.9.6 knitr_1.11      

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.2      magrittr_1.5     MASS_7.3-45      munsell_0.4.2   
 [5] lattice_0.20-33  colorspace_1.2-6 R6_2.1.1         stringr_1.0.0   
 [9] plyr_1.8.3       tools_3.2.1      parallel_3.2.1   gtable_0.1.2    
[13] DBI_0.3.1        htmltools_0.2.6  yaml_2.1.13      assertthat_0.1  
[17] digest_0.6.8     reshape2_1.4.1   formatR_1.2.1    evaluate_0.8    
[21] rmarkdown_0.8.1  stringi_1.0-1    scales_0.3.0     chron_2.3-47    
[25] proto_0.3-10