Last updated: 2015-10-22
Code version: 044491cd5df4302bbdcf97ec1313fa184fc31516
library("knitr")
opts_chunk$set(fig.align = "center", fig.width = 10)
Load packages.
library("topGO")
##
## groupGOTerms: GOBPTerm, GOMFTerm, GOCCTerm environments built.
library("biomaRt")
library("edgeR")
library("tidyr")
library("ggplot2")
theme_set(theme_bw(base_size = 12))
Import the filtered counts matrix.
counts <- read.table("../../data/mouse/counts-matrix-filtered.txt",
header = TRUE)
dim(counts)
## [1] 13738 6
head(counts)
## DMSO.REP1 DMSO.REP2 DMSO.REP3 RA.REP1 RA.REP2 RA.REP3
## ENSMUSG00000033845 3312 2890 2922 2981 772 1627
## ENSMUSG00000025903 1837 1789 1459 1348 409 881
## ENSMUSG00000033813 706 675 607 580 132 376
## ENSMUSG00000002459 8 17 15 43 13 48
## ENSMUSG00000033793 1612 1467 1390 1566 417 1081
## ENSMUSG00000025907 1473 1437 1253 1665 556 1167
Calculate log cpm.
normalized_lib_sizes <- calcNormFactors(counts)
log_cpm <- cpm(counts, log = TRUE,
lib.size = colSums(counts) * normalized_lib_sizes)
Import the DE results.
de_results <- read.table("../../data/mouse/de-results.txt",
header = TRUE)
head(de_results)
## logFC logCPM PValue FDR de
## ENSMUSG00000033845 -0.13449462 7.535947 3.402753e-01 4.713826e-01 FALSE
## ENSMUSG00000025903 -0.27935785 6.625184 3.944787e-02 8.981354e-02 FALSE
## ENSMUSG00000033813 -0.28073406 5.271903 1.329561e-02 3.736042e-02 TRUE
## ENSMUSG00000002459 1.98438205 1.091352 8.360234e-09 1.242997e-07 TRUE
## ENSMUSG00000033793 0.07886065 6.612570 4.574341e-01 5.857344e-01 FALSE
## ENSMUSG00000025907 0.39736543 6.682437 4.430506e-03 1.520137e-02 TRUE
Use topGO for GO analysis. It accounts for the nested graph structure of GO terms to prune the number of GO categories tested (Alexa et al. 2006). Essentially, it decreases the redundancy of the results.
First create the gene universe. This is all the genes tested for differential expression assigned a 1 for differentially expressed and 0 if not.
gene_universe <- as.numeric(de_results$de)
gene_universe <- factor(gene_universe)
names(gene_universe) <- rownames(de_results)
head(gene_universe)
## ENSMUSG00000033845 ENSMUSG00000025903 ENSMUSG00000033813
## 0 0 1
## ENSMUSG00000002459 ENSMUSG00000033793 ENSMUSG00000025907
## 1 0 1
## Levels: 0 1
Create the topGO data object. Only consider “Biological Process” categories and use the Mouse Ensembl database for annotation.
go_data <- new("topGOdata",
ontology = "BP",
allGenes = gene_universe,
nodeSize = 5,
annotationFun = annFUN.org,
mapping = "org.Mm.eg",
ID = "ensembl")
##
## Building most specific GOs .....
## Loading required package: org.Mm.eg.db
## ( 9170 GO terms found. )
##
## Build GO DAG topology .......... ( 12663 GO terms and 30627 relations. )
##
## Annotating nodes ............... ( 13068 genes annotated to the GO terms. )
Use the weight01 algorithm and score the tests with Fisher’s exact test.
go_test <- runTest(go_data, algorithm = "weight01", statistic = "fisher")
##
## -- Weight01 Algorithm --
##
## the algorithm is scoring 6650 nontrivial nodes
## parameters:
## test statistic: fisher
##
## Level 18: 1 nodes to be scored (0 eliminated genes)
##
## Level 17: 10 nodes to be scored (0 eliminated genes)
##
## Level 16: 27 nodes to be scored (10 eliminated genes)
##
## Level 15: 64 nodes to be scored (64 eliminated genes)
##
## Level 14: 149 nodes to be scored (212 eliminated genes)
##
## Level 13: 261 nodes to be scored (576 eliminated genes)
##
## Level 12: 404 nodes to be scored (1360 eliminated genes)
##
## Level 11: 627 nodes to be scored (3038 eliminated genes)
##
## Level 10: 798 nodes to be scored (4129 eliminated genes)
##
## Level 9: 968 nodes to be scored (5796 eliminated genes)
##
## Level 8: 951 nodes to be scored (7080 eliminated genes)
##
## Level 7: 904 nodes to be scored (8196 eliminated genes)
##
## Level 6: 714 nodes to be scored (9062 eliminated genes)
##
## Level 5: 464 nodes to be scored (9673 eliminated genes)
##
## Level 4: 224 nodes to be scored (10209 eliminated genes)
##
## Level 3: 61 nodes to be scored (10404 eliminated genes)
##
## Level 2: 22 nodes to be scored (10601 eliminated genes)
##
## Level 1: 1 nodes to be scored (10667 eliminated genes)
Keep the results with a Fisher’s exact test p-value < 0.01.
go_table <- GenTable(go_data, weightFisher = go_test,
orderBy = "weightFisher", ranksOf = "weightFisher",
topNodes = sum(score(go_test) < .01))
head(go_table)
## GO.ID Term Annotated
## 1 GO:0051290 protein heterotetramerization 35
## 2 GO:0030325 adrenal gland development 14
## 3 GO:0030501 positive regulation of bone mineralizati... 23
## 4 GO:0010951 negative regulation of endopeptidase act... 110
## 5 GO:0051897 positive regulation of protein kinase B ... 55
## 6 GO:0007156 homophilic cell adhesion via plasma memb... 47
## Significant Expected weightFisher
## 1 26 13.31 1.4e-05
## 2 13 5.32 3.1e-05
## 3 18 8.74 9.9e-05
## 4 57 41.82 0.00016
## 5 34 20.91 0.00029
## 6 30 17.87 0.00029
There are 101 significant results.
Reassuringly, there are many categories expected for a stem cell differentiating into neuronal precursors in response to treatment with retinoic acid.
go_table[grep("retinoic", go_table$Term), ]
## GO.ID Term Annotated Significant
## 32 GO:0042573 retinoic acid metabolic process 9 8
## Expected weightFisher
## 32 3.42 0.00259
go_table[grep("stem", go_table$Term), ]
## GO.ID Term Annotated Significant
## 46 GO:0001501 skeletal system development 323 163
## 61 GO:0048485 sympathetic nervous system development 13 12
## 65 GO:0035019 somatic stem cell maintenance 39 23
## Expected weightFisher
## 46 122.79 0.00328
## 61 4.94 0.00609
## 65 14.83 0.00630
go_table[grep("differentiation", go_table$Term), ]
## GO.ID Term Annotated Significant Expected
## 52 GO:0030154 cell differentiation 2438 1111 926.84
## weightFisher
## 52 0.00415
go_table[grep("neur", go_table$Term), ]
## GO.ID Term Annotated
## 22 GO:0001764 neuron migration 103
## 55 GO:0043525 positive regulation of neuron apoptotic ... 49
## 75 GO:0043524 negative regulation of neuron apoptotic ... 111
## 84 GO:0021785 branchiomotor neuron axon guidance 5
## 89 GO:0045213 neurotransmitter receptor metabolic proc... 5
## Significant Expected weightFisher
## 22 53 39.16 0.00138
## 55 28 18.63 0.00501
## 75 55 42.20 0.00770
## 84 5 1.90 0.00793
## 89 5 1.90 0.00793
go_table[grep("nerv", go_table$Term), ]
## GO.ID Term Annotated Significant
## 17 GO:0021612 facial nerve structural organization 7 7
## 61 GO:0048485 sympathetic nervous system development 13 12
## Expected weightFisher
## 17 2.66 0.00114
## 61 4.94 0.00609
Investigate the RA genes.
go_id_ra <- go_table[grep("retinoic", go_table$Term), "GO.ID"]
go_genes_ra <- genesInTerm(go_data, go_id_ra)[[1]]
ensembl <- useMart(host = "sep2015.archive.ensembl.org",
biomart = "ENSEMBL_MART_ENSEMBL",
dataset = "mmusculus_gene_ensembl")
gene_info_ra <- getBM(attributes = c("ensembl_gene_id", "chromosome_name",
"external_gene_name", "description"),
filter = "ensembl_gene_id",
values = go_genes_ra,
mart = ensembl)
stopifnot(go_genes_ra == gene_info_ra$ensembl_gene_id)
gene_info_ra
## ensembl_gene_id chromosome_name external_gene_name
## 1 ENSMUSG00000004885 3 Crabp2
## 2 ENSMUSG00000013584 9 Aldh1a2
## 3 ENSMUSG00000015134 7 Aldh1a3
## 4 ENSMUSG00000021214 13 Akr1c18
## 5 ENSMUSG00000024987 19 Cyp26a1
## 6 ENSMUSG00000025921 1 Rdh10
## 7 ENSMUSG00000046402 9 Rbp1
## 8 ENSMUSG00000062432 19 Cyp26c1
## 9 ENSMUSG00000074207 3 Adh1
## description
## 1 cellular retinoic acid binding protein II [Source:MGI Symbol;Acc:MGI:88491]
## 2 aldehyde dehydrogenase family 1, subfamily A2 [Source:MGI Symbol;Acc:MGI:107928]
## 3 aldehyde dehydrogenase family 1, subfamily A3 [Source:MGI Symbol;Acc:MGI:1861722]
## 4 aldo-keto reductase family 1, member C18 [Source:MGI Symbol;Acc:MGI:2145420]
## 5 cytochrome P450, family 26, subfamily a, polypeptide 1 [Source:MGI Symbol;Acc:MGI:1096359]
## 6 retinol dehydrogenase 10 (all-trans) [Source:MGI Symbol;Acc:MGI:1924238]
## 7 retinol binding protein 1, cellular [Source:MGI Symbol;Acc:MGI:97876]
## 8 cytochrome P450, family 26, subfamily c, polypeptide 1 [Source:MGI Symbol;Acc:MGI:2679699]
## 9 alcohol dehydrogenase 1 (class I) [Source:MGI Symbol;Acc:MGI:87921]
Visualize the RA genes.
log_cpm_ra <- log_cpm[go_genes_ra, ]
log_cpm_ra <- data.frame(gene = rownames(log_cpm_ra), log_cpm_ra,
stringsAsFactors = FALSE)
log_cpm_ra_long <- gather(log_cpm_ra, key = "sample", value = "log_cpm", -gene)
log_cpm_ra_long <- separate(log_cpm_ra_long, col = sample,
into = c("treatment", "replicate"), sep = "\\.")
log_cpm_ra_long$gene <- factor(log_cpm_ra_long$gene,
levels = go_genes_ra[order(de_results[go_genes_ra, "logFC"])],
labels = gene_info_ra$external_gene_name[order(de_results[go_genes_ra, "logFC"])])
head(log_cpm_ra_long)
## gene treatment replicate log_cpm
## 1 Crabp2 DMSO REP1 0.9054316
## 2 Aldh1a2 DMSO REP1 2.8943198
## 3 Aldh1a3 DMSO REP1 1.7813047
## 4 Akr1c18 DMSO REP1 0.9920260
## 5 Cyp26a1 DMSO REP1 -3.6971706
## 6 Rdh10 DMSO REP1 3.0129666
ggplot(log_cpm_ra_long, aes(x = gene, y = log_cpm, fill = treatment)) +
geom_boxplot() +
labs(x = "Gene", y = "log2 cpm",
title = sprintf("Expression of retinoic acid genes (%s)", go_id_ra))
sessionInfo()
## R version 3.2.0 (2015-04-16)
## Platform: x86_64-unknown-linux-gnu (64-bit)
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] org.Mm.eg.db_3.1.2 ggplot2_1.0.1 tidyr_0.2.0
## [4] edgeR_3.10.2 limma_3.24.9 biomaRt_2.24.0
## [7] topGO_2.20.0 SparseM_1.6 GO.db_3.1.2
## [10] RSQLite_1.0.0 DBI_0.3.1 AnnotationDbi_1.30.1
## [13] GenomeInfoDb_1.4.0 IRanges_2.2.4 S4Vectors_0.6.0
## [16] Biobase_2.28.0 graph_1.46.0 BiocGenerics_0.14.0
## [19] knitr_1.10.5
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.0 formatR_1.2 plyr_1.8.3 bitops_1.0-6
## [5] tools_3.2.0 digest_0.6.8 evaluate_0.7 gtable_0.1.2
## [9] lattice_0.20-31 yaml_2.1.13 proto_0.3-10 dplyr_0.4.2
## [13] httr_0.6.1 stringr_1.0.0 grid_3.2.0 R6_2.1.1
## [17] XML_3.98-1.2 rmarkdown_0.6.1 reshape2_1.4.1 magrittr_1.5
## [21] MASS_7.3-40 scales_0.2.4 htmltools_0.2.6 assertthat_0.1
## [25] colorspace_1.2-6 labeling_0.3 stringi_0.4-1 lazyeval_0.1.10
## [29] munsell_0.4.2 RCurl_1.95-4.6