Last updated: 2016-07-08
Code version: 641fc1222ff5c5bc6b4766ad24ea2f5420334427
Get gene symbols for the final data set and save them in a text file for annotation references.
library("data.table")
library("dplyr")
library("limma")
library("edgeR")
library("ggplot2")
library("grid")
# Make the black-and-white ggplot2 theme (12-point base font) the default.
theme_set(theme_bw(base_size = 12))
# Run functions.R from the working directory (its contents are not shown here).
source("functions.R")
Input annotation of QC-filtered data.
# Read the annotation table for the QC-filtered data. The first line of the
# file supplies the column names, and text columns are kept as character.
anno_filter <- read.table(
  file = "../data/annotation-filter.txt",
  stringsAsFactors = FALSE,
  header = TRUE
)
Import the endogenous genes from the final file.
# Read the final molecule table. Its row names are used further down as the
# Ensembl gene IDs sent to BioMart.
molecules_final <- read.table(
  file = "../data/molecules-final.txt",
  stringsAsFactors = FALSE,
  header = TRUE
)
# Load gene annotation (Ensembl ID, chromosome, symbol, transcript count,
# description) for the genes whose IDs are the row names of molecules_final.
# A cached copy in ../data/gene-info.txt is reused when it exists; otherwise
# the annotation is downloaded from the GRCh37 Ensembl BioMart and written to
# that file, together with symbol-only and Ensembl-ID-only lists.
if (file.exists("../data/gene-info.txt")) {
  # The description column is free text: quote = "" and comment.char = ""
  # stop quotes, apostrophes, and "#" inside it from being parsed as quoting
  # or as the start of a comment (which would truncate the line).
  gene_info <- read.table(file = "../data/gene-info.txt", sep = "\t",
                          header = TRUE, stringsAsFactors = FALSE,
                          quote = "", comment.char = "")
  # Last expression of the branch, so the preview is printed at top level.
  head(gene_info)
} else {
  library("biomaRt")
  ensembl <- useMart(host = "grch37.ensembl.org",
                     biomart = "ENSEMBL_MART_ENSEMBL",
                     dataset = "hsapiens_gene_ensembl")
  gene_info <- getBM(attributes = c("ensembl_gene_id",
                                    "chromosome_name",
                                    "external_gene_name",
                                    "transcript_count",
                                    "description"),
                     filters = "ensembl_gene_id",
                     values = rownames(molecules_final),
                     mart = ensembl)
  # Cache the full table so later runs skip the network query.
  write.table(gene_info, row.names = FALSE,
              file = "../data/gene-info.txt", quote = FALSE, sep = "\t")
  # One-column lists (no header) of gene symbols and of Ensembl gene IDs.
  write.table(gene_info$external_gene_name, row.names = FALSE,
              col.names = FALSE,
              file = "../data/gene-info-symbol-only.txt", quote = FALSE,
              sep = "\t")
  write.table(gene_info$ensembl_gene_id, row.names = FALSE,
              col.names = FALSE,
              file = "../data/gene-info-ensg-only.txt", quote = FALSE,
              sep = "\t")
}
ensembl_gene_id chromosome_name external_gene_name transcript_count
1 ENSG00000000003 X TSPAN6 3
2 ENSG00000000005 X TNMD 2
3 ENSG00000000419 20 DPM1 7
4 ENSG00000000457 1 SCYL3 5
5 ENSG00000000460 1 C1orf112 10
6 ENSG00000001036 6 FUCA2 4
description
1 tetraspanin 6 [Source:HGNC Symbol;Acc:11858]
2 tenomodulin [Source:HGNC Symbol;Acc:17757]
3 dolichyl-phosphate mannosyltransferase polypeptide 1, catalytic subunit [Source:HGNC Symbol;Acc:3005]
4 SCY1-like 3 (S. cerevisiae) [Source:HGNC Symbol;Acc:19285]
5 chromosome 1 open reading frame 112 [Source:HGNC Symbol;Acc:25565]
6 fucosidase, alpha-L- 2, plasma [Source:HGNC Symbol;Acc:4008]
# Time a fresh BioMart download: t1 and t2 bracket loading biomaRt,
# connecting to the GRCh37 Ensembl mart, and fetching the annotation for the
# row names of molecules_final. Note that this overwrites gene_info.
t1 <- Sys.time()
library("biomaRt")
ensembl <- useMart(
  biomart = "ENSEMBL_MART_ENSEMBL",
  dataset = "hsapiens_gene_ensembl",
  host = "grch37.ensembl.org"
)
gene_info <- getBM(
  mart = ensembl,
  filters = "ensembl_gene_id",
  values = rownames(molecules_final),
  attributes = c(
    "ensembl_gene_id",
    "chromosome_name",
    "external_gene_name",
    "transcript_count",
    "description"
  )
)
t2 <- Sys.time()
Takes 21.0294459 to process 13058.
# Print the R version, platform, locale, and package versions of this session.
sessionInfo()
R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] grid stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] biomaRt_2.24.0 ggplot2_1.0.1 edgeR_3.10.2 limma_3.24.9
[5] dplyr_0.4.2 data.table_1.9.4 knitr_1.10.5
loaded via a namespace (and not attached):
[1] Rcpp_0.12.4 GenomeInfoDb_1.4.0 formatR_1.2
[4] plyr_1.8.3 bitops_1.0-6 tools_3.2.0
[7] digest_0.6.8 RSQLite_1.0.0 evaluate_0.7
[10] gtable_0.1.2 DBI_0.3.1 yaml_2.1.13
[13] parallel_3.2.0 proto_0.3-10 httr_0.6.1
[16] stringr_1.0.0 IRanges_2.2.4 S4Vectors_0.6.0
[19] stats4_3.2.0 Biobase_2.28.0 R6_2.1.1
[22] AnnotationDbi_1.30.1 XML_3.98-1.2 rmarkdown_0.6.1
[25] reshape2_1.4.1 magrittr_1.5 scales_0.4.0
[28] htmltools_0.2.6 MASS_7.3-40 BiocGenerics_0.14.0
[31] assertthat_0.1 colorspace_1.2-6 stringi_1.0-1
[34] RCurl_1.95-4.6 munsell_0.4.3 chron_2.3-45