Last updated: 2016-07-08

Code version: 641fc1222ff5c5bc6b4766ad24ea2f5420334427

Objective

Get gene symbols for the final data set and save them in a text file for annotation references.

Set up

# Attach the packages used by this analysis. Order matters for name masking,
# so the original attach order is kept.
library("data.table")
library("dplyr")
library("limma")
library("edgeR")
library("ggplot2")
library("grid")
# Make the black-and-white ggplot2 theme (base font size 12) the default for
# any plot drawn after this point.
theme_set(theme_bw(base_size = 12))
# Evaluate functions.R from the current working directory.
# NOTE(review): presumably defines project helper functions -- its contents
# are not visible here; confirm which of them this report relies on.
source("functions.R")

Input annotation of QC-filtered data.

# Read the annotation table for the QC-filtered data; the first line of the
# file holds the column names and text columns stay character.
anno_filter <- read.table(
  file = "../data/annotation-filter.txt",
  stringsAsFactors = FALSE,
  header = TRUE
)

Import endogenous genes in the final file.

# Read the final molecule table. Its row names are used further down as the
# Ensembl gene IDs to annotate.
molecules_final <- read.table(
  file = "../data/molecules-final.txt",
  stringsAsFactors = FALSE,
  header = TRUE
)

Generate Gene IDs

# Build `gene_info`, the per-gene annotation table (Ensembl ID, chromosome,
# gene symbol, transcript count, description).
#
# If ../data/gene-info.txt already exists it is used as a cache and nothing is
# downloaded or written (in particular, the symbol-only and ENSG-only files
# are NOT regenerated in that case). Otherwise the annotation is fetched from
# the GRCh37 Ensembl archive with biomaRt and three tab-separated files are
# written: the full table, the gene symbols only, and the Ensembl IDs only.
if (file.exists("../data/gene-info.txt")) {
  # The cache is written below with quote = FALSE, so the free-text
  # description column is stored raw. Disable both quote and comment parsing
  # when reading it back: otherwise an apostrophe or a "#" inside a
  # description would corrupt or truncate that row.
  gene_info <- read.table(file = "../data/gene-info.txt", sep = "\t",
                          header = TRUE, stringsAsFactors = FALSE,
                          quote = "", comment.char = "")
  # Last expression of the branch, so the preview is printed in the report.
  head(gene_info)
} else {
  library("biomaRt")
  ensembl <- useMart(host = "grch37.ensembl.org",
                     biomart = "ENSEMBL_MART_ENSEMBL",
                     dataset = "hsapiens_gene_ensembl")

  # Query only the genes present in the final molecule table.
  gene_info <- getBM(attributes = c("ensembl_gene_id",
                                    "chromosome_name",
                                    "external_gene_name",
                                    "transcript_count",
                                    "description"),
                     filters = "ensembl_gene_id",
                     values = rownames(molecules_final),
                     mart = ensembl)

  # Full annotation table (with header); this is the cache file tested above.
  write.table(gene_info, row.names = FALSE,
              file = "../data/gene-info.txt", quote = FALSE, sep = "\t")

  # One gene symbol per line, no header.
  write.table(gene_info$external_gene_name,
              row.names = FALSE, col.names = FALSE,
              file = "../data/gene-info-symbol-only.txt",
              quote = FALSE, sep = "\t")

  # One Ensembl gene ID per line, no header.
  write.table(gene_info$ensembl_gene_id,
              row.names = FALSE, col.names = FALSE,
              file = "../data/gene-info-ensg-only.txt",
              quote = FALSE, sep = "\t")
}
  ensembl_gene_id chromosome_name external_gene_name transcript_count
1 ENSG00000000003               X             TSPAN6                3
2 ENSG00000000005               X               TNMD                2
3 ENSG00000000419              20               DPM1                7
4 ENSG00000000457               1              SCYL3                5
5 ENSG00000000460               1           C1orf112               10
6 ENSG00000001036               6              FUCA2                4
                                                                                            description
1                                                          tetraspanin 6 [Source:HGNC Symbol;Acc:11858]
2                                                            tenomodulin [Source:HGNC Symbol;Acc:17757]
3 dolichyl-phosphate mannosyltransferase polypeptide 1, catalytic subunit [Source:HGNC Symbol;Acc:3005]
4                                            SCY1-like 3 (S. cerevisiae) [Source:HGNC Symbol;Acc:19285]
5                                    chromosome 1 open reading frame 112 [Source:HGNC Symbol;Acc:25565]
6                                          fucosidase, alpha-L- 2, plasma [Source:HGNC Symbol;Acc:4008]

Runtime

# Time a fresh annotation download. The interval between t1 and t2 covers
# attaching biomaRt, connecting to the GRCh37 Ensembl mart and running the
# query for every row name of molecules_final.
t1 <- Sys.time()
library("biomaRt")
ensembl <- useMart(
  host = "grch37.ensembl.org",
  biomart = "ENSEMBL_MART_ENSEMBL",
  dataset = "hsapiens_gene_ensembl"
)

gene_info <- getBM(
  attributes = c(
    "ensembl_gene_id",
    "chromosome_name",
    "external_gene_name",
    "transcript_count",
    "description"
  ),
  filters = "ensembl_gene_id",
  values = rownames(molecules_final),
  mart = ensembl
)
t2 <- Sys.time()

Takes 21.0294459 to process 13058 genes.

Session information

# Print the R version, platform, locale and attached/loaded package versions
# used to produce this report.
sessionInfo()
R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
[1] biomaRt_2.24.0   ggplot2_1.0.1    edgeR_3.10.2     limma_3.24.9    
[5] dplyr_0.4.2      data.table_1.9.4 knitr_1.10.5    

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.4          GenomeInfoDb_1.4.0   formatR_1.2         
 [4] plyr_1.8.3           bitops_1.0-6         tools_3.2.0         
 [7] digest_0.6.8         RSQLite_1.0.0        evaluate_0.7        
[10] gtable_0.1.2         DBI_0.3.1            yaml_2.1.13         
[13] parallel_3.2.0       proto_0.3-10         httr_0.6.1          
[16] stringr_1.0.0        IRanges_2.2.4        S4Vectors_0.6.0     
[19] stats4_3.2.0         Biobase_2.28.0       R6_2.1.1            
[22] AnnotationDbi_1.30.1 XML_3.98-1.2         rmarkdown_0.6.1     
[25] reshape2_1.4.1       magrittr_1.5         scales_0.4.0        
[28] htmltools_0.2.6      MASS_7.3-40          BiocGenerics_0.14.0 
[31] assertthat_0.1       colorspace_1.2-6     stringi_1.0-1       
[34] RCurl_1.95-4.6       munsell_0.4.3        chron_2.3-45