Filter genes

Last updated: 2018-07-01

Code version: e4eb9e9

Setup

library("cowplot")
library("dplyr")
library("edgeR")
library("ggplot2")
library("knitr")
theme_set(theme_cowplot())
source("../code/functions.R")
library("Biobase") # has to be loaded last to use `combine`

Import data.

eset <- readRDS("../data/eset.rds")
dim(eset)

Features  Samples 
   54616     7584

Filter

Low QC sample

We performed quality control to identify low quality single cells. Only keep high-quality single cells.

quality <- read.table("../data/quality-single-cells.txt", stringsAsFactors = FALSE)
colnames(quality) <- c("sample", "quality")
eset <- eset[, quality$quality]
dim(eset)

Features  Samples 
   54616     5597

Over-expressed gene

We would like to remove genes which have greater than 4096 (44444*4) molecules in any given single cell. These are above our theoretical maximum number of 6-bp UMIs (it can happen when a highly expressed gene as multiple start sites). We did not find any genes with molecules greater than 4096. We found r length(overexpressed_genes_1024) mitochondrial genes which have molecules larger than 1024

## look at human genes
eset_hs <- eset[fData(eset)$source == "H. sapiens", ]
dim(eset_hs)

Features  Samples 
   20151     5597

head(featureNames(eset_hs))

[1] "ENSG00000000003" "ENSG00000000005" "ENSG00000000419" "ENSG00000000457"
[5] "ENSG00000000460" "ENSG00000000938"

## identify gene with molecule > 4096
overexpressed_rows <- apply(exprs(eset_hs), 1, function(x) any(x >= 4096))
overexpressed_genes <- rownames(exprs(eset_hs))[overexpressed_rows]
overexpressed_genes

character(0)

## identify gene with molecule > 1024
overexpressed_rows_1024 <- apply(exprs(eset_hs), 1, function(x) any(x >= 1024))
overexpressed_genes_1024 <- rownames(exprs(eset_hs))[overexpressed_rows]
overexpressed_genes_1024

character(0)

Lowly expressed

We identify the lower cutoff using the mean log2 molecule counts per million (cpm) in the high quality single cells.

## convert to log2 cpm
mol_hs_cpm <- cpm(exprs(eset_hs), log = TRUE)
mol_hs_cpm_means <- rowMeans(mol_hs_cpm)
summary(mol_hs_cpm_means)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  2.306   2.354   2.974   3.746   4.657  12.940

hist(mol_hs_cpm_means, xlab = "Mean log2 molecule cpm in single cell samples",
     ylab = "Number of genes", main = "Identifying expression cutoff")

## set a cutoff
lower_exp_cutoff <- 3
abline(v = lower_exp_cutoff, col = "red")

genes_pass_filter <- rownames(mol_hs_cpm)[mol_hs_cpm_means > lower_exp_cutoff]

9957 genes have a mean log2 molecule cpm greater than 3

mol_hs_cpm <- data.frame(mol_hs_cpm)
mol_hs_cpm$mean <- apply(mol_hs_cpm, 1, mean)
mol_hs_cpm$filter <- mol_hs_cpm$mean > lower_exp_cutoff
write.table(data.frame(row.names(mol_hs_cpm), mol_hs_cpm[,"filter"]),
            file = "../data/genes-pass-filter.txt", quote = FALSE,
            sep = "\t", row.names = FALSE, col.names = FALSE)

Session information

sessionInfo()

R version 3.4.1 (2017-06-30)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Scientific Linux 7.4 (Nitrogen)

Matrix products: default
BLAS: /project2/gilad/jdblischak/miniconda3/envs/scqtl/lib/R/lib/libRblas.so
LAPACK: /project2/gilad/jdblischak/miniconda3/envs/scqtl/lib/R/lib/libRlapack.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  methods   stats     graphics  grDevices utils     datasets 
[8] base     

other attached packages:
[1] Biobase_2.38.0      BiocGenerics_0.24.0 knitr_1.20         
[4] edgeR_3.20.1        limma_3.34.1        dplyr_0.7.4        
[7] cowplot_0.9.1       ggplot2_2.2.1      

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.13     bindr_0.1        magrittr_1.5     munsell_0.4.3   
 [5] lattice_0.20-34  colorspace_1.3-2 R6_2.2.0         rlang_0.1.2     
 [9] stringr_1.2.0    plyr_1.8.4       tools_3.4.1      grid_3.4.1      
[13] gtable_0.2.0     git2r_0.19.0     htmltools_0.3.6  assertthat_0.1  
[17] yaml_2.1.14      lazyeval_0.2.0   rprojroot_1.2    digest_0.6.12   
[21] tibble_1.3.3     bindrcpp_0.2     glue_1.1.1       evaluate_0.10.1 
[25] rmarkdown_1.8    stringi_1.1.2    compiler_3.4.1   scales_0.5.0    
[29] backports_1.0.5  locfit_1.5-9.1   pkgconfig_2.0.1

This R Markdown site was created with workflowr