npreg: trendfilter on normalized data

Data and packages
Evaluate data after standardizing expression
Fit trendfilter to the data after quantile normalization
Make a permuted distribution
Genes with the small PVE
Output summary statistics
Session information

Last updated: 2018-05-08

Code version: 85120ce

Data and packages

Packages

library(circular)
library(conicfit)
library(Biobase)
library(dplyr)
library(matrixStats)

library(NPCirc)
library(smashr)
library(genlasso)

Load data

df <- readRDS(file="../data/eset-final.rds")
pdata <- pData(df)
fdata <- fData(df)

# select endogeneous genes
counts <- exprs(df)[grep("ENSG", rownames(df)), ]

log2cpm.all <- t(log2(1+(10^6)*(t(counts)/pdata$molecules)))

macosko <- readRDS("../data/cellcycle-genes-previous-studies/rds/macosko-2015.rds")

log2cpm.all <- log2cpm.all[,order(pdata$theta)]
pdata <- pdata[order(pdata$theta),]

source("../code/utility.R")
source("../code/npreg/npreg.methods.R")

–

Evaluate data after standardizing expression

Map log2cpm expression to standard normal distribution. The transformation is non-linear. Sort N expression values from the largest to the smalles. General N standard normal random variable. For the non-zero expression value, find the correspnoding standard normal random variable that has the same quantile and subsitute the value with the corresponding normal random variable value. We then find the stanadard normal random variable values correspond to non-zero expression values and randomly assign these non-zero expression values to a standard normal random variable value.

For genes with relatively low fraction of deteted cells, this method allows to move the zero expression values closer to the non-zero expression value.
For genes with high fraction of undetected cells, this method creates a

log2cpm.quant <- do.call(rbind, mclapply(1:nrow(log2cpm.all), function(g) {
  yy <- log2cpm.all[g,]
  is.zero <- which(yy == 0)
  qq.map <- qqnorm(yy)
  yy.qq <- qq.map$x
  yy.qq[is.zero] <- sample(qq.map$x[is.zero])
  return(yy.qq)
}, mc.cores=8) )
rownames(log2cpm.quant) <- rownames(log2cpm.all)

saveRDS(log2cpm.quant, file = "../output/npreg-trendfilter-quantile.Rmd/log2cpm.quant.rds")

check genes with low/high fraction of undetected cells.

log2cpm.quant <- readRDS("../output/npreg-trendfilter-quantile.Rmd/log2cpm.quant.rds")

ii.high <- order(rowMeans(log2cpm.all > 0), decreasing = F)[1:5]
par(mfcol=c(3,5))
for (i in 1:5) {
  plot(log2cpm.all[ii.high[i],])
  plot(log2cpm.quant[ii.high[i],])
  plot(log2cpm.all[ii.high[i],], log2cpm.quant[ii.high[i],])
}

ii.low <- order(rowMeans(log2cpm.all > 0), decreasing = T)[1:5]
par(mfcol=c(3,5))
for (i in 1:5) {
  plot(log2cpm.all[ii.low[i],])
  plot(log2cpm.quant[ii.low[i],])
  plot(log2cpm.all[ii.low[i],], log2cpm.quant[ii.low[i],])
}

Check genes that we previously found to have cyclical patterns. This doesn’t distor genes previously found to have cyclical patterns.

cyclegenes <- readRDS("../output/npreg-methods.Rmd/cyclegenes.rds")

par(mfcol=c(3,5))
for (gene in colnames(cyclegenes)[2:6]) {
  ind <- rownames(log2cpm.all) == gene
  plot(log2cpm.all[ind,])
  plot(log2cpm.quant[ind,])
  plot(log2cpm.all[ind,], log2cpm.quant[ind,])
}

Fit trendfilter to the data after quantile normalization

sbatch code/npreg-trendfilter-quantile.Rmd trendfilter.1.sh

sbatch code/npreg-trendfilter-quantile.Rmd trendfilter.2.sh

sbatch code/npreg-trendfilter-quantile.Rmd trendfilter.3.sh

sbatch code/npreg-trendfilter-quantile.Rmd trendfilter.4.sh

sbatch code/npreg-trendfilter-quantile.Rmd trendfilter.5.sh

Split the data to five chunks by genes and submit the chunks to independent batch jobs.

log2cpm.quant <- readRDS("../output/npreg-trendfilter-quantile.Rmd/log2cpm.quant.rds")

saveRDS(log2cpm.quant[1:2200,], 
        "../output/npreg-trendfilter-quantile.Rmd/log2cpm.quant.1.rds")
saveRDS(log2cpm.quant[2201:4400,], 
        "../output/npreg-trendfilter-quantile.Rmd/log2cpm.quant.2.rds")
saveRDS(log2cpm.quant[4401:6600,], 
        "../output/npreg-trendfilter-quantile.Rmd/log2cpm.quant.3.rds")
saveRDS(log2cpm.quant[6601:8800,], 
        "../output/npreg-trendfilter-quantile.Rmd/log2cpm.quant.4.rds")
saveRDS(log2cpm.quant[8801:11040,], 
        "../output/npreg-trendfilter-quantile.Rmd/log2cpm.quant.5.rds")

fit.quant.1 <- readRDS("../output/npreg-trendfilter-quantile.Rmd/fit.trend.quant.1.rds")
fit.quant.2 <- readRDS("../output/npreg-trendfilter-quantile.Rmd/fit.trend.quant.2.rds")
fit.quant.3 <- readRDS("../output/npreg-trendfilter-quantile.Rmd/fit.trend.quant.3.rds")
fit.quant.4 <- readRDS("../output/npreg-trendfilter-quantile.Rmd/fit.trend.quant.4.rds")
fit.quant.5 <- readRDS("../output/npreg-trendfilter-quantile.Rmd/fit.trend.quant.5.rds")

fit.quant <- c(fit.quant.1,fit.quant.2,fit.quant.3,fit.quant.4,fit.quant.5)
saveRDS(fit.quant, 
        "../output/npreg-trendfilter-quantile.Rmd/fit.quant.rds")

consider PVE

fit.quant <- readRDS("../output/npreg-trendfilter-quantile.Rmd/fit.quant.rds")

pve <- sapply(fit.quant, "[[", "trend.pve")
summary(pve)

      Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
-0.0043839  0.0000798  0.0002166  0.0008961  0.0004973  0.3266288

select genes with the highest PVE

pve.genes <- names(pve)[order(c(pve), decreasing = T)[1:10]]

plot top 10

par(mfrow=c(2,5))
for (g in 1:length(pve.genes)) {
  ii.g <- which(names(fit.quant)==pve.genes[g])
  plot(log2cpm.quant[rownames(log2cpm.quant)==pve.genes[g]])
  points(fit.quant[[ii.g]]$trend.yy, pch=16, col = "blue", cex=.7)
}

quickily check top 100 enrichment for cell cycle genes.

enrich.order <- function(cutoffs, metrics, cyclegenes, allgenes) {
  #  out <- order(mad.ratio$smash.mad.ratio)
  # cutoffs <- c(100, 200, 300)
  cycle.rich <- sapply(cutoffs, function(x) {
    which_top <- order(metrics, decreasing = T)[1:x]
    sig.cycle <- sum(allgenes[which_top] %in% cyclegenes)/x
    non.cycle <- sum(allgenes[-which_top] %in% cyclegenes)/(length(allgenes)-x)
    cbind(as.numeric(sum(allgenes[which_top] %in% cyclegenes)), 
          sig.cycle/non.cycle)
  })
  colnames(cycle.rich) <- cutoffs
  rownames(cycle.rich) <- c("nsig.genes.cycle", "fold.sig.vs.nonsig.cycle")
  cycle.rich
}

enrich.order(cutoffs = c(100, 200, 300), 
             metrics = pve, cyclegenes = macosko$ensembl,
             allgenes = rownames(log2cpm.quant))

                              100       200   300
nsig.genes.cycle         54.00000 73.000000 86.00
fold.sig.vs.nonsig.cycle 12.78701  8.931377  7.16

Make a permuted distribution

Consider two genes, one with large fraction of undetected cells and one with small fraction of undeteted cells. See if the null distribution is similar.

sbatch code/npreg-trendfilter-quantile.Rmd trendfilter.perm.lowmisss.sh

sbatch code/npreg-trendfilter-quantile.Rmd trendfilter.perm.highmisss.sh

perm.lowmiss <- readRDS("../output/npreg-trendfilter-quantile.Rmd/fit.trend.perm.lowmiss.rds")

perm.highmiss <- readRDS("../output/npreg-trendfilter-quantile.Rmd/fit.trend.perm.highmiss.rds")

pve.perm.lowmiss <- sapply(perm.lowmiss, "[[", "trend.pve")
pve.perm.highmiss <- sapply(perm.highmiss, "[[", "trend.pve")

summary(pve.perm.lowmiss)

     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
3.700e-08 4.998e-05 1.388e-04 2.821e-04 3.382e-04 1.617e-02

summary(pve.perm.highmiss)

     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
1.340e-07 5.089e-05 1.389e-04 2.664e-04 3.403e-04 4.214e-03

Turns out the p-value based on permuted data is not the same for genes with low and high fraction of undetected cells.

par(mfrow=c(1,2))
hist(pve.perm.lowmiss, nclass=30,
     main = "Fraction undetected < 10%", xlab = "p-value")
hist(pve.perm.highmiss, nclass=30,
     main = "Fraction undetected > 80%", xlab = "p-value")

Compute p-value based on two different distributions. High consistency between the two.

B <- length(pve.perm.lowmiss)
pval.perm.low <- sapply(fit.quant, function(x) (1+sum(pve.perm.lowmiss > x$trend.pve))/(1+B))
pval.perm.high <- sapply(fit.quant, function(x) (1+sum(pve.perm.highmiss > x$trend.pve))/(1+B))

summary(pval.perm.low)

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
0.000999 0.163836 0.365634 0.415336 0.648352 1.000000

summary(pval.perm.high)

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
0.000999 0.150849 0.360639 0.411520 0.630370 1.000000

plot(pval.perm.low, pval.perm.high)

sum(pval.perm.high < .001)

[1] 278

sum(pval.perm.low < .001)

[1] 101

use permutated distribution based data with low missing value. more conservative

which.sig <- pval.perm.low < .001

enrich.sigval <- function(cutoffs, metrics, cyclegenes, allgenes) {
  #  out <- order(mad.ratio$smash.mad.ratio)
  # cutoffs <- c(100, 200, 300)
  cycle.rich <- sapply(cutoffs, function(x) {
    #which_top <- order(metrics, decreasing = T)[1:x]
    sig.cycle <- sum(allgenes[metrics < x] %in% cyclegenes)/sum(metrics < x)
    non.cycle <- sum(allgenes[metrics > x] %in% cyclegenes)/sum(metrics > x)
    cbind(sum(metrics < x), as.numeric(sum(allgenes[metrics < x] %in% cyclegenes)), 
          sig.cycle/non.cycle)
  })
  colnames(cycle.rich) <- cutoffs
  rownames(cycle.rich) <- c("nsig.genes", "nsig.genes.cycle", "fold.sig.vs.nonsig.cycle")
  cycle.rich
}


enrich.sigval(cutoffs = c(.001, .005, .01), metrics=pval.perm.low,
              cyclegenes = macosko$ensembl,
              allgenes = rownames(log2cpm.quant))

                             0.001      0.005       0.01
nsig.genes               101.00000 476.000000 553.000000
nsig.genes.cycle          54.00000  99.000000  99.000000
fold.sig.vs.nonsig.cycle  12.65925   5.268908   4.502205

Genes with the small PVE

out.stats <- data.frame(pve=sapply(fit.quant, "[[", "trend.pve"),
                        pval.perm=pval.perm.low,
                        row.names = rownames(log2cpm.quant))

out.stats.ordered <- out.stats[order(out.stats$pve, decreasing = T),]
out.stats.ordered.sig <- out.stats.ordered[out.stats.ordered$pval.perm < .001,]
  

library(biomaRt)
ensembl <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
symbols <- getBM(attributes = c("hgnc_symbol",'ensembl_gene_id'), 
      filters = c('ensembl_gene_id'),
      values = rownames(out.stats.ordered.sig), 
      mart = ensembl)
out.stats.ordered.sig$symbols <- symbols$hgnc_symbol[match(rownames(out.stats.ordered.sig),
                                                           symbols$ensembl_gene_id)]
out.stats.ordered.sig$symbols[is.na(out.stats.ordered.sig$symbols)] <- rownames(out.stats.ordered.sig)[is.na(out.stats.ordered.sig$symbols)]

sig.genes <- rownames(out.stats.ordered.sig)
par(mfrow=c(4,5)) 
for (g in 1:20) {
  ii.g <- which(rownames(log2cpm.quant) == sig.genes[g])
  plot(log2cpm.quant[ii.g,],
       main=out.stats.ordered.sig$symbols[g])
  points(fit.quant[[ii.g]]$trend.yy, col = "blue", pch=16, cex=.7)
}

Output summary statistics

saveRDS(out.stats.ordered.sig,
        "../output/npreg-trendfilter-quantile.Rmd/out.stats.ordered.sig.rds")

saveRDS(out.stats,
        "../output/npreg-trendfilter-quantile.Rmd/quant.stats.rds")


write.table(rownames(out.stats.ordered.sig),
            quote = F, col.names = F, row.names = F,
            file = "../output/npreg-trendfilter-quantile.Rmd/siggens.txt")

Session information

sessionInfo()

R version 3.4.1 (2017-06-30)
Platform: x86_64-redhat-linux-gnu (64-bit)
Running under: Scientific Linux 7.2 (Nitrogen)

Matrix products: default
BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] biomaRt_2.34.2      genlasso_1.3        igraph_1.2.1       
 [4] smashr_1.1-0        caTools_1.17.1      data.table_1.10.4-3
 [7] Matrix_1.2-10       wavethresh_4.6.8    MASS_7.3-47        
[10] ashr_2.2-7          Rcpp_0.12.16        NPCirc_2.0.1       
[13] matrixStats_0.53.1  dplyr_0.7.4         Biobase_2.38.0     
[16] BiocGenerics_0.24.0 conicfit_1.0.4      geigen_2.1         
[19] pracma_2.1.4        circular_0.4-93    

loaded via a namespace (and not attached):
 [1] httr_1.3.1           bit64_0.9-7          jsonlite_1.5        
 [4] foreach_1.4.4        shiny_1.0.5          assertthat_0.2.0    
 [7] stats4_3.4.1         blob_1.1.0           yaml_2.1.18         
[10] progress_1.1.2       slam_0.1-42          pillar_1.2.1        
[13] RSQLite_2.0          backports_1.1.2      lattice_0.20-35     
[16] glue_1.2.0           digest_0.6.15        htmltools_0.3.6     
[19] httpuv_1.3.6.2       XML_3.98-1.10        pkgconfig_2.0.1     
[22] misc3d_0.8-4         xtable_1.8-2         mvtnorm_1.0-7       
[25] git2r_0.21.0         tibble_1.4.2         IRanges_2.12.0      
[28] movMF_0.2-2          magrittr_1.5         mime_0.5            
[31] memoise_1.1.0        evaluate_0.10.1      doParallel_1.0.11   
[34] truncnorm_1.0-8      tools_3.4.1          prettyunits_1.0.2   
[37] stringr_1.3.0        S4Vectors_0.16.0     plotrix_3.7         
[40] AnnotationDbi_1.40.0 bindrcpp_0.2         compiler_3.4.1      
[43] rlang_0.2.0          grid_3.4.1           RCurl_1.95-4.10     
[46] iterators_1.0.9      htmlwidgets_1.0      crosstalk_1.0.0     
[49] bitops_1.0-6         rmarkdown_1.9        boot_1.3-19         
[52] codetools_0.2-15     curl_3.1             DBI_0.8             
[55] R6_2.2.2             knitr_1.20           bit_1.1-12          
[58] bindr_0.1.1          rprojroot_1.3-2      shape_1.4.4         
[61] stringi_1.1.7        pscl_1.5.2           SQUAREM_2017.10-1

This R Markdown site was created with workflowr