Last updated: 2018-02-07

Code version: ecbb5f4


Overview/Results

Goal: What are the expression profiles of the sorted cells in Leng et al. 2015?

Method: Compute phase-specific score Macoscko et al. 2015 using the genes identified as cell cycle genes and also assigned to different cell cycle phases in Macosko et al. 2015. Hence the genes and the samples evaluated are selected from two independent datasets.

Results:

* 120 genes were identified as variable along cell cycle phase. 

* G1 cells scored the highest in G1 and also high in M phase. S cells scored the highest in S, while G2 cells scored the highest in G2. Importantly, there appears to be a pattern in average expression levels within each cell state (fucci-assigned). G1 cells peaks at M/G1 and G1/S, S cells peaks at S, while G2 cells peaks at G2. 

* Compared to our data ([here](https://jdblischak.github.io/fucci-seq/images-classify-fucci.html)): Across individuals, cells classified to S and G2 phases scored the highest in the corresponding phase score. However, cells classifed as G1 phase score high in S phase for some individuals. 

What's clear is that phase-specific scores along are not adequate in predicting cell cycle phase. 

Data and packages

Packages

library(Biobase)
library(ggplot2)
library(cowplot)
library(data.table)
library(tidyr)
library(ccRemover)

Load Leng et al. 2015 data from GEO - already filtered for low-quality samples, and count data was median-normalized.

HumanLengESC <- readRDS(file = "../data/rnaseq-previous-studies/HumanLengESC.rds")

counts <- exprs(HumanLengESC)
libsize <- colSums(counts)
cpm <- t(t(counts)*(10^6)/libsize)
log2cpm <- log2(cpm+1)
pdata <- pData(HumanLengESC)
table(pData(HumanLengESC)$cell_state)

 G1  G2  H1   S 
 91  76 213  80 

select fucci-expression cells

counts <- counts[,pdata$cell_state != "H1"]
cpm <- cpm[,pdata$cell_state != "H1"]
log2cpm <- log2cpm[,pdata$cell_state != "H1"]
pdata <- pdata[pdata$cell_state != "H1", ]
pdata$cell_state <- droplevels(pdata$cell_state)

filter genes

genes_to_include <- which(rowMeans(cpm)>1)
log2cpm <- log2cpm[genes_to_include,]

subset to include genes that are annotated as cell cycle genes (according to ccRemover)

ccremover <- readRDS("../data/cellcycle-genes-previous-studies/rds/macosko-2015.rds")
which_ccremover <- gene_indexer(rownames(log2cpm), species="human", name_type="symbol")

log2cpm_ccremover <- log2cpm[which_ccremover, ]
cpm_ccremover <- cpm[which_ccremover, ]
counts_ccremover <- counts[which_ccremover, ]

subset to include genes in Macosko data

macosko <- readRDS("../data/cellcycle-genes-previous-studies/rds/macosko-2015.rds")
which_macosko <- which(rownames(log2cpm) %in% macosko$hgnc)

log2cpm_macosko <- log2cpm[which_macosko, ]
cpm_macosko <- cpm[which_macosko, ]
counts_macosko <- counts[which_macosko, ]
ccgenes_macosko <- macosko[which(macosko$hgnc %in% rownames(log2cpm_macosko)), ]

compute phase-specific scores

cc_scores <- lapply(1:uniqueN(ccgenes_macosko$phase), function(i) {
  ph <- unique(ccgenes_macosko$phase)[i]
  df_sub <- log2cpm_macosko[rownames(log2cpm_macosko) %in% ccgenes_macosko$hgnc[ccgenes_macosko$phase == ph],]
  mn <- colMeans(df_sub)
  cc <- cor(t(rbind(mn, df_sub)))
  cc_mean <- cc[-1,1]
  genes_cc <- names(cc_mean)[which(cc_mean > .3)]
  scores_raw <- colMeans(df_sub[rownames(df_sub) %in% genes_cc,])
  scores_z <- scale(scores_raw)
  return(list(scores_z=scores_z, ngenes = length(genes_cc)))
})
names(cc_scores) <- unique(ccgenes_macosko$phase)

ngenes <- sapply(cc_scores, function(x) x[[2]])
print(ngenes)
   S G1/S M/G1    M   G2 
  10    8   27   43   32 
scores <- do.call(cbind, lapply(cc_scores, function(x) x[[1]]))
colnames(scores) <- unique(ccgenes_macosko$phase)

scores_z <- t(apply(scores, 1, scale))
colnames(scores_z) <- unique(ccgenes_macosko$phase)
scores_z <- as.data.frame(scores_z)

scores_z_long <- gather(scores_z, key=phase, value=scores)
scores_z_long$sample_id <- rep(rownames(scores_z), ncol(scores_z))

scores_z_long$cell_state <- pdata$cell_state[match(scores_z_long$sample_id, rownames(pdata))]

Analysis


Session information

R version 3.4.1 (2017-06-30)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Scientific Linux 7.4 (Nitrogen)

Matrix products: default
BLAS: /home/joycehsiao/miniconda3/envs/fucci-seq/lib/R/lib/libRblas.so
LAPACK: /home/joycehsiao/miniconda3/envs/fucci-seq/lib/R/lib/libRlapack.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
[1] ccRemover_1.0.4     tidyr_0.7.1         data.table_1.10.4  
[4] cowplot_0.9.1       ggplot2_2.2.1       Biobase_2.38.0     
[7] BiocGenerics_0.24.0

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.14     knitr_1.17       magrittr_1.5     munsell_0.4.3   
 [5] colorspace_1.3-2 rlang_0.1.4.9000 stringr_1.2.0    plyr_1.8.4      
 [9] tools_3.4.1      grid_3.4.1       gtable_0.2.0     git2r_0.19.0    
[13] htmltools_0.3.6  yaml_2.1.16      lazyeval_0.2.0   rprojroot_1.2   
[17] digest_0.6.12    tibble_1.3.3     purrr_0.2.2      glue_1.1.1      
[21] evaluate_0.10.1  rmarkdown_1.8    labeling_0.3     stringi_1.1.2   
[25] compiler_3.4.1   scales_0.5.0     backports_1.0.5 

This R Markdown site was created with workflowr