Last updated: 2016-04-28

Code version: 75e23ae09f8128081e5241c204064c50b6625372

Objective

Examine the cell-cycle gene set that went into the cell-cycle score calculation.

Setup

source("functions.R")
require("limma")
Loading required package: limma
require("edgeR")
Loading required package: edgeR
require(ggplot2)
Loading required package: ggplot2
require(dplyr)
Loading required package: dplyr

Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
require(data.table)
Loading required package: data.table

Attaching package: 'data.table'

The following objects are masked from 'package:dplyr':

    between, last
library("cowplot")

Attaching package: 'cowplot'

The following object is masked from 'package:ggplot2':

    ggsave

Import data

## Annotation of the filtered data
anno_filter <- read.table("../data/annotation-filter.txt",
                         header = TRUE,
                         stringsAsFactors = FALSE)

## Molecule counts of the filtered data
molecules_filter <- read.table("../data/molecules-filter.txt",
                               header = TRUE,
                               stringsAsFactors = FALSE)

## Read counts of filtered data
reads_filter <- read.table("../data/reads-filter.txt",
                               header = TRUE,
                               stringsAsFactors = FALSE)

## Molecule counts after all corrections
molecules_final <- read.table("../data/molecules-final.txt",
                              header = TRUE,
                              stringsAsFactors = FALSE)

Cell-cycle assignemnt

Import cell-cycle information

cell_cycle_genes <- read.table("../data/cellcyclegenes.txt",
                               header = TRUE, sep = "\t",
                               stringsAsFactors = FALSE)
colnames(cell_cycle_genes) <- c("G1.S","S","G2","G2.M","M.G1")

Cell-phase assignment helper

cell_phase_assign <- function(cell_cycle_genes, molecules_final) {

  cell_phase_score <- sapply(cell_cycle_genes, function(xx){
    ## create table of each phase
    molecules_phase <- molecules_final[rownames(molecules_final) %in% unlist(xx) ,]
  
    ## add average count of all genes in the phase
    combined_matrix <- rbind(molecules_phase, average = apply(molecules_phase,2,mean) )
    
    ## use transpose to compute cor matrix
    cor_matrix <- cor(t(combined_matrix))
    
    ## take the numbers
    cor_vector <- cor_matrix[,dim(cor_matrix)[1]]
    
    ## restrict to correlation >= 0.3 
    molecules_phase_restricted <- molecules_phase[rownames(molecules_phase) %in% names(cor_vector[cor_vector >= 0.3]),]
    
    ## output the phase specific scores (mean of normalized expression levels in the phase)
    apply(molecules_phase_restricted, 2, mean)
  })

  ## Two-step normalization (by gene and by cell)
  ## by row (gene)
  row_mean <- apply(cell_phase_score, 1, mean)
  row_sd   <- apply(cell_phase_score, 1, sd)
  score_row_normed <- do.call(rbind, 
      lapply(1:dim(cell_phase_score)[1], function(i) {
          (cell_phase_score[i,] - row_mean[i])/row_sd[i]
      })  )
  
  ## by column
  col_mean <- apply(score_row_normed, 2, mean)
  col_sd   <- apply(score_row_normed, 2, sd)
  score_final_normed <- do.call(cbind, 
      lapply(1:dim(score_row_normed)[2], function(i) {
          (score_row_normed[, i] - col_mean[i])/col_sd[i]
      })
  )
  return(score_final_normed)
}

Assign phase

cell_phase_before <- apply(
      cell_phase_assign(cell_cycle_genes, molecules_filter), 
      1, function(x) colnames(cell_cycle_genes)[which.max(x)])

cell_phase_after <- apply(
      cell_phase_assign(cell_cycle_genes, molecules_final), 
      1, function(x) colnames(cell_cycle_genes)[which.max(x)])

phase_order <- c("G1.S","S","G2","G2.M","M.G1")
cell_phase_before <- factor(cell_phase_before, 
                            levels = phase_order)
cell_phase_after <- factor(cell_phase_after,
                           levels = phase_order)

Average total number of molecules of the cells assigned to each phase

cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")

data_to_plot <- anno_filter
data_to_plot$cell_phase_before <- cell_phase_before
data_to_plot$cell_phase_after <- cell_phase_after

## Calculate the total molecule counts using molecule filter data
data_to_plot$total_molecules_before <- apply(molecules_filter, 2, sum)
data_to_plot$total_molecules_after <- apply(molecules_filter, 2, sum)

## Calculate the total read counts using read filter data
data_to_plot$total_reads_before <- apply(reads_filter, 2, sum)
data_to_plot$total_reads_after <- apply(reads_filter, 2, sum)

data_to_plot$individual <- factor(data_to_plot$individual)
data_to_plot$replicate <- factor(data_to_plot$replicate)

data_before <- summarise(group_by(data_to_plot, cell_phase_before, individual, replicate),
                         mean_total_molecules = mean(total_molecules_before),
                         mean_total_reads = mean(total_reads_before),
                         cell_number = length(cell_phase_before),
                         sem_total_molecules = sd(total_molecules_before)/sqrt(cell_number),
                         sem_total_reads = sd(total_reads_before)/sqrt(cell_number))
data_after <- summarise(group_by(data_to_plot, cell_phase_after, individual, replicate),
                        mean_total_molecules = mean(total_molecules_after), 
                        mean_total_reads = mean(total_reads_after),
                        cell_number = length(cell_phase_after),
                        sem_total_molecules = sd(total_molecules_after)/sqrt(cell_number),
                        sem_total_reads = sd(total_reads_after)/sqrt(cell_number))
stopifnot(sum(data_before$cell_number) == sum(data_after$cell_number))

Before correction

cycle_plot_before <- ggplot(data_before, aes(x = as.factor(cell_phase_before), 
                                             y = mean_total_molecules,
                                             color = replicate,
                                             group = replicate) ) + 
                        geom_point() + geom_line() +
                        geom_errorbar(aes(ymin = mean_total_molecules - sem_total_molecules,
                                          ymax = mean_total_molecules + sem_total_molecules), 
                                          width = 0, col = "grey") +
                        scale_x_discrete(limits=phase_order, labels=c("G1/S","S","G2","G2/M","M/G1")) + 
                        theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5)) + 
                        xlab("Cell cycle phase") + ylab("Total molecule counts (mean +/- sem)") +
                        ggtitle("Before correction")

cycle_plot_before_reads <- ggplot(data_before, aes(x = as.factor(cell_phase_before), 
                                             y = mean_total_reads,
                                             color = replicate,
                                             group = replicate) ) + 
                        geom_point() + geom_line() +
                        geom_errorbar(aes(ymin = mean_total_reads - sem_total_reads,
                                          ymax = mean_total_reads + sem_total_reads), 
                                          width = 0, col = "grey") +
                        scale_x_discrete(limits=phase_order, labels=c("G1/S","S","G2","G2/M","M/G1")) + 
                        theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5)) + 
                        xlab("Cell cycle phase") + ylab("Total read counts (mean +/- sem)") +
                        ggtitle("Before correction")

cell_number_before <- ggplot(data_before, aes(x = as.factor(cell_phase_before), 
                                             y = cell_number,
                                             color = replicate,
                                             group = replicate) ) + 
                        geom_point() + geom_line() +
                        scale_x_discrete(limits=phase_order, labels=c("G1/S","S","G2","G2/M","M/G1")) + 
                        theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5)) + 
                        xlab("Cell cycle phase") + ylab("Cell number") +
                        ggtitle("Before correction")

After correction

cycle_plot_after <- ggplot(data_after, aes(x = as.factor(cell_phase_after), 
                                             y = mean_total_molecules,
                                             color = replicate,
                                             group = replicate) ) + 
                        geom_point() + geom_line() +
                        geom_errorbar(aes(ymin = mean_total_molecules - sem_total_molecules,
                                          ymax = mean_total_molecules + sem_total_molecules), 
                                          width = 0, col = "grey") +
                        scale_x_discrete(limits=phase_order, labels=c("G1/S","S","G2","G2/M","M/G1")) + 
                        theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5)) + 
                        xlab("Cell cycle phase") + ylab("Total molecule counts (mean +/- sem)") +
                        ggtitle("Post correction")

cycle_plot_after_reads <- ggplot(data_after, aes(x = as.factor(cell_phase_after), 
                                             y = mean_total_reads,
                                             color = replicate,
                                             group = replicate) ) + 
                        geom_point() + geom_line() +
                        geom_errorbar(aes(ymin = mean_total_reads - sem_total_reads,
                                          ymax = mean_total_reads + sem_total_reads), 
                                          width = 0, col = "grey") +
                        scale_x_discrete(limits=phase_order, labels=c("G1/S","S","G2","G2/M","M/G1")) + 
                        theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5)) + 
                        xlab("Cell cycle phase") + ylab("Total read counts (mean +/- sem)") +
                        ggtitle("Post correction")

cell_number_after <- ggplot(data_after, aes(x = as.factor(cell_phase_after), 
                                             y = cell_number,
                                             color = replicate,
                                             group = replicate) ) + 
                        geom_point() + geom_line() +
                        scale_x_discrete(limits=phase_order, labels=c("G1/S","S","G2","G2/M","M/G1")) + 
                        theme(axis.text.x = element_text(angle = 90, hjust = 0.9, vjust = 0.5)) + 
                        xlab("Cell cycle phase") + ylab("Cell number") +
                        ggtitle("Post correction")
theme_set(theme_bw(base_size = 12))
theme_update(panel.grid.minor.x = element_blank(),
             panel.grid.minor.y = element_blank(),
             panel.grid.major.x = element_blank(),
             panel.grid.major.y = element_blank())
plot_grid(cycle_plot_before + facet_grid(~ individual) + theme(legend.position = c(.25,.3)) + labs (col = ""),
          cycle_plot_after + facet_grid(~ individual) + theme(legend.position = "none"),
          cycle_plot_before_reads + facet_grid(~ individual) + theme(legend.position = "none"),
          cycle_plot_after_reads + facet_grid(~ individual) + theme(legend.position = "none"),
          cell_number_before + facet_grid(~ individual) + theme(legend.position = "none"),
          cell_number_after + facet_grid(~ individual) + theme(legend.position = "none"),
          nrow = 3,
          labels = LETTERS[1:6])

Session information

sessionInfo()
R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] cowplot_0.3.1    data.table_1.9.4 dplyr_0.4.2      ggplot2_1.0.1   
[5] edgeR_3.10.2     limma_3.24.9     knitr_1.10.5    

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.0      magrittr_1.5     MASS_7.3-40      munsell_0.4.2   
 [5] colorspace_1.2-6 R6_2.1.1         stringr_1.0.0    httr_0.6.1      
 [9] plyr_1.8.3       tools_3.2.0      parallel_3.2.0   grid_3.2.0      
[13] gtable_0.1.2     DBI_0.3.1        htmltools_0.2.6  lazyeval_0.1.10 
[17] assertthat_0.1   yaml_2.1.13      digest_0.6.8     reshape2_1.4.1  
[21] formatR_1.2      bitops_1.0-6     RCurl_1.95-4.6   evaluate_0.7    
[25] rmarkdown_0.6.1  labeling_0.3     stringi_0.4-1    scales_0.2.4    
[29] chron_2.3-45     proto_0.3-10