Last updated: 2015-06-30

Code version: bbd04becabb24228433a9d432c2f4d9dd5976bf1

Input

library("dplyr")
library("ggplot2")
theme_set(theme_bw(base_size = 16))
library("edgeR")
library("gplots")

Input annotation.

anno <- read.table("../data/annotation.txt", header = TRUE,
                   stringsAsFactors = FALSE)
head(anno)
  individual batch well     sample_id
1      19098     1  A01 NA19098.1.A01
2      19098     1  A02 NA19098.1.A02
3      19098     1  A03 NA19098.1.A03
4      19098     1  A04 NA19098.1.A04
5      19098     1  A05 NA19098.1.A05
6      19098     1  A06 NA19098.1.A06

Input read counts.

reads <- read.table("../data/reads.txt", header = TRUE,
                    stringsAsFactors = FALSE)

Input molecule counts.

molecules <- read.table("../data/molecules.txt", header = TRUE,
                    stringsAsFactors = FALSE)

Input list of quality single cells.

quality_single_cells <- scan("../data/quality-single-cells.txt",
                             what = "character")

Keep only the single cells that passed the QC filters and the bulk samples.

molecules <- molecules[, grepl("bulk", colnames(molecules)) |
                         colnames(molecules) %in% quality_single_cells]
anno <- anno[anno$well == "bulk" | anno$sample_id %in% quality_single_cells, ]
stopifnot(ncol(molecules) == nrow(anno),
          colnames(molecules) == anno$sample_id)

reads <- reads[, grepl("bulk", colnames(reads)) |
                         colnames(reads) %in% quality_single_cells]
stopifnot(ncol(reads) == nrow(anno),
          colnames(reads) == anno$sample_id)

Remove genes with zero read counts in the single cells or bulk samples.

expressed <- rowSums(molecules[, anno$well == "bulk"]) > 0 &
             rowSums(molecules[, anno$well != "bulk"]) > 0
molecules <- molecules[expressed, ]
dim(molecules)
[1] 17208   641
expressed <- rowSums(reads[, anno$well == "bulk"]) > 0 &
             rowSums(reads[, anno$well != "bulk"]) > 0
reads <- reads[expressed, ]
dim(reads)
[1] 17218   641

Split the bulk and single samples.

molecules_bulk <- molecules[, anno$well == "bulk"]
molecules_single <- molecules[, anno$well != "bulk"]
reads_bulk <- reads[, anno$well == "bulk"]
reads_single <- reads[, anno$well != "bulk"]

Remove genes with max molecule numer larger than 1024

molecules_single <- molecules_single[apply(molecules_single,1,max) < 1024,]

variance between batches (C1 prep) by molecules

See if batches from the same individual cluster together

# create table using the molecule counts
group_list <- c("19","19098","19101","19239","19098.1","19098.2","19098.3","19101.1","19101.2","19101.3","19239.1","19239.2","19239.3")

### create a function to generate tables of mean,
create_info_table <- function(statistics="mean"){
  # first correct
  molecules.crt <- -1024*log(1-molecules_single/1024)
  # loop all possible combinations
  big_table <- do.call(cbind,lapply(group_list,function(x){
      # subset data
      data_ref <- molecules.crt[grep(x,names(molecules.crt))]
      if(statistics=="mean"){
        ans <- apply(data_ref,1,function(xx) mean(xx,na.rm=TRUE))  
      }
      if(statistics=="var"){
        ans <- apply(data_ref,1,function(xx) var(xx,na.rm=TRUE))  
      }
      if(statistics=="CV"){
        ans <- apply(data_ref,1,function(xx)  sd(xx,na.rm=TRUE)) / apply(data_ref,1,function(xx)  mean(xx,na.rm=TRUE))   
      }
    ans
    })
  )
    big_table <- data.frame(big_table)
    names(big_table) <- paste(statistics,group_list,sep="_")
    big_table$gene_name <- rownames(molecules_single)
    big_table
}

# create big table for mean, var and cv
table_mean <- create_info_table(statistics="mean")
table_var <- create_info_table(statistics="var")
table_cv <- create_info_table(statistics="CV")

# replace 0 as NA
table_mean[table_mean==0] <- NA
table_var[table_mean==0] <- NA
table_cv[table_mean==0] <- NA

# calculate the correlation
cor(table_mean[,2:13],use="pairwise.complete.obs")
             mean_19098 mean_19101 mean_19239 mean_19098.1 mean_19098.2
mean_19098    1.0000000  0.9852974  0.9789826    0.9890853    0.9659667
mean_19101    0.9852974  1.0000000  0.9879295    0.9856922    0.9254189
mean_19239    0.9789826  0.9879295  1.0000000    0.9809677    0.9166417
mean_19098.1  0.9890853  0.9856922  0.9809677    1.0000000    0.9205102
mean_19098.2  0.9659667  0.9254189  0.9166417    0.9205102    1.0000000
mean_19098.3  0.9897683  0.9898391  0.9844379    0.9886098    0.9265197
mean_19101.1  0.9813595  0.9958857  0.9822822    0.9863789    0.9160428
mean_19101.2  0.9750097  0.9907745  0.9791731    0.9665583    0.9267378
mean_19101.3  0.9824474  0.9963364  0.9858818    0.9852461    0.9191901
mean_19239.1  0.9751169  0.9836778  0.9961752    0.9762399    0.9141762
mean_19239.2  0.9819749  0.9906238  0.9966978    0.9837043    0.9190592
mean_19239.3  0.9695992  0.9791607  0.9958848    0.9726513    0.9065916
             mean_19098.3 mean_19101.1 mean_19101.2 mean_19101.3
mean_19098      0.9897683    0.9813595    0.9750097    0.9824474
mean_19101      0.9898391    0.9958857    0.9907745    0.9963364
mean_19239      0.9844379    0.9822822    0.9791731    0.9858818
mean_19098.1    0.9886098    0.9863789    0.9665583    0.9852461
mean_19098.2    0.9265197    0.9160428    0.9267378    0.9191901
mean_19098.3    1.0000000    0.9858418    0.9789881    0.9876076
mean_19101.1    0.9858418    1.0000000    0.9771253    0.9903586
mean_19101.2    0.9789881    0.9771253    1.0000000    0.9820900
mean_19101.3    0.9876076    0.9903586    0.9820900    1.0000000
mean_19239.1    0.9802797    0.9777765    0.9777544    0.9792898
mean_19239.2    0.9882042    0.9868162    0.9795495    0.9883365
mean_19239.3    0.9747475    0.9722657    0.9698328    0.9794403
             mean_19239.1 mean_19239.2 mean_19239.3
mean_19098      0.9751169    0.9819749    0.9695992
mean_19101      0.9836778    0.9906238    0.9791607
mean_19239      0.9961752    0.9966978    0.9958848
mean_19098.1    0.9762399    0.9837043    0.9726513
mean_19098.2    0.9141762    0.9190592    0.9065916
mean_19098.3    0.9802797    0.9882042    0.9747475
mean_19101.1    0.9777765    0.9868162    0.9722657
mean_19101.2    0.9777544    0.9795495    0.9698328
mean_19101.3    0.9792898    0.9883365    0.9794403
mean_19239.1    1.0000000    0.9913432    0.9865966
mean_19239.2    0.9913432    1.0000000    0.9884642
mean_19239.3    0.9865966    0.9884642    1.0000000
cor(table_var[,2:13],use="pairwise.complete.obs")
            var_19098 var_19101 var_19239 var_19098.1 var_19098.2
var_19098   1.0000000 0.4457268 0.4937791   0.4976387   0.9618056
var_19101   0.4457268 1.0000000 0.8782272   0.8620967   0.5626504
var_19239   0.4937791 0.8782272 1.0000000   0.8879291   0.6283549
var_19098.1 0.4976387 0.8620967 0.8879291   1.0000000   0.5491121
var_19098.2 0.9618056 0.5626504 0.6283549   0.5491121   1.0000000
var_19098.3 0.5033991 0.8042806 0.9520294   0.9051561   0.6058795
var_19101.1 0.4593402 0.8919549 0.8764965   0.8894759   0.5661045
var_19101.2 0.4646436 0.8613854 0.9552923   0.8151788   0.6148337
var_19101.3 0.3819232 0.9552264 0.7674670   0.7448572   0.4888928
var_19239.1 0.4792729 0.7715627 0.9604927   0.8367994   0.6036163
var_19239.2 0.4940503 0.9313876 0.9769478   0.9248669   0.6164329
var_19239.3 0.4684518 0.8392077 0.9805642   0.8127816   0.6212588
            var_19098.3 var_19101.1 var_19101.2 var_19101.3 var_19239.1
var_19098     0.5033991   0.4593402   0.4646436   0.3819232   0.4792729
var_19101     0.8042806   0.8919549   0.8613854   0.9552264   0.7715627
var_19239     0.9520294   0.8764965   0.9552923   0.7674670   0.9604927
var_19098.1   0.9051561   0.8894759   0.8151788   0.7448572   0.8367994
var_19098.2   0.6058795   0.5661045   0.6148337   0.4888928   0.6036163
var_19098.3   1.0000000   0.7998802   0.9139139   0.7084104   0.9612885
var_19101.1   0.7998802   1.0000000   0.8005423   0.7231277   0.7723457
var_19101.2   0.9139139   0.8005423   1.0000000   0.7814277   0.9411226
var_19101.3   0.7084104   0.7231277   0.7814277   1.0000000   0.6693945
var_19239.1   0.9612885   0.7723457   0.9411226   0.6693945   1.0000000
var_19239.2   0.9263464   0.9209996   0.9271934   0.8234919   0.9068348
var_19239.3   0.9208955   0.8273502   0.9459670   0.7373678   0.9351080
            var_19239.2 var_19239.3
var_19098     0.4940503   0.4684518
var_19101     0.9313876   0.8392077
var_19239     0.9769478   0.9805642
var_19098.1   0.9248669   0.8127816
var_19098.2   0.6164329   0.6212588
var_19098.3   0.9263464   0.9208955
var_19101.1   0.9209996   0.8273502
var_19101.2   0.9271934   0.9459670
var_19101.3   0.8234919   0.7373678
var_19239.1   0.9068348   0.9351080
var_19239.2   1.0000000   0.9414995
var_19239.3   0.9414995   1.0000000
cor(table_cv[,2:13],use="pairwise.complete.obs")
            CV_19098  CV_19101  CV_19239 CV_19098.1 CV_19098.2 CV_19098.3
CV_19098   1.0000000 0.8525950 0.8203035  0.9196700  0.9016564  0.8797393
CV_19101   0.8525950 1.0000000 0.8380438  0.8306933  0.8270444  0.8008779
CV_19239   0.8203035 0.8380438 1.0000000  0.8016100  0.8027402  0.7844651
CV_19098.1 0.9196700 0.8306933 0.8016100  1.0000000  0.8247774  0.8050655
CV_19098.2 0.9016564 0.8270444 0.8027402  0.8247774  1.0000000  0.8071659
CV_19098.3 0.8797393 0.8008779 0.7844651  0.8050655  0.8071659  1.0000000
CV_19101.1 0.8149290 0.9196720 0.8004435  0.8230621  0.8245842  0.7883248
CV_19101.2 0.8148629 0.9139575 0.8105231  0.8100857  0.8171736  0.8066706
CV_19101.3 0.8194759 0.8972533 0.8210178  0.8327272  0.8342135  0.8183561
CV_19239.1 0.7723207 0.7940978 0.8878040  0.7877285  0.7909112  0.7842425
CV_19239.2 0.7932015 0.8078961 0.9014403  0.7997803  0.8038477  0.7877058
CV_19239.3 0.7850551 0.8094105 0.9155776  0.7864117  0.7918676  0.7829476
           CV_19101.1 CV_19101.2 CV_19101.3 CV_19239.1 CV_19239.2
CV_19098    0.8149290  0.8148629  0.8194759  0.7723207  0.7932015
CV_19101    0.9196720  0.9139575  0.8972533  0.7940978  0.8078961
CV_19239    0.8004435  0.8105231  0.8210178  0.8878040  0.9014403
CV_19098.1  0.8230621  0.8100857  0.8327272  0.7877285  0.7997803
CV_19098.2  0.8245842  0.8171736  0.8342135  0.7909112  0.8038477
CV_19098.3  0.7883248  0.8066706  0.8183561  0.7842425  0.7877058
CV_19101.1  1.0000000  0.8328967  0.8503940  0.7923881  0.8108956
CV_19101.2  0.8328967  1.0000000  0.8499119  0.8018922  0.8119385
CV_19101.3  0.8503940  0.8499119  1.0000000  0.8103150  0.8269014
CV_19239.1  0.7923881  0.8018922  0.8103150  1.0000000  0.8155535
CV_19239.2  0.8108956  0.8119385  0.8269014  0.8155535  1.0000000
CV_19239.3  0.7989245  0.8110615  0.8252111  0.8107193  0.8278178
           CV_19239.3
CV_19098    0.7850551
CV_19101    0.8094105
CV_19239    0.9155776
CV_19098.1  0.7864117
CV_19098.2  0.7918676
CV_19098.3  0.7829476
CV_19101.1  0.7989245
CV_19101.2  0.8110615
CV_19101.3  0.8252111
CV_19239.1  0.8107193
CV_19239.2  0.8278178
CV_19239.3  1.0000000
heatmap.2(cor(table_mean[,2:13],use="pairwise.complete.obs"), trace="none",cexRow=1,cexCol=1,margins=c(8,8))

heatmap.2(cor(table_var[,2:13],use="pairwise.complete.obs"), trace="none",cexRow=1,cexCol=1,margins=c(8,8))

heatmap.2(cor(table_cv[,2:13],use="pairwise.complete.obs"), trace="none",cexRow=1,cexCol=1,margins=c(8,8))

Look at the correlation of CV from batches

# create a table for boxplot of CV correlation 
corr_CV <- cor(table_cv[,2:13],use="pairwise.complete.obs")

corr_boxplot <- data.frame(
correlation=c(corr_CV[4,5:6],corr_CV[5,6],corr_CV[7,8:9],corr_CV[8,9],corr_CV[7,8:9],corr_CV[11,12],corr_CV[4:6,7:12],corr_CV[7:9,10:12]), cor_source=c(rep("within",9),rep("between",27)), individual=c(rep("19098",3),rep("19101",3),rep("19239",3),rep("19098.19101",9),rep("19098.19239",9),rep("19101.19239",9)))

# t test
t_test <- t.test(corr_boxplot[1:6,1],corr_boxplot[7:33,1],alternative = "greater")
t_test

    Welch Two Sample t-test

data:  corr_boxplot[1:6, 1] and corr_boxplot[7:33, 1]
t = 2.2134, df = 7.0767, p-value = 0.03104
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
 0.002854199         Inf
sample estimates:
mean of x mean of y 
0.8283686 0.8087437 
ggplot(corr_boxplot, aes(cor_source,correlation)) + geom_boxplot() + geom_jitter(aes(colour = individual, size = 2, width = 0.5)) 

# create a table for boxplot of CV correlation without 19098.2
corr_boxplot_no <- data.frame(
correlation=c(corr_CV[4,6],corr_CV[7,8:9],corr_CV[8,9],corr_CV[7,8:9],corr_CV[11,12],corr_CV[4,7:12],corr_CV[6,7:12],corr_CV[7:9,10:12]), cor_source=c(rep("within",7),rep("between",21)), individual=c(rep("19098",1),rep("19101",3),rep("19239",3),rep("19098.19101",3),rep("19098.19239",3),rep("19098.19101",3),rep("19098.19239",3), rep("19101.19239",9)))

# t test
t_test <- t.test(corr_boxplot_no[1:6,1],corr_boxplot_no[7:33,1],alternative = "greater")
t_test

    Welch Two Sample t-test

data:  corr_boxplot_no[1:6, 1] and corr_boxplot_no[7:33, 1]
t = 3.9055, df = 7.2966, p-value = 0.002699
alternative hypothesis: true difference in means is greater than 0
95 percent confidence interval:
 0.0161709       Inf
sample estimates:
mean of x mean of y 
0.8369265 0.8056995 
ggplot(corr_boxplot_no, aes(cor_source,correlation)) + geom_boxplot() + geom_jitter(aes(colour = individual, size = 2, width = 0.5))

variance of gene expression between individuls

Calculate the variance betwwen individuls from the bulk samples

# normalization
reads_bulk_cpm <- cpm(reads_bulk)

# create a new dataset
reads_var <- data.frame(reads_bulk_cpm)
sum(reads_var!=reads_bulk_cpm)
[1] 0
# add mean of each individauls
reads_var$mean19098 <- apply(reads_var[,grep("NA19098",names(reads_var))], 1, mean)
reads_var$mean19101 <- apply(reads_var[,grep("NA19101",names(reads_var))], 1, mean)
reads_var$mean19239 <- apply(reads_var[,grep("NA19239",names(reads_var))], 1, mean)

# add variance of bulk means
reads_var$bulk_variance <- apply(reads_var[,c("mean19098","mean19101","mean19239")],1,var)

Calculate the variance between individuals using the means from single cells

# normalization
reads_single_cpm <- data.frame(cpm(reads_single))

# remove the ERCC of 19098.2 
  ## identify 19098.2
      sample_name <- names(reads_single_cpm)
#     targeremove()mn <- sample_name[grep("19098.2",sample_name)]
  ## find out ERCC rows
#        g <- rownames(reads_single_cpm)
#        target.row <- g[grep("ERCC",g)]
    ## replace the molecules numbers with NA 
#        reads_single_cpm[target.row,target.column] <- NA
        
# means of single cells within individuals
reads_var$mean.single19098 <- apply(reads_single_cpm[,grep("NA19098",names(reads_single_cpm))], 1, mean, na.rm = TRUE)
reads_var$mean.single19101 <- apply(reads_single_cpm[,grep("NA19101",names(reads_single_cpm))], 1, mean, na.rm = TRUE)
reads_var$mean.single19239 <- apply(reads_single_cpm[,grep("NA19239",names(reads_single_cpm))], 1, mean, na.rm = TRUE)


## variance of means from single cells
reads_var$mean_single_variance <- apply(reads_var[,c("mean.single19098","mean.single19101","mean.single19239")],1,var)

# sellect ERCC
reads_var$ERCC <- grepl("ERCC",rownames(reads_var))

# plot with color-blind-friendly palettes
cbPalette <- c("#999999", "#0000FF", "#990033", "#F0E442", "#0072B2", "#D55E00", "#CC79A7", "#009E73")

ggplot(reads_var, aes(x = bulk_variance, y = mean_single_variance, col = ERCC)) + geom_point(size = 2, alpha = 0.5) + scale_colour_manual(values=cbPalette) + scale_x_log10() + scale_y_log10() + stat_function(fun= function(x) {x}, col= "#56B4E9")

Calculate the variance between and within individuals using the single cell data

## variance within individual 
variance.single19098 <- apply(reads_single_cpm[,grep("NA19098",names(reads_single_cpm))], 1, var, na.rm = TRUE)
variance.single19101 <- apply(reads_single_cpm[,grep("NA19101",names(reads_single_cpm))], 1, var, na.rm = TRUE)
variance.single19239 <- apply(reads_single_cpm[,grep("NA19239",names(reads_single_cpm))], 1, var, na.rm = TRUE)

# number of cell
number.of.cell.all   <- sum(grepl("19",sample_name))
number.of.cell.19098 <- sum(grepl("19098",sample_name))
number.of.cell.19101 <- sum(grepl("19101",sample_name))
number.of.cell.19239 <- sum(grepl("19239",sample_name))

# total within individual variance 
reads_var$var_within_individual<-  
 (variance.single19098   *(number.of.cell.19098)  +      
  variance.single19101   *(number.of.cell.19101)  + 
  variance.single19239   *(number.of.cell.19239)  ) / 
 (number.of.cell.all)

ggplot(reads_var, aes(x = bulk_variance, y = var_within_individual, col = ERCC)) + geom_point(size = 2, alpha = 0.5) + scale_colour_manual(values=cbPalette) + scale_x_log10() + scale_y_log10() + stat_function(fun= function(x) {x}, col= "#56B4E9")

## variance between all single cells
var_all_single <- apply(reads_single_cpm, 1, var, na.rm = TRUE)
reads_var$var_all_single <- var_all_single

## keep non-missing across the table
reads_var <- reads_var[apply(reads_var,1,function(x)  sum(is.na(x)))==0,]

## variance between individauls
reads_var$var_between_individual<-( 
  var_all_single         *(number.of.cell.all-1)-
  variance.single19098   *(number.of.cell.19098-1)  -      
  variance.single19101   *(number.of.cell.19101-1)  - 
  variance.single19239   *(number.of.cell.19239-1)  ) / 
 (number.of.cell.all-1)

## the variaance contributed by between individual
reads_var$ratio_var_between_individual <- reads_var[,"var_between_individual"]/var_all_single

ggplot(reads_var, aes(x = var_all_single, y = ratio_var_between_individual, col = ERCC)) + geom_point(size = 2, alpha = 0.5) + scale_colour_manual(values=cbPalette) + scale_x_log10()

ggplot(reads_var, aes(x = var_all_single, y = var_between_individual, col = ERCC)) + geom_point(size = 2, alpha = 0.5) + scale_colour_manual(values=cbPalette) + scale_x_log10() + scale_y_log10() + stat_function(fun= function(x) {x}, col= "#56B4E9")

ggplot(reads_var, aes(x = bulk_variance, y = var_between_individual, col = ERCC)) + geom_point(size = 2, alpha = 0.5) + scale_colour_manual(values=cbPalette) + scale_x_log10() + scale_y_log10() + stat_function(fun= function(x) {x}, col= "#56B4E9")

AVONA F-statistics

Pull the p-value

### create a function for f.test by anova
### compare the to fits:
### 1. lm from all single cells
### 2. lm from each individaul
f.test <- function(data.in){
  tt <- names(data.in)
  individual.id <- rep("19098",length(tt))
  individual.id[grep("19101",tt)] <- "19101"
  individual.id[grep("19239",tt)] <- "19239"
  
  dd <- data.frame(reads=unlist(data.in),individual.id=individual.id)
  fit1 <- lm(reads~1,data=dd)
  fit2 <- lm(reads~1 + individual.id,data=dd)
  anova(fit1,fit2)[2,"Pr(>F)"]
}

# creat the f test table
f.test.table <- do.call(rbind,lapply(rownames(reads_single_cpm),function(x){
  data.frame(gene_name=x,p_of_f=f.test(reads_single_cpm[x,]))  
}))

# sellect ERCC
f.test.table$ERCC <- grepl("ERCC",f.test.table[,1])

# sort 
f.test.table.sort <- f.test.table[order(f.test.table[,2]),]
head(f.test.table.sort)
            gene_name        p_of_f  ERCC
14654 ENSG00000106153 1.095160e-139 FALSE
6757  ENSG00000256618 5.551611e-115 FALSE
5028  ENSG00000183648 6.035838e-104 FALSE
8833  ENSG00000022556 1.794451e-102 FALSE
14243 ENSG00000112306  8.897942e-96 FALSE
16181 ENSG00000176386  2.418442e-88 FALSE
plot(f.test.table.sort[,2], log = "y",col=as.numeric(f.test.table.sort$ERCC+1))

plot(f.test.table.sort[1:5000,2], log = "y",col=as.numeric(f.test.table.sort$ERCC[1:5000]+1))

plot(f.test.table.sort[1:5000,2],col=as.numeric(f.test.table.sort$ERCC[1:5000]+1),pch=20,cex=.3)

# plot the variance and show p 0f f
reads_var$p_of_f <- f.test.table$p_of_f

ggplot(reads_var, aes(x = bulk_variance, y = mean_single_variance, shape = as.factor(ERCC), col = p_of_f)) + geom_point(size = 2, alpha = 0.25) + scale_x_log10() + scale_y_log10() + scale_colour_gradient2(midpoint= 0.5, space="Lab") + stat_function(fun= function(x) {x}, col= "#56B4E9")

ggplot(reads_var, aes(x = bulk_variance, y = var_within_individual, shape = as.factor(ERCC), col = p_of_f)) + geom_point(size = 2, alpha = 0.5) + scale_x_log10() + scale_y_log10() + scale_colour_gradient2(midpoint= 0.5, space="Lab") + stat_function(fun= function(x) {x}, col= "#56B4E9")

ggplot(reads_var, aes(x = bulk_variance, y = var_between_individual, shape = as.factor(ERCC), col = p_of_f)) + geom_point(size = 2, alpha = 0.5) + scale_x_log10() + scale_y_log10() + scale_colour_gradient2(midpoint= 0.5, space="Lab") + stat_function(fun= function(x) {x}, col= "#56B4E9")

ggplot(reads_var, aes(x = var_all_single, y = var_between_individual, shape = as.factor(ERCC), col = p_of_f)) + geom_point(size = 2, alpha = 0.5) + scale_x_log10() + scale_y_log10() + scale_colour_gradient2(midpoint= 0.5, space="Lab") + stat_function(fun= function(x) {x}, col= "#56B4E9")

Pull the F value

# looking at the F value
f.test.F <- function(data.in){
  tt <- names(data.in)
  individual.id <- rep("19098",length(tt))
  individual.id[grep("19101",tt)] <- "19101"
  individual.id[grep("19239",tt)] <- "19239"
  
  dd <- data.frame(reads=unlist(data.in),individual.id=individual.id)
  fit1 <- lm(reads~1,data=dd)
  fit2 <- lm(reads~1 + individual.id,data=dd)
  anova(fit1,fit2)[2,"F"]
}
# creat the f test table of F value
f.test.F.table <- do.call(rbind,lapply(rownames(reads_single_cpm),function(x){
  data.frame(gene_name=x,F_value=f.test.F(reads_single_cpm[x,]))  
}))

# sellect ERCC
f.test.F.table$ERCC <- grepl("ERCC",f.test.F.table[,1])

# plot the variance and show F value
reads_var$F_value <- f.test.F.table$F_value

# calculate F value from p
# when p=0.01
qf(1-0.05,2,629)
[1] 3.010045
# create color index
reads_var$F_color <- "1 < F < 3"
reads_var$F_color[reads_var$F_value >=3] <- "F >= 3"
reads_var$F_color[reads_var$F_value <=1] <- "F <= 1"

# use the F value from p to scale the gradient
ggplot(reads_var, aes(x = bulk_variance, y = mean_single_variance, shape = as.factor(ERCC), col = F_color)) + scale_colour_manual(values=cbPalette) + geom_point(size = 2, alpha = 0.5) + scale_x_log10() + scale_y_log10() + stat_function(fun= function(x) {x}, col= "#56B4E9")

ggplot(reads_var, aes(x = bulk_variance, y = var_within_individual, shape = as.factor(ERCC), col = F_color)) + scale_colour_manual(values=cbPalette) + geom_point(size = 2, alpha = 0.5) + scale_x_log10() + scale_y_log10() + stat_function(fun= function(x) {x}, col= "#56B4E9")

ggplot(reads_var, aes(x = bulk_variance, y = var_between_individual, shape = as.factor(ERCC), col = F_color)) + scale_colour_manual(values=cbPalette) + geom_point(size = 2, alpha = 0.5) + scale_x_log10() + scale_y_log10() + stat_function(fun= function(x) {x}, col= "#56B4E9")

ggplot(reads_var, aes(x = var_all_single, y = var_within_individual, shape = as.factor(ERCC), col = F_color)) + scale_colour_manual(values=cbPalette) + geom_point(size = 2, alpha = 0.5) + scale_x_log10() + scale_y_log10() + stat_function(fun= function(x) {x}, col= "#56B4E9")

ggplot(reads_var, aes(x = var_all_single, y = var_between_individual, shape = as.factor(ERCC), col = F_color)) + scale_colour_manual(values=cbPalette) + geom_point(size = 2, alpha = 0.2) + scale_x_log10() + scale_y_log10() + stat_function(fun= function(x) {x}, col= "#56B4E9")

Session information

sessionInfo()
R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] gplots_2.17.0 edgeR_3.10.2  limma_3.24.9  ggplot2_1.0.1 dplyr_0.4.1  
[6] knitr_1.10.5 

loaded via a namespace (and not attached):
 [1] Rcpp_0.11.6        magrittr_1.5       MASS_7.3-40       
 [4] munsell_0.4.2      colorspace_1.2-6   stringr_1.0.0     
 [7] plyr_1.8.2         caTools_1.17.1     tools_3.2.0       
[10] parallel_3.2.0     grid_3.2.0         gtable_0.1.2      
[13] KernSmooth_2.23-14 DBI_0.3.1          gtools_3.5.0      
[16] htmltools_0.2.6    yaml_2.1.13        assertthat_0.1    
[19] digest_0.6.8       reshape2_1.4.1     formatR_1.2       
[22] bitops_1.0-6       evaluate_0.7       rmarkdown_0.6.1   
[25] labeling_0.3       gdata_2.16.1       stringi_0.4-1     
[28] scales_0.2.4       proto_0.3-10