Last updated: 2016-03-15

Code version: 1db9308a307e8e4185c5d233188ba21e5996e560

Here I use ngsplot to calculate coverage.

Conclusions:

library("tidyr")
library("ggplot2")
library("cowplot")
theme_set(theme_bw(base_size = 12))
theme_update(panel.grid.minor.x = element_blank(),
             panel.grid.minor.y = element_blank(),
             panel.grid.major.x = element_blank(),
             panel.grid.major.y = element_blank())

Functions

The following function aggregate results from the various ngsplot runs.

import_ngsplot <- function(results, id = 1:length(results)) {
  # Imports and combines results from multiple ngsplot analyses 
  #
  # results - name of ngsplot results (specified with -O flag)
  # id - description of analysis
  library("tidyr")
  stopifnot(length(results) > 0, length(results) == length(id))
  avgprof_list <- list()
  sem_list <- list()
  for (i in seq_along(results)) {
    zipfile <- paste0(results[i], ".zip")
    extract_zip(zipfile)
    # Import mean coverage
    avgprof_list[[i]] <- import_data(path = results[i], datatype = "avgprof",
                                     id = id[i])
    # Import standard error of mean coverage
    sem_list[[i]] <- import_data(path = results[i], datatype = "sem",
                                 id = id[i])
  }
  avgprof_df <- do.call(rbind, avgprof_list)
  sem_df <- do.call(rbind, sem_list)
  final <- merge(avgprof_df, sem_df)
  return(final)
}

extract_zip <- function(zipfile) {
  # Unzip the ngsplot results into the same directory
  stopifnot(length(zipfile) == 1, file.exists(zipfile))
  unzip(zipfile, exdir = dirname(zipfile))
  return(invisible())
}

import_data <- function(path, datatype, id) {
  # Import the data from a specific ngsplot file.
  #
  # path - path to the ngsplot results directory
  # datatype - either "avgprof" for the mean coverage or
  #            "sem" for the standard error of the mean coverage
  # id - description of analysis (length == 1)
  stopifnot(datatype == "avgprof" | datatype == "sem",
            length(id) == 1)
  fname <- paste0(path, "/", datatype, ".txt")
  df <- read.delim(fname)
  df$position <- paste0("p", 1:nrow(df))
  df$id <- id
  df_long <- gather_(df, key_col = "metainfo", value = datatype)
  df_long$metainfo <- as.character(df_long$metainfo)
  df_long$position <- sub("^p", "", df_long$position)
  df_long$position <- as.numeric(df_long$position)
  return(df_long)
}

Coverage

First I observe the coverage at the TSS, gene body, and TES for all filtered genes.

Unzip and import the raw coverage data.

cov <- import_ngsplot(results = c("../data/ngsplot-molecules-tss-both",
                                  "../data/ngsplot-molecules-genebody-both",
                                  "../data/ngsplot-molecules-tes-both",
                                  "../data/ngsplot-molecules-tss-same",
                                  "../data/ngsplot-molecules-genebody-same",
                                  "../data/ngsplot-molecules-tes-same",
                                  "../data/ngsplot-molecules-tss-opposite",
                                  "../data/ngsplot-molecules-genebody-opposite",
                                  "../data/ngsplot-molecules-tes-opposite"),
                      id = c("tss-both", "genebody-both", "tes-both",
                             "tss-same", "genebody-same", "tes-same",
                             "tss-opposite", "genebody-opposite", "tes-opposite"))
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
cov <- separate(cov, "id", into = c("feature", "strand"), sep = "-")
cov$id <- factor(cov$feature, levels = c("tss", "genebody", "tes"))

Plotting results.

p <- ggplot(NULL, aes(x = position, y = avgprof, color = metainfo)) +
  geom_line()+
  geom_ribbon(aes(ymin = avgprof - sem, ymax = avgprof + sem,
                  color = NULL, fill = metainfo), alpha = 0.25) +
  facet_wrap(~strand, ncol = 1) +
  theme(legend.position = "none") +
  labs(x = "Position", y = "Mean molecules per million") +
  ylim(0, 8)

TSS

plot_tss <- p %+% cov[cov$feature == "tss", ] +
  geom_vline(x = 50, col = "grey") +
  scale_x_continuous(breaks = c(0, 25, 50, 75, 100),
                     labels = c(-1000, -500, "TSS", 500, 1000)) +
  labs(title = "Transcription start site")

Gene body

plot_genebody <- p %+% cov[cov$feature == "genebody", ] +
  geom_vline(x = c(20, 80), color = "grey") +
  scale_x_continuous(breaks = c(0, 20, 40, 60, 80, 100),
                     labels = c(-1000, "TSS", "33%", "66%", "TES", 1000)) +
  labs(title = "Gene body")

TES

plot_tes <- p %+% cov[cov$feature == "tes", ] +
  geom_vline(x = 50, color = "grey") +
  scale_x_continuous(breaks = c(0, 25, 50, 75, 100),
                     labels = c(-1000, -500, "TES", 500, 1000)) +
  labs(title = "Transcription end site")

Plot

plot_grid(plot_tss, plot_genebody, plot_tes, ncol = 3, labels = LETTERS[1:3])

Coverage by expression level

Next I compare the coverage for NA19091 for genes split into expression quartiles.

cov_expr <- import_ngsplot(results = c("../data/ngsplot-genebody-expr-both",
                                  "../data/ngsplot-genebody-expr-same",
                                  "../data/ngsplot-genebody-expr-opposite"),
                      id = c("both", "same", "opposite"))
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
colnames(cov_expr)[colnames(cov_expr) == "id"] <- "strand"

Plot

plot_expr <- plot_genebody %+% cov_expr +
  scale_color_discrete(name = "Expression quartile") +
  scale_fill_discrete(name = "Expression quartile") +
  theme(legend.position = "bottom") +
  ylim(0, 25)
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
plot_expr

Notice the increased y-axis.

Coverage by gene length

Next I compare the coverage for NA19091 for genes split by gene length.

cov_len <- import_ngsplot(results = c("../data/ngsplot-genebody-len-both",
                                  "../data/ngsplot-genebody-len-same",
                                  "../data/ngsplot-genebody-len-opposite"),
                      id = c("both", "same", "opposite"))
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
Using position, id as id variables
colnames(cov_len)[colnames(cov_len) == "id"] <- "strand"

Plot

plot_len <- plot_genebody %+% cov_len +
  scale_color_discrete(name = "Length quartile") +
  scale_fill_discrete(name = "Length quartile") +
  theme(legend.position = "bottom") +
  ylim(0, 20)
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
plot_len

Session information

sessionInfo()
R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] cowplot_0.3.1   ggplot2_1.0.1   tidyr_0.2.0     knitr_1.10.5   
[5] rmarkdown_0.6.1

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.0      magrittr_1.5     MASS_7.3-40      munsell_0.4.2   
 [5] colorspace_1.2-6 stringr_1.0.0    httr_0.6.1       plyr_1.8.3      
 [9] tools_3.2.0      grid_3.2.0       gtable_0.1.2     htmltools_0.2.6 
[13] yaml_2.1.13      digest_0.6.8     reshape2_1.4.1   formatR_1.2     
[17] bitops_1.0-6     RCurl_1.95-4.6   evaluate_0.7     labeling_0.3    
[21] stringi_0.4-1    scales_0.2.4     proto_0.3-10