Last updated: 2015-06-11

Code version: 44338a0a5f347c0c7a565729f2a86161eb3229d3

For each sample, sum the number of reads or molecules for every gene across sequencing runs.

Creates the following files:

Input

library("data.table")
library("dplyr")
counts <- fread("/mnt/gluster/data/internal_supp/singleCellSeq/gene-counts.txt")

Read 0.0% of 12402 rows
Read 80.6% of 12402 rows
Read 12402 rows and 20427 (of 20427) columns from 0.524 GB file in 00:00:30

Sum counts per lane by sample

counts_by_sample <- counts %>%
  filter(!is.na(lane), sickle == "quality-trimmed") %>%
  select(individual, batch, well, rmdup, starts_with("ENSG"), starts_with("ERCC")) %>%
  group_by(individual, batch, well, rmdup) %>%
  summarise_each(funs(sum)) %>%
  arrange(individual, batch, well, rmdup) %>%
  ungroup
counts_by_sample %>% select(1:8) %>% slice(1:10)
Source: local data table [10 x 8]

   individual batch well     rmdup ENSG00000186092 ENSG00000237683
1       19098     1  A01 molecules               0               2
2       19098     1  A01     reads               0               2
3       19098     1  A02 molecules               0               0
4       19098     1  A02     reads               0               0
5       19098     1  A03 molecules               0               4
6       19098     1  A03     reads               0               6
7       19098     1  A04 molecules               0               6
8       19098     1  A04     reads               0              46
9       19098     1  A05 molecules               0               0
10      19098     1  A05     reads               0               0
Variables not shown: ENSG00000235249 (int), ENSG00000185097 (int)

Transpose

anno <- counts_by_sample %>%
  filter(rmdup == "molecules") %>%
  select(individual:well) %>%
  as.data.frame
anno$sample_id <- paste(paste0("NA", anno$individual), anno$batch, anno$well,
                        sep = ".")
head(anno)
  individual batch well     sample_id
1      19098     1  A01 NA19098.1.A01
2      19098     1  A02 NA19098.1.A02
3      19098     1  A03 NA19098.1.A03
4      19098     1  A04 NA19098.1.A04
5      19098     1  A05 NA19098.1.A05
6      19098     1  A06 NA19098.1.A06
molecules <- counts_by_sample %>%
  filter(rmdup == "molecules") %>%
  select(-(individual:rmdup)) %>%
  t
dim(molecules)
[1] 20419   873
colnames(molecules) <- anno$sample_id
molecules[1:10, 1:5]
                NA19098.1.A01 NA19098.1.A02 NA19098.1.A03 NA19098.1.A04
ENSG00000186092             0             0             0             0
ENSG00000237683             2             0             4             6
ENSG00000235249             0             0             0             0
ENSG00000185097             0             0             0             0
ENSG00000269831             0             0             0             0
ENSG00000269308             0             0             0             0
ENSG00000187634             0             0             0             0
ENSG00000268179             0             0             0             0
ENSG00000188976             5             6             0             3
ENSG00000187961             0             0             0             0
                NA19098.1.A05
ENSG00000186092             0
ENSG00000237683             0
ENSG00000235249             0
ENSG00000185097             0
ENSG00000269831             0
ENSG00000269308             0
ENSG00000187634             0
ENSG00000268179             0
ENSG00000188976            15
ENSG00000187961             0
reads <- counts_by_sample %>%
  filter(rmdup == "reads") %>%
  select(-(individual:rmdup)) %>%
  t
dim(reads)
[1] 20419   873
colnames(reads) <- anno$sample_id
reads[1:10, 1:5]
                NA19098.1.A01 NA19098.1.A02 NA19098.1.A03 NA19098.1.A04
ENSG00000186092             0             0             0             0
ENSG00000237683             2             0             6            46
ENSG00000235249             0             0             0             0
ENSG00000185097             0             0             0             0
ENSG00000269831             0             0             0             0
ENSG00000269308             0             0             0             0
ENSG00000187634             0             0             0             0
ENSG00000268179             0             0             0             0
ENSG00000188976            56            19             0             3
ENSG00000187961             0             0             0             0
                NA19098.1.A05
ENSG00000186092             0
ENSG00000237683             0
ENSG00000235249             0
ENSG00000185097             0
ENSG00000269831             0
ENSG00000269308             0
ENSG00000187634             0
ENSG00000268179             0
ENSG00000188976           165
ENSG00000187961             0

Output

Output annotation file.

write.table(anno, "../data/annotation.txt", quote = FALSE, sep = "\t",
            row.names = FALSE)

Output molecule counts.

write.table(molecules, "../data/molecules-per-lane.txt", quote = FALSE, sep = "\t",
            col.names = NA)

Output read counts.

write.table(reads, "../data/reads.txt", quote = FALSE, sep = "\t",
            col.names = NA)

Output molecules from combined counts

Reads with the same UMI and start position need to be removed using all the data for a given sample. Otherwise the counts will be inflated because the same read can be sequenced across multiple lanes. Post-mapping we combined all the reads per sample and removed duplicate UMIs (pipeline). The data from these combined samples have NA recorded for index, lane, and flow_cell.

counts_combined <- counts %>%
  filter(is.na(lane), sickle == "quality-trimmed") %>%
  select(individual, batch, well, rmdup, starts_with("ENSG"), starts_with("ERCC")) %>%
  arrange(individual, batch, well, rmdup)
stopifnot(counts_combined$individual[c(TRUE, FALSE)] == anno$individual,
          counts_combined$batch[c(TRUE, FALSE)] == anno$batch,
          counts_combined$well[c(TRUE, FALSE)] == anno$well)

Molecule counts from combined samples.

molecules_combined <- counts_combined %>%
  filter(rmdup == "molecules") %>%
  select(-(individual:rmdup)) %>%
  t
dim(molecules_combined)
[1] 20419   873
colnames(molecules_combined) <- anno$sample_id
molecules_combined[1:10, 1:5]
                NA19098.1.A01 NA19098.1.A02 NA19098.1.A03 NA19098.1.A04
ENSG00000186092             0             0             0             0
ENSG00000237683             1             0             3             3
ENSG00000235249             0             0             0             0
ENSG00000185097             0             0             0             0
ENSG00000269831             0             0             0             0
ENSG00000269308             0             0             0             0
ENSG00000187634             0             0             0             0
ENSG00000268179             0             0             0             0
ENSG00000188976             3             3             0             2
ENSG00000187961             0             0             0             0
                NA19098.1.A05
ENSG00000186092             0
ENSG00000237683             0
ENSG00000235249             0
ENSG00000185097             0
ENSG00000269831             0
ENSG00000269308             0
ENSG00000187634             0
ENSG00000268179             0
ENSG00000188976             7
ENSG00000187961             0
write.table(molecules_combined, "../data/molecules.txt", quote = FALSE, sep = "\t",
            col.names = NA)

Session information

sessionInfo()
R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] dplyr_0.4.1      data.table_1.9.4 knitr_1.10.5    

loaded via a namespace (and not attached):
 [1] Rcpp_0.11.6     assertthat_0.1  digest_0.6.8    chron_2.3-45   
 [5] plyr_1.8.2      DBI_0.3.1       formatR_1.2     magrittr_1.5   
 [9] evaluate_0.7    stringi_0.4-1   lazyeval_0.1.10 reshape2_1.4.1 
[13] rmarkdown_0.6.1 tools_3.2.0     stringr_1.0.0   parallel_3.2.0 
[17] yaml_2.1.13     htmltools_0.2.6