Last updated: 2015-06-11
Code version: 44338a0a5f347c0c7a565729f2a86161eb3229d3
For each sample, sum the number of reads or molecules for every gene across sequencing runs.
Creates the following files:
library("data.table")
library("dplyr")
counts <- fread("/mnt/gluster/data/internal_supp/singleCellSeq/gene-counts.txt")
Read 0.0% of 12402 rows
Read 80.6% of 12402 rows
Read 12402 rows and 20427 (of 20427) columns from 0.524 GB file in 00:00:30
counts_by_sample <- counts %>%
filter(!is.na(lane), sickle == "quality-trimmed") %>%
select(individual, batch, well, rmdup, starts_with("ENSG"), starts_with("ERCC")) %>%
group_by(individual, batch, well, rmdup) %>%
summarise_each(funs(sum)) %>%
arrange(individual, batch, well, rmdup) %>%
ungroup
counts_by_sample %>% select(1:8) %>% slice(1:10)
Source: local data table [10 x 8]
individual batch well rmdup ENSG00000186092 ENSG00000237683
1 19098 1 A01 molecules 0 2
2 19098 1 A01 reads 0 2
3 19098 1 A02 molecules 0 0
4 19098 1 A02 reads 0 0
5 19098 1 A03 molecules 0 4
6 19098 1 A03 reads 0 6
7 19098 1 A04 molecules 0 6
8 19098 1 A04 reads 0 46
9 19098 1 A05 molecules 0 0
10 19098 1 A05 reads 0 0
Variables not shown: ENSG00000235249 (int), ENSG00000185097 (int)
anno <- counts_by_sample %>%
filter(rmdup == "molecules") %>%
select(individual:well) %>%
as.data.frame
anno$sample_id <- paste(paste0("NA", anno$individual), anno$batch, anno$well,
sep = ".")
head(anno)
individual batch well sample_id
1 19098 1 A01 NA19098.1.A01
2 19098 1 A02 NA19098.1.A02
3 19098 1 A03 NA19098.1.A03
4 19098 1 A04 NA19098.1.A04
5 19098 1 A05 NA19098.1.A05
6 19098 1 A06 NA19098.1.A06
molecules <- counts_by_sample %>%
filter(rmdup == "molecules") %>%
select(-(individual:rmdup)) %>%
t
dim(molecules)
[1] 20419 873
colnames(molecules) <- anno$sample_id
molecules[1:10, 1:5]
NA19098.1.A01 NA19098.1.A02 NA19098.1.A03 NA19098.1.A04
ENSG00000186092 0 0 0 0
ENSG00000237683 2 0 4 6
ENSG00000235249 0 0 0 0
ENSG00000185097 0 0 0 0
ENSG00000269831 0 0 0 0
ENSG00000269308 0 0 0 0
ENSG00000187634 0 0 0 0
ENSG00000268179 0 0 0 0
ENSG00000188976 5 6 0 3
ENSG00000187961 0 0 0 0
NA19098.1.A05
ENSG00000186092 0
ENSG00000237683 0
ENSG00000235249 0
ENSG00000185097 0
ENSG00000269831 0
ENSG00000269308 0
ENSG00000187634 0
ENSG00000268179 0
ENSG00000188976 15
ENSG00000187961 0
reads <- counts_by_sample %>%
filter(rmdup == "reads") %>%
select(-(individual:rmdup)) %>%
t
dim(reads)
[1] 20419 873
colnames(reads) <- anno$sample_id
reads[1:10, 1:5]
NA19098.1.A01 NA19098.1.A02 NA19098.1.A03 NA19098.1.A04
ENSG00000186092 0 0 0 0
ENSG00000237683 2 0 6 46
ENSG00000235249 0 0 0 0
ENSG00000185097 0 0 0 0
ENSG00000269831 0 0 0 0
ENSG00000269308 0 0 0 0
ENSG00000187634 0 0 0 0
ENSG00000268179 0 0 0 0
ENSG00000188976 56 19 0 3
ENSG00000187961 0 0 0 0
NA19098.1.A05
ENSG00000186092 0
ENSG00000237683 0
ENSG00000235249 0
ENSG00000185097 0
ENSG00000269831 0
ENSG00000269308 0
ENSG00000187634 0
ENSG00000268179 0
ENSG00000188976 165
ENSG00000187961 0
Output annotation file.
write.table(anno, "../data/annotation.txt", quote = FALSE, sep = "\t",
row.names = FALSE)
Output molecule counts.
write.table(molecules, "../data/molecules-per-lane.txt", quote = FALSE, sep = "\t",
col.names = NA)
Output read counts.
write.table(reads, "../data/reads.txt", quote = FALSE, sep = "\t",
col.names = NA)
Reads with the same UMI and start position need to be removed using all the data for a given sample. Otherwise the counts will be inflated because the same read can be sequenced across multiple lanes. Post-mapping we combined all the reads per sample and removed duplicate UMIs (pipeline). The data from these combined samples have NA
recorded for index, lane, and flow_cell.
counts_combined <- counts %>%
filter(is.na(lane), sickle == "quality-trimmed") %>%
select(individual, batch, well, rmdup, starts_with("ENSG"), starts_with("ERCC")) %>%
arrange(individual, batch, well, rmdup)
stopifnot(counts_combined$individual[c(TRUE, FALSE)] == anno$individual,
counts_combined$batch[c(TRUE, FALSE)] == anno$batch,
counts_combined$well[c(TRUE, FALSE)] == anno$well)
Molecule counts from combined samples.
molecules_combined <- counts_combined %>%
filter(rmdup == "molecules") %>%
select(-(individual:rmdup)) %>%
t
dim(molecules_combined)
[1] 20419 873
colnames(molecules_combined) <- anno$sample_id
molecules_combined[1:10, 1:5]
NA19098.1.A01 NA19098.1.A02 NA19098.1.A03 NA19098.1.A04
ENSG00000186092 0 0 0 0
ENSG00000237683 1 0 3 3
ENSG00000235249 0 0 0 0
ENSG00000185097 0 0 0 0
ENSG00000269831 0 0 0 0
ENSG00000269308 0 0 0 0
ENSG00000187634 0 0 0 0
ENSG00000268179 0 0 0 0
ENSG00000188976 3 3 0 2
ENSG00000187961 0 0 0 0
NA19098.1.A05
ENSG00000186092 0
ENSG00000237683 0
ENSG00000235249 0
ENSG00000185097 0
ENSG00000269831 0
ENSG00000269308 0
ENSG00000187634 0
ENSG00000268179 0
ENSG00000188976 7
ENSG00000187961 0
write.table(molecules_combined, "../data/molecules.txt", quote = FALSE, sep = "\t",
col.names = NA)
sessionInfo()
R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] dplyr_0.4.1 data.table_1.9.4 knitr_1.10.5
loaded via a namespace (and not attached):
[1] Rcpp_0.11.6 assertthat_0.1 digest_0.6.8 chron_2.3-45
[5] plyr_1.8.2 DBI_0.3.1 formatR_1.2 magrittr_1.5
[9] evaluate_0.7 stringi_0.4-1 lazyeval_0.1.10 reshape2_1.4.1
[13] rmarkdown_0.6.1 tools_3.2.0 stringr_1.0.0 parallel_3.2.0
[17] yaml_2.1.13 htmltools_0.2.6