Last updated: 2016-06-29
Code version: c59e3a495e7257caa3180d1dfdaf315cdd79d715
Subsample reads post-mapping. Includes both mapped and unmapped reads.
cd /mnt/gluster/home/jdblischak/ssd
mkdir -p subsampled/bam-combined
mkdir -p ~/log/subsample-bam.py
for FILE in bam-combined/*bam
do
for NUM in 50000 250000 500000 1500000 4000000
do
echo "subsample-bam.py 12345 $NUM subsampled/bam-combined $FILE" | qsub -l h_vmem=3g -V -cwd -N sub -j y -o ~/log/subsample-bam.py -q blades.q
done
done
Confirm this worked. Expect 3456 jobs (3 individuals x 3 replicates x 96 wells * 4 subsamples). Many of the low quality single cells will fail due to low number of reads.
ls subsampled/bam-combined/*bam | wc -l
grep -w success ~/log/subsample-bam.py/* | wc -l
grep -w failure ~/log/subsample-bam.py/* | wc -l
# Failed because did not have enough reads to subsample
grep -w requested ~/log/subsample-bam.py/* | wc -l
# Failed because subsampled the wrong number of reads
grep -w Observed ~/log/subsample-bam.py/* | wc -l
Switch to directory subsampled
. Symlink exons.saf
.
cd subsampled
mkdir genome
ln -s /mnt/lustre/home/jdblischak/singleCellSeq/data/exons.saf genome/exons.saf
Remove duplicate UMIs.
submit-array.sh rmdup-umi.sh 2g bam-combined/*bam
ls bam-rmdup-umi/*bam | wc -l
grep -w success ~/log/rmdup-umi.sh/* | wc -l
grep -w failure ~/log/rmdup-umi.sh/* | wc -l
Count reads per gene.
submit-array.sh count-reads-per-gene.sh 2g bam-combined/*bam bam-rmdup-umi/*bam
ls counts/*genecounts.txt | wc -l
grep -w success ~/log/count-reads-per-gene.sh/* | wc -l
grep -w failure ~/log/count-reads-per-gene.sh/* | wc -l
Gather the gene counts.
mkdir -p counts-matrix
mkdir -p ~/log/gather-gene-counts-subsample.py
for NUM in 50000 250000 500000 1500000 4000000
do
echo "gather-gene-counts-subsample.py counts-matrix/$NUM- counts/*.$NUM.*genecounts.txt" | qsub -l h_vmem=2g -cwd -V -j y -o ~/log/gather-gene-counts-subsample.py -N gene-counts-$NUM -q blades.q
done
sessionInfo()
R version 3.2.0 (2015-04-16)
Platform: x86_64-unknown-linux-gnu (64-bit)
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] knitr_1.10.5
loaded via a namespace (and not attached):
[1] httr_0.6.1 magrittr_1.5 formatR_1.2 htmltools_0.2.6
[5] tools_3.2.0 RCurl_1.95-4.6 yaml_2.1.13 rmarkdown_0.6.1
[9] stringi_1.0-1 digest_0.6.8 stringr_1.0.0 bitops_1.0-6
[13] evaluate_0.7