Descriptive statistics
Read the data
annotations = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-annotation.txt') keep_samples = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/quality-single-cells.txt', index_col=0, header=None)
Apply the QC filters.
annotations = annotations[keep_samples.values.ravel()]
annotations.shape
(5597, 40)
Distributions of read counts
H. sapiens
plt.clf() plt.gcf().set_size_inches(12, 4) plt.hist(annotations['reads_hs'], bins=100) plt.xlabel('Read count') plt.ylabel('Number of cells') plt.title('Read count by cell')
Text(0.5,1,'Read count by cell')
plt.clf() plt.gcf().set_size_inches(12, 4) colors = {k: matplotlib.cm.get_cmap('Set1').colors[i] for i, k in enumerate(set(annotations['batch']))} for batch, g in annotations.groupby('batch'): plt.hist(g['reads_hs'], histtype='step', color=colors[batch], label=batch, bins=50) plt.legend() plt.xlabel('Read count') plt.ylabel('Density') plt.title('Read count by batch')
Text(0.5,1,'Read count by batch')
plt.clf() plt.gcf().set_size_inches(12, 4) colors = {k: matplotlib.cm.get_cmap('Set1').colors[i] for i, k in enumerate(set(annotations['batch']))} grid = np.linspace(annotations['reads_hs'].min(), annotations['reads_hs'].max(), 200) for batch, g in annotations.groupby('batch'): density = st.gaussian_kde(g['reads_hs']) plt.plot(grid, density.evaluate(grid), color=colors[batch], label=batch) plt.legend() plt.xlabel('Read count') plt.ylabel('Density') plt.title('Read count by batch')
Text(0.5,1,'Read count by batch')
plt.clf() plt.gcf().set_size_inches(12, 4) _ = plt.boxplot(annotations.groupby('chip_id').apply(lambda x: list(x['reads_hs'])), labels=sorted(set(annotations['chip_id']))) plt.xlabel('Individual') plt.ylabel('Read count') _ = plt.xticks(rotation=90)
plt.clf() fig, ax = plt.subplots(len(set(annotations['batch'])), 1) fig.set_size_inches(12, 10) for a, (batch, g) in zip(ax, annotations.groupby('batch')): bp = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['reads_hs'])), positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))]) a.set_xticks(np.arange(1, len(set(annotations['chip_id'])))) a.set_xticklabels([]) a.set_ylabel('Batch {} read count'.format(batch)) ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90) fig.suptitle('Read count by individual/batch')
Text(0.5,0.98,'Read count by individual/batch')
ERCC
plt.clf() fig, ax = plt.subplots(3, 1) fig.set_size_inches(12, 12) for a, (k, g) in zip(ax, annotations.groupby('ERCC')): _ = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['reads_ercc'])), positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))]) a.set_ylabel('Read count ({})'.format(k)) a.set_xticks(np.arange(1, len(set(annotations['chip_id'])))) a.set_xticklabels([]) ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90) plt.xlabel('Individual') plt.suptitle('ERCC read count by individual')
Text(0.5,0.98,'ERCC read count by individual')
D. melanogaster
plt.clf() fig, ax = plt.subplots(len(set(annotations['fly'])), 1) fig.set_size_inches(12, 8) for a, (k, g) in zip(ax, annotations.groupby('fly')): _ = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['reads_dm'])), positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))]) a.set_ylabel('Read count ({})'.format(k)) a.set_xticks(np.arange(1, len(set(annotations['chip_id'])))) a.set_xticklabels([]) ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90) plt.xlabel('Individual') plt.suptitle('D. mel. read count by individual')
Text(0.5,0.98,'D. mel. read count by individual')
C. elegans
plt.clf() fig, ax = plt.subplots(3, 1) fig.set_size_inches(12, 12) for a, (k, g) in zip(ax, annotations.groupby('worm')): _ = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['reads_ce'])), positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))]) a.set_ylabel('Read count ({})'.format(k)) a.set_xticks(np.arange(1, len(set(annotations['chip_id'])))) a.set_xticklabels([]) ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90) plt.xlabel('Individual') plt.suptitle('C. elegans read count by individual')
Text(0.5,0.98,'C. elegans read count by individual')
Distributions of molecules
H. sapiens
plt.clf() plt.hist(annotations['mol_hs'], bins=100) plt.xlabel('Number of molecules') plt.ylabel('Number of cells') plt.title('Number of molecules by cell') plt.close()
plt.clf() plt.gcf().set_size_inches(12, 4) _ = plt.boxplot(annotations.groupby('chip_id').apply(lambda x: list(x['mol_hs'])), labels=sorted(set(annotations['chip_id'])), sym='.') plt.xlabel('Individual') plt.ylabel('Molecule count') plt.xticks(rotation=90) plt.title('Molecule count by individual')
Text(0.5,1,'Molecule count by individual')
plt.clf() fig, ax = plt.subplots(len(set(annotations['batch'])), 1) fig.set_size_inches(12, 10) for a, (batch, g) in zip(ax, annotations.groupby('batch')): bp = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['mol_hs'])), positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))]) a.set_xticks(np.arange(1, len(set(annotations['chip_id'])))) a.set_xticklabels([]) a.set_ylabel('Batch {} molecule count'.format(batch)) ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90) fig.suptitle('Molecule count by individual')
Text(0.5,0.98,'Molecule count by individual')
ERCC
plt.clf() fig, ax = plt.subplots(3, 1) fig.set_size_inches(12, 12) for a, (k, g) in zip(ax, annotations.groupby('ERCC')): _ = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['mol_ercc'])), positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))]) a.set_ylabel('Molecule count ({})'.format(k)) a.set_xticks(np.arange(1, len(set(annotations['chip_id'])))) a.set_xticklabels([]) ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90) fig.suptitle('Molecule count by individual') plt.xlabel('Individual') plt.suptitle('ERCC molecule count by individual')
Text(0.5,0.98,'ERCC molecule count by individual')
D. melanogaster
plt.clf() fig, ax = plt.subplots(len(set(annotations['fly'])), 1) fig.set_size_inches(12, 8) for a, (k, g) in zip(ax, annotations.groupby('fly')): _ = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['mol_dm'])), positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))]) a.set_ylabel('Molecule count ({})'.format(k)) a.set_xticks(np.arange(1, len(set(annotations['chip_id'])))) a.set_xticklabels([]) ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90) fig.suptitle('Molecule count by individual') plt.xlabel('Individual') plt.suptitle('D. mel. molecule count by individual')
Text(0.5,0.98,'D. mel. molecule count by individual')
C. elegans
plt.clf() fig, ax = plt.subplots(3, 1) fig.set_size_inches(12, 12) for a, (k, g) in zip(ax, annotations.groupby('worm')): _ = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['mol_ce'])), positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))]) a.set_ylabel('Molecule count ({})'.format(k)) a.set_xticks(np.arange(1, len(set(annotations['chip_id'])))) a.set_xticklabels([]) ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90) fig.suptitle('Molecule count by individual') plt.xlabel('Individual') plt.suptitle('C. elegans molecule count by individual')
Text(0.5,0.98,'C. elegans molecule count by individual')
Distributions of cells
Our previous results on concordance between pools of single cells suggest that at least 50 cells are needed to consistently estimate mean expression.
plt.clf() plt.gcf().set_size_inches(12, 4) plt.bar(np.arange(len(set(annotations['chip_id']))), annotations.groupby('chip_id')['chip_id'].agg(len), color='k') plt.xlabel('Individual') plt.ylabel('Number of cells') plt.title('Number of cells by individual') _ = plt.xticks(np.arange(len(set(annotations['chip_id']))), sorted(set(annotations['chip_id'])), rotation=90)
plt.clf() fig, ax = plt.subplots(len(set(annotations['batch'])), 1) fig.set_size_inches(12, 10) for a, (batch, g) in zip(ax, annotations.groupby('batch')): bp = a.bar([sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))], g.groupby('chip_id').apply(lambda x: len(x))) a.set_xticks(np.arange(1, len(set(annotations['chip_id'])))) a.set_xticklabels([]) a.set_ylabel('Batch {} cell count'.format(batch)) ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90) fig.suptitle('Number of cells by individual')
Text(0.5,0.98,'Number of cells by individual')