Descriptive statistics

Read the data

annotations = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-annotation.txt')
keep_samples = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/quality-single-cells.txt', index_col=0, header=None)

Apply the QC filters.

annotations = annotations[keep_samples.values.ravel()]
annotations.shape

(5597, 40)

Distributions of read counts

H. sapiens

plt.clf()
plt.gcf().set_size_inches(12, 4)
plt.hist(annotations['reads_hs'], bins=100)
plt.xlabel('Read count')
plt.ylabel('Number of cells')
plt.title('Read count by cell')

Text(0.5,1,'Read count by cell')

plt.clf()
plt.gcf().set_size_inches(12, 4)
colors = {k: matplotlib.cm.get_cmap('Set1').colors[i] for i, k in enumerate(set(annotations['batch']))}
for batch, g in annotations.groupby('batch'):
  plt.hist(g['reads_hs'], histtype='step', color=colors[batch], label=batch, bins=50)
plt.legend()
plt.xlabel('Read count')
plt.ylabel('Density')
plt.title('Read count by batch')

Text(0.5,1,'Read count by batch')

plt.clf()
plt.gcf().set_size_inches(12, 4)
colors = {k: matplotlib.cm.get_cmap('Set1').colors[i] for i, k in enumerate(set(annotations['batch']))}
grid = np.linspace(annotations['reads_hs'].min(), annotations['reads_hs'].max(), 200)
for batch, g in annotations.groupby('batch'):
  density = st.gaussian_kde(g['reads_hs'])
  plt.plot(grid, density.evaluate(grid), color=colors[batch], label=batch)
plt.legend()
plt.xlabel('Read count')
plt.ylabel('Density')
plt.title('Read count by batch')

Text(0.5,1,'Read count by batch')

plt.clf()
plt.gcf().set_size_inches(12, 4)
_ = plt.boxplot(annotations.groupby('chip_id').apply(lambda x: list(x['reads_hs'])),
                labels=sorted(set(annotations['chip_id'])))
plt.xlabel('Individual')
plt.ylabel('Read count')
_ = plt.xticks(rotation=90)

plt.clf()
fig, ax = plt.subplots(len(set(annotations['batch'])), 1)
fig.set_size_inches(12, 10)
for a, (batch, g) in zip(ax, annotations.groupby('batch')):
  bp = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['reads_hs'])),
                 positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))])
  a.set_xticks(np.arange(1, len(set(annotations['chip_id']))))
  a.set_xticklabels([])
  a.set_ylabel('Batch {} read count'.format(batch))
ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90)
fig.suptitle('Read count by individual/batch')

Text(0.5,0.98,'Read count by individual/batch')

ERCC

plt.clf()
fig, ax = plt.subplots(3, 1)
fig.set_size_inches(12, 12)
for a, (k, g) in zip(ax, annotations.groupby('ERCC')):
  _ = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['reads_ercc'])),
                positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))])
  a.set_ylabel('Read count ({})'.format(k))
  a.set_xticks(np.arange(1, len(set(annotations['chip_id']))))
  a.set_xticklabels([])
ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90)
plt.xlabel('Individual')
plt.suptitle('ERCC read count by individual')

Text(0.5,0.98,'ERCC read count by individual')

D. melanogaster

plt.clf()
fig, ax = plt.subplots(len(set(annotations['fly'])), 1)
fig.set_size_inches(12, 8)
for a, (k, g) in zip(ax, annotations.groupby('fly')):
  _ = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['reads_dm'])),
                positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))])
  a.set_ylabel('Read count ({})'.format(k))
  a.set_xticks(np.arange(1, len(set(annotations['chip_id']))))
  a.set_xticklabels([])
ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90)
plt.xlabel('Individual')
plt.suptitle('D. mel. read count by individual')

Text(0.5,0.98,'D. mel. read count by individual')

C. elegans

plt.clf()
fig, ax = plt.subplots(3, 1)
fig.set_size_inches(12, 12)
for a, (k, g) in zip(ax, annotations.groupby('worm')):
  _ = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['reads_ce'])),
                positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))])
  a.set_ylabel('Read count ({})'.format(k))
  a.set_xticks(np.arange(1, len(set(annotations['chip_id']))))
  a.set_xticklabels([])
ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90)
plt.xlabel('Individual')
plt.suptitle('C. elegans read count by individual')

Text(0.5,0.98,'C. elegans read count by individual')

Distributions of molecules

H. sapiens

plt.clf()
plt.hist(annotations['mol_hs'], bins=100)
plt.xlabel('Number of molecules')
plt.ylabel('Number of cells')
plt.title('Number of molecules by cell')
plt.close()

plt.clf()
plt.gcf().set_size_inches(12, 4)
_ = plt.boxplot(annotations.groupby('chip_id').apply(lambda x: list(x['mol_hs'])),
                labels=sorted(set(annotations['chip_id'])),
                sym='.')
plt.xlabel('Individual')
plt.ylabel('Molecule count')
plt.xticks(rotation=90)
plt.title('Molecule count by individual')

Text(0.5,1,'Molecule count by individual')

plt.clf()
fig, ax = plt.subplots(len(set(annotations['batch'])), 1)
fig.set_size_inches(12, 10)
for a, (batch, g) in zip(ax, annotations.groupby('batch')):
  bp = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['mol_hs'])),
                 positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))])
  a.set_xticks(np.arange(1, len(set(annotations['chip_id']))))
  a.set_xticklabels([])
  a.set_ylabel('Batch {} molecule count'.format(batch))
ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90)
fig.suptitle('Molecule count by individual')

Text(0.5,0.98,'Molecule count by individual')

ERCC

plt.clf()
fig, ax = plt.subplots(3, 1)
fig.set_size_inches(12, 12)
for a, (k, g) in zip(ax, annotations.groupby('ERCC')):
  _ = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['mol_ercc'])),
                positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))])
  a.set_ylabel('Molecule count ({})'.format(k))
  a.set_xticks(np.arange(1, len(set(annotations['chip_id']))))
  a.set_xticklabels([])
ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90)
fig.suptitle('Molecule count by individual')
plt.xlabel('Individual')
plt.suptitle('ERCC molecule count by individual')

Text(0.5,0.98,'ERCC molecule count by individual')

D. melanogaster

plt.clf()
fig, ax = plt.subplots(len(set(annotations['fly'])), 1)
fig.set_size_inches(12, 8)
for a, (k, g) in zip(ax, annotations.groupby('fly')):
  _ = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['mol_dm'])),
                positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))])
  a.set_ylabel('Molecule count ({})'.format(k))
  a.set_xticks(np.arange(1, len(set(annotations['chip_id']))))
  a.set_xticklabels([])
ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90)
fig.suptitle('Molecule count by individual')
plt.xlabel('Individual')
plt.suptitle('D. mel. molecule count by individual')

Text(0.5,0.98,'D. mel. molecule count by individual')

C. elegans

plt.clf()
fig, ax = plt.subplots(3, 1)
fig.set_size_inches(12, 12)
for a, (k, g) in zip(ax, annotations.groupby('worm')):
  _ = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['mol_ce'])),
                positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))])
  a.set_ylabel('Molecule count ({})'.format(k))
  a.set_xticks(np.arange(1, len(set(annotations['chip_id']))))
  a.set_xticklabels([])
ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90)
fig.suptitle('Molecule count by individual')
plt.xlabel('Individual')
plt.suptitle('C. elegans molecule count by individual')

Text(0.5,0.98,'C. elegans molecule count by individual')

Distributions of cells

Our previous results on concordance between pools of single cells suggest that at least 50 cells are needed to consistently estimate mean expression.

plt.clf()
plt.gcf().set_size_inches(12, 4)
plt.bar(np.arange(len(set(annotations['chip_id']))),
        annotations.groupby('chip_id')['chip_id'].agg(len), color='k')
plt.xlabel('Individual')
plt.ylabel('Number of cells')
plt.title('Number of cells by individual')
_ = plt.xticks(np.arange(len(set(annotations['chip_id']))), sorted(set(annotations['chip_id'])), rotation=90)

plt.clf()
fig, ax = plt.subplots(len(set(annotations['batch'])), 1)
fig.set_size_inches(12, 10)
for a, (batch, g) in zip(ax, annotations.groupby('batch')):
  bp = a.bar([sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))], 
             g.groupby('chip_id').apply(lambda x: len(x)))
  a.set_xticks(np.arange(1, len(set(annotations['chip_id']))))
  a.set_xticklabels([])
  a.set_ylabel('Batch {} cell count'.format(batch))
ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90)
fig.suptitle('Number of cells by individual')

Text(0.5,0.98,'Number of cells by individual')