Descriptive statistics

Read the data

annotations = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-annotation.txt')
keep_samples = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/quality-single-cells.txt', index_col=0, header=None)

Apply the QC filters.

annotations = annotations[keep_samples.values.ravel()]
annotations.shape
(5597, 40)

Distributions of read counts

H. sapiens

plt.clf()
plt.gcf().set_size_inches(12, 4)
plt.hist(annotations['reads_hs'], bins=100)
plt.xlabel('Read count')
plt.ylabel('Number of cells')
plt.title('Read count by cell')
Text(0.5,1,'Read count by cell')

Sorry, your browser does not support SVG.

plt.clf()
plt.gcf().set_size_inches(12, 4)
colors = {k: matplotlib.cm.get_cmap('Set1').colors[i] for i, k in enumerate(set(annotations['batch']))}
for batch, g in annotations.groupby('batch'):
  plt.hist(g['reads_hs'], histtype='step', color=colors[batch], label=batch, bins=50)
plt.legend()
plt.xlabel('Read count')
plt.ylabel('Density')
plt.title('Read count by batch')
Text(0.5,1,'Read count by batch')

Sorry, your browser does not support SVG.

plt.clf()
plt.gcf().set_size_inches(12, 4)
colors = {k: matplotlib.cm.get_cmap('Set1').colors[i] for i, k in enumerate(set(annotations['batch']))}
grid = np.linspace(annotations['reads_hs'].min(), annotations['reads_hs'].max(), 200)
for batch, g in annotations.groupby('batch'):
  density = st.gaussian_kde(g['reads_hs'])
  plt.plot(grid, density.evaluate(grid), color=colors[batch], label=batch)
plt.legend()
plt.xlabel('Read count')
plt.ylabel('Density')
plt.title('Read count by batch')
Text(0.5,1,'Read count by batch')

Sorry, your browser does not support SVG.

plt.clf()
plt.gcf().set_size_inches(12, 4)
_ = plt.boxplot(annotations.groupby('chip_id').apply(lambda x: list(x['reads_hs'])),
                labels=sorted(set(annotations['chip_id'])))
plt.xlabel('Individual')
plt.ylabel('Read count')
_ = plt.xticks(rotation=90)

Sorry, your browser does not support SVG.

plt.clf()
fig, ax = plt.subplots(len(set(annotations['batch'])), 1)
fig.set_size_inches(12, 10)
for a, (batch, g) in zip(ax, annotations.groupby('batch')):
  bp = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['reads_hs'])),
                 positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))])
  a.set_xticks(np.arange(1, len(set(annotations['chip_id']))))
  a.set_xticklabels([])
  a.set_ylabel('Batch {} read count'.format(batch))
ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90)
fig.suptitle('Read count by individual/batch')
Text(0.5,0.98,'Read count by individual/batch')

Sorry, your browser does not support SVG.

ERCC

plt.clf()
fig, ax = plt.subplots(3, 1)
fig.set_size_inches(12, 12)
for a, (k, g) in zip(ax, annotations.groupby('ERCC')):
  _ = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['reads_ercc'])),
                positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))])
  a.set_ylabel('Read count ({})'.format(k))
  a.set_xticks(np.arange(1, len(set(annotations['chip_id']))))
  a.set_xticklabels([])
ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90)
plt.xlabel('Individual')
plt.suptitle('ERCC read count by individual')
Text(0.5,0.98,'ERCC read count by individual')

Sorry, your browser does not support SVG.

D. melanogaster

plt.clf()
fig, ax = plt.subplots(len(set(annotations['fly'])), 1)
fig.set_size_inches(12, 8)
for a, (k, g) in zip(ax, annotations.groupby('fly')):
  _ = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['reads_dm'])),
                positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))])
  a.set_ylabel('Read count ({})'.format(k))
  a.set_xticks(np.arange(1, len(set(annotations['chip_id']))))
  a.set_xticklabels([])
ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90)
plt.xlabel('Individual')
plt.suptitle('D. mel. read count by individual')
Text(0.5,0.98,'D. mel. read count by individual')

Sorry, your browser does not support SVG.

C. elegans

plt.clf()
fig, ax = plt.subplots(3, 1)
fig.set_size_inches(12, 12)
for a, (k, g) in zip(ax, annotations.groupby('worm')):
  _ = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['reads_ce'])),
                positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))])
  a.set_ylabel('Read count ({})'.format(k))
  a.set_xticks(np.arange(1, len(set(annotations['chip_id']))))
  a.set_xticklabels([])
ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90)
plt.xlabel('Individual')
plt.suptitle('C. elegans read count by individual')
Text(0.5,0.98,'C. elegans read count by individual')

Sorry, your browser does not support SVG.

Distributions of molecules

H. sapiens

plt.clf()
plt.hist(annotations['mol_hs'], bins=100)
plt.xlabel('Number of molecules')
plt.ylabel('Number of cells')
plt.title('Number of molecules by cell')
plt.close()
plt.clf()
plt.gcf().set_size_inches(12, 4)
_ = plt.boxplot(annotations.groupby('chip_id').apply(lambda x: list(x['mol_hs'])),
                labels=sorted(set(annotations['chip_id'])),
                sym='.')
plt.xlabel('Individual')
plt.ylabel('Molecule count')
plt.xticks(rotation=90)
plt.title('Molecule count by individual')
Text(0.5,1,'Molecule count by individual')

Sorry, your browser does not support SVG.

plt.clf()
fig, ax = plt.subplots(len(set(annotations['batch'])), 1)
fig.set_size_inches(12, 10)
for a, (batch, g) in zip(ax, annotations.groupby('batch')):
  bp = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['mol_hs'])),
                 positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))])
  a.set_xticks(np.arange(1, len(set(annotations['chip_id']))))
  a.set_xticklabels([])
  a.set_ylabel('Batch {} molecule count'.format(batch))
ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90)
fig.suptitle('Molecule count by individual')
Text(0.5,0.98,'Molecule count by individual')

Sorry, your browser does not support SVG.

ERCC

plt.clf()
fig, ax = plt.subplots(3, 1)
fig.set_size_inches(12, 12)
for a, (k, g) in zip(ax, annotations.groupby('ERCC')):
  _ = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['mol_ercc'])),
                positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))])
  a.set_ylabel('Molecule count ({})'.format(k))
  a.set_xticks(np.arange(1, len(set(annotations['chip_id']))))
  a.set_xticklabels([])
ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90)
fig.suptitle('Molecule count by individual')
plt.xlabel('Individual')
plt.suptitle('ERCC molecule count by individual')
Text(0.5,0.98,'ERCC molecule count by individual')

Sorry, your browser does not support SVG.

D. melanogaster

plt.clf()
fig, ax = plt.subplots(len(set(annotations['fly'])), 1)
fig.set_size_inches(12, 8)
for a, (k, g) in zip(ax, annotations.groupby('fly')):
  _ = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['mol_dm'])),
                positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))])
  a.set_ylabel('Molecule count ({})'.format(k))
  a.set_xticks(np.arange(1, len(set(annotations['chip_id']))))
  a.set_xticklabels([])
ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90)
fig.suptitle('Molecule count by individual')
plt.xlabel('Individual')
plt.suptitle('D. mel. molecule count by individual')
Text(0.5,0.98,'D. mel. molecule count by individual')

Sorry, your browser does not support SVG.

C. elegans

plt.clf()
fig, ax = plt.subplots(3, 1)
fig.set_size_inches(12, 12)
for a, (k, g) in zip(ax, annotations.groupby('worm')):
  _ = a.boxplot(g.groupby('chip_id').apply(lambda x: list(x['mol_ce'])),
                positions=[sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))])
  a.set_ylabel('Molecule count ({})'.format(k))
  a.set_xticks(np.arange(1, len(set(annotations['chip_id']))))
  a.set_xticklabels([])
ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90)
fig.suptitle('Molecule count by individual')
plt.xlabel('Individual')
plt.suptitle('C. elegans molecule count by individual')
Text(0.5,0.98,'C. elegans molecule count by individual')

Sorry, your browser does not support SVG.

Distributions of cells

Our previous results on concordance between pools of single cells suggest that at least 50 cells are needed to consistently estimate mean expression.

plt.clf()
plt.gcf().set_size_inches(12, 4)
plt.bar(np.arange(len(set(annotations['chip_id']))),
        annotations.groupby('chip_id')['chip_id'].agg(len), color='k')
plt.xlabel('Individual')
plt.ylabel('Number of cells')
plt.title('Number of cells by individual')
_ = plt.xticks(np.arange(len(set(annotations['chip_id']))), sorted(set(annotations['chip_id'])), rotation=90)

Sorry, your browser does not support SVG.

plt.clf()
fig, ax = plt.subplots(len(set(annotations['batch'])), 1)
fig.set_size_inches(12, 10)
for a, (batch, g) in zip(ax, annotations.groupby('batch')):
  bp = a.bar([sorted(set(annotations['chip_id'])).index(k) for k in sorted(set(g['chip_id']))], 
             g.groupby('chip_id').apply(lambda x: len(x)))
  a.set_xticks(np.arange(1, len(set(annotations['chip_id']))))
  a.set_xticklabels([])
  a.set_ylabel('Batch {} cell count'.format(batch))
ax[-1].set_xticklabels(sorted(set(annotations['chip_id'])), rotation=90)
fig.suptitle('Number of cells by individual')
Text(0.5,0.98,'Number of cells by individual')

Sorry, your browser does not support SVG.

Author: Abhishek Sarkar

Created: 2018-08-24 Fri 11:14

Validate