UMI count distribution stratified by genotype

Setup

sbatch $RESOURCES --job-name=ipython3 --output=ipython3.out
#!/bin/bash
source activate scqtl
rm -f $HOME/.local/share/jupyter/runtime/kernel-aksarkar.json
ipython3 kernel --ip=$(hostname -i) -f kernel-aksarkar.json
Submitted batch job 39920166

%matplotlib inline

import io
import os
import matplotlib.pyplot as plt
import numpy as np
import requests
import pandas as pd

Read the data

counts = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-counts.txt.gz', index_col=0)
counts.shape
(20327, 2304)

annotations = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-annotation.txt')
annotations['chip_id'].value_counts()
NA18507    233
  NA18502    228
  NA18508    203
  NA18520    164
  NA18862    132
  NA19128    119
  NA19190    114
  NA19257    105
  NA18852    105
  NA18501    100
  NA18856     96
  NA18499     93
  NA18853     89
  NA18870     87
  NA19210     87
  NA19098     75
  NA18505     65
  NA19092     54
  NA19159     45
  NA19203     27
  NA19119     20
  NA18498     20
  NA19131      3
  NA18516      3
  NA19200      3
  NA19129      3
  NA19122      2
  NA18913      2
  NA19138      2
  NA18907      2
  NA19214      2
  NA19144      2
  NA19153      2
  NA18874      1
  NA18924      1
  NA19107      1
  NA18917      1
  NA18867      1
  NA19102      1
  NA19099      1
  NA18486      1
  NA18861      1
  NA19171      1
  NA19176      1
  NA19239      1
  NA18517      1
  NA18523      1
  NA19189      1
  NA19209      1
  NA18511      1
  Name: chip_id, dtype: int64
genotypes = pd.read_table('/project/compbio/jointLCLs/genotype/hg19/YRI/genotypesYRI.gen.txt.gz',
                          header=0, sep='\t')
genotypes = genotypes.rename(columns={k: 'NA{}'.format(k)
                                      for k in genotypes.columns.values[9:]})
genotypes.shape
(6278013, 128)

Download ENSEMBL to Hugo gene symbol conversion table from Biomart.

query = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query  virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" >

        <Dataset name = "hsapiens_gene_ensembl" interface = "default" >
                <Attribute name = "ensembl_gene_id" />
                <Attribute name = "hgnc_symbol" />
        </Dataset>
</Query>"""

resp = requests.get('http://www.ensembl.org/biomart/martservice', params={'query': query})
if resp.status_code != 200:
  raise RuntimeError
ensembl = pd.read_table(io.StringIO(resp.text), header=None, index_col=0)
ensembl.columns = ['symbol']
ensembl.head()
                symbol
0                       
ENSG00000210049    MT-TF
ENSG00000211459  MT-RNR1
ENSG00000210077    MT-TV
ENSG00000210082  MT-RNR2
ENSG00000209082   MT-TL1
eqtls = pd.read_table('/project/compbio/iPSC_eQTL/permutations.all.RNAseq_run.fixed.txt.gz', header=None, index_col=0, sep=' ')
eqtls.columns = ['num_variants', 'alpha', 'beta', 'dummy', 'best_eqtl', 'dist', 'p', 'effect_size', 'perm', 'beta_perm']
eqtls.index = [symbol.split('.')[0] for symbol in eqtls.index.values]
eqtls[['rsid', 'chrom', 'pos']] = eqtls.apply(lambda x: pd.Series(str(x['best_eqtl']).split('.')), axis=1)
eqtls.merge(ensembl, left_index=True, right_index=True).merge(counts, left_index=True, right_index=True).sort_values(by='beta_perm')[['symbol', 'rsid', 'effect_size', 'beta_perm']].head()
                symbol        rsid  effect_size     beta_perm
ENSG00000214941   ZSWIM7  rs73276049     1.792670  1.567000e-17
ENSG00000145725  PPIP5K2     rs34822     1.155860  1.280260e-15
ENSG00000164978    NUDT2   rs7848476     0.998739  6.685150e-15
ENSG00000243317  C7orf73   rs6467603     1.371900  1.508780e-14
ENSG00000240344    PPIL3  rs13412214    -1.343370  1.566880e-14

Look at top mean-QTLs

Plot the distribution of UMI counts per cell stratified by genotype (no pooling by individual).

def stratify_umi(counts, genotypes, gene, rsid):
  counts = pd.DataFrame(counts.loc[gene].transpose())
  counts.index = [k.split('.')[0] for k in counts.index]
  genotypes = genotypes[genotypes['ID'] == rsid].iloc[:,9:].transpose().astype(np.int32)
  if len(counts) == 0 or len(genotypes) == 0:
    raise ValueError
  merged = counts.merge(genotypes, how='left', left_index=True, right_index=True)
  merged.columns = ['count', 'genotype']
  return merged

def plot_umi_dist(counts, genotypes, ensembl, gene, rsid, **kwargs):
  if 'kind' not in kwargs:
    kwargs['kind'] = 'hist'
  plt.clf()
  plt.gcf().set_size_inches(6, 4)
  merged = stratify_umi(counts, genotypes, gene, rsid)
  if kwargs.get('kind', '') == 'hist':
    kwargs['bins'] = np.arange(merged['count'].max()).astype(np.float32)
    kwargs['histtype'] = 'step'
  for k, g in merged.groupby('genotype'):
    ax = g['count'].plot(**kwargs)
    ax.set_xlim(left=0)
  plt.title('{}, {}'.format(ensembl.loc[gene, 'symbol'], rsid))
  plt.xlabel('UMI count')
  plt.legend(labels=np.arange(3))

ZWIM7

plot_umi_dist(counts, genotypes, ensembl, 'ENSG00000214941', 'rs73276049')

214941.png

At this variant, there are only two YRI individuals with genotype 2, and they are not among the individuals assayed.

genotypes[genotypes['ID'] == 'rs73276049'].iloc[0,9:].transpose().astype(np.int32).value_counts()
0    83
  1    34
  2     2
  Name: 2295561, dtype: int64

PPIP5K2

plot_umi_dist(counts, genotypes, ensembl, 'ENSG00000145725', 'rs34822')

145725.png

NUDT2

plot_umi_dist(counts, genotypes, ensembl, 'ENSG00000164978', 'rs7848476')

164978.png

C7orf73

plot_umi_dist(counts, genotypes, ensembl, 'ENSG00000243317', 'rs6467603')

243317.png

PPIL3

plot_umi_dist(counts, genotypes, ensembl, 'ENSG00000240344', 'rs13412214')

240344.png

Author: Abhishek Sarkar

Created: 2018-05-22 Tue 13:57

Validate