QTL mapping pipeline

Introduction

We take a modular approach to call QTLs:

Estimate a mean and a dispersion for each individual
Treat the mean/dispersion as continuous phenotypes and perform QTL mapping

Here, we solve (2).

We reproduce eQTLs called on the bulk RNA-Seq
We call mean/variance/CV/Fano QTLs using ZINB parameters
We call QTLs using counts and log CPM
We explain away variance QTLs
We test replication/overlap between different QTL calls

Implementation

def cpm(x):
  return pd.DataFrame(pandas2ri.ri2py(edger.cpm(numpy2ri(x.values), log=True)),
                      columns=x.columns,
                      index=x.index)

def qqnorm(x):
  """Wrap around R qqnorm"""
  return np.asarray(rpy2.robjects.r['qqnorm'](numpy2ri(x))[0])

def bh(x):
  """Wrap around p.adjust(..., method='fdr')"""
  return np.asarray(rpy2.robjects.r['p.adjust'](numpy2ri(x), method='fdr'))

gene_info = (pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-genes.txt.gz')
             .set_index('gene')
             .query('source == "H. sapiens"')
             .query('chr != "hsX"')
             .query('chr != "hsY"')
             .query('chr != "hsMT"'))

with sqlite3.connect('/project2/mstephens/aksarkar/projects/singlecell-qtl/browser/browser.db') as conn:
  gene_info.to_sql('gene_info', conn, index=True, if_exists='replace')

def qtltools_format(row, prefix='chr'):
  row['#Chr'] = '{}{}'.format(prefix, row['chr'][2:])
  row['gid'] = row.name
  row['pid'] = row.name
  # Important: qtltools expects TSS start/end
  if row['strand'] == '+':
    row['end'] = row['start']
  else:
    row['start'] = row['end']
  return row.loc[['#Chr', 'start', 'end', 'pid', 'gid', 'strand']]

def write_pheno_file(pheno, gene_info, output_file, holdout=True, **kwargs):
  if holdout:
    genes = gene_info.loc[gene_info.apply(lambda x: bool(int(x['chr'][2:]) % 2), axis=1)]
  else:
    genes = gene_info
  (genes
   .apply(qtltools_format, **kwargs, axis=1)
   .merge(pheno, left_index=True, right_index=True)
   .to_csv(output_file,
           sep='\t',
           header=True,
           index=False,
           index_label=False))

export input=$input
sbatch --partition=$partition --wait
#!/bin/bash
module load bedtools
bedtools sort -header -i $input | bgzip >$input.gz
tabix -f -p bed $input.gz

export pheno=$pheno
export geno=$geno
export op=$op
sbatch --partition=$partition -a 1-100 -J $pheno-qtl --wait
#!/bin/bash
source activate scqtl
qtltools cis --vcf $geno --bed $pheno.bed.gz $op --chunk $SLURM_ARRAY_TASK_ID 100 --out $pheno-qtl.$SLURM_ARRAY_TASK_ID.txt --seed 0

def _read_helper(pheno, columns):
  file_names = ['{}-qtl.{}.txt'.format(pheno, i) for i in range(1, 101)]
  res = (pd.concat([pd.read_table(f, header=None, sep=' ')
                     for f in file_names if os.path.exists(f) and
                     os.path.getsize(f) > 0])
          .rename(columns={i: x for i, x in enumerate(columns)})
          .dropna()
          .sort_values('p_beta'))
  res['p_adjust'] = bh(res['p_beta'])
  res['fdr_pass'] = res['p_adjust'] < 0.1
  return res


def read_fastqtl_output(pheno):
  columns = ['gene', 'num_snps', 'a', 'b', 'dummy', 'id',
             'distance', 'p', 'beta', 'p_empirical', 'p_beta']
  res = _read_helper(pheno, columns)
  # Drop the gene version number
  res['gene'] = res['gene'].apply(lambda x: x.split('.')[0])
  res['chr'] = res['id'].apply(lambda x: x.split('.')[1])
  res['pos'] = res['id'].apply(lambda x: x.split('.')[2])
  res['id'] = res['id'].apply(lambda x: x.split('.')[0])
  return res

def read_qtltools_output(pheno):
  columns = ['gene', 'chr', 'start', 'end', 'strand', 'num_vars',
             'distance', 'id', 'var_chr', 'var_start', 'var_end', 'df',
             'dummy', 'a', 'b', 'p_nominal', 'beta', 'p_empirical', 'p_beta']
  res = _read_helper(pheno, columns)
  res['chr'] = res['var_chr']
  res['pos'] = res['var_start']
  res['id'] = res['id'].apply(lambda x: x.split('.')[0])
  return res

def read_nominal_pass(f):
  isf = st.chi2(1).isf
  result = pd.read_table(f, sep=' ', header=None)
  result.columns = ['gene', 'chr', 'start', 'end', 'strand', 'n', 'distance', 'id', 'var_chr', 'var_start', 'var_end', 'p_nominal', 'beta', 'top']
  result['z'] = np.sign(result['beta']) * np.sqrt(isf(result['p_nominal']))
  return result

def plot_approx_permutation(df):
  plt.clf()
  plt.gcf().set_size_inches(6, 6)
  plt.scatter(df['p_empirical'], df['p_beta'], s=1, c='k')
  plt.plot([0, 1], [0, 1], c='r', ls='--')
  plt.xlabel('Empirical p-value')
  plt.ylabel('Approximate p-value')

def qqplot(qtls):
  N = qtls.shape[0]
  # 95% bootstrap CI
  ci = -np.log10(np.percentile(np.sort(np.random.uniform(size=(100, N)), axis=1), [5, 95], axis=0))

  grid = -np.log10(np.arange(1, 1 + N) / N)
  plt.clf()
  plt.gcf().set_size_inches(6, 6)
  plt.scatter(grid, -np.log10(qtls['p_beta']), s=1, c='k')
  plt.plot([0, np.log10(qtls.shape[0])], [0, np.log10(qtls.shape[0])], c='r', ls='--')
  plt.plot(grid, ci[0], c='r', ls=':')
  plt.plot(grid, ci[1], c='r', ls=':')
  plt.xlabel('Expected $-\log_{10}(p)$')
  _ = plt.ylabel('Observed $-\log_{10}(p)$')

def parse_vcf_dosage(record):
  geno = [float(g) for g in record[9:]]
  return pd.Series(geno)

def extract_qtl_gene_pair(qtl_gene_df, pheno_df, dosages):
  """Return aligned genotype and phenotype matrix for each QTL-gene pair in qtl_gene_df"""
  common_phenos, common_qtls = pheno_df.align(qtl_gene_df.set_index('gene'), join='inner', axis=0)
  # Important: assume individual IDs are prefixed by "NA". This isn't true in
  # the original VCF
  header = pd.read_table(dosages, skiprows=2, nrows=1, header=0).columns[9:]
  genotypes = tabix.open(dosages)
  X, Y = (common_qtls
          .apply(lambda x: parse_vcf_dosage(next(genotypes.query(x['chr'], int(x['var_start']) - 1, int(x['var_start'])))), axis=1)
          .rename(columns={i: ind for i, ind in enumerate(header)})
          .align(common_phenos, join='inner', axis=None))
  return X, Y

def replication_tests(X, Y, C=None):
  """Return a DataFrame containing replication p-values

  X - centered dosage matrix (num_genes, num_individuals)
  Y - phenotype matrix (num_genes, num_individuals)
  C - confounder matrix (num_confounders, num_individuals)

  """
  p, n = X.shape
  assert Y.shape == (p, n)
  if C is not None:
    assert C.shape[1] == n
    C = np.array(C).T
    C = C - C.mean(axis=0)
    # Construct the annihilator matrix I - X X^+
    M = np.eye(n) - C.dot(np.linalg.pinv(C))
  result = []
  _sf = st.chi2(1).sf
  for (_, x), (name, y) in zip(X.iterrows(), Y.iterrows()):
    if np.isclose(x.std(), 0):
      print('Skipping {}'.format(name))
      continue
    x = x.values.copy().reshape(-1, 1)
    x -= x.mean()
    y = y.values.copy().ravel()
    y -= y.mean()
    if C is not None:
      y = M.dot(y)
      y -= y.mean()
    beta, rss, *_ = np.linalg.lstsq(x, y, rcond=-1)
    sigma2 = rss / y.shape[0]
    se = sigma2 / x.T.dot(x).ravel()
    pval = _sf(np.square(beta / se))
    result.append({'gene': name, 'beta': beta[0], 'se': se[0], 'p': pval.ravel()[0]})
  return pd.DataFrame.from_dict(result)

def pairwise_replication(qtls, phenos, ticks, covars=None):
  if covars is not None:
    assert len(covars) == len(phenos)
  assert len(phenos) == len(ticks)
  repl_rate = np.ones((len(phenos), len(phenos)))
  for i, ki in enumerate(phenos):
    for j, kj in enumerate(phenos):
      if i == j:
        continue
      q, p = qtls[ki][0], qtls[kj][1]
      X, Y = extract_qtl_gene_pair(q[q['fdr_pass']], p,
                                   dosages='/scratch/midway2/aksarkar/singlecell/scqtl-mapping/yri-120-dosages.vcf.gz')
      if X.empty:
        continue
      if covars is not None and covars[j] is not None:
        C = covars[j].align(X, axis='columns', join='inner')[0]
      else:
        C = None
      replication = q.merge(
        replication_tests(X, Y, C),
        on='gene',
        suffixes=['_1', '_2'])[['gene', 'id', 'beta_1', 'beta_2', 'p']]
      replication['fdr_pass'] = bh(replication['p']) < .1
      replication['replicated'] = replication.apply(lambda x: x['fdr_pass'] and x['beta_1'] * x['beta_2'] > 0, axis=1)
      repl_rate[i, j] = replication['replicated'].sum() / replication.shape[0]
  return pd.DataFrame(100 * repl_rate, columns=ticks, index=ticks)

def bootstrap_se(X, Y, C=None, num_bootstraps=100, seed=0):
  np.random.seed(seed)
  beta = {}
  for i in range(num_bootstraps):
    b = np.random.choice(X.shape[1], size=X.shape[1], replace=True)
    if C is not None:
      beta[i] = replication_tests(X.iloc[:,b], Y.iloc[:,b], C.iloc[:,b]).set_index('gene')['beta']
    else:
      beta[i] = replication_tests(X.iloc[:,b], Y.iloc[:,b]).set_index('gene')['beta']
  return pd.DataFrame.from_dict(beta).agg(np.std, axis=1)

def _fit_lm(x, y):
  n, p = x.shape
  assert y.shape == (n, 1)
  y -= y.mean()
  x -= x.mean(axis=0)
  beta = x.T.dot(y) / np.var(x, axis=0).values.reshape(-1, 1)
  return beta

def estimate_beta_se(genes, dosages, gene_info, covars=None, window=100000, n_bootstrap=100, seed=0):
  """Estimate beta via OLS and SE via bootstrap

  genes - dataframe (num_genes, num_individuals)
  dosages - VCF file name
  gene_info - dataframe (see read_gene_info)
  covars - dataframe (num_covars, num_individuals)

  """
  with gzip.open(dosages, 'rt') as f:
    for line in f:
      if line.startswith('#CHROM'):
        header = line.split()[9:]
        break
  dosages = tabix.open(dosages)
  if covars is not None:
    covars, genes = covars.align(genes, axis='columns', join='inner')
    _, n = covars.shape
    covars = covars.values.T
    M = np.eye(n) - covars.dot(np.linalg.pinv(covars))
  result = []
  for gene, Y in genes.iterrows():
    if gene in gene_info.index:
      record = gene_info.loc[gene]
      if record['strand'] == '+':
        X = dosages.query('chr{}'.format(record['chr'][2:]), record['start'] - window, record['start'] + window)
      else:
        X = dosages.query('chr{}'.format(record['chr'][2:]), record['end'] - window, record['end'] + window)
      X = list(X)
      meta = [row[2] for row in X]
      X = pd.DataFrame([parse_vcf_dosage(row) for row in X])
      X.index = meta
      X.columns = header
      X, Y = X.align(Y, axis='columns', join='inner')
      if covars is not None:
        Y = M.dot(Y - Y.mean())
      X = X.transform(lambda x: x - x.mean(), axis=1)

      beta = [_fit_lm(X.T, Y.reshape(-1, 1))]
      np.random.seed(seed)
      for _ in range(n_bootstrap):
        B = np.random.choice(n, size=n, replace=True)
        beta.append(_fit_lm(X.iloc[:,B].T, Y[B].reshape(-1, 1)))
      result.append(pd.DataFrame({'gene': gene, 'snp': meta, 'beta': beta[0].values.ravel(), 'se': np.std(np.ma.masked_invalid(np.hstack(beta[1:])), axis=1)}))
  return pd.concat(result)

Preliminaries

Test validity of approximate permutation test

qtltools tries to calibrate false discovery rates using the following procedure:

For each gene, permute the genotype data to estimate the null distribution of the p-values
Fit a beta distribution to the permuted p-values via ML
Compute the lower tail probability of the observed p-value, assuming it was generated from the fitted beta distribution
Apply FDR correction on the set of lower tail probabilities (across all genes)

Test whether the beta approximation is appropriate for our sample size by subsetting GEUVADIS. Take all genes on chromosome 1.

geuvadis = []
for chunk in pd.read_table('/project/compbio/geuvadis/analysis_results/GD462.GeneQuantRPKM.50FN.samplename.resk10.txt.gz', chunksize=100):
  geuvadis.append(chunk.query('Chr == "1"'))
geuvadis = pd.concat(geuvadis)
geuvadis = geuvadis.set_index(geuvadis['Gene_Symbol'].apply(lambda x: x.split('.')[0]))

First, replicate the result in Delaneau et al 2017 by using all 462 individuals from GEUVADIS.

pd.Series(geuvadis.columns).sort_values().to_csv('/scratch/midway2/aksarkar/singlecell/geuvadis/geuvadis-subset.txt', header=None, index=None)

Write out the phenotype file for qtltools. Important: GEUVADIS VCFs code chromosome without chr.

write_pheno_file(geuvadis, gene_info, '/scratch/midway2/aksarkar/singlecell/geuvadis/test.bed', prefix='')

Index the phenotype file. Important: # sorts before c, but after 1.

Submitted batch job 44542169

Perform SNP QC in plink.

sbatch --partition=broadwl --mem=2G --wait
#!/bin/bash
plink --memory 2000 --geno 0.01 --maf 0.05 --keep-fam /scratch/midway2/aksarkar/singlecell/geuvadis-subset.txt --vcf /project/compbio/geuvadis/genotypes/GEUVADIS.chr1.PH1PH2_465.IMPFRQFILT_BIALLELIC_PH.annotv2.genotypes.vcf.gz --recode vcf-iid --out geuvadis-chr1
bgzip -f geuvadis-chr1.vcf
tabix -f -p vcf geuvadis-chr1.vcf.gz

Submitted batch job 44541229

Run qtltools.

Submitted batch job 44685856

Read the results.

geuvadis_qtls = read_qtltools_output('geuvadis/test')

Check the beta approximation to the permutation p-values.

plot_approx_permutation(geuvadis_qtls)

Plot the QQ plot

qqplot(geuvadis_qtls)

Repeat the analysis after subsetting to 54 individuals.

np.random.seed(0)
subset = np.random.choice([x for x in geuvadis.columns], size=54, replace=False)
pd.Series(subset).sort_values().to_csv('/scratch/midway2/aksarkar/singlecell/geuvadis-subset.txt', header=None, index=None)

write_pheno_file(geuvadis[subset], gene_info, '/scratch/midway2/aksarkar/singlecell/geuvadis/test.bed', prefix='')

Submitted batch job 44544326

Submitted batch job 44544018

Submitted batch job 44544375

geuvadis_54_qtls = read_qtltools_output('geuvadis/test')
plot_approx_permutation(geuvadis_54_qtls)

Reproduce bulk eQTL calls

The iPSC bulk eQTLs were called in Banovich et al 2018.

eQTLs in iPSCs and LCLs: We transformed expression levels to a standard normal
within each individual. We next accounted for unknown confounders by removing
principal components from the LCL (15 PCs) and iPSC (10 PCs) data. Genotypes
were obtained using impute2 as described previously (Li et al. 2016). We only
considered variants within 50 kb of genes. To identify association between
genotype and gene expression, we used FastQTL (Ongen et al. 2016). After the
initial regression, a variable number of permutations were performed to obtain
a gene-wise adjusted P-value (Ongen et al. 2016). To identify significant
eQTLs, we used Storey's q-value (Storey and Tibshirani 2003) on the adjusted
P-values. Genes with a q-value less than 0.1 are considered significant.

Important notes:

The text doesn't state how expression level was quantified (it was the ratio of mapped reads to total reads after correction by WASP).

WASP (de Geijin et al 2015) fits quartic polynomials \(f, g\) which predict the total read count per region \(T^*_{ij}\) from the observed read count \(x_{ij}\) and GC content \(w_j\) by maximizing the likelihood of the observed read counts:

\[ x_{ij} \sim \mathrm{Pois}(T^*_{ij}) \]

\[ T^*_{ij} = \exp\left(f\left(\sum_i x_{ij}\right)\right) g(w_j) \]

Using log CPM (under the assumption that we never compare genes to each other) yields 1279 eQTLs (89%).
fastqtl expects gene start/end, and only takes cis-SNPs around the start ignoring strand. The code uses GENCODE v19 exons to define the start/end.

qtltools expects TSS and strand, but doesn't use strand information in cis-eQTL mapping. Using the start coordinate of the provided expression matrix as TSS yields 1265 eQTLs (87%).
The methods section of Degner et al 2012 states data is standardized across individuals, and quantile normalized within individuals. The equation contradicts the text, but the code follows the text.
The code analyzes 100kb windows, contradicting the text.
Not every gene in the input appears in the output, and changing the number of chunks changes the number of genes lost.
QTL-gene pairs passed the Benjamini-Hochberg procedure, not Storey's procedure.

sbatch --partition=broadwl -a 1-25
#!/bin/bash
source activate scqtl
fastqtl -V YRI_SNPs_2_IPSC.txt.gen.gz -B fastqtl_qqnorm_RNAseq_run.fixed.txt.gz -C fasteqtl_PC_RNAseq_run.fixed.txt -O bulk-qtl.$SLURM_ARRAY_TASK_ID.txt --exclude-samples file_IPSC.excl --window 1e5 --permute 1000 10000 --chunk $SLURM_ARRAY_TASK_ID 25 --seed 1475098497

Submitted batch job 44546060

Subsample the individuals.

sbatch --partition=broadwl -a 1-25
#!/bin/bash
source activate scqtl
awk 'NR < 6' file_IPSC.used | cat - file_IPSC.excl >subsample.excl
fastqtl -V YRI_SNPs_2_IPSC.txt.gen.gz -B fastqtl_qqnorm_RNAseq_run.fixed.txt.gz -C fasteqtl_PC_RNAseq_run.fixed.txt -O bulk-qtl.$SLURM_ARRAY_TASK_ID.txt --exclude-samples subsample.excl --window 1e5 --permute 1000 10000 --chunk $SLURM_ARRAY_TASK_ID 25 --seed 1475098497

Submitted batch job 47297158

Read fastqtl output.

bulk_qtls = read_fastqtl_output('reproduce-yang/bulk')

Write out the summary stats with headers.

bulk_qtls.to_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-mapping/bulk.txt.gz', sep='\t', index=None, compression='gzip')

Compare qtltools to fastqtl. The input files need to be modified.

sbatch --partition=broadwl --wait
#!/bin/bash
zcat fastqtl_qqnorm_RNAseq_run.fixed.txt.gz | awk -vOFS='\t' 'NR == 1 {$4 = "pid" OFS "gid" OFS "strand"; for (i = 5; i <= NF; i++) {$i = "NA"$i} print} NR > 1 {$4 = $4 OFS $4 OFS "+"; $3 = $2; print}' >test.bed
awk 'NR == 1 {for (i = 2; i <= NF; i++) {$i = "NA"$i}} {print}' fasteqtl_PC_RNAseq_run.fixed.txt >bulk-covars.txt

Submitted batch job 44979838

Check whether the FDR is properly controlled by permuting.

np.random.seed(0)
permutation = bulk_expr.columns.values.copy()
np.random.shuffle(permutation[5:])
bulk_expr.columns = permutation

covars = (pd.read_table('/scratch/midway2/aksarkar/singlecell/reproduce-yang/covars.txt', sep=' ')
          .rename(columns={k: v for k, v in zip(bulk_expr.columns, permutation)}))
covars.to_csv('/scratch/midway2/aksarkar/singlecell/reproduce-yang/covars.txt', sep=' ', index=False)

Fix the TSS by rewriting the phenotype file.

bulk_expr = pd.read_table('/scratch/midway2/aksarkar/singlecell/reproduce-yang/test.bed', index_col=3)
bulk_expr.index = [x.split('.')[0] for x in bulk_expr.index]
write_pheno_file(bulk_expr.iloc[:,5:], gene_info, '/scratch/midway2/aksarkar/singlecell/reproduce-yang/test.bed', holdout=False)

Index the phenotype file.

Submitted batch job 44683584

Run qtltools.

Submitted batch job 44683586

Run qtltools on reprocessed dosages.

Submitted batch job 47427752

Run qtltools on subsample.

awk '{print "NA" $0}' subsample.excl >subsample-qtltools.excl

Submitted batch job 47428279

Read qtltools output

bulk_qtls = read_qtltools_output('reproduce-yang/test')

Check the beta approximation.

plot_approx_permutation(bulk_qtls)

Plot a QQ plot of adjusted p-values.

qqplot(bulk_qtls)

Take QTLs with \(\mathrm{FDR} < 0.1\).

bulk_qtls['fdr_pass'].sum()

Recall bulk eQTLs from log CPM

Read the counts matrix.

bulk_counts = (pd.read_table('/project2/gilad/singlecell-qtl/bulk/counts_RNAseq_iPSC.txt', sep=' ', index_col=0)
               .rename(columns=lambda x: 'NA{}'.format(x))
               .rename(index=lambda x: x.split('.')[0]))

Throw out individuals.

with open('/scratch/midway2/aksarkar/singlecell/reproduce-yang/file_IPSC.excl') as f:
  for line in f:
    k = 'NA{}'.format(line.strip())
    if k in bulk_counts:
      del bulk_counts[k]

Normalize the counts matrix by computing log CPM. Normalizing by length is unnecessary because we only ever compare counts for the same gene across individuals.

bulk_log_cpm = (cpm(bulk_counts)
                .transform(lambda x: (x - x.mean()) / x.std(), axis=1)
                .apply(qqnorm, axis=0))

Compute expression PCs.

covars = pd.DataFrame(skd.PCA(n_components=10).fit(bulk_log_cpm).components_, columns=bulk_log_cpm.columns)
covars.index.name = 'id'
covars.to_csv('/scratch/midway2/aksarkar/singlecell/recall-bulk/log-cpm-covars.txt', sep='\t')

Check whether the false discovery rate is properly controlled by permuting the data.

Write the phenotype matrix in qtltools format. Use the annotation data (ENSEMBL 75) in this repository to be consistent with the single cell data. Important: this loses 1716 genes (are they pseudogenes?)

write_pheno_file(
  bulk_log_cpm,
  gene_info,
  holdout=False,
  output_file='/scratch/midway2/aksarkar/singlecell/recall-bulk/bulk-log-cpm.bed')

Index the phenotype file.

Submitted batch job 44681764

Ensure the dosage file follows the VCF standard. Add the prefix NA to sample IDs.

sbatch --partition=broadwl
#!/bin/bash
zcat YRI_SNPs_2_IPSC.txt.gen.gz | awk -vOFS='\t' 'BEGIN {print "##fileformat=VCFv4.2"; print "##FORMAT=<ID=DS,Number=1,Type=Float>"} NR == 1 {for (i = 10; i <= NF; i++) {$i = "NA"$i}} {print}' | bgzip >yri-dosages.vcf.gz
tabix yri-dosages.vcf.gz

Submitted batch job 44546926

Run qtltools

Submitted batch job 44681768

Read the output. Important: this loses 201 genes (is this a bug in qtltools)?

bulk_cpm_qtls = read_qtltools_output('recall-bulk/bulk-log-cpm')

Check the beta approximation.

plot_approx_permutation(bulk_cpm_qtls)

Plot a QQ plot of adjusted p-values.

qqplot(bulk_cpm_qtls)

Take QTLs with \(\mathrm{FDR} < 0.1\).

bulk_cpm_qtls['fdr_pass'].sum()

Recall bulk eQTLs from log TPM

We reprocessed the bulk RNA-Seq data using kallisto. Read the TPM matrix.

bulk_log_tpm = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/kallisto/bulk-ipsc-tpm.txt.gz', header=None, sep=' ')
bulk_log_tpm = np.log(bulk_log_tpm.pivot(columns=0, index=1, values=2) + 1)
bulk_log_tpm.index = [x.split('.')[0] for x in bulk_log_tpm.index]
# Important: need to throw out all zero rows because they blow up
# standardization
bulk_log_tpm = (bulk_log_tpm[bulk_log_tpm.apply(lambda x: x.sum() > 0, axis=1)]
                .transform(lambda x: (x - x.mean()) / x.std(), axis=1)
                .apply(qqnorm, axis=0))

To quantify how much power we expect to lose going from 58 to 53 individuals (in our scRNA-Seq data), perform QTL mapping on a random subset of 53 individuals.

np.random.seed(0)
keep_inds = np.random.choice(bulk_log_tpm.columns, size=53, replace=False)
bulk_log_tpm = bulk_log_tpm.filter(items=keep_inds, axis='columns')

Get the TSS information. Use the annotation data (ENSEMBL 75) in this repository to be consistent with the single cell data.

Write the phenotype matrix in qtltools format. Important: this loses 1034 genes

write_pheno_file(
  bulk_log_tpm,
  gene_info,
  holdout=False,
  output_file='/scratch/midway2/aksarkar/singlecell/recall-bulk/bulk-log-tpm.bed')

Index the phenotype file.

Submitted batch job 47427500

Compute principal components.

covars = pd.DataFrame(skd.PCA(n_components=6).fit(bulk_log_tpm).components_, columns=bulk_log_tpm.columns)
covars.index.name = 'id'
covars.to_csv('/scratch/midway2/aksarkar/singlecell/recall-bulk/log-tpm-covars.txt', sep='\t')

Run qtltools

Submitted batch job 47427616

Read the output. Important: this loses 201 genes (is this a bug in qtltools)?

bulk_tpm_qtls = read_qtltools_output('recall-bulk/bulk-log-tpm')

Check the beta approximation to the permuted p-values.

plot_approx_permutation(bulk_tpm_qtls)

Plot a QQ plot of the adjusted p-values.

qqplot(bulk_tpm_qtls)

Take QTLs with \(\mathrm{FDR} < 0.1\).

bulk_tpm_qtls['fdr_pass'].sum()

Reprocess YRI dosages

We have two individuals which weren't used in Banovich et al 2018, so we don't have their genotypes. Reprocess the IMPUTE2 output to get the correct dosage matrix.

rsync -au -f '+ */' -f '+ *.impute2.gz' -f '+ YRI_samples.txt' -f '- *' /mnt/lustre/data/internal/genotypes/hg19/YRI/ aksarkar@midway2.rcc.uchicago.edu:/scratch/midway2/aksarkar/singlecell/scqtl-mapping/

import argparse
import glob
import gzip
import numpy as np
import pandas as pd
import sqlite3

def convert_impute2_vcf(file, chrom, outfile, min_maf=0.05, mask=None):
  for line in file:
    record = line.split()
    posterior = np.array([float(x) for x in record[5:]])
    dose = posterior.reshape(-1, 3).dot(np.arange(3))
    if mask is not None:
      dose = dose[mask]
    if min_maf <= dose.mean() / 2 <= 1 - min_maf:
      print(chrom, record[2], '{}.{}.{}'.format(record[1], chrom, record[2]),
            record[3], record[4], '.', '.', '.', 'DS',
            *['{:.3f}'.format(x) for x in dose], sep='\t', file=outfile)

parser = argparse.ArgumentParser()
parser.add_argument('-s', '--subset', help='Sample inclusion list', type=pd.read_table, default=None)
parser.add_argument('-o', '--output', help='Output file', default='dosages.vcf')
args = parser.parse_args()

samples = pd.read_table('YRI_samples.txt', header=None, sep=' ')
if args.subset is not None:
  mask = samples[0].isin(args.subset).values
  samples = samples.loc[mask.values.ravel(), 0]
else:
  mask = None
  samples = samples.loc[:,0]
with open(args.output, 'w') as f:
  print('##fileformat=VCFv4.2', file=f)
  print('##FORMAT=<ID=DS,Number=1,Type=Float>', file=f)
  print('#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', *samples, sep='\t', file=f)
  for c in range(1, 23):
    with gzip.open('chr{}.hg19.impute2.gz'.format(c), 'rt') as g:
      convert_impute2_vcf(g, 'chr{}'.format(c), outfile=f, mask=mask)

sbatch --partition=broadwl -n1 -c28 --exclusive --mem=8G
#!/bin/bash
source activate scqtl
python /project2/mstephens/aksarkar/projects/singlecell-qtl/code/reprocess-dosage.py -o yri-120-dosages.vcf
bgzip --threads=28 yri-120-dosages.vcf
tabix yri-120-dosages.vcf.gz

Submitted batch job 47228636

Analysis using counts

One obvious way to estimate means/variances to use as quantitative phenotypes is to use the sample means/variances of the counts.

Call eQTLs from pooled scRNA-Seq

Read the QC filters.

annotation = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-annotation.txt')
keep_samples = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/quality-single-cells.txt', index_col=0, header=None)
keep_genes = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/genes-pass-filter.txt', index_col=0, header=None)
annotation = annotation.loc[keep_samples.values.ravel()]
keep_inds = annotation.groupby('chip_id').apply(lambda x: len(x) >= 50)

Read and pool the UMI data.

pooled_counts = pd.concat(
  [(chunk
    .filter(items=keep_genes[keep_genes.values].index, axis='index')
    # Important: this can't be done by filter because sample names are
    # different in the QC file
    .loc[:,keep_samples.values.ravel()]
    .groupby(annotation['chip_id'].values, axis=1)
    .agg(np.sum))
   for chunk in
   pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-counts.txt.gz',
                 chunksize=1000, index_col=0)])

Normalize the pooled counts.

pooled_counts = (pooled_counts.loc[:,keep_inds]
              .transform(lambda x: (x - x.mean()) / x.std(), axis=1)
              .apply(qqnorm, axis=0))

Compute principal components and write out the covariate file.

covars = pd.DataFrame(skd.PCA(n_components=10).fit(pooled_counts).components_, columns=pooled_counts.columns)
covars.index.name = 'id'
covars.to_csv('/scratch/midway2/aksarkar/singlecell/scqtl-mapping/counts/pooled-covars.txt', sep='\t')

Write out the phenotype file.

write_pheno_file(pooled_counts, gene_info, holdout=False, output_file='/scratch/midway2/aksarkar/singlecell/scqtl-mapping/counts/pooled.bed')

Index the phenotype file.

Submitted batch job 55769711

Run qtltools

Read the output. Important: this loses 200 genes (is this a bug in qtltools)?

pooled_qtls = read_qtltools_output('scqtl-mapping/counts/pooled')

Check the beta approximation.

plot_approx_permutation(pooled_qtls)

Plot a QQ plot of the adjusted p-values.

qqplot(pooled_qtls)

Take QTLs with \(\mathrm{FDR} < 0.1\).

pooled_qtls['fdr_pass'].sum()

Call mean-QTLs

Read the count matrix.

umi = pd.concat(
  [(chunk
    .filter(items=keep_genes[keep_genes.values].index, axis='index')
    # Important: this can't be done by filter because sample names are
    # different in the QC file
    .loc[:,keep_samples.values.ravel()])
   for chunk in
   pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-counts.txt.gz',
                 chunksize=1000, index_col=0)])

Compute the sample mean per individual, then normalize.

sample_mean = (umi
               .groupby(annotation['chip_id'].values, axis=1)
               .agg(np.mean)
               .transform(lambda x: (x - x.mean()) / x.std(), axis=1)
               .apply(qqnorm, axis=0))

Compute principal components and write out the covariate file.

covars = pd.DataFrame(skd.PCA(n_components=10).fit(sample_mean).components_, columns=sample_mean.columns)
covars.index.name = 'id'
covars.to_csv('/scratch/midway2/aksarkar/singlecell/scqtl-mapping/counts/sample-mean-covars.txt', sep='\t')

Write out the phenotype file for qtltools. Hold out even chromosomes while optimizing the power to detect eQTLs.

write_pheno_file(
  sample_mean,
  gene_info,
  output_file='/scratch/midway2/aksarkar/singlecell/scqtl-mapping/counts/sample-mean.bed',
  holdout=False)

Index the phenotype file.

Submitted batch job 55770738

Run qtltools.

Submitted batch job 55770741

Read the output.

sample_mean_qtls = read_qtltools_output('scqtl-mapping/counts/sample-mean')

Check the beta approximation to the permutation p-values.

plot_approx_permutation(sample_mean_qtls)

Plot a QQ plot of the adjusted p-values.

qqplot(sample_mean_qtls)

Take QTLs at FDR 10%.

sample_mean_qtls['fdr_pass'].sum()

Call variance-QTLs

Read the UMI matrix.

Compute the sample variance per individual, then normalize.

sample_var = (umi
              .groupby(annotation['chip_id'].values, axis=1)
              .agg(np.var)
              .transform(lambda x: (x - x.mean()) / x.std(), axis=1)
              .apply(qqnorm, axis=0))

Compute principal components and write out the covariate file.

covars = pd.DataFrame(skd.PCA(n_components=2).fit(sample_var).components_, columns=sample_var.columns)
covars.index.name = 'id'
covars.to_csv('/scratch/midway2/aksarkar/singlecell/scqtl-mapping/counts/sample-var-covars.txt', sep='\t')

Write out the phenotype file for qtltools. Hold out even chromosomes while optimizing the power to detect eQTLs.

write_pheno_file(sample_var, gene_info, '/scratch/midway2/aksarkar/singlecell/scqtl-mapping/counts/sample-var.bed', holdout=False)

Index the phenotype file.

Submitted batch job 55770997

Run qtltools.

Submitted batch job 55771003

Read the output.

sample_var_qtls = read_qtltools_output('scqtl-mapping/counts/sample-var')

Check the beta approximation to the permutation p-values.

plot_approx_permutation(sample_var_qtls)

Plot a QQ plot of the adjusted p-values.

qqplot(sample_var_qtls)

Take QTLs at FDR 10%.

sample_var_qtls['fdr_pass'].sum()

Call CV-QTLs

Compute the sample CV per individual per gene, then normalize.

sample_cv = (umi
             .groupby(annotation['chip_id'].values, axis=1)
             .agg(lambda x: np.std(x, axis=1) / (np.mean(x, axis=1) + 1e-8))
             .transform(lambda x: (x - x.mean()) / x.std(), axis=1)
             .apply(qqnorm, axis=0))

Write out the phenotype file for qtltools. Hold out even chromosomes while optimizing the power to detect eQTLs.

write_pheno_file(sample_cv, gene_info, '/scratch/midway2/aksarkar/singlecell/scqtl-mapping/counts/sample-cv.bed', holdout=False)

Index the phenotype file.

Submitted batch job 55771641

Run qtltools.

Submitted batch job 55771671

Read the output.

sample_cv_qtls = read_qtltools_output('scqtl-mapping/counts/sample-cv')

Check the beta approximation to the permutation p-values.

plot_approx_permutation(sample_cv_qtls)

Plot a QQ plot of the adjusted p-values.

qqplot(sample_cv_qtls)

Take QTLs at FDR 10%.

sample_cv_qtls['fdr_pass'].sum()

Call Fano-QTLs

Read the UMI matrix.

Compute the sample Fano factor per individual per gene, then normalize.

sample_fano = (umi
               .groupby(annotation['chip_id'].values, axis=1)
               .agg(lambda x: np.var(x, axis=1) / (1e-8 + np.mean(x, axis=1)))
               .transform(lambda x: (x - x.mean()) / x.std(), axis=1)
               .apply(qqnorm, axis=0))

Write out the phenotype file for qtltools. Hold out even chromosomes while optimizing the power to detect eQTLs.

write_pheno_file(sample_fano, gene_info, '/scratch/midway2/aksarkar/singlecell/scqtl-mapping/counts/sample_fano.bed', holdout=False)

Index the phenotype file.

Submitted batch job 55771840

Compute principal components and write out the covariate file.

covars = pd.DataFrame(skd.PCA(n_components=2).fit(sample_fano).components_, columns=sample_fano.columns)
covars.index.name = 'id'
covars.to_csv('/scratch/midway2/aksarkar/singlecell/scqtl-mapping/counts/sample-fano-covars.txt', sep='\t')

Run qtltools.

Submitted batch job 55771844

Read the output.

sample_fano_qtls = read_qtltools_output('scqtl-mapping/counts/sample_fano')

Check the beta approximation to the permutation p-values.

plot_approx_permutation(sample_fano_qtls)

Plot a QQ plot of the adjusted p-values.

qqplot(sample_fano_qtls)

Take QTLs at FDR 10%.

sample_fano_qtls['fdr_pass'].sum()

Analysis using log CPM

The most obvious way to estimate means/variances to use as quantitative phenotypes is to use the sample moments of per cell log CPM.

Care has to be taken in handling zero-observations. Simply ignoring them leads to loss of power due to the pseudocount introduced in computing log CPM. To see this, fix one individual, one gene, and let \(r\) denote the observed count. Then,

\[ \log\mathrm{CPM} \propto \ln(r + \epsilon) - \ln\mathrm{const} \]

Let \(\mu = \mathbb{E}[r\,]\), \(\sigma^2 = \mathbb{V}[r\,]\), expand to second-order, then take expectations over cells:

\[ \mathbb{E}[\,\ln (r + \epsilon)\,] \approx \ln(\mu + \epsilon) - \frac{\sigma^2}{(\mu + \epsilon)^2} \]

\[ \mathbb{V}[\,\ln (r + \epsilon)\,] \approx \frac{2 \sigma^2}{(\mu + \epsilon)^2} - \frac{\sigma^4}{(\mu + \epsilon)^4} \]

Call eQTLs from pooled scRNA-Seq

Read the QC filters.

annotation = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-annotation.txt')
keep_samples = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/quality-single-cells.txt', index_col=0, header=None)
keep_genes = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/genes-pass-filter.txt', index_col=0, header=None)
annotation = annotation.loc[keep_samples.values.ravel()]
keep_inds = annotation.groupby('chip_id').apply(lambda x: len(x) >= 50)

Read and pool the UMI data.

pooled_counts = pd.concat(
  [(chunk
    .filter(items=keep_genes[keep_genes.values].index, axis='index')
    # Important: this can't be done by filter because sample names are
    # different in the QC file
    .loc[:,keep_samples.values.ravel()]
    .groupby(annotation['chip_id'].values, axis=1)
    .agg(np.sum))
   for chunk in
   pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-counts.txt.gz',
                 chunksize=1000, index_col=0)])

Normalize the pooled counts.

pooled_cpm = (cpm(pooled_counts).loc[:,keep_inds]
              .transform(lambda x: (x - x.mean()) / x.std(), axis=1)
              .apply(qqnorm, axis=0))

Compute principal components and write out the covariate file.

covars = pd.DataFrame(skd.PCA(n_components=10).fit(pooled_cpm).components_, columns=pooled_cpm.columns)
covars.index.name = 'id'
covars.to_csv('/scratch/midway2/aksarkar/singlecell/scqtl-mapping/counts/pooled-covars.txt', sep='\t')

Write out the phenotype file.

write_pheno_file(pooled_cpm, gene_info, holdout=False, output_file='/scratch/midway2/aksarkar/singlecell/scqtl-mapping/counts/pooled.bed')

Index the phenotype file.

Submitted batch job 55767633

Run qtltools

Read the output. Important: this loses 200 genes (is this a bug in qtltools)?

pooled_qtls = read_qtltools_output('scqtl-mapping/counts/pooled')

Check the beta approximation.

plot_approx_permutation(pooled_qtls)

Plot a QQ plot of the adjusted p-values.

qqplot(pooled_qtls)

Take QTLs with \(\mathrm{FDR} < 0.1\).

pooled_qtls['fdr_pass'].sum()

Call mean-QTLs

Throw out individuals with fewer than 50 cells.

Read the count matrix.

umi = pd.concat(
  [(chunk
    .filter(items=keep_genes[keep_genes.values].index, axis='index')
    # Important: this can't be done by filter because sample names are
    # different in the QC file
    .loc[:,keep_samples.values.ravel()])
   for chunk in
   pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-counts.txt.gz',
                 chunksize=1000, index_col=0)])

We previously noted that 50 cells was sufficient to reliably estimate the mean of single cell data. Therefore, throw out zeros from the analysis and require that at least 50 samples have non-zero count.

num_non_zero = umi.groupby(annotation['chip_id'].values, axis=1).agg(lambda x: (x > 0).values.sum(axis=1))

Derive a cutoff for the number of individuals with at least 50 non-zero observations per gene.

num_individuals_pass = (num_non_zero > 50).agg(np.sum, axis=1)

plt.clf()
n, b, _ = plt.hist(num_individuals_pass, color='k', bins=np.arange(0, 53, 2), histtype='step', cumulative=True, normed=True)
plt.axhline(y=n[-2], c='r', ls=':', lw=1)
plt.xlabel('Number of individuals with at least 50 non-zero observations')
_ = plt.ylabel('Cumulative fraction of genes')

Compute the sample mean log CPM per individual, then normalize.

sample_mean = (cpm(umi)
               .mask(umi == 0)
               .groupby(annotation['chip_id'].values, axis=1)
               .agg(np.mean)
               .loc[num_individuals_pass >= 50,keep_inds]
               .transform(lambda x: (x - x.mean()) / x.std(), axis=1)
               .apply(qqnorm, axis=0))

Compute principal components and write out the covariate file.

covars = pd.DataFrame(skd.PCA(n_components=10).fit(sample_mean).components_, columns=sample_mean.columns)
covars.index.name = 'id'
covars.to_csv('/scratch/midway2/aksarkar/singlecell/scqtl-mapping/counts/sample-mean-covars.txt', sep='\t')

Write out the phenotype file for qtltools. Hold out even chromosomes while optimizing the power to detect eQTLs.

write_pheno_file(
  sample_mean,
  gene_info,
  output_file='/scratch/midway2/aksarkar/singlecell/scqtl-mapping/counts/sample-mean.bed',
  holdout=False)

Index the phenotype file.

Submitted batch job 55768305

Run qtltools.

Submitted batch job 46254485

Read the output.

sample_mean_qtls = read_qtltools_output('scqtl-mapping/counts/sample-mean')

Check the beta approximation to the permutation p-values.

plot_approx_permutation(sample_mean_qtls)

Plot a QQ plot of the adjusted p-values.

qqplot(sample_mean_qtls)

Take QTLs at FDR 10%.

sample_mean_qtls['fdr_pass'].sum()

Call variance-QTLs

Throw out individuals with fewer than 50 cells.

Read the UMI matrix.

Compute the sample variance of log CPM per individual, then normalize.

sample_var = (cpm(umi)
              .mask(umi == 0)
              .groupby(annotation['chip_id'].values, axis=1)
              .agg(np.var)
              .loc[num_individuals_pass >= 50,keep_inds]
              .transform(lambda x: (x - x.mean()) / x.std(), axis=1)
              .apply(qqnorm, axis=0))

Compute principal components and write out the covariate file.

covars = pd.DataFrame(skd.PCA(n_components=2).fit(sample_var).components_, columns=sample_var.columns)
covars.index.name = 'id'
covars.to_csv('/scratch/midway2/aksarkar/singlecell/scqtl-mapping/counts/sample-var-covars.txt', sep='\t')

Write out the phenotype file for qtltools. Hold out even chromosomes while optimizing the power to detect eQTLs.

write_pheno_file(sample_var, gene_info, '/scratch/midway2/aksarkar/singlecell/scqtl-mapping/counts/sample-var.bed', holdout=False)

Index the phenotype file.

Submitted batch job 55769014

Run qtltools.

Submitted batch job 55769016

Read the output.

sample_var_qtls = read_qtltools_output('scqtl-mapping/counts/sample-var')

Check the beta approximation to the permutation p-values.

plot_approx_permutation(sample_var_qtls)

Plot a QQ plot of the adjusted p-values.

qqplot(sample_var_qtls)

Take QTLs at FDR 10%.

sample_var_qtls['fdr_pass'].sum()

Call CV-QTLs

Throw out individuals with fewer than 50 cells.

Read the UMI matrix.

Compute the sample CV of log CPM per individual, then normalize.

sample_cv = (cpm(umi)
             .mask(umi == 0)
             .groupby(annotation['chip_id'].values, axis=1)
             .agg(lambda x: np.std(x, axis=1) / (np.mean(x, axis=1) + 1e-8))
             .loc[num_individuals_pass >= 50,keep_inds]
             .transform(lambda x: (x - x.mean()) / x.std(), axis=1)
             .apply(qqnorm, axis=0))

Write out the phenotype file for qtltools. Hold out even chromosomes while optimizing the power to detect eQTLs.

write_pheno_file(sample_cv, gene_info, '/scratch/midway2/aksarkar/singlecell/scqtl-mapping/counts/sample-cv.bed', holdout=False)

Index the phenotype file.

Submitted batch job 55769280

Run qtltools.

Submitted batch job 55769282

Read the output.

sample_cv_qtls = read_qtltools_output('scqtl-mapping/counts/sample-cv')

Check the beta approximation to the permutation p-values.

plot_approx_permutation(sample_cv_qtls)

Plot a QQ plot of the adjusted p-values.

qqplot(sample_cv_qtls)

Take QTLs at FDR 10%.

sample_cv_qtls['fdr_pass'].sum()

Call Fano-QTLs

Throw out individuals with fewer than 50 cells.

Read the UMI matrix.

Fisher's index of dispersion is defined as \(V[x] / E[x]\). The Fano factor is Fisher's index of dispersion over a fixed window (in our case, the total number of reads).

Compute the sample Fano factor of log CPM per individual, then normalize.

sample_fano = (cpm(umi)
               .mask(umi == 0)
               .groupby(annotation['chip_id'].values, axis=1)
               .agg(lambda x: np.var(x, axis=1) / (np.mean(x, axis=1)))
               .loc[num_individuals_pass >= 50,keep_inds]
               .transform(lambda x: (x - x.mean()) / x.std(), axis=1)
               .apply(qqnorm, axis=0))

Write out the phenotype file for qtltools. Hold out even chromosomes while optimizing the power to detect eQTLs.

write_pheno_file(sample_fano, gene_info, '/scratch/midway2/aksarkar/singlecell/scqtl-mapping/counts/sample_fano.bed', holdout=False)

Index the phenotype file.

Submitted batch job 55769523

Compute principal components and write out the covariate file.

covars = pd.DataFrame(skd.PCA(n_components=2).fit(sample_fano).components_, columns=sample_fano.columns)
covars.index.name = 'id'
covars.to_csv('/scratch/midway2/aksarkar/singlecell/scqtl-mapping/counts/sample-fano-covars.txt', sep='\t')

Run qtltools.

Submitted batch job 55769526

Read the output.

sample_fano_qtls = read_qtltools_output('scqtl-mapping/counts/sample_fano')

Check the beta approximation to the permutation p-values.

plot_approx_permutation(sample_fano_qtls)

Plot a QQ plot of the adjusted p-values.

qqplot(sample_fano_qtls)

Take QTLs at FDR 10%.

sample_fano_qtls['fdr_pass'].sum()

Analysis using ZINB

Call eQTLs

Estimate the moments of latent gene expression. We have:

\[ \mathbb{E}[\lambda_{ijk}] = (1 - \pi_{ik}) \mu_{ik} \]

\[ \mathbb{V}[\lambda_{ijk}] = (1 - \pi_{ik}) \phi_{ik}^{-1} + \pi_{ik} (1 - \pi_{ik}) \mu_{ik}^2 \]

log_mu = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design1/zi2-log-mu.txt.gz', index_col=0, sep=' ')
log_phi = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design1/zi2-log-phi.txt.gz', index_col=0, sep=' ')
logodds = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design1/zi2-logodds.txt.gz', index_col=0, sep=' ')
for x in (log_mu, log_phi, logodds):
  del x['NA18498']

# Important: log(sigmoid(x)) = -softplus(-x)
mean_by_ind = np.exp(log_mu - np.log1p(np.exp(logodds)))
variance_by_ind = np.exp(2 * log_mu + log_phi - np.log1p(np.exp(logodds))) + np.exp(-np.log1p(np.exp(logodds)) - np.log1p(np.exp(-logodds)) + 2 * log_mu)

Normalize the mean matrix analagous to the bulk data.

mean = (mean_by_ind
        .transform(lambda x: (x - x.mean()) / x.std(), axis=1)
        .apply(qqnorm, axis=0))

Compute principal components of the mean matrix.

covars = pd.DataFrame(skd.PCA(n_components=10).fit(mean).components_, columns=mean.columns)
covars.index.name = 'id'
covars.to_csv('/scratch/midway2/aksarkar/singlecell/scqtl-mapping/zinb/mean-covars.txt', sep='\t')

Write out the phenotype file for qtltools. Hold out even chromosomes while optimizing the power to detect eQTLs.

write_pheno_file(mean, gene_info, '/scratch/midway2/aksarkar/singlecell/scqtl-mapping/zinb/mean.bed', holdout=False)

Index the phenotype file.

Submitted batch job 55772051

Run qtltools.

Submitted batch job 55772076

Read the output.

mean_qtls = read_qtltools_output('scqtl-mapping/zinb/mean')

Check the beta approximation to the permutation p-values.

plot_approx_permutation(mean_qtls)

Plot a QQ plot of the adjusted p-values.

qqplot(mean_qtls)

Take QTLs at FDR 10%.

mean_qtls['fdr_pass'].sum()

Fit multiplicative model

Estimate the moments of latent gene expression. We have:

\[ \mathbb{E}[\lambda_{ijk}] = (1 - \pi_{ik}) \mu_{ik} \]

\[ \mathbb{V}[\lambda_{ijk}] = (1 - \pi_{ik}) \phi_{ik}^{-1} + \pi_{ik} (1 - \pi_{ik}) \mu_{ik}^2 \]

log_mu = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design1/zi2-log-mu.txt.gz', index_col=0, sep=' ')
log_phi = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design1/zi2-log-phi.txt.gz', index_col=0, sep=' ')
logodds = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design1/zi2-logodds.txt.gz', index_col=0, sep=' ')
for x in (log_mu, log_phi, logodds):
  del x['NA18498']

# Important: log(sigmoid(x)) = -softplus(-x)
mean_by_ind = np.exp(log_mu - np.log1p(np.exp(logodds)))
variance_by_ind = np.exp(2 * log_mu + log_phi - np.log1p(np.exp(logodds))) + np.exp(-np.log1p(np.exp(logodds)) - np.log1p(np.exp(-logodds)) + 2 * log_mu)

Normalize the mean matrix analagous to the bulk data.

multmean = (np.log(mean_by_ind)
        .transform(lambda x: (x - x.mean()) / x.std(), axis=1)
        .apply(qqnorm, axis=0))

Compute principal components of the mean matrix.

covars = pd.DataFrame(skd.PCA(n_components=10).fit(multmean).components_, columns=multmean.columns)
covars.index.name = 'id'
covars.to_csv('/scratch/midway2/aksarkar/singlecell/scqtl-mapping/zinb/multmean-covars.txt', sep='\t')

Write out the phenotype file for qtltools. Hold out even chromosomes while optimizing the power to detect eQTLs.

write_pheno_file(multmean, gene_info, '/scratch/midway2/aksarkar/singlecell/scqtl-mapping/zinb/multmean.bed', holdout=False)

Index the phenotype file.

Submitted batch job 55772263

Run qtltools.

Submitted batch job 55772298

Read the output.

multmean_qtls = read_qtltools_output('scqtl-mapping/zinb/mean')

Check the beta approximation to the permutation p-values.

plot_approx_permutation(multmean_qtls)

Plot a QQ plot of the adjusted p-values.

qqplot(multmean_qtls)

Take QTLs at FDR 10%.

multmean_qtls['fdr_pass'].sum()

Call φ-QTLs

Normalize the phenotype matrix.

log_phi = log_phi.transform(lambda x: (x - x.mean()) / x.std(), axis=1).apply(qqnorm, axis=1)

Write out the phenotype file.

write_pheno_file(log_phi, gene_info, holdout=False, output_file='/scratch/midway2/aksarkar/singlecell/scqtl-mapping/zinb/log_phi.bed')

Index the phenotype file.

Submitted batch job 55772635

Run qtltools.

Submitted batch job 55772639

Read the output.

log_phi_qtls = read_qtltools_output('scqtl-mapping/zinb/log_phi')

Check the beta approximation to the permutation p-values.

plot_approx_permutation(log_phi_qtls)

Plot a QQ plot of the adjusted p-values.

qqplot(log_phi_qtls)

Take QTLs at FDR 10%.

log_phi_qtls['fdr_pass'].sum()

Call variance-QTLs

Normalize the variance matrix.

variance = (variance_by_ind
            .transform(lambda x: (x - x.mean()) / x.std(), axis=1)
            .apply(qqnorm, axis=0))

Compute principal components of the variance matrix.

covars = pd.DataFrame(skd.PCA(n_components=2).fit(variance).components_, columns=variance.columns)
covars.index.name = 'id'
covars.to_csv('/scratch/midway2/aksarkar/singlecell/scqtl-mapping/zinb/variance-covars.txt', sep='\t')

Write out the phenotype file for qtltools.

write_pheno_file(variance, gene_info, '/scratch/midway2/aksarkar/singlecell/scqtl-mapping/zinb/variance.bed', holdout=False)

Index the phenotype file.

Submitted batch job 55772747

Run qtltools.

Submitted batch job 55772752

Read the output.

variance_qtls = read_qtltools_output('scqtl-mapping/zinb/variance')

Check the beta approximation to the permutation p-values.

plot_approx_permutation(variance_qtls)

Plot a QQ plot of the adjusted p-values.

qqplot(variance_qtls)

Take QTLs at FDR 10%.

variance_qtls['fdr_pass'].sum()

Call CV-QTLs

Estimate the coefficient of variation.

cv = np.sqrt(variance_by_ind) / mean_by_ind

Normalize the CV matrix.

cv = cv.transform(lambda x: (x - x.mean()) / x.std(), axis=1).apply(qqnorm, axis=0)

Write out the phenotype file for qtltools. Hold out even chromosomes while optimizing the power to detect eQTLs.

write_pheno_file(cv, gene_info, '/scratch/midway2/aksarkar/singlecell/scqtl-mapping/zinb/cv.bed', holdout=False)

Index the phenotype file.

Submitted batch job 55772866

Run qtltools.

Submitted batch job 55772867

Read the output.

cv_qtls = read_qtltools_output('scqtl-mapping/zinb/cv')

Check the beta approximation to the permutation p-values.

plot_approx_permutation(cv_qtls)

Plot a QQ plot of the adjusted p-values.

qqplot(cv_qtls)

Take QTLs at FDR 10%.

cv_qtls['fdr_pass'].sum()

Call Fano-QTLs

Estimate the coefficient of variation.

fano = variance_by_ind / mean_by_ind

Normalize the Fano matrix.

fano = fano.transform(lambda x: (x - x.mean()) / x.std(), axis=1).apply(qqnorm, axis=0)

Write out the phenotype file for qtltools. Hold out even chromosomes while optimizing the power to detect eQTLs.

write_pheno_file(fano, gene_info, '/scratch/midway2/aksarkar/singlecell/scqtl-mapping/zinb/fano.bed', holdout=False)

Index the phenotype file.

Submitted batch job 55772970

Run qtltools.

Submitted batch job 55772971

Read the output.

fano_qtls = read_qtltools_output('scqtl-mapping/zinb/fano')

Check the beta approximation to the permutation p-values.

plot_approx_permutation(fano_qtls)

Plot a QQ plot of the adjusted p-values.

qqplot(fano_qtls)

Take QTLs at FDR 10%.

fano_qtls['fdr_pass'].sum()

Explain away variance QTLs

Call variance-QTLs

Regress out mean from variance.

M = mean_by_ind.transform(lambda x: (x - x.mean()) / x.std(), axis=1)
V = variance_by_ind.transform(lambda x: (x - x.mean()) / x.std(), axis=1)
beta = (M * V).sum(axis=1) / M.shape[1]

Normalize the variance matrix.

resid = ((V - M.mul(beta, axis='index'))
            .transform(lambda x: (x - x.mean()) / x.std(), axis=1)
            .apply(qqnorm, axis=0))

Compute principal components of the resid matrix.

covars = pd.DataFrame(skd.PCA(n_components=2).fit(resid).components_, columns=resid.columns)
covars.index.name = 'id'
covars.to_csv('/scratch/midway2/aksarkar/singlecell/scqtl-mapping/zinb/resid-covars.txt', sep='\t')

Write out the phenotype file for qtltools.

write_pheno_file(resid, gene_info, '/scratch/midway2/aksarkar/singlecell/scqtl-mapping/zinb/resid.bed', holdout=False)

Index the phenotype file.

Submitted batch job 55773126

Run qtltools.

Submitted batch job 55773130

Read the output.

resid_qtls = read_qtltools_output('scqtl-mapping/zinb/resid')

Check the beta approximation to the permutation p-values.

plot_approx_permutation(resid_qtls)

Plot a QQ plot of the adjusted p-values.

qqplot(resid_qtls)

Take QTLs at FDR 10%.

resid_qtls['fdr_pass'].sum()

Write out the QTLs

cp covars.txt bulk-covars.txt
cat test-qtl.*.txt | awk 'BEGIN {print "gene", "chr", "start", "end", "strand", "num_vars", "distance", "id", "var_chr", "var_start", "var_end", "df", "dummy", "a", "b", "p_nominal", "beta", "p_empirical", "p_beta"} {print}' | gzip >bulk.txt.gz
cp test.bed.gz /project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-mapping/bulk.bed.gz
cp bulk-covars.txt bulk.txt.gz /project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-mapping/

sbatch --partition=mstephens
#!/bin/bash
cat >.rsync-filter <<EOF
+ */
+ *.bed.gz
+ *.txt.gz
+ *covars*
- *
EOF
function z {
    pheno=$(basename $1 -qtl.1.txt)
    dir=$(dirname $(readlink -f $1))
    test $1 -nt $dir/$pheno.txt.gz && cat $dir/$pheno-qtl.*.txt | awk 'BEGIN {print "gene", "chr", "start", "end", "strand", "num_vars", "distance", "id", "var_chr", "var_start", "var_end", "df", "dummy", "a", "b", "p_nominal", "beta", "p_empirical", "p_beta"} {print}' | gzip >$dir/$pheno.txt.gz;
}
export -f z
find -name "*-qtl.1.txt" | parallel -j1 z
rsync -FFau . /project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-mapping/

Submitted batch job 55776894

QTL browser

Read the QTLs.

prefix = '/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-mapping/'
qtls = (pd.read_table('{}/multmean.txt.gz'.format(prefix), sep=' ')
        .sort_values('p_beta')
        .head(n=100))
norm = (pd.read_table('{}/multmean.bed.gz'.format(prefix))
        .set_index('pid')
        .filter(like='NA', axis='columns'))
X, Y = extract_qtl_gene_pair(qtls, norm, '/scratch/midway2/aksarkar/singlecell/scqtl-mapping/yri-120-dosages.vcf.gz')

Read the un-normalized phenotypes.

Read the estimated standard errors.

log_mean_sampling_var = pd.read_table('/scratch/midway2/aksarkar/singlecell/power/log_mean-se.txt.gz', index_col=0)
log_phi_sampling_var = pd.read_table('/scratch/midway2/aksarkar/singlecell/power/disp-se.txt.gz', index_col=0)

Annotate the genes with the estimated measurement error variances.

W = qtls.align(np.log(mean_by_ind), axis='index', join='inner')[1]
Z = qtls.align(log_mean_sampling_var, axis='index', join='inner')[1]
qtls['log_mean_error_var'] = Z.mean(axis=1)
qtls['log_mean_resid_var'] = W.var(axis=1) - Z.mean(axis=1)

W = qtls.align(log_phi, axis='index', join='inner')[1]
Z = qtls.align(log_phi_sampling_var, axis='index', join='inner')[1]
qtls['log_phi_error_var'] = Z.mean(axis=1)
qtls['log_phi_resid_var'] = W.var(axis=1) - Z.mean(axis=1)

Read the un-normalized effect sizes.

mean_stats = pd.read_table('/scratch/midway2/aksarkar/singlecell/power/analytic/mean.txt.gz')
disp_stats = pd.read_table('/scratch/midway2/aksarkar/singlecell/power/analytic/disp.txt.gz')

Read the fitted ash models.

with open('fold-change-ash-results.pkl', 'rb') as f:
  ash_results = pickle.load(f)

Annotate the genes with the posterior marginal effect size.

pandas2ri.activate()
mean_stats_subset = mean_stats.merge(qtls.reset_index(), left_on=['gene', 'Unnamed: 0'], right_on=['gene', 'id'], suffixes=['', '_norm'])
res = ashr.ash(mean_stats_subset['beta'], mean_stats_subset['se'], fixg=True, g=ash_results['mean'].rx2('fitted_g'))
mean_stats_subset['pm'] = np.array(ashr.get_pm(res))
mean_stats_subset['lfsr'] = np.array(ashr.get_lfsr(res))

disp_stats_subset = disp_stats.merge(qtls.reset_index(), left_on=['gene', 'Unnamed: 0'], right_on=['gene', 'id'], suffixes=['', '_norm'])
res = ashr.ash(disp_stats_subset['beta'], disp_stats_subset['se'], fixg=True, g=ash_results['disp'].rx2('fitted_g'))
disp_stats_subset['pm'] = np.array(ashr.get_pm(res))
disp_stats_subset['lfsr'] = np.array(ashr.get_lfsr(res))

Read the counts.

(mean_stats_subset
 .dropna()
 .merge(disp_stats_subset, on=['Unnamed: 0', 'gene', 'log_mean_resid_var', 'log_mean_error_var', 'log_phi_resid_var', 'log_phi_error_var'], suffixes=['_mean', '_disp'])
 .merge(gene_info, left_on='gene', right_index=True)
 .rename(columns={'Unnamed: 0': 'id'})
 [['gene', 'name', 'id', 'beta_mean', 'pm_mean', 'lfsr_mean', 'beta_disp', 'pm_disp', 'lfsr_disp',
   'log_mean_resid_var', 'log_mean_error_var', 'log_phi_resid_var', 'log_phi_error_var']]).head()

gene    name                          id  beta_mean   pm_mean  \
0  ENSG00000005075  POLR2J  rs113524320.chr7.102091990  -0.133238 -0.122851
1  ENSG00000008394   MGST1    rs9332891.chr12.16503789  -0.052297 -0.048002
2  ENSG00000008988   RPS20    rs73679777.chr8.56987059  -0.051499 -0.049712
3  ENSG00000029639   TFB1M   rs79805895.chr6.155635281  -0.128963 -0.108330
4  ENSG00000042753   AP2S1   rs10569642.chr19.47352660  -0.045408 -0.035699

lfsr_mean  beta_disp   pm_disp  lfsr_disp  log_mean_resid_var  \
0  6.306067e-14   0.156411  0.084523   0.133519            0.018223
1  1.324717e-05  -0.031652 -0.002645   0.888211            0.003425
2  4.717338e-13  -0.104588 -0.006367   0.839005            0.000912
3  1.837927e-05  -0.010367 -0.000462   0.905676            0.025056
4  9.047097e-03  -0.129085 -0.009645   0.800620            0.001014

log_mean_error_var  log_phi_resid_var  log_phi_error_var
0            0.011825           0.088176           0.038666
1            0.003648           0.098297           0.023262
2            0.003831           0.332455           0.021787
3            0.014941           0.264272           0.140184
4            0.006730           0.295754           0.037082

with sqlite3.connect('/project2/mstephens/aksarkar/projects/singlecell-qtl/browser/browser.db') as conn:
  (mean_stats_subset
   .dropna()
   .merge(disp_stats_subset, on=['Unnamed: 0', 'gene', 'log_mean_resid_var', 'log_mean_error_var', 'log_phi_resid_var', 'log_phi_error_var'], suffixes=['_mean', '_disp'])
   .merge(gene_info, left_on='gene', right_index=True)
   .rename(columns={'Unnamed: 0': 'id'})
   [['gene', 'name', 'id', 'beta_mean', 'pm_mean', 'lfsr_mean', 'beta_disp', 'pm_disp', 'lfsr_disp',
     'log_mean_resid_var', 'log_mean_error_var', 'log_phi_resid_var', 'log_phi_error_var']]
   .to_sql('qtls', conn, index=False, if_exists='replace'))
  conn.execute('create index ix_mean_qtls on qtls("gene");')

  (X.reset_index()
   .melt(id_vars='index', var_name='ind')
   .rename(columns={'index': 'gene'})
   .to_sql('mean_qtl_geno', conn, index=False, if_exists='replace'))
  conn.execute('create index ix_mean_qtl_geno on mean_qtl_geno("gene", "ind");')

  (Y.reset_index()
   .melt(id_vars='index', var_name='ind')
   .rename(columns={'index': 'gene', 'value': 'norm'})
   .merge(
     log_mu.reset_index().melt(id_vars='gene', var_name='ind').rename(columns={'value': 'log_mu'})
   )
   .merge(
     log_phi.reset_index().melt(id_vars='gene', var_name='ind').rename(columns={'value': 'log_phi'})
   )
   .merge(
     logodds.reset_index().melt(id_vars='gene', var_name='ind').rename(columns={'value': 'logodds'})     
   )
   .merge(
     np.log(mean_by_ind).reset_index().melt(id_vars='gene', var_name='ind').rename(columns={'value': 'log_mean'})
   )
   .merge(
     np.sqrt(log_mean_sampling_var).reset_index().melt(id_vars='gene', var_name='ind').rename(columns={'value': 'log_mean_se'})
   )
   .merge(
     np.sqrt(log_phi_sampling_var).reset_index().melt(id_vars='gene', var_name='ind').rename(columns={'value': 'log_phi_se'})     
   )
   .to_sql('params', conn, index=False, if_exists='replace'))
  conn.execute('create index ix_params on params("gene", "ind");')


  (umi.loc[qtls.index]
   .reset_index()
   .melt(id_vars='gene', var_name='sample')
   .to_sql('umi', conn, index=False, if_exists='replace'))
  conn.execute('create index ix_umi on umi("gene", "ind");')

  annotations = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-annotation.txt')
  annotations['sample'] = annotations.apply(lambda x: '{chip_id}.{experiment:08d}.{well}'.format(**dict(x)), axis=1)
  annotations['size'] = annotations['mol_hs']
  (annotations.loc[keep_samples.values.ravel(), ['sample', 'chip_id', 'size']]
   .to_sql('annotation', con=conn, if_exists='replace'))
  conn.execute('create index ix_annotation on annotation(chip_id, sample);')

QTL overlap

Replication rates

Read the QTLs and normalized expression matrices.

prefix = '/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-mapping/'
qtls = {pheno: [pd.read_table('{}/{}.txt.gz'.format(prefix, pheno), sep=' '),
                pd.read_table('{}/{}.bed.gz'.format(prefix, pheno)).set_index('pid').filter(like='NA', axis='columns')]
        for pheno in ['bulk', 'zinb/mean', 'zinb/multmean', 'zinb/variance', 'zinb/cv', 'zinb/fano']}

Read covariates.

base = pathlib.Path('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-mapping/')
covars = [pd.read_table(base / '{}-covars.txt'.format(f), sep='\s+', engine='python', index_col=0)
          for f in ('bulk', 'zinb/mean', 'zinb/multmean', 'zinb/variance')]

The bulk QTL gene names need to be munged.

qtls['bulk'][0]['gene'] = qtls['bulk'][0]['gene'].apply(lambda x: x.split('.')[0])
qtls['bulk'][1].index = [x.split('.')[0] for x in qtls['bulk'][1].index]

Compute the gene-level FDR filter.

for k in qtls:
  qtls[k][0]['fdr_pass'] = bh(qtls[k][0]['p_beta']) < 0.1

Estimate replication rates for mean QTLs.

pd.options.display.float_format = '{:.3g}'.format
pairwise_replication(
  qtls,
  phenos=['bulk', 'zinb/mean', 'zinb/multmean'],
  ticks=['Bulk', 'Mean', 'log mean'],
  covars=covars[:3])

Bulk  Mean  log mean
Bulk       100    79      78.9
Mean      80.4   100       100
log mean    82   100       100

Estimate the rate at which variance QTLs replicate as mean QTLs (and vice versa).

pairwise_replication(qtls,
                     phenos=['zinb/mean', 'zinb/variance', 'zinb/cv', 'zinb/fano'],
                     ticks=['Mean', 'Variance', 'CV', 'Fano'],
                     covars=[covars[1], covars[3], None, None])

Mean  Variance   CV  Fano
Mean       100      85.1 42.6  66.4
Variance   100       100  100   100
CV         100       100  100   100
Fano       100       100  100   100

Relationship of \(p\)-values, effect sizes, and expression levels

Compute relative abundance per individual.

log_mu = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design1/zi2-log-mu.txt.gz', index_col=0, sep=' ')
logodds = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design1/zi2-logodds.txt.gz', index_col=0, sep=' ')

abundance = log_mu - np.log1p(np.exp(logodds))
abundance -= sp.logsumexp(abundance, axis=0)
abundance /= np.log(2)

Investigate whether variance QTL \(p\)-values are correlated with relative abundance.

variance_qtls = qtls['zinb/variance'][0]
thresh_pass = variance_qtls['p_beta'] < 1e-2
var_qtl_abundance, var_qtl_stats = abundance.align(variance_qtls[thresh_pass].set_index('gene'), axis='index', join='inner')
fdr_pass = var_qtl_stats['fdr_pass']

Count how many variance QTLs have \(p < 10^{-2}\)

thresh_pass.sum()

plt.clf()
plt.errorbar(x=var_qtl_abundance.mean(axis=1), y=-np.log10(var_qtl_stats['p_beta']), xerr=var_qtl_abundance.std(axis=1), fmt='none', label=None, lw=1, ecolor='.8', zorder=-1)
plt.scatter(x=var_qtl_abundance[fdr_pass].mean(axis=1), y=-np.log10(var_qtl_stats[fdr_pass]['p_beta']), c='r', s=4, label='FDR 10%')
plt.scatter(x=var_qtl_abundance[~fdr_pass].mean(axis=1), y=-np.log10(var_qtl_stats[~fdr_pass]['p_beta']), c='k', s=4, label='p < 0.01')
plt.legend(loc='upper left', frameon=False)
plt.xlabel('$\log_2(\mathrm{relative\ abundance})$')
_ = plt.ylabel('Variance QTL $-\log_{10}(p)$')

Make sure our effect sizes match qtltools.

variance = qtls['zinb/variance'][1]
Xv, Yv = extract_qtl_gene_pair(variance_qtls[thresh_pass], variance, dosages='/scratch/midway2/aksarkar/singlecell/scqtl-mapping/yri-120-dosages.vcf.gz')
Cv = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-mapping/variance-covars.txt', index_col=0)
Cv = Cv.align(Xv, axis='columns', join='inner')[0]
my_var_qtl_stats = replication_tests(Xv, Yv, Cv)
my_var_qtl_stats.merge(variance_qtls, on='gene').apply(lambda x: abs(x['beta_x'] - x['beta_y']), axis=1).describe()

count       121
mean     0.0904
std       0.105
min     0.00246
25%      0.0255
50%      0.0601
75%       0.132
max       0.867
dtype: float64

Look at the gene with max difference in estimated effect size.

my_var_qtl_stats.iloc[my_var_qtl_stats.merge(variance_qtls, on='gene').apply(lambda x: abs(x['beta_x'] - x['beta_y']), axis=1).idxmax()]

beta               20.3
gene    ENSG00000079102
p                 0.157
se                 14.4
Name: 116, dtype: object

Estimate standard errors via the bootstrap.

var_qtl_stats['bootstrap_se'] = bootstrap_se(Xv, Yv, Cv)
var_qtl_stats['se'] = my_var_qtl_stats.set_index('gene')['se']

Investigate whether analytic SEs are reasonable:

var_qtl_stats['se'].describe()

count       121
mean       0.18
std         1.3
min     0.00832
25%      0.0311
50%      0.0447
75%       0.078
max        14.4
Name: se, dtype: float64

var_qtl_stats['bootstrap_se'].describe()

count      121
mean      1.22
std       9.12
min     0.0995
25%      0.178
50%      0.217
75%      0.285
max        100
Name: bootstrap_se, dtype: float64

Plot analytic SEs against bootstrap SEs.

plt.clf()
plt.gcf().set_size_inches(3, 3)
plt.scatter(var_qtl_stats['se'], var_qtl_stats['bootstrap_se'], s=1, c='k')
plt.plot([0, 1], [0, 1], c='r', ls=':', lw=1)
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel('Analytic SE')
_ = plt.ylabel('Bootstrap SE')

Compute \(z\)-scores using the bootstrap SEs.

var_qtl_stats['z'] = var_qtl_stats['beta'] / var_qtl_stats['bootstrap_se']

Compute a \(z\)-score from the estimated effect size and the \(p\)-value.

var_qtl_stats['isf_z'] = np.sign(var_qtl_stats['beta']) * np.sqrt(st.chi2(1).isf(var_qtl_stats['p_nominal']))

Plot bootstrap \(z\)-scores against inverse-transformed \(z\)-scores.

plt.clf()
plt.gcf().set_size_inches(3, 3)
plt.scatter(var_qtl_stats['z'], var_qtl_stats['isf_z'], s=3, c='k')
plt.axhline(y=0, c='k', lw=1)
plt.axvline(x=0, c='k', lw=1)
plt.plot([-15, 15], [-15, 15], c='r', ls=':', lw=1)
plt.xlim([-15, 15])
plt.ylim([-15, 15])
plt.xlabel('Bootstrap $z$-score')
_ = plt.ylabel('Inverse CDF $z$-score')

Investigate whether \(z\)-scores based on bootstrap SEs agree with permutation \(p\)-values.

plt.clf()
plt.scatter(-np.log10(var_qtl_stats[fdr_pass]['p_beta']), var_qtl_stats[fdr_pass]['z'], c='r', label='FDR 10%', s=4)
plt.scatter(-np.log10(var_qtl_stats[~fdr_pass]['p_beta']), var_qtl_stats[~fdr_pass]['z'], c='k', label='p < 0.01', s=4)
plt.xlabel('Variance QTL $-\log_{10}(p)$')
plt.ylabel('Variance QTL $z$-score')

Text(0,0.5,'Variance QTL $z$-score')

Investigate whether variance QTL \(z\)-scores are correlated with relative abundance.

var_qtl_abundance = var_qtl_abundance.align(var_qtl_stats, axis='index', join='inner')[0]

plt.clf()
plt.errorbar(x=var_qtl_abundance.mean(axis=1), y=var_qtl_stats['z'], xerr=var_qtl_abundance.std(axis=1), fmt='none', label=None, lw=1, ecolor='.8', zorder=1)
plt.scatter(x=var_qtl_abundance[fdr_pass].mean(axis=1), y=var_qtl_stats[fdr_pass]['z'], c='r', s=2, label='FDR 10%', zorder=2)
plt.scatter(x=var_qtl_abundance[~fdr_pass].mean(axis=1), y=var_qtl_stats[~fdr_pass]['z'], c='k', s=2, label='p < 0.01', zorder=2)
plt.legend(frameon=False)
plt.axhline(y=0, c='k')
plt.xlabel('$\log_2(\mathrm{relative\ abundance})$')
_ = plt.ylabel('Variance QTL $z$-score')

Investigate whether variance QTL \(z\)-scores are correlated with mean QTL \(z\)-scores.

Xm, Ym = extract_qtl_gene_pair(variance_qtls[thresh_pass], qtls['zinb/mean'][1], dosages='/scratch/midway2/aksarkar/singlecell/scqtl-mapping/yri-120-dosages.vcf.gz')
Cm = covars[1]
Cm = Cm.align(Xm, axis='columns', join='inner')[0]
var_qtl_stats['mean_beta'] = replication_tests(Xm, Ym, Cm).set_index('gene')['beta']
var_qtl_stats['mean_z'] = var_qtl_stats['mean_beta'] / bootstrap_se(Xm, Ym, Cm)

Plot variance QTL \(z\)-scores against pooled QTL \(z\)-scores.

lim = [-12, 12]
plt.clf()
plt.gcf().set_size_inches(3, 3)
plt.plot(lim, lim, c='r', ls=':', lw=1)
plt.scatter(var_qtl_stats[fdr_pass]['z'], var_qtl_stats[fdr_pass]['mean_z'], c='r', s=3, label='FDR 10%')
plt.scatter(var_qtl_stats[~fdr_pass]['z'], var_qtl_stats[~fdr_pass]['mean_z'], c='k', s=3, label='$p < 10^{-2}$')
plt.legend(frameon=False, markerscale=2, loc=(-.01, .75))
plt.axhline(y=0, c='k', lw=1)
plt.axvline(x=0, c='k', lw=1)
plt.xlim(lim)
plt.ylim(lim)
plt.xlabel('vQTL $z$-score')
_ = plt.ylabel('eQTL $z$-score')

Compute the same in the other direction.

mean_qtl_stats = qtls['zinb/mean'][0]
mean_qtl_stats = mean_qtl_stats[mean_qtl_stats['fdr_pass']].copy()
X, Y = extract_qtl_gene_pair(mean_qtl_stats, qtls['zinb/mean'][1], dosages='/scratch/midway2/aksarkar/singlecell/scqtl-mapping/yri-120-dosages.vcf.gz')
z = mean_qtl_stats.set_index('gene')['beta'] / bootstrap_se(X, Y, Cm)

X, Y = extract_qtl_gene_pair(mean_qtl_stats, qtls['zinb/variance'][1], dosages='/scratch/midway2/aksarkar/singlecell/scqtl-mapping/yri-120-dosages.vcf.gz')
vz = replication_tests(X, Y, Cv).set_index('gene')['beta'] / bootstrap_se(X, Y, Cv)

Xb, Yb = extract_qtl_gene_pair(mean_qtl_stats, qtls['bulk'][1], dosages='/scratch/midway2/aksarkar/singlecell/scqtl-mapping/yri-120-dosages.vcf.gz')
Cb = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-mapping/bulk-covars.txt', sep=r'\s+', engine='python')
Cb = Cb.align(Xb, axis='columns', join='inner')[0]
bulk_z = replication_tests(Xb, Yb, Cb).set_index('gene')['beta'] / bootstrap_se(Xb, Yb, Cb)

mean_qtl_stats = mean_qtl_stats.set_index('gene')
mean_qtl_stats['z'] = z
mean_qtl_stats['var_z'] = vz
mean_qtl_stats['bulk_z'] = bulk_z
mean_qtl_stats['var_fdr_pass'] = mean_qtl_stats.index.isin(fdr_pass[fdr_pass].index)

Plot mean vs. variance \(z\)-scores at eQTL SNP-gene pairs.

lim = [-12, 12]
plt.clf()
plt.gcf().set_size_inches(3, 3)
plt.plot(lim, lim, c='r', ls=':', lw=1)
plt.scatter(mean_qtl_stats.loc[mean_qtl_stats['var_fdr_pass'], 'var_z'],
            mean_qtl_stats.loc[mean_qtl_stats['var_fdr_pass'], 'z'], c='r',
            s=3, label='vQTL')
plt.scatter(mean_qtl_stats.loc[~mean_qtl_stats['var_fdr_pass'], 'var_z'],
            mean_qtl_stats.loc[~mean_qtl_stats['var_fdr_pass'], 'z'], c='k',
            s=3, label='Not vQTL')
plt.legend(frameon=False, markerscale=2, loc=(-.01, .75))
plt.axhline(y=0, c='k', lw=1)
plt.axvline(x=0, c='k', lw=1)
plt.xlim(lim)
plt.ylim(lim)
plt.xlabel('vQTL $z$-score')
_ = plt.ylabel('eQTL $z$-score')

Plot single cell eQTL \(z\)-scores against bulk eQTL \(z\)-scores.

lim = [-12, 12]
plt.clf()
plt.gcf().set_size_inches(3, 3)
plt.plot(lim, lim, c='r', ls=':', lw=1)
plt.scatter(mean_qtl_stats['z'], mean_qtl_stats['bulk_z'], c='k', s=3)
plt.axhline(y=0, c='k', lw=1)
plt.axvline(x=0, c='k', lw=1)
plt.xlim(lim)
plt.ylim(lim)
plt.xlabel('Single cell eQTL $z$-score')
_ = plt.ylabel('Bulk eQTL $z$-score')

Joint distribution of summary statistics

Read the nominal summary statistics.

stats = {x: pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/power/{}.txt.gz'.format(x), index_col=0)
         for x in ('bulk', 'mean', 'variance')}

Compute \(z\)-scores.

for k in stats:
  stats[k]['z'] = stats[k]['beta'] / stats[k]['se']

Munge the bulk gene names.

stats['bulk']['gene'] = [x.split('.')[0] for x in stats['bulk']['gene']]

Munge the index names.

for k in stats:
  stats[k].index.name = 'id'

Plot the joint distribution of bulk/mean \(z\)-scores.

J = stats['mean'].reset_index().merge(stats['bulk'].reset_index(), on=['id', 'gene'])[['z_x', 'z_y']]
plt.clf()
plt.gcf().set_size_inches(3, 4)
plt.hexbin(J['z_x'], J['z_y'], gridsize=30, extent=[-10, 10, -10, 10], bins='log', cmap=colorcet.cm['gray_r'])
plt.gca().set_aspect('equal')
plt.xlim(-10, 10)
plt.ylim(-10, 10)
plt.plot(plt.ylim(), plt.ylim(), c='r', ls=':', lw=1)
cb = plt.colorbar(orientation='horizontal')
cb.set_label('$\log_{10}$ number of SNP-gene pairs')
plt.xlabel('Single cell eQTL $z$-score')
plt.ylabel('Bulk eQTL $z$-score')
plt.tight_layout()

Plot single cell eQTL \(z\)-scores against bulk eQTL \(z\)-scores.

J = (stats['mean']
     .reset_index()
     .merge(qtls['mean'][0][qtls['mean'][0]['fdr_pass']], on=['id', 'gene'])
     [['id', 'gene', 'beta_x', 'se']]
     .merge(stats['bulk'].reset_index(), on=['id', 'gene']))
lim = [-12, 12]
plt.clf()
plt.gcf().set_size_inches(3, 3)
plt.plot(lim, lim, c='r', ls=':', lw=1)
plt.scatter(J['beta_x'] / J['se_x'], J['beta'] / J['se_y'], c='k', s=3)
plt.axhline(y=0, c='k', lw=1)
plt.axvline(x=0, c='k', lw=1)
plt.xlim(lim)
plt.ylim(lim)
plt.xlabel('Single cell eQTL $z$-score')
_ = plt.ylabel('Bulk eQTL $z$-score')

Plot the joint distribution of mean/variance \(z\)-scores.

J = stats['mean'].reset_index().merge(stats['variance'].reset_index(), on=['id', 'gene'])[['z_x', 'z_y']]
plt.clf()
plt.gcf().set_size_inches(3, 4)
plt.hexbin(J['z_x'], J['z_y'], gridsize=30, extent=[-10, 10, -10, 10], bins='log', cmap=colorcet.cm['gray_r'])
plt.gca().set_aspect('equal')
plt.xlim(-12, 12)
plt.ylim(-12, 12)
plt.plot(plt.ylim(), plt.ylim(), c='r', ls='--', lw=1)
cb = plt.colorbar(orientation='horizontal')
cb.set_label('$\log_{10}$ number of SNP-gene pairs')
plt.xlabel('Single cell eQTL $z$-score')
plt.ylabel('vQTL $z$-score')
plt.tight_layout()

Plot the significant SC vQTL \(z\)-scores, and their corresponding \(z\)-score for the mean.

J = (stats['variance']
     .reset_index()
     .merge(qtls['variance'][0][qtls['variance'][0]['fdr_pass']], on=['id', 'gene'])
     [['id', 'gene', 'beta_x', 'se']]
     .merge(stats['mean'].reset_index(), on=['id', 'gene']))
lim = [-12, 12]
plt.clf()
plt.gcf().set_size_inches(3, 3)
plt.plot(lim, lim, c='r', ls=':', lw=1)
plt.scatter(J['beta_x'] / J['se_x'], J['beta'] / J['se_y'], c='k', s=3)
plt.axhline(y=0, c='k', lw=1)
plt.axvline(x=0, c='k', lw=1)
plt.xlim(lim)
plt.ylim(lim)
plt.xlabel('vQTL $z$-score')
_ = plt.ylabel('eQTL $z$-score')

Plot the significant SC eQTL \(z\)-scores, and their corresponding \(z\)-score for the mean.

J = (stats['mean']
     .reset_index()
     .merge(qtls['mean'][0][qtls['mean'][0]['fdr_pass']], on=['id', 'gene'])
     [['id', 'gene', 'beta_x', 'se']]
     .merge(stats['variance'].reset_index(), on=['id', 'gene'])
     .merge(qtls['variance'][0], on=['id', 'gene'], suffixes=['_y', '_z'], how='left')
     .fillna(False))
lim = [-12, 12]
plt.clf()
plt.gcf().set_size_inches(3, 3)
plt.plot(lim, lim, c='r', ls=':', lw=1)
colors = {True: 'r', False: 'k'}
labels = {True: 'vQTL', False: 'not vQTL'}
for k, g in J.groupby('fdr_pass'):
  plt.scatter(g['beta_x'] / g['se_x'], g['beta_y'] / g['se_y'], c=colors[k], s=3, label=labels[k])
plt.legend(frameon=False, markerscale=2, handletextpad=0, loc=(-.01, .75))
plt.axhline(y=0, c='k', lw=1)
plt.axvline(x=0, c='k', lw=1)
plt.xlim(lim)
plt.ylim(lim)
plt.ylabel('vQTL $z$-score')
_ = plt.xlabel('eQTL $z$-score')

Predicting mean QTLs from variance QTLs

As we change the threshold for calling mean (variance) QTLs, track the precision and recall of variance (mean) QTLs.

plt.clf()
plt.gcf().set_size_inches(3, 3)
Y, P = qtls['mean'][0]['fdr_pass'].align(qtls['variance'][0]['p_beta'].dropna(), join='inner')
p, r, _ = sklearn.metrics.precision_recall_curve(Y.astype(int), -np.log(P))
auprc_mean_var = sklearn.metrics.average_precision_score(Y.astype(int), -np.log(P))
plt.plot(r[::10], p[::10], lw=1, c='k', label='Variance')
Y, P = qtls['variance'][0]['fdr_pass'].align(qtls['mean'][0]['p_beta'].dropna(), join='inner')
p, r, _ = sklearn.metrics.precision_recall_curve(Y.astype(int), -np.log(P))
auprc_var_mean = sklearn.metrics.average_precision_score(Y.astype(int), -np.log(P))
plt.plot(r[::10], p[::10], lw=1, c='r', label='Mean')
plt.legend(frameon=False)
plt.xlabel('Recall of QTLs at FDR 10%')
_ = plt.ylabel('Precision of QTLs at FDR 10%')

Tabulate the AUPRC.

auprc_mean_var, auprc_var_mean

(0.5212419583724434, 0.6044457996921822)

Track sensitivity and specificity of mean QTLs.

plt.clf()
plt.gcf().set_size_inches(3, 3)
Y, P = qtls['mean'][0]['fdr_pass'].align(qtls['variance'][0]['p_beta'].dropna(), join='inner')
fpr, tpr, _ = sklearn.metrics.roc_curve(Y.astype(int), -np.log(P))
auroc_mean_var = sklearn.metrics.roc_auc_score(Y.astype(int), -np.log(P))
plt.plot(fpr[::10], tpr[::10], lw=1, c='k', label='Variance')
Y, P = qtls['variance'][0]['fdr_pass'].align(qtls['mean'][0]['p_beta'].dropna(), join='inner')
fpr, tpr, _ = sklearn.metrics.roc_curve(Y.astype(int), -np.log(P))
auroc_var_mean = sklearn.metrics.roc_auc_score(Y.astype(int), -np.log(P))
plt.plot(fpr[::10], tpr[::10], lw=1, c='r', label='Mean')
plt.legend(frameon=False)
plt.xlabel('False positive rate of QTLs at FDR 10%')
_ = plt.ylabel('True positive rate of QTLs at FDR 10%')

Tabulate the AUROC.

auroc_mean_var, auroc_var_mean

(0.8968614143396433, 0.9555166119581557)

Find the threshold at which 90% of eQTLs are recovered.

Y, P = qtls['variance'][0].set_index('gene')['fdr_pass'].align(qtls['mean'][0].set_index('gene')['p_beta'].dropna(), join='inner')
fpr, tpr, thresh = sklearn.metrics.roc_curve(Y.astype(int), -np.log(P))
np.exp(-thresh[np.where(tpr <= 0.9)].min())

0.05222859999999999

Find the vQTLs which are not predictive of eQTLs.

true_negatives = Y[np.logical_and(Y, -np.log(P) < thresh[np.where(tpr <= 0.9)].min())].to_frame().merge(gene_info, left_index=True, right_index=True)[['name']]
true_negatives

name
gene
ENSG00000122574      WIPF3
ENSG00000112367       FIG4
ENSG00000121481       RNF2
ENSG00000214194  LINC00998
ENSG00000133935    C14orf1
ENSG00000243725       TTC4
ENSG00000132849      INADL
ENSG00000171130   ATP6V0E2
ENSG00000129518       EAPP
ENSG00000182400   TRAPPC6B
ENSG00000182117      NOP10
ENSG00000068024      HDAC4
ENSG00000169714       CNBP
ENSG00000133030      MPRIP
ENSG00000108528   SLC25A11