Single cell/bulk RNA-Seq concordance
Introduction
Here, we investigate the correlation between single cell RNA-Seq and bulk RNA-Seq on samples from the same cell type in the same individuals. Our goal is to qualitatively assess our ability to call QTLs from scRNA-Seq.
Implementation
def plot_concordance(x, y, title, filename, xlabel=None, ylabel=None, lim=None, **kwargs): """Plot hexbin of concordance""" merged = x.merge(y, left_index=True, right_index=True) merged.columns = ['x', 'y'] if lim is None: lim = [merged.min().min(), merged.max().max()] plt.clf() if 'gridsize' not in kwargs: kwargs['gridsize'] = 40 hexes = plt.hexbin(merged['x'], merged['y'], cmap=colorcet.cm['blues'], extent=lim + lim, **kwargs) ax = plt.gca() if lim is None: ax.set_xlim([merged['x'].min(), merged['x'].max()]) ax.set_ylim([merged['y'].min(), merged['y'].max()]) else: ax.set_xlim(lim) ax.set_ylim(lim) ax.set_aspect('equal') cb = plt.colorbar() cb.set_label('Number of genes') plt.plot(lim, lim, color='red') plt.title(title) if xlabel is None: xlabel = 'scRNA-Seq $\log_2(\mathrm{CPM} + 1) $' if ylabel is None: ylabel = 'Bulk RNA-Seq $\log_2(\mathrm{TPM} + 1)$' plt.xlabel(xlabel) plt.ylabel(ylabel) plt.savefig(filename) def cpm(counts, size=None, log2=False): if size is None: size = counts.sum(axis=0) cpm = counts / size * 1e6 if log2: cpm = np.log(cpm + 1) / np.log(2) return cpm
def plot_concordance_by_individual(umi, annotations, bulk, output_dir): bulk, pooled_cpm = bulk.align( cpm(umi.groupby(by=annotations['chip_id'].values, axis=1).agg(np.sum), size=annotations.groupby('chip_id')['mol_hs'].agg(np.sum), log2=True), axis=1, join='inner') for k in bulk: plot_concordance( x=pooled_cpm[k].to_frame(), y=bulk[k].to_frame(), title=k, filename='{}/{}.svg'.format(output_dir, k))
def plot_concordance_by_num_cells(individual, umi, annotations, bulk_tpm, output_dir): bulk_tpm = bulk_tpm[individual].to_frame() umi = umi.loc[:,(annotations['chip_id'] == individual).values] annotations = annotations[annotations['chip_id'] == individual] for num_cells in [1, 10, 50, 100, 200]: sample = np.random.choice(annotations.shape[0], size=num_cells) pooled_cpm = cpm(umi.iloc[:,sample].sum(axis=1).to_frame(), size=annotations.iloc[sample]['mol_hs'].agg(np.sum), log2=True) plot_concordance( x=pooled_cpm, y=bulk_tpm, title='{}, {} cell{}'.format(individual, num_cells, 's' if num_cells > 1 else ''), filename='{}/{}-{}.svg'.format(output_dir, individual, num_cells), gridsize=20)
def plot_concordance_pooled_subsets(individual, umi, annotations, output_dir): umi = umi.loc[:,(annotations['chip_id'] == individual).values] for num_cells in [1, 10, 50, 100]: sample = umi.sample(n=2 * num_cells, axis=1) pool1 = cpm(sample.iloc[:,:num_cells].sum(axis=1).to_frame(), log2=True) pool2 = cpm(sample.iloc[:,num_cells:].sum(axis=1).to_frame(), log2=True) plot_concordance( x=pool1, y=pool2, title='{}, {} cell{}'.format(individual, num_cells, 's' if num_cells > 1 else ''), filename='{}/{}-{}.svg'.format(output_dir, individual, num_cells), ylabel='scRNA-Seq $\log_2(\mathrm{CPM} + 1)$', gridsize=15)
def plot_concordance_rho(bulk, sc, output_dir, **kwargs): if 'xlabel' not in kwargs: kwargs['xlabel'] = 'Single cell ln relative abundance' if 'ylabel' not in kwargs: kwargs['ylabel'] = 'Bulk ln relative abundance' bulk, sc = bulk.align(sc, axis=1, join='inner') for k in bulk: y = bulk[k].dropna().to_frame() x = sc[k].dropna().to_frame() plot_concordance( x=x, y=y, title=k, gridsize=20, **kwargs, filename='{}/{}.svg'.format(output_dir, k))
def mask(df): return ~np.isfinite(df)
Read the data
Read the QC files.
annotations = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-annotation.txt') keep_samples = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/quality-single-cells.txt', index_col=0, header=None) keep_genes = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/genes-pass-filter.txt', index_col=0, header=None) annotations = annotations.loc[keep_samples.values.ravel()]
Read the UMI matrix.
umi = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-counts.txt.gz', index_col=0) umi = umi.loc[:,keep_samples.values.ravel()]
The only quantity which is directly comparable between bulk and scRNA-Seq is
relative abundance (Pachter 2011). Therefore, we re-processed the iPSC bulk
RNA-Seq using kallisto
.
Important: we need to quantify relative abundance with respect to exactly the same set of genes as the single cell data.
bulk_tpm = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/kallisto/bulk-ipsc-tpm.txt.gz', header=None, sep=' ').pivot(columns=0, index=1, values=2) bulk_log_rho = np.log(bulk_tpm) - np.log(bulk_tpm.sum(axis=0))
Plot bulk vs. pooled single cells
Pool the single cells and estimate log CPM. We assume this is proportional to relative abundance (we assume UMIs really do directly count molecules).
sc_log_cpm = np.log(umi.groupby(annotations['chip_id'].values, axis=1).agg(np.sum)) - np.log(annotations.groupby('chip_id')['mol_hs'].agg(np.sum)) + 6 * np.log(10) sc_log_rho = sc_log_cpm - sp.logsumexp(sc_log_cpm, axis=0)
S, T = (bulk_log_rho.loc[keep_genes.values.ravel()] .mask(mask) .align(sc_log_rho.loc[keep_genes.values.ravel()] .mask(mask), join='inner'))
plot_concordance_rho(
S,
T,
'/project2/mstephens/aksarkar/projects/singlecell-qtl/analysis/figure/sc-vs-bulk.org/pooled/')
Plot the individual with the most cells, and the fewest cells.
Look at the distribution of absolute differences.
S, T = bulk_log_rho.align(sc_log_rho, join='inner') diff = abs(S - T)
np.nanpercentile(diff.mask(~np.isfinite(diff)), [90, 95, 99, 99.5, 99.9])
array([1.96847244, 2.66536247, 4.26461767, 4.87070071, 6.14664319])
Look at pathway enrichment for genes only detected in bulk.
bulk_only = set([x for k in T for x in pd.Series(T[k].mask(np.isfinite(S[k])).dropna().index)])
Annotation Cluster 1 Enrichment Score: 6.163250777011418 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR INTERPRO IPR018064:Metallothionein, vertebrate, metal binding site 6 5.9405940594059405 5.214809864836233E-10 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 79 11 18559 128.14039125431532 8.343695168111509E-8 8.343695168111509E-8 6.296819954343391E-7 UP_SEQ_FEATURE region of interest:Alpha 6 5.9405940594059405 1.0452172113872762E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 80 13 20063 115.74807692307692 2.4144515486934637E-7 2.4144515486934637E-7 1.3409402699338102E-6 UP_SEQ_FEATURE region of interest:Beta 6 5.9405940594059405 1.0452172113872762E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 80 13 20063 115.74807692307692 2.4144515486934637E-7 2.4144515486934637E-7 1.3409402699338102E-6 UP_SEQ_FEATURE metal ion-binding site:Divalent metal cation; cluster A 6 5.9405940594059405 1.6208966909643656E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 80 14 20063 107.48035714285714 3.744270538064143E-7 1.8721354444473093E-7 2.0794964750159295E-6 UP_SEQ_FEATURE metal ion-binding site:Divalent metal cation; cluster B 6 5.9405940594059405 1.6208966909643656E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 80 14 20063 107.48035714285714 3.744270538064143E-7 1.8721354444473093E-7 2.0794964750159295E-6 INTERPRO IPR003019:Metallothionein superfamily, eukaryotic 6 5.9405940594059405 2.2376038114295356E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 79 14 18559 100.68173598553346 3.580165393035628E-7 1.7900828563899296E-7 2.7018795867306267E-6 INTERPRO IPR000006:Metallothionein, vertebrate 6 5.9405940594059405 2.2376038114295356E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 79 14 18559 100.68173598553346 3.580165393035628E-7 1.7900828563899296E-7 2.7018795867306267E-6 INTERPRO IPR023587:Metallothionein domain, vertebrate 6 5.9405940594059405 2.2376038114295356E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 79 14 18559 100.68173598553346 3.580165393035628E-7 1.7900828563899296E-7 2.7018795867306267E-6 INTERPRO IPR017854:Metallothionein domain 6 5.9405940594059405 2.2376038114295356E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 79 14 18559 100.68173598553346 3.580165393035628E-7 1.7900828563899296E-7 2.7018795867306267E-6 UP_KEYWORDS Metal-thiolate cluster 6 5.9405940594059405 3.848645320957292E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 97 14 20581 90.93225331369662 5.272642669140737E-7 5.272642669140737E-7 4.523761099051171E-6 GOTERM_BP_DIRECT GO:0045926~negative regulation of growth 6 5.9405940594059405 1.6056085205036586E-8 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 75 19 16792 70.70315789473685 6.727477139589766E-6 6.727477139589766E-6 2.2546318878546856E-5 GOTERM_BP_DIRECT GO:0071294~cellular response to zinc ion 6 5.9405940594059405 1.6056085205036586E-8 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 75 19 16792 70.70315789473685 6.727477139589766E-6 6.727477139589766E-6 2.2546318878546856E-5 UP_KEYWORDS Cadmium 5 4.9504950495049505 5.5012039998665416E-8 ENSG00000125144, ENSG00000169715, ENSG00000205358, ENSG00000187193, ENSG00000198417 97 9 20581 117.87514318442153 7.536621291603929E-6 3.7683177458447403E-6 6.466204057753444E-5 KEGG_PATHWAY hsa04978:Mineral absorption 6 5.9405940594059405 6.197865698893864E-7 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 27 46 6910 33.38164251207729 3.1608625294388126E-5 3.1608625294388126E-5 5.995777879075348E-4 GOTERM_BP_DIRECT GO:0071276~cellular response to cadmium ion 5 4.9504950495049505 7.917968157631074E-7 ENSG00000125144, ENSG00000169715, ENSG00000205358, ENSG00000187193, ENSG00000198417 75 17 16792 65.85098039215686 3.317079698986758E-4 1.6586774100302293E-4 0.0011118534141041359 UP_KEYWORDS Copper 5 4.9504950495049505 2.4203238770349687E-4 ENSG00000125144, ENSG00000169715, ENSG00000205358, ENSG00000187193, ENSG00000198417 97 65 20581 16.321173671689134 0.03261860473945333 0.008256340267242868 0.2841189120846299 GOTERM_MF_DIRECT GO:0046872~metal ion binding 13 12.871287128712872 0.12273080737129344 ENSG00000152977, ENSG00000125144, ENSG00000169715, ENSG00000153266, ENSG00000198105, ENSG00000125148, ENSG00000177932, ENSG00000215397, ENSG00000259332, ENSG00000205358, ENSG00000187193, ENSG00000121691, ENSG00000198417 69 2069 16881 1.5372055393279678 0.9999998290722704 0.7894858696046019 77.70573642627673 GOTERM_MF_DIRECT GO:0008270~zinc ion binding 8 7.920792079207921 0.18975529795362245 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000165188, ENSG00000247746, ENSG00000198417 69 1169 16881 1.6742663740841297 0.9999999999866547 0.8758984183433024 91.03468733803763 GOTERM_CC_DIRECT GO:0048471~perinuclear region of cytoplasm 6 5.9405940594059405 0.19285594142887852 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 91 621 18224 1.934915326219674 0.9999999919695161 0.9759592316379472 90.12891148150037 UP_KEYWORDS Zinc 13 12.871287128712872 0.41448076703181924 ENSG00000152977, ENSG00000125144, ENSG00000169715, ENSG00000153266, ENSG00000198105, ENSG00000125148, ENSG00000177932, ENSG00000215397, ENSG00000205358, ENSG00000187193, ENSG00000165188, ENSG00000247746, ENSG00000198417 97 2348 20581 1.1747352429793287 1.0 0.9829889947051305 99.81480020819157 UP_KEYWORDS Metal-binding 15 14.85148514851485 0.8237195669856142 ENSG00000125144, ENSG00000198105, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417, ENSG00000152977, ENSG00000169715, ENSG00000153266, ENSG00000177932, ENSG00000215397, ENSG00000259332, ENSG00000121691, ENSG00000165188, ENSG00000247746 97 3640 20581 0.8743485895547751 1.0 0.9992576564560991 99.99999986203825 Annotation Cluster 2 Enrichment Score: 3.484819795455531 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR INTERPRO IPR001152:Thymosin beta-4 4 3.9603960396039604 7.098533109747709E-7 ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164 79 5 18559 187.93924050632913 1.1357012050017268E-4 3.785814005408117E-5 8.571359595421768E-4 SMART SM00152:THY 4 3.9603960396039604 7.766657663469284E-7 ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164 45 5 10057 178.79111111111112 2.873623162158445E-5 2.873623162158445E-5 6.973781416896863E-4 PIR_SUPERFAMILY PIRSF001828:thymosin beta 4 3.9603960396039604 2.7080998708248916E-6 ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164 13 5 1692 104.12307692307694 2.708066868917225E-5 2.708066868917225E-5 0.0016261217555713081 GOTERM_BP_DIRECT GO:0042989~sequestering of actin monomers 4 3.9603960396039604 9.642416279102543E-6 ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164 75 10 16792 89.55733333333333 0.004032041304637524 0.0013458241984475316 0.013539249589888946 GOTERM_MF_DIRECT GO:0003785~actin monomer binding 4 3.9603960396039604 1.5210285357647118E-4 ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164 69 26 16881 37.63879598662207 0.017938766106036064 0.017938766106036064 0.1742005540888658 GOTERM_CC_DIRECT GO:0031941~filamentous actin 4 3.9603960396039604 4.7367184906870326E-4 ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164 91 31 18224 25.840482098546616 0.04038125481188837 0.04038125481188837 0.5107377212681841 GOTERM_BP_DIRECT GO:0007015~actin filament organization 4 3.9603960396039604 0.0039404588803159685 ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164 75 72 16792 12.43851851851852 0.8087767706624795 0.21048033854212067 5.393322740309592 UP_KEYWORDS Actin-binding 5 4.9504950495049505 0.0394339844994075 ENSG00000034510, ENSG00000147481, ENSG00000205542, ENSG00000154620, ENSG00000158164 97 274 20581 3.871811272480999 0.9959614191508189 0.5449776981781902 37.680777033965654 GOTERM_BP_DIRECT GO:0030036~actin cytoskeleton organization 3 2.9702970297029703 0.11220693889642849 ENSG00000034510, ENSG00000205542, ENSG00000158164 75 130 16792 5.166769230769231 1.0 0.9373656872217728 81.19896543130287 UP_KEYWORDS Cytoskeleton 5 4.9504950495049505 0.7843259865566969 ENSG00000034510, ENSG00000147481, ENSG00000205542, ENSG00000154620, ENSG00000158164 97 1138 20581 0.9322287246571123 1.0 0.9985945289079377 99.99999852310495 Annotation Cluster 3 Enrichment Score: 1.7811609652286242 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR SMART SM00389:HOX 8 7.920792079207921 9.417302516012567E-5 ENSG00000184302, ENSG00000109132, ENSG00000164853, ENSG00000119547, ENSG00000170561, ENSG00000205922, ENSG00000138083, ENSG00000164093 45 250 10057 7.151644444444445 0.003478501955001434 0.0017407661108270744 0.08452766486343188 INTERPRO IPR001356:Homeodomain 8 7.920792079207921 1.0044839918367324E-4 ENSG00000184302, ENSG00000109132, ENSG00000164853, ENSG00000119547, ENSG00000170561, ENSG00000205922, ENSG00000138083, ENSG00000164093 79 256 18559 7.341376582278481 0.015944076888024683 0.0040100758644506795 0.12122279079956888 UP_SEQ_FEATURE DNA-binding region:Homeobox 7 6.9306930693069315 1.0745524652182817E-4 ENSG00000184302, ENSG00000109132, ENSG00000164853, ENSG00000119547, ENSG00000205922, ENSG00000138083, ENSG00000164093 80 191 20063 9.191164921465969 0.02451792636356398 0.008240359123208085 0.13776994804843845 UP_KEYWORDS Homeobox 8 7.920792079207921 2.273035742769636E-4 ENSG00000184302, ENSG00000109132, ENSG00000164853, ENSG00000119547, ENSG00000170561, ENSG00000205922, ENSG00000138083, ENSG00000164093 97 262 20581 6.478633823876604 0.030664146783657475 0.010327675965698613 0.26685006254048016 GOTERM_BP_DIRECT GO:0006366~transcription from RNA polymerase II promoter 10 9.900990099009901 4.090749986540058E-4 ENSG00000152977, ENSG00000129514, ENSG00000184302, ENSG00000109132, ENSG00000153266, ENSG00000119547, ENSG00000205922, ENSG00000138083, ENSG00000164093, ENSG00000054598 75 513 16792 4.364392462638077 0.15754707376630528 0.028168569886196426 0.5729025901226814 INTERPRO IPR009057:Homeodomain-like 8 7.920792079207921 5.261429021979469E-4 ENSG00000184302, ENSG00000109132, ENSG00000164853, ENSG00000119547, ENSG00000170561, ENSG00000205922, ENSG00000138083, ENSG00000164093 79 336 18559 5.59342977697408 0.08075722475289715 0.01669998667646322 0.6334635185275994 GOTERM_MF_DIRECT GO:0001077~transcriptional activator activity, RNA polymerase II core promoter proximal region sequence-specific binding 6 5.9405940594059405 0.002608319419483337 ENSG00000152977, ENSG00000129514, ENSG00000109132, ENSG00000119547, ENSG00000205922, ENSG00000164093 69 236 16881 6.219970523212969 0.2671362793302654 0.1439254000557343 2.949180047116129 UP_SEQ_FEATURE compositionally biased region:Poly-Ala 7 6.9306930693069315 0.00518825052735661 ENSG00000152977, ENSG00000109132, ENSG00000119547, ENSG00000138083, ENSG00000054598, ENSG00000171450, ENSG00000163508 80 404 20063 4.3453279702970296 0.6992890245264455 0.25947910072369085 6.455682440417276 UP_KEYWORDS DNA-binding 19 18.81188118811881 0.0061751724814120695 ENSG00000129514, ENSG00000184302, ENSG00000164853, ENSG00000067048, ENSG00000198105, ENSG00000170561, ENSG00000205922, ENSG00000138083, ENSG00000164093, ENSG00000152977, ENSG00000109132, ENSG00000188375, ENSG00000153266, ENSG00000119547, ENSG00000177932, ENSG00000197061, ENSG00000215397, ENSG00000054598, ENSG00000163508 97 2050 20581 1.9665023887352275 0.5719954819022799 0.15610255752991076 7.022162963087153 GOTERM_BP_DIRECT GO:0007420~brain development 5 4.9504950495049505 0.00987285598589564 ENSG00000152977, ENSG00000053438, ENSG00000138083, ENSG00000054598, ENSG00000163508 75 190 16792 5.891929824561403 0.9843499743231441 0.36992744676798917 13.00554505004673 GOTERM_BP_DIRECT GO:0045944~positive regulation of transcription from RNA polymerase II promoter 11 10.891089108910892 0.01060386709098585 ENSG00000152977, ENSG00000129514, ENSG00000184302, ENSG00000109132, ENSG00000153266, ENSG00000119547, ENSG00000205922, ENSG00000138083, ENSG00000164093, ENSG00000054598, ENSG00000163508 75 981 16792 2.5105266734624534 0.9885153598169162 0.36024787175300976 13.90312024253032 UP_SEQ_FEATURE compositionally biased region:Poly-Gly 5 4.9504950495049505 0.028203027199206778 ENSG00000109132, ENSG00000119547, ENSG00000138083, ENSG00000054598, ENSG00000163508 80 292 20063 4.294306506849315 0.9986511918732182 0.7333202527794147 30.72078907690975 GOTERM_MF_DIRECT GO:0000981~RNA polymerase II transcription factor activity, sequence-specific DNA binding 4 3.9603960396039604 0.03168704614824783 ENSG00000129514, ENSG00000184302, ENSG00000138083, ENSG00000054598 69 171 16881 5.722857869310959 0.9783292834625844 0.7212010778222342 30.862537573604364 UP_KEYWORDS Developmental protein 10 9.900990099009901 0.03297928048143198 ENSG00000152977, ENSG00000129514, ENSG00000184302, ENSG00000109132, ENSG00000153266, ENSG00000164853, ENSG00000053438, ENSG00000138083, ENSG00000164093, ENSG00000163508 97 949 20581 2.2357772152998816 0.9898911492926898 0.5350028435318056 32.57673990320743 GOTERM_MF_DIRECT GO:0003677~DNA binding 13 12.871287128712872 0.03398271598877738 ENSG00000184302, ENSG00000129514, ENSG00000164853, ENSG00000067048, ENSG00000198105, ENSG00000205922, ENSG00000138083, ENSG00000164093, ENSG00000177932, ENSG00000197061, ENSG00000215397, ENSG00000054598, ENSG00000163508 69 1674 16881 1.8999272765051167 0.9836618555842146 0.6424797192945351 32.718143656703056 INTERPRO IPR017970:Homeobox, conserved site 4 3.9603960396039604 0.04596070918167841 ENSG00000109132, ENSG00000164853, ENSG00000170561, ENSG00000164093 79 190 18559 4.94576948700866 0.9994622235937994 0.7148323276503672 43.341501655162176 GOTERM_MF_DIRECT GO:0000977~RNA polymerase II regulatory region sequence-specific DNA binding 4 3.9603960396039604 0.05156656275239971 ENSG00000109132, ENSG00000205922, ENSG00000054598, ENSG00000163508 69 208 16881 4.704849498327759 0.9981642397951731 0.7163627995115522 45.49275378977236 GOTERM_MF_DIRECT GO:0000978~RNA polymerase II core promoter proximal region sequence-specific DNA binding 5 4.9504950495049505 0.054849155170871446 ENSG00000152977, ENSG00000109132, ENSG00000153266, ENSG00000119547, ENSG00000164093 69 355 16881 3.445805266380894 0.9987848382314719 0.673332685025507 47.616359931448414 GOTERM_MF_DIRECT GO:0003700~transcription factor activity, sequence-specific DNA binding 7 6.9306930693069315 0.18959031696019613 ENSG00000152977, ENSG00000129514, ENSG00000177932, ENSG00000138083, ENSG00000164093, ENSG00000054598, ENSG00000163508 69 961 16881 1.782065782925395 0.9999999999863274 0.8971154894254747 91.01374122380184 UP_KEYWORDS Activator 6 5.9405940594059405 0.19590645201033402 ENSG00000152977, ENSG00000129514, ENSG00000109132, ENSG00000119547, ENSG00000205922, ENSG00000163508 97 661 20581 1.9259478765382037 0.9999999999998936 0.8995206058743108 92.29171287120641 GOTERM_BP_DIRECT GO:0045893~positive regulation of transcription, DNA-templated 4 3.9603960396039604 0.3971154565879258 ENSG00000152977, ENSG00000143869, ENSG00000054598, ENSG00000163508 75 515 16792 1.7389773462783173 1.0 0.9999587761988841 99.91796526374138 UP_KEYWORDS Transcription regulation 13 12.871287128712872 0.40500487412843467 ENSG00000129514, ENSG00000164853, ENSG00000198105, ENSG00000205922, ENSG00000138083, ENSG00000152977, ENSG00000109132, ENSG00000119547, ENSG00000153266, ENSG00000177932, ENSG00000215397, ENSG00000054598, ENSG00000163508 97 2332 20581 1.1827951760357907 1.0 0.9847649688241881 99.77633739079752 UP_KEYWORDS Transcription 13 12.871287128712872 0.44414461678326184 ENSG00000129514, ENSG00000164853, ENSG00000198105, ENSG00000205922, ENSG00000138083, ENSG00000152977, ENSG00000109132, ENSG00000119547, ENSG00000153266, ENSG00000177932, ENSG00000215397, ENSG00000054598, ENSG00000163508 97 2398 20581 1.1502411803650807 1.0 0.9820944180997002 99.89948309755069 UP_KEYWORDS Nucleus 25 24.752475247524753 0.5820328536133491 ENSG00000164853, ENSG00000253626, ENSG00000253506, ENSG00000159182, ENSG00000152977, ENSG00000109132, ENSG00000188375, ENSG00000147481, ENSG00000177932, ENSG00000197061, ENSG00000215397, ENSG00000054598, ENSG00000129514, ENSG00000184302, ENSG00000067048, ENSG00000198105, ENSG00000205922, ENSG00000170561, ENSG00000138083, ENSG00000250254, ENSG00000164093, ENSG00000182195, ENSG00000119547, ENSG00000153266, ENSG00000163508 97 5244 20581 1.0115143865940062 1.0 0.9956273086940396 99.99647757087135 GOTERM_BP_DIRECT GO:0030154~cell differentiation 3 2.9702970297029703 0.6080833739283924 ENSG00000152977, ENSG00000119547, ENSG00000205922 75 462 16792 1.453852813852814 1.0 0.9999995136278385 99.99980612003067 UP_KEYWORDS Disease mutation 9 8.91089108910891 0.9204249405761661 ENSG00000152977, ENSG00000184302, ENSG00000109132, ENSG00000124172, ENSG00000138083, ENSG00000168878, ENSG00000164093, ENSG00000054598, ENSG00000248099 97 2550 20581 0.7488538508186781 1.0 0.9999149050236531 99.99999999998799 Annotation Cluster 4 Enrichment Score: 1.165498308323919 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR GOTERM_BP_DIRECT GO:0006413~translational initiation 6 5.9405940594059405 3.446046051185818E-4 ENSG00000067048, ENSG00000198692, ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000205609 75 137 16792 9.805547445255476 0.1344708403894822 0.03545959319361591 0.4828157254203713 GOTERM_CC_DIRECT GO:0022625~cytosolic large ribosomal subunit 3 2.9702970297029703 0.04452488666483783 ENSG00000198918, ENSG00000229117, ENSG00000163923 91 68 18224 8.835164835164836 0.9809854506297719 0.8621067464658695 38.874961529405915 UP_KEYWORDS Ribosomal protein 4 3.9603960396039604 0.055666091991101146 ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000163923 97 185 20581 4.58757314015046 0.9996089745535879 0.6250048700776605 48.99391263132687 GOTERM_MF_DIRECT GO:0003735~structural constituent of ribosome 4 3.9603960396039604 0.06034723008605642 ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000163923 69 222 16881 4.408147277712495 0.9993930954418766 0.6038222785369594 51.00471832879968 GOTERM_BP_DIRECT GO:0006614~SRP-dependent cotranslational protein targeting to membrane 3 2.9702970297029703 0.06460194328297483 ENSG00000129824, ENSG00000198918, ENSG00000229117 75 94 16792 7.145531914893617 0.9999999999992961 0.8838033244845918 60.850545022072836 GOTERM_MF_DIRECT GO:0003723~RNA binding 6 5.9405940594059405 0.06925428571255729 ENSG00000067048, ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000163923, ENSG00000178997 69 547 16881 2.6835704633971864 0.9998046135816214 0.6128518819827384 56.07177294869783 GOTERM_BP_DIRECT GO:0019083~viral transcription 3 2.9702970297029703 0.0874042522465415 ENSG00000129824, ENSG00000198918, ENSG00000229117 75 112 16792 5.997142857142857 1.0 0.922295808229385 72.31651234196252 KEGG_PATHWAY hsa03010:Ribosome 3 2.9702970297029703 0.09197141519874578 ENSG00000129824, ENSG00000198918, ENSG00000229117 27 136 6910 5.645424836601307 0.9927041560195942 0.8060490790853818 60.67613480771449 GOTERM_BP_DIRECT GO:0000184~nuclear-transcribed mRNA catabolic process, nonsense-mediated decay 3 2.9702970297029703 0.09683692108179212 ENSG00000129824, ENSG00000198918, ENSG00000229117 75 119 16792 5.64436974789916 1.0 0.9305572806891955 76.07463661943554 GOTERM_BP_DIRECT GO:0006412~translation 4 3.9603960396039604 0.10082618317693195 ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000163923 75 253 16792 3.539815546772069 1.0 0.927158013883527 77.51659768922529 UP_KEYWORDS Ribonucleoprotein 4 3.9603960396039604 0.16041445057926768 ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000163923 97 296 20581 2.867233212594037 0.9999999999604725 0.8641452261285151 87.1930135274217 GOTERM_BP_DIRECT GO:0006364~rRNA processing 3 2.9702970297029703 0.24311129519341956 ENSG00000129824, ENSG00000198918, ENSG00000229117 75 214 16792 3.138691588785047 1.0 0.9978504982777531 97.9985714847756 GOTERM_CC_DIRECT GO:0005829~cytosol 7 6.9306930693069315 0.9995612834920512 ENSG00000226784, ENSG00000125148, ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000154620, ENSG00000121691 91 3315 18224 0.42287968441814594 1.0 1.0 100.0 Annotation Cluster 5 Enrichment Score: 0.820211096675127 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR GOTERM_MF_DIRECT GO:0043565~sequence-specific DNA binding 6 5.9405940594059405 0.0575036128572627 ENSG00000129514, ENSG00000164853, ENSG00000170561, ENSG00000164093, ENSG00000054598, ENSG00000163508 69 518 16881 2.833808964243747 0.999130471945484 0.6346115628640061 49.27807833492119 GOTERM_MF_DIRECT GO:0003700~transcription factor activity, sequence-specific DNA binding 7 6.9306930693069315 0.18959031696019613 ENSG00000152977, ENSG00000129514, ENSG00000177932, ENSG00000138083, ENSG00000164093, ENSG00000054598, ENSG00000163508 69 961 16881 1.782065782925395 0.9999999999863274 0.8971154894254747 91.01374122380184 GOTERM_MF_DIRECT GO:0008134~transcription factor binding 3 2.9702970297029703 0.31758166829967777 ENSG00000129514, ENSG00000164093, ENSG00000054598 69 284 16881 2.5843539497856707 1.0 0.9697360578283208 98.74714487738314 Annotation Cluster 6 Enrichment Score: 0.508309881872388 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR INTERPRO IPR002110:Ankyrin repeat 3 2.9702970297029703 0.29082448887491896 ENSG00000222038, ENSG00000163046, ENSG00000196834 79 255 18559 2.763812360387193 1.0 0.9989646799915811 98.42286358849252 SMART SM00248:ANK 3 2.9702970297029703 0.2975318758729139 ENSG00000222038, ENSG00000163046, ENSG00000196834 45 249 10057 2.692637215528782 0.9999978856103895 0.9871650256415595 95.80398991675298 INTERPRO IPR020683:Ankyrin repeat-containing domain 3 2.9702970297029703 0.30637168905164147 ENSG00000222038, ENSG00000163046, ENSG00000196834 79 265 18559 2.6595175543348457 1.0 0.9985017380546117 98.79322761229355 UP_KEYWORDS ANK repeat 3 2.9702970297029703 0.3494196849000245 ENSG00000222038, ENSG00000163046, ENSG00000196834 97 264 20581 2.411082474226804 1.0 0.9748006629928797 99.36098921844489 Annotation Cluster 7 Enrichment Score: 0.354336784895891 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR UP_SEQ_FEATURE zinc finger region:C2H2-type 4 5 4.9504950495049505 0.20136843576824753 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397 80 588 20063 2.132546768707483 1.0 0.9998260841655876 94.41306218498366 UP_SEQ_FEATURE zinc finger region:C2H2-type 3 5 4.9504950495049505 0.24137654936132147 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397 80 636 20063 1.9715998427672958 1.0 0.9998901331440954 97.1105095588215 INTERPRO IPR013087:Zinc finger C2H2-type/integrase DNA-binding domain 5 4.9504950495049505 0.3509812706610015 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397 79 712 18559 1.649747546579434 1.0 0.999008904105432 99.45921000508513 UP_SEQ_FEATURE zinc finger region:C2H2-type 5 4 3.9603960396039604 0.3687510325334959 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932 80 550 20063 1.823909090909091 1.0 0.9999925558095984 99.72665248783586 INTERPRO IPR015880:Zinc finger, C2H2-like 5 4.9504950495049505 0.3990477278010214 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397 79 762 18559 1.5414963952290774 1.0 0.999393086210869 99.78644025284262 UP_KEYWORDS Transcription regulation 13 12.871287128712872 0.40500487412843467 ENSG00000129514, ENSG00000164853, ENSG00000198105, ENSG00000205922, ENSG00000138083, ENSG00000152977, ENSG00000109132, ENSG00000119547, ENSG00000153266, ENSG00000177932, ENSG00000215397, ENSG00000054598, ENSG00000163508 97 2332 20581 1.1827951760357907 1.0 0.9847649688241881 99.77633739079752 UP_KEYWORDS Zinc 13 12.871287128712872 0.41448076703181924 ENSG00000152977, ENSG00000125144, ENSG00000169715, ENSG00000153266, ENSG00000198105, ENSG00000125148, ENSG00000177932, ENSG00000215397, ENSG00000205358, ENSG00000187193, ENSG00000165188, ENSG00000247746, ENSG00000198417 97 2348 20581 1.1747352429793287 1.0 0.9829889947051305 99.81480020819157 SMART SM00355:ZnF_C2H2 5 4.9504950495049505 0.4300934117849453 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397 45 762 10057 1.4664625255176436 0.9999999990779891 0.994489588687009 99.35831314673327 INTERPRO IPR007087:Zinc finger, C2H2 5 4.9504950495049505 0.4343150143180009 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397 79 799 18559 1.4701129576527623 1.0 0.9994976627786998 99.89711105742633 UP_SEQ_FEATURE zinc finger region:C2H2-type 2 4 3.9603960396039604 0.4378369016202355 ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397 80 615 20063 1.631138211382114 1.0 0.9999983334439706 99.93821123573167 UP_KEYWORDS Transcription 13 12.871287128712872 0.44414461678326184 ENSG00000129514, ENSG00000164853, ENSG00000198105, ENSG00000205922, ENSG00000138083, ENSG00000152977, ENSG00000109132, ENSG00000119547, ENSG00000153266, ENSG00000177932, ENSG00000215397, ENSG00000054598, ENSG00000163508 97 2398 20581 1.1502411803650807 1.0 0.9820944180997002 99.89948309755069 UP_SEQ_FEATURE zinc finger region:C2H2-type 6 3 2.9702970297029703 0.590458495161822 ENSG00000153266, ENSG00000198105, ENSG00000177932 80 501 20063 1.5017215568862274 1.0 0.9999999655860128 99.99893817056615 UP_SEQ_FEATURE zinc finger region:C2H2-type 1 3 2.9702970297029703 0.6455180669053127 ENSG00000153266, ENSG00000177932, ENSG00000215397 80 554 20063 1.3580550541516245 1.0 0.999999962991393 99.99983342537078 UP_KEYWORDS Zinc-finger 7 6.9306930693069315 0.8479589820363391 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397, ENSG00000165188, ENSG00000247746 97 1781 20581 0.8339285817651383 1.0 0.9994944279023904 99.99999997575446 GOTERM_BP_DIRECT GO:0006351~transcription, DNA-templated 5 4.9504950495049505 0.9785388522558911 ENSG00000164853, ENSG00000198105, ENSG00000177932, ENSG00000215397, ENSG00000163508 75 1955 16792 0.572617220801364 1.0 1.0 100.0 Annotation Cluster 8 Enrichment Score: 0.06711873232279976 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR GOTERM_CC_DIRECT GO:0005615~extracellular space 9 8.91089108910891 0.3473712675240062 ENSG00000165246, ENSG00000222038, ENSG00000198918, ENSG00000096088, ENSG00000196834, ENSG00000270136, ENSG00000143869, ENSG00000121691, ENSG00000168878 91 1347 18224 1.3380650529871019 0.9999999999999999 0.995027606334708 99.00691145271703 UP_SEQ_FEATURE disulfide bond 7 6.9306930693069315 0.9799768322289583 ENSG00000165246, ENSG00000180974, ENSG00000096088, ENSG00000143869, ENSG00000164241, ENSG00000168878, ENSG00000248099 80 2917 20063 0.6018212204319506 1.0 1.0 100.0 UP_KEYWORDS Disulfide bond 8 7.920792079207921 0.9979253382713639 ENSG00000165246, ENSG00000180974, ENSG00000096088, ENSG00000270136, ENSG00000143869, ENSG00000164241, ENSG00000168878, ENSG00000248099 97 3434 20581 0.4942929708374112 1.0 0.999999993214293 100.0 UP_SEQ_FEATURE signal peptide 6 5.9405940594059405 0.9982749748270936 ENSG00000179542, ENSG00000165246, ENSG00000096088, ENSG00000143869, ENSG00000168878, ENSG00000248099 80 3346 20063 0.4497086072922893 1.0 1.0 100.0 UP_KEYWORDS Signal 9 8.91089108910891 0.9995774779340586 ENSG00000179542, ENSG00000214194, ENSG00000165246, ENSG00000096088, ENSG00000270136, ENSG00000143869, ENSG00000168878, ENSG00000243317, ENSG00000248099 97 4160 20581 0.459033009516257 1.0 0.9999999998538799 100.0 UP_SEQ_FEATURE glycosylation site:N-linked (GlcNAc...) 5 4.9504950495049505 0.9999874126486624 ENSG00000179542, ENSG00000165246, ENSG00000180974, ENSG00000143869, ENSG00000168878 80 4234 20063 0.2961590694378838 1.0 1.0 100.0 UP_KEYWORDS Glycoprotein 5 4.9504950495049505 0.9999998667567829 ENSG00000179542, ENSG00000165246, ENSG00000180974, ENSG00000143869, ENSG00000168878 97 4551 20581 0.23310839126780789 1.0 1.0 100.0 Annotation Cluster 9 Enrichment Score: 0.014705889178132312 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR GOTERM_MF_DIRECT GO:0005524~ATP binding 4 3.9603960396039604 0.9471714074328096 ENSG00000067048, ENSG00000254598, ENSG00000259332, ENSG00000197142 69 1495 16881 0.6545877562890795 1.0 0.9999999988516262 99.99999999999977 UP_KEYWORDS ATP-binding 4 3.9603960396039604 0.9617258940215971 ENSG00000067048, ENSG00000254598, ENSG00000259332, ENSG00000197142 97 1391 20581 0.6101373335210891 1.0 0.9999922196380046 100.0 UP_KEYWORDS Nucleotide-binding 4 3.9603960396039604 0.9917505356146884 ENSG00000067048, ENSG00000254598, ENSG00000259332, ENSG00000197142 97 1788 20581 0.4746650061117646 1.0 0.9999997700488767 100.0 Annotation Cluster 10 Enrichment Score: 0.01422812959020817 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR UP_SEQ_FEATURE transmembrane region 16 15.841584158415841 0.9235699422433608 ENSG00000171840, ENSG00000127540, ENSG00000198133, ENSG00000139370, ENSG00000170091, ENSG00000086159, ENSG00000180638, ENSG00000197142, ENSG00000178449, ENSG00000228474, ENSG00000179542, ENSG00000180974, ENSG00000165246, ENSG00000165188, ENSG00000166002, ENSG00000205670 80 5056 20063 0.7936313291139241 1.0 0.9999999999999999 99.99999999999953 UP_KEYWORDS Transmembrane helix 20 19.801980198019802 0.9664300899010495 ENSG00000171840, ENSG00000127540, ENSG00000198133, ENSG00000139370, ENSG00000205639, ENSG00000170091, ENSG00000086159, ENSG00000180638, ENSG00000197142, ENSG00000178449, ENSG00000228474, ENSG00000179542, ENSG00000214194, ENSG00000180974, ENSG00000165246, ENSG00000270136, ENSG00000165188, ENSG00000243317, ENSG00000166002, ENSG00000205670 97 5634 20581 0.7531958030953453 1.0 0.9999910591386459 100.0 UP_KEYWORDS Transmembrane 20 19.801980198019802 0.9677227406188967 ENSG00000171840, ENSG00000127540, ENSG00000198133, ENSG00000139370, ENSG00000205639, ENSG00000170091, ENSG00000086159, ENSG00000180638, ENSG00000197142, ENSG00000178449, ENSG00000228474, ENSG00000179542, ENSG00000214194, ENSG00000180974, ENSG00000165246, ENSG00000270136, ENSG00000165188, ENSG00000243317, ENSG00000166002, ENSG00000205670 97 5651 20581 0.7509299512721953 1.0 0.9999895880346041 100.0 UP_SEQ_FEATURE topological domain:Extracellular 7 6.9306930693069315 0.9713553106975322 ENSG00000171840, ENSG00000179542, ENSG00000165246, ENSG00000198133, ENSG00000180974, ENSG00000086159, ENSG00000180638 80 2787 20063 0.6298932543954072 1.0 1.0 100.0 UP_SEQ_FEATURE topological domain:Cytoplasmic 9 8.91089108910891 0.9727885778326631 ENSG00000171840, ENSG00000179542, ENSG00000165246, ENSG00000198133, ENSG00000180974, ENSG00000170091, ENSG00000086159, ENSG00000180638, ENSG00000197142 80 3456 20063 0.6530924479166668 1.0 1.0 100.0 GOTERM_CC_DIRECT GO:0016021~integral component of membrane 18 17.82178217821782 0.9855047988667748 ENSG00000171840, ENSG00000127540, ENSG00000198133, ENSG00000139370, ENSG00000205639, ENSG00000170091, ENSG00000086159, ENSG00000180638, ENSG00000197142, ENSG00000178449, ENSG00000228474, ENSG00000179542, ENSG00000214194, ENSG00000180974, ENSG00000165188, ENSG00000243317, ENSG00000166002, ENSG00000205670 91 5163 18224 0.6981885052774071 1.0 0.9999999999999534 100.0 UP_KEYWORDS Membrane 26 25.742574257425744 0.9884542831964291 ENSG00000171840, ENSG00000198133, ENSG00000170091, ENSG00000253626, ENSG00000180638, ENSG00000197142, ENSG00000228474, ENSG00000179542, ENSG00000180389, ENSG00000124172, ENSG00000110934, ENSG00000165188, ENSG00000243317, ENSG00000166002, ENSG00000176533, ENSG00000127540, ENSG00000139370, ENSG00000205639, ENSG00000086159, ENSG00000178449, ENSG00000214194, ENSG00000165246, ENSG00000180974, ENSG00000270136, ENSG00000171450, ENSG00000205670 97 7494 20581 0.7361297973086373 1.0 0.9999995215886112 100.0
Look at genes only detected in single cell.
set(x for k in S for x in pd.Series(S[k].mask(np.isfinite(T[k])).dropna().index))
{'ENSG00000000457', 'ENSG00000050438', 'ENSG00000053438', 'ENSG00000099617', 'ENSG00000102743', 'ENSG00000109618', 'ENSG00000111087', 'ENSG00000112357', 'ENSG00000120690', 'ENSG00000127080', 'ENSG00000128283', 'ENSG00000136213', 'ENSG00000137502', 'ENSG00000143869', 'ENSG00000149541', 'ENSG00000162188', 'ENSG00000165879', 'ENSG00000173401', 'ENSG00000181481', 'ENSG00000184302', 'ENSG00000185818', 'ENSG00000197818', 'ENSG00000213380'}
Look at pathway enrichments for genes detected only in single cell.
Annotation Cluster 1 Enrichment Score: 0.16470559032620852 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR GOTERM_BP_DIRECT GO:0045944~positive regulation of transcription from RNA polymerase II promoter 3 13.043478260869565 0.3277257959734531 ENSG00000111087, ENSG00000184302, ENSG00000120690 21 981 16792 2.4453181884374544 1.0 0.9999999999888005 98.99989403807203 UP_KEYWORDS DNA-binding 3 13.043478260869565 0.6331696404915614 ENSG00000111087, ENSG00000184302, ENSG00000120690 22 2050 20581 1.3690243902439023 1.0 0.9988514746581159 99.99771774457302 GOTERM_CC_DIRECT GO:0005654~nucleoplasm 3 13.043478260869565 0.852891320214584 ENSG00000111087, ENSG00000120690, ENSG00000127080 22 2784 18224 0.8926332288401253 1.0 0.9999966100403458 99.99999864269819 GOTERM_CC_DIRECT GO:0005634~nucleus 5 21.73913043478261 0.9101247789696545 ENSG00000111087, ENSG00000184302, ENSG00000109618, ENSG00000120690, ENSG00000127080 22 5415 18224 0.7648787039368757 1.0 0.9999990374355104 99.99999998711735 UP_KEYWORDS Nucleus 4 17.391304347826086 0.9320660841557977 ENSG00000111087, ENSG00000184302, ENSG00000120690, ENSG00000127080 22 5244 20581 0.7135774218154081 1.0 0.9999972758163257 99.99999999996426 Annotation Cluster 2 Enrichment Score: 0.13876944094888757 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR UP_KEYWORDS Membrane 10 43.47826086956522 0.3432466422296966 ENSG00000137502, ENSG00000102743, ENSG00000050438, ENSG00000099617, ENSG00000128283, ENSG00000149541, ENSG00000162188, ENSG00000197818, ENSG00000185818, ENSG00000213380 22 7494 20581 1.2483319989324793 0.9999999999999983 0.9772688861660149 98.86760814991567 UP_SEQ_FEATURE transmembrane region 5 21.73913043478261 0.8140386056324314 ENSG00000102743, ENSG00000050438, ENSG00000149541, ENSG00000197818, ENSG00000185818 22 5056 20063 0.9018537830840045 1.0 0.9999999999999997 99.99999861987743 UP_KEYWORDS Transmembrane helix 5 21.73913043478261 0.8670386049310165 ENSG00000102743, ENSG00000050438, ENSG00000149541, ENSG00000197818, ENSG00000185818 22 5634 20581 0.8302271920482783 1.0 0.99999148614062 99.99999995415865 UP_KEYWORDS Transmembrane 5 21.73913043478261 0.8687936796993484 ENSG00000102743, ENSG00000050438, ENSG00000149541, ENSG00000197818, ENSG00000185818 22 5651 20581 0.8277296053795788 1.0 0.9999827433821622 99.99999996021165 GOTERM_CC_DIRECT GO:0016021~integral component of membrane 4 17.391304347826086 0.9614936232995246 ENSG00000102743, ENSG00000149541, ENSG00000197818, ENSG00000185818 22 5163 18224 0.6417692321236773 1.0 0.9999999410477701 99.99999999999572 Annotation Cluster 3 Enrichment Score: 0.10814451882692515 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR GOTERM_CC_DIRECT GO:0005576~extracellular region 3 13.043478260869565 0.5650202328331935 ENSG00000099617, ENSG00000143869, ENSG00000173401 22 1610 18224 1.5435347261434218 1.0 0.9995280289945352 99.96172821785414 UP_SEQ_FEATURE signal peptide 3 13.043478260869565 0.8873328320761085 ENSG00000099617, ENSG00000143869, ENSG00000173401 22 3346 20063 0.8176520132587077 1.0 0.9999999999999631 99.99999999371153 UP_KEYWORDS Signal 3 13.043478260869565 0.9449653686711603 ENSG00000099617, ENSG00000143869, ENSG00000173401 22 4160 20581 0.674639423076923 1.0 0.9999978479008454 99.99999999999622 Annotation Cluster 4 Enrichment Score: 0.06705635298464693 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR UP_KEYWORDS Disulfide bond 4 17.391304347826086 0.7052904766215429 ENSG00000050438, ENSG00000099617, ENSG00000143869, ENSG00000149541 22 3434 20581 1.0896913220733837 1.0 0.9995057831808567 99.99977860872981 UP_SEQ_FEATURE disulfide bond 3 13.043478260869565 0.8313843498748016 ENSG00000099617, ENSG00000143869, ENSG00000149541 22 2917 20063 0.9379032006731698 1.0 0.9999999999999281 99.99999951871355 UP_SEQ_FEATURE glycosylation site:N-linked (GlcNAc...) 3 13.043478260869565 0.9544875159034373 ENSG00000099617, ENSG00000143869, ENSG00000149541 22 4234 20063 0.6461652424099282 1.0 1.0 99.99999999999963 UP_KEYWORDS Glycoprotein 3 13.043478260869565 0.963461735118086 ENSG00000099617, ENSG00000143869, ENSG00000149541 22 4551 20581 0.616677653263019 1.0 0.9999984894709725 99.99999999999996
Compute the Spearman correlation of relative abundance between all pairs of bulk samples.
R = pd.Series([st.mstats.spearmanr(S[i], S[j]).correlation for i, j in itertools.combinations(S.columns, 2)])
R.describe()
count 1225.000000 mean 0.975116 std 0.008525 min 0.941480 25% 0.969642 50% 0.976308 75% 0.981967 max 0.989239 dtype: float64
Compute the Spearman correlation between all pairs of scRNA-Seq estimates.
R = pd.Series([st.mstats.spearmanr(T[i], T[j]).correlation for i, j in itertools.combinations(T.columns, 2)]) R.describe()
count 1225.000000 mean 0.964085 std 0.012129 min 0.910210 25% 0.960425 50% 0.966380 75% 0.971532 max 0.985259 dtype: float64
Compute the Spearman correlation between bulk and single cell.
pd.Series([st.mstats.spearmanr(S[i], T[i]).correlation for i in S]).describe()
count 50.000000 mean 0.770785 std 0.022602 min 0.690612 25% 0.761318 50% 0.776103 75% 0.786595 max 0.803776 dtype: float64
Compute the Spearman correlation for randomized pairs of bulk/single cell abundances.
np.random.seed(0) pd.Series([st.mstats.spearmanr(S[i], T[j]).correlation for i in np.random.choice(S.columns, 20, replace=True) for j in np.random.choice(T.columns, 20, replace=True)]).describe()
count 400.000000 mean 0.768326 std 0.019574 min 0.699244 25% 0.755550 50% 0.769128 75% 0.784345 max 0.804371 dtype: float64
Plot bulk vs. single cell relative abundance
Under our assumed model, the parameter \(\mu\) is proportional to relative abundance.
log_mu = pd.read_table('/scratch/midway2/aksarkar/singlecell/density-estimation/without-cell-cycle/zi2-log-mu.txt.gz', sep=' ', index_col=0) logodds = pd.read_table('/scratch/midway2/aksarkar/singlecell/density-estimation/without-cell-cycle/zi2-log-mu.txt.gz', sep=' ', index_col=0) # Important: log(sigmoid(x)) = -softplus(-x) log_mu -= np.log1p(np.exp(logodds)) log_mu -= log_mu.agg(sp.logsumexp, axis=0)
S, T = (bulk_log_rho.loc[keep_genes.values.ravel()] .mask(mask) .align(log_mu.mask(mask), join='inner'))
Plot the concordance.
plot_concordance_rho(
S,
T,
'/project2/mstephens/aksarkar/projects/singlecell-qtl/analysis/figure/sc-vs-bulk.org/vs-sc-mean')
Look at the genes only detected in bulk.
with open('/scratch/midway2/aksarkar/singlecell/density-estimation/bulk-only.txt', 'w') as f: print(*set([x for k in T for x in pd.Series(T[k].mask(np.isfinite(S[k])).dropna().index)]), sep='\n', file=f)
Look at pathway enrichment for genes only detected in bulk.
curl "https://david.ncifcrf.gov/data/download/t2t_B5C5AE2A242A1525715859955.txt"
Annotation Cluster 1 Enrichment Score: 5.576667372224575 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR INTERPRO IPR018064:Metallothionein, vertebrate, metal binding site 6 4.958677685950414 1.4246693775812425E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 96 11 18559 105.44886363636364 2.721118151916002E-7 2.721118151916002E-7 1.7722059264535517E-6 UP_SEQ_FEATURE region of interest:Beta 6 4.958677685950414 3.1296025620278356E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 99 13 20063 93.53379953379954 8.668995394867096E-7 8.668995394867096E-7 4.131194308865105E-6 UP_SEQ_FEATURE region of interest:Alpha 6 4.958677685950414 3.1296025620278356E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 99 13 20063 93.53379953379954 8.668995394867096E-7 8.668995394867096E-7 4.131194308865105E-6 UP_SEQ_FEATURE metal ion-binding site:Divalent metal cation; cluster B 6 4.958677685950414 4.849470990265653E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 99 14 20063 86.85281385281385 1.3433025503051255E-6 6.716515007498813E-7 6.401485830309639E-6 UP_SEQ_FEATURE metal ion-binding site:Divalent metal cation; cluster A 6 4.958677685950414 4.849470990265653E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 99 14 20063 86.85281385281385 1.3433025503051255E-6 6.716515007498813E-7 6.401485830309639E-6 INTERPRO IPR003019:Metallothionein superfamily, eukaryotic 6 4.958677685950414 6.0990406966974536E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 96 14 18559 82.85267857142858 1.1649161019144927E-6 5.824582206548357E-7 7.586852035501579E-6 INTERPRO IPR000006:Metallothionein, vertebrate 6 4.958677685950414 6.0990406966974536E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 96 14 18559 82.85267857142858 1.1649161019144927E-6 5.824582206548357E-7 7.586852035501579E-6 INTERPRO IPR017854:Metallothionein domain 6 4.958677685950414 6.0990406966974536E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 96 14 18559 82.85267857142858 1.1649161019144927E-6 5.824582206548357E-7 7.586852035501579E-6 INTERPRO IPR023587:Metallothionein domain, vertebrate 6 4.958677685950414 6.0990406966974536E-9 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 96 14 18559 82.85267857142858 1.1649161019144927E-6 5.824582206548357E-7 7.586852035501579E-6 UP_KEYWORDS Metal-thiolate cluster 6 4.958677685950414 1.0025245596206678E-8 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 117 14 20581 75.3882783882784 1.5438866435291132E-6 1.5438866435291132E-6 1.202629306140679E-5 GOTERM_BP_DIRECT GO:0071294~cellular response to zinc ion 6 4.958677685950414 4.094169579380311E-8 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 90 19 16792 58.91929824561404 1.9406175909564283E-5 1.9406175909564283E-5 5.851492919095591E-5 GOTERM_BP_DIRECT GO:0045926~negative regulation of growth 6 4.958677685950414 4.094169579380311E-8 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 90 19 16792 58.91929824561404 1.9406175909564283E-5 1.9406175909564283E-5 5.851492919095591E-5 UP_KEYWORDS Cadmium 5 4.132231404958678 1.1811282003624497E-7 ENSG00000125144, ENSG00000169715, ENSG00000205358, ENSG00000187193, ENSG00000198417 117 9 20581 97.72554605887939 1.8189209940788764E-5 9.09464632670165E-6 1.4168815375414923E-4 KEGG_PATHWAY hsa04978:Mineral absorption 6 4.958677685950414 1.561721135515403E-6 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 32 46 6910 28.16576086956522 8.901421223550532E-5 8.901421223550532E-5 0.001547999109541287 GOTERM_BP_DIRECT GO:0071276~cellular response to cadmium ion 5 4.132231404958678 1.664617662755867E-6 ENSG00000125144, ENSG00000169715, ENSG00000205358, ENSG00000187193, ENSG00000198417 90 17 16792 54.87581699346404 7.887182269761261E-4 3.9443690372353846E-4 0.0023790888878050254 UP_KEYWORDS Copper 5 4.132231404958678 4.976376910881519E-4 ENSG00000125144, ENSG00000169715, ENSG00000205358, ENSG00000187193, ENSG00000198417 117 65 20581 13.531229454306377 0.07379091796476478 0.025228076303446367 0.5953360202090807 GOTERM_MF_DIRECT GO:0046872~metal ion binding 15 12.396694214876034 0.12418773298386136 ENSG00000125144, ENSG00000198105, ENSG00000125148, ENSG00000102349, ENSG00000205358, ENSG00000187193, ENSG00000198417, ENSG00000152977, ENSG00000169715, ENSG00000153266, ENSG00000177932, ENSG00000215397, ENSG00000259332, ENSG00000132846, ENSG00000121691 83 2069 16881 1.4745206053794686 0.9999999852802904 0.8352634661442035 78.91549668783586 GOTERM_MF_DIRECT GO:0008270~zinc ion binding 9 7.43801652892562 0.20634061490317346 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000165188, ENSG00000247746, ENSG00000198417, ENSG00000048540 83 1169 16881 1.5658424974491636 0.9999999999999776 0.91087103471022 93.3655336196179 GOTERM_CC_DIRECT GO:0048471~perinuclear region of cytoplasm 6 4.958677685950414 0.30096978390296947 ENSG00000125144, ENSG00000169715, ENSG00000125148, ENSG00000205358, ENSG00000187193, ENSG00000198417 108 621 18224 1.630345321166577 0.9999999999999998 0.9942944345640995 98.13459523208337 UP_KEYWORDS Zinc 16 13.223140495867769 0.34358634892845447 ENSG00000125144, ENSG00000198105, ENSG00000125148, ENSG00000102349, ENSG00000205358, ENSG00000187193, ENSG00000198417, ENSG00000152977, ENSG00000169715, ENSG00000153266, ENSG00000177932, ENSG00000215397, ENSG00000132846, ENSG00000165188, ENSG00000247746, ENSG00000048540 117 2348 20581 1.1986779073661529 1.0 0.9779278733358946 99.35899578165785 UP_KEYWORDS Zinc-finger 9 7.43801652892562 0.7961450798424026 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000102349, ENSG00000177932, ENSG00000215397, ENSG00000132846, ENSG00000165188, ENSG00000247746 117 1781 20581 0.8889128838595431 1.0 0.9997152160677613 99.99999948166463 UP_KEYWORDS Metal-binding 18 14.87603305785124 0.8364136070519472 ENSG00000125144, ENSG00000198105, ENSG00000125148, ENSG00000102349, ENSG00000205358, ENSG00000187193, ENSG00000198417, ENSG00000152977, ENSG00000169715, ENSG00000153266, ENSG00000177932, ENSG00000215397, ENSG00000259332, ENSG00000132846, ENSG00000121691, ENSG00000165188, ENSG00000247746, ENSG00000048540 117 3640 20581 0.8698647506339814 1.0 0.9998355028846869 99.99999996300832 Annotation Cluster 2 Enrichment Score: 3.334962167202779 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR INTERPRO IPR001152:Thymosin beta-4 4 3.3057851239669422 1.2897495522925282E-6 ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164 96 5 18559 154.65833333333333 2.463119835646177E-4 8.211073651009926E-5 0.0016043616741900912 SMART SM00152:THY 4 3.3057851239669422 1.6226621358714626E-6 ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164 57 5 10057 141.15087719298245 7.463973312682448E-5 7.463973312682448E-5 0.0015336986909142425 PIR_SUPERFAMILY PIRSF001828:thymosin beta 4 3.3057851239669422 3.5173915511616714E-6 ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164 14 5 1692 96.68571428571428 3.869062660721845E-5 3.869062660721845E-5 0.0021911300040700077 GOTERM_BP_DIRECT GO:0042989~sequestering of actin monomers 4 3.3057851239669422 1.681314954634123E-5 ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164 90 10 16792 74.63111111111111 0.007937827610423387 0.0026529745868454357 0.024027109196389507 GOTERM_MF_DIRECT GO:0003785~actin monomer binding 4 3.3057851239669422 2.6496648695155503E-4 ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164 83 26 16881 31.29008341056534 0.03539850083091345 0.03539850083091345 0.31060183729006985 GOTERM_CC_DIRECT GO:0031941~filamentous actin 4 3.3057851239669422 7.847934587976678E-4 ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164 108 31 18224 21.77299880525687 0.07623286602486246 0.07623286602486246 0.8692440247447752 GOTERM_BP_DIRECT GO:0007015~actin filament organization 4 3.3057851239669422 0.006594793403436778 ENSG00000034510, ENSG00000205542, ENSG00000154620, ENSG00000158164 90 72 16792 10.365432098765433 0.9565561097851812 0.32431996209823577 9.023293445276037 UP_KEYWORDS Actin-binding 6 4.958677685950414 0.019571276316372795 ENSG00000034510, ENSG00000147481, ENSG00000205542, ENSG00000154620, ENSG00000158164, ENSG00000197616 117 274 20581 3.85195583005802 0.9523494351355245 0.397886993238434 21.109161121502428 GOTERM_BP_DIRECT GO:0030036~actin cytoskeleton organization 3 2.479338842975207 0.15128048512411713 ENSG00000034510, ENSG00000205542, ENSG00000158164 90 130 16792 4.305641025641026 1.0 0.9795020108256144 90.4086641217756 UP_KEYWORDS Cytoskeleton 5 4.132231404958678 0.889662886640318 ENSG00000034510, ENSG00000147481, ENSG00000205542, ENSG00000154620, ENSG00000158164 117 1138 20581 0.7728733871088881 1.0 0.9998963302291057 99.99999999967154 Annotation Cluster 3 Enrichment Score: 1.6571445730798036 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR INTERPRO IPR001356:Homeodomain 8 6.6115702479338845 3.445655722065392E-4 ENSG00000184302, ENSG00000109132, ENSG00000164853, ENSG00000119547, ENSG00000170561, ENSG00000205922, ENSG00000138083, ENSG00000164093 96 256 18559 6.041341145833333 0.06370376779306242 0.016321183584631305 0.42777583502832117 UP_SEQ_FEATURE DNA-binding region:Homeobox 7 5.785123966942149 3.5054172576739275E-4 ENSG00000184302, ENSG00000109132, ENSG00000164853, ENSG00000119547, ENSG00000205922, ENSG00000138083, ENSG00000164093 99 191 20063 7.427203976942197 0.09255024505416842 0.031853984164181726 0.4617402147000016 SMART SM00389:HOX 8 6.6115702479338845 4.42340030678099E-4 ENSG00000184302, ENSG00000109132, ENSG00000164853, ENSG00000119547, ENSG00000170561, ENSG00000205922, ENSG00000138083, ENSG00000164093 57 250 10057 5.646035087719298 0.020146436053534278 0.010124470477997027 0.41731066017264284 UP_KEYWORDS Homeobox 8 6.6115702479338845 7.172911762649775E-4 ENSG00000184302, ENSG00000109132, ENSG00000164853, ENSG00000119547, ENSG00000170561, ENSG00000205922, ENSG00000138083, ENSG00000164093 117 262 20581 5.371175050564364 0.10461588777842423 0.02724752158556898 0.8570778580120453 GOTERM_BP_DIRECT GO:0006366~transcription from RNA polymerase II promoter 10 8.264462809917356 0.001572722098348745 ENSG00000152977, ENSG00000129514, ENSG00000184302, ENSG00000109132, ENSG00000153266, ENSG00000119547, ENSG00000205922, ENSG00000138083, ENSG00000164093, ENSG00000054598 90 513 16792 3.636993718865064 0.5257672759453207 0.11692297380011663 2.2244314781961316 INTERPRO IPR009057:Homeodomain-like 8 6.6115702479338845 0.0016952639861163858 ENSG00000184302, ENSG00000109132, ENSG00000164853, ENSG00000119547, ENSG00000170561, ENSG00000205922, ENSG00000138083, ENSG00000164093 96 336 18559 4.602926587301587 0.2768005217052305 0.06275826172079468 2.088482148755444 GOTERM_BP_DIRECT GO:0045944~positive regulation of transcription from RNA polymerase II promoter 13 10.743801652892563 0.005493950593360164 ENSG00000184302, ENSG00000129514, ENSG00000205922, ENSG00000138083, ENSG00000164093, ENSG00000152977, ENSG00000109132, ENSG00000119547, ENSG00000153266, ENSG00000132846, ENSG00000106153, ENSG00000054598, ENSG00000163508 90 981 16792 2.4724883905312036 0.9265618980685568 0.3113654984676302 7.571745627308124 GOTERM_MF_DIRECT GO:0001077~transcriptional activator activity, RNA polymerase II core promoter proximal region sequence-specific binding 6 4.958677685950414 0.0058345245616869485 ENSG00000152977, ENSG00000129514, ENSG00000109132, ENSG00000119547, ENSG00000205922, ENSG00000164093 83 236 16881 5.170818868695119 0.5487885570500103 0.32827725738219216 6.638584356776677 UP_KEYWORDS DNA-binding 21 17.355371900826448 0.010410726649525567 ENSG00000269404, ENSG00000129514, ENSG00000184302, ENSG00000164853, ENSG00000067048, ENSG00000198105, ENSG00000102349, ENSG00000170561, ENSG00000205922, ENSG00000138083, ENSG00000164093, ENSG00000152977, ENSG00000109132, ENSG00000188375, ENSG00000153266, ENSG00000119547, ENSG00000177932, ENSG00000197061, ENSG00000215397, ENSG00000054598, ENSG00000163508 117 2050 20581 1.801963727329581 0.8004430841018089 0.27554175963005556 11.798110609182277 UP_SEQ_FEATURE compositionally biased region:Poly-Ala 7 5.785123966942149 0.014325138545599983 ENSG00000152977, ENSG00000109132, ENSG00000119547, ENSG00000138083, ENSG00000054598, ENSG00000171450, ENSG00000163508 99 404 20063 3.511376137613761 0.9816249090490333 0.6318223898788264 17.342512514631448 GOTERM_BP_DIRECT GO:0007420~brain development 5 4.132231404958678 0.01839843632364141 ENSG00000152977, ENSG00000053438, ENSG00000138083, ENSG00000054598, ENSG00000163508 90 190 16792 4.909941520467836 0.999849580378387 0.585303424363129 23.310420295398327 GOTERM_MF_DIRECT GO:0003677~DNA binding 15 12.396694214876034 0.030082627435268905 ENSG00000129514, ENSG00000184302, ENSG00000164853, ENSG00000067048, ENSG00000198105, ENSG00000102349, ENSG00000205922, ENSG00000138083, ENSG00000164093, ENSG00000177932, ENSG00000197061, ENSG00000215397, ENSG00000132846, ENSG00000054598, ENSG00000163508 83 1674 16881 1.822451094701386 0.9842991054239907 0.4995960457027463 30.131860889678453 UP_SEQ_FEATURE compositionally biased region:Poly-Gly 5 4.132231404958678 0.054948385047742535 ENSG00000109132, ENSG00000119547, ENSG00000138083, ENSG00000054598, ENSG00000163508 99 292 20063 3.4701466722014667 0.9999998410789522 0.956324661067076 52.575403407838586 INTERPRO IPR017970:Homeobox, conserved site 4 3.3057851239669422 0.07390303033342986 ENSG00000109132, ENSG00000164853, ENSG00000170561, ENSG00000164093 96 190 18559 4.069956140350877 0.9999995720584485 0.8039458450125907 61.52082309616287 UP_KEYWORDS Activator 8 6.6115702479338845 0.08017664613850131 ENSG00000269404, ENSG00000152977, ENSG00000129514, ENSG00000109132, ENSG00000119547, ENSG00000205922, ENSG00000106153, ENSG00000163508 117 661 20581 2.128968023067872 0.9999974267468521 0.8409632464439599 63.30574819341427 GOTERM_MF_DIRECT GO:0000977~RNA polymerase II regulatory region sequence-specific DNA binding 4 3.3057851239669422 0.08062058246049644 ENSG00000109132, ENSG00000205922, ENSG00000054598, ENSG00000163508 83 208 16881 3.9112604263206676 0.9999891534993122 0.8046759754940478 62.72090014820477 UP_KEYWORDS Developmental protein 10 8.264462809917356 0.08771095723157968 ENSG00000152977, ENSG00000129514, ENSG00000184302, ENSG00000109132, ENSG00000153266, ENSG00000164853, ENSG00000053438, ENSG00000138083, ENSG00000164093, ENSG00000163508 117 949 20581 1.8535930759323807 0.9999992748987698 0.7567572726996232 66.753294668148 GOTERM_MF_DIRECT GO:0000978~RNA polymerase II core promoter proximal region sequence-specific DNA binding 5 4.132231404958678 0.09432038483218277 ENSG00000152977, ENSG00000109132, ENSG00000153266, ENSG00000119547, ENSG00000164093 83 355 16881 2.864585100967249 0.9999985921881136 0.776211228626135 68.74460304016104 GOTERM_MF_DIRECT GO:0003700~transcription factor activity, sequence-specific DNA binding 8 6.6115702479338845 0.18523548045286425 ENSG00000269404, ENSG00000152977, ENSG00000129514, ENSG00000177932, ENSG00000138083, ENSG00000164093, ENSG00000054598, ENSG00000163508 83 961 16881 1.6931158557225783 0.999999999999205 0.9018936246615948 90.97168947201777 UP_KEYWORDS Transcription 17 14.049586776859504 0.27417302590717135 ENSG00000269404, ENSG00000129514, ENSG00000164853, ENSG00000198105, ENSG00000102349, ENSG00000205922, ENSG00000138083, ENSG00000152977, ENSG00000109132, ENSG00000153266, ENSG00000119547, ENSG00000177932, ENSG00000215397, ENSG00000106153, ENSG00000054598, ENSG00000048540, ENSG00000163508 117 2398 20581 1.2470399121775269 1.0 0.9627419717782236 97.85931553820366 UP_KEYWORDS Transcription regulation 16 13.223140495867769 0.3337253043325218 ENSG00000269404, ENSG00000129514, ENSG00000164853, ENSG00000198105, ENSG00000102349, ENSG00000205922, ENSG00000138083, ENSG00000152977, ENSG00000109132, ENSG00000153266, ENSG00000119547, ENSG00000177932, ENSG00000215397, ENSG00000054598, ENSG00000048540, ENSG00000163508 117 2332 20581 1.20690211256249 1.0 0.9799246465161224 99.2334444591349 GOTERM_BP_DIRECT GO:0030154~cell differentiation 4 3.3057851239669422 0.44486454473031467 ENSG00000269404, ENSG00000152977, ENSG00000119547, ENSG00000205922 90 462 16792 1.6153920153920154 1.0 0.99999104883482 99.97777288991045 GOTERM_BP_DIRECT GO:0045893~positive regulation of transcription, DNA-templated 4 3.3057851239669422 0.5168140223066459 ENSG00000152977, ENSG00000143869, ENSG00000054598, ENSG00000163508 90 515 16792 1.4491477885652644 1.0 0.9999989748024688 99.99694311520028 UP_KEYWORDS Nucleus 28 23.140495867768596 0.739844207165452 ENSG00000164853, ENSG00000102349, ENSG00000253626, ENSG00000253506, ENSG00000159182, ENSG00000152977, ENSG00000109132, ENSG00000188375, ENSG00000147481, ENSG00000177932, ENSG00000197061, ENSG00000215397, ENSG00000106153, ENSG00000054598, ENSG00000269404, ENSG00000129514, ENSG00000184302, ENSG00000067048, ENSG00000198105, ENSG00000205922, ENSG00000170561, ENSG00000138083, ENSG00000250254, ENSG00000164093, ENSG00000182195, ENSG00000119547, ENSG00000153266, ENSG00000163508 117 5244 20581 0.939238657774127 1.0 0.9993921670293953 99.99999033647028 Annotation Cluster 4 Enrichment Score: 1.2499471608414185 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR INTERPRO IPR004001:Actin, conserved site 3 2.479338842975207 0.0037606999271607975 ENSG00000222038, ENSG00000196604, ENSG00000196834 96 18 18559 32.22048611111111 0.5130762433862859 0.1130274884996808 4.578771166139351 INTERPRO IPR020902:Actin/actin-like conserved site 3 2.479338842975207 0.005110513948360066 ENSG00000222038, ENSG00000196604, ENSG00000196834 96 21 18559 27.617559523809522 0.6241671949185366 0.13046922977687558 6.174622944939278 INTERPRO IPR004000:Actin-related protein 3 2.479338842975207 0.013076597763946148 ENSG00000222038, ENSG00000196604, ENSG00000196834 96 34 18559 17.057904411764707 0.9190646182796977 0.269673091270098 15.103574554141886 SMART SM00268:ACTIN 3 2.479338842975207 0.014397180251632587 ENSG00000222038, ENSG00000196604, ENSG00000196834 57 33 10057 16.03987240829346 0.48679708284211987 0.19937397066563267 12.808948263302455 GOTERM_BP_DIRECT GO:0001895~retina homeostasis 3 2.479338842975207 0.01901501309671769 ENSG00000222038, ENSG00000196604, ENSG00000196834 90 40 16792 13.993333333333332 0.9998883241962534 0.5627563073907305 23.996030232028975 INTERPRO IPR002110:Ankyrin repeat 4 3.3057851239669422 0.14256915991012142 ENSG00000222038, ENSG00000196604, ENSG00000163046, ENSG00000196834 96 255 18559 3.032516339869281 0.9999999999998258 0.9308020120860117 85.24172437712 INTERPRO IPR020683:Ankyrin repeat-containing domain 4 3.3057851239669422 0.1545793028275806 ENSG00000222038, ENSG00000196604, ENSG00000163046, ENSG00000196834 96 265 18559 2.918081761006289 0.9999999999999882 0.930937344494069 87.61691240255998 SMART SM00248:ANK 4 3.3057851239669422 0.16094167966935763 ENSG00000222038, ENSG00000196604, ENSG00000163046, ENSG00000196834 57 249 10057 2.834354963714507 0.9996877957409216 0.8670740705007958 80.95863994158734 UP_KEYWORDS ANK repeat 4 3.3057851239669422 0.18693377664161864 ENSG00000222038, ENSG00000196604, ENSG00000163046, ENSG00000196834 117 264 20581 2.6652421652421654 0.9999999999999856 0.9448223909359487 91.64647605511598 UP_KEYWORDS Isopeptide bond 8 6.6115702479338845 0.45637925175104177 ENSG00000067048, ENSG00000198692, ENSG00000222038, ENSG00000196604, ENSG00000102349, ENSG00000197061, ENSG00000196834, ENSG00000197616 117 1132 20581 1.2431518226571232 1.0 0.9859695316647251 99.93322579742441 UP_KEYWORDS Ubl conjugation 8 6.6115702479338845 0.8553030066572205 ENSG00000188375, ENSG00000067048, ENSG00000198692, ENSG00000222038, ENSG00000196604, ENSG00000102349, ENSG00000197061, ENSG00000196834 117 1705 20581 0.8253653156878965 1.0 0.9998424655960397 99.99999999151085 Annotation Cluster 5 Enrichment Score: 1.157814438766558 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR GOTERM_BP_DIRECT GO:0006413~translational initiation 6 4.958677685950414 8.054631319582625E-4 ENSG00000067048, ENSG00000198692, ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000205609 90 137 16792 8.171289537712896 0.31746629317685204 0.07354395203819064 1.1450467582972301 GOTERM_MF_DIRECT GO:0003723~RNA binding 8 6.6115702479338845 0.01694650777453914 ENSG00000067048, ENSG00000129824, ENSG00000198918, ENSG00000144642, ENSG00000229117, ENSG00000163923, ENSG00000129317, ENSG00000178997 83 547 16881 2.9745600317173633 0.9021654064698235 0.3717991943982005 18.179295433938435 GOTERM_CC_DIRECT GO:0022625~cytosolic large ribosomal subunit 3 2.479338842975207 0.06057252970141628 ENSG00000198918, ENSG00000229117, ENSG00000163923 108 68 18224 7.444444444444444 0.9981836949001399 0.8779897639185149 50.08448175192528 UP_KEYWORDS Ribosomal protein 4 3.3057851239669422 0.08696857707716145 ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000163923 117 185 20581 3.803372603372604 0.9999991781329468 0.7892018276269627 66.42729041845365 GOTERM_BP_DIRECT GO:0006614~SRP-dependent cotranslational protein targeting to membrane 3 2.479338842975207 0.08882486530305175 ENSG00000129824, ENSG00000198918, ENSG00000229117 90 94 16792 5.954609929078014 1.0 0.9471044953271334 73.53821197674968 GOTERM_MF_DIRECT GO:0003735~structural constituent of ribosome 4 3.3057851239669422 0.09360408784259258 ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000163923 83 222 16881 3.664604363399544 0.9999984323843365 0.811893003426054 68.4531829028361 GOTERM_BP_DIRECT GO:0019083~viral transcription 3 2.479338842975207 0.11899350439278467 ENSG00000129824, ENSG00000198918, ENSG00000229117 90 112 16792 4.997619047619048 1.0 0.9707660070127613 83.64588600840528 KEGG_PATHWAY hsa03010:Ribosome 3 2.479338842975207 0.12363305986457417 ENSG00000129824, ENSG00000198918, ENSG00000229117 32 136 6910 4.763327205882352 0.9994591196543766 0.8474980839947884 72.96715286848112 GOTERM_BP_DIRECT GO:0000184~nuclear-transcribed mRNA catabolic process, nonsense-mediated decay 3 2.479338842975207 0.13133460254424958 ENSG00000129824, ENSG00000198918, ENSG00000229117 90 119 16792 4.703641456582633 1.0 0.9754651683595816 86.63206843423275 GOTERM_BP_DIRECT GO:0006412~translation 4 3.3057851239669422 0.15080601210591557 ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000163923 90 253 16792 2.949846288976724 1.0 0.9830601621743895 90.33174368843385 UP_KEYWORDS Ribonucleoprotein 4 3.3057851239669422 0.23354480704735753 ENSG00000129824, ENSG00000198918, ENSG00000229117, ENSG00000163923 117 296 20581 2.377107877107877 1.0 0.9463765834126684 95.88564026790725 GOTERM_BP_DIRECT GO:0006364~rRNA processing 3 2.479338842975207 0.31391169530903607 ENSG00000129824, ENSG00000198918, ENSG00000229117 90 214 16792 2.615576323987539 1.0 0.9997016451669573 99.54133004532815 Annotation Cluster 6 Enrichment Score: 0.38824722134828016 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR GOTERM_MF_DIRECT GO:0046872~metal ion binding 15 12.396694214876034 0.12418773298386136 ENSG00000125144, ENSG00000198105, ENSG00000125148, ENSG00000102349, ENSG00000205358, ENSG00000187193, ENSG00000198417, ENSG00000152977, ENSG00000169715, ENSG00000153266, ENSG00000177932, ENSG00000215397, ENSG00000259332, ENSG00000132846, ENSG00000121691 83 2069 16881 1.4745206053794686 0.9999999852802904 0.8352634661442035 78.91549668783586 UP_SEQ_FEATURE zinc finger region:C2H2-type 3 6 4.958677685950414 0.20002572836406404 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000102349, ENSG00000177932, ENSG00000215397 99 636 20063 1.9118543929864686 1.0 0.9999664769976425 94.74507395938936 INTERPRO IPR013087:Zinc finger C2H2-type/integrase DNA-binding domain 6 4.958677685950414 0.3005353480315689 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000102349, ENSG00000177932, ENSG00000215397 96 712 18559 1.629125702247191 1.0 0.994760969134608 98.82786835019706 UP_SEQ_FEATURE zinc finger region:C2H2-type 4 5 4.132231404958678 0.32361856514385823 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932, ENSG00000215397 99 588 20063 1.723270116127259 1.0 0.9999998092598401 99.42658956002023 UP_KEYWORDS Transcription regulation 16 13.223140495867769 0.3337253043325218 ENSG00000269404, ENSG00000129514, ENSG00000164853, ENSG00000198105, ENSG00000102349, ENSG00000205922, ENSG00000138083, ENSG00000152977, ENSG00000109132, ENSG00000153266, ENSG00000119547, ENSG00000177932, ENSG00000215397, ENSG00000054598, ENSG00000048540, ENSG00000163508 117 2332 20581 1.20690211256249 1.0 0.9799246465161224 99.2334444591349 UP_KEYWORDS Zinc 16 13.223140495867769 0.34358634892845447 ENSG00000125144, ENSG00000198105, ENSG00000125148, ENSG00000102349, ENSG00000205358, ENSG00000187193, ENSG00000198417, ENSG00000152977, ENSG00000169715, ENSG00000153266, ENSG00000177932, ENSG00000215397, ENSG00000132846, ENSG00000165188, ENSG00000247746, ENSG00000048540 117 2348 20581 1.1986779073661529 1.0 0.9779278733358946 99.35899578165785 INTERPRO IPR015880:Zinc finger, C2H2-like 6 4.958677685950414 0.3512259169300528 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000102349, ENSG00000177932, ENSG00000215397 96 762 18559 1.522227690288714 1.0 0.9972683920835069 99.54021773058996 UP_SEQ_FEATURE zinc finger region:C2H2-type 2 5 4.132231404958678 0.3536534764141638 ENSG00000153266, ENSG00000198105, ENSG00000102349, ENSG00000177932, ENSG00000215397 99 615 20063 1.6476143549314282 1.0 0.999998532606025 99.68517338129126 INTERPRO IPR007087:Zinc finger, C2H2 6 4.958677685950414 0.38910972471353095 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000102349, ENSG00000177932, ENSG00000215397 96 799 18559 1.4517365456821025 1.0 0.9981180926916895 99.7824762372675 SMART SM00355:ZnF_C2H2 6 4.958677685950414 0.4204196340584176 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000102349, ENSG00000177932, ENSG00000215397 57 762 10057 1.3892802873325043 0.9999999999873168 0.9933832349229859 99.42323927690599 UP_KEYWORDS Repressor 5 4.132231404958678 0.4290178098347781 ENSG00000129514, ENSG00000153266, ENSG00000102349, ENSG00000177932, ENSG00000138083 117 592 20581 1.4856924231924231 1.0 0.9866342378500353 99.87965111954423 UP_SEQ_FEATURE zinc finger region:C2H2-type 5 4 3.3057851239669422 0.5058204185977098 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000177932 99 550 20063 1.4738659320477503 1.0 0.999999996684043 99.99089745290264 UP_SEQ_FEATURE zinc finger region:C2H2-type 1 4 3.3057851239669422 0.5106935288567703 ENSG00000153266, ENSG00000102349, ENSG00000177932, ENSG00000215397 99 554 20063 1.4632243007694272 1.0 0.9999999847565189 99.99201360769592 UP_SEQ_FEATURE zinc finger region:C2H2-type 6 3 2.479338842975207 0.7063059798171685 ENSG00000153266, ENSG00000198105, ENSG00000177932 99 501 20063 1.213512369200992 1.0 0.9999999999954072 99.99999053734379 UP_KEYWORDS Zinc-finger 9 7.43801652892562 0.7961450798424026 ENSG00000152977, ENSG00000153266, ENSG00000198105, ENSG00000102349, ENSG00000177932, ENSG00000215397, ENSG00000132846, ENSG00000165188, ENSG00000247746 117 1781 20581 0.8889128838595431 1.0 0.9997152160677613 99.99999948166463 GOTERM_BP_DIRECT GO:0006351~transcription, DNA-templated 9 7.43801652892562 0.8278697337310026 ENSG00000269404, ENSG00000164853, ENSG00000198105, ENSG00000102349, ENSG00000177932, ENSG00000215397, ENSG00000106153, ENSG00000048540, ENSG00000163508 90 1955 16792 0.8589258312020461 1.0 0.9999999999979298 99.9999999988014 GOTERM_MF_DIRECT GO:0003676~nucleic acid binding 4 3.3057851239669422 0.8645297706711467 ENSG00000152977, ENSG00000067048, ENSG00000198105, ENSG00000102349 83 985 16881 0.8259311357103541 1.0 0.9999998865639937 99.99999999356245 Annotation Cluster 7 Enrichment Score: 0.027273308610959623 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR GOTERM_BP_DIRECT GO:0007186~G-protein coupled receptor signaling pathway 4 3.3057851239669422 0.862055875728697 ENSG00000180974, ENSG00000171596, ENSG00000162188, ENSG00000176533 90 899 16792 0.8301569645284884 1.0 0.9999999999998194 99.99999999994938 UP_KEYWORDS Transducer 4 3.3057851239669422 0.8870340956077949 ENSG00000180974, ENSG00000171596, ENSG00000162188, ENSG00000176533 117 899 20581 0.7826740062557638 1.0 0.9999111373289696 99.99999999956434 UP_KEYWORDS Lipoprotein 3 2.479338842975207 0.955850044661467 ENSG00000162188, ENSG00000171450, ENSG00000176533 117 852 20581 0.619387263753461 1.0 0.9999967759511403 99.99999999999999 UP_KEYWORDS Cell membrane 8 6.6115702479338845 0.9994661673136622 ENSG00000165246, ENSG00000180974, ENSG00000171596, ENSG00000180638, ENSG00000110934, ENSG00000162188, ENSG00000171450, ENSG00000176533 117 3175 20581 0.44322767346389397 1.0 0.9999999999167455 100.0 GOTERM_CC_DIRECT GO:0005886~plasma membrane 7 5.785123966942149 0.9999996888448728 ENSG00000180974, ENSG00000171596, ENSG00000180638, ENSG00000121691, ENSG00000162188, ENSG00000171450, ENSG00000176533 108 4121 18224 0.2866258639129301 1.0 1.0 100.0 Annotation Cluster 8 Enrichment Score: 0.018488677632230974 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR GOTERM_MF_DIRECT GO:0005524~ATP binding 5 4.132231404958678 0.939834701022917 ENSG00000067048, ENSG00000254598, ENSG00000259332, ENSG00000197142, ENSG00000197616 83 1495 16881 0.6802192045775074 1.0 0.9999999994012004 99.99999999999953 UP_SEQ_FEATURE nucleotide phosphate-binding region:ATP 3 2.479338842975207 0.9583380444526277 ENSG00000067048, ENSG00000254598, ENSG00000197616 99 994 20063 0.6116395341747455 1.0 1.0 100.0 UP_KEYWORDS ATP-binding 5 4.132231404958678 0.9583551588999236 ENSG00000067048, ENSG00000254598, ENSG00000259332, ENSG00000197142, ENSG00000197616 117 1391 20581 0.6323004417900177 1.0 0.9999964597284898 100.0 UP_KEYWORDS Nucleotide-binding 6 4.958677685950414 0.9771215666158212 ENSG00000067048, ENSG00000254598, ENSG00000259332, ENSG00000197142, ENSG00000083750, ENSG00000197616 117 1788 20581 0.5902885332415534 1.0 0.9999993115933432 100.0 Annotation Cluster 9 Enrichment Score: 0.007646781441465043 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR UP_SEQ_FEATURE transmembrane region 19 15.702479338842975 0.957339021402744 ENSG00000171840, ENSG00000127540, ENSG00000198133, ENSG00000139370, ENSG00000138944, ENSG00000170091, ENSG00000086159, ENSG00000171596, ENSG00000180638, ENSG00000219438, ENSG00000197142, ENSG00000178449, ENSG00000228474, ENSG00000179542, ENSG00000180974, ENSG00000165246, ENSG00000165188, ENSG00000166002, ENSG00000205670 99 5056 20063 0.7615654168264927 1.0 1.0 100.0 UP_SEQ_FEATURE topological domain:Extracellular 9 7.43801652892562 0.9707381785755949 ENSG00000171840, ENSG00000179542, ENSG00000165246, ENSG00000198133, ENSG00000180974, ENSG00000138944, ENSG00000086159, ENSG00000171596, ENSG00000180638 99 2787 20063 0.6544345500212024 1.0 1.0 100.0 UP_SEQ_FEATURE topological domain:Cytoplasmic 11 9.090909090909092 0.9816966995687397 ENSG00000171840, ENSG00000179542, ENSG00000165246, ENSG00000198133, ENSG00000180974, ENSG00000138944, ENSG00000170091, ENSG00000086159, ENSG00000171596, ENSG00000180638, ENSG00000197142 99 3456 20063 0.6450295781893004 1.0 1.0 100.0 UP_KEYWORDS Transmembrane helix 23 19.00826446280992 0.9864641246740288 ENSG00000171840, ENSG00000127540, ENSG00000198133, ENSG00000139370, ENSG00000205639, ENSG00000138944, ENSG00000170091, ENSG00000086159, ENSG00000171596, ENSG00000180638, ENSG00000219438, ENSG00000197142, ENSG00000178449, ENSG00000228474, ENSG00000179542, ENSG00000214194, ENSG00000180974, ENSG00000165246, ENSG00000270136, ENSG00000165188, ENSG00000243317, ENSG00000166002, ENSG00000205670 117 5634 20581 0.7181110413272288 1.0 0.9999997967109956 100.0 UP_KEYWORDS Transmembrane 23 19.00826446280992 0.9871177279266715 ENSG00000171840, ENSG00000127540, ENSG00000198133, ENSG00000139370, ENSG00000205639, ENSG00000138944, ENSG00000170091, ENSG00000086159, ENSG00000171596, ENSG00000180638, ENSG00000219438, ENSG00000197142, ENSG00000178449, ENSG00000228474, ENSG00000179542, ENSG00000214194, ENSG00000180974, ENSG00000165246, ENSG00000270136, ENSG00000165188, ENSG00000243317, ENSG00000166002, ENSG00000205670 117 5651 20581 0.7159507355932768 1.0 0.9999997573537213 100.0 GOTERM_CC_DIRECT GO:0005887~integral component of plasma membrane 4 3.3057851239669422 0.9913310609603783 ENSG00000171840, ENSG00000165246, ENSG00000086159, ENSG00000171596 108 1415 18224 0.47700562753566284 1.0 0.9999999999999987 100.0 GOTERM_CC_DIRECT GO:0016021~integral component of membrane 21 17.355371900826448 0.9920901906582361 ENSG00000171840, ENSG00000127540, ENSG00000198133, ENSG00000139370, ENSG00000138944, ENSG00000205639, ENSG00000170091, ENSG00000086159, ENSG00000171596, ENSG00000180638, ENSG00000219438, ENSG00000197142, ENSG00000178449, ENSG00000228474, ENSG00000179542, ENSG00000214194, ENSG00000180974, ENSG00000165188, ENSG00000243317, ENSG00000166002, ENSG00000205670 108 5163 18224 0.6863365399100437 1.0 0.999999999999993 100.0 UP_KEYWORDS Membrane 31 25.6198347107438 0.994169485652969 ENSG00000171840, ENSG00000198133, ENSG00000170091, ENSG00000253626, ENSG00000171596, ENSG00000180638, ENSG00000219438, ENSG00000197142, ENSG00000162188, ENSG00000228474, ENSG00000179542, ENSG00000180389, ENSG00000124172, ENSG00000110934, ENSG00000132846, ENSG00000165188, ENSG00000243317, ENSG00000166002, ENSG00000176533, ENSG00000127540, ENSG00000205639, ENSG00000138944, ENSG00000139370, ENSG00000086159, ENSG00000178449, ENSG00000214194, ENSG00000165246, ENSG00000180974, ENSG00000270136, ENSG00000171450, ENSG00000205670 117 7494 20581 0.7276601908307273 1.0 0.9999999668887811 100.0 Annotation Cluster 10 Enrichment Score: 0.002100062360351966 Category Term Count % PValue Genes List Total Pop Hits Pop Total Fold Enrichment Bonferroni Benjamini FDR UP_SEQ_FEATURE disulfide bond 9 7.43801652892562 0.9804366159733408 ENSG00000165246, ENSG00000180974, ENSG00000171596, ENSG00000096088, ENSG00000105472, ENSG00000143869, ENSG00000164241, ENSG00000168878, ENSG00000248099 99 2917 20063 0.6252688004487799 1.0 1.0 100.0 UP_SEQ_FEATURE signal peptide 9 7.43801652892562 0.9953586987120217 ENSG00000179542, ENSG00000165246, ENSG00000138944, ENSG00000096088, ENSG00000105472, ENSG00000143869, ENSG00000168878, ENSG00000173401, ENSG00000248099 99 3346 20063 0.5451013421724719 1.0 1.0 100.0 UP_KEYWORDS Disulfide bond 11 9.090909090909092 0.995972355429861 ENSG00000165246, ENSG00000180974, ENSG00000171596, ENSG00000096088, ENSG00000270136, ENSG00000105472, ENSG00000106153, ENSG00000143869, ENSG00000164241, ENSG00000168878, ENSG00000248099 117 3434 20581 0.5634728631234164 1.0 0.9999999857858547 100.0 UP_KEYWORDS Signal 12 9.917355371900827 0.9994413719687681 ENSG00000179542, ENSG00000214194, ENSG00000165246, ENSG00000138944, ENSG00000096088, ENSG00000270136, ENSG00000105472, ENSG00000143869, ENSG00000168878, ENSG00000173401, ENSG00000243317, ENSG00000248099 117 4160 20581 0.5074211045364891 1.0 0.999999999940204 100.0 UP_SEQ_FEATURE glycosylation site:N-linked (GlcNAc...) 7 5.785123966942149 0.9999909766059296 ENSG00000179542, ENSG00000165246, ENSG00000180974, ENSG00000138944, ENSG00000171596, ENSG00000143869, ENSG00000168878 99 4234 20063 0.33504864421255537 1.0 1.0 100.0 UP_KEYWORDS Glycoprotein 8 6.6115702479338845 0.9999995368222665 ENSG00000179542, ENSG00000165246, ENSG00000180974, ENSG00000138944, ENSG00000171596, ENSG00000105472, ENSG00000143869, ENSG00000168878 117 4551 20581 0.309217284827041 1.0 1.0 100.0
Compare ZINB estimate against pooled MLE.
S, T = log_mu.mask(mask).align(sc_log_rho.mask(mask), join='inner') del S['NA18498'] del T['NA18498'] diff = abs(S - T)
diff.describe()
NA18489 NA18499 NA18501 NA18502 NA18505 \ count 1.019600e+04 10196.000000 10196.000000 10196.000000 10196.000000 mean 1.178346e-01 0.121262 0.135809 0.139497 0.156081 std 1.363366e-01 0.136127 0.141259 0.146562 0.151453 min 6.676062e-07 0.000004 0.000002 0.000004 0.000025 25% 8.943889e-03 0.008586 0.015372 0.014142 0.024955 50% 4.863866e-02 0.054939 0.075093 0.077339 0.096931 75% 2.057522e-01 0.219528 0.237000 0.249202 0.275236 max 5.949735e-01 0.599258 0.597418 0.643712 0.777383 NA18507 NA18508 NA18511 NA18516 NA18517 \ count 10196.000000 10196.000000 1.019600e+04 10196.000000 10196.000000 mean 0.126535 0.150258 1.399941e-01 0.148479 0.127944 std 0.137818 0.148053 1.467701e-01 0.152430 0.137889 min 0.000003 0.000003 5.337702e-07 0.000011 0.000026 25% 0.011150 0.018908 1.633549e-02 0.017336 0.016981 50% 0.062832 0.096256 7.651160e-02 0.084013 0.060533 75% 0.226499 0.261811 2.471018e-01 0.263972 0.221579 max 0.562528 0.631915 6.857363e-01 0.633576 0.608121 ... NA19190 NA19193 NA19203 NA19204 \ count ... 10196.000000 10196.000000 10196.000000 10196.000000 mean ... 0.136877 0.132370 0.172977 0.146066 std ... 0.144241 0.144685 0.160661 0.147889 min ... 0.000002 0.000006 0.000002 0.000017 25% ... 0.016106 0.013751 0.026534 0.023519 50% ... 0.069349 0.061976 0.119054 0.079187 75% ... 0.242639 0.235401 0.304475 0.260124 max ... 0.655097 0.644531 0.684917 0.617891 NA19206 NA19207 NA19209 NA19210 NA19225 \ count 10196.000000 1.019600e+04 10196.000000 10196.000000 10196.000000 mean 0.119952 1.100978e-01 0.124245 0.127634 0.142626 std 0.137472 1.297691e-01 0.140236 0.139594 0.145838 min 0.000001 6.039983e-07 0.000005 0.000013 0.000010 25% 0.009453 7.563498e-03 0.010923 0.013301 0.018521 50% 0.049953 4.321796e-02 0.054123 0.060849 0.083325 75% 0.211579 1.917986e-01 0.220077 0.224234 0.247469 max 0.615195 5.967968e-01 0.605708 0.657102 0.667808 NA19257 count 10196.000000 mean 0.123779 std 0.137379 min 0.000002 25% 0.010887 50% 0.058749 75% 0.216238 max 0.630771 [8 rows x 53 columns]
pd.Series([st.mstats.spearmanr(S[i], T[i]).correlation for i in S]).describe()
count 53.000000 mean 0.998627 std 0.000324 min 0.997574 25% 0.998491 50% 0.998607 75% 0.998778 max 0.999402 dtype: float64
plot_concordance_rho( S, T, xlabel='ZINB ln relative abundance', ylabel='Pooled ln relative abundance', output_dir='/project2/mstephens/aksarkar/projects/singlecell-qtl/analysis/figure/sc-vs-bulk.org/zinb-vs-pooled')
Look at \((1 - \pi) \mu\).
logodds = pd.read_table('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/without-cell-cycle/zi2-logodds.txt.gz', sep=' ', index_col=0) corrected_log_mu = pd.read_table('/scratch/midway2/aksarkar/singlecell/density-estimation/without-cell-cycle/zi2-log-mu.txt.gz', sep=' ', index_col=0) corrected_log_mu *= sp.expit(logodds) corrected_log_mu -= sp.logsumexp(corrected_log_mu)
S, T = (bulk_log_rho.loc[keep_genes.values.ravel()] .mask(mask) .align(corrected_log_mu.mask(mask), join='inner'))
pd.Series([st.mstats.spearmanr(S[i], T[i]).correlation for i in S]).describe()
count 50.000000 mean 0.737392 std 0.026686 min 0.662447 25% 0.723944 50% 0.743025 75% 0.755475 max 0.781169 dtype: float64
S, T = (sc_log_rho.loc[keep_genes.values.ravel()] .mask(mask) .align(corrected_log_mu.mask(mask), join='inner')) del S["NA18507"] del T["NA18507"]
pd.Series([st.mstats.spearmanr(S[i], T[i]).correlation for i in S]).describe()
count 53.000000 mean 0.960098 std 0.009965 min 0.929497 25% 0.953466 50% 0.960169 75% 0.967074 max 0.976472 dtype: float64
Plot bulk vs. pooled subsets
Plot concordance between bulk vs pools of single cells, focusing on genes which have log-transformed expression at least 1 in both assays.
plot_concordance_by_num_cells( 'NA18507', umi, annotations, bulk_log_tpm, '/project2/mstephens/aksarkar/projects/singlecell-qtl/analysis/figure/sc-vs-bulk.org/vs-cells/' )
Plot pooled subsets vs. pooled subsets
Ensure that pools don't overlap by randomly sampling double the cells and partitioning into two halves.
plot_concordance_pooled_subsets( 'NA18507', umi, annotations, '/project2/mstephens/aksarkar/projects/singlecell-qtl/analysis/figure/sc-vs-bulk.org/subsets/' )
Chu et al hESC
Chu et al 2016 profiled hESC using single cell and matched bulk RNA-Seq (GSE75748). Analyze their data analagously to understand whether the correlation we observe is anomalous.
curl -sO --ftp-pasv ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE75nnn/GSE75748/suppl/GSE75748_bulk_cell_type_ec.csv.gz
curl -sO --ftp-pasv ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE75nnn/GSE75748/suppl/GSE75748_sc_cell_type_ec.csv.gz
chu_bulk_tpm = pd.read_table('/scratch/midway2/aksarkar/singlecell/hesc/GSE75748_bulk_cell_type_ec.csv.gz', sep=',', index_col=0)
T = np.log(chu_bulk_tpm) - np.log(chu_bulk_tpm.sum(axis=0)) R = pd.DataFrame([(i, j, st.mstats.spearmanr(T[i], T[j]).correlation) for i, j in it.combinations(sorted(T.columns), 2)]) M = R.pivot(index=0, columns=1, values=2).T
plt.clf() plt.imshow(M, cmap=colorcet.cm['kr']) cb = plt.colorbar() cb.set_label('Spearman correlation') plt.gca().set_aspect('equal') plt.xticks(range(M.shape[0]), M.columns, rotation=90) _ = plt.yticks(range(M.shape[1]), M.index)
chu_sc_tpm = pd.read_table('/scratch/midway2/aksarkar/singlecell/hesc/GSE75748_sc_cell_type_ec.csv.gz', sep=',', index_col=0)
for k in ('H1', 'H9', 'DEC', 'EC', 'HFF', 'NPC', 'TB'): bulk_rho = chu_bulk_tpm.filter(like=k, axis='columns').agg(np.mean, axis=1) bulk_rho = np.log(bulk_rho) - np.log(bulk_rho.sum(axis=0)) sc_rho = chu_sc_tpm.filter(like=k, axis='columns').agg(np.mean, axis=1) sc_rho = np.log(sc_rho) - np.log(sc_rho.sum(axis=0)) x = sc_rho.mask(mask).dropna().to_frame() y = bulk_rho.mask(mask).dropna().to_frame() plot_concordance( x=x, y=y, title=k, gridsize=20, filename='/project2/mstephens/aksarkar/projects/singlecell-qtl/analysis/figure/sc-vs-bulk.org/hesc/{}.svg'.format(k), xlabel='Single cell ln relative abundance', ylabel='Bulk ln relative abundance', )