Mapping new data to the Human Neural Organoid Cell Atlas (HNOCA)

We built the HNOCA as a comprehensive collection of (healthy) neural organoid datasets. One can view it as a baseline of how neural organoids can "look like" and therefore it can be very valuable to compare new experimental data, e.g. from disease models to the HNOCA. Moreover, we highly encourage the community to help further expand the atlas over time and submit new datasets to be inclued.

Here we'll show how you can map new data to the HNOCA and how to perform quantitative comparisons. Analogous to our analysis in the manuscript, we map scRNA-seq data from a brain organoid model of fragile X syndrome (Kang et al.) and then perform DE analysis. First, let's load all the necessary libraries and the HNOCA data.

import os

import matplotlib.pyplot as plt
import numpy as np
import scanpy as sc
import scarches

import hnoca.mapping as hmap
import hnoca.stats as stats

os.chdir("/home/fleckj/scratch/hnoca/")

# Read reference and query data
hnoca_adata = sc.read("HNOCA_hv3k.h5ad")
query_adata = sc.read("KangWen2021_FXS.h5ad")

# Format batch columns
query_adata.obs["batch"] = query_adata.obs["sample"].astype(str).copy()

# Load reference model
hnoca_model = scarches.models.scPoli.load(
    "scpoli_HNOCA/",
    adata=hnoca_adata,
)
print(hnoca_model)

AnnData object with n_obs × n_vars = 1751568 × 3000
    obs: 'assay_sc', 'assay_differentiation', 'assay_type_differentiation', 'bio_sample', 'cell_line', 'cell_type', 'development_stage', 'disease', 'ethnicity', 'gm', 'id', 'individual', 'organ', 'organism', 'sex', 'state_exact', 'sample_source', 'source_doi', 'suspension_type_original', 'tech_sample', 'treatment', 'assay_sc_original', 'cell_line_original', 'cell_type_original', 'development_stage_original', 'disease_original', 'ethnicity_original', 'organ_original', 'organism_original', 'sex_original', 'suspension_type', 'obs_names_original', 'organoid_age_days', 'batch', 'publication', 'doi', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'log_n_genes_by_counts', 'log1p_pct_counts_mt', 'leiden_1', 'leiden_10', 'leiden_pca_1', 'leiden_pca_10', 'leiden_20', 'leiden_80', 'leiden_pca_20', 'leiden_pca_80', 'snap_clusters', 'snap_clusters_pca', 'level_1_pca', 'level_2_pca', 'level_3_pca', 'level_4_pca', 'level_5_pca', 'level_12_pca', 'level_123_pca', 'level_1234_pca', 'level_12345_pca', 'leiden_80_scpoli_h123', 'level_1_rss', 'level_2_rss', 'level_3_rss', 'level_4_rss', 'level_5_rss', 'level_12_rss', 'level_123_rss', 'level_1234_rss', 'level_12345_rss', 'level_12', 'level_123', 'level_1234', 'level_12345', 'braun_projected_class', 'braun_projected_region', 'level_1', 'level_2', 'level_3', 'level_4', 'level_5'
    var: 'ensembl', 'gene_symbol', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'gene_length', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'assay_sc_colors', 'braun_projected_class_colors', 'braun_projected_region_colors', 'hvg', 'knn_aggrecell_scpoli_level1', 'knn_pca', 'knn_rss_pca', 'knn_scanvi_level1', 'knn_scanvi_level2', 'knn_scanvi_level3', 'knn_scanvi_pcaclust', 'knn_scpoli_hierarchical123', 'knn_scpoli_level1', 'knn_scpoli_level1_pcaclust', 'knn_scpoli_level2', 'knn_scpoli_level3', 'knn_scvi', 'leiden', 'level_12345_colors', 'level_1234_colors', 'level_123_colors', 'level_123_rss_colors', 'level_123_rss_sizes', 'level_123_sizes', 'level_12_colors', 'level_1_colors', 'level_2_colors', 'level_3_colors', 'level_3_rss_colors', 'level_5_colors', 'log1p', 'neighbors', 'paga', 'publication_colors', 'umap'
    obsm: 'X_aggrecell_scpoli_level1', 'X_pca', 'X_rss', 'X_rss_pca', 'X_scanvi_level1', 'X_scanvi_level1_pcaclust', 'X_scanvi_level2', 'X_scanvi_level3', 'X_scpoli_hierarchical123', 'X_scpoli_level1', 'X_scpoli_level1_pcaclust', 'X_scpoli_level2', 'X_scpoli_level3', 'X_scvi', 'X_umap', 'X_umap_aggrecell_scpoli_level1', 'X_umap_pca', 'X_umap_rss_pca', 'X_umap_scanvi_level1', 'X_umap_scanvi_level2', 'X_umap_scanvi_level3', 'X_umap_scanvi_pcaclust', 'X_umap_scpoli_hierarchical123', 'X_umap_scpoli_level1', 'X_umap_scpoli_level1_pcaclust', 'X_umap_scpoli_level2', 'X_umap_scpoli_level3', 'X_umap_scvi'
    layers: 'counts'
    obsp: 'connectivities', 'distances', 'knn_aggrecell_scpoli_level1_connectivities', 'knn_aggrecell_scpoli_level1_distances', 'knn_pca_connectivities', 'knn_pca_distances', 'knn_rss_pca_connectivities', 'knn_rss_pca_distances', 'knn_scanvi_level1_connectivities', 'knn_scanvi_level1_distances', 'knn_scanvi_level2_connectivities', 'knn_scanvi_level2_distances', 'knn_scanvi_level3_connectivities', 'knn_scanvi_level3_distances', 'knn_scanvi_pcaclust_connectivities', 'knn_scanvi_pcaclust_distances', 'knn_scpoli_hierarchical123_connectivities', 'knn_scpoli_hierarchical123_distances', 'knn_scpoli_level1_connectivities', 'knn_scpoli_level1_distances', 'knn_scpoli_level1_pcaclust_connectivities', 'knn_scpoli_level1_pcaclust_distances', 'knn_scpoli_level2_connectivities', 'knn_scpoli_level2_distances', 'knn_scpoli_level3_connectivities', 'knn_scpoli_level3_distances', 'knn_scvi_connectivities', 'knn_scvi_distances'
Embedding dictionary:
    Num conditions: [396]
    Embedding dim: [5]
Encoder Architecture:
    Input Layer in, out and cond: 3000 1024 5
    Mean/Var Layer in/out: 1024 10
Decoder Architecture:
    First Layer in, out and cond:  10 1024 5
    Output Layer in/out:  1024 3000 

<scarches.models.scpoli.scpoli_model.scPoli object at 0x14bda502a4a0>

Now we have eveything together to start the reference mapping. Here we are finetuning the HNOCA scPoli model for 30 epochs (n_epochs) retraining only the weights for the new batch covariates (retrain="partial").

mapper = hmap.AtlasMapper(hnoca_model)
mapper.map_query(query_adata, retrain="partial", n_epochs=30, pretraining_epochs=20, eta=10, batch_size=1024)
mapper.save("HNOCA_FXS_mapper/")

With this finetuned model we can then compute a weighted kNN graph between the query and reference data.

mapper = hmap.AtlasMapper.load("HNOCA_FXS_mapper/")

mapper.compute_wknn(k=100)

Using the wkNN graph, we can now transfer cell type labels from the HNOCA to the query data and compute a UMAP from the latent space of the scPoli model.

ct_transfer = mapper.transfer_labels(label_key="level_1_pca")
ct_transfer[["best_label", "best_score"]]

	best_label	best_score
AAACCCACAGTTCCAA.4	neural_progenitor_cell	0.831662
AAACCCAGTTGCATCA.4	neural_progenitor_cell	1.733664
AAACCCAGTTGCATTG.4	neural_progenitor_cell	2.297948
AAACCCATCCACGTGG.4	neuron	1.610558
AAACGAACACAGTGAG.4	neuron	0.854941
...	...	...
TTTGGTTGTCACCCTT.12	choroid_plexus_epithelium	1.629327
TTTGTTGAGACCTTTG.12	choroid_plexus_epithelium	0.321833
TTTGTTGAGGGATGTC.12	neural_progenitor_cell	1.303548
TTTGTTGCAATATCCG.12	neural_progenitor_cell	0.817161
TTTGTTGCATGCGTGC.12	neuron	0.531878

36557 rows × 2 columns

X_latent = mapper.get_latent_representation(query_adata)
query_adata.obsm["X_latent"] = X_latent
query_adata.obs["best_label"] = ct_transfer["best_label"]

sc.pp.neighbors(query_adata, use_rep="X_latent", method="rapids")
sc.tl.umap(query_adata, min_dist=0.3, method="rapids")
sc.pl.umap(query_adata, color=["condition", "best_label"], wspace=0.4)

Warning: Query dataset is missing 2472 features from the reference dataset. Adding missing features as zero-filled columns.
WARNING: .obsp["connectivities"] have not been computed using umap

No description has been provided for this image

Looks quite good! Now we can quantify the difference between the FXS dataset and the HNOCA. For this, we first use the wkNN graph to find matched (meta-)cells for each cell in the FXS dataset. Then we perform a differential expression analysis between the matched HNOCA cells and the query using an F-test.

matched_adata = mapper.get_matched_expression()
matched_adata

AnnData object with n_obs × n_vars = 36557 × 3000
    obs: 'sample', 'age', 'sex', 'disease', 'condition', 'publication', 'sc_method', 'batch_in_data', '_scvi_batch', '_scvi_labels', 'snapseed_level_1', 'snapseed_level_2', 'snapseed_level_3', 'snapseed_level_4', 'snapseed_level_5', 'snapseed_level_1_final', 'snapseed_level_2_final', 'snapseed_level_3_final', 'snapseed_level_4_final', 'snapseed_level_5_final', 'conditions_combined', 'annot_level_1', 'annot_level_2', 'annot_level_3', 'annot_level_4', 'annot_region', 'CellClass', 'Subregion', 'SummarizedRegion', 'NeurotransmitterTransporter', 'CellType', 'annot_region_rev', 'SummarizedRegion_hier', 'annot_region_rev2', 'annot_level_1_plus', 'annot_level_2_plus', 'annot_level_3_plus', 'annot_level_4_plus', 'annot_region_plus', 'annot_region_rev_plus', 'annot_region_rev2_plus', 'pearsonr_HNOCA_matched', 'is_control', 'avgdist_HNOCA_matched', 'pearsonr_HNOCA_matched_mtl', 'full_sample', 'nFeatures', 'louvain', 'louvain_1', 'batch', 'snapseed_pca_rss_level_1', 'snapseed_pca_rss_level_12', 'snapseed_pca_rss_level_123'
    var: 'ensembl', 'gene_symbol', 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'gene_length', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    obsm: 'X_latent'

paired_de = stats.test_de_paired(
    query_adata,
    matched_adata,
    num_threads=4,
    var_names=query_adata.var_names,
    adjust_method="holm",
)
paired_de

	f	coef	pval	padj
A2M	1.005743	-0.010660	5.840427e-01	1.000000e+00
ACTA2	1.005277	-0.019927	6.148829e-01	1.000000e+00
ACTC1	1.004356	-0.020906	6.777461e-01	1.000000e+00
ACTN2	1.012864	-0.019045	2.217502e-01	1.000000e+00
ADCYAP1	1.073479	-0.076012	1.221778e-11	4.398402e-09
...	...	...	...	...
ZIC1	1.182211	-0.311655	2.220446e-16	1.172396e-13
ZIC2	1.003396	-0.041030	7.458421e-01	1.000000e+00
ZIC4	1.150357	-0.153911	2.220446e-16	1.172396e-13
ZNF503	1.013888	0.077502	1.873387e-01	1.000000e+00
ZNF843	1.002726	-0.005273	7.946881e-01	1.000000e+00

528 rows × 4 columns

plt.scatter(paired_de["coef"], -np.log10(paired_de["pval"]), s=10)
plt.ylabel("-Log10(P)")
plt.show()