Cell type deconvolution using STitch3D

We first use STitch3D to estimate cell type proportions. After loading the MCube package in R, you can call system.file("deconvolution", package = "MCube") to obtain the path of the modified version of STitch3D used here.

[1]:
import sys
sys.path.append(r'/import/home/share/zw/pql/STitch3D/')
import STitch3D

import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad
import scipy.io
import matplotlib.pyplot as plt
import os

import warnings
warnings.filterwarnings("ignore")

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
/home/zwanghc/anaconda3/envs/stitch3d/lib/python3.7/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
[2]:
DATA_PATH = "/import/home/share/zw/data/DLPFC" # Raw data
SAVE_PATH = "/import/home/share/zw/pql/data/DLPFC" # Deconvolution results

os.makedirs(os.path.join(SAVE_PATH), exist_ok = True)
[3]:
# snRNA-seq reference data

mat = scipy.io.mmread(os.path.join(DATA_PATH, "snRNAseq_brain/GSE144136_GeneBarcodeMatrix_Annotated.mtx"))
meta = pd.read_csv(os.path.join(DATA_PATH, "snRNAseq_brain/GSE144136_CellNames.csv"), index_col=0)
meta.index = meta.x.values
group = [i.split('.')[1].split('_')[0] for i in list(meta.x.values)]
condition = [i.split('.')[1].split('_')[1] for i in list(meta.x.values)]
celltype = [i.split('.')[0] for i in list(meta.x.values)]
meta["group"] = group
meta["condition"] = condition
meta["celltype"] = celltype
genename = pd.read_csv(os.path.join(DATA_PATH, "snRNAseq_brain/GSE144136_GeneNames.csv"), index_col=0)
genename.index = genename.x.values
adata_ref = ad.AnnData(X=mat.tocsr().T)
adata_ref.obs = meta
adata_ref.var = genename
adata_ref = adata_ref[adata_ref.obs.condition.values.astype(str)=="Control", :]
[4]:
adata_ref
[4]:
View of AnnData object with n_obs × n_vars = 35212 × 30062
    obs: 'x', 'group', 'condition', 'celltype'
    var: 'x'
[5]:
# sc_counts = pd.DataFrame(adata_ref.X.toarray().astype(int), index=adata_ref.obs_names, columns=adata_ref.var_names)
# sc_labels = pd.DataFrame(adata_ref.obs['celltype'], index=adata_ref.obs_names, columns=['celltype'])

# sc_counts.to_csv(os.path.join(DATA_PATH, "snRNAseq_brain/sc_counts.csv"))
# sc_labels.to_csv(os.path.join(DATA_PATH, "snRNAseq_brain/sc_labels.csv"))
[6]:
# ST data

anno_df = pd.read_csv(os.path.join(DATA_PATH, "spatialLIBD/barcode_level_layer_map.tsv"), sep='\t', header=None)

slice_idx = [151673, 151674, 151675, 151676]

adata_st1 = sc.read_visium(path = os.path.join(DATA_PATH, "spatialLIBD/%d" % slice_idx[0]),
                          count_file = "%d_filtered_feature_bc_matrix.h5" % slice_idx[0])
anno_df1 = anno_df.iloc[anno_df[1].values.astype(str) == str(slice_idx[0])]
anno_df1.columns = ["barcode", "slice_id", "layer"]
anno_df1.index = anno_df1['barcode']
adata_st1.obs = adata_st1.obs.join(anno_df1, how="left")
adata_st1 = adata_st1[adata_st1.obs['layer'].notna()]

adata_st2 = sc.read_visium(path = os.path.join(DATA_PATH, "spatialLIBD/%d" % slice_idx[1]),
                          count_file = "%d_filtered_feature_bc_matrix.h5" % slice_idx[1])
anno_df2 = anno_df.iloc[anno_df[1].values.astype(str) == str(slice_idx[1])]
anno_df2.columns = ["barcode", "slice_id", "layer"]
anno_df2.index = anno_df2['barcode']
adata_st2.obs = adata_st2.obs.join(anno_df2, how="left")
adata_st2 = adata_st2[adata_st2.obs['layer'].notna()]

adata_st3 = sc.read_visium(path = os.path.join(DATA_PATH, "spatialLIBD/%d" % slice_idx[2]),
                          count_file = "%d_filtered_feature_bc_matrix.h5" % slice_idx[2])
anno_df3 = anno_df.iloc[anno_df[1].values.astype(str) == str(slice_idx[2])]
anno_df3.columns = ["barcode", "slice_id", "layer"]
anno_df3.index = anno_df3['barcode']
adata_st3.obs = adata_st3.obs.join(anno_df3, how="left")
adata_st3 = adata_st3[adata_st3.obs['layer'].notna()]

adata_st4 = sc.read_visium(path = os.path.join(DATA_PATH, "spatialLIBD/%d" % slice_idx[3]),
                          count_file = "%d_filtered_feature_bc_matrix.h5" % slice_idx[3])
anno_df4 = anno_df.iloc[anno_df[1].values.astype(str) == str(slice_idx[3])]
anno_df4.columns = ["barcode", "slice_id", "layer"]
anno_df4.index = anno_df4['barcode']
adata_st4.obs = adata_st4.obs.join(anno_df4, how="left")
adata_st4 = adata_st4[adata_st4.obs['layer'].notna()]
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
[7]:
# anno_df = pd.read_csv(os.path.join(DATA_PATH, "spatialLIBD/barcode_level_layer_map.tsv"), sep='\t', header=None)

# slice_idx = [151673, 151674, 151675, 151676]

# adata_st1 = sc.read_visium(path = os.path.join(DATA_PATH, "spatialLIBD/%d" % slice_idx[0]),
#                           count_file = "%d_filtered_feature_bc_matrix.h5" % slice_idx[0])
# anno_df1 = anno_df.iloc[anno_df[1].values.astype(str) == str(slice_idx[0])]
# anno_df1.columns = ["barcode", "slice_id", "layer"]
# anno_df1.index = anno_df1['barcode']
# adata_st1.obs = adata_st1.obs.join(anno_df1, how="left")
# adata_st1 = adata_st1[adata_st1.obs['layer'].notna()]
# adata_st1.var_names_make_unique()

# st_counts_1 = pd.DataFrame(adata_st1.X.toarray().astype(int), index=adata_st1.obs_names, columns=adata_st1.var_names)
# st_coordinates_1 = pd.DataFrame(data = adata_st1.obsm['spatial'], index = adata_st1.obs.index, columns=['x', 'y'])

# st_counts_1.to_csv(os.path.join(DATA_PATH, "spatialLIBD/st_counts_slice0.csv"))
# st_coordinates_1.to_csv(os.path.join(DATA_PATH, "spatialLIBD/st_coordinates_slice0.csv"))
[8]:
# # Record the original coordinates (for visualization)

# coordinates_1 = pd.DataFrame(data = adata_st1.obsm['spatial'], index = adata_st1.obs.index, columns=['x', 'y'])
# coordinates_1.to_csv(os.path.join(DATA_PATH, "2D_coordinates_slice0.csv"))

# coordinates_2 = pd.DataFrame(data = adata_st2.obsm['spatial'], index = adata_st2.obs.index, columns=['x', 'y'])
# coordinates_2.to_csv(os.path.join(DATA_PATH, "2D_coordinates_slice1.csv"))

# coordinates_3 = pd.DataFrame(data = adata_st3.obsm['spatial'], index = adata_st3.obs.index, columns=['x', 'y'])
# coordinates_3.to_csv(os.path.join(DATA_PATH, "2D_coordinates_slice2.csv"))

# coordinates_4 = pd.DataFrame(data = adata_st4.obsm['spatial'], index = adata_st4.obs.index, columns=['x', 'y'])
# coordinates_4.to_csv(os.path.join(DATA_PATH, "2D_coordinates_slice3.csv"))
[9]:
# Align four ST slices
adata_st_list_raw = [adata_st1, adata_st2, adata_st3, adata_st4]
adata_st_list = STitch3D.utils.align_spots(adata_st_list_raw, plot=True)
../../_images/analysis_DLPFC_DLPFC_STitch3D_10_0.png
Using the Iterative Closest Point algorithm for alignemnt.
Detecting edges...
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Aligning edges...
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
../../_images/analysis_DLPFC_DLPFC_STitch3D_10_5.png
[10]:
celltype_list_use = ['Astros_1', 'Astros_2', 'Astros_3', 'Endo', 'Micro/Macro',
                     'Oligos_1', 'Oligos_2', 'Oligos_3',
                     'Ex_1_L5_6', 'Ex_2_L5', 'Ex_3_L4_5', 'Ex_4_L_6', 'Ex_5_L5',
                     'Ex_6_L4_6', 'Ex_7_L4_6', 'Ex_8_L5_6', 'Ex_9_L5_6', 'Ex_10_L2_4']

adata_st, adata_basis = STitch3D.utils.preprocess(adata_st_list,adata_ref,
                                                celltype_ref = celltype_list_use,
                                                sample_col = "group",
                                                slice_dist_micron = [10., 300., 10.],
                                                n_hvg_group = 500,
                                                save_path = SAVE_PATH)
Finding highly variable genes...
Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Trying to set attribute `.obs` of view, copying.
... storing 'group' as categorical
... storing 'condition' as categorical
... storing 'celltype' as categorical
4558 highly variable genes selected.
Trying to set attribute `.obs` of view, copying.
Calculate basis for deconvolution...
1 batches are used for computing the basis vector of cell type <Astros_1>.
17 batches are used for computing the basis vector of cell type <Astros_2>.
14 batches are used for computing the basis vector of cell type <Astros_3>.
17 batches are used for computing the basis vector of cell type <Endo>.
17 batches are used for computing the basis vector of cell type <Ex_10_L2_4>.
15 batches are used for computing the basis vector of cell type <Ex_1_L5_6>.
15 batches are used for computing the basis vector of cell type <Ex_2_L5>.
17 batches are used for computing the basis vector of cell type <Ex_3_L4_5>.
14 batches are used for computing the basis vector of cell type <Ex_4_L_6>.
17 batches are used for computing the basis vector of cell type <Ex_5_L5>.
16 batches are used for computing the basis vector of cell type <Ex_6_L4_6>.
16 batches are used for computing the basis vector of cell type <Ex_7_L4_6>.
15 batches are used for computing the basis vector of cell type <Ex_8_L5_6>.
13 batches are used for computing the basis vector of cell type <Ex_9_L5_6>.
16 batches are used for computing the basis vector of cell type <Micro/Macro>.
8 batches are used for computing the basis vector of cell type <Oligos_1>.
4 batches are used for computing the basis vector of cell type <Oligos_2>.
16 batches are used for computing the basis vector of cell type <Oligos_3>.
Preprocess ST data...
Variable names are not unique. To make them unique, call `.var_names_make_unique`.
Start building a graph...
Radius for graph connection is 150.7000.
9.8415 neighbors per cell on average.
[11]:
adata_st
[11]:
AnnData object with n_obs × n_vars = 14243 × 4558
    obs: 'in_tissue', 'array_row', 'array_col', 'barcode', 'slice_id', 'layer', 'slice', 'batch', 'library_size'
    var: 'gene_ids', 'feature_types', 'genome'
    uns: 'log1p'
    obsm: 'spatial', 'spatial_aligned', 'count', 'graph', '3D_coor'
[12]:
# Run STitch3D
model = STitch3D.model.Model(adata_st, adata_basis)
model.train()
  0%|          | 2/20000 [00:00<1:01:10,  5.45it/s]
Step: 0, Loss: 2438.9314, d_loss: 2433.3323, f_loss: 55.9906
 10%|█         | 2002/20000 [06:01<54:21,  5.52it/s]
Step: 2000, Loss: 745.7781, d_loss: 742.4756, f_loss: 33.0255
 20%|██        | 4002/20000 [12:03<48:00,  5.55it/s]
Step: 4000, Loss: 706.0618, d_loss: 702.7811, f_loss: 32.8061
 30%|███       | 6002/20000 [18:05<41:52,  5.57it/s]
Step: 6000, Loss: 696.4056, d_loss: 693.1412, f_loss: 32.6439
 40%|████      | 8002/20000 [24:07<36:01,  5.55it/s]
Step: 8000, Loss: 693.6951, d_loss: 690.4426, f_loss: 32.5243
 50%|█████     | 10002/20000 [30:09<30:01,  5.55it/s]
Step: 10000, Loss: 693.0974, d_loss: 689.8430, f_loss: 32.5442
 60%|██████    | 12002/20000 [36:11<24:01,  5.55it/s]
Step: 12000, Loss: 691.7047, d_loss: 688.4637, f_loss: 32.4097
 70%|███████   | 14002/20000 [42:12<18:06,  5.52it/s]
Step: 14000, Loss: 692.3254, d_loss: 689.0718, f_loss: 32.5355
 80%|████████  | 16002/20000 [48:14<11:58,  5.57it/s]
Step: 16000, Loss: 690.6113, d_loss: 687.3817, f_loss: 32.2960
 90%|█████████ | 18002/20000 [54:16<05:58,  5.57it/s]
Step: 18000, Loss: 689.8818, d_loss: 686.6545, f_loss: 32.2729
100%|██████████| 20000/20000 [1:00:18<00:00,  5.53it/s]
[13]:
result = model.eval(adata_st_list_raw, save = True, output_path = SAVE_PATH)
... storing 'layer' as categorical
... storing 'feature_types' as categorical
... storing 'genome' as categorical
... storing 'layer' as categorical
... storing 'feature_types' as categorical
... storing 'genome' as categorical
... storing 'layer' as categorical
... storing 'feature_types' as categorical
... storing 'genome' as categorical
... storing 'layer' as categorical
... storing 'feature_types' as categorical
... storing 'genome' as categorical