Case V: STAVAG on mouse embryonic development data

[1]:
import re
import os
import STAVAG
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")
C:\Users\Administrator\AppData\Roaming\Python\Python39\site-packages\pandas\core\computation\expressions.py:21: UserWarning: Pandas requires version '2.8.4' or newer of 'numexpr' (version '2.8.1' currently installed).
  from pandas.core.computation.check import NUMEXPR_INSTALLED
C:\Users\Administrator\AppData\Roaming\Python\Python39\site-packages\pandas\core\arrays\masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.4' currently installed).
  from pandas.core import (

Data loading

Data can be downloaded from https://db.cngb.org/stomics/mosta/download/.

[2]:
# Specify main directory path
main_dir = r'E:\Fast_SVG\Data\steroeseq_MOSTA'

# Initialize list to store all AnnData objects
adata_list = []

# Iterate through folders and process each .h5ad file
for folder in sorted(os.listdir(main_dir)):
    folder_path = os.path.join(main_dir, folder)
    if os.path.isdir(folder_path):
        # Find .h5ad file
        h5ad_file = [f for f in os.listdir(folder_path) if f.endswith('.MOSTA.h5ad')]
        if h5ad_file:
            file_path = os.path.join(folder_path, h5ad_file[0])
            adata1 = sc.read_h5ad(file_path)

            # Extract time information (first two characters before '_' as time value)
            time_value = folder.split('_')[0].replace('E', '')

            # Add time column to obs
            adata1.obs['time'] = float(time_value)

            # Get spatial coordinates
            spatial_coords = adata1.obsm['spatial']

            # Rotate coordinates 180° clockwise by negation
            rotated_coords = -spatial_coords

            # Assign rotated coordinates back to adata
            adata1.obsm['spatial'] = rotated_coords

            # Additional flip on X-axis for correction
            adata1.obsm['spatial'][:, 0] = -1 * adata1.obsm['spatial'][:, 0]
            # Add processed AnnData to list
            adata_list.append(adata1)

# Concatenate all AnnData objects
adata = adata_list[0].concatenate(*adata_list[1:])
# [optional] You can convert adata.x to array to speed up STAVAG, but will increase the cost of memory.
#adata.x = adata.x.to_array()
[3]:
# Adjust the Angle of the visualization
time_9_5_cells = adata.obs['time'] == 9.5
adata.obsm['spatial'][time_9_5_cells, 0] -= 420

theta = 30 * np.pi / 180
rotation_matrix = np.array([[np.cos(theta), np.sin(theta)],
                            [-np.sin(theta), np.cos(theta)]])
spatial_coords = adata.obsm['spatial'][time_9_5_cells, :]
rotated_coords = spatial_coords.dot(rotation_matrix.T)
adata.obsm['spatial'][time_9_5_cells, :] = rotated_coords

adata.obsm['spatial'][time_9_5_cells, 0] -= 320
adata.obsm['spatial'][time_9_5_cells, 1] -= 100

time_10_5_cells = adata.obs['time'] == 10.5
adata.obsm['spatial'][time_10_5_cells, 0] -= 160

time_11_5_cells = adata.obs['time'] == 11.5
adata.obsm['spatial'][time_11_5_cells, 1] += 520
adata.obsm['spatial'][time_11_5_cells, 0] -= 20

time_12_5_cells = adata.obs['time'] == 12.5
adata.obsm['spatial'][time_12_5_cells, 1] += 540
adata.obsm['spatial'][time_12_5_cells, 0] += 140

time_13_5_cells = adata.obs['time'] == 13.5
adata.obsm['spatial'][time_13_5_cells, 1] += 500
adata.obsm['spatial'][time_13_5_cells, 0] += 90

time_14_5_cells = adata.obs['time'] == 14.5
adata.obsm['spatial'][time_14_5_cells, 1] -= 200
adata.obsm['spatial'][time_14_5_cells, 0] += 390

time_15_5_cells = adata.obs['time'] == 15.5
adata.obsm['spatial'][time_15_5_cells, 1] += 500
adata.obsm['spatial'][time_15_5_cells, 0] += 1170

time_16_5_cells = adata.obs['time'] == 16.5
adata.obsm['spatial'][time_16_5_cells, 1] += -190
adata.obsm['spatial'][time_16_5_cells, 0] += 1200
[4]:
sc.pp.filter_genes(adata, min_cells=10)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

Visualize the data

[5]:
sc.set_figure_params(figsize=(20,8))
sc.pl.embedding(adata, basis='spatial', color='annotation', size=5)
_images/Case_V_STAVAG_on_mouse_embryonic_development_data_8_0.png

Detect TVGs

coords should be the temporal, i.e. one-dimensional data.

[6]:
coords = np.array(adata.obs['time']).reshape(-1,1)
coord_dict = STAVAG.TVG_detection(adata, coords)
TVGs = list(coord_dict['T']['Feature'])

Gene modules detection

[7]:
Z, corr, df = STAVAG.gene_modules(adata, TVGs)
plt.figure(figsize=(30, 6))
ax = plt.gca()
labels = ax.get_xticklabels()
dendro = dendrogram(Z, labels=corr.columns, leaf_rotation=45)
ax = plt.gca()
labels = ax.get_xticklabels()

for i, label in enumerate(labels):
    if i % 3 != 0:
        label.set_visible(False)
plt.axhline(y=1, color='gray', linestyle='--', linewidth=1)
plt.title("Dendrogram", fontsize=25)
plt.ylabel("Distance", fontsize=22)
plt.xticks(fontsize=20, fontstyle='italic', rotation=90)
plt.tight_layout()
plt.show()
_images/Case_V_STAVAG_on_mouse_embryonic_development_data_13_0.png
[8]:
max_clusters = 17# Tune this parameter to control the fine granularity of clustering.
clusters = fcluster(Z, max_clusters, criterion='maxclust')
[9]:
sc.set_figure_params(figsize=(10,4))
[10]:
# Map clustering results to corresponding column names
cluster_labels = {name: cluster for name, cluster in zip(corr.columns, clusters)}

# Create an empty DataFrame for each cluster to store corresponding columns
cluster_groups = {i: pd.DataFrame() for i in np.unique(clusters)}

# Assign each column to its corresponding cluster DataFrame
for col in df.columns:
    cluster = cluster_labels[col]
    cluster_groups[cluster][col] = df[col]

# Calculate the average value across columns for each cluster
count = 1
for cluster, data in cluster_groups.items():
    mean_values = data.mean(axis=1)
    print('Module', count, '. Number of genes:',data.shape[1])

    # Normalize to the range 0–1
    min_value = mean_values.min()
    max_value = mean_values.max()
    normalized_mean_values = (mean_values - min_value) / (max_value - min_value)

    adata.obs[str(count)] = normalized_mean_values
    sc.pl.embedding(
        adata,
        basis='spatial',
        color=str(count),
        # ax=axes[idx],
        legend_loc=None,       # Remove legend
        show=False,
        cmap='RdBu_r',
        vmax=1,
        vmin=0,
        size=5,
        colorbar_loc=None,
        frameon=False
    )
    plt.title('')  # Clear title
    plt.tight_layout()
    # plt.savefig('module_' + str(count) + '.png', dpi=300)
    plt.show()
    count += 1
Module 1 . Number of genes: 11
_images/Case_V_STAVAG_on_mouse_embryonic_development_data_16_1.png
Module 2 . Number of genes: 14
_images/Case_V_STAVAG_on_mouse_embryonic_development_data_16_3.png
Module 3 . Number of genes: 28
_images/Case_V_STAVAG_on_mouse_embryonic_development_data_16_5.png
Module 4 . Number of genes: 39
_images/Case_V_STAVAG_on_mouse_embryonic_development_data_16_7.png
Module 5 . Number of genes: 48
_images/Case_V_STAVAG_on_mouse_embryonic_development_data_16_9.png
Module 6 . Number of genes: 11
_images/Case_V_STAVAG_on_mouse_embryonic_development_data_16_11.png
Module 7 . Number of genes: 24
_images/Case_V_STAVAG_on_mouse_embryonic_development_data_16_13.png
Module 8 . Number of genes: 30
_images/Case_V_STAVAG_on_mouse_embryonic_development_data_16_15.png
Module 9 . Number of genes: 4
_images/Case_V_STAVAG_on_mouse_embryonic_development_data_16_17.png
Module 10 . Number of genes: 15
_images/Case_V_STAVAG_on_mouse_embryonic_development_data_16_19.png
Module 11 . Number of genes: 37
_images/Case_V_STAVAG_on_mouse_embryonic_development_data_16_21.png
Module 12 . Number of genes: 6
_images/Case_V_STAVAG_on_mouse_embryonic_development_data_16_23.png
Module 13 . Number of genes: 28
_images/Case_V_STAVAG_on_mouse_embryonic_development_data_16_25.png
Module 14 . Number of genes: 10
_images/Case_V_STAVAG_on_mouse_embryonic_development_data_16_27.png
Module 15 . Number of genes: 2
_images/Case_V_STAVAG_on_mouse_embryonic_development_data_16_29.png
Module 16 . Number of genes: 4
_images/Case_V_STAVAG_on_mouse_embryonic_development_data_16_31.png
Module 17 . Number of genes: 47
_images/Case_V_STAVAG_on_mouse_embryonic_development_data_16_33.png

Export TVGs and corresponding modules

[11]:
gene_cluster = pd.DataFrame([TVGs,clusters]).T
gene_cluster.columns = ['gene', 'modules']
gene_cluster.head()
#gene_cluster.to_csv('xxxxx.csv')
[11]:
gene modules
0 Hbb-bh1 13
1 Cdk8 8
2 Hbb-bs 8
3 Gm42418 8
4 Hbb-y 10
[ ]: