In [6]:
# Set up sys.path so that 'src/spindle_dev' is importable as 'spindle_dev'
import sys
import importlib
from pathlib import Path

project_root = '/data/sarkar_lab/Projects/spindle_dev'
src_path = Path(project_root) / 'src'
src_entry = str(src_path)
if src_entry not in sys.path:
    sys.path.insert(0, src_entry)

import spindle_dev
# Reload so edits to the package are picked up without a kernel restart
importlib.reload(spindle_dev)
Out[6]:
<module 'spindle_dev' from '/data/sarkar_lab/Projects/spindle_dev/src/spindle_dev/__init__.py'>
In [7]:
import scanpy as sc
In [8]:
adata = sc.read_h5ad("/data/sarkar_lab/zenodo_data/2023_xenium_human_breast_cancer/adata.h5ad")
In [9]:
adata = adata[adata.obs.loc[adata.obs.Cluster != "Unlabeled"].index, :].copy()
In [10]:
sc.pl.spatial(adata, color=["Cluster"], spot_size=13)
/tmp/ipykernel_3030422/1942974121.py:1: FutureWarning: Use `squidpy.pl.spatial_scatter` instead.
  sc.pl.spatial(adata, color=["Cluster"], spot_size=13)
No description has been provided for this image
In [11]:
# Reload spindle_dev and its submodules to pick up local code edits
# without restarting the kernel.
import importlib
import spindle_dev
import spindle_dev.metrics as metrics
import spindle_dev.index as index
import spindle_dev.preprocessing as preprocessing
import spindle_dev.plotting as plotting
import spindle_dev.typing as typing
import spindle_dev.test as test
import spindle_dev.search as search

# Reload in dependency order (leaf modules first, then the package)
# so new symbols are available to dependents.
for _mod in (metrics, index, preprocessing, plotting, spindle_dev, typing, test):
    importlib.reload(_mod)
# Keep this last so the reloaded module is the cell's displayed output.
importlib.reload(search)
Out[11]:
<module 'spindle_dev.search' from '/data/sarkar_lab/Projects/spindle_dev/src/spindle_dev/search.py'>
In [12]:
def prepare_to_index(adata, resolution=0.2, min_final_size=20, max_final_size=100):
    """Tile the tissue, build per-tile covariance SPDs, and prepare them for indexing.

    Parameters
    ----------
    adata : AnnData with spatial coordinates in ``obsm['spatial']``.
    resolution : Leiden resolution for clustering the SPD matrices.
    min_final_size : lower size guard for the adaptive block runs.
    max_final_size : upper size guard for the adaptive block runs
        (previously hard-coded to 100; default preserves old behavior).

    Returns
    -------
    (tiles, tile_covs, genes_work, data, out_dict)
    """
    coords = adata.obsm["spatial"]
    # Quadtree tiling: split until each tile holds at most 200 spots.
    tiles = preprocessing.build_quadtree_tiles(coords, max_pts=200, min_side=0.0, max_depth=40)
    # G = n_vars keeps every gene, so the top-variance selection is a pass-through here.
    genes_work, gene_idx = preprocessing.topvar_genes(adata, G=adata.n_vars)
    tile_covs = preprocessing.build_tile_covs_full(adata, tiles, gene_idx, n_jobs=8, eps=1e-6)
    data = index.ProcessedData(tiles, tile_covs, genes_work, adata.n_obs)
    data.reduce_dim(num_pca_components=30, n_components=2, do_umap=True)
    data.cluster_spds(cluster_distance="tree", cluster_method="leiden", resolution=resolution)
    data.assign_label_to_spots()
    data.get_corr_mean_by_cluster()
    out_dict = data.get_adaptive_runs(
        find_blocks=True,
        with_size_guard=True,
        min_final_size=min_final_size,
        max_final_size=max_final_size,
    )
    return tiles, tile_covs, genes_work, data, out_dict
In [13]:
tiles, tile_covs, genes_work, data, out_dict = prepare_to_index(adata, min_final_size=15)
[2026-01-19 12:59:14,378] INFO spindle_dev.index: Clustering SPD-s using 'tree' distance.
[2026-01-19 12:59:14,393] INFO spindle_dev.index: Building ultrametric features from SPD matrices.
[2026-01-19 12:59:25,299] INFO spindle_dev.index: Computing latent features from the tree representations.
[2026-01-19 12:59:25,964] INFO spindle_dev.index: Reducing latent features to 30 dimensions using PCA.
[2026-01-19 12:59:35,508] INFO spindle_dev.index: Explained variance ratios by PCA components: [0.06878107 0.05380314 0.02646384 0.01915205 0.01300586 0.01025453
 0.00781812 0.00735731 0.00665263 0.00645236 0.00500298 0.00465103
 0.00423426 0.00416654 0.00382042 0.00364907 0.00355717 0.00348549
 0.00334294 0.00319313 0.00303316 0.00301945 0.00300855 0.00294676
 0.00288963 0.00281065 0.00275823 0.00274145 0.00271152 0.00265672]
[2026-01-19 12:59:35,510] INFO spindle_dev.index: Reducing latent features to 2 dimensions using UMAP.
/panfs/accrepfs.vampire/home/sarkah1/miniforge3/envs/spatial/lib/python3.10/site-packages/umap/umap_.py:1952: UserWarning: n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.
  warn(
[2026-01-19 12:59:47,595] INFO spindle_dev.index: Clustering SPD-s using 'tree' distance.
[2026-01-19 12:59:47,596] INFO spindle_dev.index: Clustering SPD matrices using Leiden clustering with resolution 0.20.
[2026-01-19 12:59:47,814] INFO spindle_dev.index: Since clustering method is tree, I am going to find global order per cluster
[2026-01-19 12:59:47,815] INFO spindle_dev.index: Finding consensus tree for cluster 0
[2026-01-19 12:59:47,916] INFO spindle_dev.index: Finding consensus tree for cluster 1
[2026-01-19 12:59:47,996] INFO spindle_dev.index: Finding consensus tree for cluster 2
[2026-01-19 12:59:48,068] INFO spindle_dev.index: Finding consensus tree for cluster 3
[2026-01-19 12:59:48,607] INFO spindle_dev.index: Computing mean correlation matrix for cluster 0
[2026-01-19 12:59:49,415] INFO spindle_dev.index: Computing mean correlation matrix for cluster 1
[2026-01-19 12:59:50,019] INFO spindle_dev.index: Computing mean correlation matrix for cluster 2
[2026-01-19 12:59:50,508] INFO spindle_dev.index: Computing mean correlation matrix for cluster 3
[2026-01-19 12:59:50,776] INFO spindle_dev.index: Finding adaptive block runs for cluster 0
[2026-01-19 12:59:51,045] INFO spindle_dev.index:  Chose t=0.7638580524895158 resulting in 200 blocks instead of 61 blocks would have gotten by default
[2026-01-19 12:59:51,076] INFO spindle_dev.index:  Final block runs for cluster 0: 16 blocks.
[2026-01-19 12:59:51,077] INFO spindle_dev.index: Finding adaptive block runs for cluster 1
[2026-01-19 12:59:51,351] INFO spindle_dev.index:  Chose t=0.8451053062066296 resulting in 159 blocks instead of 37 blocks would have gotten by default
[2026-01-19 12:59:51,370] INFO spindle_dev.index:  Final block runs for cluster 1: 14 blocks.
[2026-01-19 12:59:51,371] INFO spindle_dev.index: Finding adaptive block runs for cluster 2
[2026-01-19 12:59:51,641] INFO spindle_dev.index:  Chose t=0.7649764855224312 resulting in 188 blocks instead of 40 blocks would have gotten by default
[2026-01-19 12:59:51,669] INFO spindle_dev.index:  Final block runs for cluster 2: 16 blocks.
[2026-01-19 12:59:51,669] INFO spindle_dev.index: Finding adaptive block runs for cluster 3
[2026-01-19 12:59:51,939] INFO spindle_dev.index:  Chose t=0.8187994098657192 resulting in 220 blocks instead of 98 blocks would have gotten by default
[2026-01-19 12:59:51,977] INFO spindle_dev.index:  Final block runs for cluster 3: 15 blocks.
In [14]:
import matplotlib.pylab as plt
from matplotlib.patches import Rectangle
from matplotlib.collections import PatchCollection
In [15]:
import numpy as np
# Color each spot by its learned SPD-cluster label (data.spot_label: spot -> cluster id)
palette = sc.pl.palettes.default_20
point_colors = [palette[lab] for lab in data.spot_label.values()]
# NOTE(review): `labels` is not used in this cell.
labels = data.labels
# Cast keys to int for positional coordinate lookup — assumes keys index into
# adata.obsm['spatial'] rows; TODO confirm key dtype.
indices = np.array([int(i) for i in data.spot_label.keys()])
coords = adata.obsm['spatial']
plt.scatter(coords[indices,0], coords[indices,1], s=1, c=point_colors, lw=0, alpha=0.8)
# Overlay the quadtree tile boundaries as unfilled rectangles.
patches = []
for t in tiles:
    bbox = t.bbox if hasattr(t, "bbox") else t["bbox"]  # (xmin,ymin,xmax,ymax)
    x0, y0, x1, y1 = bbox
    patches.append(Rectangle((x0, y0), x1 - x0, y1 - y0, fill=False))
pc = PatchCollection(patches, match_original=True, linewidths=0.2, edgecolors='b', alpha=0.3)
# add collection
ax = plt.gca()
ax.add_collection(pc)
# Flip y so the orientation matches the image-style spatial coordinates.
ax.invert_yaxis()
ax.axis('off')

plt.savefig("/data/sarkar_lab/Projects/spindle_dev/results/hbreast_10X_wo_unlabeled/spatial_plot_with_cluster.png", dpi=300)
plt.show()
No description has been provided for this image
In [237]:
# Highlight cluster 3 spatially; all other spots are greyed out.
# NOTE(review): execution count (In[237]) is out of order — this cell was run
# after later cells; verify under Restart & Run All.
import numpy as np
palette = sc.pl.palettes.default_20
point_colors = [palette[lab] for lab in data.spot_label.values()]
labels = data.labels
indices = np.array([int(i) for i in data.spot_label.keys()])
coords = adata.obsm['spatial']

# Create new color array - grey for all except cluster 3
# NOTE(review): `idx` from the zip is unused; iterating values alone would do.
new_point_colors = []
for idx, lab in zip(indices, data.spot_label.values()):
    if lab == 3:
        new_point_colors.append(palette[lab])
    else:
        new_point_colors.append("#E9E6E6")  # grey

plt.scatter(coords[indices,0], coords[indices,1], s=1, c=new_point_colors, lw=0, alpha=0.8)
ax = plt.gca()
# Overlay the quadtree tile boundaries (faint, unfilled rectangles).
patches = []
for t in tiles:
    bbox = t.bbox if hasattr(t, "bbox") else t["bbox"]  # (xmin,ymin,xmax,ymax)
    x0, y0, x1, y1 = bbox
    patches.append(Rectangle((x0, y0), x1 - x0, y1 - y0, fill=False))
pc = PatchCollection(patches, match_original=True, linewidths=0.01, edgecolors='b', alpha=0.1)
# add collection
ax.add_collection(pc)
ax.invert_yaxis()
ax.axis('off')

plt.savefig("/data/sarkar_lab/Projects/spindle_dev/results/hbreast_10X_wo_unlabeled/spatial_plot_cluster3_only.png", dpi=300, bbox_inches='tight')
plt.show()
No description has been provided for this image
In [16]:
# Create colors for clusters
import numpy as np
import matplotlib.pyplot as plt
n_clusters = len(set(data.labels))
# Use Scanpy's default_20 palette for consistency
palette = sc.pl.palettes.default_20
cluster_colors = [palette[lab] for lab in data.labels]
# UMAP embedding of the SPD latent features, colored by cluster.
plt.scatter(data.latent['umap'][:,0], data.latent['umap'][:,1], s=6, lw=0, alpha=0.8, c=cluster_colors)
# Write label on the plot
# with background circle for better visibility
# and bold font
for i in range(n_clusters):
    # NOTE(review): boolean-mask indexing assumes data.labels is a numpy array — confirm.
    cluster_points = data.latent['umap'][data.labels == i]
    if len(cluster_points) == 0:
        continue
    x_mean = np.mean(cluster_points[:,0])
    y_mean = np.mean(cluster_points[:,1])
    # Place the cluster id at the cluster centroid, inside a white circle.
    plt.text(x_mean, y_mean, str(i), color='black', fontsize=10, ha='center', va='center', fontweight='bold',
             bbox=dict(facecolor='white', edgecolor='black', boxstyle='circle,pad=0.5', alpha=0.7))
# take away top and right border
ax = plt.gca()
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# remove axis ticks
ax.set_xticks([])
ax.set_yticks([])
plt.savefig("/data/sarkar_lab/Projects/spindle_dev/results/hbreast_10X_wo_unlabeled/umap_plot_with_cluster_labels.png", dpi=300)
plt.show()
No description has been provided for this image
In [17]:
# Choose adaptive epsilons per cluster for the epsilon-net index:
# a per-block (elbow) estimate and a single cluster-level value.
# NOTE(review): `eps_per_block` is unpacked but never used afterwards.
epsilon_block_wise_dict = {}
epsilon_dict = {}
for cluster_id in set(data.labels):
    eps_per_block, eps_elbow_per_block, eps = index.choose_adaptive_epsilons(data, cluster_id, k_target_per_block=64)
    epsilon_block_wise_dict[int(cluster_id)] = eps_elbow_per_block
    epsilon_dict[int(cluster_id)] = eps
In [ ]:
 
In [18]:
# Configure the index: per-block elbow epsilons with epsilon-net clustering.
config = typing.IndexConfig()
config.epsilon_dict = epsilon_dict
config.epsilon_block_wise_dict = epsilon_block_wise_dict
config.threshold_type = 'block_wise'
config.kmean_method = 'epsilon_net'
In [19]:
epsilon_block_wise_dict
Out[19]:
{0: {0: 7.551487154783578,
  1: 7.688574605978839,
  2: 8.348619550689628,
  3: 8.12886229785936,
  4: 6.941772257682238,
  5: 7.11207711085268,
  6: 7.4634645865588745,
  7: 8.12941780999703,
  8: 7.394765319810765,
  9: 6.714566808424555,
  10: 7.174724333051149,
  11: 6.700368214926148,
  12: 6.223964337518237,
  13: 5.151625529484852,
  14: 4.577674131656347,
  15: 9.7037491104398},
 1: {0: 7.34932238515324,
  1: 8.493449641575141,
  2: 8.901136453589094,
  3: 7.8762835362284145,
  4: 7.151354147881101,
  5: 7.800650669737451,
  6: 8.425077404859794,
  7: 8.975881255696738,
  8: 8.588472145359653,
  9: 5.793292280974387,
  10: 9.691527557373046,
  11: 9.951465035398689,
  12: 6.811198569674318,
  13: 4.351012706756592},
 2: {0: 6.799333509719286,
  1: 8.557316793247027,
  2: 8.452427434089639,
  3: 7.588833091125586,
  4: 6.785369982212529,
  5: 5.5096939776630975,
  6: 5.248055444220603,
  7: 5.746564254670582,
  8: 5.975592355011169,
  9: 3.1322101267631437,
  10: 4.466462934560956,
  11: 8.84131131853376,
  12: 6.445534355120787,
  13: 5.415287845182453,
  14: 6.687208104027927,
  15: 7.126931829888523},
 3: {0: 7.388558401037387,
  1: 7.783292293548584,
  2: 8.289245743649037,
  3: 7.992608086706223,
  4: 7.854193064200638,
  5: 8.034181459182257,
  6: 7.878430512484119,
  7: 8.066423324555597,
  8: 9.167032732265142,
  9: 7.850585689461382,
  10: 8.363619363357321,
  11: 6.642502405394938,
  12: 10.199246521661854,
  13: 8.335221526620284,
  14: 8.539956410725912}}
In [20]:
# Build the per-cluster block-DAG index from the processed data and config.
dag_dict, stat, dist_list = index.index_spds(data, config=config)
[2026-01-19 12:59:59,265] INFO spindle_dev.index: Processing cluster 0
[2026-01-19 12:59:59,266] INFO spindle_dev.index: Building SPD index with epsilon=7.570320670118333
[2026-01-19 12:59:59,267] INFO spindle_dev.index: Step 1: Cluster blocks within each class of SPD matrices.
[2026-01-19 12:59:59,300] INFO spindle_dev.index: Cluster 0: 733 SPDs, 16 blocks
[2026-01-19 12:59:59,301] INFO spindle_dev.index:  Using epsilon-net clustering for block 0
[2026-01-19 12:59:59,356] INFO spindle_dev.index:  Finished block 0 in 0.05 seconds, found 2 clusters.
[2026-01-19 12:59:59,356] INFO spindle_dev.index:  Using epsilon-net clustering for block 1
[2026-01-19 12:59:59,412] INFO spindle_dev.index:  Finished block 1 in 0.06 seconds, found 5 clusters.
[2026-01-19 12:59:59,413] INFO spindle_dev.index:  Using epsilon-net clustering for block 2
[2026-01-19 12:59:59,458] INFO spindle_dev.index:  Finished block 2 in 0.05 seconds, found 3 clusters.
[2026-01-19 12:59:59,459] INFO spindle_dev.index:  Using epsilon-net clustering for block 3
[2026-01-19 12:59:59,506] INFO spindle_dev.index:  Finished block 3 in 0.05 seconds, found 3 clusters.
[2026-01-19 12:59:59,507] INFO spindle_dev.index:  Using epsilon-net clustering for block 4
[2026-01-19 12:59:59,555] INFO spindle_dev.index:  Finished block 4 in 0.05 seconds, found 3 clusters.
[2026-01-19 12:59:59,556] INFO spindle_dev.index:  Using epsilon-net clustering for block 5
[2026-01-19 12:59:59,605] INFO spindle_dev.index:  Finished block 5 in 0.05 seconds, found 3 clusters.
[2026-01-19 12:59:59,606] INFO spindle_dev.index:  Using epsilon-net clustering for block 6
[2026-01-19 12:59:59,664] INFO spindle_dev.index:  Finished block 6 in 0.06 seconds, found 2 clusters.
[2026-01-19 12:59:59,665] INFO spindle_dev.index:  Using epsilon-net clustering for block 7
[2026-01-19 12:59:59,708] INFO spindle_dev.index:  Finished block 7 in 0.04 seconds, found 2 clusters.
[2026-01-19 12:59:59,709] INFO spindle_dev.index:  Using epsilon-net clustering for block 8
[2026-01-19 12:59:59,758] INFO spindle_dev.index:  Finished block 8 in 0.05 seconds, found 3 clusters.
[2026-01-19 12:59:59,759] INFO spindle_dev.index:  Using epsilon-net clustering for block 9
[2026-01-19 12:59:59,814] INFO spindle_dev.index:  Finished block 9 in 0.06 seconds, found 4 clusters.
[2026-01-19 12:59:59,815] INFO spindle_dev.index:  Using epsilon-net clustering for block 10
[2026-01-19 12:59:59,883] INFO spindle_dev.index:  Finished block 10 in 0.07 seconds, found 5 clusters.
[2026-01-19 12:59:59,884] INFO spindle_dev.index:  Using epsilon-net clustering for block 11
[2026-01-19 12:59:59,954] INFO spindle_dev.index:  Finished block 11 in 0.07 seconds, found 2 clusters.
[2026-01-19 12:59:59,954] INFO spindle_dev.index:  Using epsilon-net clustering for block 12
[2026-01-19 13:00:00,029] INFO spindle_dev.index:  Finished block 12 in 0.07 seconds, found 2 clusters.
[2026-01-19 13:00:00,030] INFO spindle_dev.index:  Using epsilon-net clustering for block 13
[2026-01-19 13:00:00,076] INFO spindle_dev.index:  Finished block 13 in 0.05 seconds, found 2 clusters.
[2026-01-19 13:00:00,077] INFO spindle_dev.index:  Using epsilon-net clustering for block 14
[2026-01-19 13:00:00,128] INFO spindle_dev.index:  Finished block 14 in 0.05 seconds, found 3 clusters.
[2026-01-19 13:00:00,129] INFO spindle_dev.index:  Using epsilon-net clustering for block 15
[2026-01-19 13:00:00,338] INFO spindle_dev.index:  Finished block 15 in 0.21 seconds, found 1 clusters.
[2026-01-19 13:00:00,339] INFO spindle_dev.index: Step 2: Build DAG connections between block clusters.
[2026-01-19 13:00:00,339] INFO spindle_dev.index: Step 2.1: For each layer order the block-clusters by
[2026-01-19 13:00:00,339] INFO spindle_dev.index: Not implemented: ordering block-clusters ? How to order them?
[2026-01-19 13:00:00,340] INFO spindle_dev.index: We will use triangle inequality to order clusters.
[2026-01-19 13:00:00,340] INFO spindle_dev.index: Step 2.2: Connect block-clusters between layers based on co-occurrence in SPDs.
[2026-01-19 13:00:00,346] INFO spindle_dev.index: Check if node global_node_id matches index in nodes list
[2026-01-19 13:00:00,347] INFO spindle_dev.index: Step 2.3: Ordering block-clusters within each layer using log-Euclidean distances.
[2026-01-19 13:00:00,347] INFO spindle_dev.index: Processing cluster 1
[2026-01-19 13:00:00,347] INFO spindle_dev.index: Building SPD index with epsilon=8.204116141691717
[2026-01-19 13:00:00,348] INFO spindle_dev.index: Step 1: Cluster blocks within each class of SPD matrices.
[2026-01-19 13:00:00,375] INFO spindle_dev.index: Cluster 1: 567 SPDs, 14 blocks
[2026-01-19 13:00:00,375] INFO spindle_dev.index:  Using epsilon-net clustering for block 0
[2026-01-19 13:00:00,416] INFO spindle_dev.index:  Finished block 0 in 0.04 seconds, found 4 clusters.
[2026-01-19 13:00:00,417] INFO spindle_dev.index:  Using epsilon-net clustering for block 1
[2026-01-19 13:00:00,457] INFO spindle_dev.index:  Finished block 1 in 0.04 seconds, found 4 clusters.
[2026-01-19 13:00:00,458] INFO spindle_dev.index:  Using epsilon-net clustering for block 2
[2026-01-19 13:00:00,490] INFO spindle_dev.index:  Finished block 2 in 0.03 seconds, found 2 clusters.
[2026-01-19 13:00:00,491] INFO spindle_dev.index:  Using epsilon-net clustering for block 3
[2026-01-19 13:00:00,538] INFO spindle_dev.index:  Finished block 3 in 0.05 seconds, found 5 clusters.
[2026-01-19 13:00:00,538] INFO spindle_dev.index:  Using epsilon-net clustering for block 4
[2026-01-19 13:00:00,622] INFO spindle_dev.index:  Finished block 4 in 0.08 seconds, found 10 clusters.
[2026-01-19 13:00:00,622] INFO spindle_dev.index:  Using epsilon-net clustering for block 5
[2026-01-19 13:00:00,679] INFO spindle_dev.index:  Finished block 5 in 0.06 seconds, found 3 clusters.
[2026-01-19 13:00:00,679] INFO spindle_dev.index:  Using epsilon-net clustering for block 6
[2026-01-19 13:00:00,794] INFO spindle_dev.index:  Finished block 6 in 0.11 seconds, found 3 clusters.
[2026-01-19 13:00:00,795] INFO spindle_dev.index:  Using epsilon-net clustering for block 7
[2026-01-19 13:00:00,862] INFO spindle_dev.index:  Finished block 7 in 0.07 seconds, found 2 clusters.
[2026-01-19 13:00:00,862] INFO spindle_dev.index:  Using epsilon-net clustering for block 8
[2026-01-19 13:00:00,932] INFO spindle_dev.index:  Finished block 8 in 0.07 seconds, found 3 clusters.
[2026-01-19 13:00:00,933] INFO spindle_dev.index:  Using epsilon-net clustering for block 9
[2026-01-19 13:00:00,976] INFO spindle_dev.index:  Finished block 9 in 0.04 seconds, found 4 clusters.
[2026-01-19 13:00:00,977] INFO spindle_dev.index:  Using epsilon-net clustering for block 10
[2026-01-19 13:00:01,036] INFO spindle_dev.index:  Finished block 10 in 0.06 seconds, found 2 clusters.
[2026-01-19 13:00:01,036] INFO spindle_dev.index:  Using epsilon-net clustering for block 11
[2026-01-19 13:00:01,115] INFO spindle_dev.index:  Finished block 11 in 0.08 seconds, found 2 clusters.
[2026-01-19 13:00:01,116] INFO spindle_dev.index:  Using epsilon-net clustering for block 12
[2026-01-19 13:00:01,196] INFO spindle_dev.index:  Finished block 12 in 0.08 seconds, found 6 clusters.
[2026-01-19 13:00:01,196] INFO spindle_dev.index:  Using epsilon-net clustering for block 13
[2026-01-19 13:00:01,255] INFO spindle_dev.index:  Finished block 13 in 0.06 seconds, found 7 clusters.
[2026-01-19 13:00:01,256] INFO spindle_dev.index: Step 2: Build DAG connections between block clusters.
[2026-01-19 13:00:01,256] INFO spindle_dev.index: Step 2.1: For each layer order the block-clusters by
[2026-01-19 13:00:01,256] INFO spindle_dev.index: Not implemented: ordering block-clusters ? How to order them?
[2026-01-19 13:00:01,257] INFO spindle_dev.index: We will use triangle inequality to order clusters.
[2026-01-19 13:00:01,257] INFO spindle_dev.index: Step 2.2: Connect block-clusters between layers based on co-occurrence in SPDs.
[2026-01-19 13:00:01,261] INFO spindle_dev.index: Check if node global_node_id matches index in nodes list
[2026-01-19 13:00:01,262] INFO spindle_dev.index: Step 2.3: Ordering block-clusters within each layer using log-Euclidean distances.
[2026-01-19 13:00:01,262] INFO spindle_dev.index: Processing cluster 2
[2026-01-19 13:00:01,262] INFO spindle_dev.index: Building SPD index with epsilon=6.784064466997531
[2026-01-19 13:00:01,263] INFO spindle_dev.index: Step 1: Cluster blocks within each class of SPD matrices.
[2026-01-19 13:00:01,286] INFO spindle_dev.index: Cluster 2: 504 SPDs, 16 blocks
[2026-01-19 13:00:01,286] INFO spindle_dev.index:  Using epsilon-net clustering for block 0
[2026-01-19 13:00:01,335] INFO spindle_dev.index:  Finished block 0 in 0.05 seconds, found 5 clusters.
[2026-01-19 13:00:01,336] INFO spindle_dev.index:  Using epsilon-net clustering for block 1
[2026-01-19 13:00:01,364] INFO spindle_dev.index:  Finished block 1 in 0.03 seconds, found 2 clusters.
[2026-01-19 13:00:01,364] INFO spindle_dev.index:  Using epsilon-net clustering for block 2
[2026-01-19 13:00:01,393] INFO spindle_dev.index:  Finished block 2 in 0.03 seconds, found 2 clusters.
[2026-01-19 13:00:01,394] INFO spindle_dev.index:  Using epsilon-net clustering for block 3
[2026-01-19 13:00:01,424] INFO spindle_dev.index:  Finished block 3 in 0.03 seconds, found 2 clusters.
[2026-01-19 13:00:01,424] INFO spindle_dev.index:  Using epsilon-net clustering for block 4
[2026-01-19 13:00:01,464] INFO spindle_dev.index:  Finished block 4 in 0.04 seconds, found 3 clusters.
[2026-01-19 13:00:01,465] INFO spindle_dev.index:  Using epsilon-net clustering for block 5
[2026-01-19 13:00:01,507] INFO spindle_dev.index:  Finished block 5 in 0.04 seconds, found 5 clusters.
[2026-01-19 13:00:01,508] INFO spindle_dev.index:  Using epsilon-net clustering for block 6
[2026-01-19 13:00:01,552] INFO spindle_dev.index:  Finished block 6 in 0.04 seconds, found 4 clusters.
[2026-01-19 13:00:01,552] INFO spindle_dev.index:  Using epsilon-net clustering for block 7
[2026-01-19 13:00:01,583] INFO spindle_dev.index:  Finished block 7 in 0.03 seconds, found 2 clusters.
[2026-01-19 13:00:01,584] INFO spindle_dev.index:  Using epsilon-net clustering for block 8
[2026-01-19 13:00:01,627] INFO spindle_dev.index:  Finished block 8 in 0.04 seconds, found 5 clusters.
[2026-01-19 13:00:01,627] INFO spindle_dev.index:  Using epsilon-net clustering for block 9
[2026-01-19 13:00:01,682] INFO spindle_dev.index:  Finished block 9 in 0.05 seconds, found 8 clusters.
[2026-01-19 13:00:01,682] INFO spindle_dev.index:  Using epsilon-net clustering for block 10
[2026-01-19 13:00:01,754] INFO spindle_dev.index:  Finished block 10 in 0.07 seconds, found 4 clusters.
[2026-01-19 13:00:01,755] INFO spindle_dev.index:  Using epsilon-net clustering for block 11
[2026-01-19 13:00:01,904] INFO spindle_dev.index:  Finished block 11 in 0.15 seconds, found 4 clusters.
[2026-01-19 13:00:01,905] INFO spindle_dev.index:  Using epsilon-net clustering for block 12
[2026-01-19 13:00:01,952] INFO spindle_dev.index:  Finished block 12 in 0.05 seconds, found 4 clusters.
[2026-01-19 13:00:01,953] INFO spindle_dev.index:  Using epsilon-net clustering for block 13
[2026-01-19 13:00:02,001] INFO spindle_dev.index:  Finished block 13 in 0.05 seconds, found 5 clusters.
[2026-01-19 13:00:02,001] INFO spindle_dev.index:  Using epsilon-net clustering for block 14
[2026-01-19 13:00:02,043] INFO spindle_dev.index:  Finished block 14 in 0.04 seconds, found 4 clusters.
[2026-01-19 13:00:02,044] INFO spindle_dev.index:  Using epsilon-net clustering for block 15
[2026-01-19 13:00:02,083] INFO spindle_dev.index:  Finished block 15 in 0.04 seconds, found 3 clusters.
[2026-01-19 13:00:02,084] INFO spindle_dev.index: Step 2: Build DAG connections between block clusters.
[2026-01-19 13:00:02,084] INFO spindle_dev.index: Step 2.1: For each layer order the block-clusters by
[2026-01-19 13:00:02,085] INFO spindle_dev.index: Not implemented: ordering block-clusters ? How to order them?
[2026-01-19 13:00:02,085] INFO spindle_dev.index: We will use triangle inequality to order clusters.
[2026-01-19 13:00:02,085] INFO spindle_dev.index: Step 2.2: Connect block-clusters between layers based on co-occurrence in SPDs.
[2026-01-19 13:00:02,090] INFO spindle_dev.index: Check if node global_node_id matches index in nodes list
[2026-01-19 13:00:02,090] INFO spindle_dev.index: Step 2.3: Ordering block-clusters within each layer using log-Euclidean distances.
[2026-01-19 13:00:02,090] INFO spindle_dev.index: Processing cluster 3
[2026-01-19 13:00:02,091] INFO spindle_dev.index: Building SPD index with epsilon=8.425444379709184
[2026-01-19 13:00:02,091] INFO spindle_dev.index: Step 1: Cluster blocks within each class of SPD matrices.
[2026-01-19 13:00:02,105] INFO spindle_dev.index: Cluster 3: 277 SPDs, 15 blocks
[2026-01-19 13:00:02,105] INFO spindle_dev.index:  Using epsilon-net clustering for block 0
[2026-01-19 13:00:02,125] INFO spindle_dev.index:  Finished block 0 in 0.02 seconds, found 3 clusters.
[2026-01-19 13:00:02,125] INFO spindle_dev.index:  Using epsilon-net clustering for block 1
[2026-01-19 13:00:02,142] INFO spindle_dev.index:  Finished block 1 in 0.02 seconds, found 3 clusters.
[2026-01-19 13:00:02,143] INFO spindle_dev.index:  Using epsilon-net clustering for block 2
[2026-01-19 13:00:02,157] INFO spindle_dev.index:  Finished block 2 in 0.01 seconds, found 2 clusters.
[2026-01-19 13:00:02,158] INFO spindle_dev.index:  Using epsilon-net clustering for block 3
[2026-01-19 13:00:02,179] INFO spindle_dev.index:  Finished block 3 in 0.02 seconds, found 3 clusters.
[2026-01-19 13:00:02,179] INFO spindle_dev.index:  Using epsilon-net clustering for block 4
[2026-01-19 13:00:02,197] INFO spindle_dev.index:  Finished block 4 in 0.02 seconds, found 3 clusters.
[2026-01-19 13:00:02,197] INFO spindle_dev.index:  Using epsilon-net clustering for block 5
[2026-01-19 13:00:02,215] INFO spindle_dev.index:  Finished block 5 in 0.02 seconds, found 3 clusters.
[2026-01-19 13:00:02,216] INFO spindle_dev.index:  Using epsilon-net clustering for block 6
[2026-01-19 13:00:02,242] INFO spindle_dev.index:  Finished block 6 in 0.03 seconds, found 4 clusters.
[2026-01-19 13:00:02,243] INFO spindle_dev.index:  Using epsilon-net clustering for block 7
[2026-01-19 13:00:02,259] INFO spindle_dev.index:  Finished block 7 in 0.02 seconds, found 2 clusters.
[2026-01-19 13:00:02,260] INFO spindle_dev.index:  Using epsilon-net clustering for block 8
[2026-01-19 13:00:02,276] INFO spindle_dev.index:  Finished block 8 in 0.02 seconds, found 2 clusters.
[2026-01-19 13:00:02,277] INFO spindle_dev.index:  Using epsilon-net clustering for block 9
[2026-01-19 13:00:02,298] INFO spindle_dev.index:  Finished block 9 in 0.02 seconds, found 2 clusters.
[2026-01-19 13:00:02,299] INFO spindle_dev.index:  Using epsilon-net clustering for block 10
[2026-01-19 13:00:02,316] INFO spindle_dev.index:  Finished block 10 in 0.02 seconds, found 2 clusters.
[2026-01-19 13:00:02,317] INFO spindle_dev.index:  Using epsilon-net clustering for block 11
[2026-01-19 13:00:02,336] INFO spindle_dev.index:  Finished block 11 in 0.02 seconds, found 3 clusters.
[2026-01-19 13:00:02,336] INFO spindle_dev.index:  Using epsilon-net clustering for block 12
[2026-01-19 13:00:02,412] INFO spindle_dev.index:  Finished block 12 in 0.08 seconds, found 2 clusters.
[2026-01-19 13:00:02,412] INFO spindle_dev.index:  Using epsilon-net clustering for block 13
[2026-01-19 13:00:02,441] INFO spindle_dev.index:  Finished block 13 in 0.03 seconds, found 2 clusters.
[2026-01-19 13:00:02,441] INFO spindle_dev.index:  Using epsilon-net clustering for block 14
[2026-01-19 13:00:02,487] INFO spindle_dev.index:  Finished block 14 in 0.05 seconds, found 2 clusters.
[2026-01-19 13:00:02,487] INFO spindle_dev.index: Step 2: Build DAG connections between block clusters.
[2026-01-19 13:00:02,488] INFO spindle_dev.index: Step 2.1: For each layer order the block-clusters by
[2026-01-19 13:00:02,488] INFO spindle_dev.index: Not implemented: ordering block-clusters ? How to order them?
[2026-01-19 13:00:02,488] INFO spindle_dev.index: We will use triangle inequality to order clusters.
[2026-01-19 13:00:02,489] INFO spindle_dev.index: Step 2.2: Connect block-clusters between layers based on co-occurrence in SPDs.
[2026-01-19 13:00:02,491] INFO spindle_dev.index: Check if node global_node_id matches index in nodes list
[2026-01-19 13:00:02,492] INFO spindle_dev.index: Step 2.3: Ordering block-clusters within each layer using log-Euclidean distances.
In [21]:
search_cfg = search.SearchConfig(max_results=2, debug=False, max_failed_starts=5, max_failed_paths=10)
In [22]:
test_results = test.run_sanity_search(data, dag_dict, config, search_cfg, max_queries=2000, skip_baseline=True)
[2026-01-19 13:00:02,893] INFO spindle_dev.test: Built ground-truth paths for 733 SPDs in cluster 0 across 16 blocks.
[2026-01-19 13:00:03,058] INFO spindle_dev.test: Built ground-truth paths for 567 SPDs in cluster 1 across 14 blocks.
[2026-01-19 13:00:03,208] INFO spindle_dev.test: Built ground-truth paths for 504 SPDs in cluster 2 across 16 blocks.
[2026-01-19 13:00:03,253] INFO spindle_dev.test: Built ground-truth paths for 277 SPDs in cluster 3 across 15 blocks.
[2026-01-19 13:00:03,254] INFO spindle_dev.test: Created ground-truth paths for all clusters.
[2026-01-19 13:00:03,254] INFO spindle_dev.test: Search config: SearchConfig(max_results=2, max_failed_starts=5, max_failed_paths=10, total_paths_limit=1000, deterministic=DeterministicConfig(seed=0), debug=False)
[2026-01-19 13:00:03,255] INFO spindle_dev.test: Running sanity search with 2000 queries.
100%|██████████| 2000/2000 [04:41<00:00,  7.11it/s]
[2026-01-19 13:04:44,476] INFO spindle_dev.test: Sanity search: 2000 queries, 2000 exact path matches, 2000 leaf matches, mean search_time=0.1399s
In [181]:
# Sankey view of the block DAG for cluster 2.
# NOTE(review): execution count (In[181]) is out of order relative to neighbors.
fig = plotting.visualize_block_dag_sankey_scaled(dag_dict[2].nodes, height=800, thickness=20.0)
fig
In [24]:
#fig.write_image("/data/sarkar_lab/Projects/spindle_dev/ISMB_notebook/figures/breast_cluster_0_block_dag_sankey.png",  engine="kaleido", width=2000, height=1200, scale=2)
In [25]:
import pandas as pd
def get_dag_stats(dag_dict):
    """Collect per-node statistics from each cluster's block DAG.

    Parameters
    ----------
    dag_dict : mapping of cluster id -> DAG object exposing a ``nodes`` list.
        Each node must provide ``global_node_id``, ``block_index``,
        ``block_cluster_id`` and a ``metadata`` object with ``members``
        (member SPD indices), ``mean`` (square matrix) and ``radius``.

    Returns
    -------
    pd.DataFrame with one row per DAG node.
    """
    records = []
    for cluster_id, cluster_dag in dag_dict.items():
        for node in cluster_dag.nodes:
            records.append({
                "cluster_id": cluster_id,
                "node_id": node.global_node_id,
                "block_id": node.block_index,
                "block_cluster_id": node.block_cluster_id,
                "num_spds": len(node.metadata.members),
                "radius": node.metadata.radius,
                # side length of the (square) mean matrix stored on the node
                "block_size": node.metadata.mean.shape[0],
            })
    return pd.DataFrame(records)
In [26]:
dag_stats_df = get_dag_stats(dag_dict)
In [117]:
importlib.reload(index)
Out[117]:
<module 'spindle_dev.index' from '/data/sarkar_lab/Projects/spindle_dev/src/spindle_dev/index.py'>
In [118]:
# Score every DAG node, cluster by cluster (uses representative means as references).
# NOTE(review): `cluster_dag` is assigned but unused in this loop.
score_list = []
for cluster_id in set(data.labels):
    cluster_dag = dag_dict[cluster_id]
    cluster_score = index.score_nodes_from_a_cluster(data, cluster_id, dag_dict, take_representative_mean=True)
    score_list.extend(cluster_score)
100%|██████████| 45/45 [00:00<00:00, 326.93it/s]
100%|██████████| 57/57 [00:00<00:00, 228.34it/s]
100%|██████████| 62/62 [00:00<00:00, 243.52it/s]
100%|██████████| 38/38 [00:00<00:00, 282.96it/s]
In [119]:
# Flatten the per-node score records into a DataFrame for ranking.
node_scores = [
    {
        "cluster_id": s["cluster_id"],
        "block_id": s["block_id"],
        "node_id": s["node_id"],
        "node_score": s["node_score"],
        "num_spds": s["num_spds"],
        "block_size": s["block_size"],
        "E": s["E"],
        "Q": s["Q"],
        "S": s["S"],
        "num_modules": len(s["modules"]),
    }
    for s in score_list
]
node_score_df = pd.DataFrame(node_scores)
In [121]:
node_score_df.sort_values(by='node_score', ascending=False).head(20)
Out[121]:
cluster_id block_id node_id node_score num_spds block_size E Q S num_modules
75 1 6 30 0.239966 1 38 1.932651 0.254274 0.511693 3
146 2 11 44 0.165974 1 49 1.627859 0.164071 0.378574 4
5 0 1 5 0.159690 91 15 0.774484 0.390674 0.472224 4
94 1 12 49 0.158631 13 26 1.273377 0.231421 0.461699 4
74 1 6 29 0.156417 5 38 1.526276 0.186053 0.449176 6
156 2 13 54 0.150369 2 18 0.935547 0.288826 0.443511 5
60 1 4 15 0.148257 397 21 0.976389 0.243900 0.377444 7
145 2 11 43 0.148110 1 49 2.225433 0.125890 0.471341 4
89 1 12 44 0.140268 543 26 1.321063 0.188080 0.435467 3
173 3 3 9 0.136184 49 20 0.794279 0.282274 0.392594 4
80 1 8 35 0.132119 1 27 1.412164 0.179984 0.480190 4
79 1 8 34 0.131210 2 27 1.464330 0.189470 0.527084 4
198 3 13 34 0.125699 266 26 0.862653 0.251575 0.420804 3
201 3 14 37 0.113385 11 36 1.961125 0.121871 0.525596 3
52 1 1 7 0.105444 91 15 0.571309 0.322258 0.427276 5
86 1 10 41 0.104808 3 25 1.835981 0.124215 0.540434 3
131 2 8 29 0.104168 42 15 0.792322 0.248719 0.471405 3
77 1 7 32 0.101813 3 28 2.014848 0.139933 0.638893 4
91 1 12 46 0.101234 1 26 1.344811 0.142968 0.473469 5
97 1 13 52 0.100772 1 16 1.536831 0.159211 0.588153 2
In [156]:
importlib.reload(plotting)
Out[156]:
<module 'spindle_dev.plotting' from '/data/sarkar_lab/Projects/spindle_dev/src/spindle_dev/plotting.py'>
In [217]:
def get_module_info(id):
    """
    Return metadata + L_mean/index_handle WITHOUT making any plots.

    Looks up the `id`-th row of ``node_score_df``, resolves its block via
    ``dag_stats_df``, and fetches the representative log-mean reference for
    that block from the per-cluster index.

    Returns (cluster_id, node_id, block_id, L_mean, index_handle).
    Relies on notebook globals: node_score_df, dag_stats_df, dag_dict, index.
    """
    row = node_score_df.iloc[id]
    cluster_id = int(row["cluster_id"])
    node_id = int(row["node_id"])

    node_mask = (dag_stats_df["cluster_id"] == cluster_id) & (dag_stats_df["node_id"] == node_id)
    block_id = int(dag_stats_df.loc[node_mask, "block_id"].values[0])

    print(f"Top scored node is in cluster {cluster_id}, node {node_id}, block {block_id}")

    # Most frequent center (kept for your debug / parity, not strictly needed
    # for the L_mean retrieval below).
    block_mask = (dag_stats_df["cluster_id"] == cluster_id) & (dag_stats_df["block_id"] == block_id)
    _ = (
        dag_stats_df.loc[block_mask]
        .sort_values(by="num_spds", ascending=False)
        .node_id.values[0]
    )

    index_handle = dag_dict[cluster_id]

    # Representative log-mean reference for this block
    L_mean = index.get_node_log_ref(index_handle, block_id, use_representative_mean=True)

    return cluster_id, node_id, block_id, L_mean, index_handle
def _fig_to_rgb_array(fig):
    """
    Rasterize a matplotlib figure to an RGB uint8 array.

    Renders the figure to an in-memory PNG and decodes it with PIL,
    dropping the alpha channel if present.

    Parameters
    ----------
    fig : matplotlib.figure.Figure

    Returns
    -------
    np.ndarray of shape (H, W, 3).
    """
    from io import BytesIO
    from PIL import Image

    # Ensure figure is rendered; if the backend cannot draw yet, savefig
    # below will trigger the draw itself.
    try:
        fig.canvas.draw()
    except Exception:
        # Was a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit; narrow it to Exception.
        pass

    buf = BytesIO()
    fig.savefig(buf, format='png', dpi=100, bbox_inches='tight', pad_inches=0.05)
    buf.seek(0)
    img = np.array(Image.open(buf))
    # Guard on ndim too: the original indexed shape[2] unconditionally and
    # would raise on a grayscale (2-D) decode. RGBA -> RGB.
    if img.ndim == 3 and img.shape[2] == 4:
        img = img[..., :3]
    buf.close()
    return img

def _as_figure(fig_or_tuple):
    # Accept fig, or (fig, ax), or (fig, axs), etc.
    if isinstance(fig_or_tuple, tuple):
        return fig_or_tuple[0]
    return fig_or_tuple

def plot_module_heatmap_plus_spatial(id, module_ids=(0, 1), zero_corr_threshold=0.05):
    """
    One panel:
      - Left: corr heatmap with modules (auto-sized based on L_mean.shape[0])
      - Right: spatial module score plots for module_ids (stacked, constant size)

    Parameters
    ----------
    id : positional row in ``node_score_df`` selecting the node to plot.
        NOTE(review): shadows the ``id`` builtin; kept for caller compatibility.
    module_ids : module indices drawn in the right column. Exactly two axes
        are created, so ``zip`` silently ignores any ids beyond the first two.
    zero_corr_threshold : passed through to the heatmap's zero-correlation filter.

    Returns
    -------
    (scores_final, fig) : spot scores of the LAST module in ``module_ids``
        only, and the combined matplotlib figure.

    Relies on notebook globals: node_score_df, dag_stats_df, dag_dict,
    score_list, data, tiles, adata, index, plotting, np, plt.
    """
    # Get metadata: resolve (cluster, node) from the score table, then the
    # node's block from dag_stats_df.
    cluster_id = int(node_score_df.iloc[id]["cluster_id"])
    node_id = int(node_score_df.iloc[id]["node_id"])
    block_id = int(
        dag_stats_df.loc[
            (dag_stats_df["cluster_id"] == cluster_id) & (dag_stats_df["node_id"] == node_id),
            "block_id",
        ].values[0]
    )
    index_handle = dag_dict[cluster_id]
    # Representative log-mean reference for this block; its length p drives
    # the heatmap panel sizing below.
    L_mean = index.get_node_log_ref(index_handle, block_id, use_representative_mean=True)
    p = int(L_mean.shape[0])

    print(f"Creating combined figure for cluster {cluster_id}, node {node_id}, block {block_id}")

    # Make the heatmap in a throwaway figure, rasterize it, and embed the
    # pixels in the combined figure (avoids re-plotting into a sub-axis).
    fig_hm, ax_hm, ordered_genes, gene_to_mod = plotting.plot_corr_heatmap_with_modules(
        score_list[id],
        show_values=False,
        value_fontsize=10,
        bold_labels=True,
        figsize=(8, 7),
        filter_zero_correlation=True,
        zero_corr_threshold=zero_corr_threshold,
    )
    hm_img = _fig_to_rgb_array(fig_hm)
    plt.close(fig_hm)  # free the intermediate figure immediately

    # Build the combined figure; panel sizes scale with gene count p,
    # clamped to sane minimums.
    heat_w = max(6.5, 0.25 * p)
    heat_h = max(5.0, 0.21 * p)
    spatial_w = 5.5
    total_w = heat_w + spatial_w
    total_h = max(heat_h, 6.5)

    fig = plt.figure(figsize=(total_w, total_h), constrained_layout=False)
    gs = fig.add_gridspec(
        nrows=2, ncols=2,
        width_ratios=[heat_w, spatial_w],
        height_ratios=[1, 1],
        wspace=0.05, hspace=0.10
    )

    # Left column spans both rows (heatmap); right column holds the two
    # stacked spatial plots.
    ax_heat = fig.add_subplot(gs[:, 0])
    ax_sp0  = fig.add_subplot(gs[0, 1])
    ax_sp1  = fig.add_subplot(gs[1, 1])

    # Place heatmap image
    ax_heat.imshow(hm_img)
    ax_heat.set_axis_off()
    ax_heat.set_title(f"Cluster {cluster_id} | Node {node_id} | Block {block_id}", fontsize=10, pad=6)

    # Create spatial plots directly into axes (NO intermediate figures!)
    scores_final = None
    for ax_sp, mid in zip([ax_sp0, ax_sp1], module_ids):
        # print(f"Getting module {mid} score for cluster {cluster_id}, node {node_id}, block {block_id}")

        # Score all tiles for this module against the block reference L_mean.
        scores_tiles_wrt_ref, info = index.tile_module_scores_from_reference(
            data,
            score_list[id]["modules"][mid],
            index_handle,
            block_id,
            L_mean,
            cluster_id,
        )

        spot_score = plotting.assign_module_score_to_spots(data, scores_tiles_wrt_ref)
        scores = np.array(list(spot_score.values()))
        # Each iteration overwrites scores_final: only the last module's
        # scores are returned to the caller.
        scores_final = scores

        # Plot directly into the axis - no intermediate figure!
        plotting.plot_spot_module_scores(
            spot_score, tiles, adata,
            ax=ax_sp,  # Pass the axis directly!
            grid=False,
            label=f"Module {mid}",
        )
        ax_sp.set_title(f"Module {mid}", fontsize=10, pad=4)

    return scores_final, fig
In [142]:
# This function is deprecated - use plot_module_heatmap_plus_spatial instead
# def get_module_info(id):
#     # Take the top scored node
#     cluster_id = int(node_score_df.iloc[id]['cluster_id'])
#     node_id = int(node_score_df.iloc[id]['node_id'])
#     block_id = int(dag_stats_df[(dag_stats_df['cluster_id'] == cluster_id) & (dag_stats_df['node_id'] == node_id)]['block_id'].values[0])
#     print(f"Top scored node is in cluster {cluster_id}, node {node_id}, block {block_id}")
#     fig, ax, ordered_genes, gene_to_mod = plotting.plot_corr_heatmap_with_modules(score_list[id], 
#         show_values=False, value_fontsize=10, bold_labels=True, figsize=(8,7), filter_zero_correlation=True, zero_corr_threshold=0.05)
#     # save
#     # fig.savefig(f"/data/sarkar_lab/Projects/spindle_dev/results/hbreast_10X/corr_heatmap_cluster{cluster_id}_node_{node_id}.png", dpi=300)
# 
#     # Find out the most frequent center in the same block
#     freq_node_center =  dag_stats_df.loc[(dag_stats_df['cluster_id'] == cluster_id) & (dag_stats_df['block_id'] == block_id)].sort_values(by='num_spds', ascending=False).node_id.values[0]
#     index_handle = dag_dict[cluster_id]
#     #L_mean = dag_dict[cluster_id].nodes[freq_node_center].metadata.representative_mean
#     L_mean = index.get_node_log_ref(index_handle, block_id, use_representative_mean=True)
#     return cluster_id, node_id, block_id, L_mean, index_handle
In [219]:
scores_146, fig_146 = plot_module_heatmap_plus_spatial(146)
fig_146.savefig("/data/sarkar_lab/Projects/spindle_dev/results/hbreast_10X_wo_unlabeled/module_146_heatmap_spatial.png", dpi=300)
Creating combined figure for cluster 2, node 44, block 11
No description has been provided for this image
In [220]:
out_146 = go_score.enrich_modules_with_gseapy(score_list[146]['modules'])
  Module 0: enrichment completed with 401 terms
  Module 1: enrichment completed with 340 terms
  Module 2: enrichment completed with 270 terms
  Module 3: enrichment completed with 206 terms
In [ ]:
 
In [227]:
# Per-library GO/pathway enrichment bars for module 1 of node 146 only;
# the other modules in `out_146` are skipped by the guard below. Figures are
# saved into the go_enrichment results directory by the helper.
for module_id, module_genes, res in out_146:
    if module_id == 1:
        # Show the gene list being tested before plotting.
        print(module_genes)
        fig, axes = go_score.plot_module_enrichment_libraries(
            module_id=module_id,
            module_genes=module_genes,
            res=res,
            top_n=7,
            ncols=2,
            bar_color="#D41515EE",   # pick your color
            tick_fontsize=10,
            label_fontsize=10,
            title_fontsize=8,
            compact_height_per_term=0.2,
            min_fig_h=6,
            save_dir="/data/sarkar_lab/Projects/spindle_dev/results/hbreast_10X_wo_unlabeled/go_enrichment/"
        )
        if fig is not None:
            plt.show()
['ADAM9', 'CCDC6', 'CCND1', 'CD9', 'DSP', 'ELF3', 'EPCAM', 'JUP', 'KRT7', 'KRT8', 'S100A14', 'SERPINA3', 'TACSTD2', 'TRAF4']
['GO_Biological_Process_2023', 'KEGG_2021_Human', 'MSigDB_Hallmark_2020', 'Reactome_2022']
No description has been provided for this image
In [158]:
scores_156, fig_156 = plot_module_heatmap_plus_spatial(156)
Creating combined figure for cluster 2, node 54, block 13
Creating combined figure for cluster 2, node 54, block 13
No description has been provided for this image
In [175]:
# Create a dictionary of tile_id -> {block_index: block_cluster_id}.
# Clusters do not share tile ids across them, so a single flat dict is safe;
# cluster_to_tile records which tiles belong to each cluster.
from collections import defaultdict

tile_to_block_dict = {}
cluster_to_tile = defaultdict(set)
for cluster_id, cluster_dag in dag_dict.items():
    # (Removed unused local `block_to_node = cluster_dag.block_to_node_indices`.)
    for node in cluster_dag.nodes:
        # node.metadata.members is a list of (tile_id, block_id) pairs;
        # only the tile id is needed to key the mapping.
        for tile_id, _blk in node.metadata.members:
            # setdefault replaces the original `if not tile_id in ...` dance.
            tile_to_block_dict.setdefault(tile_id, {})[node.block_index] = node.block_cluster_id
            cluster_to_tile[cluster_id].add(tile_id)
In [193]:
importlib.reload(plotting)
Out[193]:
<module 'spindle_dev.plotting' from '/data/sarkar_lab/Projects/spindle_dev/src/spindle_dev/plotting.py'>
In [196]:
def get_spot_score_dict(block_id, cluster_id, tile_to_block_dict, grid=False):
    """
    Color every spot of a cluster by its block-cluster assignment at the
    given block level, then draw the discrete spatial map.

    NOTE: despite the name, this returns the (fig, ax) of the plot rather
    than the score dict. Relies on notebook globals: cluster_to_tile, data,
    plotting, tiles, adata.
    """
    spot_score = {}
    for tid in cluster_to_tile[cluster_id]:
        if tid not in tile_to_block_dict:
            continue
        assignment = tile_to_block_dict[tid][block_id]
        tile = data.metadata['tiles'][tid]
        # Propagate the tile-level assignment to every spot in the tile.
        for spot_idx in tile.idx:
            spot_score[spot_idx] = assignment
    fig, ax = plotting.plot_spot_module_scores(spot_score, tiles, adata, color='discrete', grid=grid)
    return fig, ax
In [207]:
fig, ax = get_spot_score_dict(5, 2, tile_to_block_dict, grid=False)
No description has been provided for this image
In [208]:
node_score_df.loc[(node_score_df['cluster_id'] == 2) & (node_score_df['block_id'] == 5)]
Out[208]:
cluster_id block_id node_id node_score num_spds block_size E Q S num_modules
116 2 5 14 0.001398 350 15 1.661521 0.010595 0.920589 2
117 2 5 15 0.003958 2 15 0.563854 0.051842 0.864585 4
118 2 5 16 0.001554 22 15 0.287832 0.029581 0.817513 5
119 2 5 17 0.010833 67 15 0.351048 0.100392 0.692624 3
120 2 5 18 0.007216 63 15 0.419383 0.075795 0.772996 4
In [214]:
scores_116, fig_116 = plot_module_heatmap_plus_spatial(116)
Creating combined figure for cluster 2, node 14, block 5
No description has been provided for this image
In [215]:
fig_116.savefig("/data/sarkar_lab/Projects/spindle_dev/results/hbreast_10X_wo_unlabeled/module_heatmap_plus_spatial_node_116.png", dpi=300)
In [ ]:
 
In [ ]:
# Fixed: the original call omitted the required `id` argument and would raise
# TypeError; the variable names indicate row 102 of node_score_df was intended.
scores_102, fig_102 = plot_module_heatmap_plus_spatial(102)
In [211]:
out = go_score.enrich_modules_with_gseapy(score_list[116]['modules'])
  Module 0: enrichment completed with 239 terms
  Module 1: skipped (only 2 genes)
In [212]:
# Per-library GO/pathway enrichment bars for every module of node 116.
# NOTE(review): near-duplicate of the other enrichment loops in this notebook
# (only fontsize/min_fig_h and save_dir differ) — consider a shared helper.
for module_id, module_genes, res in out:
    fig, axes = go_score.plot_module_enrichment_libraries(
        module_id=module_id,
        module_genes=module_genes,
        res=res,
        top_n=7,
        ncols=2,
        bar_color="#D41515EE",   # pick your color
        tick_fontsize=10,
        label_fontsize=8,
        title_fontsize=8,
        compact_height_per_term=0.2,
        min_fig_h=8,
        #save_dir="/data/sarkar_lab/Projects/spindle_dev/results/hbreast_10X/go_enrichment/"
    )
    if fig is not None:
        plt.show()
['GO_Biological_Process_2023', 'KEGG_2021_Human', 'MSigDB_Hallmark_2020', 'Reactome_2022']
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [168]:
importlib.reload(plotting)
Out[168]:
<module 'spindle_dev.plotting' from '/data/sarkar_lab/Projects/spindle_dev/src/spindle_dev/plotting.py'>
In [ ]:
 
In [165]:
len(tile_to_block_dict)
Out[165]:
2081
In [ ]:
 
In [166]:
len(tiles)
Out[166]:
2081
In [ ]:
 
In [162]:
dag_stats_df[['cluster_id', 'block_id', 'block_cluster_id', ]]
Out[162]:
cluster_id block_id block_cluster_id
0 0 0 0
1 0 0 1
2 0 1 0
3 0 1 1
4 0 1 2
... ... ... ...
197 3 12 1
198 3 13 0
199 3 13 1
200 3 14 0
201 3 14 1

202 rows × 3 columns

In [ ]:
 
In [123]:
node_score_df.sort_values(by='node_score', ascending=False).head(20)
Out[123]:
cluster_id block_id node_id node_score num_spds block_size E Q S num_modules
75 1 6 30 0.239966 1 38 1.932651 0.254274 0.511693 3
146 2 11 44 0.165974 1 49 1.627859 0.164071 0.378574 4
5 0 1 5 0.159690 91 15 0.774484 0.390674 0.472224 4
94 1 12 49 0.158631 13 26 1.273377 0.231421 0.461699 4
74 1 6 29 0.156417 5 38 1.526276 0.186053 0.449176 6
156 2 13 54 0.150369 2 18 0.935547 0.288826 0.443511 5
60 1 4 15 0.148257 397 21 0.976389 0.243900 0.377444 7
145 2 11 43 0.148110 1 49 2.225433 0.125890 0.471341 4
89 1 12 44 0.140268 543 26 1.321063 0.188080 0.435467 3
173 3 3 9 0.136184 49 20 0.794279 0.282274 0.392594 4
80 1 8 35 0.132119 1 27 1.412164 0.179984 0.480190 4
79 1 8 34 0.131210 2 27 1.464330 0.189470 0.527084 4
198 3 13 34 0.125699 266 26 0.862653 0.251575 0.420804 3
201 3 14 37 0.113385 11 36 1.961125 0.121871 0.525596 3
52 1 1 7 0.105444 91 15 0.571309 0.322258 0.427276 5
86 1 10 41 0.104808 3 25 1.835981 0.124215 0.540434 3
131 2 8 29 0.104168 42 15 0.792322 0.248719 0.471405 3
77 1 7 32 0.101813 3 28 2.014848 0.139933 0.638893 4
91 1 12 46 0.101234 1 26 1.344811 0.142968 0.473469 5
97 1 13 52 0.100772 1 16 1.536831 0.159211 0.588153 2
In [ ]:
 
In [97]:
cluster_id, node_id, block_id, L_mean, index_handle = get_module_info(146)
Top scored node is in cluster 2, node 44, block 11
Module of size 20: mean abs corr = 0.2477
Adding module of size 20 with mean abs corr = 0.2477
Module of size 14: mean abs corr = 0.4084
Adding module of size 14 with mean abs corr = 0.4084
Module of size 8: mean abs corr = 0.2126
Adding module of size 8 with mean abs corr = 0.2126
Module of size 7: mean abs corr = 0.2540
Adding module of size 7 with mean abs corr = 0.2540
Using 4 modules for heatmap plotting.
No description has been provided for this image
In [101]:
module_id = 1
scores_146, fig_spatial = get_module_score_plot_spatial(146, module_id, cluster_id, node_id, block_id, L_mean, index_handle)
Getting module 1 score for cluster 2, node 44, block 11
No description has been provided for this image
In [102]:
cluster_id, node_id, block_id, L_mean, index_handle = get_module_info(145)
Top scored node is in cluster 2, node 43, block 11
Module of size 28: mean abs corr = 0.2114
Adding module of size 28 with mean abs corr = 0.2114
Module of size 18: mean abs corr = 0.5462
Adding module of size 18 with mean abs corr = 0.5462
Module of size 2: mean abs corr = 0.5234
Adding module of size 2 with mean abs corr = 0.5234
Using 3 modules for heatmap plotting.
No description has been provided for this image
In [ ]:
 
In [103]:
module_id = 1
scores_145, fig_spatial = get_module_score_plot_spatial(145, module_id, cluster_id, node_id, block_id, L_mean, index_handle)
Getting module 1 score for cluster 2, node 43, block 11
No description has been provided for this image
In [113]:
plt.scatter(scores_145, scores_146, s=1, alpha=0.1)
plt.ylabel(f"Cluster{cluster_id} Module 1 Score in Node 43")
plt.xlabel(f"Cluster{cluster_id} Module 1 Score in Node 44")
Out[113]:
Text(0.5, 0, 'Cluster2 Module 1 Score in Node 44')
No description has been provided for this image
In [108]:
plt.hist(scores_145, bins=50, alpha=0.5);
plt.hist(scores_146, bins=50, alpha=0.5);
No description has been provided for this image
In [ ]:
 
In [89]:
#id = node_score_df.sort_values(by='node_score', ascending=False).index[1]
# NOTE(review): `id` shadows the builtin and is read by later cells; renaming
# here would require updating those cells too.
id = 146
# Take the top scored node
cluster_id = int(node_score_df.iloc[id]['cluster_id'])
node_id = int(node_score_df.iloc[id]['node_id'])
block_id = int(dag_stats_df[(dag_stats_df['cluster_id'] == cluster_id) & (dag_stats_df['node_id'] == node_id)]['block_id'].values[0])
print(f"Top scored node is in cluster {cluster_id}, node {node_id}, block {block_id}")
fig, ax, ordered_genes, gene_to_mod = plotting.plot_corr_heatmap_with_modules(score_list[id], 
    show_values=False, value_fontsize=10, bold_labels=True, figsize=(8,7), filter_zero_correlation=True, zero_corr_threshold=0.05)
# save
# NOTE(review): saves under results/hbreast_10X/ while other cells use
# results/hbreast_10X_wo_unlabeled/ — confirm the intended directory.
fig.savefig(f"/data/sarkar_lab/Projects/spindle_dev/results/hbreast_10X/corr_heatmap_cluster{cluster_id}_node_{node_id}.png", dpi=300)

# Find out the most frequent center in the same block
# (computed for debugging/parity; unused below — L_mean comes from the
# block-level helper instead)
freq_node_center =  dag_stats_df.loc[(dag_stats_df['cluster_id'] == cluster_id) & (dag_stats_df['block_id'] == block_id)].sort_values(by='num_spds', ascending=False).node_id.values[0]
index_handle = dag_dict[cluster_id]
#L_mean = dag_dict[cluster_id].nodes[freq_node_center].metadata.representative_mean
L_mean = index.get_node_log_ref(index_handle, block_id, use_representative_mean=True)
Top scored node is in cluster 2, node 44, block 11
Module of size 20: mean abs corr = 0.2477
Adding module of size 20 with mean abs corr = 0.2477
Module of size 14: mean abs corr = 0.4084
Adding module of size 14 with mean abs corr = 0.4084
Module of size 8: mean abs corr = 0.2126
Adding module of size 8 with mean abs corr = 0.2126
Module of size 7: mean abs corr = 0.2540
Adding module of size 7 with mean abs corr = 0.2540
Using 4 modules for heatmap plotting.
No description has been provided for this image
In [ ]:
 
In [90]:
importlib.reload(plotting)
Out[90]:
<module 'spindle_dev.plotting' from '/data/sarkar_lab/Projects/spindle_dev/src/spindle_dev/plotting.py'>
In [91]:
importlib.reload(index)
Out[91]:
<module 'spindle_dev.index' from '/data/sarkar_lab/Projects/spindle_dev/src/spindle_dev/index.py'>
In [92]:
# Differential LE score for all tiles, module index 1 (the original comment
# said "module 2"; plots below label it "Module 2 Score").
module_id = 1
# Score every tile of the cluster against the block-level reference L_mean.
scores_tiles_wrt_ref, info = index.tile_module_scores_from_reference(data, score_list[id]['modules'][module_id], index_handle, block_id, L_mean, cluster_id)
# assign module scores to spots
spot_score = plotting.assign_module_score_to_spots(data, scores_tiles_wrt_ref)
scores = np.array(list(spot_score.values()))
# fig, ax = plotting.plot_spot_module_scores(spot_score, tiles, adata,grid=False, figsize=(6,3.5), label="Module 2 Score")
# # save
# fig.savefig(f"/data/sarkar_lab/Projects/spindle_dev/results/hbreast_10X_wo_unlabeled/spatial_module{module_id}_score_cluster{cluster_id}_node{node_id}.png", dpi=300)
In [93]:
importlib.reload(plotting)
Out[93]:
<module 'spindle_dev.plotting' from '/data/sarkar_lab/Projects/spindle_dev/src/spindle_dev/plotting.py'>
In [94]:
fig, ax = plotting.plot_spot_module_scores(spot_score, tiles, adata,grid=False, figsize=(6,3.5), label="Module 2 Score")
No description has been provided for this image
In [88]:
fig, ax = plotting.plot_spot_module_scores(spot_score, tiles, adata,grid=False, figsize=(6,3.5), label="Module 2 Score")
No description has been provided for this image
In [71]:
from spindle_dev import go_score
In [ ]:
out = go_score.enrich_modules_with_gseapy(score_list[146]['modules'])
  Module 0: enrichment completed with 401 terms
  Module 1: enrichment completed with 340 terms
  Module 2: enrichment completed with 270 terms
  Module 3: enrichment completed with 206 terms
In [76]:
for module_id, module_genes, res in out:
    fig, axes = go_score.plot_module_enrichment_libraries(
        module_id=module_id,
        module_genes=module_genes,
        res=res,
        top_n=7,
        ncols=2,
        bar_color="#D41515EE",   # pick your color
        tick_fontsize=10,
        label_fontsize=8,
        title_fontsize=8,
        compact_height_per_term=0.2,
        min_fig_h=8,
        #save_dir="/data/sarkar_lab/Projects/spindle_dev/results/hbreast_10X/go_enrichment/"
    )
    if fig is not None:
        plt.show()
['GO_Biological_Process_2023', 'KEGG_2021_Human', 'MSigDB_Hallmark_2020', 'Reactome_2022']
No description has been provided for this image
['GO_Biological_Process_2023', 'KEGG_2021_Human', 'MSigDB_Hallmark_2020', 'Reactome_2022']
No description has been provided for this image
['GO_Biological_Process_2023', 'KEGG_2021_Human', 'MSigDB_Hallmark_2020', 'Reactome_2022']
No description has been provided for this image
['GO_Biological_Process_2023', 'KEGG_2021_Human', 'MSigDB_Hallmark_2020', 'Reactome_2022']
No description has been provided for this image
In [39]:
from spindle_dev import index
importlib.reload(index)
Out[39]:
<module 'spindle_dev.index' from '/data/sarkar_lab/Projects/spindle_dev/src/spindle_dev/index.py'>
In [40]:
import pandas as pd
In [41]:
cluster_id = 0
index_handle = dag_dict[cluster_id]
node_rows, per_node_tile_scores, tile_z = index.node_scores_from_reference(data, index_handle, cluster_id, representative_mean=True)
    
In [42]:
import pandas as pd
In [58]:
node_df = pd.DataFrame(node_rows).sort_values('z_scale', ascending=False)
In [60]:
node_score_df.loc[(node_score_df.cluster_id == cluster_id) & (node_score_df.node_id == 21)]
Out[60]:
cluster_id node_id node_score num_spds block_size E Q S num_modules
21 0 21 0.052074 659 15 0.758758 0.153221 0.552082 3
In [59]:
node_df.head(10)
Out[59]:
block_id cluster_id node_id node_index prevalence median_dist mad_dist quality interest k_genes z_median z_mad z_scale
21 7 0 21 21 1.0 3.609869 1.693648 0.004974 0.004974 15 3.609869 1.693648 2.511002
23 8 0 23 23 1.0 3.275327 1.256047 0.010766 0.010766 15 3.275327 1.256047 1.862216
16 5 0 16 16 1.0 3.077620 1.237488 0.013365 0.013365 15 3.077620 1.237488 1.834699
26 9 0 26 26 1.0 3.195635 1.227180 0.012000 0.012000 15 3.195635 1.227180 1.819417
5 1 0 5 5 1.0 4.973354 1.050387 0.002421 0.002421 15 4.973354 1.050387 1.557303
13 4 0 13 13 1.0 4.723527 0.958816 0.003406 0.003406 15 4.723527 0.958816 1.421540
10 3 0 10 10 1.0 5.474510 0.894123 0.001715 0.001715 15 5.474510 0.894123 1.325627
34 11 0 35 35 1.0 2.700770 0.753236 0.031619 0.031619 24 2.700770 0.753236 1.116748
30 10 0 30 30 1.0 3.227608 0.726557 0.019175 0.019175 18 3.227608 0.726557 1.077193
7 2 0 7 7 1.0 6.275040 0.642391 0.000990 0.000990 15 6.275040 0.642391 0.952408
In [61]:
# Inspect positional row 21 of node_score_df (printed below: cluster 0,
# node 21, block 7).
# NOTE(review): `id` shadows the builtin and is read by later cells.
id = node_score_df.index[21]
# Take the top scored node
cluster_id = int(node_score_df.iloc[id]['cluster_id'])
node_id = int(node_score_df.iloc[id]['node_id'])
block_id = int(dag_stats_df[(dag_stats_df['cluster_id'] == cluster_id) & (dag_stats_df['node_id'] == node_id)]['block_id'].values[0])
print(f"Top scored node is in cluster {cluster_id}, node {node_id}, block {block_id}")
fig, ax, ordered_genes, gene_to_mod = plotting.plot_corr_heatmap_with_modules(score_list[id], 
    show_values=False, value_fontsize=10, bold_labels=True, figsize=(8,7), filter_zero_correlation=True, zero_corr_threshold=0.05)
# save
fig.savefig(f"/data/sarkar_lab/Projects/spindle_dev/results/hbreast_10X/corr_heatmap_cluster{cluster_id}_node_{node_id}.png", dpi=300)

# Find out the most frequent center in the same block
# (computed for debugging/parity; unused below — L_mean comes from the
# block-level helper instead)
freq_node_center =  dag_stats_df.loc[(dag_stats_df['cluster_id'] == cluster_id) & (dag_stats_df['block_id'] == block_id)].sort_values(by='num_spds', ascending=False).node_id.values[0]
index_handle = dag_dict[cluster_id]
#L_mean = dag_dict[cluster_id].nodes[freq_node_center].metadata.representative_mean
L_mean = index.get_node_log_ref(index_handle, block_id, use_representative_mean=True)
Top scored node is in cluster 0, node 21, block 7
Module of size 6: mean abs corr = 0.2030
Adding module of size 6 with mean abs corr = 0.2030
Module of size 5: mean abs corr = 0.2500
Adding module of size 5 with mean abs corr = 0.2500
Module of size 4: mean abs corr = 0.3630
Adding module of size 4 with mean abs corr = 0.3630
Using 3 modules for heatmap plotting.
No description has been provided for this image
In [62]:
# Differential LE score for all tiles for module 2
module_id = 2
scores_tiles_wrt_ref, info = index.tile_module_scores_from_reference(data, score_list[id]['modules'][module_id], index_handle, block_id, L_mean, cluster_id)
# assign module scores to spots
spot_score = plotting.assign_module_score_to_spots(data, scores_tiles_wrt_ref)
scores = np.array(list(spot_score.values()))
# fig, ax = plotting.plot_spot_module_scores(spot_score, tiles, adata,grid=False, figsize=(6,3.5), label="Module 2 Score")
# # save
# fig.savefig(f"/data/sarkar_lab/Projects/spindle_dev/results/hbreast_10X_wo_unlabeled/spatial_module{module_id}_score_cluster{cluster_id}_node{node_id}.png", dpi=300)
In [63]:
fig, ax = plotting.plot_spot_module_scores(spot_score, tiles, adata,grid=False, figsize=(6,3.5), label="Module 2 Score")
No description has been provided for this image
In [46]:
from collections import defaultdict

def block_dag_edges_consecutive(nodes, block_id_from_members=True):
    """
    Build a layered DAG view of `nodes`, layered by `block_index`.

    Within each layer, nodes are ordered by (order_id, global_node_id).
    Edges connect nodes in consecutive layers that share SPD member ids,
    weighted by the number of shared ids.

    Parameters
    ----------
    nodes : iterable of node objects exposing .block_index, .order_id,
        .global_node_id and .metadata.members (list of (spd_id, block_id)).
    block_id_from_members : when True, a member (spd_id, blk_id) only counts
        for the layer whose index equals blk_id.

    Returns
    -------
    (layers, ordered_by_layer, edges) where edges is a list of
    (source_node_id, target_node_id, shared_spd_count).
    """
    # Bucket nodes into layers by block index.
    by_layer = defaultdict(list)
    for node in nodes:
        by_layer[node.block_index].append(node)
    layers = sorted(by_layer)

    # Stable ordering inside each layer.
    ordered_by_layer = {}
    for layer in layers:
        ordered_by_layer[layer] = sorted(
            by_layer[layer], key=lambda n: (n.order_id, n.global_node_id)
        )

    # For each layer, map every SPD member id to the node that owns it.
    spd_to_node_in_layer = {}
    for layer in layers:
        owner = {}
        for node in ordered_by_layer[layer]:
            for spd_id, blk_id in node.metadata.members:
                if (not block_id_from_members) or (blk_id == layer):
                    owner[spd_id] = node.global_node_id
        spd_to_node_in_layer[layer] = owner

    # Count shared SPD ids between every pair of consecutive layers.
    edges = []
    for lower, upper in zip(layers, layers[1:]):
        lower_map = spd_to_node_in_layer[lower]
        upper_map = spd_to_node_in_layer[upper]

        pair_counts = defaultdict(int)
        for spd_id in lower_map.keys() & upper_map.keys():
            pair_counts[(lower_map[spd_id], upper_map[spd_id])] += 1

        edges.extend((u, v, c) for (u, v), c in pair_counts.items())

    return layers, ordered_by_layer, edges


def block_dag_to_dot(layers, ordered_by_layer, edges, node_label_fn=None, edge_label_fn=None):
    """
    Serialize a layered block DAG to Graphviz DOT text.

    Parameters
    ----------
    layers : list of layer keys (block indices), in drawing order.
    ordered_by_layer : dict layer -> ordered list of nodes, each exposing
        .global_node_id.
    edges : iterable of (u_id, v_id, count) tuples.
    node_label_fn : callable(node) -> str; defaults to str(global_node_id).
    edge_label_fn : callable(count) -> str; defaults to str(count).

    Returns
    -------
    str : the DOT source.
    """
    if node_label_fn is None:
        node_label_fn = lambda n: str(n.global_node_id)
    if edge_label_fn is None:
        edge_label_fn = lambda c: str(c)

    # (Removed dead code: a node_id -> layer map was built here but never read.)

    lines = []
    lines.append("digraph G {")
    lines.append('  graph [rankdir=TB, splines=true, nodesep=0.25, ranksep=0.5];')
    lines.append('  node  [shape=circle, fixedsize=true, width=0.28, fontsize=10, penwidth=1];')
    lines.append('  edge  [fontsize=9, penwidth=1, arrowsize=0.6];')

    # --- ranks (each layer on one horizontal row)
    # NOTE(review): the "cluster_" name prefix makes Graphviz treat each
    # subgraph as a boxed cluster; rename to e.g. "rank_{b}" if no box is wanted.
    for b in layers:
        ids = [n.global_node_id for n in ordered_by_layer[b]]
        lines.append(f"  subgraph cluster_rank_{b} {{")
        lines.append("    rank=same;")
        for nid in ids:
            lines.append(f'    "{nid}";')
        lines.append("  }")

    # --- node labels (quotes escaped so labels cannot break the DOT syntax)
    for b in layers:
        for n in ordered_by_layer[b]:
            nid = n.global_node_id
            label = node_label_fn(n).replace('"', '\\"')
            lines.append(f'  "{nid}" [label="{label}"];')

    # --- edges with labels (counts)
    for u, v, c in edges:
        elab = edge_label_fn(c).replace('"', '\\"')
        lines.append(f'  "{u}" -> "{v}" [label="{elab}"];')

    lines.append("}")
    return "\n".join(lines)


def render_dot(dot_str, out_path, fmt="png"):
    """
    Render DOT source to an image file via Graphviz.

    Requires:
      - system graphviz (dot)
      - python package: graphviz  (pip install graphviz)

    Returns the path of the rendered file.
    """
    from graphviz import Source
    source = Source(dot_str, format=fmt)
    # graphviz appends the format extension itself, so strip any extension
    # from the requested output path to avoid names like "out.png.png".
    base_name = out_path.rsplit(".", 1)[0]
    return source.render(filename=base_name, cleanup=True)
In [47]:
# Build the layered block DAG for cluster 0 and serialize it to DOT.
layers, ordered_by_layer, edges = block_dag_edges_consecutive(dag_dict[0].nodes)

dot = block_dag_to_dot(
    layers,
    ordered_by_layer,
    edges,
    node_label_fn=lambda n: f"{n.order_id}",   # or cluster id, or short name
    edge_label_fn=lambda c: str(c),            # true SPD counts
)

# save dot text
# NOTE(review): relative paths — these land in the notebook's CWD, unlike the
# absolute results paths used elsewhere; confirm this is intended.
with open("block_dag.dot", "w") as f:
    f.write(dot)

# render (optional)
render_dot(dot, "block_dag.png", fmt="png")
Out[47]:
'block_dag.png'
In [ ]:
 
In [ ]:
 
In [48]:
from graphviz import Source
In [49]:
Source(dot)
Out[49]:
No description has been provided for this image
In [ ]: