In [14]:
import pandas as pd
import glob
import scanpy as sc
In [44]:
test_benchmark_files = glob.glob("/data/sarkar_lab/insitupy_demo_data_xenium/*/*.csv")
In [17]:
# read number of cells and number of genes from each file and store in a dataframe
h5ad_files = glob.glob("/data/sarkar_lab/insitupy_demo_data_xenium/*.h5ad")
file_info = []
for h5ad_file in h5ad_files:
adata = sc.read_h5ad(h5ad_file)
num_cells = adata.n_obs
num_genes = adata.n_vars
experiment_name = h5ad_file.split("/")[-1].replace(".h5ad", "")
file_info.append({'experiment': experiment_name, 'num_cells': num_cells, 'num_genes': num_genes})
del adata # free memory
file_info_df = pd.DataFrame(file_info)
In [25]:
file_info_df
Out[25]:
| experiment | num_cells | num_genes | |
|---|---|---|---|
| 0 | xenium_human_skin_melanoma | 87499 | 382 |
| 1 | xenium_human_brain_cancer | 816769 | 480 |
| 2 | xenium_human_kidney_nondiseased | 97560 | 377 |
| 3 | xenium_human_lung_cancer | 162254 | 377 |
| 4 | xenium_human_lymph_node | 377985 | 377 |
| 5 | xenium_human_lymph_node_5k | 708983 | 4624 |
| 6 | xenium_human_pancreatic_cancer | 190965 | 474 |
| 7 | xenium_human_breast_cancer | 167780 | 313 |
In [45]:
df_list = []
for f in test_benchmark_files:
df = pd.read_csv(f)
# Extract name of the experiment from the file path
experiment_name = f.split("/")[-2]
# remove "_index"
experiment_name = experiment_name.replace("_index", "")
df['experiment'] = experiment_name
df_list.append(df)
In [ ]:
In [46]:
df_all = pd.concat(df_list, ignore_index=True)
In [47]:
#df_all = pd.merge(df_all, file_info_df, on='experiment')
In [48]:
df_all
Out[48]:
| query_idx | cluster_id | budget | matched_gt | matched_leaf | matched_budget | search_time | baseline_time | experiment | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 301 | 1 | 189.430309 | True | True | 81.416071 | 0.006204 | 5.615099 | xenium_human_skin_melanoma |
| 1 | 457 | 0 | 167.719218 | True | True | 53.975176 | 0.006687 | 6.413544 | xenium_human_skin_melanoma |
| 2 | 416 | 1 | 189.430309 | True | True | 69.324286 | 0.005751 | 5.728292 | xenium_human_skin_melanoma |
| 3 | 82 | 5 | 215.963601 | True | True | 65.404513 | 0.015185 | 2.908104 | xenium_human_skin_melanoma |
| 4 | 79 | 2 | 174.854662 | True | True | 72.108620 | 0.004831 | 4.720362 | xenium_human_skin_melanoma |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1395 | 1387 | 1 | 221.229611 | True | True | 91.196119 | 0.005294 | 15.113822 | xenium_human_lung_cancer |
| 1396 | 52 | 0 | 204.228495 | True | True | 101.705962 | 0.014020 | 15.415307 | xenium_human_lung_cancer |
| 1397 | 1067 | 2 | 231.852274 | True | True | 92.992326 | 0.004700 | 12.283487 | xenium_human_lung_cancer |
| 1398 | 1649 | 1 | 221.229611 | True | True | 106.058703 | 0.005202 | 14.603771 | xenium_human_lung_cancer |
| 1399 | 214 | 0 | 204.228495 | True | True | 98.810778 | 0.013991 | 15.630480 | xenium_human_lung_cancer |
1400 rows × 9 columns
In [49]:
df_all.experiment.unique()
Out[49]:
array(['xenium_human_skin_melanoma', 'xenium_human_kidney_nondiseased',
'xenium_human_breast_cancer', 'xenium_human_pancreatic_cancer',
'xenium_human_lymph_node', 'xenium_human_brain_cancer',
'xenium_human_lung_cancer'], dtype=object)
In [50]:
file_info_df.experiment.unique()
Out[50]:
array(['xenium_human_skin_melanoma', 'xenium_human_brain_cancer',
'xenium_human_kidney_nondiseased', 'xenium_human_lung_cancer',
'xenium_human_lymph_node', 'xenium_human_lymph_node_5k',
'xenium_human_pancreatic_cancer', 'xenium_human_breast_cancer'],
dtype=object)
In [97]:
# Create boxplot with strip plots of search_time and baseline_time using seaborn
import seaborn as sns
import matplotlib.pyplot as plt
# Merge file_info_df with df_all to get num_cells and num_genes
df_melted = df_all.melt(id_vars=['experiment'], value_vars=['search_time', 'baseline_time'], var_name='time_type', value_name='time')
# Merge with file_info to get num_cells and num_genes
df_melted = df_melted.merge(file_info_df, on='experiment', how='left')
# Rename time_type values: baseline_time -> baseline, search_time -> spindle
df_melted['method'] = df_melted['time_type'].map({'baseline_time': 'baseline', 'search_time': 'spindle'})
# Create mapping with specific meaningful names
short_name_map = {
'xenium_human_skin_melanoma': 'Skin',
'xenium_human_kidney_nondiseased': 'Kidney',
'xenium_human_breast_cancer': 'Breast',
'xenium_human_pancreatic_cancer': 'Pancreas',
'xenium_human_lymph_node': 'Lymph',
'xenium_human_lung_cancer': 'Lung',
'xenium_human_brain_cancer': 'Brain'
}
# Sort by num_genes and create ordered list
exp_order = df_melted.drop_duplicates(subset=['experiment']).sort_values(['num_cells'])['experiment'].tolist()
# Create figure with larger size for better readability
fig, ax = plt.subplots(figsize=(12, 6))
# Create boxplot sorted by num_genes
sns.boxplot(x='experiment', y='time', hue='method',
data=df_melted, ax=ax, order=exp_order,
hue_order=['baseline', 'spindle'],
patch_artist=True) # Enable transparency
# Make boxplot patches transparent
for patch in ax.patches:
patch.set_alpha(0.4)
# Remove horizontal gridlines
ax.set_axisbelow(True)
ax.yaxis.grid(False)
# Overlay strip plots with jitter and dodging for better visibility of individual points
sns.stripplot(x='experiment', y='time', hue='method', data=df_melted,
jitter=True, dodge=True, alpha=0.5, size=4, ax=ax, order=exp_order, hue_order=['baseline', 'spindle'])
# Remove duplicate legend entries from stripplot
handles, labels = ax.get_legend_handles_labels()
# Keep only the first 2 entries (boxplot legend for baseline and spindle)
ax.legend(handles[:2],
labels[:2],
title='Method',
bbox_to_anchor=(1.05, 1),
loc='upper left'
)
# Update x-axis labels with short names and stats
exp_info = file_info_df.set_index('experiment').loc[exp_order]
x_labels = [f"{short_name_map[exp]}\n({row['num_cells']} cells)\n({row['num_genes']} genes)"
for exp, (_, row) in zip(exp_order, exp_info.iterrows())]
ax.set_xticklabels(x_labels, rotation=0, ha='center', fontsize=12)
#ax.tick_params(axis='x', labelsize=11, length=8, width=2, colors='black')
ax.tick_params(axis='y', labelsize=12, length=8, width=2, colors='black')
# Remove top and right spines but keep left and bottom for ticks
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.title('Boxplot with Individual Points: Baseline vs Spindle by Experiment (sorted by num_genes)')
plt.xlabel('Experiment')
plt.yscale('log')
plt.ylabel('Time in seconds (Log scale) ', fontdict={'fontsize':12})
plt.tight_layout()
plt.savefig('/data/sarkar_lab/Projects/spindle_dev/ISMB_notebook/figures/performance_comparison_boxplot.pdf', dpi=500)
plt.show()
/tmp/ipykernel_1198668/3501101289.py:63: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_xticklabels(x_labels, rotation=0, ha='center', fontsize=12)
In [ ]:
# Calculate space and time for the
In [65]:
import os
import glob
import pandas as pd
BASE = "/data/sarkar_lab/insitupy_demo_data_xenium"
# -----------------------------
# Index-level stats
# -----------------------------
index_dirs = glob.glob(os.path.join(BASE, "xenium_*_index"))
index_rows = []
for d in index_dirs:
experiment = os.path.basename(d).replace("_index", "")
stats_path = os.path.join(d, "index_stats.txt")
spindle_path = os.path.join(d, "spindle.pkl")
index_time = None
if os.path.exists(stats_path):
with open(stats_path, "r") as f:
try:
index_time = round(float(f.read().strip()), 2)
except ValueError:
index_time = None
spindle_size_mb = (
round(os.path.getsize(spindle_path) / 1e6, 2)
if os.path.exists(spindle_path)
else None
)
index_rows.append({
"experiment": experiment,
"index_time_raw": index_time,
"spindle_size_MB": spindle_size_mb
})
index_df = pd.DataFrame(index_rows)
# -----------------------------
# h5ad file sizes
# -----------------------------
h5ad_rows = []
for h5ad in glob.glob(os.path.join(BASE, "*.h5ad")):
experiment = os.path.basename(h5ad).replace(".h5ad", "")
h5ad_rows.append({
"experiment": experiment,
"h5ad_size_MB": round(os.path.getsize(h5ad) / 1e6, 2)
})
h5ad_df = pd.DataFrame(h5ad_rows)
# -----------------------------
# Merge
# -----------------------------
summary_df = index_df.merge(h5ad_df, on="experiment", how="left")
# print(summary_df)
# summary_df.to_csv("xenium_index_summary.csv", index=False)
In [66]:
summary_df.dropna()
Out[66]:
| experiment | index_time_raw | spindle_size_MB | h5ad_size_MB | |
|---|---|---|---|---|
| 0 | xenium_human_skin_melanoma | 33.47 | 20.61 | 51.64 |
| 1 | xenium_human_kidney_nondiseased | 28.63 | 20.34 | 33.94 |
| 2 | xenium_human_breast_cancer | 50.21 | 16.14 | 103.27 |
| 3 | xenium_human_pancreatic_cancer | 120.90 | 32.67 | 115.89 |
| 4 | xenium_human_lymph_node | 80.97 | 24.12 | 135.48 |
| 5 | xenium_human_brain_cancer | 675.91 | 48.03 | 525.59 |
| 6 | xenium_human_lung_cancer | 54.50 | 22.25 | 60.73 |
In [67]:
summary_df = pd.merge(file_info_df, summary_df.dropna(), on='experiment')
In [68]:
summary_df
Out[68]:
| experiment | num_cells | num_genes | index_time_raw | spindle_size_MB | h5ad_size_MB | |
|---|---|---|---|---|---|---|
| 0 | xenium_human_skin_melanoma | 87499 | 382 | 33.47 | 20.61 | 51.64 |
| 1 | xenium_human_brain_cancer | 816769 | 480 | 675.91 | 48.03 | 525.59 |
| 2 | xenium_human_kidney_nondiseased | 97560 | 377 | 28.63 | 20.34 | 33.94 |
| 3 | xenium_human_lung_cancer | 162254 | 377 | 54.50 | 22.25 | 60.73 |
| 4 | xenium_human_lymph_node | 377985 | 377 | 80.97 | 24.12 | 135.48 |
| 5 | xenium_human_pancreatic_cancer | 190965 | 474 | 120.90 | 32.67 | 115.89 |
| 6 | xenium_human_breast_cancer | 167780 | 313 | 50.21 | 16.14 | 103.27 |
In [69]:
# Create mapping with specific meaningful names
short_name_map = {
'xenium_human_skin_melanoma': 'Skin',
'xenium_human_kidney_nondiseased': 'Kidney',
'xenium_human_breast_cancer': 'Breast',
'xenium_human_pancreatic_cancer': 'Pancreas',
'xenium_human_lymph_node': 'Lymph',
'xenium_human_lung_cancer': 'Lung',
'xenium_human_brain_cancer': 'Brain'
}
In [70]:
summary_df.experiment = summary_df.experiment.map(short_name_map)
In [72]:
summary_df.sort_values('num_cells')
Out[72]:
| experiment | num_cells | num_genes | index_time_raw | spindle_size_MB | h5ad_size_MB | |
|---|---|---|---|---|---|---|
| 0 | Skin | 87499 | 382 | 33.47 | 20.61 | 51.64 |
| 2 | Kidney | 97560 | 377 | 28.63 | 20.34 | 33.94 |
| 3 | Lung | 162254 | 377 | 54.50 | 22.25 | 60.73 |
| 6 | Breast | 167780 | 313 | 50.21 | 16.14 | 103.27 |
| 5 | Pancreas | 190965 | 474 | 120.90 | 32.67 | 115.89 |
| 4 | Lymph | 377985 | 377 | 80.97 | 24.12 | 135.48 |
| 1 | Brain | 816769 | 480 | 675.91 | 48.03 | 525.59 |
In [ ]:
# create a latex table from the summary_df showing experiment, num_cells, num_genes, index_time_raw, spindle_size_MB, h5ad_size_MB?
In [ ]:
In [ ]:
# Create a barplot for spindle_size_MB, and index building time in two separate sub-plots
import seaborn as sns
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(5, 5))
sns.barplot(x='experiment', y='spindle_size_MB', data=summary_df, ax=ax, order=summary_df.sort_values('num_cells')['experiment'])
plt.title('Spindle Index Size by Experiment')
plt.xlabel('Experiment')
plt.ylabel('Spindle Size (MB)')
plt.tight_layout()
plt.savefig('/data/sarkar_lab/Projects/spindle_dev/ISMB_notebook/figures/spindle_size_barplot.png', dpi=300)
plt.show()
In [74]:
# Create LaTeX table from summary_df
cols = ['experiment', 'num_cells', 'num_genes', 'index_time_raw', 'spindle_size_MB', 'h5ad_size_MB']
latex_table = summary_df[cols].sort_values('num_cells').to_latex(
index=False,
float_format=lambda x: f'{x:.2f}' if isinstance(x, float) else str(x),
caption='Xenium Dataset Summary Statistics',
label='tab:xenium_summary'
)
print(latex_table)
\begin{table}
\caption{Xenium Dataset Summary Statistics}
\label{tab:xenium_summary}
\begin{tabular}{lrrrrr}
\toprule
experiment & num_cells & num_genes & index_time_raw & spindle_size_MB & h5ad_size_MB \\
\midrule
Skin & 87499 & 382 & 33.47 & 20.61 & 51.64 \\
Kidney & 97560 & 377 & 28.63 & 20.34 & 33.94 \\
Lung & 162254 & 377 & 54.50 & 22.25 & 60.73 \\
Breast & 167780 & 313 & 50.21 & 16.14 & 103.27 \\
Pancreas & 190965 & 474 & 120.90 & 32.67 & 115.89 \\
Lymph & 377985 & 377 & 80.97 & 24.12 & 135.48 \\
Brain & 816769 & 480 & 675.91 & 48.03 & 525.59 \\
\bottomrule
\end{tabular}
\end{table}
In [101]:
# Create barplots for spindle_size_MB and index_time_raw
import matplotlib.pyplot as plt
import seaborn as sns
fig, axes = plt.subplots(2, 1, figsize=(6, 5), sharex=True)
# Sort summary_df by num_cells for consistent ordering
summary_sorted = summary_df.sort_values('num_cells')
# Top subplot: spindle_size_MB
axes[0].bar(summary_sorted['experiment'], summary_sorted['spindle_size_MB'], color='steelblue', alpha=0.7)
axes[0].set_title('Spindle Index Size (MB)', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Size (MB)', fontsize=11)
axes[0].tick_params(axis='y', labelsize=11)
axes[0].grid(False)
# Bottom subplot: index_time_raw
axes[1].bar(summary_sorted['experiment'], summary_sorted['index_time_raw'], color='coral', alpha=0.7)
axes[1].set_title('Index Building Time (seconds)', fontsize=12, fontweight='bold')
axes[1].set_xlabel('Experiment', fontsize=11)
axes[1].set_ylabel('Time (seconds)', fontsize=11)
axes[1].tick_params(axis='x', rotation=90, labelsize=11)
axes[1].tick_params(axis='y', labelsize=11)
axes[1].grid(False)
sns.despine()
plt.tight_layout()
plt.savefig('/data/sarkar_lab/Projects/spindle_dev/ISMB_notebook/figures/index_stats_barplot_updown.pdf', dpi=500, bbox_inches='tight')
plt.show()
In [ ]: