Skip to content

Commit 7480b48

Browse files
committed
add tmp_plots
1 parent 6c14e47 commit 7480b48

23 files changed

Lines changed: 1622 additions & 0 deletions

tmp_plots/.gitignore

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info

# Virtual environments
.venv

tmp_plots/.python-version

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
3.12

tmp_plots/README.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
## Quick start

- Data file: place `spacewire_250919215922.xlsx` in the project root.
- Outputs: generated PDFs/CSVs are written to `output/`.

## Using uv (recommended)

- Create and sync a local env from `pyproject.toml`/`uv.lock`:
  - `uv sync`
- Run the plotting pipeline:
  - `uv run python main.py`

## Using Python venv directly

- Create venv and install deps via pip:
  - macOS/Linux:
    - `python3 -m venv .venv`
    - `source .venv/bin/activate`
    - `pip install -r requirements.txt`
    - `python3 main.py`
  - Windows (PowerShell):
    - `py -3 -m venv .venv`
    - `.venv\Scripts\Activate.ps1`
    - `pip install -r requirements.txt`
    - `python main.py`

## Selecting charts

- Each plot is a separate function call in `main.py:17`. Comment out the calls you don't need.
- Clustering/PCA lives in `clustering.py` and is invoked from `main.py` with `plot_agglomerative_clustering_and_pca`.

tmp_plots/clustering.py

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
import os
2+
from typing import Optional
3+
4+
import pandas as pd
5+
import seaborn as sns
6+
from matplotlib.colors import to_hex
7+
import matplotlib.pyplot as plt
8+
from sklearn.preprocessing import StandardScaler
9+
from sklearn.decomposition import PCA
10+
from sklearn.cluster import AgglomerativeClustering
11+
from scipy.cluster.hierarchy import dendrogram, linkage
12+
13+
14+
def _ensure_dir(output_dir: str) -> None:
15+
os.makedirs(output_dir, exist_ok=True)
16+
17+
18+
# Boolean stats columns mapped to the short feature names used for clustering.
_BOOL_COLS = {
    "stats.spot.syntactic_safety": "syntactic_safety",
    "stats.spot.is_stutter_invariant_formula": "stutter_invariant",
    "stats.spot.tgba_analysis.is_complete": "tgba_complete",
    "stats.spot.tgba_analysis.is_deterministic": "tgba_deterministic",
    "stats.spot.buchi_analysis.is_complete": "buchi_complete",
    "stats.spot.buchi_analysis.is_deterministic": "buchi_deterministic",
    "stats.spot.deterministic_attempt.automaton_analysis.is_complete": "det_complete",
    "stats.spot.deterministic_attempt.automaton_analysis.is_deterministic": "det_deterministic",
}

# Numeric automaton-size columns used directly as clustering features.
_NUM_COLS = [
    "stats.spot.tgba_analysis.state_count",
    "stats.spot.tgba_analysis.transition_count",
    "stats.spot.buchi_analysis.state_count",
    "stats.spot.buchi_analysis.transition_count",
    "stats.spot.deterministic_attempt.automaton_analysis.state_count",
    "stats.spot.deterministic_attempt.automaton_analysis.transition_count",
    "stats.spot.tgba_analysis.acceptance_sets",
    "stats.spot.buchi_analysis.acceptance_sets",
    "stats.spot.deterministic_attempt.automaton_analysis.acceptance_sets",
]

# Manna-Pnueli class keywords turned into 0/1 indicator features.
_MP_KEYWORDS = ["safety", "obligation", "persistence", "recurrence", "reactivity", "guarantee"]


def _build_feature_matrix(df: pd.DataFrame) -> Optional[pd.DataFrame]:
    """Assemble the numeric feature matrix, or return None if no feature columns exist.

    String truth tokens (German "WAHR"/"FALSCH" and "TRUE"/"FALSE") are mapped
    to 1/0 first; everything else (real bools, numbers, "Error" markers) is then
    coerced by ``pd.to_numeric(errors="coerce")``, so unparseable cells become
    NaN. This replaces the previous mixed bool/NA replace mapping, which was
    redundant and triggered pandas downcasting warnings.
    """
    bool_cols = {k: v for k, v in _BOOL_COLS.items() if k in df.columns}
    num_cols = [c for c in _NUM_COLS if c in df.columns]
    if not bool_cols and not num_cols:
        return None

    feat = df[list(bool_cols.keys()) + num_cols].copy()
    feat = feat.replace({"WAHR": 1, "FALSCH": 0, "TRUE": 1, "FALSE": 0})
    for col in feat.columns:
        feat[col] = pd.to_numeric(feat[col], errors="coerce")

    if "stats.spot.manna_pnueli_class" in df.columns:
        mp_class = df["stats.spot.manna_pnueli_class"]
        for kw in _MP_KEYWORDS:
            # kw bound via default arg so the closure is explicit, not late-bound
            feat[kw] = mp_class.apply(
                lambda x, kw=kw: 1 if pd.notna(x) and isinstance(x, str) and kw in x.lower() else 0
            )
    return feat


def _row_labels(df: pd.DataFrame, index: pd.Index) -> tuple:
    """Return (formula_series, id_series) aligned with *index* for row labelling.

    Falls back to ``row_<i>`` labels / string indices when the formula or ``id``
    columns are absent.
    """
    if "stats.spot.spot_formula" in df.columns:
        formulas = df.loc[index, "stats.spot.spot_formula"].astype(str)
    elif "stats.spot.formula" in df.columns:
        formulas = df.loc[index, "stats.spot.formula"].astype(str)
    else:
        formulas = pd.Series([f"row_{i}" for i in index], index=index)

    if "id" in df.columns:
        ids = df.loc[index, "id"].astype(str)
    else:
        ids = pd.Series(index.astype(str), index=index)
    return formulas, ids


def _make_link_color_func(Z, leaf_cluster_labels, palette_hex):
    """Build a dendrogram link-color function.

    Links whose whole subtree belongs to one cluster get that cluster's color;
    links spanning several clusters are drawn grey.
    """
    n = len(leaf_cluster_labels)
    # Merge node i+n joins children (left, right) per scipy's linkage encoding.
    children = {i + n: (int(left), int(right)) for i, (left, right, _, _) in enumerate(Z)}

    from functools import lru_cache

    @lru_cache(maxsize=None)
    def subtree_clusters(node_id):
        if node_id < n:
            return frozenset({int(leaf_cluster_labels[node_id])})
        l, r = children[node_id]
        return subtree_clusters(l) | subtree_clusters(r)

    def link_color_func(node_id):
        clusters = subtree_clusters(int(node_id))
        if len(clusters) == 1:
            k = next(iter(clusters))
            return palette_hex[k % len(palette_hex)]
        return "#BBBBBB"

    return link_color_func


def plot_agglomerative_clustering_and_pca(
    df: pd.DataFrame,
    output_dir: str = "output",
    manual_n_clusters: Optional[int] = 5,
) -> None:
    """Run agglomerative clustering, save dendrogram and PCA cluster scatter.

    Parameters:
        df: Source frame containing the ``stats.spot.*`` columns.
        output_dir: Target directory for output files (created if missing).
        manual_n_clusters: Fixed cluster count; ``None`` picks a heuristic
            based on the number of clean rows.

    Outputs (written to ``output_dir``):
    - Clustering_Dendrogram.pdf
    - AgglomerativeClusters_PCA.pdf
    - cluster_assignments.csv
    """
    _ensure_dir(output_dir)

    clu_df = _build_feature_matrix(df)
    if clu_df is None:
        print("[clustering] Skipped: no relevant columns found.")
        return

    # Ward clustering needs complete rows; drop anything with a NaN feature.
    clu_df = clu_df.dropna()
    if clu_df.empty or clu_df.shape[0] <= 1:
        print("[clustering] Skipped: not enough clean data to cluster.")
        return

    labels_series, id_series = _row_labels(df, clu_df.index)

    def _fmt_label(formula, id_):
        # Truncate long formulas so dendrogram labels stay readable.
        s = str(formula) if pd.notna(formula) else ""
        s = s.strip()
        if len(s) > 60:
            s = s[:60] + "..."
        return f"{s} [{id_}]"

    labels_list = [_fmt_label(labels_series.loc[i], id_series.loc[i]) for i in clu_df.index]

    # Standardize features so Ward distances are not dominated by large counts.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(clu_df.values)
    Z = linkage(X_scaled, method="ward")

    if manual_n_clusters is not None:
        n_clusters = int(manual_n_clusters)
    else:
        # Heuristic: 5 clusters when enough rows, else roughly half the rows.
        n_clusters = 5 if clu_df.shape[0] >= 5 else max(2, clu_df.shape[0] // 2)

    agg = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward")
    labels = agg.fit_predict(X_scaled)
    palette = sns.color_palette("tab10", n_clusters)
    palette_hex = [to_hex(c) for c in palette]

    # Cut height: midway between the merge distances that separate
    # n_clusters and n_clusters - 1 clusters.
    if n_clusters > 1:
        d_high = Z[-(n_clusters - 1), 2]
        d_low = Z[-(n_clusters), 2] if n_clusters <= Z.shape[0] else d_high * 0.5
        cut_height = (d_high + d_low) / 2.0
    else:
        cut_height = 0.0

    # --- Dendrogram ------------------------------------------------------
    n_leaves = len(labels_list)
    fig_height = max(6, 0.28 * n_leaves + 2)  # grow the figure with leaf count
    fig = plt.figure(figsize=(12, fig_height))
    link_color_func = _make_link_color_func(Z, labels, palette_hex)
    dobj = dendrogram(
        Z,
        labels=labels_list,
        orientation="right",
        leaf_rotation=0,
        leaf_font_size=8,
        link_color_func=link_color_func,
    )
    if n_clusters > 1:
        plt.axvline(cut_height, linestyle="--", color="#444444", linewidth=1)
    # Tick labels come back in leaf order; color each by its cluster.
    ax = plt.gca()
    for tick, row_idx in zip(ax.get_yticklabels(), dobj['leaves']):
        k = int(labels[row_idx])
        tick.set_color(palette_hex[k % len(palette_hex)])
    plt.title("Agglomerative clustering dendrogram (Ward linkage)")
    plt.xlabel("Distance")
    plt.ylabel("")
    plt.tight_layout()
    plt.subplots_adjust(left=0.35)  # room for long formula labels
    fig.savefig(os.path.join(output_dir, "Clustering_Dendrogram.pdf"), format="pdf", bbox_inches="tight")
    plt.close(fig)

    # --- PCA projection --------------------------------------------------
    pca = PCA(n_components=2)
    coords = pca.fit_transform(X_scaled)
    fig2 = plt.figure(figsize=(12, 8))
    for k in sorted(set(labels)):
        idx = labels == k
        plt.scatter(
            coords[idx, 0], coords[idx, 1], alpha=0.8, edgecolor="k",
            label=f"Cluster {k}", color=palette_hex[k % len(palette_hex)]
        )
    # id_series is already aligned with clu_df.index (see _row_labels).
    ids_for_plot = id_series.astype(str).values
    for (x, y, lab) in zip(coords[:, 0], coords[:, 1], ids_for_plot):
        plt.text(x + 0.06, y + 0.06, lab, fontsize=7, alpha=0.9)
    plt.xlabel("PCA 1")
    plt.ylabel("PCA 2")
    plt.title("Agglomerative clusters (PCA projection)")
    plt.legend(loc="best", fontsize=8)
    plt.tight_layout()
    fig2.savefig(os.path.join(output_dir, "AgglomerativeClusters_PCA.pdf"), format="pdf", bbox_inches="tight")
    plt.close(fig2)

    # --- Assignments CSV -------------------------------------------------
    cluster_out = pd.DataFrame({
        "label": labels_series.values,
        "cluster": labels,
    }, index=clu_df.index)
    cluster_out.to_csv(os.path.join(output_dir, "cluster_assignments.csv"))
197+

tmp_plots/main.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
import os
2+
import pandas as pd
3+
4+
from plots import (
5+
plot_violin,
6+
plot_scatter,
7+
plot_upset,
8+
plot_stacked_barchart,
9+
plot_correlation_matrix,
10+
plot_extended_correlation_matrix,
11+
plot_extended_conditional_probability_matrix,
12+
plot_extended_jaccard_similarity_matrix,
13+
plot_frequent_itemset_mining,
14+
plot_semantic_features_frequency,
15+
)
16+
from clustering import plot_agglomerative_clustering_and_pca
17+
18+
19+
# Input spreadsheet expected in the project root (see README).
DATA_FILE = "spacewire_250919215922.xlsx"
# Directory that receives all generated PDFs/CSVs.
OUTPUT_DIR = "output"
21+
22+
23+
def main():
    """Load the spreadsheet and generate every configured plot into OUTPUT_DIR."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    df = pd.read_excel(DATA_FILE)

    # Standard plot suite; remove entries from this tuple to skip charts.
    standard_plots = (
        plot_violin,
        plot_scatter,
        plot_upset,
        plot_stacked_barchart,
        plot_correlation_matrix,
        plot_extended_correlation_matrix,
        plot_extended_conditional_probability_matrix,
        plot_extended_jaccard_similarity_matrix,
        plot_frequent_itemset_mining,
        plot_semantic_features_frequency,
    )
    for plot in standard_plots:
        plot(df, OUTPUT_DIR)

    # Clustering and PCA plots live in a separate module.
    plot_agglomerative_clustering_and_pca(df, OUTPUT_DIR, manual_n_clusters=5)


if __name__ == "__main__":
    main()
17.8 KB
Binary file not shown.
25.9 KB
Binary file not shown.
18.1 KB
Binary file not shown.
15.6 KB
Binary file not shown.
17.4 KB
Binary file not shown.

0 commit comments

Comments
 (0)