Skip to content

Commit 197dce9

Browse files
committed
fix parallel job 2 GPUs
1 parent bd14143 commit 197dce9

10 files changed

Lines changed: 272 additions & 29 deletions

GNN_Main.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,9 @@
5656
task_params[key] = int(value) if value.isdigit() else value
5757
else:
5858
best_model = ''
59-
task = 'train' # 'generate_train_test_plot_Claude' # 'train', 'test', 'generate', 'plot', 'train_NGP', 'train_INR', 'Claude'
60-
task_params = {'iterations': 512, 'experiment': 'experiment_convergence_7', 'llm_task': 'signal_Claude'}
61-
config_list = ['signal_chaotic_1']
59+
task = 'generate_train_test_plot_Claude' # 'train', 'test', 'generate', 'plot', 'train_NGP', 'train_INR', 'Claude'
60+
task_params = {'iterations': 512, 'experiment': 'experiment_convergence_7', 'llm_task': 'signal_Claude_bis'}
61+
config_list = ['signal_chaotic_2']
6262

6363
# parse parameters from task_params
6464
n_iterations = task_params.get('iterations', 5)
@@ -95,7 +95,7 @@
9595
print(f"\033[93mmodified {target_config}: dataset='{llm_task_name}', n_epochs=1, data_augmentation_loop=50, description='designed by Claude'\033[0m")
9696

9797
# delete ucb_scores.txt at start of experiment
98-
ucb_file = f"{root_dir}/ucb_scores.txt"
98+
ucb_file = f"{root_dir}/{llm_task_name}_ucb_scores.txt"
9999
if os.path.exists(ucb_file):
100100
os.remove(ucb_file)
101101
print(f"\033[93mdeleted {ucb_file}\033[0m")
@@ -114,7 +114,7 @@
114114
if 'Claude' in task:
115115
root_dir = os.path.dirname(os.path.abspath(__file__))
116116
experiment_path = f"{root_dir}/{experiment_name}.md"
117-
analysis_path = f"{root_dir}/analysis_{experiment_name}.md"
117+
analysis_path = f"{root_dir}/{llm_task_name}_analysis_{experiment_name}.md"
118118

119119
# check experiment file exists
120120
if not os.path.exists(experiment_path):
@@ -133,15 +133,15 @@
133133

134134
# analysis log file in root folder (for Claude to read)
135135
root_dir = os.path.dirname(os.path.abspath(__file__))
136-
analysis_log_path = f"{root_dir}/analysis.log"
136+
analysis_log_path = f"{root_dir}/{llm_task_name}_analysis.log"
137137

138138
for iteration in iteration_range:
139139
if 'Claude' in task:
140140
print(f"\n\n\n\033[94miteration {iteration}/{n_iterations}: {config_file_} ===\033[0m")
141141

142142
# block boundary: erase UCB at start of each 24-iteration block (except iter 1, already handled)
143143
if iteration > 1 and (iteration - 1) % 24 == 0:
144-
ucb_file = f"{root_dir}/ucb_scores.txt"
144+
ucb_file = f"{root_dir}/{llm_task_name}_ucb_scores.txt"
145145
if os.path.exists(ucb_file):
146146
os.remove(ucb_file)
147147
print(f"\033[93msimulation block boundary: deleted {ucb_file} (new simulation block)\\033[0m")
@@ -233,7 +233,7 @@
233233

234234
# claude analysis: reads activity.png and analysis.log, updates config per experiment protocol
235235
config_path = f"{root_dir}/config/{pre_folder}{config_file_}.yaml"
236-
ucb_path = f"{root_dir}/ucb_scores.txt"
236+
ucb_path = f"{root_dir}/{llm_task_name}_ucb_scores.txt"
237237

238238
# compute UCB scores for Claude to read
239239
compute_ucb_scores(analysis_path, ucb_path,
Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,2 @@
11
# Experiment Log: signal_Claude
22

3-
## Iter 1: converged
4-
--- NEW SIMULATION BLOCK ---
5-
Simulation: connectivity_type=chaotic, Dale_law=False, noise_model_level=0
6-
Node: id=1, parent=root
7-
Mode/Strategy: exploit
8-
Config: lr_W=2.0E-3, lr=1.0E-4, lr_emb=2.5E-4, coeff_W_L1=1.0E-5, batch_size=8
9-
Metrics: test_R2=0.9981, test_pearson=0.9937, connectivity_R2=0.9914, final_loss=5.72E+02
10-
Activity: chaotic dynamics, range [-20.3, 23.9], effective rank 30, 100 neurons distinguishable
11-
Mutation: baseline config (first iteration)
12-
Parent rule: UCB file shows only Node 1 as current; this was baseline from root
13-
Observation: excellent convergence on baseline chaotic config, GNN successfully recovers connectivity
14-
Next: parent=1
15-

analysis_experiment_convergence_7.md

Lines changed: 0 additions & 2 deletions
This file was deleted.

config/signal/signal_Claude.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ training:
5959
cluster_method: "distance_plot"
6060
fix_cluster_embedding: True
6161

62-
learning_rate_W_start: 3.0E-3
62+
learning_rate_W_start: 2.0E-3
6363
learning_rate_start: 1.0E-4
6464
learning_rate_embedding_start: 2.5E-4
6565

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
description: "designed by Claude"
2+
dataset: 'signal_Claude_bis'
3+
4+
simulation:
5+
connectivity_type: "chaotic"
6+
connectivity_init: [0, 0.1]
7+
# params: [a, b, g, s, w, h]
8+
# a: decay, b: offset, g: gain, s: self-recurrence, w: width, h: threshold MLP1((u-h)/w)
9+
params: [[1.0, 0.0, 7.0, 0.0, 1.0, 0.0], [1.0, 0.0, 7.0, 0.0, 1.0, 0.0]]
10+
phi: "tanh"
11+
n_neurons: 1000
12+
n_neuron_types: 1
13+
n_frames: 100000
14+
delta_t: 0.01
15+
dpos_init: 0
16+
boundary: "no"
17+
start_frame: -100
18+
19+
graph_model:
20+
signal_model_name: "PDE_N4"
21+
particle_model_name: ""
22+
mesh_model_name: ""
23+
prediction: "first_derivative"
24+
25+
input_size: 3
26+
output_size: 1
27+
hidden_dim: 64
28+
n_layers: 3
29+
30+
input_size_update: 3
31+
n_layers_update: 3
32+
hidden_dim_update: 64
33+
34+
aggr_type: "add"
35+
embedding_dim: 2
36+
update_type: "none"
37+
38+
plotting:
39+
colormap: "tab10"
40+
arrow_length: 1
41+
xlim: [-5, 5]
42+
ylim: [-2, 2]
43+
44+
training:
45+
n_epochs: 1
46+
n_runs: 1
47+
device: "cuda:1"
48+
49+
batch_size: 8
50+
small_init_batch_size: False
51+
training_single_type: True
52+
53+
seed: 24
54+
55+
data_augmentation_loop: 50
56+
57+
sparsity: "replace_embedding_function"
58+
sparsity_freq: 4
59+
cluster_method: "distance_plot"
60+
fix_cluster_embedding: True
61+
62+
learning_rate_W_start: 2.0E-3
63+
learning_rate_start: 1.0E-4
64+
learning_rate_embedding_start: 2.5E-4
65+
66+
learning_rate_NNR_f: 1.0E-6
67+
68+
coeff_W_L1: 1.0E-5
69+
coeff_edge_norm: 0.0
70+
coeff_edge_diff: 100

config/signal/signal_chaotic_1.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ plotting:
4444
training:
4545
n_epochs: 10
4646
n_runs: 1
47-
device: "cuda:1"
47+
device: "auto"
4848

4949
batch_size: 8
5050
small_init_batch_size: False

experiment_convergence_7.md

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
# Simulation-GNN training Landscape Study
2+
3+
## Goal
4+
5+
Map the **simulation-GNN training landscape**: understand which simulation configurations allow successful GNN training (connectivity_R2 > 0.9) and which simulation configurations are fundamentally harder for GNN training.
6+
When can a GNN recover synaptic weights from simulated data?
7+
8+
## Context
9+
10+
You are an LLM acting as a **hyperparameter optimizer** in a meta-learning loop. Your role:
11+
12+
1. **Analyze results**: Read activity plots and metrics from the current GNN training run
13+
2. **Update config**: Modify training parameters for the next iteration based on UCB scores
14+
3. **Log decisions**: Append structured observations to the analysis file
15+
4. **Self-improve**: At simulation block boundaries, you are asked to edit THIS protocol file to refine your own exploration rules
16+
17+
### Simulation Blocks
18+
19+
Each block = 24 iterations exploring one simulation configuration.
20+
21+
- **Within block (iter 1-24, 25-48, ...)**: Only modify training parameters (learning rates, regularization, batch size)
22+
- **At block boundaries (iter 25, 49, 73...)**:
23+
- Summarize what worked/failed in previous block
24+
- Change simulation parameters (connectivity_type, Dale_law, noise_model_level)
25+
- UCB tree resets (parent=root for first iteration of new block)
26+
27+
At block boundaries, add:
28+
29+
```
30+
## Iter N: [status]
31+
--- NEW SIMULATION BLOCK ---
32+
Simulation: connectivity_type=[type], Dale_law=[True/False], Dale_law_factor=[F], connectivity_rank = [R] if connectivity_type='low_rank', noise_model_level=[L]
33+
Node: id=N, parent=root
34+
```
35+
36+
### Simulation block Summary
37+
38+
1. Did this simulation regime converge?
39+
2. What training configs worked best?
40+
3. Comparison to previous blocks
41+
4. Remains to be explored
42+
43+
```
44+
## Simulation block N Summary (iters X-Y)
45+
46+
Simulation: [connectivity_type], [n_types] types, noise=[level]
47+
Best R2: [value] at iter [N]
48+
Observation: [four lines about what worked/failed for this simulation]
49+
Optimum training parameters: [learning_rate_W_start, learning_rate_start, learning_rate_embedding_start, coeff_W_L1: 1.0E-5]
50+
51+
```
52+
53+
## MANDATORY: Block Boundary Actions (iter 25, 49, 73, ...)
54+
55+
At the **first iteration of each new block**, you MUST complete ALL of these actions:
56+
57+
### Checklist (complete in order):
58+
59+
- [ ] **1. Write block summary** for the previous block (see "Simulation block Summary" format above)
60+
- [ ] **2. Evaluate exploration rules** using metrics below
61+
- [ ] **3. EDIT THIS PROTOCOL FILE** - modify the rules between `## Parent Selection Rule (CRITICAL)` and `## END Parent selection Rule (CRITICAL)`
62+
- [ ] **4. Document your edit** - in the analysis file, state what you changed and why (or state "No changes needed" with justification)
63+
64+
### Evaluation Metrics for Rule Modification:
65+
66+
1. **Branching rate**: Count unique parents in last 6 iters
67+
- If all sequential (rate=0%) → ADD exploration incentive to rules
68+
2. **Improvement rate**: How many iters improved R²?
69+
- If <30% improving → INCREASE exploitation (raise R² threshold)
70+
- If >80% improving → INCREASE exploration (probe boundaries)
71+
3. **Stuck detection**: Same R² plateau (±0.05) for 3+ iters?
72+
- If yes → ADD forced branching rule
73+
74+
### Example Protocol Edit:
75+
76+
If branching rate was 0% (all sequential), you might add a new row to the strategy table:
77+
78+
**Before:**
79+
```
80+
| Default | **exploit** | Use highest UCB node, try new mutation |
81+
```
82+
83+
**After:**
84+
```
85+
| Default | **exploit** | Use highest UCB node, try new mutation |
86+
| Branching rate < 20% in last block | **force-branch** | Select random node from top 3 UCB, not the sequential parent|
87+
```
88+
89+
Or modify threshold values, add new conditions, remove ineffective rules, etc.
90+
91+
**IMPORTANT**: You must actually use the Edit tool to modify this file. Simply stating what you would change is NOT sufficient.
92+
93+
## Analysis of Files
94+
95+
- `analysis.log`: metrics from training/test/plot:
96+
- `spectral_radius`: eigenvalue analysis of connectivity
97+
- `svd_rank`: SVD rank at 99% variance (activity complexity)
98+
- `test_R2`: R² between ground truth and rollout prediction
99+
- `test_pearson`: Pearson correlation per neuron (mean)
100+
- `connectivity_R2`: R² of learned vs true connectivity weights
101+
- `final_loss`: final training loss (lower is better)
102+
- `ucb_scores.txt`: provides pre-computed UCB scores for all nodes including current iteration
103+
at block boundaries, the UCB file will be empty (erased). When UCB file is empty, use `parent=root`.
104+
105+
```
106+
107+
Node 2: UCB=2.175, parent=1, visits=1, R2=0.997 [CURRENT]
108+
Node 1: UCB=2.110, parent=root, visits=2, R2=0.934
109+
110+
```
111+
112+
- `Node N`:
113+
- `UCB`: Upper Confidence Bound score = R² + c×√(log(N_total)/visits); higher = more promising to explore
114+
- `parent`: which node's config was mutated to create this node (root = baseline config)
115+
- `visits`: how many times this node or its descendants have been explored
116+
- `R2`: connectivity_R2 achieved by this node's config
117+
118+
## Classification
119+
120+
- **Converged**: connectivity_R2 > 0.9
121+
- **Partial**: connectivity_R2 0.1-0.9
122+
- **Failed**: connectivity_R2 < 0.1
123+
124+
## Simulation Parameters to explore
125+
126+
These parameters affect the **data generation** (simulation). Only change at block boundaries.
127+
128+
```yaml
129+
simulation:
130+
connectivity_type: "chaotic" # or "low_rank"
131+
Dale_law: True # enforce excitatory/inhibitory separation
132+
Dale_law_factor: 0.5 # fraction excitatory/inhibitory (0.1 to 0.9)
133+
connectivity_rank: 20 # only used when connectivity_type="low_rank", range 5-100
134+
# noise_model_level: 0.0 # noise added during simulation, affects data complexity. values: 0, 0.5, 1
135+
```
136+
137+
## Training Parameters to explore
138+
139+
These parameters affect the **GNN training**. Can be changed within a block.
140+
141+
```yaml
142+
training:
143+
learning_rate_W_start: 2.0E-3 # LR for connectivity weights W range: 1.0E-4 to 1.0E-2
144+
learning_rate_start: 1.0E-4 # LR for model parameters range: 1.0E-5 to 1.0E-3
145+
learning_rate_embedding_start: 2.5E-4 # LR for embeddings range: 1.0E-5 to 1.0E-3, only if n_neuron_types > 1
146+
coeff_W_L1: 1.0E-5 # L1 regularization on W range: 1.0E-6 to 1.0E-3
147+
batch_size: 8 # batch size values: 8, 16, 32
148+
```
149+
150+
## Parent Selection Rule (CRITICAL)
151+
152+
**Step 1: Select parent node to continue**
153+
154+
- Use `ucb_scores.txt` to select a new node
155+
- If UCB file is empty → `parent=root`
156+
- Otherwise → select node with **highest UCB** as parent
157+
158+
**Step 2: Choose exploration strategy**
159+
160+
| Condition | Strategy | Action |
161+
| ----------------------------------- | ------------------- | ----------------------------------------------------------- |
162+
| Default | **exploit** | Use highest UCB node, try new mutation |
163+
| 3+ consecutive successes (R² ≥ 0.9) | **failure-probe** | Deliberately try extreme parameter to find failure boundary |
164+
| 6+ consecutive successes (R² ≥ 0.9) | **explore** | Use highest UCB node not last 6 nodes, try new mutation |
165+
| Found good config | **robustness-test** | Re-run same config (no mutation) to verify reproducibility |
166+
167+
**failure-probe**: After multiple successes, intentionally push parameters to extremes (e.g., 10x lr, 0.1x lr) to map where the config breaks. This helps understand the stability region.
168+
169+
**robustness-test**: Duplicate the best iteration with identical config to verify the result is reproducible, not due to lucky initialization.
170+
171+
**Reversion check**: If reverting a parameter to match a previous node's value, use that node as parent.
172+
Example: If reverting `lr` back to `1E-4` (Node 2's value), use `parent=2`.
173+
174+
## END Parent selection Rule (CRITICAL)
175+
176+
## Log Format
177+
178+
```
179+
## Iter N: [converged/partial/failed]
180+
Node: id=N, parent=P
181+
Mode/Strategy: [success-exploit/failure-probe]/[exploit/explore/boundary]
182+
Config: lr_W=X, lr=Y, lr_emb=Z, coeff_W_L1=W, batch_size=B
183+
Metrics: test_R2=A, test_pearson=B, connectivity_R2=C, final_loss=D
184+
Activity: [brief description of dynamics]
185+
Mutation: [param]: [old] -> [new]
186+
Parent rule: [brief description of Parent Selection Rule]
187+
Observation: [one line about result]
188+
Next: parent=P [CRITICAL: specify which node the NEXT iteration should branch from]
189+
```
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Experiment Log: signal_Claude_bis
2+

src/NeuralGraph/generators/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ def init_neurons(config=[], scenario='none', ratio=1, device=[]):
8181

8282
xc, yc = get_equidistant_points(n_points=n_neurons)
8383
pos = torch.tensor(np.stack((xc, yc), axis=1), dtype=torch.float32, device=device) / 2
84-
perm = torch.randperm(pos.size(0))
84+
perm = torch.randperm(pos.size(0), device=device)
8585
pos = pos[perm]
8686

8787
dpos = dpos_init * torch.randn((n_neurons, dimension), device=device)

ucb_scores.txt

Lines changed: 0 additions & 3 deletions
This file was deleted.

0 commit comments

Comments
 (0)