Skip to content

Commit 197dce9

Browse files
committed
fix parallel job 2 GPUs
1 parent bd14143 commit 197dce9

10 files changed

Lines changed: 272 additions & 29 deletions

GNN_Main.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,9 @@
5656
task_params[key] = int(value) if value.isdigit() else value
5757
else:
5858
best_model = ''
59-
task = 'train' # 'generate_train_test_plot_Claude' # 'train', 'test', 'generate', 'plot', 'train_NGP', 'train_INR', 'Claude'
60-
task_params = {'iterations': 512, 'experiment': 'experiment_convergence_7', 'llm_task': 'signal_Claude'}
61-
config_list = ['signal_chaotic_1']
59+
task = 'generate_train_test_plot_Claude' # 'train', 'test', 'generate', 'plot', 'train_NGP', 'train_INR', 'Claude'
60+
task_params = {'iterations': 512, 'experiment': 'experiment_convergence_7', 'llm_task': 'signal_Claude_bis'}
61+
config_list = ['signal_chaotic_2']
6262

6363
# parse parameters from task_params
6464
n_iterations = task_params.get('iterations', 5)
@@ -95,7 +95,7 @@
9595
print(f"\033[93mmodified {target_config}: dataset='{llm_task_name}', n_epochs=1, data_augmentation_loop=50, description='designed by Claude'\033[0m")
9696

9797
# delete ucb_scores.txt at start of experiment
98-
ucb_file = f"{root_dir}/ucb_scores.txt"
98+
ucb_file = f"{root_dir}/{llm_task_name}_ucb_scores.txt"
9999
if os.path.exists(ucb_file):
100100
os.remove(ucb_file)
101101
print(f"\033[93mdeleted {ucb_file}\033[0m")
@@ -114,7 +114,7 @@
114114
if 'Claude' in task:
115115
root_dir = os.path.dirname(os.path.abspath(__file__))
116116
experiment_path = f"{root_dir}/{experiment_name}.md"
117-
analysis_path = f"{root_dir}/analysis_{experiment_name}.md"
117+
analysis_path = f"{root_dir}/{llm_task_name}_analysis_{experiment_name}.md"
118118

119119
# check experiment file exists
120120
if not os.path.exists(experiment_path):
@@ -133,15 +133,15 @@
133133

134134
# analysis log file in root folder (for Claude to read)
135135
root_dir = os.path.dirname(os.path.abspath(__file__))
136-
analysis_log_path = f"{root_dir}/analysis.log"
136+
analysis_log_path = f"{root_dir}/{llm_task_name}_analysis.log"
137137

138138
for iteration in iteration_range:
139139
if 'Claude' in task:
140140
print(f"\n\n\n\033[94miteration {iteration}/{n_iterations}: {config_file_} ===\033[0m")
141141

142142
# block boundary: erase UCB at start of each 24-iteration block (except iter 1, already handled)
143143
if iteration > 1 and (iteration - 1) % 24 == 0:
144-
ucb_file = f"{root_dir}/ucb_scores.txt"
144+
ucb_file = f"{root_dir}/{llm_task_name}_ucb_scores.txt"
145145
if os.path.exists(ucb_file):
146146
os.remove(ucb_file)
147147
print(f"\033[93msimulation block boundary: deleted {ucb_file} (new simulation block)\\033[0m")
@@ -233,7 +233,7 @@
233233

234234
# claude analysis: reads activity.png and analysis.log, updates config per experiment protocol
235235
config_path = f"{root_dir}/config/{pre_folder}{config_file_}.yaml"
236-
ucb_path = f"{root_dir}/ucb_scores.txt"
236+
ucb_path = f"{root_dir}/{llm_task_name}_ucb_scores.txt"
237237

238238
# compute UCB scores for Claude to read
239239
compute_ucb_scores(analysis_path, ucb_path,
Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,2 @@
11
# Experiment Log: signal_Claude
22

3-
## Iter 1: converged
4-
--- NEW SIMULATION BLOCK ---
5-
Simulation: connectivity_type=chaotic, Dale_law=False, noise_model_level=0
6-
Node: id=1, parent=root
7-
Mode/Strategy: exploit
8-
Config: lr_W=2.0E-3, lr=1.0E-4, lr_emb=2.5E-4, coeff_W_L1=1.0E-5, batch_size=8
9-
Metrics: test_R2=0.9981, test_pearson=0.9937, connectivity_R2=0.9914, final_loss=5.72E+02
10-
Activity: chaotic dynamics, range [-20.3, 23.9], effective rank 30, 100 neurons distinguishable
11-
Mutation: baseline config (first iteration)
12-
Parent rule: UCB file shows only Node 1 as current; this was baseline from root
13-
Observation: excellent convergence on baseline chaotic config, GNN successfully recovers connectivity
14-
Next: parent=1
15-

analysis_experiment_convergence_7.md

Lines changed: 0 additions & 2 deletions
This file was deleted.

config/signal/signal_Claude.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ training:
5959
cluster_method: "distance_plot"
6060
fix_cluster_embedding: True
6161

62-
learning_rate_W_start: 3.0E-3
62+
learning_rate_W_start: 2.0E-3
6363
learning_rate_start: 1.0E-4
6464
learning_rate_embedding_start: 2.5E-4
6565

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
description: "designed by Claude"
2+
dataset: 'signal_Claude_bis'
3+
4+
simulation:
5+
connectivity_type: "chaotic"
6+
connectivity_init: [0, 0.1]
7+
# params: [a, b, g, s, w, h]
8+
# a: decay, b: offset, g: gain, s: self-recurrence, w: width, h: threshold MLP1((u-h)/w)
9+
params: [[1.0, 0.0, 7.0, 0.0, 1.0, 0.0], [1.0, 0.0, 7.0, 0.0, 1.0, 0.0]]
10+
phi: "tanh"
11+
n_neurons: 1000
12+
n_neuron_types: 1
13+
n_frames: 100000
14+
delta_t: 0.01
15+
dpos_init: 0
16+
boundary: "no"
17+
start_frame: -100
18+
19+
graph_model:
20+
signal_model_name: "PDE_N4"
21+
particle_model_name: ""
22+
mesh_model_name: ""
23+
prediction: "first_derivative"
24+
25+
input_size: 3
26+
output_size: 1
27+
hidden_dim: 64
28+
n_layers: 3
29+
30+
input_size_update: 3
31+
n_layers_update: 3
32+
hidden_dim_update: 64
33+
34+
aggr_type: "add"
35+
embedding_dim: 2
36+
update_type: "none"
37+
38+
plotting:
39+
colormap: "tab10"
40+
arrow_length: 1
41+
xlim: [-5, 5]
42+
ylim: [-2, 2]
43+
44+
training:
45+
n_epochs: 1
46+
n_runs: 1
47+
device: "cuda:1"
48+
49+
batch_size: 8
50+
small_init_batch_size: False
51+
training_single_type: True
52+
53+
seed: 24
54+
55+
data_augmentation_loop: 50
56+
57+
sparsity: "replace_embedding_function"
58+
sparsity_freq: 4
59+
cluster_method: "distance_plot"
60+
fix_cluster_embedding: True
61+
62+
learning_rate_W_start: 2.0E-3
63+
learning_rate_start: 1.0E-4
64+
learning_rate_embedding_start: 2.5E-4
65+
66+
learning_rate_NNR_f: 1.0E-6
67+
68+
coeff_W_L1: 1.0E-5
69+
coeff_edge_norm: 0.0
70+
coeff_edge_diff: 100

config/signal/signal_chaotic_1.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ plotting:
4444
training:
4545
n_epochs: 10
4646
n_runs: 1
47-
device: "cuda:1"
47+
device: "auto"
4848

4949
batch_size: 8
5050
small_init_batch_size: False

experiment_convergence_7.md

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
# Simulation-GNN training Landscape Study
2+
3+
## Goal
4+
5+
Map the **simulation-GNN training landscape**: understand which simulation configurations allow successful GNN training (connectivity_R2 > 0.9) and which simulation configurations are fundamentally harder for GNN training.
6+
When can a GNN recover synaptic weights from simulated data?
7+
8+
## Context
9+
10+
You are an LLM acting as a **hyperparameter optimizer** in a meta-learning loop. Your role:
11+
12+
1. **Analyze results**: Read activity plots and metrics from the current GNN training run
13+
2. **Update config**: Modify training parameters for the next iteration based on UCB scores
14+
3. **Log decisions**: Append structured observations to the analysis file
15+
4. **Self-improve**: At simulation block boundaries, you are asked to edit THIS protocol file to refine your own exploration rules
16+
17+
### Simulation Blocks
18+
19+
Each block = 24 iterations exploring one simulation configuration.
20+
21+
- **Within block (iter 1-24, 25-48, ...)**: Only modify training parameters (learning rates, regularization, batch size)
22+
- **At block boundaries (iter 25, 49, 73...)**:
23+
- Summarize what worked/failed in previous block
24+
- Change simulation parameters (connectivity_type, Dale_law, noise_model_level)
25+
- UCB tree resets (parent=root for first iteration of new block)
26+
27+
At block boundaries, add:
28+
29+
```
30+
## Iter N: [status]
31+
--- NEW SIMULATION BLOCK ---
32+
Simulation: connectivity_type=[type], Dale_law=[True/False], Dale_law_factor=[F], connectivity_rank = [R] if connectivity_type='low_rank', noise_model_level=[L]
33+
Node: id=N, parent=root
34+
```
35+
36+
### Simulation block Summary
37+
38+
1. Did this simulation regime converge?
39+
2. What training configs worked best?
40+
3. Comparison to previous blocks
41+
4. Remains to be explored
42+
43+
```
44+
## Simulation block N Summary (iters X-Y)
45+
46+
Simulation: [connectivity_type], [n_types] types, noise=[level]
47+
Best R2: [value] at iter [N]
48+
Observation: [four lines about what worked/failed for this simulation]
49+
Optimum training parameters: [learning_rate_W_start, learning_rate_start, learning_rate_embedding_start, coeff_W_L1: 1.0E-5]
50+
51+
```
52+
53+
## MANDATORY: Block Boundary Actions (iter 25, 49, 73, ...)
54+
55+
At the **first iteration of each new block**, you MUST complete ALL of these actions:
56+
57+
### Checklist (complete in order):
58+
59+
- [ ] **1. Write block summary** for the previous block (see "Simulation block Summary" format above)
60+
- [ ] **2. Evaluate exploration rules** using metrics below
61+
- [ ] **3. EDIT THIS PROTOCOL FILE** - modify the rules between `## Parent Selection Rule (CRITICAL)` and `## END Parent selection Rule (CRITICAL)`
62+
- [ ] **4. Document your edit** - in the analysis file, state what you changed and why (or state "No changes needed" with justification)
63+
64+
### Evaluation Metrics for Rule Modification:
65+
66+
1. **Branching rate**: Count unique parents in last 6 iters
67+
- If all sequential (rate=0%) → ADD exploration incentive to rules
68+
2. **Improvement rate**: How many iters improved R²?
69+
- If <30% improving → INCREASE exploitation (raise R² threshold)
70+
- If >80% improving → INCREASE exploration (probe boundaries)
71+
3. **Stuck detection**: Same R² plateau (±0.05) for 3+ iters?
72+
- If yes → ADD forced branching rule
73+
74+
### Example Protocol Edit:
75+
76+
If branching rate was 0% (all sequential), you might add a new row to the strategy table:
77+
78+
**Before:**
79+
```
80+
| Default | **exploit** | Use highest UCB node, try new mutation |
81+
```
82+
83+
**After:**
84+
```
85+
| Default | **exploit** | Use highest UCB node, try new mutation |
86+
| Branching rate < 20% in last block | **force-branch** | Select random node from top 3 UCB, not the sequential parent|
87+
```
88+
89+
Or modify threshold values, add new conditions, remove ineffective rules, etc.
90+
91+
**IMPORTANT**: You must actually use the Edit tool to modify this file. Simply stating what you would change is NOT sufficient.
92+
93+
## Analysis of Files
94+
95+
- `analysis.log`: metrics from training/test/plot:
96+
- `spectral_radius`: eigenvalue analysis of connectivity
97+
- `svd_rank`: SVD rank at 99% variance (activity complexity)
98+
- `test_R2`: R² between ground truth and rollout prediction
99+
- `test_pearson`: Pearson correlation per neuron (mean)
100+
- `connectivity_R2`: R² of learned vs true connectivity weights
101+
- `final_loss`: final training loss (lower is better)
102+
- `ucb_scores.txt`: provides pre-computed UCB scores for all nodes including current iteration
103+
at block boundaries, the UCB file will be empty (erased). When UCB file is empty, use `parent=root`.
104+
105+
```
106+
107+
Node 2: UCB=2.175, parent=1, visits=1, R2=0.997 [CURRENT]
108+
Node 1: UCB=2.110, parent=root, visits=2, R2=0.934
109+
110+
```
111+
112+
- `Node N`:
113+
- `UCB`: Upper Confidence Bound score = R² + c×√(log(N_total)/visits); higher = more promising to explore
114+
- `parent`: which node's config was mutated to create this node (root = baseline config)
115+
- `visits`: how many times this node or its descendants have been explored
116+
- `R2`: connectivity_R2 achieved by this node's config
117+
118+
## Classification
119+
120+
- **Converged**: connectivity_R2 > 0.9
121+
- **Partial**: connectivity_R2 0.1-0.9
122+
- **Failed**: connectivity_R2 < 0.1
123+
124+
## Simulation Parameters to explore
125+
126+
These parameters affect the **data generation** (simulation). Only change at block boundaries.
127+
128+
```yaml
129+
simulation:
130+
connectivity_type: "chaotic" # or "low_rank"
131+
Dale_law: True # enforce excitatory/inhibitory separation
132+
Dale_law_factor: 0.5 # fraction excitatory/inhibitory (0.1 to 0.9)
133+
connectivity_rank: 20 # only used when connectivity_type="low_rank", range 5-100
134+
# noise_model_level: 0.0 # noise added during simulation, affects data complexity. values: 0, 0.5, 1
135+
```
136+
137+
## Training Parameters to explore
138+
139+
These parameters affect the **GNN training**. Can be changed within a block.
140+
141+
```yaml
142+
training:
143+
learning_rate_W_start: 2.0E-3 # LR for connectivity weights W range: 1.0E-4 to 1.0E-2
144+
learning_rate_start: 1.0E-4 # LR for model parameters range: 1.0E-5 to 1.0E-3
145+
learning_rate_embedding_start: 2.5E-4 # LR for embeddings range: 1.0E-5 to 1.0E-3, only if n_neuron_types > 1
146+
coeff_W_L1: 1.0E-5 # L1 regularization on W range: 1.0E-6 to 1.0E-3
147+
batch_size: 8 # batch size values: 8, 16, 32
148+
```
149+
150+
## Parent Selection Rule (CRITICAL)
151+
152+
**Step 1: Select parent node to continue**
153+
154+
- Use `ucb_scores.txt` to select a new node
155+
- If UCB file is empty → `parent=root`
156+
- Otherwise → select node with **highest UCB** as parent
157+
158+
**Step 2: Choose exploration strategy**
159+
160+
| Condition | Strategy | Action |
161+
| ----------------------------------- | ------------------- | ----------------------------------------------------------- |
162+
| Default | **exploit** | Use highest UCB node, try new mutation |
163+
| 3+ consecutive successes (R² ≥ 0.9) | **failure-probe** | Deliberately try extreme parameter to find failure boundary |
164+
| 6+ consecutive successes (R² ≥ 0.9) | **explore** | Use highest UCB node not last 6 nodes, try new mutation |
165+
| Found good config | **robustness-test** | Re-run same config (no mutation) to verify reproducibility |
166+
167+
**failure-probe**: After multiple successes, intentionally push parameters to extremes (e.g., 10x lr, 0.1x lr) to map where the config breaks. This helps understand the stability region.
168+
169+
**robustness-test**: Duplicate the best iteration with identical config to verify the result is reproducible, not due to lucky initialization.
170+
171+
**Reversion check**: If reverting a parameter to match a previous node's value, use that node as parent.
172+
Example: If reverting `lr` back to `1E-4` (Node 2's value), use `parent=2`.
173+
174+
## END Parent selection Rule (CRITICAL)
175+
176+
## Log Format
177+
178+
```
179+
## Iter N: [converged/partial/failed]
180+
Node: id=N, parent=P
181+
Mode/Strategy: [success-exploit/failure-probe]/[exploit/explore/boundary]
182+
Config: lr_W=X, lr=Y, lr_emb=Z, coeff_W_L1=W, batch_size=B
183+
Metrics: test_R2=A, test_pearson=B, connectivity_R2=C, final_loss=D
184+
Activity: [brief description of dynamics]
185+
Mutation: [param]: [old] -> [new]
186+
Parent rule: [brief description of Parent Selection Rule]
187+
Observation: [one line about result]
188+
Next: parent=P [CRITICAL: specify which node the NEXT iteration should branch from]
189+
```
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Experiment Log: signal_Claude_bis
2+

src/NeuralGraph/generators/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ def init_neurons(config=[], scenario='none', ratio=1, device=[]):
8181

8282
xc, yc = get_equidistant_points(n_points=n_neurons)
8383
pos = torch.tensor(np.stack((xc, yc), axis=1), dtype=torch.float32, device=device) / 2
84-
perm = torch.randperm(pos.size(0))
84+
perm = torch.randperm(pos.size(0), device=device)
8585
pos = pos[perm]
8686

8787
dpos = dpos_init * torch.randn((n_neurons, dimension), device=device)

ucb_scores.txt

Lines changed: 0 additions & 3 deletions
This file was deleted.

0 commit comments

Comments
 (0)