# Source: fieldflow/sample_config.toml at main (RiceAstroparticleLab/fieldflow, GitHub)
# FieldFlow Configuration File
# This is a sample configuration file for training continuous normalizing flow
# models for electric field modeling. Copy this file and modify the parameters
# as needed for your specific experiment.

# Optional metadata used to label and track this experiment
experiment_name = "sample_experiment"
description = "Sample configuration for FieldFlow training"

[model]
# Model architecture parameters

# Dimensionality of the input data (2 = x,y coordinates).
data_size = 2
# Exact log-probability computation: more accurate but slower.
exact_logp = true
# Width of the neural network's hidden layers.
width_size = 256
# Number of hidden layers in the neural network.
depth = 16
# Use the scalar potential model.
scalar = true

# ODE solver settings; these control the accuracy and efficiency of the flow.

# true = adaptive PIDController (recommended); false = constant step size.
use_pid_controller = false
# Relative and absolute tolerances for the PIDController
# (smaller = more accurate, slower).
rtol = 1e-3
atol = 1e-6
# Maximum and minimum step sizes for the PIDController.
dtmax = 2.0
dtmin = 0.05

# Time integration parameters

# Starting time for the ODE integration.
t0 = 0.0
# End time for the extract phase.
extract_t1 = 10.0
# Initial time step size.
dt0 = 0.5

[training]
# Multi-GPU training: set num_devices > 1 to enable data parallelization
# across multiple GPUs.
# - num_devices = 1: single-GPU training (default, backward compatible)
# - num_devices >= 2: multi-GPU training with automatic data sharding
# - for optimal performance, batch_size should be divisible by num_devices
# - memory efficient: the full dataset stays on the CPU, only batches go to the GPU
# - example configurations:
#   * 4 GPUs: num_devices = 4, batch_size = 2048 (512 samples per GPU)
#   * 2 GPUs: num_devices = 2, batch_size = 1024 (512 samples per GPU)

# Training process parameters

# Random seed for reproducibility.
seed = 42
# Initial learning rate (will be scheduled during training).
learning_rate = 1e-5
# L2 regularization parameter.
weight_decay = 1e-4
# Number of training epochs.
epochs = 300
# Learning-rate scheduling, intended for training from scratch. When false,
# a constant LR = learning_rate * 0.01 is used instead; set to false when
# loading a pretrained model to continue training.
enable_scheduler = true

# Data and batching parameters

# Training batch size (adjust based on GPU memory).
batch_size = 1024
# Number of GPUs for data parallelization, e.g. 1 (single GPU), 2 (dual GPU),
# 4 (quad GPU), 8 (octa GPU). batch_size should be divisible by num_devices
# for optimal performance.
num_devices = 1
# Number of samples per instance for likelihood estimation.
n_samples = 16
# Size of the training set.
n_train = 65_536
# Size of the test/validation set.
n_test = 4096

# Training strategy parameters

# Use the best model based on validation loss (recommended).
use_best = true
# Weight of the curl penalty (encourages curl-free fields).
curl_loss_multiplier = 1000.0
# Scaling factor for z-dimension coordinates.
z_scale = 5.0
# Gradient accumulation steps for the MultiSteps optimizer.
multisteps_every_k = 2

[experiment]
# Physical experimental setup parameters. Both lengths are in cm.

# Height of the TPC; used for filtering z coordinates.
tpc_height = 259.92
# Radius of the TPC; used for boundary constraints.
tpc_r = 129.96

[posrec]
# Position reconstruction flow model parameters. These should match the
# pretrained position reconstruction model.

# Number of coupling layers in the flow.
flow_layers = 6
# Width and depth of the neural networks in the coupling layers.
nn_width = 128
nn_depth = 6
# Whether to invert the flow (should match the pretrained model).
invert_bool = false
# Conditioning dimension (should match the hit pattern size).
cond_dim = 860

# Spline transformation parameters

# Number of knots for the rational quadratic splines.
spline_knots = 5
# Interval for the spline transformations.
spline_interval = 5.0

# Coordinate transformation parameters

# Buffer, in cm, for predictions beyond the TPC radius.
radius_buffer = 0.0