Skip to content

Commit e397e4b

Browse files
committed
Merge origin/main and make regex caps burst-proof
2 parents 4f6b17a + 76a8772 commit e397e4b

37 files changed

+2070
-1006
lines changed

DEVELOPMENT.md

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -292,10 +292,10 @@ build:
292292

293293
### Local OpenTelemetry (optional)
294294

295-
To collect traces and metrics locally, run the Grafana LGTM stack (Loki, Grafana, Tempo, Mimir):
295+
To collect traces and metrics locally, run the LGTM stack (Loki, Tempo, Mimir):
296296

297297
```bash
298-
# Start Grafana LGTM (UI at http://localhost:3000, login: admin/admin)
298+
# Start LGTM (UI at http://localhost:3000, login: admin/admin)
299299
# Note: if you are developing on a shared server, you can share a single LGTM stack with your peers
300300
# You will be able to filter your metrics, traces, and logs using the ENV configuration (see below)
301301
BIND=127.0.0.1
@@ -323,15 +323,7 @@ docker run -d --name lgtm \
323323
make dev
324324
```
325325

326-
Open http://localhost:3000 to view traces (Tempo), metrics (Mimir), and logs (Loki) in Grafana.
327-
328-
**Import the Hypeman dashboard:**
329-
330-
1. Go to Dashboards → New → Import
331-
2. Upload `dashboards/hypeman.json` or paste its contents
332-
3. Select the Prometheus datasource and click Import
333-
334-
Use the Environment/Instance dropdowns to filter by `deployment.environment` or `service.instance.id`.
326+
Open http://localhost:3000 to view traces (Tempo), metrics (Mimir), and logs (Loki).
335327

336328
## Testing
337329

cmd/api/config/config.go

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -98,10 +98,11 @@ type APIConfig struct {
9898

9999
// MetricsConfig holds metrics endpoint settings.
100100
type MetricsConfig struct {
101-
ListenAddress string `koanf:"listen_address"`
102-
Port int `koanf:"port"`
103-
VMLabelBudget int `koanf:"vm_label_budget"`
104-
ResourceRefreshInterval string `koanf:"resource_refresh_interval"`
101+
ListenAddress string `koanf:"listen_address"`
102+
Port int `koanf:"port"`
103+
VMLabelBudget int `koanf:"vm_label_budget"`
104+
ResourceRefreshInterval string `koanf:"resource_refresh_interval"`
105+
AllocationReconcileInterval string `koanf:"allocation_reconcile_interval"`
105106
}
106107

107108
// OtelConfig holds OpenTelemetry settings.
@@ -144,6 +145,11 @@ type BuildConfig struct {
144145
DockerSocket string `koanf:"docker_socket"`
145146
}
146147

148+
// InstancesConfig holds instance-manager internal settings.
149+
type InstancesConfig struct {
150+
LifecycleEventBufferSize int `koanf:"lifecycle_event_buffer_size"`
151+
}
152+
147153
// RegistryConfig holds OCI registry settings.
148154
type RegistryConfig struct {
149155
URL string `koanf:"url"`
@@ -241,6 +247,7 @@ type Config struct {
241247
Logging LoggingConfig `koanf:"logging"`
242248
Images ImagesConfig `koanf:"images"`
243249
Build BuildConfig `koanf:"build"`
250+
Instances InstancesConfig `koanf:"instances"`
244251
Registry RegistryConfig `koanf:"registry"`
245252
Limits LimitsConfig `koanf:"limits"`
246253
Oversubscription OversubscriptionConfig `koanf:"oversubscription"`
@@ -310,10 +317,11 @@ func defaultConfig() *Config {
310317
},
311318

312319
Metrics: MetricsConfig{
313-
ListenAddress: "127.0.0.1",
314-
Port: 9464,
315-
VMLabelBudget: 200,
316-
ResourceRefreshInterval: "120s",
320+
ListenAddress: "127.0.0.1",
321+
Port: 9464,
322+
VMLabelBudget: 200,
323+
ResourceRefreshInterval: "120s",
324+
AllocationReconcileInterval: "120s",
317325
},
318326

319327
Otel: OtelConfig{
@@ -349,6 +357,10 @@ func defaultConfig() *Config {
349357
DockerSocket: "/var/run/docker.sock",
350358
},
351359

360+
Instances: InstancesConfig{
361+
LifecycleEventBufferSize: 256,
362+
},
363+
352364
Registry: RegistryConfig{
353365
URL: "localhost:8080",
354366
Insecure: false,
@@ -499,6 +511,16 @@ func (c *Config) Validate() error {
499511
if interval <= 0 {
500512
return fmt.Errorf("metrics.resource_refresh_interval must be positive, got %q", c.Metrics.ResourceRefreshInterval)
501513
}
514+
if strings.TrimSpace(c.Metrics.AllocationReconcileInterval) == "" {
515+
return fmt.Errorf("metrics.allocation_reconcile_interval must not be empty")
516+
}
517+
reconcileInterval, err := time.ParseDuration(c.Metrics.AllocationReconcileInterval)
518+
if err != nil {
519+
return fmt.Errorf("metrics.allocation_reconcile_interval must be a valid duration, got %q: %w", c.Metrics.AllocationReconcileInterval, err)
520+
}
521+
if reconcileInterval <= 0 {
522+
return fmt.Errorf("metrics.allocation_reconcile_interval must be positive, got %q", c.Metrics.AllocationReconcileInterval)
523+
}
502524
if c.Otel.MetricExportInterval != "" {
503525
if _, err := time.ParseDuration(c.Otel.MetricExportInterval); err != nil {
504526
return fmt.Errorf("otel.metric_export_interval must be a valid duration, got %q: %w", c.Otel.MetricExportInterval, err)
@@ -552,6 +574,9 @@ func (c *Config) Validate() error {
552574
if err := validateNamePatternLimits(c.Limits.NamePatterns); err != nil {
553575
return err
554576
}
577+
if c.Instances.LifecycleEventBufferSize <= 0 {
578+
return fmt.Errorf("instances.lifecycle_event_buffer_size must be positive, got %d", c.Instances.LifecycleEventBufferSize)
579+
}
555580
if err := validateDuration("images.auto_delete.unused_for", c.Images.AutoDelete.UnusedFor); err != nil {
556581
return err
557582
}

cmd/api/config/config_test.go

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ func TestDefaultConfigIncludesMetricsSettings(t *testing.T) {
2525
if cfg.Metrics.ResourceRefreshInterval != "120s" {
2626
t.Fatalf("expected default metrics.resource_refresh_interval to be 120s, got %q", cfg.Metrics.ResourceRefreshInterval)
2727
}
28+
if cfg.Metrics.AllocationReconcileInterval != "120s" {
29+
t.Fatalf("expected default metrics.allocation_reconcile_interval to be 120s, got %q", cfg.Metrics.AllocationReconcileInterval)
30+
}
2831
if cfg.Otel.MetricExportInterval != "60s" {
2932
t.Fatalf("expected default otel.metric_export_interval to be 60s, got %q", cfg.Otel.MetricExportInterval)
3033
}
@@ -40,15 +43,20 @@ func TestDefaultConfigIncludesMetricsSettings(t *testing.T) {
4043
if len(cfg.Images.AutoDelete.Allowed) != 0 {
4144
t.Fatalf("expected default images.auto_delete.allowed to be empty, got %v", cfg.Images.AutoDelete.Allowed)
4245
}
46+
if cfg.Instances.LifecycleEventBufferSize != 256 {
47+
t.Fatalf("expected default instances.lifecycle_event_buffer_size to be 256, got %d", cfg.Instances.LifecycleEventBufferSize)
48+
}
4349
}
4450

4551
func TestLoadEnvOverridesMetricsAndOtelInterval(t *testing.T) {
4652
t.Setenv("METRICS__LISTEN_ADDRESS", "0.0.0.0")
4753
t.Setenv("METRICS__PORT", "9999")
4854
t.Setenv("METRICS__VM_LABEL_BUDGET", "350")
4955
t.Setenv("METRICS__RESOURCE_REFRESH_INTERVAL", "30s")
56+
t.Setenv("METRICS__ALLOCATION_RECONCILE_INTERVAL", "45s")
5057
t.Setenv("OTEL__METRIC_EXPORT_INTERVAL", "15s")
5158
t.Setenv("OTEL__SUCCESSFUL_GET_SAMPLE_RATIO", "0.25")
59+
t.Setenv("INSTANCES__LIFECYCLE_EVENT_BUFFER_SIZE", "512")
5260

5361
tmp := t.TempDir()
5462
cfgPath := filepath.Join(tmp, "config.yaml")
@@ -73,12 +81,18 @@ func TestLoadEnvOverridesMetricsAndOtelInterval(t *testing.T) {
7381
if cfg.Metrics.ResourceRefreshInterval != "30s" {
7482
t.Fatalf("expected metrics.resource_refresh_interval override, got %q", cfg.Metrics.ResourceRefreshInterval)
7583
}
84+
if cfg.Metrics.AllocationReconcileInterval != "45s" {
85+
t.Fatalf("expected metrics.allocation_reconcile_interval override, got %q", cfg.Metrics.AllocationReconcileInterval)
86+
}
7687
if cfg.Otel.MetricExportInterval != "15s" {
7788
t.Fatalf("expected otel.metric_export_interval override, got %q", cfg.Otel.MetricExportInterval)
7889
}
7990
if cfg.Otel.SuccessfulGetSampleRatio != 0.25 {
8091
t.Fatalf("expected otel.successful_get_sample_ratio override, got %v", cfg.Otel.SuccessfulGetSampleRatio)
8192
}
93+
if cfg.Instances.LifecycleEventBufferSize != 512 {
94+
t.Fatalf("expected instances.lifecycle_event_buffer_size override, got %d", cfg.Instances.LifecycleEventBufferSize)
95+
}
8296
}
8397

8498
func TestValidateRejectsInvalidMetricsPort(t *testing.T) {
@@ -147,6 +161,59 @@ func TestValidateRejectsInvalidResourceRefreshInterval(t *testing.T) {
147161
}
148162
}
149163

164+
func TestValidateRejectsInvalidAllocationReconcileInterval(t *testing.T) {
165+
cfg := defaultConfig()
166+
cfg.Metrics.AllocationReconcileInterval = ""
167+
168+
err := cfg.Validate()
169+
if err == nil {
170+
t.Fatalf("expected validation error for empty allocation reconcile interval")
171+
}
172+
173+
cfg = defaultConfig()
174+
cfg.Metrics.AllocationReconcileInterval = "not-a-duration"
175+
176+
err = cfg.Validate()
177+
if err == nil {
178+
t.Fatalf("expected validation error for invalid allocation reconcile interval")
179+
}
180+
181+
cfg = defaultConfig()
182+
cfg.Metrics.AllocationReconcileInterval = "0s"
183+
184+
err = cfg.Validate()
185+
if err == nil {
186+
t.Fatalf("expected validation error for non-positive allocation reconcile interval")
187+
}
188+
}
189+
190+
func TestLoadUsesConfiguredLifecycleEventBufferSize(t *testing.T) {
191+
tmp := t.TempDir()
192+
cfgPath := filepath.Join(tmp, "config.yaml")
193+
if err := os.WriteFile(cfgPath, []byte("instances:\n lifecycle_event_buffer_size: 384\n"), 0600); err != nil {
194+
t.Fatalf("write temp config: %v", err)
195+
}
196+
197+
cfg, err := Load(cfgPath)
198+
if err != nil {
199+
t.Fatalf("load config: %v", err)
200+
}
201+
202+
if cfg.Instances.LifecycleEventBufferSize != 384 {
203+
t.Fatalf("expected instances.lifecycle_event_buffer_size from config file, got %d", cfg.Instances.LifecycleEventBufferSize)
204+
}
205+
}
206+
207+
func TestValidateRejectsInvalidLifecycleEventBufferSize(t *testing.T) {
208+
cfg := defaultConfig()
209+
cfg.Instances.LifecycleEventBufferSize = 0
210+
211+
err := cfg.Validate()
212+
if err == nil {
213+
t.Fatalf("expected validation error for invalid lifecycle event buffer size")
214+
}
215+
}
216+
150217
func TestLoadUsesDefaultImageAutoDeleteRetentionWindow(t *testing.T) {
151218
tmp := t.TempDir()
152219
cfgPath := filepath.Join(tmp, "config.yaml")

cmd/api/main.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,9 +183,18 @@ func run() error {
183183
if err != nil {
184184
return fmt.Errorf("invalid metrics resource refresh interval %q: %w", app.Config.Metrics.ResourceRefreshInterval, err)
185185
}
186+
allocationReconcileInterval, err := time.ParseDuration(app.Config.Metrics.AllocationReconcileInterval)
187+
if err != nil {
188+
return fmt.Errorf("invalid metrics allocation reconcile interval %q: %w", app.Config.Metrics.AllocationReconcileInterval, err)
189+
}
186190
if err := app.ResourceManager.StartMonitoring(ctx, otelProvider.Meter, resourceRefreshInterval); err != nil {
187191
return fmt.Errorf("start resource monitoring: %w", err)
188192
}
193+
if reconciler, ok := app.InstanceManager.(interface {
194+
StartAdmissionAllocationReconciler(context.Context, time.Duration)
195+
}); ok {
196+
reconciler.StartAdmissionAllocationReconciler(ctx, allocationReconcileInterval)
197+
}
189198

190199
// Log OTel status
191200
if cfg.Otel.Enabled {

0 commit comments

Comments
 (0)