diff --git a/.beads/.gitignore b/.beads/.gitignore
new file mode 100644
index 0000000..d27a1db
--- /dev/null
+++ b/.beads/.gitignore
@@ -0,0 +1,44 @@
+# SQLite databases
+*.db
+*.db?*
+*.db-journal
+*.db-wal
+*.db-shm
+
+# Daemon runtime files
+daemon.lock
+daemon.log
+daemon.pid
+bd.sock
+sync-state.json
+last-touched
+
+# Local version tracking (prevents upgrade notification spam after git ops)
+.local_version
+
+# Legacy database files
+db.sqlite
+bd.db
+
+# Worktree redirect file (contains relative path to main repo's .beads/)
+# Must not be committed as paths would be wrong in other clones
+redirect
+
+# Merge artifacts (temporary files from 3-way merge)
+beads.base.jsonl
+beads.base.meta.json
+beads.left.jsonl
+beads.left.meta.json
+beads.right.jsonl
+beads.right.meta.json
+
+# Sync state (local-only, per-machine)
+# These files are machine-specific and should not be shared across clones
+.sync.lock
+sync_base.jsonl
+
+# NOTE: Do NOT add negation patterns (e.g., !issues.jsonl) here.
+# They would override fork protection in .git/info/exclude, allowing
+# contributors to accidentally commit upstream issue databases.
+# The JSONL files (issues.jsonl, interactions.jsonl) and config files
+# are tracked by git by default since no pattern above ignores them.
diff --git a/.beads/README.md b/.beads/README.md
new file mode 100644
index 0000000..50f281f
--- /dev/null
+++ b/.beads/README.md
@@ -0,0 +1,81 @@
+# Beads - AI-Native Issue Tracking
+
+Welcome to Beads! This repository uses **Beads** for issue tracking - a modern, AI-native tool designed to live directly in your codebase alongside your code.
+
+## What is Beads?
+
+Beads is issue tracking that lives in your repo, making it perfect for AI coding agents and developers who want their issues close to their code. No web UI required - everything works through the CLI and integrates seamlessly with git.
+
+**Learn more:** [github.com/steveyegge/beads](https://github.com/steveyegge/beads)
+
+## Quick Start
+
+### Essential Commands
+
+```bash
+# Create new issues
+bd create "Add user authentication"
+
+# View all issues
+bd list
+
+# View issue details
+bd show <issue-id>
+
+# Update issue status
+bd update <issue-id> --status in_progress
+bd update <issue-id> --status done
+
+# Sync with git remote
+bd sync
+```
+
+### Working with Issues
+
+Issues in Beads are:
+- **Git-native**: Stored in `.beads/issues.jsonl` and synced like code (see the sketch below)
+- **AI-friendly**: CLI-first design works perfectly with AI coding agents
+- **Branch-aware**: Issues can follow your branch workflow
+- **Always in sync**: Auto-syncs with your commits
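
Because each record in `.beads/issues.jsonl` is a single JSON object per line, external tooling can read the tracker without going through `bd` at all. A minimal Go sketch of that idea (the `Issue` struct here is hypothetical, inferred from the fields visible in the issues.jsonl records added later in this diff; the real schema has more fields):

```go
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"os"
)

// Issue models a few of the fields visible in .beads/issues.jsonl.
type Issue struct {
	ID       string `json:"id"`
	Title    string `json:"title"`
	Status   string `json:"status"`
	Priority int    `json:"priority"`
}

func main() {
	f, err := os.Open(".beads/issues.jsonl")
	if err != nil {
		panic(err)
	}
	defer f.Close()

	sc := bufio.NewScanner(f)
	for sc.Scan() {
		var is Issue
		if err := json.Unmarshal(sc.Bytes(), &is); err != nil {
			continue // skip malformed lines
		}
		if is.Status == "open" {
			fmt.Printf("%s [p%d] %s\n", is.ID, is.Priority, is.Title)
		}
	}
}
```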
+
+## Why Beads?
+
+✨ **AI-Native Design**
+- Built specifically for AI-assisted development workflows
+- CLI-first interface works seamlessly with AI coding agents
+- No context switching to web UIs
+
+🚀 **Developer Focused**
+- Issues live in your repo, right next to your code
+- Works offline, syncs when you push
+- Fast, lightweight, and stays out of your way
+
+🔧 **Git Integration**
+- Automatic sync with git commits
+- Branch-aware issue tracking
+- Intelligent JSONL merge resolution
+
+## Get Started with Beads
+
+Try Beads in your own projects:
+
+```bash
+# Install Beads
+curl -sSL https://raw.githubusercontent.com/steveyegge/beads/main/scripts/install.sh | bash
+
+# Initialize in your repo
+bd init
+
+# Create your first issue
+bd create "Try out Beads"
+```
+
+## Learn More
+
+- **Documentation**: [github.com/steveyegge/beads/docs](https://github.com/steveyegge/beads/tree/main/docs)
+- **Quick Start Guide**: Run `bd quickstart`
+- **Examples**: [github.com/steveyegge/beads/examples](https://github.com/steveyegge/beads/tree/main/examples)
+
+---
+
+*Beads: Issue tracking that moves at the speed of thought* ⚡
diff --git a/.beads/config.yaml b/.beads/config.yaml
new file mode 100644
index 0000000..f242785
--- /dev/null
+++ b/.beads/config.yaml
@@ -0,0 +1,62 @@
+# Beads Configuration File
+# This file configures default behavior for all bd commands in this repository
+# All settings can also be set via environment variables (BD_* prefix)
+# or overridden with command-line flags
+
+# Issue prefix for this repository (used by bd init)
+# If not set, bd init will auto-detect from directory name
+# Example: issue-prefix: "myproject" creates issues like "myproject-1", "myproject-2", etc.
+# issue-prefix: ""
+
+# Use no-db mode: load from JSONL, no SQLite, write back after each command
+# When true, bd will use .beads/issues.jsonl as the source of truth
+# instead of SQLite database
+# no-db: false
+
+# Disable daemon for RPC communication (forces direct database access)
+# no-daemon: false
+
+# Disable auto-flush of database to JSONL after mutations
+# no-auto-flush: false
+
+# Disable auto-import from JSONL when it's newer than database
+# no-auto-import: false
+
+# Enable JSON output by default
+# json: false
+
+# Default actor for audit trails (overridden by BD_ACTOR or --actor)
+# actor: ""
+
+# Path to database (overridden by BEADS_DB or --db)
+# db: ""
+
+# Auto-start daemon if not running (can also use BEADS_AUTO_START_DAEMON)
+# auto-start-daemon: true
+
+# Debounce interval for auto-flush (can also use BEADS_FLUSH_DEBOUNCE)
+# flush-debounce: "5s"
+
+# Git branch for beads commits (bd sync will commit to this branch)
+# IMPORTANT: Set this for team projects so all clones use the same sync branch.
+# This setting persists across clones (unlike database config which is gitignored).
+# Can also use BEADS_SYNC_BRANCH env var for local override.
+# If not set, bd sync will require you to run 'bd config set sync.branch <branch>'.
+# sync-branch: "beads-sync"
+
+# Multi-repo configuration (experimental - bd-307)
+# Allows hydrating from multiple repositories and routing writes to the correct JSONL
+# repos:
+#   primary: "."
# Primary repo (where this database lives) +# additional: # Additional repos to hydrate from (read-only) +# - ~/beads-planning # Personal planning repo +# - ~/work-planning # Work planning repo + +# Integration settings (access with 'bd config get/set') +# These are stored in the database, not in this file: +# - jira.url +# - jira.project +# - linear.url +# - linear.api-key +# - github.org +# - github.repo diff --git a/.beads/interactions.jsonl b/.beads/interactions.jsonl new file mode 100644 index 0000000..e69de29 diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl new file mode 100644 index 0000000..d6f4ff6 --- /dev/null +++ b/.beads/issues.jsonl @@ -0,0 +1,13 @@ +{"id":"opslevel-runner-5kw","title":"Remove redundant client validation in root.go","description":"root.go:56-62 - Dead code after cobra.CheckErr() which exits on error. The second cobra.CheckErr(clientErr) is unreachable.","status":"open","priority":3,"issue_type":"task","owner":"jason@opslevel.com","created_at":"2026-01-20T08:33:45.884341-05:00","created_by":"Jason Morrow","updated_at":"2026-01-20T08:33:45.884341-05:00"} +{"id":"opslevel-runner-8tu","title":"Remove redundant slice re-assignment in opslevelAppendLogProcessor.go","description":"opslevelAppendLogProcessor.go:99-101 - Code sets logLines to nil then immediately to empty slice. Either use s.logLines = s.logLines[:0] to reuse backing array, or just s.logLines = nil.","status":"open","priority":4,"issue_type":"task","owner":"jason@opslevel.com","created_at":"2026-01-20T08:33:46.38156-05:00","created_by":"Jason Morrow","updated_at":"2026-01-20T08:33:46.38156-05:00"} +{"id":"opslevel-runner-95x","title":"Fix elapsed time reset bug in opslevelAppendLogProcessor.go","description":"In opslevelAppendLogProcessor.go:59-62, time.Since(time.Now()) always returns ~0, so the elapsed counter never actually resets. Should be s.elapsed = 0.","status":"closed","priority":0,"issue_type":"bug","owner":"jason@opslevel.com","created_at":"2026-01-20T08:33:44.016053-05:00","created_by":"Jason Morrow","updated_at":"2026-01-20T08:57:30.420415-05:00","closed_at":"2026-01-20T08:57:30.420415-05:00","close_reason":"Closed"} +{"id":"opslevel-runner-9br","title":"Replace busy-wait loops with tickers in LogStreamer","description":"logs.go:53-101 - Both Run() and Flush() use busy-waiting with time.Sleep(). Should use tickers for efficiency and add timeout safety in Flush().","status":"open","priority":1,"issue_type":"task","owner":"jason@opslevel.com","created_at":"2026-01-20T08:33:44.515054-05:00","created_by":"Jason Morrow","updated_at":"2026-01-20T08:33:44.515054-05:00"} +{"id":"opslevel-runner-c7f","title":"Fix memory calculation typo (1204 → 1024) in k8s_config.go:46","description":"Memory limits use 1024*1204 instead of 1024*1024, allocating ~17% more memory than intended. This directly impacts resource allocation and could cause pod failures or resource wastage in production.","status":"closed","priority":0,"issue_type":"bug","owner":"jason@opslevel.com","created_at":"2026-01-20T08:33:43.49835-05:00","created_by":"Jason Morrow","updated_at":"2026-01-20T08:57:30.423465-05:00","closed_at":"2026-01-20T08:57:30.423465-05:00","close_reason":"Closed"} +{"id":"opslevel-runner-cpt","title":"Add sync.Once to API client singletons for thread safety","description":"api.go:15-37 - REST/GraphQL client singletons could have race conditions during initialization. 
Use sync.Once to ensure thread-safe initialization.","status":"open","priority":2,"issue_type":"task","owner":"jason@opslevel.com","created_at":"2026-01-20T08:33:45.382082-05:00","created_by":"Jason Morrow","updated_at":"2026-01-20T08:33:45.382082-05:00"} +{"id":"opslevel-runner-eya","title":"Fix log typo in DeletePDB - says 'configmap' instead of 'pod disruption budget'","description":"k8s.go:458 - DeletePDB logs 'Deleting configmap' instead of 'Deleting pod disruption budget'. Copy-paste error.","status":"closed","priority":3,"issue_type":"bug","owner":"jason@opslevel.com","created_at":"2026-01-20T08:33:45.632702-05:00","created_by":"Jason Morrow","updated_at":"2026-01-20T09:21:56.091381-05:00","closed_at":"2026-01-20T09:21:56.091381-05:00","close_reason":"Fixed as part of DeletePDB nil guard addition"} +{"id":"opslevel-runner-gs7","title":"Extract container name constants in k8s.go","description":"k8s.go:226, 338 - Container names 'helper' and 'job' are hardcoded strings. Extract to constants for maintainability.","status":"open","priority":4,"issue_type":"task","owner":"jason@opslevel.com","created_at":"2026-01-20T08:33:46.622763-05:00","created_by":"Jason Morrow","updated_at":"2026-01-20T08:33:46.622763-05:00"} +{"id":"opslevel-runner-ifs","title":"Propagate context to K8s operations instead of context.TODO()","description":"K8s operations in k8s.go use context.TODO() instead of propagating parent context. This prevents graceful cancellation during shutdown. Affects: CreateConfigMap, CreatePDB, CreatePod, DeleteConfigMap, DeletePDB, DeletePod, isPodInDesiredState.","status":"open","priority":1,"issue_type":"task","owner":"jason@opslevel.com","created_at":"2026-01-20T08:33:44.266878-05:00","created_by":"Jason Morrow","updated_at":"2026-01-20T08:33:44.266878-05:00"} +{"id":"opslevel-runner-lz8","title":"Move defer cleanup after error check in k8s.go","description":"k8s.go:299-324 - Defer statements for cleanup are placed before error checks, meaning cleanup will be called even if resource creation failed. Move defer after the if err != nil check.","status":"closed","priority":1,"issue_type":"bug","owner":"jason@opslevel.com","created_at":"2026-01-20T08:33:45.08706-05:00","created_by":"Jason Morrow","updated_at":"2026-01-20T09:18:27.188443-05:00","closed_at":"2026-01-20T09:18:27.188443-05:00","close_reason":"Closed"} +{"id":"opslevel-runner-qgb","title":"Fix race condition on isLeader variable in leaderElection.go","description":"leaderElection.go:18-48 - The isLeader package-level variable is accessed without synchronization from multiple goroutines. Add sync.RWMutex protection with setLeader()/getLeader() functions.","status":"closed","priority":1,"issue_type":"bug","owner":"jason@opslevel.com","created_at":"2026-01-20T08:33:44.819558-05:00","created_by":"Jason Morrow","updated_at":"2026-01-20T09:18:27.190004-05:00","closed_at":"2026-01-20T09:18:27.190004-05:00","close_reason":"Closed"} +{"id":"opslevel-runner-sin","title":"Use dynamic runner image version instead of hardcoded v2024.1.3","description":"In k8s.go:208, the init container image is hardcoded to v2024.1.3 instead of using ImageTagVersion. 
This causes version mismatches between runner controller and helper binary.","status":"closed","priority":0,"issue_type":"bug","owner":"jason@opslevel.com","created_at":"2026-01-20T08:33:43.754131-05:00","created_by":"Jason Morrow","updated_at":"2026-01-20T08:57:30.42223-05:00","closed_at":"2026-01-20T08:57:30.42223-05:00","close_reason":"Closed"}
+{"id":"opslevel-runner-t4m","title":"Add context cancellation check in leader election loop","description":"leaderElection.go:51-85 - The infinite loop in OnStartedLeading doesn't check context for cancellation. Use select with ticker instead of time.Sleep for responsive shutdown.","status":"open","priority":2,"issue_type":"task","owner":"jason@opslevel.com","created_at":"2026-01-20T08:33:46.134478-05:00","created_by":"Jason Morrow","updated_at":"2026-01-20T08:33:46.134478-05:00"}
diff --git a/.beads/metadata.json b/.beads/metadata.json
new file mode 100644
index 0000000..c787975
--- /dev/null
+++ b/.beads/metadata.json
@@ -0,0 +1,4 @@
+{
+  "database": "beads.db",
+  "jsonl_export": "issues.jsonl"
+}
\ No newline at end of file
diff --git a/.claude/agents/code-improver.md b/.claude/agents/code-improver.md
new file mode 100644
index 0000000..d92bb13
--- /dev/null
+++ b/.claude/agents/code-improver.md
@@ -0,0 +1,113 @@
+---
+name: code-improver
+description: "Use this agent when you want to analyze existing code for potential improvements in readability, performance, and best practices. This agent examines code files and provides detailed suggestions with explanations, current code snippets, and improved versions.\\n\\nExamples:\\n\\n<example>\\nContext: User wants to improve a specific file they've been working on.\\nuser: \"Can you review my utils.go file for improvements?\"\\nassistant: \"I'll use the code-improver agent to analyze utils.go and suggest improvements.\"\\n</example>\\n\\n<example>\\nContext: User has just finished writing a new module and wants feedback.\\nuser: \"I just finished the authentication module, please review it\"\\nassistant: \"Let me launch the code-improver agent to scan your authentication module and identify opportunities for better readability, performance, and adherence to best practices.\"\\n</example>\\n\\n<example>\\nContext: User asks for general code quality feedback on recent changes.\\nuser: \"Are there any improvements I should make to the files I changed today?\"\\nassistant: \"I'll use the code-improver agent to review your recently modified files and suggest improvements.\"\\n</example>\\n"
+model: sonnet
+color: pink
+---
+
+You are an expert code quality analyst with deep expertise in software engineering best practices, performance optimization, and clean code principles. You have extensive experience across multiple programming languages and paradigms, with particular strength in identifying subtle issues that impact maintainability, readability, and runtime efficiency.
+
+## Your Mission
+
+You analyze code files to identify opportunities for improvement across three key dimensions:
+1. **Readability**: Code clarity, naming conventions, documentation, and structural organization
+2. **Performance**: Algorithmic efficiency, resource usage, unnecessary computations, and optimization opportunities
+3. **Best Practices**: Language idioms, design patterns, error handling, security considerations, and adherence to established conventions
+
+## Analysis Process
+
+For each file or code section you review:
+
+1. **Initial Assessment**: Read through the entire code to understand its purpose, structure, and context
+2. 
**Systematic Scan**: Examine the code for issues in each of the three dimensions +3. **Prioritization**: Rank findings by impact (critical, moderate, minor) +4. **Solution Development**: Craft improved versions that address identified issues + +## Output Format + +For each improvement suggestion, provide: + +### Issue Title +**Category**: [Readability | Performance | Best Practices] +**Severity**: [Critical | Moderate | Minor] +**Location**: [File path and line numbers] + +**Explanation**: A clear description of why this is an issue and the impact it has on the codebase. + +**Current Code**: +```[language] +[The problematic code snippet] +``` + +**Improved Code**: +```[language] +[The refactored/improved version] +``` + +**Why This Is Better**: Specific explanation of the improvements and their benefits. + +--- + +## Guidelines + +### Readability Checks +- Variable and function naming (clarity, consistency, conventions) +- Function length and complexity (single responsibility) +- Code comments (presence, accuracy, necessity) +- Logical grouping and organization +- Consistent formatting and style +- Magic numbers and hardcoded values +- Nested complexity and early returns + +### Performance Checks +- Unnecessary iterations or computations +- Inefficient data structures for the use case +- Memory allocation patterns +- Database query optimization opportunities +- Caching opportunities +- Lazy evaluation possibilities +- Algorithmic complexity improvements + +### Best Practices Checks +- Error handling completeness and consistency +- Input validation and sanitization +- Resource cleanup (connections, file handles) +- Thread safety considerations +- Security vulnerabilities (injection, exposure) +- Language-specific idioms and patterns +- Testing considerations +- Logging and observability + +## Project-Specific Considerations + +When reviewing Go code in this codebase: +- Follow the established patterns using Zerolog for structured logging +- Ensure Kubernetes resources are properly cleaned up +- Use context for cancellation and timeouts consistently +- Follow the log processor pipeline pattern for any streaming data +- Adhere to the existing error handling patterns + +## Behavioral Guidelines + +1. **Be Constructive**: Frame suggestions positively, focusing on improvement rather than criticism +2. **Be Specific**: Always show exact code locations and concrete improvements +3. **Be Practical**: Prioritize suggestions that provide meaningful value over pedantic nitpicks +4. **Be Educational**: Explain the reasoning so developers learn from the feedback +5. **Respect Context**: Consider the project's existing patterns and conventions +6. 
**Acknowledge Good Code**: When code is well-written, say so
+
+## Quality Assurance
+
+Before presenting each suggestion:
+- Verify the improved code is syntactically correct
+- Ensure the improvement doesn't change the intended behavior
+- Confirm the suggestion aligns with the project's existing patterns
+- Check that the explanation clearly communicates the benefit
+
+## Summary Section
+
+After all suggestions, provide a brief summary:
+- Total issues found by category and severity
+- Top 3 highest-impact improvements to prioritize
+- Overall assessment of code quality
+- Patterns or themes observed across the suggestions
diff --git a/.claude/settings.json b/.claude/settings.json
new file mode 100644
index 0000000..a350098
--- /dev/null
+++ b/.claude/settings.json
@@ -0,0 +1,15 @@
+{
+  "hooks": {
+    "PostToolUse": [
+      {
+        "matcher": "Write|Edit",
+        "hooks": [
+          {
+            "type": "command",
+            "command": "task lint;"
+          }
+        ]
+      }
+    ]
+  }
+}
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..807d598
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,3 @@
+
+# Use bd merge for beads JSONL files
+.beads/issues.jsonl merge=beads
diff --git a/.gitignore b/.gitignore
index 42ec24e..05944a5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,6 +26,5 @@ src/go.work**
 # ignore any user-created yaml files that may have been used for tophatting.
 src/*.yaml
 
-# agent folders
-.beads/
+# Git worktrees
 .worktrees/
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..df7a4af
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,40 @@
+# Agent Instructions
+
+This project uses **bd** (beads) for issue tracking. Run `bd onboard` to get started.
+
+## Quick Reference
+
+```bash
+bd ready                                   # Find available work
+bd show <issue-id>                         # View issue details
+bd update <issue-id> --status in_progress  # Claim work
+bd close <issue-id>                        # Complete work
+bd sync                                    # Sync with git
+```
+
+## Landing the Plane (Session Completion)
+
+**When ending a work session**, you MUST complete ALL steps below. Work is NOT complete until `git push` succeeds.
+
+**MANDATORY WORKFLOW:**
+
+1. **File issues for remaining work** - Create issues for anything that needs follow-up
+2. **Run quality gates** (if code changed) - Tests, linters, builds
+3. **Update issue status** - Close finished work, update in-progress items
+4. **PUSH TO REMOTE** - This is MANDATORY:
+   ```bash
+   git pull --rebase
+   bd sync
+   git push
+   git status  # MUST show "up to date with origin"
+   ```
+5. **Clean up** - Clear stashes, prune remote branches
+6. **Verify** - All changes committed AND pushed
+7. **Hand off** - Provide context for next session
+
+**CRITICAL RULES:**
+- Work is NOT complete until `git push` succeeds
+- NEVER stop before pushing - that leaves work stranded locally
+- NEVER say "ready to push when you are" - YOU must push
+- If push fails, resolve and retry until it succeeds
+
diff --git a/CLAUDE.md b/CLAUDE.md
index 7464fb4..c2a9abd 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -2,6 +2,8 @@
 
 This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
 
+@AGENTS.md
+
 ## Project Overview
 
 OpsLevel Runner is a Kubernetes-based job processor for OpsLevel. It polls for jobs from the OpsLevel API (or Faktory queue), spins up Kubernetes pods to execute commands, and streams logs back to OpsLevel.
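
The `src/pkg/k8s.go` hunks below fix the cleanup-ordering bug tracked as opslevel-runner-lz8: each `defer s.Delete...` was registered before the creation error was checked, so a failed create still queued a deletion of a possibly-nil resource. The diff applies two defenses: it moves each `defer` after the error check, and it adds nil guards to the `Delete*` methods so stray nil calls stay safe. A minimal sketch of the pitfall and the corrected pattern (the `create`/`cleanup` helpers are hypothetical stand-ins, not the runner's API):

```go
package main

import (
	"errors"
	"fmt"
)

type resource struct{ name string }

func create(fail bool) (*resource, error) {
	if fail {
		return nil, errors.New("create failed")
	}
	return &resource{name: "cfg"}, nil
}

func cleanup(r *resource) {
	if r == nil {
		return // nil guard: creation failed, nothing to delete
	}
	fmt.Println("deleting", r.name)
}

func run(fail bool) error {
	r, err := create(fail)
	if err != nil {
		return err // defer not yet registered: no spurious cleanup
	}
	defer cleanup(r) // registered only after successful creation
	fmt.Println("using", r.name)
	return nil
}

func main() {
	_ = run(true)  // prints nothing: failed create triggers no delete
	_ = run(false) // uses the resource, then deletes it
}
```

The same file also swaps the hardcoded `v2024.1.3` init-container image for `ImageTagVersion` (opslevel-runner-sin), and the `k8s_config.go` hunk fixes the `1024*1204` memory typo (opslevel-runner-c7f): 1024*1204 = 1,232,896 bytes versus the intended 1,048,576, roughly 17.6% over-allocation per configured unit.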
diff --git a/src/pkg/k8s.go b/src/pkg/k8s.go index b856c04..3b65b59 100644 --- a/src/pkg/k8s.go +++ b/src/pkg/k8s.go @@ -205,7 +205,7 @@ func (s *JobRunner) getPodObject(identifier string, labels map[string]string, jo InitContainers: []corev1.Container{ { Name: "helper", - Image: "public.ecr.aws/opslevel/opslevel-runner:v2024.1.3", // TODO: fmt.Sprintf("public.ecr.aws/opslevel/opslevel-runner:v%s", ImageTagVersion), + Image: fmt.Sprintf("public.ecr.aws/opslevel/opslevel-runner:v%s", ImageTagVersion), ImagePullPolicy: s.podConfig.PullPolicy, Command: []string{ "cp", @@ -297,31 +297,31 @@ func (s *JobRunner) Run(ctx context.Context, job opslevel.RunnerJob, stdout, std } // TODO: manage pods based on image for re-use? cfgMap, err := s.CreateConfigMap(s.getConfigMapObject(identifier, job)) - defer s.DeleteConfigMap(cfgMap) // TODO: if we reuse pods then delete should not happen? if err != nil { return JobOutcome{ Message: fmt.Sprintf("failed to create configmap REASON: %s", err), Outcome: opslevel.RunnerJobOutcomeEnumFailed, } } + defer s.DeleteConfigMap(cfgMap) // TODO: if we reuse pods then delete should not happen? pdb, err := s.CreatePDB(s.getPBDObject(identifier, labelSelector)) - defer s.DeletePDB(pdb) // TODO: if we reuse pods then delete should not happen? if err != nil { return JobOutcome{ Message: fmt.Sprintf("failed to create pod disruption budget REASON: %s", err), Outcome: opslevel.RunnerJobOutcomeEnumFailed, } } + defer s.DeletePDB(pdb) // TODO: if we reuse pods then delete should not happen? pod, err := s.CreatePod(s.getPodObject(identifier, labels, job)) - defer s.DeletePod(pod) // TODO: if we reuse pods then delete should not happen if err != nil { return JobOutcome{ Message: fmt.Sprintf("failed to create pod REASON: %s", err), Outcome: opslevel.RunnerJobOutcomeEnumFailed, } } + defer s.DeletePod(pod) // TODO: if we reuse pods then delete should not happen timeout := time.Second * time.Duration(viper.GetInt("job-pod-max-wait")) waitErr := s.WaitForPod(pod, timeout) @@ -447,6 +447,9 @@ func (s *JobRunner) WaitForPod(podConfig *corev1.Pod, timeout time.Duration) err } func (s *JobRunner) DeleteConfigMap(config *corev1.ConfigMap) { + if config == nil { + return + } s.logger.Trace().Msgf("Deleting configmap %s/%s ...", config.Namespace, config.Name) err := s.clientset.CoreV1().ConfigMaps(config.Namespace).Delete(context.TODO(), config.Name, metav1.DeleteOptions{}) if err != nil { @@ -455,7 +458,10 @@ func (s *JobRunner) DeleteConfigMap(config *corev1.ConfigMap) { } func (s *JobRunner) DeletePDB(config *policyv1.PodDisruptionBudget) { - s.logger.Trace().Msgf("Deleting configmap %s/%s ...", config.Namespace, config.Name) + if config == nil { + return + } + s.logger.Trace().Msgf("Deleting pod disruption budget %s/%s ...", config.Namespace, config.Name) err := s.clientset.PolicyV1().PodDisruptionBudgets(config.Namespace).Delete(context.TODO(), config.Name, metav1.DeleteOptions{}) if err != nil { s.logger.Error().Err(err).Msgf("received error on PDB deletion") @@ -463,6 +469,9 @@ func (s *JobRunner) DeletePDB(config *policyv1.PodDisruptionBudget) { } func (s *JobRunner) DeletePod(config *corev1.Pod) { + if config == nil { + return + } s.logger.Trace().Msgf("Deleting pod %s/%s ...", config.Namespace, config.Name) err := s.clientset.CoreV1().Pods(config.Namespace).Delete(context.TODO(), config.Name, metav1.DeleteOptions{}) if err != nil { diff --git a/src/pkg/k8s_config.go b/src/pkg/k8s_config.go index 0940bbe..1d1344d 100644 --- a/src/pkg/k8s_config.go +++ b/src/pkg/k8s_config.go @@ 
-43,7 +43,7 @@ func ReadPodConfig(path string) (*K8SPodConfig, error) { }, Limits: corev1.ResourceList{ corev1.ResourceCPU: *resource.NewMilliQuantity(viper.GetInt64("job-pod-limits-cpu"), resource.DecimalSI), - corev1.ResourceMemory: *resource.NewQuantity(viper.GetInt64("job-pod-limits-memory")*1024*1204, resource.BinarySI), + corev1.ResourceMemory: *resource.NewQuantity(viper.GetInt64("job-pod-limits-memory")*1024*1024, resource.BinarySI), }, }, TerminationGracePeriodSeconds: 5, diff --git a/src/pkg/k8s_test.go b/src/pkg/k8s_test.go index b0a4662..f46552e 100644 --- a/src/pkg/k8s_test.go +++ b/src/pkg/k8s_test.go @@ -7,6 +7,7 @@ import ( "github.com/rocktavious/autopilot/v2023" "github.com/rs/zerolog" corev1 "k8s.io/api/core/v1" + policyv1 "k8s.io/api/policy/v1" ) func TestCreateLabelSelector(t *testing.T) { @@ -70,3 +71,106 @@ func TestGetPodObject_RegularJobNotPrivileged(t *testing.T) { // Assert autopilot.Equals(t, (*corev1.SecurityContext)(nil), pod.Spec.Containers[0].SecurityContext) } + +func TestDeleteConfigMap_NilSafe(t *testing.T) { + // Arrange - JobRunner with nil clientset (won't be used due to nil guard) + runner := &JobRunner{ + logger: zerolog.Nop(), + } + + // Act & Assert - should not panic when called with nil + runner.DeleteConfigMap(nil) +} + +func TestDeletePDB_NilSafe(t *testing.T) { + // Arrange + runner := &JobRunner{ + logger: zerolog.Nop(), + } + + // Act & Assert - should not panic when called with nil + runner.DeletePDB(nil) +} + +func TestDeletePod_NilSafe(t *testing.T) { + // Arrange + runner := &JobRunner{ + logger: zerolog.Nop(), + } + + // Act & Assert - should not panic when called with nil + runner.DeletePod(nil) +} + +func TestGetConfigMapObject(t *testing.T) { + // Arrange + runner := &JobRunner{ + logger: zerolog.Nop(), + podConfig: &K8SPodConfig{ + Namespace: "test-namespace", + }, + } + job := opslevel.RunnerJob{ + Files: []opslevel.RunnerJobFile{ + {Name: "script.sh", Contents: "#!/bin/bash\necho hello"}, + {Name: "config.yaml", Contents: "key: value"}, + }, + } + + // Act + configMap := runner.getConfigMapObject("test-job-123", job) + + // Assert + autopilot.Equals(t, "test-job-123", configMap.Name) + autopilot.Equals(t, "test-namespace", configMap.Namespace) + autopilot.Equals(t, true, *configMap.Immutable) + autopilot.Equals(t, "#!/bin/bash\necho hello", configMap.Data["script.sh"]) + autopilot.Equals(t, "key: value", configMap.Data["config.yaml"]) +} + +func TestGetPBDObject(t *testing.T) { + // Arrange + runner := &JobRunner{ + logger: zerolog.Nop(), + podConfig: &K8SPodConfig{ + Namespace: "test-namespace", + }, + } + labels := map[string]string{"app": "test"} + labelSelector, _ := CreateLabelSelector(labels) + + // Act + pdb := runner.getPBDObject("test-job-123", labelSelector) + + // Assert + autopilot.Equals(t, "test-job-123", pdb.Name) + autopilot.Equals(t, "test-namespace", pdb.Namespace) + autopilot.Equals(t, "0", pdb.Spec.MaxUnavailable.String()) +} + +// Verify that delete functions require non-nil clientset when given valid input +// These tests document the expected behavior after the defer fix +func TestDeleteFunctions_RequireClientset(t *testing.T) { + // This test documents that delete functions will attempt to use clientset + // when given non-nil resources. The defer fix ensures these are only called + // after successful resource creation (when clientset operations succeeded). 
+ + runner := &JobRunner{ + logger: zerolog.Nop(), + clientset: nil, // intentionally nil + } + + // These would panic if called with non-nil resources but nil clientset + // The defer fix prevents this by ensuring defer only runs after successful creation + + // Verify nil resources are handled safely + runner.DeleteConfigMap(nil) + runner.DeletePDB(nil) + runner.DeletePod(nil) + + // If we reach here, nil handling works correctly + t.Log("Delete functions correctly handle nil resources") +} + +// Suppress unused import warning for policyv1 +var _ = policyv1.PodDisruptionBudget{} diff --git a/src/pkg/leaderElection.go b/src/pkg/leaderElection.go index 61a0d4e..19b9f24 100644 --- a/src/pkg/leaderElection.go +++ b/src/pkg/leaderElection.go @@ -3,6 +3,7 @@ package pkg import ( "context" "math" + "sync" "time" "github.com/opslevel/opslevel-go/v2024" @@ -15,7 +16,22 @@ import ( "k8s.io/client-go/util/retry" ) -var isLeader bool +var ( + isLeader bool + isLeaderMu sync.RWMutex +) + +func setLeader(val bool) { + isLeaderMu.Lock() + defer isLeaderMu.Unlock() + isLeader = val +} + +func getLeader() bool { + isLeaderMu.RLock() + defer isLeaderMu.RUnlock() + return isLeader +} func RunLeaderElection(ctx context.Context, runnerId opslevel.ID, lockName, lockIdentity, lockNamespace string) error { config, err := GetKubernetesConfig() @@ -45,7 +61,7 @@ func RunLeaderElection(ctx context.Context, runnerId opslevel.ID, lockName, lock RetryPeriod: 2 * time.Second, Callbacks: leaderelection.LeaderCallbacks{ OnStartedLeading: func(c context.Context) { - isLeader = true + setLeader(true) logger.Info().Msgf("leader is %s", lockIdentity) deploymentsClient := client.AppsV1().Deployments(lockNamespace) for { @@ -85,13 +101,13 @@ func RunLeaderElection(ctx context.Context, runnerId opslevel.ID, lockName, lock } }, OnStoppedLeading: func() { - isLeader = false + setLeader(false) }, OnNewLeader: func(currentId string) { - if !isLeader && currentId == lockIdentity { + if !getLeader() && currentId == lockIdentity { logger.Info().Msgf("%s started leading!", currentId) return - } else if !isLeader && currentId != lockIdentity { + } else if !getLeader() && currentId != lockIdentity { logger.Info().Msgf("leader is %s", currentId) } }, diff --git a/src/pkg/leaderElection_test.go b/src/pkg/leaderElection_test.go new file mode 100644 index 0000000..6a9299e --- /dev/null +++ b/src/pkg/leaderElection_test.go @@ -0,0 +1,97 @@ +package pkg + +import ( + "sync" + "testing" + + "github.com/rocktavious/autopilot/v2023" +) + +func TestSetLeaderGetLeader(t *testing.T) { + // Arrange - reset to known state + setLeader(false) + + // Act & Assert - basic functionality + autopilot.Equals(t, false, getLeader()) + + setLeader(true) + autopilot.Equals(t, true, getLeader()) + + setLeader(false) + autopilot.Equals(t, false, getLeader()) +} + +func TestSetLeaderGetLeader_ConcurrentAccess(t *testing.T) { + // This test verifies that concurrent access to setLeader/getLeader + // does not cause data races. Run with -race flag to detect races. 
+	// Arrange
+	setLeader(false)
+	const goroutines = 100
+	const iterations = 100
+
+	var wg sync.WaitGroup
+	wg.Add(goroutines * 2) // writers and readers
+
+	// Act - concurrent writers
+	for i := 0; i < goroutines; i++ {
+		go func(id int) {
+			defer wg.Done()
+			for j := 0; j < iterations; j++ {
+				setLeader(id%2 == 0)
+			}
+		}(i)
+	}
+
+	// Act - concurrent readers
+	for i := 0; i < goroutines; i++ {
+		go func() {
+			defer wg.Done()
+			for j := 0; j < iterations; j++ {
+				_ = getLeader()
+			}
+		}()
+	}
+
+	wg.Wait()
+
+	// Assert - if we get here without race detector complaints, the test passes
+	// Final state is indeterminate due to concurrent writes, but no panics or races
+	_ = getLeader() // should not panic
+}
+
+func TestSetLeaderGetLeader_ConcurrentReadWrite(t *testing.T) {
+	// Specifically test the pattern used in leaderElection.go callbacks
+	// where OnNewLeader reads while OnStartedLeading/OnStoppedLeading write
+	setLeader(false)
+
+	var wg sync.WaitGroup
+	done := make(chan struct{})
+
+	// Simulate OnStartedLeading/OnStoppedLeading callbacks
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		for {
+			select {
+			case <-done:
+				return
+			default:
+				setLeader(true)
+				setLeader(false)
+			}
+		}
+	}()
+
+	// Simulate OnNewLeader callback reading isLeader
+	readerDone := make(chan struct{})
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		defer close(readerDone)
+		for i := 0; i < 1000; i++ {
+			// Simulate the condition checks in OnNewLeader
+			_ = getLeader()
+		}
+	}()
+
+	// Stop the writer only after the reader has finished, so reads and writes
+	// genuinely overlap; closing done immediately would let the writer exit
+	// before the reader observed any concurrent writes.
+	<-readerDone
+	close(done)
+	wg.Wait()
+}
diff --git a/src/pkg/opslevelAppendLogProcessor.go b/src/pkg/opslevelAppendLogProcessor.go
index d74d3c9..ab52cf6 100644
--- a/src/pkg/opslevelAppendLogProcessor.go
+++ b/src/pkg/opslevelAppendLogProcessor.go
@@ -59,7 +59,7 @@ func (s *OpsLevelAppendLogProcessor) Process(line string) string {
 	s.elapsed += time.Since(s.lastTime)
 	if s.elapsed > s.maxTime {
 		s.logger.Trace().Msg("Shipping logs because of maxTime ...")
-		s.elapsed = time.Since(time.Now())
+		s.elapsed = 0
 		s.submit()
 	}
 	s.lastTime = time.Now()
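
The final hunk closes opslevel-runner-95x. `time.Since(time.Now())` is `time.Now().Sub(time.Now())` in disguise: it measures the gap between two consecutive clock reads, a few nanoseconds at most, so writing it as a "reset" is misleading and fragile; `s.elapsed = 0` states the intent directly. A minimal sketch of the accumulate-and-reset pattern the processor uses (simplified names, not the actual `OpsLevelAppendLogProcessor` fields):

```go
package main

import (
	"fmt"
	"time"
)

// accumulator mirrors the processor's elapsed/lastTime bookkeeping in miniature.
type accumulator struct {
	elapsed  time.Duration
	lastTime time.Time
	maxTime  time.Duration
}

func (a *accumulator) process() {
	a.elapsed += time.Since(a.lastTime)
	if a.elapsed > a.maxTime {
		fmt.Println("shipping logs after", a.elapsed)
		a.elapsed = 0 // the fix: an explicit reset, not time.Since(time.Now())
	}
	a.lastTime = time.Now()
}

func main() {
	a := &accumulator{lastTime: time.Now(), maxTime: 10 * time.Millisecond}
	for i := 0; i < 3; i++ {
		time.Sleep(6 * time.Millisecond)
		a.process() // ships once, after accumulated time crosses maxTime
	}
}
```

Note that the still-open issue opslevel-runner-9br proposes going further in the same direction, replacing the busy-wait loops in `LogStreamer` with tickers.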