diff --git a/.agents/skills/sok-ci-fixer/SKILL.md b/.agents/skills/sok-ci-fixer/SKILL.md new file mode 100644 index 000000000..7823e29e3 --- /dev/null +++ b/.agents/skills/sok-ci-fixer/SKILL.md @@ -0,0 +1,49 @@ +--- +name: sok-ci-fixer +description: Analyze CI failures, map them to local repro commands, and propose fixes. Use when CI logs or pipeline failures need a local reproduction and patch. +--- + +# SOK CI Fixer + +## Overview +Turn CI failures into a local repro and a minimal fix with a validation plan. + +## Scope +Allowed paths: +- `scripts/**` +- `test/**` +- `kuttl/**` +- `config/**` +- `api/**` +- `internal/**` +- `pkg/**` +- `Makefile`, `go.mod`, `go.sum` + +Forbidden paths: +- `vendor/**` +- `bin/**` +- `.git/**` + +If changes are needed outside the allowed paths, stop and propose a follow-up plan. + +## Workflow +1. Summarize the CI failure (job name, step, error). +2. Map to a local command (prefer `scripts/dev/pr_check.sh` or `./scripts/verify_repo.sh`). +3. Reproduce locally if possible and capture the failing output. +4. Implement the smallest safe fix. +5. Re-run the local repro command. + +## Commands +- PR gate: `scripts/dev/pr_check.sh` +- Repo verify: `./scripts/verify_repo.sh` + +## Definition of Done +- CI failure is reproducible locally or explained why not. +- Fix is minimal and validated with a local repro command. +- Regression risk is noted. 
+ +## Output Contract +- Changed files +- Commands run +- Results +- PR-ready summary diff --git a/.agents/skills/sok-ci-fixer/agents/openai.yaml b/.agents/skills/sok-ci-fixer/agents/openai.yaml new file mode 100644 index 000000000..2c12f9b2d --- /dev/null +++ b/.agents/skills/sok-ci-fixer/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "SOK CI Fixer" + short_description: "Triage and fix CI failures" diff --git a/.agents/skills/sok-doc-updater/SKILL.md b/.agents/skills/sok-doc-updater/SKILL.md new file mode 100644 index 000000000..920db55d2 --- /dev/null +++ b/.agents/skills/sok-doc-updater/SKILL.md @@ -0,0 +1,44 @@ +--- +name: sok-doc-updater +description: Update Splunk Operator docs and examples for a change. Use when a change requires docs, CR examples, or user-facing guidance updates. +--- + +# SOK Doc Updater + +## Overview +Keep docs, examples, and user-facing guidance in sync with code changes. + +## Scope +Allowed paths: +- `docs/**` +- `README.md` +- `config/samples/**` +- `helm-chart/**` + +Forbidden paths: +- `api/**` +- `internal/**` +- `pkg/**` +- `test/**` +- `kuttl/**` +- `bundle/**` +- `vendor/**` + +If product code changes are required, stop and hand off to the appropriate skill. + +## Workflow +1. Identify the user-facing change and affected docs. +2. Update spec fields, examples, and any compatibility notes. +3. Verify examples are consistent with current CRD schema. +4. Provide a short summary and a test/validation note if applicable. + +## Definition of Done +- Docs and examples are consistent with current CRD schema. +- User-facing behavior is clearly described. +- Any required follow-up (tests or validation) is documented. 
+ +## Output Contract +- Changed files +- Commands run +- Results +- PR-ready summary diff --git a/.agents/skills/sok-doc-updater/agents/openai.yaml b/.agents/skills/sok-doc-updater/agents/openai.yaml new file mode 100644 index 000000000..f0c094142 --- /dev/null +++ b/.agents/skills/sok-doc-updater/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "SOK Doc Updater" + short_description: "Update docs and examples" diff --git a/.agents/skills/sok-feature-scaffold/SKILL.md b/.agents/skills/sok-feature-scaffold/SKILL.md new file mode 100644 index 000000000..139eba9bd --- /dev/null +++ b/.agents/skills/sok-feature-scaffold/SKILL.md @@ -0,0 +1,111 @@ +--- +name: sok-feature-scaffold +description: Add or change Splunk Operator behavior by introducing a new field in a CRD spec/status, wiring it into reconciliation, and updating tests and docs. Use for changes to Standalone, IndexerCluster, SearchHeadCluster, ClusterManager, LicenseManager, MonitoringConsole, or shared CRD config. Do not use for pure refactors, dependency bumps, or formatting-only changes. +--- + +# SOK Feature Scaffold + +## Overview +Implement CRD-driven features end-to-end with code, tests, and docs in this repository. + +## Scope +Allowed paths: +- `api/**` +- `internal/controller/**` +- `pkg/**` +- `config/**` +- `docs/**` +- `test/**` +- `kuttl/**` +- `bundle/**` +- `helm-chart/**` +- `scripts/**` +- `Makefile`, `README.md`, `PROJECT`, `go.mod`, `go.sum` + +Forbidden paths: +- `vendor/**` +- `bin/**` +- `.git/**` + +If changes are needed outside the allowed paths, stop and propose a follow-up plan. + +## Workflow +1. Print the files you plan to change and the test commands you will run before editing. +2. Identify the target CRD kind and API version. Confirm the mapping in `PROJECT` and `docs/agent/CRD_MAP.md`. +3. Locate the API types under `api/v*/` and update the spec/status struct. +4. Add JSON tags, `omitempty` rules, and kubebuilder markers consistent with adjacent fields. +5. 
Update any defaulting or validation logic that applies to the new field. +6. Regenerate CRD/RBAC artifacts via the operator-sdk workflow. +7. Wire the field into reconciliation with idempotent logic. +8. Update or add unit tests, and add an integration test stub when relevant. +9. Update docs and examples to expose the new field. +10. Produce a PR-ready summary with tests and risks. + +## Implementation Details + +### 1) Find the right API types +- `PROJECT` is the source of truth for kind and version mapping. +- Use `docs/agent/CRD_MAP.md` for fast navigation to types, controllers, and enterprise logic. +- Prefer the latest stable API version (typically `api/v4`). +- Legacy kinds still use `api/v3` (legacy cluster manager and legacy license manager). +- Use `rg "type .*Spec" api -g "*_types.go"` to locate the spec struct. +- If the field is shared across CRDs, check `api/v4/common_types.go` (and `api/v3/common_types.go` for legacy kinds). + +### 2) Schema, CRD, and RBAC generation (operator-sdk workflow) +- Use `operator-sdk create api` when introducing a new API or controller (scaffolding). +- Add the field with a clear JSON name and `omitempty` as appropriate. +- Follow nearby kubebuilder markers for validation, defaults, and list/map behavior. +- Regenerate code and manifests with the repo targets (operator-sdk scaffolding uses controller-gen under the hood). +`make generate` for deepcopy code. +`make manifests` for CRDs and RBAC. +Run `make bundle` to refresh `bundle/manifests/*` and `helm-chart/splunk-operator/crds` when bundle or Helm CRDs are tracked. +- For verification, use `./scripts/verify_crd.sh` and optionally `./scripts/verify_bundle.sh` or `make verify VERIFY_BUNDLE=1`. +- If you add new RBAC needs, update kubebuilder RBAC markers in the controller and re-run `make manifests` to refresh `config/rbac/role.yaml`. 
+ +### 3) Reconcile wiring +- Locate the controller in `internal/controller` and shared logic in `pkg/splunk/enterprise` or `pkg/splunk/common`. +- Read the new field from the spec and apply it in a single, idempotent reconciliation path. +- Update status only when the desired state is reached and avoid hot-looping. + +### 4) Tests +- Add or update unit tests near the logic you touched (often under `internal/controller` or `pkg/splunk/*`). +- If the behavior is user-visible or multi-resource, add a minimal integration test stub in `test/` or `kuttl/` to document coverage intent. +- Prefer helper scripts when available: `scripts/dev/unit.sh`, `scripts/dev/lint.sh`, `scripts/dev/pr_check.sh`. + +### 5) Docs +- Update `docs/CustomResources.md` for spec fields. +- Update any feature-specific doc under `docs/` and add an example manifest if needed. + +## Definition of Done +- CRD/schema generation is updated and verified. +- Reconcile logic is idempotent and status updates are gated. +- Tests added/updated for the new behavior. +- Docs/examples reflect the new field or behavior. + +## Assets +- Use `assets/pr-template.md` for the PR summary format. +- Use `assets/crd-change-checklist.md` as a guardrail for CRD edits. + +## Key Paths +- API types: `api/v*/` (e.g. 
`api/v4/*_types.go`) +- Controllers: `internal/controller/` +- Shared logic: `pkg/splunk/enterprise`, `pkg/splunk/common` +- CRDs: `config/crd/bases/` +- RBAC output: `config/rbac/role.yaml` +- Bundles: `bundle/manifests/`, `helm-chart/splunk-operator/crds` +- Docs: `docs/CustomResources.md`, `docs/Examples.md` + +## Repo Map (Common Cases) +- Standalone: `api/v4/standalone_types.go`, `internal/controller/standalone_controller.go`, `pkg/splunk/enterprise/standalone.go` +- IndexerCluster: `api/v4/indexercluster_types.go`, `internal/controller/indexercluster_controller.go`, `pkg/splunk/enterprise/indexercluster.go` +- SearchHeadCluster: `api/v4/searchheadcluster_types.go`, `internal/controller/searchheadcluster_controller.go`, `pkg/splunk/enterprise/searchheadcluster.go` +- ClusterManager: `api/v4/clustermanager_types.go`, `internal/controller/clustermanager_controller.go`, `pkg/splunk/enterprise/clustermanager.go` +- LicenseManager: `api/v4/licensemanager_types.go`, `internal/controller/licensemanager_controller.go`, `pkg/splunk/enterprise/licensemanager.go` +- MonitoringConsole: `api/v4/monitoringconsole_types.go`, `internal/controller/monitoringconsole_controller.go`, `pkg/splunk/enterprise/monitoringconsole.go` +- Legacy v3 control-plane types/controllers: search under `api/v3/`, `internal/controller/`, and `pkg/splunk/enterprise/` + +## Output Contract +- Changed files +- Commands run +- Results +- PR-ready summary diff --git a/.agents/skills/sok-feature-scaffold/agents/openai.yaml b/.agents/skills/sok-feature-scaffold/agents/openai.yaml new file mode 100644 index 000000000..60be2761d --- /dev/null +++ b/.agents/skills/sok-feature-scaffold/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "Sok Feature Scaffold" + short_description: "Help with Sok Feature Scaffold tasks" diff --git a/.agents/skills/sok-feature-scaffold/assets/crd-change-checklist.md b/.agents/skills/sok-feature-scaffold/assets/crd-change-checklist.md new file mode 100644 index 
000000000..26ba35ab8 --- /dev/null +++ b/.agents/skills/sok-feature-scaffold/assets/crd-change-checklist.md @@ -0,0 +1,11 @@ +## CRD Change Checklist +- [ ] Update the correct `api/v*/` spec or status struct +- [ ] Add JSON tags and `omitempty` consistently +- [ ] Add kubebuilder markers for validation or defaults +- [ ] Run `make manifests` +- [ ] Verify `config/rbac/role.yaml` if RBAC markers changed +- [ ] If bundle/Helm CRDs are tracked, run `make bundle` +- [ ] Run `make verify` (optionally `VERIFY_BUNDLE=1`) to confirm outputs +- [ ] Verify generated YAML under `config/crd/bases/` and `bundle/manifests/` +- [ ] Update docs under `docs/` +- [ ] Add or update tests diff --git a/.agents/skills/sok-feature-scaffold/assets/pr-template.md b/.agents/skills/sok-feature-scaffold/assets/pr-template.md new file mode 100644 index 000000000..aa2665483 --- /dev/null +++ b/.agents/skills/sok-feature-scaffold/assets/pr-template.md @@ -0,0 +1,8 @@ +## Summary +- + +## Tests +- + +## Risks / Follow-ups +- diff --git a/.agents/skills/sok-issue-triage/SKILL.md b/.agents/skills/sok-issue-triage/SKILL.md new file mode 100644 index 000000000..af5245485 --- /dev/null +++ b/.agents/skills/sok-issue-triage/SKILL.md @@ -0,0 +1,79 @@ +--- +name: sok-issue-triage +description: Turn a Splunk Operator issue report into scope, impacted components, proposed change list, test plan, and risks. Use when asked to triage a GitHub issue, bug report, or feature request into a PR plan. +--- + +# SOK Issue Triage + +## Overview +Convert issue context into a PR-ready plan with scope, changes, tests, and risks. + +## Scope +Allowed paths: +- `.agents/**` +- `docs/**` +- `templates/**` + +Forbidden paths: +- `api/**` +- `internal/**` +- `pkg/**` +- `test/**` +- `kuttl/**` +- `config/**` +- `bundle/**` +- `helm-chart/**` +- `vendor/**` + +This skill should not change product code. If code changes are required, stop and hand off to the appropriate skill. + +## Workflow +1. 
Extract the problem statement and expected behavior. +2. Identify impacted CRDs, controllers, and packages. +3. Determine the minimal scope for a safe fix. +4. Propose the change list in implementation order. +5. Define a concrete test plan. +6. Call out risks, migrations, and backward compatibility. + +## Details + +### 1) Parse the issue +- Capture user intent, repro steps, and current behavior. +- Identify the CR kind and any referenced fields. + +### 2) Map to code +- Find the spec/status types under `api/v*/`. +- Locate controllers in `internal/controller`. +- Identify shared helpers in `pkg/splunk/enterprise` or `pkg/splunk/common`. +- Use `PROJECT` to confirm CRD kind and version mapping. +- Use `docs/agent/CRD_MAP.md` for a fast file map. +- Use `docs/agent/RECONCILE_FLOW.md` for flow and phase context. + +### 3) Build a PR plan +- List files or directories to touch. +- Keep the change list ordered: schema, reconcile logic, tests, docs. + +### 4) Test plan +- Prefer `make test` for unit coverage. +- Propose an integration test or minimal stub when behavior is user-visible. + +## Definition of Done +- Scope, impacted components, and change list are explicit. +- Test plan is concrete and executable. +- Risks and open questions are called out. + +## Key Paths +- API types: `api/v*/` +- Controllers: `internal/controller/` +- Shared logic: `pkg/splunk/enterprise`, `pkg/splunk/common` +- Docs: `docs/` +- Project mapping: `PROJECT` +- Agent docs: `docs/agent/CRD_MAP.md`, `docs/agent/RECONCILE_FLOW.md` + +## Output Contract +- Changed files +- Commands run +- Results +- PR-ready summary + +Use `assets/issue-triage-template.md` for the final structure and include open questions if any context is missing. 
diff --git a/.agents/skills/sok-issue-triage/agents/openai.yaml b/.agents/skills/sok-issue-triage/agents/openai.yaml new file mode 100644 index 000000000..cd7774b4a --- /dev/null +++ b/.agents/skills/sok-issue-triage/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "Sok Issue Triage" + short_description: "Help with Sok Issue Triage tasks" diff --git a/.agents/skills/sok-issue-triage/assets/issue-triage-template.md b/.agents/skills/sok-issue-triage/assets/issue-triage-template.md new file mode 100644 index 000000000..cb462508c --- /dev/null +++ b/.agents/skills/sok-issue-triage/assets/issue-triage-template.md @@ -0,0 +1,17 @@ +## Scope +- + +## Impacted Components +- + +## Proposed Changes +- + +## Test Plan +- + +## Risks / Compatibility +- + +## Open Questions +- diff --git a/.agents/skills/sok-new-crd-controller/SKILL.md b/.agents/skills/sok-new-crd-controller/SKILL.md new file mode 100644 index 000000000..4a0982157 --- /dev/null +++ b/.agents/skills/sok-new-crd-controller/SKILL.md @@ -0,0 +1,57 @@ +--- +name: sok-new-crd-controller +description: Create a new CRD and controller skeleton (operator-sdk), wire RBAC, add sample YAML, and add tests/docs. Use when introducing a brand-new custom resource to Splunk Operator. +--- + +# SOK New CRD + Controller + +## Overview +Scaffold and wire a new CRD + controller end-to-end, with RBAC, samples, tests, and docs. + +## Scope +Allowed paths: +- `api/**` +- `internal/controller/**` +- `cmd/**` +- `config/**` +- `docs/**` +- `test/**` +- `kuttl/**` +- `bundle/**` +- `helm-chart/**` +- `scripts/**` +- `PROJECT`, `Makefile`, `go.mod`, `go.sum` + +Forbidden paths: +- `vendor/**` +- `bin/**` +- `.git/**` + +If changes are needed outside the allowed paths, stop and propose a follow-up plan. + +## Workflow +1. Print the files you plan to change and test commands you will run. +2. Run `operator-sdk create api` (if a brand-new API) and confirm entries in `PROJECT`. +3. 
Implement the spec/status types and kubebuilder markers in `api/v*/`. +4. Wire the controller under `internal/controller/` and register in `cmd/main.go`. +5. Update RBAC markers and regenerate manifests: `make generate manifests` or `./scripts/verify_crd.sh`. +6. Add a sample CR under `config/samples/`. +7. Add unit tests (and an integration stub if user-visible behavior). +8. Update docs and examples. + +## Notes +- Prefer scripts when available: `scripts/dev/unit.sh`, `scripts/dev/lint.sh`, `scripts/dev/pr_check.sh`. +- Ensure CRD output is updated in `config/crd/bases/` and, if tracked, `bundle/` and `helm-chart/`. + +## Definition of Done +- New CRD types are generated and registered in `PROJECT`/`cmd/main.go`. +- RBAC and CRD manifests are regenerated and verified. +- Sample YAML exists under `config/samples/`. +- At least one test (unit or integration stub) is added. +- Docs/examples updated for the new CRD. + +## Output Contract +- Changed files +- Commands run +- Results +- PR-ready summary diff --git a/.agents/skills/sok-new-crd-controller/agents/openai.yaml b/.agents/skills/sok-new-crd-controller/agents/openai.yaml new file mode 100644 index 000000000..4291290cc --- /dev/null +++ b/.agents/skills/sok-new-crd-controller/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "SOK New CRD Controller" + short_description: "Scaffold a new CRD + controller" diff --git a/.agents/skills/sok-pr-crafter/SKILL.md b/.agents/skills/sok-pr-crafter/SKILL.md new file mode 100644 index 000000000..7ee29b189 --- /dev/null +++ b/.agents/skills/sok-pr-crafter/SKILL.md @@ -0,0 +1,45 @@ +--- +name: sok-pr-crafter +description: Generate clean PR descriptions, checklists, and risk notes. Use after code changes are complete and tests are run (or explicitly skipped). +--- + +# SOK PR Crafter + +## Overview +Create a PR-ready summary and checklist from the current diff and test results. 
+ +## Scope +Allowed paths: +- `templates/**` +- `docs/**` +- `.agents/**` + +Forbidden paths: +- `api/**` +- `internal/**` +- `pkg/**` +- `test/**` +- `kuttl/**` +- `config/**` +- `bundle/**` +- `helm-chart/**` +- `vendor/**` + +This skill should not change product code. If changes are required, stop and hand off. + +## Workflow +1. Summarize the change set and key behavior changes. +2. List tests run (or explicitly not run). +3. Call out risks, rollbacks, and compatibility notes. +4. Format output using `templates/pull_request.md`. + +## Definition of Done +- PR summary matches the actual diff and behavior. +- Tests section is accurate and explicit. +- Risks/rollback are captured if relevant. + +## Output Contract +- Changed files +- Commands run +- Results +- PR-ready summary diff --git a/.agents/skills/sok-pr-crafter/agents/openai.yaml b/.agents/skills/sok-pr-crafter/agents/openai.yaml new file mode 100644 index 000000000..5d82c9823 --- /dev/null +++ b/.agents/skills/sok-pr-crafter/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "SOK PR Crafter" + short_description: "Generate PR descriptions" diff --git a/.agents/skills/sok-reconcile-debugger/SKILL.md b/.agents/skills/sok-reconcile-debugger/SKILL.md new file mode 100644 index 000000000..967592805 --- /dev/null +++ b/.agents/skills/sok-reconcile-debugger/SKILL.md @@ -0,0 +1,94 @@ +--- +name: sok-reconcile-debugger +description: Debug reconcile loops, stuck status phases, or app framework pipeline behavior in the Splunk Operator. Use when a CR is not progressing, status is stuck, or reconciliation repeatedly requeues and needs root-cause analysis and a fix plan. +--- + +# SOK Reconcile Debugger + +## Overview +Diagnose reconciliation failures and produce a clear root cause, fix, and regression test plan. 
+ +## Scope +Allowed paths: +- `api/**` +- `internal/controller/**` +- `pkg/**` +- `config/**` +- `docs/**` +- `test/**` +- `kuttl/**` +- `scripts/**` + +Forbidden paths: +- `vendor/**` +- `bin/**` +- `.git/**` + +If changes are needed outside the allowed paths, stop and propose a follow-up plan. + +## Workflow +1. Gather context. +2. Reproduce with a minimal manifest. +3. Trace the reconcile path and status gating logic. +4. Add targeted debug logs, then remove them before final output. +5. Identify the root cause and propose the smallest safe fix. +6. Add or propose a regression test. +7. Produce a concise incident summary. + +## Details + +### 1) Gather context +- Capture CR kind, namespace, spec snippet, and operator version. +- Collect `kubectl describe` output and recent operator logs. +- Note whether the issue is a hot loop, terminal error, or stalled status phase. + +### Quick triage commands +`kubectl get <kind> <name> -n <namespace> -o yaml` +`kubectl describe <kind> <name> -n <namespace>` +`kubectl get events -n <namespace> --sort-by=.lastTimestamp` +`kubectl logs -n splunk-operator deploy/splunk-operator-controller-manager -c manager --since=30m` +Use `./scripts/debug_reconcile.sh <kind> <name> <namespace>` to capture these into a single output folder. + +### 2) Reproduce +- Start from an example in `docs/Examples.md` or `test/example/` and reduce it to the minimal spec that reproduces the bug. +- Prefer a local kind cluster for fast iteration when feasible. + +### 3) Trace reconciliation +- Find the controller in `internal/controller` for the affected kind. +- Follow the reconcile flow in shared logic under `pkg/splunk/enterprise` and `pkg/splunk/common`. +- Identify status fields and conditions that gate progression. +- Check paused annotations (see `api/v4/*_types.go` or `api/v3/*_types.go`). +- Check `Phase` constants in `api/v4/common_types.go` for expected state transitions. +- Review predicates in `internal/controller/common/predicate.go` for reconcile triggers. 
+- Use `docs/agent/RECONCILE_FLOW.md` and `docs/agent/OPERATIONS.md` for guidance. + +### 4) Add targeted logs +- Add temporary logs at the gate that stops progression and at any error return. +- Use consistent keys to make log filtering easy. +- Remove debug logs before final output unless explicitly requested to keep them. + +### 5) Root cause and fix +- State the precise condition that prevents progression. +- Propose the smallest fix that restores the expected state transition. +- Verify idempotency and avoid new reconcile loops. + +### 6) Regression test +- Add or outline a unit test near the affected logic. +- If the bug is integration-only, add a minimal test stub under `test/` or `kuttl/`. +- Prefer helper scripts when available: `scripts/dev/unit.sh`, `scripts/dev/pr_check.sh`. + +## Definition of Done +- Root cause is clearly identified and reproducible. +- Fix is minimal, idempotent, and avoids new reconcile loops. +- Regression test is added or explicitly scoped as follow-up. 
+ +## Key Paths +- Controllers: `internal/controller/` +- Shared logic: `pkg/splunk/enterprise`, `pkg/splunk/common` +- Examples: `docs/Examples.md`, `test/example/` + +## Output Contract +- Changed files +- Commands run +- Results +- PR-ready summary diff --git a/.agents/skills/sok-reconcile-debugger/agents/openai.yaml b/.agents/skills/sok-reconcile-debugger/agents/openai.yaml new file mode 100644 index 000000000..f841b4558 --- /dev/null +++ b/.agents/skills/sok-reconcile-debugger/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "Sok Reconcile Debugger" + short_description: "Help with Sok Reconcile Debugger tasks" diff --git a/.agents/skills/sok-release-checklist/SKILL.md b/.agents/skills/sok-release-checklist/SKILL.md new file mode 100644 index 000000000..2b21d9d43 --- /dev/null +++ b/.agents/skills/sok-release-checklist/SKILL.md @@ -0,0 +1,54 @@ +--- +name: sok-release-checklist +description: Prepare or verify a Splunk Operator release checklist, including compatibility, manifests, bundles, images, docs, and upgrade notes. Use when asked about release readiness, compatibility matrices, or release process steps. +--- + +# SOK Release Checklist + +## Overview +Produce a release readiness checklist tailored to this repo's build, bundle, and documentation flow. + +## Workflow +1. Gather release context (version, target Splunk Enterprise versions, Kubernetes support). +2. Verify CRD and bundle artifacts. +3. Verify docs and upgrade guidance. +4. Verify image tags and helm chart outputs. +5. Summarize risks and required follow-ups. + +## Details + +### 1) Compatibility and support +- Review `docs/README.md` for compatibility notes and pointers to release notes. +- Review `docs/SplunkOperatorUpgrade.md` for upgrade constraints and breaking changes. +- Review `docs/ChangeLog.md` for release changes. +- If a public release is being prepared, confirm compatibility in the GitHub release notes. 
+ +### 2) Manifests and bundle +- For CRD changes, ensure `make manifests` and `make bundle` are run. +- Confirm generated CRDs in `config/crd/bases/` and `bundle/manifests/`. +- Confirm helm chart CRDs updated in `helm-chart/splunk-operator/crds`. +- Confirm CSV and manifest bases in `bundle/manifests/splunk-operator.clusterserviceversion.yaml` and `config/manifests/`. +- Use `make verify` or `./scripts/verify_crd.sh` and `./scripts/verify_bundle.sh` to confirm outputs are in sync. +- Use `docs/agent/RELEASE_FLOW.md` for the canonical release flow. + +### 3) Images and tags +- Confirm operator image tag and any distroless tag if used. +- Verify `bundle.Dockerfile` or `Dockerfile` changes if applicable. + +### 4) Docs and examples +- Update install or upgrade docs if defaults or requirements changed. +- Update examples when spec fields or defaults changed. + +## Output Contract +- Use `assets/release-checklist.md` for the final checklist. +- Call out any missing inputs needed to finish the checklist. 
+ +## Key Paths +- Release docs: `docs/README.md`, `docs/ChangeLog.md`, `docs/SplunkOperatorUpgrade.md`, `docs/Install.md` +- CRDs: `config/crd/bases/`, `bundle/manifests/`, `helm-chart/splunk-operator/crds` +- Manifests: `config/manifests/` +- CSV: `bundle/manifests/splunk-operator.clusterserviceversion.yaml` +- Build: `Makefile`, `Dockerfile`, `bundle.Dockerfile` +- Project mapping: `PROJECT` +- Agent docs: `docs/agent/TEST_MATRIX.md` +- Release flow: `docs/agent/RELEASE_FLOW.md` diff --git a/.agents/skills/sok-release-checklist/agents/openai.yaml b/.agents/skills/sok-release-checklist/agents/openai.yaml new file mode 100644 index 000000000..08e9ac1c8 --- /dev/null +++ b/.agents/skills/sok-release-checklist/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "Sok Release Checklist" + short_description: "Help with Sok Release Checklist tasks" diff --git a/.agents/skills/sok-test-author/SKILL.md b/.agents/skills/sok-test-author/SKILL.md new file mode 100644 index 000000000..54df418ae --- /dev/null +++ b/.agents/skills/sok-test-author/SKILL.md @@ -0,0 +1,50 @@ +--- +name: sok-test-author +description: Write or update Splunk Operator tests (unit/envtest/integration/KUTTL). Use when a change needs new tests or test adjustments. +--- + +# SOK Test Author + +## Overview +Create or update tests for operator behavior, using existing testenv helpers and patterns. + +## Scope +Allowed paths: +- `test/**` +- `kuttl/**` +- `scripts/**` +- `docs/**` +- `config/samples/**` + +Forbidden paths: +- `api/**` +- `internal/**` +- `pkg/**` +- `bundle/**` +- `helm-chart/**` +- `vendor/**` + +If product code changes are required, stop and hand off to the appropriate skill. + +## Workflow +1. Determine test type: unit/envtest, integration (Ginkgo), or KUTTL. +2. Locate existing patterns in `docs/agent/TESTCASE_PATTERNS.md` and `test/`. +3. Scaffold tests using `scripts/generate_testcase.py` if helpful. +4. Implement assertions using `test/testenv` helpers. +5. 
Run tests (or specify exact commands). + +## Commands +- Unit/envtest: `scripts/dev/unit.sh` +- Lint/format: `scripts/dev/lint.sh` +- KUTTL scaffolds: `python3 scripts/generate_testcase.py --spec docs/agent/TESTCASE_SPEC.yaml` + +## Definition of Done +- Tests compile and run (or a clear reason is recorded). +- Assertions cover the intended behavior and failure modes. +- Test names and suite structure match repo conventions. + +## Output Contract +- Changed files +- Commands run +- Results +- PR-ready summary diff --git a/.agents/skills/sok-test-author/agents/openai.yaml b/.agents/skills/sok-test-author/agents/openai.yaml new file mode 100644 index 000000000..7068e3691 --- /dev/null +++ b/.agents/skills/sok-test-author/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "SOK Test Author" + short_description: "Write or update tests" diff --git a/.agents/skills/sok-test-harness/SKILL.md b/.agents/skills/sok-test-harness/SKILL.md new file mode 100644 index 000000000..d8dc4d7ff --- /dev/null +++ b/.agents/skills/sok-test-harness/SKILL.md @@ -0,0 +1,59 @@ +--- +name: sok-test-harness +description: Run or troubleshoot Splunk Operator tests, including unit tests, integration tests, and local kind-based workflows. Use when asked to run tests, set up a kind cluster, or produce a test failure triage summary. +--- + +# SOK Test Harness + +## Overview +Run the repo's standard unit and integration tests, and summarize failures consistently. + +## Quick Start +- Unit tests: `scripts/dev/unit.sh` +- Lint/format checks: `scripts/dev/lint.sh` +- Envtest assets: `scripts/dev/envtest.sh` +- Kind smoke: `scripts/dev/kind_smoke.sh` +- Repo verification: `scripts/dev/pr_check.sh` (or `scripts/verify_repo.sh --all`) +- Generate new tests: `python3 scripts/generate_testcase.py --spec docs/agent/TESTCASE_SPEC.yaml` + +## Workflow +1. Confirm prerequisites. +2. Run unit tests or kind integration tests. +3. If tests fail, summarize the failure and propose next steps. +4. 
When CRDs or bundles changed, run `make verify` to confirm generated outputs. + +## Prerequisites +- Go toolchain and `ginkgo` +- Docker and `kubectl` +- `kind` installed for local integration tests + +## Unit Tests +- Default command: `make test` +- Prefer `scripts/dev/unit.sh` to run with the repo defaults. + +## Integration Tests (Kind) +- Default commands: `make cluster-up`, `make int-test`, `make cluster-down` +- Prefer `scripts/dev/kind_smoke.sh` for a quick local sanity run. +- Skill-local scripts (`scripts/run_kind_e2e.sh`, `scripts/push_kind_operator_image.sh`) remain available for deeper e2e flows. + +## Common Environment Variables +These are defined in `test/env.sh` and can be overridden in your shell. +- `SPLUNK_OPERATOR_IMAGE` default `splunk/splunk-operator:latest` +- `SPLUNK_ENTERPRISE_IMAGE` default `splunk/splunk:latest` +- `CLUSTER_PROVIDER` default `kind` for local runs +- `PRIVATE_REGISTRY` default `localhost:5000` when using kind +- `TEST_REGEX` or `TEST_FOCUS` to filter tests +- `SKIP_REGEX` to skip tests +- `CLUSTER_WIDE` to run cluster-wide operator install + +## Failure Triage Output +- Provide the failing test names or package paths. +- Include the first error and any repeated error pattern. +- Suggest the most likely code area to inspect. 
+ +## Key Paths +- Test harness: `test/README.md` +- Integration scripts: `test/run-tests.sh`, `test/deploy-cluster.sh`, `test/deploy-kind-cluster.sh` +- Unit test target: `Makefile` (`make test`) +- Environment defaults: `test/env.sh` +- Harness docs: `docs/agent/TEST_MATRIX.md` diff --git a/.agents/skills/sok-test-harness/agents/openai.yaml b/.agents/skills/sok-test-harness/agents/openai.yaml new file mode 100644 index 000000000..a0fe42467 --- /dev/null +++ b/.agents/skills/sok-test-harness/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "Sok Test Harness" + short_description: "Help with Sok Test Harness tasks" diff --git a/.agents/skills/sok-test-harness/scripts/push_kind_operator_image.sh b/.agents/skills/sok-test-harness/scripts/push_kind_operator_image.sh new file mode 100755 index 000000000..d745fd87f --- /dev/null +++ b/.agents/skills/sok-test-harness/scripts/push_kind_operator_image.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + repo_root="$(cd "${script_dir}/../../../.." && pwd)" +fi + +cd "${repo_root}" + +SPLUNK_OPERATOR_IMAGE="${SPLUNK_OPERATOR_IMAGE:-splunk/splunk-operator:latest}" +PRIVATE_REGISTRY="${PRIVATE_REGISTRY:-localhost:5000}" + +first_segment="${SPLUNK_OPERATOR_IMAGE%%/*}" +if [[ "${first_segment}" == *.* || "${first_segment}" == *:* ]]; then + echo "SPLUNK_OPERATOR_IMAGE looks like it already includes a registry: ${first_segment}" + echo "Set SPLUNK_OPERATOR_IMAGE to a repo/name:tag without a registry prefix." + echo "Example: SPLUNK_OPERATOR_IMAGE=splunk/splunk-operator:latest" + exit 1 +fi + +if [[ "${PRIVATE_REGISTRY}" == localhost:* || "${PRIVATE_REGISTRY}" == 127.0.0.1:* ]]; then + if ! docker ps --format '{{.Names}}' | grep -q '^kind-registry$'; then + echo "Local kind registry container is not running. 
Run: make cluster-up" + exit 1 + fi +else + if [[ "${FORCE_PUSH:-}" != "1" ]]; then + echo "Refusing to push to non-local registry '${PRIVATE_REGISTRY}'." + echo "Set FORCE_PUSH=1 to override." + exit 1 + fi +fi + +target_image="${PRIVATE_REGISTRY}/${SPLUNK_OPERATOR_IMAGE}" + +if ! docker image inspect "${SPLUNK_OPERATOR_IMAGE}" >/dev/null 2>&1; then + echo "Local image ${SPLUNK_OPERATOR_IMAGE} not found, pulling..." + docker pull "${SPLUNK_OPERATOR_IMAGE}" +fi + +docker tag "${SPLUNK_OPERATOR_IMAGE}" "${target_image}" + +echo "Pushing ${target_image}" +docker push "${target_image}" + +echo "Pushed operator image: ${target_image}" diff --git a/.agents/skills/sok-test-harness/scripts/run_kind_e2e.sh b/.agents/skills/sok-test-harness/scripts/run_kind_e2e.sh new file mode 100755 index 000000000..43deaa06e --- /dev/null +++ b/.agents/skills/sok-test-harness/scripts/run_kind_e2e.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + echo "Usage: $(basename "$0") [--keep] [--push-operator-image]" + echo " --keep Keep the kind cluster running after tests" + echo " --push-operator-image Push operator image to local kind registry before tests" +} + +keep_cluster=false +push_operator_image=false +while [[ $# -gt 0 ]]; do + case "$1" in + --keep) + keep_cluster=true + shift + ;; + --push-operator-image) + push_operator_image=true + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" + usage + exit 1 + ;; + esac +done + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + repo_root="$(cd "${script_dir}/../../../.." && pwd)" +fi + +cd "${repo_root}" + +export CLUSTER_PROVIDER="${CLUSTER_PROVIDER:-kind}" +export TEST_CLUSTER_PLATFORM="${TEST_CLUSTER_PLATFORM:-kind}" + +cleanup() { + if [[ "${keep_cluster}" == "true" ]]; then + echo "Keeping kind cluster running (requested)." 
+ return 0 + fi + echo "Tearing down kind cluster: make cluster-down" + make cluster-down +} + +trap cleanup EXIT + +echo "Bringing up kind cluster: make cluster-up" +make cluster-up + +if [[ "${push_operator_image}" == "true" ]]; then + echo "Pushing operator image to local kind registry" + "${script_dir}/push_kind_operator_image.sh" +fi + +echo "Running integration tests: make int-test" +make int-test diff --git a/.agents/skills/sok-test-harness/scripts/run_unit.sh b/.agents/skills/sok-test-harness/scripts/run_unit.sh new file mode 100755 index 000000000..55f36258d --- /dev/null +++ b/.agents/skills/sok-test-harness/scripts/run_unit.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + repo_root="$(cd "${script_dir}/../../../.." && pwd)" +fi + +cd "${repo_root}" + +echo "Running unit tests: make test" +make test diff --git a/.agents/skills/sok-testcase-builder/SKILL.md b/.agents/skills/sok-testcase-builder/SKILL.md new file mode 100644 index 000000000..e90f24927 --- /dev/null +++ b/.agents/skills/sok-testcase-builder/SKILL.md @@ -0,0 +1,59 @@ +--- +name: sok-testcase-builder +description: Create new Splunk Operator integration (Ginkgo) or KUTTL tests from a CR spec and expected results. Use when a developer asks to add a new test case that validates CR status phase Ready and required resources. +--- + +# SOK Testcase Builder + +## Overview +Generate scaffolds for new integration or KUTTL tests based on a CR spec and expected results. + +## Workflow +1. Determine test type: integration (Ginkgo) or KUTTL. +2. Identify the SVA architecture (S1, C3, M4, M1), features (smartstore, appframework), and any SVA validations. +3. Collect CR manifest path(s) and expected results. +4. Create or update a testcase spec file from `docs/agent/TESTCASE_SPEC.yaml`. +5. 
Run the generator script to scaffold the test. +6. Fill in TODOs (spec struct, resource checks, extra asserts). +7. Run the appropriate test command. + +## Test Types + +### KUTTL +- Inputs: CR manifest, expected phase, and resource assertions. +- Output: `kuttl/tests/<suite>/<case>/` with deploy and assert steps. +- Recommended when validating CRD behavior with simple YAML assertions. + - Supports optional operator upgrade steps using the `upgrade` spec block. + +### Integration (Ginkgo) +- Inputs: CR spec and expected behaviors. +- Output: `test/<package>/<name>_test.go` with a suite file if missing. +- Recommended for multi-step flows or API-based verification. + - Use `docs/agent/TESTCASE_PATTERNS.md` to map SVA patterns to helpers. + - For C3 SVA, set `validations.sva: C3` to include Monitoring Console + License Manager readiness checks. + +## Generator Script +Use `scripts/generate_testcase.py` with a spec file: + +`python3 scripts/generate_testcase.py --spec docs/agent/TESTCASE_SPEC.yaml` + +Options: +- `--force` overwrite existing files +- `--dry-run` print actions without writing +Note: YAML specs require `pyyaml` (`python3 -m pip install pyyaml`). + +## Expected Results +- Always validate `status.phase` is `Ready` (or the specified phase). +- Add asserts for key resources (StatefulSet, Service, Secret, ConfigMap) as needed. 
+ +## Output Contract +- List created/edited files +- Provide the test command to run +- Call out any TODOs left in the scaffold + +## Key References +- Spec template: `docs/agent/TESTCASE_SPEC.yaml` +- Patterns: `docs/agent/TESTCASE_PATTERNS.md` +- Test matrix: `docs/agent/TEST_MATRIX.md` +- CRD map: `docs/agent/CRD_MAP.md` +- Test helpers: `test/testenv/verificationutils.go` diff --git a/.agents/skills/sok-testcase-builder/agents/openai.yaml b/.agents/skills/sok-testcase-builder/agents/openai.yaml new file mode 100644 index 000000000..65a5b6b1d --- /dev/null +++ b/.agents/skills/sok-testcase-builder/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "Sok Testcase Builder" + short_description: "Help with Sok Testcase Builder tasks" diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 65d2b7b12..27d28e864 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -10,6 +10,10 @@ _Highlight the updates in specific files_ _How did you test these changes? What automated tests are added?_ +Suggested local gates: +- `scripts/dev/pr_check.sh` +- `scripts/dev/unit.sh` + ### Related Issues _Jira tickets, GitHub issues, Support tickets..._ diff --git a/.github/workflows/pr-check.yml b/.github/workflows/pr-check.yml new file mode 100644 index 000000000..2b74be2a4 --- /dev/null +++ b/.github/workflows/pr-check.yml @@ -0,0 +1,27 @@ +name: PR Check + +on: + pull_request: + branches: + - main + - develop + +jobs: + pr-check: + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version-file: go.mod + + - name: Run PR check (fast) + env: + PR_CHECK_FLAGS: --fast + run: | + scripts/dev/pr_check.sh diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 000000000..cb02ac25a --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,10 @@ +# TODO: Replace this placeholder team with real owners. 
+* @splunk/splunk-operator-for-kubernetes + +/api/ @splunk/splunk-operator-for-kubernetes +/internal/controller/ @splunk/splunk-operator-for-kubernetes +/pkg/splunk/enterprise/ @splunk/splunk-operator-for-kubernetes +/test/ @splunk/splunk-operator-for-kubernetes +/kuttl/ @splunk/splunk-operator-for-kubernetes +/docs/ @splunk/splunk-operator-for-kubernetes +/scripts/ @splunk/splunk-operator-for-kubernetes diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..1022e8c79 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,66 @@ +# Code of Conduct + +This project follows the [Contributor Covenant](https://www.contributor-covenant.org/version/2/1/code_of_conduct/). + +## Our Pledge +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity and +orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. 
+ +## Our Standards +Examples of behavior that contributes to a positive environment for our +community include: + +- Demonstrating empathy and kindness toward other people +- Being respectful of differing opinions, viewpoints, and experiences +- Giving and gracefully accepting constructive feedback +- Accepting responsibility and apologizing to those affected by our mistakes +- Focusing on what is best for the community + +Examples of unacceptable behavior include: + +- The use of sexualized language or imagery, and sexual attention or advances +- Trolling, insulting or derogatory comments, and personal or political attacks +- Public or private harassment +- Publishing others’ private information, such as a physical or email address, + without their explicit permission +- Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. + +## Enforcement +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the project team at `opensource@splunk.com`. All complaints will be +reviewed and investigated promptly and fairly. 
+ +## Enforcement Guidelines +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +1. **Correction** — A private, written warning and guidance. +2. **Warning** — A formal warning with consequences for continued behavior. +3. **Temporary Ban** — A temporary ban from the community. +4. **Permanent Ban** — A permanent ban from the community. + +## Attribution +This Code of Conduct is adapted from the Contributor Covenant, version 2.1. diff --git a/GOVERNANCE.md b/GOVERNANCE.md new file mode 100644 index 000000000..29ff98756 --- /dev/null +++ b/GOVERNANCE.md @@ -0,0 +1,29 @@ +# Governance + +## Purpose +This document defines how the Splunk Operator project is maintained and how +decisions are made. The goal is to enable a healthy, professional open-source +project with clear ownership and predictable workflows. + +## Roles +- **Maintainers**: Responsible for roadmap decisions, triage, and final review. +- **Contributors**: Anyone submitting issues, patches, or documentation changes. + +Maintainers are listed in `MAINTAINERS.md`. + +## Decision Making +We aim for consensus. When consensus cannot be reached, maintainers will decide +based on technical merit, security, and project direction. + +## Contribution Workflow +1. Open or reference a GitHub issue when applicable. +2. Follow `docs/CONTRIBUTING.md` for development and testing. +3. Ensure changes include tests and documentation as appropriate. +4. Submit a PR using the repository PR template. + +## Release and Compatibility +Releases follow the project’s published release process. See `docs/ChangeLog.md` +and relevant release documentation under `docs/`. + +## Code of Conduct +All participants must follow `CODE_OF_CONDUCT.md`. 
diff --git a/MAINTAINERS.md b/MAINTAINERS.md new file mode 100644 index 000000000..b3968df15 --- /dev/null +++ b/MAINTAINERS.md @@ -0,0 +1,24 @@ +# Maintainers + +Maintainers are responsible for project direction, release decisions, and final +review of changes. + +Current maintainers: +- Arjun Kondur +- Gabriel Mendoza +- Gaurav Gupta +- Igor Grzankowski +- Jakub Buczak +- Katarzyna Kozioł +- Minjie Qiu +- Patryk Wasielewski +- Qing Wang +- Raizel Lieberman +- Richard Wang +- Sirish Mohan +- Subba Gontla +- Szymon Buczak +- Vivek Reddy +- Yuhan Yang + +If you need help, tag a maintainer in a GitHub issue or PR. diff --git a/Makefile b/Makefile index 170ba70a6..bf6420677 100644 --- a/Makefile +++ b/Makefile @@ -140,6 +140,24 @@ vet: setup/ginkgo ## Run go vet against code. test: manifests generate fmt vet setup-envtest ## Run tests. KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use ${ENVTEST_K8S_VERSION} --bin-dir $(LOCALBIN) -p path)" ginkgo --junit-report=unit_test.xml --output-dir=`pwd` -vv --trace --keep-going --timeout=3h --cover --covermode=count --coverprofile=coverage.out ./pkg/splunk/common ./pkg/splunk/enterprise ./pkg/splunk/client ./pkg/splunk/util ./internal/controller ./pkg/splunk/splkcontroller +.PHONY: verify verify-crd verify-bundle +verify: verify-crd ## Verify generated artifacts (set VERIFY_BUNDLE=1 to include bundle) + @if [ "$(VERIFY_BUNDLE)" = "1" ]; then \ + $(MAKE) verify-bundle; \ + else \ + echo "Skipping bundle verify (set VERIFY_BUNDLE=1 to enable)"; \ + fi + +.PHONY: verify-repo +verify-repo: ## Run repository verification script (see scripts/verify_repo.sh) + @./scripts/verify_repo.sh + +verify-crd: ## Regenerate and verify CRD/RBAC outputs + @./scripts/verify_crd.sh + +verify-bundle: ## Regenerate and verify bundle/helm outputs + @./scripts/verify_bundle.sh + ##@ Documentation diff --git a/README.md b/README.md new file mode 100644 index 000000000..b1d427449 --- /dev/null +++ b/README.md @@ -0,0 +1,172 @@ +# Splunk Operator for Kubernetes + 
+[![License](https://img.shields.io/:license-apache-blue.svg)](http://www.apache.org/licenses/LICENSE-2.0.html) +[![PkgGoDev](https://pkg.go.dev/badge/github.com/splunk/splunk-operator)](https://pkg.go.dev/github.com/splunk/splunk-operator) +[![Go Report Card](https://goreportcard.com/badge/github.com/splunk/splunk-operator)](https://goreportcard.com/report/github.com/splunk/splunk-operator) +[![Coverage Status](https://coveralls.io/repos/github/splunk/splunk-operator/badge.svg?branch=master)](https://coveralls.io/github/splunk/splunk-operator?branch=master) +[![FOSSA Status](https://app.fossa.com/api/projects/git%2Bgithub.com%2Fsplunk%2Fsplunk-operator.svg?type=shield)](https://app.fossa.com/projects/git%2Bgithub.com%2Fsplunk%2Fsplunk-operator?ref=badge_shield) +[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/splunk/splunk-operator) + +The Splunk Operator for Kubernetes (SOK) makes it easy for Splunk +Administrators to deploy and operate Enterprise deployments in a Kubernetes +infrastructure. Packaged as a container, it uses the +[operator pattern](https://kubernetes.io/docs/concepts/extend-kubernetes/operator/) +to manage Splunk-specific [custom resources](https://kubernetes.io/docs/concepts/extend-kubernetes/api-extension/custom-resources/), +following best practices to manage all the underlying Kubernetes objects for you. + +This repository is used to build the Splunk +[Operator](https://kubernetes.io/docs/concepts/extend-kubernetes/operator/) +for Kubernetes (SOK). If you are just looking for documentation on how to +deploy and use the latest release, please see the +[Getting Started Documentation](docs/README.md). + +## Splunk General Terms Acceptance + +Starting with operator version 3.0.0, which includes support for Splunk Enterprise version 10.x, an additional Docker-Splunk specific parameter is required to start containers. 
**This is a breaking change, and user action is required.** + +Starting in 10.x image versions of Splunk Enterprise, license acceptance requires an additional `SPLUNK_GENERAL_TERMS=--accept-sgt-current-at-splunk-com` argument. This indicates that users have read and accepted the current/latest version of the Splunk General Terms, available at https://www.splunk.com/en_us/legal/splunk-general-terms.html as may be updated from time to time. Unless you have jointly executed with Splunk a negotiated version of these General Terms that explicitly supersedes this agreement, by accessing or using Splunk software, you are agreeing to the Splunk General Terms posted at the time of your access and use and acknowledging its applicability to the Splunk software. Please read and make sure you agree to the Splunk General Terms before you access or use this software. Only after doing so should you include the `--accept-sgt-current-at-splunk-com` flag to indicate your acceptance of the current/latest Splunk General Terms and launch this software. All examples below have been updated with this change. + +If you use the below examples and the ‘--accept-sgt-current-at-splunk-com’ flag, you are indicating that you have read and accepted the current/latest version of the Splunk General Terms, as may be updated from time to time, and acknowledging its applicability to this software - as noted above. + +By default, the SPLUNK_GENERAL_TERMS environment variable will be set to an empty string. You must either manually update it to have the required additional value `--accept-sgt-current-at-splunk-com` in the splunk-operator-controller-manager deployment, or you can pass the `SPLUNK_GENERAL_TERMS` parameter with the required additional value to the `make deploy` command. 
+ +``` +make deploy IMG=docker.io/splunk/splunk-operator: WATCH_NAMESPACE="namespace1" RELATED_IMAGE_SPLUNK_ENTERPRISE="splunk/splunk:edge" SPLUNK_GENERAL_TERMS="--accept-sgt-current-at-splunk-com" +``` + +For more information about this change, see the [Splunk General Terms Migration Documentation](docs/SplunkGeneralTermsMigration.md). + +## Prerequisites + +You must have [Docker Engine](https://docs.docker.com/install/) installed to +build the Splunk Operator. + +This project uses [Go modules](https://blog.golang.org/using-go-modules), +and requires [golang](https://golang.org/doc/install) 1.23.0 or later. +You must `export GO111MODULE=on` if cloning these repositories into your +`$GOPATH` (not recommended). + +The [Kubernetes Operator SDK](https://github.com/operator-framework/operator-sdk) +must also be installed to build this project. + +``` +git clone -b v1.31.0 https://github.com/operator-framework/operator-sdk +cd operator-sdk +make tidy +make install +``` + +You may need to add `$GOPATH/bin` to your path to run the `operator-sdk` +command line tool: + +``` +export PATH=${PATH}:${GOPATH}/bin +``` + +It is also recommended that you install the following golang tools, +which are used by various `make` targets: + +```shell +go install golang.org/x/lint/golint +go install golang.org/x/tools/cmd/cover +go install github.com/mattn/goveralls +go get -u github.com/mikefarah/yq/v3 +go get -u github.com/go-delve/delve/cmd/dlv +``` + +## Cloning this repository + +```shell +git clone git@github.com:splunk/splunk-operator.git +cd splunk-operator +``` + +## Repository overview + +This repository consists of the following code used to build the splunk-operator binary: + +* `main.go`: Provides the main() function, where everything begins +* `apis/`: Source code for the operator's custom resource definition types +* `controllers/`: Used to register controllers that watch for changes to custom resources +* `pkg/splunk/enterprise/`: Source code for controllers that manage 
Splunk Enterprise resources +* `pkg/splunk/controller/`: Common code shared across Splunk controllers +* `pkg/splunk/common/`: Common code used by most other splunk packages +* `pkg/splunk/client/`: Simple client for Splunk Enterprise REST API +* `pkg/splunk/test/`: Common code used by other packages for unit testing + +`main()` uses `controllers` to register all the `enterprise` controllers +that manage custom resources by watching for Kubernetes events. +The `enterprise` controllers are implemented using common code provided +by the `controllers` package. The `enterprise` controllers also use the REST API client +provided in the `pkg/splunk/client` package. The types provided by `apis/` and +common code in the `pkg/splunk/common/` package are used universally. Note that the +source code for `main()` is generated from a template provided by the Operator SDK. + +In addition to the source code, this repository includes: + +* `tools`: Build scripts, templates, etc. used to build the container image +* `config`: Kubernetes YAML templates used to install the Splunk Operator +* `docs`: Getting Started Guide and other documentation in Markdown format +* `test`: Integration test framework built using Ginkgo. See [docs](test/README.md) for more info. + +## Building the operator + +You can build the operator by just running `make`. 
+ +Other make targets include (more info below): + +* `make all`: builds `manager` executable +* `make test`: Runs unit tests with Coveralls code coverage output to coverage.out +* `make scorecard`: Runs operator-sdk scorecard tests using OLM installation bundle +* `make generate`: runs operator-generate k8s, crds and csv commands, updating installation YAML files and OLM bundle +* `make docker-build`: generates `splunk-operator` container image example `make docker-build IMG=docker.io/splunk/splunk-operator:` +* `make docker-buildx`: generates `splunk-operator` container image for multiple platforms, example `make docker-buildx IMG=docker.io/splunk/splunk-operator:` +* `make docker-push`: push docker image to given repository example `make docker-push IMG=docker.io/splunk/splunk-operator:` +* `make clean`: removes the binary build output and `splunk-operator` container image example `make docker-push IMG=docker.io/splunk/splunk-operator:` +* `make run`: runs the Splunk Operator locally, monitoring the Kubernetes cluster configured in your current `kubectl` context +* `make fmt`: runs `go fmt` on all `*.go` source files in this project +* `make bundle-build`: generates `splunk-operator-bundle` bundle container image for OLM example `make bundle-build IMAGE_TAG_BASE=docker.io/splunk/splunk-operator VERSION= IMG=docker.io/splunk/splunk-operator:` +* `make bundle-push`: push OLM bundle docker image to given repository example `make bundle-push IMAGE_TAG_BASE=docker.io/splunk/splunk-operator VERSION= IMG=docker.io/splunk/splunk-operator:` +* `make catalog-build`: generates `splunk-operator-catalog` catalog container image example `make catalog-build IMAGE_TAG_BASE=docker.io/splunk/splunk-operator VERSION= IMG=docker.io/splunk/splunk-operator:` +* `make catalog-push`: push catalog docker image to given repository example`make catalog-push IMAGE_TAG_BASE=docker.io/splunk/splunk-operator VERSION= IMG=docker.io/splunk/splunk-operator:` + +## Agent Harness + +For 
agent-assisted development, see `docs/agent/README.md`. +Useful scripts include `scripts/verify_crd.sh`, `scripts/verify_bundle.sh`, `scripts/verify_repo.sh`, `scripts/debug_reconcile.sh`, and `scripts/generate_testcase.py`. + +## Deploying the Splunk Operator +`make deploy` command will deploy all the necessary resources to run Splunk Operator like RBAC policies, services, configmaps, deployment. Operator will be installed in `splunk-operator` namespace. If `splunk-operator` namespace does not exist, it will create the namespace. By default `make deploy` will install operator clusterwide. Operator will watch all the namespaces for any splunk enterprise custom resources. + +```shell +make deploy IMG=docker.io/splunk/splunk-operator: +``` + +If you want operator for specific namespace then you must pass `WATCH_NAMESPACE` parameter to `make deploy` command + +``` +make deploy IMG=docker.io/splunk/splunk-operator: WATCH_NAMESPACE="namespace1" +``` + +If you want operator to use specific version of splunk instance, then you must pass `RELATED_IMAGE_SPLUNK_ENTERPRISE` parameter to `make deploy` command + +``` +make deploy IMG=docker.io/splunk/splunk-operator: WATCH_NAMESPACE="namespace1" RELATED_IMAGE_SPLUNK_ENTERPRISE="splunk/splunk:edge" +``` + +Use this to run the operator as a local foreground process on your machine: + +```shell +make run +``` + +This will use your current Kubernetes context from `~/.kube/config` to manage +resources in your current namespace. + +Please see the [Getting Started Documentation](docs/README.md) for more +information, including instructions on how to install the operator in your +cluster. 
+ + +## License +[![FOSSA Status](https://app.fossa.com/api/projects/git%2Bgithub.com%2Fsplunk%2Fsplunk-operator.svg?type=large)](https://app.fossa.com/projects/git%2Bgithub.com%2Fsplunk%2Fsplunk-operator?ref=badge_large) diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 000000000..27b84e37c --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,14 @@ +# Security Policy + +## Reporting a Vulnerability +If you believe you have found a security vulnerability, please report it +privately. Do **not** open a public GitHub issue. + +Contact: `security@splunk.com` + +We will acknowledge your report, assess the impact, and provide a timeline for +remediation as soon as possible. + +## Supported Versions +Security fixes are applied to currently supported releases. See `docs/ChangeLog.md` +for version history. diff --git a/SUPPORT.md b/SUPPORT.md new file mode 100644 index 000000000..bffbdfc5e --- /dev/null +++ b/SUPPORT.md @@ -0,0 +1,11 @@ +# Support + +## Community Support +- GitHub Issues: https://github.com/splunk/splunk-operator/issues + +## Commercial Support +If you are a Splunk customer and need formal support, contact Splunk Support +through your normal support channels. + +## Documentation +Project docs live under `docs/` in this repository. diff --git a/api/AGENTS.md b/api/AGENTS.md new file mode 100644 index 000000000..d1ee53850 --- /dev/null +++ b/api/AGENTS.md @@ -0,0 +1,25 @@ +# api/ — CRD Types and Schemas + +## What Lives Here +- CRD Go types (spec/status structs) +- Kubebuilder markers (`+kubebuilder:validation`, `+kubebuilder:default`, etc.) +- JSON tags and OpenAPI schema generation + +## Invariants +- JSON tags must match field names and `omitempty` rules. +- Optional fields should be pointers or `omitempty` where appropriate. +- Status fields must be write-only from controllers. + +## Common Pitfalls +- Forgetting to run generation after type changes. +- Mismatched JSON tag or missing `omitempty`. 
+- Breaking backward compatibility by removing/renaming fields. + +## Commands +- Regenerate CRDs: `./scripts/verify_crd.sh` +- Full repo verify: `make verify-repo` + +## Notes +If you update types here, you likely need changes in: +- `internal/controller/` for reconciliation +- `docs/` for user-facing updates diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index 7e3c7531f..54623e577 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -33,7 +33,7 @@ We only accept pull requests submitted from: * Individuals who have signed the [Splunk Contributor License Agreement](https://www.splunk.com/en_us/form/contributions.html) #### Code of Conduct -All contributors are expected to read our [Code of Conduct](contributing/code-of-conduct.md) and observe it in all interactions involving this project. +All contributors are expected to read our [Code of Conduct](../CODE_OF_CONDUCT.md) and observe it in all interactions involving this project. ## Contribution Workflow Help is always welcome! For example, documentation can always use improvement. There's always code that can be clarified, functionality that can be extended, and tests to be added to guarantee behavior. If you see something you think should be fixed, don't be afraid to own it. @@ -167,6 +167,18 @@ Testing is the responsibility of all contributors. To run Unit Tests in Splunk O $ make test ``` +For agent-assisted or standardized local workflows, prefer the scripts under `scripts/dev/`: +- `scripts/dev/unit.sh` +- `scripts/dev/lint.sh` +- `scripts/dev/pr_check.sh` + +#### Agentic Development Workflow +This repo includes skills and harness scripts to make common workflows repeatable. +Start with: +- `AGENTS.md` (root) for repo map and conventions +- `.agents/skills/*/SKILL.md` for skill workflows +- `docs/agent/README.md` for agent-focused guidance + #### Documentation We can always use improvements to our documentation! 
Anyone can contribute to these docs, whether you identify as a developer, an end user, or someone who just can’t stand seeing typos. What exactly is needed? @@ -197,11 +209,7 @@ If you need help, tag one of the active maintainers of this project in a post or () Gaurav Gupta () Subba Gontla () Arjun Kondur -() Kriti Ashok -() Param Dhanoya -() Victor Ebken -() Ajeet Kumar -() Jeff Rybczynski -() Patrick Ogdin + + ``` diff --git a/docs/agent/CRD_MAP.md b/docs/agent/CRD_MAP.md new file mode 100644 index 000000000..8b8f5d0d9 --- /dev/null +++ b/docs/agent/CRD_MAP.md @@ -0,0 +1,25 @@ +# CRD Map + +`PROJECT` is the source of truth for kind and API version mapping. +This map is optimized for fast navigation in the codebase. + +## Primary Controller Map + +| Kind | API Version | Type File | Controller | Enterprise Logic | +| --- | --- | --- | --- | --- | +| Standalone | v4 | `api/v4/standalone_types.go` | `internal/controller/standalone_controller.go` | `pkg/splunk/enterprise/standalone.go` | +| IndexerCluster | v4 | `api/v4/indexercluster_types.go` | `internal/controller/indexercluster_controller.go` | `pkg/splunk/enterprise/indexercluster.go` | +| SearchHeadCluster | v4 | `api/v4/searchheadcluster_types.go` | `internal/controller/searchheadcluster_controller.go` | `pkg/splunk/enterprise/searchheadcluster.go` | +| ClusterManager | v4 | `api/v4/clustermanager_types.go` | `internal/controller/clustermanager_controller.go` | `pkg/splunk/enterprise/clustermanager.go` | +| LicenseManager | v4 | `api/v4/licensemanager_types.go` | `internal/controller/licensemanager_controller.go` | `pkg/splunk/enterprise/licensemanager.go` | +| MonitoringConsole | v4 | `api/v4/monitoringconsole_types.go` | `internal/controller/monitoringconsole_controller.go` | `pkg/splunk/enterprise/monitoringconsole.go` | +| ClusterMaster (legacy) | v3 | `api/v3/clustermaster_types.go` | `internal/controller/clustermaster_controller.go` | `pkg/splunk/enterprise/clustermaster.go` | +| LicenseMaster (legacy) | 
v3 | `api/v3/licensemaster_types.go` | `internal/controller/licensemaster_controller.go` | `pkg/splunk/enterprise/licensemaster.go` | + +## Shared Types +- Common spec fields and phases live in `api/v4/common_types.go`. +- Legacy common types live in `api/v3/common_types.go`. + +## Docs Pointers +- Spec field reference: `docs/CustomResources.md` +- Example manifests: `docs/Examples.md` diff --git a/docs/agent/OPERATIONS.md b/docs/agent/OPERATIONS.md new file mode 100644 index 000000000..ace16030e --- /dev/null +++ b/docs/agent/OPERATIONS.md @@ -0,0 +1,19 @@ +# Operations and Debug + +## Quick Capture Commands +`kubectl get -n -o yaml` +`kubectl describe -n ` +`kubectl get events -n --sort-by=.lastTimestamp` +`kubectl logs -n splunk-operator deploy/splunk-operator-controller-manager -c manager --since=30m` + +## Operator Metrics and pprof +The operator registers pprof handlers on the metrics server when `--pprof=true` (default). +Default metrics bind address is `:8080` with `--metrics-secure=false`. + +Example local access: +`kubectl -n splunk-operator port-forward deploy/splunk-operator-controller-manager 8080:8080` +Then open `/debug/pprof` on `http://127.0.0.1:8080`. + +## Where Debug Endpoints Are Wired +- Registration: `internal/controller/debug/register.go` +- Flags and setup: `cmd/main.go` diff --git a/docs/agent/README.md b/docs/agent/README.md new file mode 100644 index 000000000..6d683713b --- /dev/null +++ b/docs/agent/README.md @@ -0,0 +1,13 @@ +# Agent Harness Docs + +These documents are the system of record for agent-assisted development in this repo. +They are short, concrete, and intended to be read by Codex skills and humans. 
+ +## Index +- `CRD_MAP.md` maps kinds to API versions, types, controllers, and enterprise logic files +- `RECONCILE_FLOW.md` outlines the reconciliation flow, gates, and status phases +- `TEST_MATRIX.md` lists unit and integration test paths and environment variables +- `TESTCASE_SPEC.yaml` is a template for generating new integration/KUTTL tests +- `TESTCASE_PATTERNS.md` maps SVA patterns and features to test helpers +- `OPERATIONS.md` provides debug commands, log access, and pprof access notes +- `RELEASE_FLOW.md` provides a concise release checklist and artifact map diff --git a/docs/agent/RECONCILE_FLOW.md b/docs/agent/RECONCILE_FLOW.md new file mode 100644 index 000000000..5899c3bd3 --- /dev/null +++ b/docs/agent/RECONCILE_FLOW.md @@ -0,0 +1,27 @@ +# Reconcile Flow + +This document describes the typical reconcile flow and the most common gates. + +## Control Flow +1. Controller `Reconcile` fetches the CR instance. +2. Paused annotation check may short-circuit and requeue. +3. Apply function in `pkg/splunk/enterprise/*` builds desired state. +4. Status is updated and the controller returns a requeue or completion. + +## Common Gates +- Paused annotation in `api/v4/*_types.go` or `api/v3/*_types.go`. +- Phase gating constants in `api/v4/common_types.go`. +- Predicate filtering in `internal/controller/common/predicate.go`. + +## Phases +`Pending`, `Ready`, `Updating`, `ScalingUp`, `ScalingDown`, `Terminating`, `Error`. + +## Where To Look First +- Controller entry: `internal/controller/_controller.go`. +- Apply logic: `pkg/splunk/enterprise/.go`. +- Shared helpers: `pkg/splunk/enterprise/util.go` and `pkg/splunk/common/*`. + +## Debug Checklist +- Confirm CR spec and status (`kubectl get -o yaml`). +- Check events (`kubectl get events -n --sort-by=.lastTimestamp`). +- Inspect operator logs for the CR name. 
diff --git a/docs/agent/RELEASE_FLOW.md b/docs/agent/RELEASE_FLOW.md new file mode 100644 index 000000000..c7c5db39a --- /dev/null +++ b/docs/agent/RELEASE_FLOW.md @@ -0,0 +1,34 @@ +# Release Flow + +This is a concise, repo-specific release flow intended for humans and Codex skills. + +## Inputs +- Release version (update `VERSION` in `Makefile` or set `VERSION=` in the environment). +- Target Splunk Enterprise compatibility (update docs and release notes accordingly). + +## Core Steps +1. Update `docs/ChangeLog.md` and any release notes. +2. Update compatibility notes in `docs/README.md` and `docs/SplunkOperatorUpgrade.md` as needed. +3. If CRDs changed, run `make generate` and `make manifests` (or `./scripts/verify_crd.sh`). +4. If bundle/CSV outputs are needed, run `make bundle` (or `./scripts/verify_bundle.sh`). +5. Run `make verify VERIFY_BUNDLE=1` to ensure generated outputs are consistent. +6. Run unit tests (`make test`) and any required integration tests. +7. Build/push images and bundle artifacts as required by release packaging. + +## Artifacts to Inspect +- CRDs: `config/crd/bases/` +- RBAC: `config/rbac/role.yaml` +- Bundle/CSV: `bundle/manifests/`, `bundle/manifests/splunk-operator.clusterserviceversion.yaml` +- Helm CRDs: `helm-chart/splunk-operator/crds` + +## Common Commands +- `make verify VERIFY_BUNDLE=1` +- `make bundle` +- `make bundle-build` and `make bundle-push` +- `make catalog-build` and `make catalog-push` + +## Docs to Update +- `docs/ChangeLog.md` +- `docs/README.md` +- `docs/SplunkOperatorUpgrade.md` +- `docs/Install.md` (if install defaults or requirements changed) diff --git a/docs/agent/TESTCASE_PATTERNS.md b/docs/agent/TESTCASE_PATTERNS.md new file mode 100644 index 000000000..e2df1acf7 --- /dev/null +++ b/docs/agent/TESTCASE_PATTERNS.md @@ -0,0 +1,53 @@ +# Testcase Patterns + +This doc maps common Splunk Validated Architectures (SVA) and features to test helpers. 
+ +## Integration Helpers (test/testenv/deployment.go) + +S1 (Standalone) +- `DeployStandalone` (basic) +- `DeployStandaloneWithGivenSmartStoreSpec` (smartstore) + +C3 (Single-site cluster, SHC optional) +- `DeploySingleSiteCluster` (basic) +- `DeploySingleSiteClusterWithGivenAppFrameworkSpec` (app framework) +- SmartStore for C3 requires manual flow: use `DeployClusterManagerWithSmartStoreIndexes` plus `DeployIndexerCluster` and optional SHC. + +M4 (Multisite with SHC) +- `DeployMultisiteClusterWithSearchHead` (basic) +- `DeployMultisiteClusterWithSearchHeadAndAppFramework` (app framework) +- `DeployMultisiteClusterWithSearchHeadAndIndexes` (smartstore) + +M1 (Multisite, no SHC) +- `DeployMultisiteCluster` (basic) +- For app framework, use `DeployMultisiteClusterWithSearchHeadAndAppFramework` with `shc=false` +- SmartStore without SHC requires manual flow or a custom helper + +## Readiness Helpers (test/testenv/verificationutils.go) +- `StandaloneReady` +- `ClusterManagerReady` or `LegacyClusterManagerReady` +- `SearchHeadClusterReady` +- `SingleSiteIndexersReady` or `IndexersReady` +- `IndexerClusterMultisiteStatus` +- `VerifyRFSFMet` +- `LicenseManagerReady` or `LegacyLicenseManagerReady` +- `VerifyMonitoringConsoleReady` + +## SVA Validation (Integration) +Use `validations` in `docs/agent/TESTCASE_SPEC.yaml` to auto-add readiness checks. 
+- For C3 SVA: `validations.sva: C3` (adds Monitoring Console + License Manager checks unless disabled) +- Ensure a license file/configmap is configured when enabling License Manager checks + +## App Framework Helpers +- `GenerateAppFrameworkSpec` (test/testenv/appframework_utils.go) +- `DeploySingleSiteClusterWithGivenAppFrameworkSpec` +- `DeployMultisiteClusterWithSearchHeadAndAppFramework` + +## SmartStore Helpers +- `DeployStandaloneWithGivenSmartStoreSpec` +- `DeployClusterManagerWithSmartStoreIndexes` +- `DeployMultisiteClusterWithSearchHeadAndIndexes` + +## Operator Upgrade (KUTTL) +- Use `upgrade` in `docs/agent/TESTCASE_SPEC.yaml` to generate helm install/upgrade steps. +- Example suite: `kuttl/tests/upgrade/c3-with-operator` (checks operator deployment and image). diff --git a/docs/agent/TESTCASE_SPEC.yaml b/docs/agent/TESTCASE_SPEC.yaml new file mode 100644 index 000000000..43ad7ea9f --- /dev/null +++ b/docs/agent/TESTCASE_SPEC.yaml @@ -0,0 +1,89 @@ +# Testcase specification template +# Save a copy and pass it to scripts/generate_testcase.py --spec + +# Test type: kuttl or integration +# - kuttl: generates kuttl/tests///... with deploy/assert steps +# - integration: generates test//_test.go scaffold + +type: kuttl +suite: smoke +name: c3-basic-ready + +architecture: + # Supported: S1, C3, M4, M1 (used by integration scaffolds) + name: C3 + indexerReplicas: 3 + siteCount: 1 + shc: true + # Use legacy v3 control-plane readiness helpers + useLegacyClusterManager: false + +features: + smartstore: false + appframework: false + +# Optional: extra readiness validations for integration tests +# For C3 SVA, set `sva: C3` to auto-enable Monitoring Console + License Manager checks +# (unless you explicitly set them to false). 
+validations: + sva: "" # e.g., C3 + monitoringConsole: false + monitoringConsoleName: "" # optional override (defaults to deployment name) + licenseManager: false # requires a license file/configmap to be configured + +# Optional: operator upgrade flow (KUTTL only) +# When enabled, generator will create: +# 00-install.yaml (helm install) +# 01-assert-operator-ready.yaml +# 02-upgrade.yaml (helm upgrade) +# 03-assert-operator-image.yaml +# 04-assert.yaml (CR readiness/resource asserts) +upgrade: + enabled: false + method: helm + helmRelease: splunk-test + helmChartPathEnv: HELM_REPO_PATH + namespaceEnv: NAMESPACE + valuesFile: "" # optional values file to copy into test dir + operatorImageEnv: KUTTL_SPLUNK_OPERATOR_IMAGE + operatorImageNewEnv: KUTTL_SPLUNK_OPERATOR_NEW_IMAGE + enterpriseImageEnv: KUTTL_SPLUNK_ENTERPRISE_IMAGE + enterpriseImageNewEnv: KUTTL_SPLUNK_ENTERPRISE_NEW_IMAGE + extraHelmArgs: [] + +# For a single-CR test, use `cr`. For multi-CR (C3/M4), use `crs` in deploy order. 
+# cr: +# path: ./path/to/your-cr.yaml +# apiVersion: enterprise.splunk.com/v4 +# kind: Standalone +# name: example-standalone + +crs: + - path: ./path/to/clustermanager.yaml + apiVersion: enterprise.splunk.com/v4 + kind: ClusterManager + name: test + - path: ./path/to/indexercluster.yaml + apiVersion: enterprise.splunk.com/v4 + kind: IndexerCluster + name: test + - path: ./path/to/searchheadcluster.yaml + apiVersion: enterprise.splunk.com/v4 + kind: SearchHeadCluster + name: test + +expected: + phase: Ready + # Optional: path to extra KUTTL asserts (resources, fields) appended after phase checks + assert_path: "" + +# Optional: additional resources for KUTTL assertion generation (if assert_path is empty) +resources: + - apiVersion: apps/v1 + kind: StatefulSet + name: splunk-test-cluster-manager + status: + replicas: 1 + - apiVersion: v1 + kind: Secret + name: splunk-test-cluster-manager-secret-v1 diff --git a/docs/agent/TEST_MATRIX.md b/docs/agent/TEST_MATRIX.md new file mode 100644 index 000000000..61335dc86 --- /dev/null +++ b/docs/agent/TEST_MATRIX.md @@ -0,0 +1,33 @@ +# Test Matrix + +## Unit Tests +- Command: `make test` +- Scope: `./pkg/splunk/common`, `./pkg/splunk/enterprise`, `./internal/controller`, and related packages. + +## Integration Tests (Kind) +- Create cluster: `make cluster-up` +- Run tests: `make int-test` +- Teardown: `make cluster-down` + +## KUTTL Tests +- Suite config: `kuttl/kuttl-test.yaml` +- Tests live under: `kuttl/tests/` +- Example run (if kuttl is installed): `kubectl kuttl test --config kuttl/kuttl-test.yaml` +- Upgrade suite: `kubectl kuttl test --config kuttl/kuttl-test-helm-upgrade.yaml` + +## Patterns +- Use `docs/agent/TESTCASE_PATTERNS.md` for SVA helper mapping. 
+ +## Environment Variables (from `test/env.sh`) +- `SPLUNK_OPERATOR_IMAGE` default `splunk/splunk-operator:latest` +- `SPLUNK_ENTERPRISE_IMAGE` default `splunk/splunk:latest` +- `CLUSTER_PROVIDER` default `kind` +- `PRIVATE_REGISTRY` default `localhost:5000` for kind +- `TEST_REGEX` or `TEST_FOCUS` to filter tests +- `SKIP_REGEX` to skip tests +- `CLUSTER_WIDE` to install operator cluster-wide +- `DEPLOYMENT_TYPE` set to `manifest` or `helm` + +## Targeted Test Runs +- Run a single suite: `cd test/ && ginkgo -v -progress ...` +- Default focus is `smoke` when `TEST_REGEX` is not set. diff --git a/internal/controller/AGENTS.md b/internal/controller/AGENTS.md new file mode 100644 index 000000000..ef0db51d2 --- /dev/null +++ b/internal/controller/AGENTS.md @@ -0,0 +1,23 @@ +# internal/controller/ — Reconcilers + +## What Lives Here +- Controller setup and reconciliation logic for CRs +- Watches, predicates, and event handling + +## Invariants +- Reconcile must be idempotent and safe to retry. +- Status updates must reflect actual observed state. +- Avoid tight loops; use requeues sparingly and intentionally. + +## Common Pitfalls +- Updating status without checking resource version or observed generation. +- Creating resources without proper ownership or labels. +- Missing RBAC updates when new resources are added. + +## Commands +- Unit tests (envtest + ginkgo): `make test` +- Repo verify: `make verify-repo` + +## Notes +Controller behavior is tightly coupled to `pkg/splunk/enterprise/` helpers. +When updating reconciliation, update or add tests in `test/`. 
diff --git a/pkg/splunk/enterprise/AGENTS.md b/pkg/splunk/enterprise/AGENTS.md new file mode 100644 index 000000000..36f5ae562 --- /dev/null +++ b/pkg/splunk/enterprise/AGENTS.md @@ -0,0 +1,24 @@ +# pkg/splunk/enterprise/ — Core Operator Logic + +## What Lives Here +- Enterprise CR orchestration and state transitions +- App framework workflows +- Stateful resource creation helpers + +## Invariants +- State transitions must be monotonic and recoverable. +- Helpers should be idempotent and tolerate partial resources. +- Respect spec defaults and validate inputs before use. + +## Common Pitfalls +- Assuming resources exist without checking. +- Updating status too early (before resources are ready). +- Cross-CR dependencies without clear ordering. + +## Commands +- Unit tests: `make test` +- Repo verify: `make verify-repo` + +## Notes +When touching app framework paths, add/adjust tests in `test/` and +consider any KUTTL coverage under `kuttl/`. diff --git a/scripts/debug_reconcile.sh b/scripts/debug_reconcile.sh new file mode 100755 index 000000000..849eaf72e --- /dev/null +++ b/scripts/debug_reconcile.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + echo "Usage: $(basename "$0") " >&2 + echo "Environment overrides:" >&2 + echo " OPERATOR_NAMESPACE (default: splunk-operator)" >&2 + echo " LOG_SINCE (default: 30m)" >&2 + echo " OUTPUT_DIR (default: ./.agent-output/reconcile---)" >&2 +} + +if [[ $# -ne 3 ]]; then + usage + exit 1 +fi + +kind="$1" +name="$2" +namespace="$3" + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi + +cd "${repo_root}" + +if ! 
command -v kubectl >/dev/null 2>&1; then + echo "kubectl is required for debug_reconcile.sh" >&2 + exit 1 +fi + +operator_namespace="${OPERATOR_NAMESPACE:-splunk-operator}" +log_since="${LOG_SINCE:-30m}" + +if [[ -n "${OUTPUT_DIR:-}" ]]; then + out_dir="${OUTPUT_DIR}" +else + ts="$(date +%Y%m%d-%H%M%S)" + out_dir="${repo_root}/.agent-output/reconcile-${kind}-${name}-${ts}" +fi + +mkdir -p "${out_dir}" + +printf "Collecting reconcile debug data into %s\n" "${out_dir}" + +kubectl get "${kind}" "${name}" -n "${namespace}" -o yaml > "${out_dir}/cr.yaml" +kubectl describe "${kind}" "${name}" -n "${namespace}" > "${out_dir}/cr.describe.txt" + +kubectl get events -n "${namespace}" --sort-by=.lastTimestamp > "${out_dir}/events.txt" + +kubectl get pods -n "${namespace}" -o wide > "${out_dir}/pods.txt" +kubectl get svc -n "${namespace}" -o wide > "${out_dir}/services.txt" + +kubectl logs -n "${operator_namespace}" deploy/splunk-operator-controller-manager -c manager --since="${log_since}" > "${out_dir}/operator.logs.txt" + +printf "Done.\n" diff --git a/scripts/dev/envtest.sh b/scripts/dev/envtest.sh new file mode 100755 index 000000000..922d3ed9a --- /dev/null +++ b/scripts/dev/envtest.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." 
>&2 + exit 1 +fi + +cd "${repo_root}" +echo "Ensuring envtest assets are available" +make envtest + +if [[ "${RUN_TESTS:-}" == "1" ]]; then + echo "Running unit/envtest suite (make test)" + make test +fi diff --git a/scripts/dev/kind_smoke.sh b/scripts/dev/kind_smoke.sh new file mode 100755 index 000000000..62ef94d2e --- /dev/null +++ b/scripts/dev/kind_smoke.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi + +cd "${repo_root}" + +if ! command -v kubectl >/dev/null 2>&1; then + echo "kubectl not found in PATH." >&2 + exit 1 +fi +if ! command -v kind >/dev/null 2>&1; then + echo "kind not found in PATH." >&2 + exit 1 +fi + +export CLUSTER_PROVIDER=kind +export TEST_CLUSTER_PLATFORM=kind +: "${TEST_CLUSTER_NAME:=sok-kind}" +: "${CLUSTER_WORKERS:=3}" + +echo "Bringing up kind cluster" +make cluster-up + +: "${NAMESPACE:=splunk-operator}" +: "${WATCH_NAMESPACE:=${NAMESPACE}}" +: "${ENVIRONMENT:=default}" +: "${SPLUNK_GENERAL_TERMS:=--accept-sgt-current-at-splunk-com}" +: "${SPLUNK_ENTERPRISE_IMAGE:=splunk/splunk:latest}" +: "${IMG:=splunk/splunk-operator:latest}" + +export NAMESPACE WATCH_NAMESPACE ENVIRONMENT SPLUNK_GENERAL_TERMS SPLUNK_ENTERPRISE_IMAGE IMG + +kubectl create namespace "${NAMESPACE}" >/dev/null 2>&1 || true + +echo "Deploying operator" +make deploy + +echo "Waiting for operator deployment to be ready" +kubectl -n "${NAMESPACE}" rollout status deploy/splunk-operator-controller-manager --timeout="${OPERATOR_READY_TIMEOUT:-5m}" + +if [[ "${APPLY_SAMPLE:-}" == "1" ]]; then + echo "Applying sample CR (best effort)" + kubectl -n "${NAMESPACE}" apply -f config/samples/enterprise_v4_standalone.yaml +fi + +echo "Kind smoke complete" diff --git a/scripts/dev/lint.sh b/scripts/dev/lint.sh new 
file mode 100755 index 000000000..c9124fe6a --- /dev/null +++ b/scripts/dev/lint.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi + +cd "${repo_root}" +echo "Running format + vet" +make fmt vet + +if [[ "${RUN_STATICCHECK:-}" == "1" ]]; then + echo "Running staticcheck" + make scheck +fi + +if [[ "${RUN_BIAS_LINT:-}" == "1" ]]; then + echo "Running bias language linter" + make lang +fi diff --git a/scripts/dev/pr_check.sh b/scripts/dev/pr_check.sh new file mode 100755 index 000000000..2b0f59108 --- /dev/null +++ b/scripts/dev/pr_check.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." 
>&2 + exit 1 +fi + +cd "${repo_root}" + +args=() +if [[ "${RUN_ALL:-}" == "1" ]]; then + args+=(--all) +else + if [[ "${RUN_BUNDLE:-}" == "1" ]]; then + args+=(--bundle) + fi + if [[ "${RUN_TESTS:-}" == "1" ]]; then + args+=(--tests) + fi +fi + +if [[ -n "${PR_CHECK_FLAGS:-}" ]]; then + # shellcheck disable=SC2206 + extra_flags=(${PR_CHECK_FLAGS}) + args+=("${extra_flags[@]}") +fi + +echo "Running repo PR checks: ./scripts/verify_repo.sh ${args[*]}" +./scripts/verify_repo.sh "${args[@]}" diff --git a/scripts/dev/scope_check.sh b/scripts/dev/scope_check.sh new file mode 100755 index 000000000..b9ea154f7 --- /dev/null +++ b/scripts/dev/scope_check.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi + +cd "${repo_root}" + +allowed_raw="${ALLOWED_PATHS:-}" +forbidden_raw="${FORBIDDEN_PATHS:-}" + +split_patterns() { + local raw="$1" + raw="${raw//,/ }" + echo "${raw}" +} + +allowed_patterns=() +forbidden_patterns=() + +if [[ -n "${allowed_raw}" ]]; then + # shellcheck disable=SC2206 + allowed_patterns=($(split_patterns "${allowed_raw}")) +fi +if [[ -n "${forbidden_raw}" ]]; then + # shellcheck disable=SC2206 + forbidden_patterns=($(split_patterns "${forbidden_raw}")) +fi + +files=() +while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") +done < <(git diff --name-only --diff-filter=ACMR) + +while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") +done < <(git diff --name-only --cached --diff-filter=ACMR) + +while IFS= read -r line; do + [[ -n "${line}" ]] && files+=("${line}") +done < <(git ls-files --others --exclude-standard) + +if [[ ${#files[@]} -eq 0 ]]; then + echo "No changed files detected." 
+ exit 0 +fi + +errors=() + +if [[ ${#allowed_patterns[@]} -gt 0 ]]; then + for file in "${files[@]}"; do + matched=false + for pat in "${allowed_patterns[@]}"; do + if [[ "${file}" == ${pat} ]]; then + matched=true + break + fi + done + if [[ "${matched}" == "false" ]]; then + errors+=("File not in allowed scope: ${file}") + fi + done +fi + +if [[ ${#forbidden_patterns[@]} -gt 0 ]]; then + for file in "${files[@]}"; do + for pat in "${forbidden_patterns[@]}"; do + if [[ "${file}" == ${pat} ]]; then + errors+=("File in forbidden scope: ${file} (matches ${pat})") + fi + done + done +fi + +if [[ ${#errors[@]} -gt 0 ]]; then + printf "Scope check failed:\n" + printf " - %s\n" "${errors[@]}" + exit 1 +fi + +echo "Scope check passed." diff --git a/scripts/dev/unit.sh b/scripts/dev/unit.sh new file mode 100755 index 000000000..a40b14f80 --- /dev/null +++ b/scripts/dev/unit.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." 
>&2 + exit 1 +fi + +cd "${repo_root}" +echo "Running unit/envtest suite (make test)" +make test diff --git a/scripts/generate_testcase.py b/scripts/generate_testcase.py new file mode 100755 index 000000000..1e405e38d --- /dev/null +++ b/scripts/generate_testcase.py @@ -0,0 +1,621 @@ +#!/usr/bin/env python3 +import argparse +import json +import os +import re +import shutil +import sys +from datetime import datetime +from pathlib import Path + + +def load_spec(path: Path): + suffix = path.suffix.lower() + if suffix == ".json": + with path.open("r", encoding="utf-8") as f: + return json.load(f) + if suffix in (".yaml", ".yml"): + try: + import yaml # type: ignore + except Exception: + print("[ERROR] PyYAML is required to read YAML specs.") + print("Install with: python3 -m pip install pyyaml") + sys.exit(2) + with path.open("r", encoding="utf-8") as f: + return yaml.safe_load(f) + print(f"[ERROR] Unsupported spec extension: {suffix}") + print("Use .yaml, .yml, or .json") + sys.exit(2) + + +def slugify(value: str) -> str: + value = value.strip().lower() + value = re.sub(r"[^a-z0-9]+", "-", value) + value = re.sub(r"-+", "-", value).strip("-") + return value or "test" + + +def ensure_dir(path: Path): + path.mkdir(parents=True, exist_ok=True) + + +def read_text(path: Path) -> str: + return path.read_text(encoding="utf-8") + + +def write_text(path: Path, content: str, force: bool): + if path.exists() and not force: + print(f"[ERROR] Refusing to overwrite existing file: {path}") + print("Use --force to overwrite") + sys.exit(1) + path.write_text(content, encoding="utf-8") + + +def indent_block(text: str, spaces: int) -> str: + prefix = " " * spaces + return "\n".join(prefix + line if line.strip() else "" for line in text.splitlines()) + + +def go_bool(value: bool) -> str: + return "true" if value else "false" + + +def kuttl_assert_for_resource(res: dict) -> str: + api_version = res.get("apiVersion", "") + kind = res.get("kind", "") + name = res.get("name", "") + status = 
res.get("status") + lines = ["---", f"apiVersion: {api_version}", f"kind: {kind}", "metadata:", f" name: {name}"] + if isinstance(status, dict) and status: + lines.append("status:") + for key, value in status.items(): + if isinstance(value, str): + lines.append(f" {key}: {value}") + else: + lines.append(f" {key}: {value}") + return "\n".join(lines) + "\n" + + +def generate_kuttl(spec: dict, repo_root: Path, force: bool, dry_run: bool): + suite = spec["suite"] + name = spec["name"] + crs = spec.get("crs") + if crs is None: + cr = spec.get("cr") + if cr is None: + crs = [] + else: + crs = [cr] + if not isinstance(crs, list): + print("[ERROR] crs must be a list") + sys.exit(1) + upgrade = spec.get("upgrade", {}) if isinstance(spec.get("upgrade", {}), dict) else {} + upgrade_enabled = bool(upgrade.get("enabled", False)) + if not crs and not upgrade_enabled: + print("[ERROR] kuttl spec requires cr or crs unless upgrade.enabled=true") + sys.exit(1) + expected = spec.get("expected", {}) + resources = spec.get("resources", []) + phase = expected.get("phase", "Ready") + phases = expected.get("phases", {}) + assert_path = expected.get("assert_path", "") + + test_dir = repo_root / "kuttl" / "tests" / suite / name + assert_name = "00-assert.yaml" if not upgrade_enabled else "04-assert.yaml" + assert_target = test_dir / assert_name + + if dry_run: + print(f"[DRY-RUN] Create {test_dir}") + if upgrade_enabled: + print(f"[DRY-RUN] Write {test_dir / '00-install.yaml'}") + print(f"[DRY-RUN] Write {test_dir / '01-assert-operator-ready.yaml'}") + print(f"[DRY-RUN] Write {test_dir / '02-upgrade.yaml'}") + print(f"[DRY-RUN] Write {test_dir / '03-assert-operator-image.yaml'}") + for index, cr in enumerate(crs): + kind = cr.get("kind", "") + deploy_index = index if not upgrade_enabled else index + 4 + deploy_name = f"{deploy_index:02d}-deploy-{slugify(kind)}.yaml" + cr_path = Path(cr.get("path", "")).expanduser() + if not cr_path.is_absolute(): + cr_path = (repo_root / cr_path).resolve() 
+ print(f"[DRY-RUN] Copy {cr_path} -> {test_dir / deploy_name}") + print(f"[DRY-RUN] Write {assert_target}") + return + + ensure_dir(test_dir) + + if upgrade_enabled: + method = str(upgrade.get("method", "helm")).lower() + if method != "helm": + print("[ERROR] upgrade.method only supports 'helm' for now") + sys.exit(1) + + helm_release = str(upgrade.get("helmRelease", "splunk-test")) + helm_repo_env = str(upgrade.get("helmChartPathEnv", "HELM_REPO_PATH")) + namespace_env = str(upgrade.get("namespaceEnv", "NAMESPACE")) + values_file = str(upgrade.get("valuesFile", "")).strip() + operator_image_env = str(upgrade.get("operatorImageEnv", "KUTTL_SPLUNK_OPERATOR_IMAGE")) + operator_image_new_env = str(upgrade.get("operatorImageNewEnv", "KUTTL_SPLUNK_OPERATOR_NEW_IMAGE")) + enterprise_image_env = str(upgrade.get("enterpriseImageEnv", "KUTTL_SPLUNK_ENTERPRISE_IMAGE")) + enterprise_image_new_env = str(upgrade.get("enterpriseImageNewEnv", "KUTTL_SPLUNK_ENTERPRISE_NEW_IMAGE")) + extra_args = upgrade.get("extraHelmArgs", []) + if not isinstance(extra_args, list): + extra_args = [] + + values_arg = "" + if values_file: + values_path = Path(values_file).expanduser() + if not values_path.is_absolute(): + values_path = (repo_root / values_path).resolve() + if not values_path.exists(): + print(f"[ERROR] valuesFile not found: {values_path}") + sys.exit(1) + values_target = test_dir / values_path.name + if not values_target.exists() or force: + shutil.copyfile(values_path, values_target) + values_arg = f"-f {values_target.name}" + + extra = " ".join(extra_args) + install_cmd = ( + f"helm install {helm_release} " + f"${{{helm_repo_env}}}/splunk-enterprise {values_arg} " + f"--set splunk-operator.splunkOperator.image.repository=${{{operator_image_env}}} " + f"--set splunk-operator.image.repository=${{{enterprise_image_env}}} " + f"--namespace ${{{namespace_env}}} " + f"--set splunk-operator.splunkOperator.splunkGeneralTerms=\\\"--accept-sgt-current-at-splunk-com\\\" " + f"{extra}" + 
).strip() + + upgrade_cmd = ( + f"helm upgrade {helm_release} " + f"${{{helm_repo_env}}}/splunk-enterprise --reuse-values {values_arg} " + f"--set splunk-operator.splunkOperator.image.repository=${{{operator_image_new_env}}} " + f"--set splunk-operator.image.repository=${{{enterprise_image_new_env}}} " + f"--namespace ${{{namespace_env}}} " + f"{extra}" + ).strip() + + install_step = f\"\"\"---\napiVersion: kuttl.dev/v1beta1\nkind: TestStep\ncommands:\n - command: {install_cmd}\n namespaced: true\n\"\"\"\n+ ready_assert = \"\"\"---\napiVersion: apps/v1\nkind: Deployment\nmetadata:\n name: splunk-operator-controller-manager\nstatus:\n readyReplicas: 1\n availableReplicas: 1\n\"\"\"\n+ upgrade_step = f\"\"\"---\napiVersion: kuttl.dev/v1beta1\nkind: TestStep\ncommands:\n - command: {upgrade_cmd}\n namespaced: true\n\"\"\"\n+ image_check_cmd = (\n+ f\"kubectl -n ${{{namespace_env}}} get deploy splunk-operator-controller-manager \"\n+ f\"-o jsonpath='{{{{.spec.template.spec.containers[?(@.name==\\\\\\\"manager\\\\\\\")].image}}}}' \"\n+ f\"| grep -q \\\"${{{operator_image_new_env}}}\\\"\"\n+ )\n+ image_assert_step = f\"\"\"---\napiVersion: kuttl.dev/v1beta1\nkind: TestStep\ncommands:\n - command: {image_check_cmd}\n namespaced: true\n\"\"\"\n+\n+ write_text(test_dir / \"00-install.yaml\", install_step, force)\n+ write_text(test_dir / \"01-assert-operator-ready.yaml\", ready_assert, force)\n+ write_text(test_dir / \"02-upgrade.yaml\", upgrade_step, force)\n+ write_text(test_dir / \"03-assert-operator-image.yaml\", image_assert_step, force)\n+ + for index, cr in enumerate(crs): + api_version = cr.get("apiVersion", "") + kind = cr.get("kind", "") + cr_name = cr.get("name", "") + cr_path = Path(cr.get("path", "")).expanduser() + if not api_version or not kind or not cr_name: + print("[ERROR] crs entries must include apiVersion, kind, and name") + sys.exit(1) + if not cr_path.is_absolute(): + cr_path = (repo_root / cr_path).resolve() + if not cr_path.exists(): + 
print(f"[ERROR] CR manifest not found: {cr_path}") + sys.exit(1) + deploy_index = index if not upgrade_enabled else index + 4 + deploy_name = f"{deploy_index:02d}-deploy-{slugify(kind)}.yaml" + deploy_target = test_dir / deploy_name + if deploy_target.exists() and not force: + print(f"[ERROR] Deploy file exists: {deploy_target}") + sys.exit(1) + shutil.copyfile(cr_path, deploy_target) + + # Build assert content + content = [] + for cr in crs: + api_version = cr.get("apiVersion", "") + kind = cr.get("kind", "") + cr_name = cr.get("name", "") + if not api_version or not kind or not cr_name: + print("[ERROR] crs entries must include apiVersion, kind, and name") + sys.exit(1) + phase_for_cr = phases.get(cr_name, phase) if isinstance(phases, dict) else phase + content.append("---") + content.append(f"apiVersion: {api_version}") + content.append(f"kind: {kind}") + content.append("metadata:") + content.append(f" name: {cr_name}") + content.append("status:") + content.append(f" phase: {phase_for_cr}") + content.append("") + + if assert_path: + assert_file = Path(assert_path).expanduser() + if not assert_file.is_absolute(): + assert_file = (repo_root / assert_file).resolve() + if not assert_file.exists(): + print(f"[ERROR] assert_path not found: {assert_file}") + sys.exit(1) + content.append(read_text(assert_file).rstrip()) + content.append("") + elif resources: + for res in resources: + content.append(kuttl_assert_for_resource(res).rstrip()) + elif crs: + content.append("# TODO: add resource assertions (StatefulSet, Service, Secret, etc.)") + + if content: + write_text(assert_target, "\n".join(content).rstrip() + "\n", force) + + readme_path = test_dir / "readme.txt" + if not readme_path.exists(): + readme_path.write_text(f"KUTTL test: {name}\n", encoding="utf-8") + + print(f"[OK] Created KUTTL test: {test_dir}") + + +def suite_template(suite: str) -> str: + suite_name = slugify(suite) + return f"""// Copyright (c) 2018-2022 Splunk Inc. All rights reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the \"License\"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +//\thttp://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an \"AS IS\" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package {suite_name} + +import ( + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/splunk/splunk-operator/test/testenv" +) + +const ( + PollInterval = 5 * time.Second + ConsistentPollInterval = 200 * time.Millisecond + ConsistentDuration = 2000 * time.Millisecond +) + +var ( + testenvInstance *testenv.TestEnv + testSuiteName = "{suite_name}-" + testenv.RandomDNSName(3) +) + +func TestBasic(t *testing.T) {{ + RegisterFailHandler(Fail) + RunSpecs(t, "Running "+testSuiteName) +}} + +var _ = BeforeSuite(func() {{ + var err error + testenvInstance, err = testenv.NewDefaultTestEnv(testSuiteName) + Expect(err).ToNot(HaveOccurred()) +}}) + +var _ = AfterSuite(func() {{ + if testenvInstance != nil {{ + Expect(testenvInstance.Teardown()).ToNot(HaveOccurred()) + }} +}}) +""" + + +def integration_flow(spec: dict): + arch = spec.get("architecture", {}) if isinstance(spec.get("architecture", {}), dict) else {} + arch_name = str(arch.get("name", "")).upper() + indexer_replicas = int(arch.get("indexerReplicas", 3) or 3) + site_count = int(arch.get("siteCount", 3) or 3) + shc = bool(arch.get("shc", True)) + use_legacy_cluster_manager = bool(arch.get("useLegacyClusterManager", False)) + features = spec.get("features", {}) if isinstance(spec.get("features", {}), dict) else {} + use_smartstore = bool(features.get("smartstore", False)) + use_appframework = 
bool(features.get("appframework", False)) + + deploy_lines = [] + ready_lines = [] + notes = [] + + upgrade = spec.get("upgrade", {}) if isinstance(spec.get("upgrade", {}), dict) else {} + upgrade_enabled = bool(upgrade.get("enabled", False)) + operator_image_env = str(upgrade.get("operatorImageNewEnv", "UPGRADE_OPERATOR_IMAGE")) + enterprise_image_env = str(upgrade.get("enterpriseImageNewEnv", "UPGRADE_SPLUNK_IMAGE")) + upgrade_lines = [] + + validations = spec.get("validations", {}) if isinstance(spec.get("validations", {}), dict) else {} + sva = str(validations.get("sva", "")).strip().upper() + has_mc_flag = isinstance(validations, dict) and "monitoringConsole" in validations + has_lm_flag = isinstance(validations, dict) and "licenseManager" in validations + mc_enabled = bool(validations.get("monitoringConsole", False)) + lm_enabled = bool(validations.get("licenseManager", False)) + mc_name_override = str(validations.get("monitoringConsoleName", "")).strip() + if sva in ("C3", "SINGLE-SITE", "SINGLE_SITE", "SVA") and arch_name in ("C3", "SINGLE-SITE", "SINGLE_SITE"): + if not has_mc_flag: + mc_enabled = True + if not has_lm_flag: + lm_enabled = True + + go_shc = go_bool(shc) + + if arch_name in ("", "S1", "STANDALONE"): + if use_smartstore: + deploy_lines.append('Skip("TODO: implement smartstore standalone using DeployStandaloneWithGivenSmartStoreSpec")') + elif use_appframework: + deploy_lines.append('Skip("TODO: implement app framework standalone (no helper) using custom spec")') + else: + deploy_lines.append('instance, err := deployment.DeployStandalone(ctx, deployment.GetName(), "", "")') + deploy_lines.append('Expect(err).To(Succeed(), "Unable to deploy standalone instance")') + ready_lines.append("testenv.StandaloneReady(ctx, deployment, deployment.GetName(), instance, testcaseEnvInst)") + elif arch_name in ("C3", "SINGLE-SITE", "SINGLE_SITE"): + if use_appframework: + deploy_lines.append('Skip("TODO: implement app framework using 
DeploySingleSiteClusterWithGivenAppFrameworkSpec")') + elif use_smartstore: + deploy_lines.append('Skip("TODO: implement smartstore using DeployClusterManagerWithSmartStoreIndexes + DeployIndexerCluster")') + else: + if mc_enabled: + mc_ref_expr = "deployment.GetName()" + if mc_name_override: + mc_ref_expr = json.dumps(mc_name_override) + deploy_lines.append(f"mcRef := {mc_ref_expr}") + deploy_lines.append('lmRef := ""') + if lm_enabled: + deploy_lines.append("lmRef = deployment.GetName()") + deploy_lines.append( + f'err := deployment.DeploySingleSiteCluster(ctx, deployment.GetName(), {indexer_replicas}, {go_shc}, {"mcRef" if mc_enabled else "\"\""})' + ) + deploy_lines.append('Expect(err).To(Succeed(), "Unable to deploy single-site cluster")') + ready_lines.append( + "testenv.LegacyClusterManagerReady(ctx, deployment, testcaseEnvInst)" + if use_legacy_cluster_manager + else "testenv.ClusterManagerReady(ctx, deployment, testcaseEnvInst)" + ) + if shc: + ready_lines.append("testenv.SearchHeadClusterReady(ctx, deployment, testcaseEnvInst)") + ready_lines.append("testenv.SingleSiteIndexersReady(ctx, deployment, testcaseEnvInst)") + ready_lines.append("testenv.VerifyRFSFMet(ctx, deployment, testcaseEnvInst)") + if lm_enabled: + ready_lines.append( + "testenv.LegacyLicenseManagerReady(ctx, deployment, testcaseEnvInst)" + if use_legacy_cluster_manager + else "testenv.LicenseManagerReady(ctx, deployment, testcaseEnvInst)" + ) + notes.append("License Manager readiness requires a license file/configmap to be configured for the test env.") + if mc_enabled: + ready_lines.append('mc, err := deployment.DeployMonitoringConsole(ctx, mcRef, lmRef)') + ready_lines.append('Expect(err).To(Succeed(), "Unable to deploy Monitoring Console")') + ready_lines.append("testenv.VerifyMonitoringConsoleReady(ctx, deployment, mcRef, mc, testcaseEnvInst)") + elif arch_name in ("M4", "MULTISITE", "MULTI-SITE", "M4-SHC", "MULTISITE-SHC"): + if use_appframework: + deploy_lines.append('Skip("TODO: 
implement app framework using DeployMultisiteClusterWithSearchHeadAndAppFramework")') + elif use_smartstore: + deploy_lines.append('Skip("TODO: implement smartstore using DeployMultisiteClusterWithSearchHeadAndIndexes")') + else: + deploy_lines.append( + f'err := deployment.DeployMultisiteClusterWithSearchHead(ctx, deployment.GetName(), {indexer_replicas}, {site_count}, "")' + ) + deploy_lines.append('Expect(err).To(Succeed(), "Unable to deploy multisite cluster with SHC")') + ready_lines.append( + "testenv.LegacyClusterManagerReady(ctx, deployment, testcaseEnvInst)" + if use_legacy_cluster_manager + else "testenv.ClusterManagerReady(ctx, deployment, testcaseEnvInst)" + ) + ready_lines.append(f"testenv.IndexersReady(ctx, deployment, testcaseEnvInst, {site_count})") + ready_lines.append(f"testenv.IndexerClusterMultisiteStatus(ctx, deployment, testcaseEnvInst, {site_count})") + ready_lines.append("testenv.SearchHeadClusterReady(ctx, deployment, testcaseEnvInst)") + ready_lines.append("testenv.VerifyRFSFMet(ctx, deployment, testcaseEnvInst)") + elif arch_name in ("M1", "MULTISITE-NOSHC", "MULTISITE_NO_SHC", "MULTISITE-NO-SHC"): + if use_appframework: + deploy_lines.append('Skip("TODO: implement app framework using DeployMultisiteClusterWithSearchHeadAndAppFramework (shc=false)")') + elif use_smartstore: + deploy_lines.append('Skip("TODO: implement smartstore multisite without SHC (no helper) - use custom flow")') + else: + deploy_lines.append( + f'err := deployment.DeployMultisiteCluster(ctx, deployment.GetName(), {indexer_replicas}, {site_count}, "")' + ) + deploy_lines.append('Expect(err).To(Succeed(), "Unable to deploy multisite cluster")') + ready_lines.append( + "testenv.LegacyClusterManagerReady(ctx, deployment, testcaseEnvInst)" + if use_legacy_cluster_manager + else "testenv.ClusterManagerReady(ctx, deployment, testcaseEnvInst)" + ) + ready_lines.append(f"testenv.IndexersReady(ctx, deployment, testcaseEnvInst, {site_count})") + 
ready_lines.append(f"testenv.IndexerClusterMultisiteStatus(ctx, deployment, testcaseEnvInst, {site_count})") + ready_lines.append("testenv.VerifyRFSFMet(ctx, deployment, testcaseEnvInst)") + else: + deploy_lines.append('Skip("TODO: unsupported architecture in generator. Update spec or add helper mapping.")') + notes.append("Use docs/agent/TESTCASE_PATTERNS.md to pick the right helper.") + + if deploy_lines and deploy_lines[0].startswith("Skip("): + ready_lines = ["// TODO: add readiness checks once deployment is implemented"] + + if upgrade_enabled: + upgrade_lines = [ + f'operatorImage := os.Getenv("{operator_image_env}")', + 'Expect(operatorImage).ToNot(BeEmpty())', + 'err = testcaseEnvInst.UpdateOperatorImage(operatorImage)', + 'Expect(err).To(Succeed(), "Unable to update operator image")', + 'testenv.VerifyOperatorImage(ctx, testcaseEnvInst, operatorImage)', + '', + f'splunkImage := os.Getenv("{enterprise_image_env}")', + 'Expect(splunkImage).ToNot(BeEmpty())', + '// TODO: update CR spec image to splunkImage and wait for reconciliation', + '// TODO: verify splunk pod images updated', + 'testenv.VerifySplunkPodImagesContain(testcaseEnvInst.GetName(), splunkImage)', + ] + + return deploy_lines, ready_lines, upgrade_lines, notes, upgrade_enabled + + +def integration_template(spec: dict) -> str: + suite = slugify(spec["suite"]) + name = spec["name"] + arch = spec.get("architecture", {}) if isinstance(spec.get("architecture", {}), dict) else {} + arch_name = str(arch.get("name", "")).upper() or "Custom" + cr = spec.get("cr", {}) if isinstance(spec.get("cr", {}), dict) else {} + kind = cr.get("kind", "") or arch_name + cr_path = cr.get("path", "") + + deploy_lines, ready_lines, upgrade_lines, notes, upgrade_enabled = integration_flow(spec) + deploy_snippet = indent_block("\n".join(deploy_lines), 12) + ready_snippet = indent_block("\n".join(ready_lines), 12) + upgrade_snippet = "" + post_upgrade_ready = "" + if upgrade_lines: + upgrade_snippet = 
indent_block("\n".join(upgrade_lines), 12) + post_upgrade_ready = ready_snippet + notes_snippet = "" + if notes: + notes_snippet = indent_block("\n".join([f"// NOTE: {n}" for n in notes]), 12) + extra_imports = "" + if upgrade_enabled: + extra_imports = " \"os\"\n" + + return f"""// Copyright (c) 2018-2022 Splunk Inc. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the \"License\"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +//\thttp://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an \"AS IS\" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package {suite} + +import ( + "context" + "fmt" +{extra_imports} + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "github.com/onsi/ginkgo/v2/types" + + "github.com/splunk/splunk-operator/test/testenv" +) + +var _ = Describe("{kind} integration test", func() {{ + var testcaseEnvInst *testenv.TestCaseEnv + var deployment *testenv.Deployment + ctx := context.TODO() + + BeforeEach(func() {{ + var err error + name := fmt.Sprintf("%s-%s", testenvInstance.GetName(), testenv.RandomDNSName(3)) + testcaseEnvInst, err = testenv.NewDefaultTestCaseEnv(testenvInstance.GetKubeClient(), name) + Expect(err).To(Succeed(), "Unable to create testcaseenv") + deployment, err = testcaseEnvInst.NewDeployment(testenv.RandomDNSName(3)) + Expect(err).To(Succeed(), "Unable to create deployment") + }}) + + AfterEach(func() {{ + if types.SpecState(CurrentSpecReport().State) == types.SpecStateFailed {{ + testcaseEnvInst.SkipTeardown = true + }} + if deployment != nil {{ + deployment.Teardown() + }} + if testcaseEnvInst != nil {{ + Expect(testcaseEnvInst.Teardown()).ToNot(HaveOccurred()) + }} + }}) + + Context("{name}", func() {{ + It("integration, {kind}: {name}", func() {{ + // Architecture: {arch_name} + // Spec path: {cr_path} +{notes_snippet} +{deploy_snippet} + + {ready_snippet} + +{upgrade_snippet} + +{post_upgrade_ready} + + // TODO: verify resources (pods, services, statefulsets, secrets) + }}) + }}) +}}) +""" + + +def generate_integration(spec: dict, repo_root: Path, force: bool, dry_run: bool): + suite = slugify(spec["suite"]) + name = slugify(spec["name"]).replace("-", "_") + + suite_dir = repo_root / "test" / suite + test_file = suite_dir / f"{name}_test.go" + suite_file = suite_dir / f"{suite}_suite_test.go" + + if dry_run: + print(f"[DRY-RUN] Create {suite_dir}") + print(f"[DRY-RUN] Write {test_file}") + if not suite_file.exists(): + print(f"[DRY-RUN] Write {suite_file}") + return + + ensure_dir(suite_dir) + + if not suite_file.exists(): + write_text(suite_file, suite_template(suite), force=False) + print(f"[OK] Created suite file: {suite_file}") + + 
write_text(test_file, integration_template(spec), force) + print(f"[OK] Created integration test: {test_file}") + + +def main(): + parser = argparse.ArgumentParser(description="Generate KUTTL or integration test scaffolds from a spec file.") + parser.add_argument("--spec", required=True, help="Path to testcase spec (.yaml/.yml/.json)") + parser.add_argument("--force", action="store_true", help="Overwrite existing files") + parser.add_argument("--dry-run", action="store_true", help="Print actions without writing files") + args = parser.parse_args() + + spec_path = Path(args.spec).expanduser().resolve() + if not spec_path.exists(): + print(f"[ERROR] Spec file not found: {spec_path}") + sys.exit(1) + + spec = load_spec(spec_path) + if not isinstance(spec, dict): + print("[ERROR] Spec must be a dictionary") + sys.exit(1) + + required = ["type", "suite", "name"] + for key in required: + if key not in spec: + print(f"[ERROR] Missing required field: {key}") + sys.exit(1) + + cr = spec.get("cr") + crs = spec.get("crs") + if cr is None and crs is None: + print("[ERROR] spec must include cr or crs") + sys.exit(1) + if cr is not None: + if not isinstance(cr, dict): + print("[ERROR] cr must be an object") + sys.exit(1) + if "path" not in cr: + print("[ERROR] cr.path is required") + sys.exit(1) + if crs is not None and not isinstance(crs, list): + print("[ERROR] crs must be a list") + sys.exit(1) + + repo_root = Path(__file__).resolve().parent.parent + test_type = str(spec["type"]).lower() + + if test_type == "kuttl": + generate_kuttl(spec, repo_root, args.force, args.dry_run) + elif test_type in ("integration", "ginkgo"): + generate_integration(spec, repo_root, args.force, args.dry_run) + else: + print(f"[ERROR] Unknown test type: {test_type}") + print("Use 'kuttl' or 'integration'") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/verify_bundle.sh b/scripts/verify_bundle.sh new file mode 100755 index 000000000..ea70f29d2 --- /dev/null +++ 
b/scripts/verify_bundle.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi + +cd "${repo_root}" + +if ! command -v git >/dev/null 2>&1; then + echo "git is required for verify_bundle.sh" >&2 + exit 1 +fi + +if ! command -v operator-sdk >/dev/null 2>&1; then + echo "operator-sdk is required for bundle generation." >&2 + echo "Install it via: make setup/devsetup" >&2 + exit 1 +fi + +printf "Running bundle generation...\n" +make bundle + +changed=$(git diff --name-only -- bundle/manifests helm-chart/splunk-operator/crds) +if [[ -n "${changed}" ]]; then + echo "Bundle or Helm CRD outputs changed after regeneration:" >&2 + echo "${changed}" >&2 + echo "Please commit regenerated files." >&2 + exit 1 +fi + +echo "Bundle outputs are up to date." diff --git a/scripts/verify_crd.sh b/scripts/verify_crd.sh new file mode 100755 index 000000000..27c1d0e76 --- /dev/null +++ b/scripts/verify_crd.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi + +cd "${repo_root}" + +if ! command -v git >/dev/null 2>&1; then + echo "git is required for verify_crd.sh" >&2 + exit 1 +fi + +printf "Running CRD/RBAC generation...\n" +make generate +make manifests + +changed=$(git diff --name-only -- config/crd/bases config/rbac/role.yaml) +if [[ -n "${changed}" ]]; then + echo "CRD/RBAC outputs changed after regeneration:" >&2 + echo "${changed}" >&2 + echo "Please commit regenerated files." >&2 + exit 1 +fi + +echo "CRD/RBAC outputs are up to date." 
diff --git a/scripts/verify_repo.sh b/scripts/verify_repo.sh new file mode 100755 index 000000000..10ede0013 --- /dev/null +++ b/scripts/verify_repo.sh @@ -0,0 +1,121 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'USAGE' +Usage: verify_repo.sh [options] + +Options: + --bundle Verify bundle/helm outputs (runs scripts/verify_bundle.sh) + --tests Run unit tests (make test) + --fmt Check gofmt formatting (no changes) + --vet Run go vet ./... + --no-fmt Skip gofmt check + --no-vet Skip go vet + --fast Only verify CRD/RBAC outputs (skip fmt/vet/tests/bundle) + --all Run bundle, tests, fmt, and vet + -h, --help Show this help + +Default behavior runs CRD/RBAC verification plus fmt and vet. +USAGE +} + +bundle=false +tests=false +fmt=true +vet=true + +while [[ $# -gt 0 ]]; do + case "$1" in + --bundle) + bundle=true + shift + ;; + --tests) + tests=true + shift + ;; + --fmt) + fmt=true + shift + ;; + --vet) + vet=true + shift + ;; + --no-fmt) + fmt=false + shift + ;; + --no-vet) + vet=false + shift + ;; + --fast) + bundle=false + tests=false + fmt=false + vet=false + shift + ;; + --all) + bundle=true + tests=true + fmt=true + vet=true + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + esac +done + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(git -C "${script_dir}" rev-parse --show-toplevel 2>/dev/null || true)" +if [[ -z "${repo_root}" ]]; then + echo "Unable to locate repo root. Run from inside the git repo." >&2 + exit 1 +fi + +cd "${repo_root}" + +printf "Running CRD/RBAC verification...\n" +./scripts/verify_crd.sh + +if [[ "${bundle}" == "true" ]]; then + printf "Running bundle verification...\n" + ./scripts/verify_bundle.sh +fi + +if [[ "${fmt}" == "true" ]]; then + printf "Checking gofmt formatting...\n" + if ! command -v gofmt >/dev/null 2>&1; then + echo "gofmt not found in PATH." 
>&2 + exit 1 + fi + unformatted=$(gofmt -l $(git ls-files '*.go')) + if [[ -n "${unformatted}" ]]; then + echo "gofmt needed for the following files:" >&2 + echo "${unformatted}" >&2 + exit 1 + fi +fi + +if [[ "${vet}" == "true" ]]; then + printf "Running go vet...\n" + go vet ./... +fi + +if [[ "${tests}" == "true" ]]; then + printf "Running unit tests...\n" + make test +fi + +echo "verify_repo.sh completed successfully." diff --git a/templates/pull_request.md b/templates/pull_request.md new file mode 100644 index 000000000..0fed2e000 --- /dev/null +++ b/templates/pull_request.md @@ -0,0 +1,18 @@ +## Summary +- + +## Changes +- + +## Tests +- [ ] `scripts/dev/pr_check.sh` +- [ ] `scripts/dev/unit.sh` +- [ ] Other: + +## Risks / Rollback +- + +## Checklist +- [ ] CRD/RBAC artifacts updated (if applicable) +- [ ] Docs/examples updated (if user-facing change) +- [ ] Backward compatibility considered diff --git a/test/AGENTS.md b/test/AGENTS.md new file mode 100644 index 000000000..aad55538b --- /dev/null +++ b/test/AGENTS.md @@ -0,0 +1,23 @@ +# test/ — Integration Tests (Ginkgo) + +## What Lives Here +- Ginkgo integration suites (`test/<suite>/...`) +- `test/testenv` helpers for deployment and verification + +## Invariants +- Tests should clean up resources (unless DEBUG_RUN / failure). +- Use `testenv` helpers for common readiness checks. +- Keep suites scoped and name tests with searchable tags. + +## Common Pitfalls +- Skipping teardown without `DEBUG_RUN`. +- Mixing cluster-wide and namespace-scoped assumptions. +- Assuming license manager exists without a license file/configmap. + +## Commands +- Run integration tests: `make int-test` +- Run unit/envtest suite: `make test` +- Generate scaffolds: `python3 scripts/generate_testcase.py --spec docs/agent/TESTCASE_SPEC.yaml` + +## Notes +KUTTL tests live under `kuttl/` and are not executed by `make test`. 
diff --git a/test/testenv/testcaseenv.go b/test/testenv/testcaseenv.go index cb3c8a107..8314af866 100644 --- a/test/testenv/testcaseenv.go +++ b/test/testenv/testcaseenv.go @@ -118,6 +118,58 @@ func (testenv *TestCaseEnv) GetSplunkImage() string { return testenv.splunkImage } +// GetOperatorName returns operator deployment name for this test env +func (testenv *TestCaseEnv) GetOperatorName() string { + return testenv.operatorName +} + +// UpdateOperatorImage updates the operator deployment image and waits for rollout +func (testenv *TestCaseEnv) UpdateOperatorImage(image string) error { + operatorNamespace := testenv.namespace + if testenv.clusterWideOperator == "true" { + operatorNamespace = "splunk-operator" + } + namespacedName := client.ObjectKey{Name: testenv.operatorName, Namespace: operatorNamespace} + operator := &appsv1.Deployment{} + err := testenv.GetKubeClient().Get(context.TODO(), namespacedName, operator) + if err != nil { + testenv.Log.Error(err, "Unable to get operator", "operator name", testenv.operatorName, "namespace", operatorNamespace) + return err + } + + containerIndex := 0 + for i, container := range operator.Spec.Template.Spec.Containers { + if container.Name == "manager" { + containerIndex = i + break + } + } + operator.Spec.Template.Spec.Containers[containerIndex].Image = image + + err = testenv.GetKubeClient().Update(context.TODO(), operator) + if err != nil { + testenv.Log.Error(err, "Unable to update operator image", "operator name", testenv.operatorName, "namespace", operatorNamespace) + return err + } + + operatorInstallTimeout := 5 * time.Minute + return wait.PollImmediate(PollInterval, operatorInstallTimeout, func() (bool, error) { + deployment := &appsv1.Deployment{} + err := testenv.GetKubeClient().Get(context.TODO(), namespacedName, deployment) + if err != nil { + testenv.Log.Error(err, "operator not found waiting", "operator name", testenv.operatorName, "namespace", operatorNamespace) + return false, nil + } + if 
deployment.Status.UpdatedReplicas < deployment.Status.Replicas { + return false, nil + } + if deployment.Status.ReadyReplicas < deployment.Status.Replicas { + return false, nil + } + return true, nil + }) +} + // IsOperatorInstalledClusterWide returns if operator is installed clusterwide func (testenv *TestCaseEnv) IsOperatorInstalledClusterWide() string { return testenv.clusterWideOperator diff --git a/test/testenv/verificationutils.go b/test/testenv/verificationutils.go index cb611254d..0f3a719d9 100644 --- a/test/testenv/verificationutils.go +++ b/test/testenv/verificationutils.go @@ -20,23 +20,38 @@ import ( "context" "encoding/json" "fmt" - "math/rand" "os/exec" - "sigs.k8s.io/controller-runtime/pkg/client" "strings" "time" gomega "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" enterpriseApiV3 "github.com/splunk/splunk-operator/api/v3" enterpriseApi "github.com/splunk/splunk-operator/api/v4" splcommon "github.com/splunk/splunk-operator/pkg/splunk/common" + splenterprise "github.com/splunk/splunk-operator/pkg/splunk/enterprise" + crclient "sigs.k8s.io/controller-runtime/pkg/client" logf "sigs.k8s.io/controller-runtime/pkg/log" ) var StabilizationDuration = time.Second * 20 +const ( + telemetryConfigMapPrefix = "splunk-operator-" + telemetryLabelKey = "name" + telemetryLabelValue = "splunk-operator" + telemetryStatusKey = "status" +) + +type telemetryStatus struct { + LastTransmission string `json:"lastTransmission,omitempty"` + Test string `json:"test,omitempty"` + SokVersion string `json:"sokVersion,omitempty"` +} + // PodDetailsStruct captures output of kubectl get pods podname -o json type PodDetailsStruct struct { Spec struct { @@ -58,6 +73,7 @@ type PodDetailsStruct struct { Status struct { ContainerStatuses []struct { + Name string `json:"name"` ContainerID string `json:"containerID"` Image string `json:"image"` ImageID string `json:"imageID"` @@ -187,31 
+203,26 @@ func SingleSiteIndexersReady(ctx context.Context, deployment *Deployment, testen }, ConsistentDuration, ConsistentPollInterval).Should(gomega.Equal(enterpriseApi.PhaseReady)) } -// IngestorsReady verify ingestors go to ready state +// IngestorReady verify ingestor cluster is in Ready status and does not flip-flop func IngestorReady(ctx context.Context, deployment *Deployment, testenvInstance *TestCaseEnv) { - ingest := &enterpriseApi.IngestorCluster{} + ingestor := &enterpriseApi.IngestorCluster{} instanceName := fmt.Sprintf("%s-ingest", deployment.GetName()) - gomega.Eventually(func() enterpriseApi.Phase { - err := deployment.GetInstance(ctx, instanceName, ingest) + err := deployment.GetInstance(ctx, instanceName, ingestor) if err != nil { return enterpriseApi.PhaseError } - - testenvInstance.Log.Info("Waiting for ingestor instance's phase to be ready", "instance", instanceName, "phase", ingest.Status.Phase) + testenvInstance.Log.Info("Waiting for ingestor cluster phase to be ready", "instance", instanceName, "Phase", ingestor.Status.Phase) DumpGetPods(testenvInstance.GetName()) - - return ingest.Status.Phase + return ingestor.Status.Phase }, deployment.GetTimeout(), PollInterval).Should(gomega.Equal(enterpriseApi.PhaseReady)) // In a steady state, we should stay in Ready and not flip-flop around gomega.Consistently(func() enterpriseApi.Phase { - _ = deployment.GetInstance(ctx, instanceName, ingest) - - testenvInstance.Log.Info("Check for Consistency ingestor instance's phase to be ready", "instance", instanceName, "phase", ingest.Status.Phase) - DumpGetSplunkVersion(ctx, testenvInstance.GetName(), deployment, "-ingest-") - - return ingest.Status.Phase + _ = deployment.GetInstance(ctx, instanceName, ingestor) + testenvInstance.Log.Info("Check for Consistency ingestor cluster phase to be ready", "instance", instanceName, "Phase", ingestor.Status.Phase) + DumpGetSplunkVersion(ctx, testenvInstance.GetName(), deployment, "-ingestor-") + return 
ingestor.Status.Phase }, ConsistentDuration, ConsistentPollInterval).Should(gomega.Equal(enterpriseApi.PhaseReady)) } @@ -241,6 +252,11 @@ func ClusterManagerReady(ctx context.Context, deployment *Deployment, testenvIns }, ConsistentDuration, ConsistentPollInterval).Should(gomega.Equal(enterpriseApi.PhaseReady)) } +// LegacyClusterManagerReady wraps the legacy v3 control-plane readiness check +func LegacyClusterManagerReady(ctx context.Context, deployment *Deployment, testenvInstance *TestCaseEnv) { + ClusterMasterReady(ctx, deployment, testenvInstance) +} + // ClusterMasterReady verify Cluster Master Instance is in ready status func ClusterMasterReady(ctx context.Context, deployment *Deployment, testenvInstance *TestCaseEnv) { // Ensure that the cluster-master goes to Ready phase @@ -340,6 +356,109 @@ func VerifyRFSFMet(ctx context.Context, deployment *Deployment, testenvInstance }, deployment.GetTimeout(), PollInterval).Should(gomega.Equal(true)) } +func getTelemetryConfigMap(ctx context.Context, deployment *Deployment) (*corev1.ConfigMap, string, error) { + operatorNamespace := deployment.testenv.GetName() + if deployment.testenv.IsOperatorInstalledClusterWide() == "true" { + operatorNamespace = "splunk-operator" + } + cmName := splenterprise.GetTelemetryConfigMapName(telemetryConfigMapPrefix) + cm := &corev1.ConfigMap{} + err := deployment.testenv.GetKubeClient().Get(ctx, crclient.ObjectKey{Name: cmName, Namespace: operatorNamespace}, cm) + return cm, operatorNamespace, err +} + +func parseTelemetryStatus(cm *corev1.ConfigMap) (telemetryStatus, error) { + if cm == nil || cm.Data == nil { + return telemetryStatus{}, fmt.Errorf("telemetry configmap is empty") + } + raw, ok := cm.Data[telemetryStatusKey] + if !ok || strings.TrimSpace(raw) == "" { + return telemetryStatus{}, fmt.Errorf("telemetry status not found") + } + var status telemetryStatus + if err := json.Unmarshal([]byte(raw), &status); err != nil { + return telemetryStatus{}, err + } + return status, nil 
+} + +// GetTelemetryLastSubmissionTime returns last telemetry transmission time (UTC) or zero time if unavailable +func GetTelemetryLastSubmissionTime(ctx context.Context, deployment *Deployment) time.Time { + cm, _, err := getTelemetryConfigMap(ctx, deployment) + if err != nil { + deployment.testenv.Log.Info("Unable to get telemetry configmap", "error", err) + return time.Time{} + } + status, err := parseTelemetryStatus(cm) + if err != nil { + deployment.testenv.Log.Info("Unable to parse telemetry status", "error", err) + return time.Time{} + } + if status.LastTransmission == "" { + return time.Time{} + } + ts, err := time.Parse(time.RFC3339, status.LastTransmission) + if err != nil { + deployment.testenv.Log.Info("Unable to parse telemetry timestamp", "value", status.LastTransmission, "error", err) + return time.Time{} + } + return ts +} + +// TriggerTelemetrySubmission updates telemetry configmap to request a new submission +func TriggerTelemetrySubmission(ctx context.Context, deployment *Deployment) { + cmName := splenterprise.GetTelemetryConfigMapName(telemetryConfigMapPrefix) + cm, operatorNamespace, err := getTelemetryConfigMap(ctx, deployment) + create := false + if err != nil { + if k8serrors.IsNotFound(err) { + create = true + cm = &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: cmName, + Namespace: operatorNamespace, + }, + } + } else { + gomega.Expect(err).ToNot(gomega.HaveOccurred(), "Unable to get telemetry configmap") + } + } + + status, _ := parseTelemetryStatus(cm) + status.Test = "true" + status.LastTransmission = "" + statusBytes, err := json.MarshalIndent(status, "", " ") + gomega.Expect(err).ToNot(gomega.HaveOccurred(), "Unable to marshal telemetry status") + + if cm.Data == nil { + cm.Data = map[string]string{} + } + cm.Data[telemetryStatusKey] = string(statusBytes) + + if cm.Labels == nil { + cm.Labels = map[string]string{} + } + cm.Labels[telemetryLabelKey] = telemetryLabelValue + + if create { + err = 
deployment.testenv.GetKubeClient().Create(ctx, cm) + } else { + err = deployment.testenv.GetKubeClient().Update(ctx, cm) + } + gomega.Expect(err).ToNot(gomega.HaveOccurred(), "Unable to update telemetry configmap") +} + +// VerifyTelemetry ensures telemetry lastTransmission is updated after triggering submission +func VerifyTelemetry(ctx context.Context, deployment *Deployment, prev time.Time) { + gomega.Eventually(func() bool { + current := GetTelemetryLastSubmissionTime(ctx, deployment) + if prev.IsZero() { + return !current.IsZero() + } + return current.After(prev) + }, deployment.GetTimeout(), PollInterval).Should(gomega.Equal(true)) +} + // VerifyNoDisconnectedSHPresentOnCM is present on cluster manager func VerifyNoDisconnectedSHPresentOnCM(ctx context.Context, deployment *Deployment, testenvInstance *TestCaseEnv) { gomega.Consistently(func() bool { @@ -382,6 +501,11 @@ func LicenseManagerReady(ctx context.Context, deployment *Deployment, testenvIns }, ConsistentDuration, ConsistentPollInterval).Should(gomega.Equal(enterpriseApi.PhaseReady)) } +// LegacyLicenseManagerReady wraps the legacy v3 license control-plane readiness check +func LegacyLicenseManagerReady(ctx context.Context, deployment *Deployment, testenvInstance *TestCaseEnv) { + LicenseMasterReady(ctx, deployment, testenvInstance) +} + // LicenseMasterReady verify LM is in ready status and does not flip flop func LicenseMasterReady(ctx context.Context, deployment *Deployment, testenvInstance *TestCaseEnv) { LicenseMaster := &enterpriseApiV3.LicenseMaster{} @@ -1027,11 +1151,7 @@ func VerifyAppListPhase(ctx context.Context, deployment *Deployment, testenvInst appDeploymentInfo, err := GetAppDeploymentInfo(ctx, deployment, testenvInstance, name, crKind, appSourceName, appName) if err != nil { testenvInstance.Log.Error(err, "Failed to get app deployment info") - return phase // Continue polling - } - if appDeploymentInfo.AppName == "" { - testenvInstance.Log.Info(fmt.Sprintf("App deployment info not 
found yet for app %s (CR %s/%s, AppSource %s), continuing to poll", appName, crKind, name, appSourceName)) - return phase // Continue polling + return phase } testenvInstance.Log.Info(fmt.Sprintf("App State found for CR %s NAME %s APP NAME %s Expected Phase should not be %s", crKind, name, appName, phase), "Actual Phase", appDeploymentInfo.PhaseInfo.Phase, "App State", appDeploymentInfo) return appDeploymentInfo.PhaseInfo.Phase @@ -1044,11 +1164,7 @@ func VerifyAppListPhase(ctx context.Context, deployment *Deployment, testenvInst appDeploymentInfo, err := GetAppDeploymentInfo(ctx, deployment, testenvInstance, name, crKind, appSourceName, appName) if err != nil { testenvInstance.Log.Error(err, "Failed to get app deployment info") - return enterpriseApi.PhaseDownload // Continue polling - } - if appDeploymentInfo.AppName == "" { - testenvInstance.Log.Info(fmt.Sprintf("App deployment info not found yet for app %s (CR %s/%s, AppSource %s), continuing to poll", appName, crKind, name, appSourceName)) - return enterpriseApi.PhaseDownload // Continue polling + return enterpriseApi.PhaseDownload } testenvInstance.Log.Info(fmt.Sprintf("App State found for CR %s NAME %s APP NAME %s Expected Phase %s", crKind, name, appName, phase), "Actual Phase", appDeploymentInfo.PhaseInfo.Phase, "App Phase Status", appDeploymentInfo.PhaseInfo.Status, "App State", appDeploymentInfo) if appDeploymentInfo.PhaseInfo.Status != enterpriseApi.AppPkgInstallComplete { @@ -1252,83 +1368,102 @@ func VerifyFilesInDirectoryOnPod(ctx context.Context, deployment *Deployment, te } } -func GetTelemetryLastSubmissionTime(ctx context.Context, deployment *Deployment) string { - const ( - configMapName = "splunk-operator-manager-telemetry" - statusKey = "status" - ) - type telemetryStatus struct { - LastTransmission string `json:"lastTransmission"` - } - - cm := &corev1.ConfigMap{} - err := deployment.testenv.GetKubeClient().Get(ctx, client.ObjectKey{Name: configMapName, Namespace: "splunk-operator"}, cm) - if 
err != nil { - logf.Log.Error(err, "GetTelemetryLastSubmissionTime: failed to retrieve configmap") - return "" - } - - statusVal, ok := cm.Data[statusKey] - if !ok || statusVal == "" { - logf.Log.Info("GetTelemetryLastSubmissionTime: failed to retrieve status") - return "" +// VerifyOperatorImage verifies the operator pod is running the expected image +func VerifyOperatorImage(ctx context.Context, testenvInstance *TestCaseEnv, expectedImage string) { + _ = ctx // reserved for future use + var ns string + if testenvInstance.clusterWideOperator != "true" { + ns = testenvInstance.GetName() + } else { + ns = "splunk-operator" } - logf.Log.Info("GetTelemetryLastSubmissionTime: retrieved status", "status", statusVal) + timeout := time.Duration(SpecifiedTestTimeout) * time.Second + gomega.Eventually(func() bool { + operatorPod := GetOperatorPodName(testenvInstance) + if operatorPod == "" { + logf.Log.Info("Operator pod not found yet", "namespace", ns) + return false + } + return podImageContains(ns, operatorPod, expectedImage) + }, timeout, PollInterval).Should(gomega.Equal(true)) +} - var status telemetryStatus - if err := json.Unmarshal([]byte(statusVal), &status); err != nil { - logf.Log.Error(err, "GetTelemetryLastSubmissionTime: failed to unmarshal status", "statusVal", statusVal) - return "" - } - return status.LastTransmission +// VerifyPodImageContains verifies the pod is running a container image that contains expectedImage +func VerifyPodImageContains(ns string, podName string, expectedImage string) { + timeout := time.Duration(SpecifiedTestTimeout) * time.Second + gomega.Eventually(func() bool { + return podImageContains(ns, podName, expectedImage) + }, timeout, PollInterval).Should(gomega.Equal(true)) } -// VerifyTelemetry checks that the telemetry ConfigMap has a non-empty lastTransmission field in its status key. 
-func VerifyTelemetry(ctx context.Context, deployment *Deployment, prevVal string) { - logf.Log.Info("VerifyTelemetry: start") +// VerifySplunkPodImagesContain verifies all Splunk pods (excluding operator) are running expected image +func VerifySplunkPodImagesContain(ns string, expectedImage string) { + timeout := time.Duration(SpecifiedTestTimeout) * time.Second gomega.Eventually(func() bool { - currentVal := GetTelemetryLastSubmissionTime(ctx, deployment) - if currentVal != "" && currentVal != prevVal { - logf.Log.Info("VerifyTelemetry: success", "previous", prevVal, "current", currentVal) - return true + pods := DumpGetPods(ns) + checked := 0 + for _, pod := range pods { + if !isSplunkWorkloadPod(pod) { + continue + } + checked++ + if !podImageContains(ns, pod, expectedImage) { + return false + } } - return false - }, deployment.GetTimeout(), PollInterval).Should(gomega.Equal(true)) + if checked == 0 { + logf.Log.Info("No Splunk pods found yet", "namespace", ns) + return false + } + return true + }, timeout, PollInterval).Should(gomega.Equal(true)) } -// TriggerTelemetrySubmission updates or adds the 'test_submission' key in the telemetry ConfigMap with a JSON value containing a random number. 
-func TriggerTelemetrySubmission(ctx context.Context, deployment *Deployment) { - const ( - configMapName = "splunk-operator-manager-telemetry" - testKey = "test_submission" - ) - - // Generate a random number - rand.Seed(time.Now().UnixNano()) - randomNumber := rand.Intn(1000) - - // Create the JSON value - jsonValue, err := json.Marshal(map[string]int{"value": randomNumber}) - if err != nil { - logf.Log.Error(err, "Failed to marshal JSON value") - return +// podImageContains checks if any container image or imageID on the pod contains expectedImage +func podImageContains(ns string, podName string, expectedImage string) bool { + if podName == "" { + logf.Log.Info("Pod name is empty; cannot verify image", "namespace", ns) + return false } - - // Update the ConfigMap - cm := &corev1.ConfigMap{} - err = deployment.testenv.GetKubeClient().Get(ctx, client.ObjectKey{Name: configMapName, Namespace: "splunk-operator"}, cm) + output, err := exec.Command("kubectl", "get", "pods", "-n", ns, podName, "-o", "json").Output() if err != nil { - logf.Log.Error(err, "Failed to get ConfigMap") - return + cmd := fmt.Sprintf("kubectl get pods -n %s %s -o json", ns, podName) + logf.Log.Error(err, "Failed to execute command", "command", cmd) + return false } - - // Update the test_submission key - cm.Data[testKey] = string(jsonValue) - err = deployment.testenv.GetKubeClient().Update(ctx, cm) + restResponse := PodDetailsStruct{} + err = json.Unmarshal([]byte(output), &restResponse) if err != nil { - logf.Log.Error(err, "Failed to update ConfigMap") - return + logf.Log.Error(err, "Failed to parse pod JSON") + return false + } + found := false + images := []string{} + for _, status := range restResponse.Status.ContainerStatuses { + if status.Image != "" { + images = append(images, status.Image) + } + if status.ImageID != "" { + images = append(images, status.ImageID) + } + if strings.Contains(status.Image, expectedImage) || strings.Contains(status.ImageID, expectedImage) { + found = true + 
} } + logf.Log.Info("Pod image check", "pod", podName, "expected", expectedImage, "found", found, "images", images) + return found +} - logf.Log.Info("Successfully updated telemetry ConfigMap", "key", testKey, "value", jsonValue) +func isOperatorPod(podName string) bool { + return strings.HasPrefix(podName, "splunk-op") || strings.HasPrefix(podName, "splunk-operator") +} + +func isSplunkWorkloadPod(podName string) bool { + if podName == "" { + return false + } + if isOperatorPod(podName) { + return false + } + return strings.HasPrefix(podName, "splunk-") }