From 3ac1f9fa3910e47685cbf51233deaeda0d9b4a8f Mon Sep 17 00:00:00 2001 From: Stephen Benjamin Date: Wed, 29 Apr 2026 06:35:52 -0400 Subject: [PATCH 1/8] Add PostgreSQL data provider for Component Readiness and seed data Adds a PostgreSQL-based DataProvider implementation for Component Readiness, enabling local development and testing without BigQuery. The seed-data command creates deterministic test data covering all CR statuses (regression, improvement, missing, fallback, etc.) and syncs regressions. The e2e script uses seed data with the postgres provider, removing the BigQuery credential requirement. Co-Authored-By: Claude Opus 4.6 --- cmd/sippy/seed_data.go | 1035 ++++++++++++----- cmd/sippy/serve.go | 12 +- config/e2e-views.yaml | 150 ++- .../dataprovider/postgres/provider.go | 791 +++++++++++++ scripts/e2e.sh | 88 +- 5 files changed, 1662 insertions(+), 414 deletions(-) create mode 100644 pkg/api/componentreadiness/dataprovider/postgres/provider.go diff --git a/cmd/sippy/seed_data.go b/cmd/sippy/seed_data.go index e6829df337..4526398971 100644 --- a/cmd/sippy/seed_data.go +++ b/cmd/sippy/seed_data.go @@ -1,61 +1,50 @@ package main import ( + "context" + "database/sql" "fmt" - "math/rand" + "os" + "sort" + "strings" "time" + "github.com/lib/pq" "github.com/pkg/errors" log "github.com/sirupsen/logrus" "github.com/spf13/cobra" "github.com/spf13/pflag" - + "gopkg.in/yaml.v3" + + componentreadiness "github.com/openshift/sippy/pkg/api/componentreadiness" + pgprovider "github.com/openshift/sippy/pkg/api/componentreadiness/dataprovider/postgres" + "github.com/openshift/sippy/pkg/api/componentreadiness/utils" + apitype "github.com/openshift/sippy/pkg/apis/api" + "github.com/openshift/sippy/pkg/apis/api/componentreport/crview" + "github.com/openshift/sippy/pkg/apis/api/componentreport/reqopts" v1 "github.com/openshift/sippy/pkg/apis/sippyprocessing/v1" "github.com/openshift/sippy/pkg/db" "github.com/openshift/sippy/pkg/db/models" "github.com/openshift/sippy/pkg/db/models/jobrunscan" "github.com/openshift/sippy/pkg/flags" "github.com/openshift/sippy/pkg/sippyserver" + "github.com/openshift/sippy/pkg/util/sets" ) type SeedDataFlags struct { - DBFlags *flags.PostgresFlags - CacheFlags *flags.CacheFlags - InitDatabase bool - Releases []string - JobsPerRelease int - TestNames []string - RunsPerJob int + DBFlags *flags.PostgresFlags + InitDatabase bool } func NewSeedDataFlags() *SeedDataFlags { return &SeedDataFlags{ - DBFlags: flags.NewPostgresDatabaseFlags(), - CacheFlags: flags.NewCacheFlags(), - Releases: []string{"5.0", "4.22", "4.21"}, // Default releases - JobsPerRelease: 3, // Default jobs per release - TestNames: []string{ - "install should succeed: infrastructure", - "install should succeed: overall", - "install should succeed: configuration", - "install should succeed: cluster bootstrap", - "install should succeed: other", - "[sig-cluster-lifecycle] Cluster completes upgrade", - "[sig-sippy] upgrade should work", - "[sig-sippy] openshift-tests should work", - }, - RunsPerJob: 20, // Default runs per job + DBFlags: flags.NewPostgresDatabaseFlags(), } } func (f *SeedDataFlags) BindFlags(fs *pflag.FlagSet) { f.DBFlags.BindFlags(fs) - f.CacheFlags.BindFlags(fs) fs.BoolVar(&f.InitDatabase, "init-database", false, "Initialize the DB schema before seeding data") - fs.StringSliceVar(&f.Releases, "release", f.Releases, "Releases to create ProwJobs for (can be specified multiple times)") - fs.IntVar(&f.JobsPerRelease, "jobs", f.JobsPerRelease, "Number of ProwJobs to create for each release") - 
fs.StringSliceVar(&f.TestNames, "test", f.TestNames, "Test names to create (can be specified multiple times)") - fs.IntVar(&f.RunsPerJob, "runs", f.RunsPerJob, "Number of ProwJobRuns to create for each ProwJob") } func NewSeedDataCommand() *cobra.Command { @@ -65,16 +54,19 @@ func NewSeedDataCommand() *cobra.Command { Use: "seed-data", Short: "Populate test data in the database", Long: `Populate test data in the database for development purposes. -This command creates sample ProwJob and Test records with realistic test data -that can be used for local development and testing. -Test results are randomized with 85% pass rate, 10% flake rate, and 5% failure rate. -All counts, releases, and test names are configurable via command-line flags. +Creates deterministic Component Readiness data covering all CR statuses +(NotSignificant, SignificantRegression, ExtremeRegression, MissingSample, +MissingBasis, BasisOnly, SignificantImprovement, BelowMinFailure) and +fallback scenarios. Use with 'sippy serve --data-provider postgres'. -The command can be re-run as needed to add more runs, or because your old job runs -rolled off the 1 week window. +Drop and recreate the database to re-seed (e.g. docker compose down -v). `, RunE: func(cmd *cobra.Command, args []string) error { + if strings.Contains(f.DBFlags.DSN, "amazonaws.com") { + return fmt.Errorf("refusing to seed synthetic data into a production database") + } + dbc, err := f.DBFlags.GetDBClient() if err != nil { return errors.WithMessage(err, "could not connect to database") @@ -89,255 +81,775 @@ rolled off the 1 week window. log.Info("Database schema initialized successfully") } - cacheClient, cacheErr := f.CacheFlags.GetCacheClient() - if cacheErr != nil { - return fmt.Errorf("failed to get cache client: %v", cacheErr) - } else if cacheClient == nil { - log.Warn("no cache provided; refresh timestamps will not be cached") - } - log.Info("Starting to seed test data...") + return seedSyntheticData(dbc) + }, + } - // Create the test suite - if err := createTestSuite(dbc); err != nil { - return errors.WithMessage(err, "failed to create test suite") - } - log.Info("Created test suite 'ourtests'") + f.BindFlags(cmd.Flags()) - // Create ProwJobs for each release - for _, release := range f.Releases { - if err := createProwJobsForRelease(dbc, release, f.JobsPerRelease); err != nil { - return errors.WithMessagef(err, "failed to create ProwJobs for release %s", release) - } - log.Infof("Processed %d ProwJobs for release %s", f.JobsPerRelease, release) - } + return cmd +} - // Create Test models - if err := createTestModels(dbc, f.TestNames); err != nil { - return errors.WithMessage(err, "failed to create Test models") - } - log.Infof("Processed %d Test models", len(f.TestNames)) +// --- Synthetic data seeding --- - // Create labels and symptoms - if err := createLabelsAndSymptoms(dbc); err != nil { - return errors.WithMessage(err, "failed to create labels and symptoms") - } - log.Info("Created sample labels and symptoms") +// syntheticJobDef defines a job with its full 9-key variant map. +type syntheticJobDef struct { + nameTemplate string + variants map[string]string +} - // Create ProwJobRuns for each ProwJob - if err := createProwJobRuns(dbc, f.RunsPerJob); err != nil { - return errors.WithMessage(err, "failed to create ProwJobRuns") - } - log.Info("Created ProwJobRuns and test results for all ProwJobs") +// syntheticTestSpec defines a test with deterministic pass/fail counts per release per job. 
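+// For example, test-significant-regression below pairs a 4.21 basis of
+// {200, 190, 0} (95% pass) with a 4.22 sample of {200, 170, 0} (85% pass) on
+// the aws/amd64 job; that drop is what is meant to surface as SignificantRegression.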
+type syntheticTestSpec struct { + testID string + testName string + component string + capabilities []string + // Each entry maps a job name template -> per-release counts. + // The job template determines which variants the test runs with. + jobCounts map[string]map[string]testCount // jobTemplate -> release -> counts +} - // Apply labels to job runs - if err := applyLabelsToJobRuns(dbc); err != nil { - return errors.WithMessage(err, "failed to apply labels to job runs") - } - log.Info("Applied labels to ~25% of job runs") +type testCount struct { + total int + success int + flake int +} + +var syntheticReleases = []string{"4.22", "4.21", "4.20", "4.19"} + +var syntheticJobs = []syntheticJobDef{ + { + nameTemplate: "periodic-ci-openshift-release-master-ci-%s-upgrade-from-stable-4.21-e2e-aws-ovn-upgrade", + variants: map[string]string{ + "Platform": "aws", "Architecture": "amd64", "Network": "ovn", + "Topology": "ha", "Installer": "ipi", "FeatureSet": "default", + "Suite": "unknown", "Upgrade": "minor", "LayeredProduct": "none", + }, + }, + { + nameTemplate: "periodic-ci-openshift-release-master-ci-%s-e2e-aws-ovn-amd64", + variants: map[string]string{ + "Platform": "aws", "Architecture": "amd64", "Network": "ovn", + "Topology": "ha", "Installer": "ipi", "FeatureSet": "default", + "Suite": "parallel", "Upgrade": "none", "LayeredProduct": "none", + }, + }, + { + nameTemplate: "periodic-ci-openshift-release-master-ci-%s-e2e-aws-ovn-arm64", + variants: map[string]string{ + "Platform": "aws", "Architecture": "arm64", "Network": "ovn", + "Topology": "ha", "Installer": "ipi", "FeatureSet": "default", + "Suite": "parallel", "Upgrade": "none", "LayeredProduct": "none", + }, + }, + { + nameTemplate: "periodic-ci-openshift-release-master-ci-%s-e2e-aws-ovn-techpreview-serial", + variants: map[string]string{ + "Platform": "aws", "Architecture": "amd64", "Network": "ovn", + "Topology": "ha", "Installer": "ipi", "FeatureSet": "techpreview", + "Suite": "serial", "Upgrade": "none", "LayeredProduct": "none", + }, + }, + { + nameTemplate: "periodic-ci-openshift-release-master-ci-%s-e2e-gcp-ovn-amd64", + variants: map[string]string{ + "Platform": "gcp", "Architecture": "amd64", "Network": "ovn", + "Topology": "ha", "Installer": "ipi", "FeatureSet": "default", + "Suite": "parallel", "Upgrade": "none", "LayeredProduct": "none", + }, + }, + { + nameTemplate: "periodic-ci-openshift-release-master-ci-%s-e2e-gcp-ovn-upgrade-micro", + variants: map[string]string{ + "Platform": "gcp", "Architecture": "amd64", "Network": "ovn", + "Topology": "ha", "Installer": "ipi", "FeatureSet": "default", + "Suite": "unknown", "Upgrade": "micro", "LayeredProduct": "none", + }, + }, +} - totalProwJobs := len(f.Releases) * f.JobsPerRelease - totalRuns := totalProwJobs * f.RunsPerJob - totalTestResults := totalRuns * len(f.TestNames) +// Job template constants for referencing specific jobs in test specs. +const awsAmd64Parallel = "periodic-ci-openshift-release-master-ci-%s-e2e-aws-ovn-amd64" +const awsArm64Parallel = "periodic-ci-openshift-release-master-ci-%s-e2e-aws-ovn-arm64" +const gcpAmd64Parallel = "periodic-ci-openshift-release-master-ci-%s-e2e-gcp-ovn-amd64" + +// allJobTemplates returns name templates from syntheticJobs for use in test specs +// that should run on every job (e.g. install tests). 
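+// The install and [sig-sippy] tests below pair this with allJobCounts so the
+// same per-release counts apply to all six synthetic jobs.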
+func allJobTemplates() []string { + templates := make([]string, len(syntheticJobs)) + for i, j := range syntheticJobs { + templates[i] = j.nameTemplate + } + return templates +} - log.Info("Refreshing materialized views...") - sippyserver.RefreshData(dbc, cacheClient, false) +// allJobCounts builds a jobCounts map that assigns the given per-release counts +// to every synthetic job. Used for tests like install indicators that run everywhere. +func allJobCounts(releaseCounts map[string]testCount) map[string]map[string]testCount { + result := make(map[string]map[string]testCount, len(syntheticJobs)) + for _, tpl := range allJobTemplates() { + result[tpl] = releaseCounts + } + return result +} - log.Infof("Successfully seeded test data! Created %d ProwJobs, %d Tests, %d ProwJobRuns, and %d test results", - totalProwJobs, len(f.TestNames), totalRuns, totalTestResults) - return nil +var syntheticTests = []syntheticTestSpec{ + // --- NotSignificant: appears in 3 jobs across 2 platforms --- + { + testID: "test-not-significant", testName: "[sig-arch] Check build pods use all cpu cores", + component: "comp-NotSignificant", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: {"4.21": {100, 95, 0}, "4.22": {100, 93, 0}}, + awsArm64Parallel: {"4.21": {80, 76, 0}, "4.22": {80, 75, 0}}, + gcpAmd64Parallel: {"4.21": {100, 97, 0}, "4.22": {100, 95, 0}}, + }, + }, + + // --- SignificantRegression: regressed on aws/amd64, fine elsewhere --- + { + testID: "test-significant-regression", testName: "[sig-network] Services should serve endpoints on same port and different protocol", + component: "comp-SignificantRegression", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: {"4.21": {200, 190, 0}, "4.22": {200, 170, 0}}, + awsArm64Parallel: {"4.21": {180, 171, 0}, "4.22": {180, 168, 0}}, + gcpAmd64Parallel: {"4.21": {200, 190, 0}, "4.22": {200, 188, 0}}, + }, + }, + + // --- ExtremeRegression: extreme on aws/amd64, significant on others --- + { + testID: "test-extreme-regression", testName: "[sig-etcd] etcd leader changes are not excessive", + component: "comp-ExtremeRegression", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: {"4.21": {200, 190, 0}, "4.22": {200, 140, 0}}, + awsArm64Parallel: {"4.21": {200, 190, 0}, "4.22": {200, 170, 0}}, + gcpAmd64Parallel: {"4.21": {200, 190, 0}, "4.22": {200, 170, 0}}, + }, + }, + + // --- MissingSample: test in base, 0 sample runs --- + { + testID: "test-missing-sample", testName: "[sig-storage] CSI volumes should be mountable", + component: "comp-MissingSample", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: {"4.21": {100, 95, 0}, "4.22": {0, 0, 0}}, + }, + }, + + // --- MissingBasis: test only in sample --- + { + testID: "test-missing-basis", testName: "[sig-node] New pod lifecycle test", + component: "comp-MissingBasis", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: {"4.22": {100, 95, 0}}, + }, + }, + + // --- NewTestPassRateRegression: new test only in sample, below PassRateRequiredNewTests threshold --- + { + testID: "test-new-test-pass-rate-fail", testName: "[sig-node] New flaky pod readiness test", + component: "comp-NewTestPassRate", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: {"4.22": {100, 70, 0}}, }, + }, + + // --- BasisOnly: test in base, absent from sample --- + 
{ + testID: "test-basis-only", testName: "[sig-apps] Removed deployment test", + component: "comp-BasisOnly", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: {"4.21": {100, 95, 0}}, + }, + }, + + // --- SignificantImprovement: 80% -> 95% --- + { + testID: "test-significant-improvement", testName: "[sig-cli] oc adm should handle upgrades gracefully", + component: "comp-SignificantImprovement", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: {"4.21": {200, 160, 0}, "4.22": {200, 190, 0}}, + }, + }, + + // --- BelowMinFailure: only 2 failures, below MinimumFailure=3 --- + { + testID: "test-below-min-failure", testName: "[sig-auth] RBAC should allow access with valid token", + component: "comp-BelowMinFailure", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: {"4.21": {100, 100, 0}, "4.22": {100, 98, 0}}, + }, + }, + + // --- Fallback: 4.21 worse, 4.20 better -> swaps to 4.20 --- + { + testID: "test-fallback-improves", testName: "[sig-instrumentation] Metrics should report accurate cpu usage", + component: "comp-FallbackImproves", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: { + "4.21": {200, 180, 0}, + "4.20": {200, 194, 0}, + "4.22": {200, 160, 0}, + }, + }, + }, + + // --- Double fallback: 4.21->4.20->4.19 --- + { + testID: "test-fallback-double", testName: "[sig-scheduling] Scheduler should spread pods evenly", + component: "comp-FallbackDouble", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: { + "4.21": {200, 180, 0}, + "4.20": {200, 186, 0}, + "4.19": {200, 194, 0}, + "4.22": {200, 160, 0}, + }, + }, + }, + + // --- Fallback insufficient runs: 4.20 has <60% of 4.21 count --- + { + testID: "test-fallback-insufficient-runs", testName: "[sig-network] DNS should resolve cluster services", + component: "comp-FallbackInsufficient", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: { + "4.21": {1000, 940, 0}, + "4.20": {100, 99, 0}, + "4.22": {1000, 850, 0}, + }, + }, + }, + + // --- Install / health indicator tests: run on every job, every release --- + { + testID: "test-install-overall", testName: "install should succeed: overall", + component: "comp-Install", capabilities: []string{"install"}, + jobCounts: allJobCounts(map[string]testCount{ + "4.22": {100, 95, 0}, "4.21": {100, 96, 0}, "4.20": {100, 97, 0}, "4.19": {100, 97, 0}, + }), + }, + { + testID: "test-install-config", testName: "install should succeed: configuration", + component: "comp-Install", capabilities: []string{"install"}, + jobCounts: allJobCounts(map[string]testCount{ + "4.22": {100, 97, 0}, "4.21": {100, 98, 0}, "4.20": {100, 98, 0}, "4.19": {100, 98, 0}, + }), + }, + { + testID: "test-install-bootstrap", testName: "install should succeed: cluster bootstrap", + component: "comp-Install", capabilities: []string{"install"}, + jobCounts: allJobCounts(map[string]testCount{ + "4.22": {100, 96, 0}, "4.21": {100, 97, 0}, "4.20": {100, 97, 0}, "4.19": {100, 97, 0}, + }), + }, + { + testID: "test-install-other", testName: "install should succeed: other", + component: "comp-Install", capabilities: []string{"install"}, + jobCounts: allJobCounts(map[string]testCount{ + "4.22": {100, 98, 0}, "4.21": {100, 99, 0}, "4.20": {100, 99, 0}, "4.19": {100, 99, 0}, + }), + }, + { + testID: "test-install-infra", testName: "install should succeed: 
infrastructure", + component: "comp-Install", capabilities: []string{"install"}, + jobCounts: allJobCounts(map[string]testCount{ + "4.22": {100, 96, 0}, "4.21": {100, 97, 0}, "4.20": {100, 97, 0}, "4.19": {100, 97, 0}, + }), + }, + { + testID: "test-upgrade", testName: "[sig-sippy] upgrade should work", + component: "comp-Install", capabilities: []string{"upgrade"}, + jobCounts: allJobCounts(map[string]testCount{ + "4.22": {100, 94, 0}, "4.21": {100, 95, 0}, "4.20": {100, 96, 0}, "4.19": {100, 96, 0}, + }), + }, + { + testID: "test-openshift-tests", testName: "[sig-sippy] openshift-tests should work", + component: "comp-Install", capabilities: []string{"tests"}, + jobCounts: allJobCounts(map[string]testCount{ + "4.22": {100, 90, 0}, "4.21": {100, 92, 0}, "4.20": {100, 93, 0}, "4.19": {100, 93, 0}, + }), + }, +} + +// releaseTimeWindow returns the start/end times for a release's test data. +func releaseTimeWindow(release string) (start, end time.Time) { + now := time.Now().UTC().Truncate(time.Hour) + switch release { + case "4.22": + return now.Add(-3 * 24 * time.Hour), now + case "4.21": + return now.Add(-60 * 24 * time.Hour), now.Add(-30 * 24 * time.Hour) + case "4.20": + return now.Add(-120 * 24 * time.Hour), now.Add(-90 * 24 * time.Hour) + case "4.19": + return now.Add(-180 * 24 * time.Hour), now.Add(-150 * 24 * time.Hour) + default: + return now.Add(-14 * 24 * time.Hour), now } +} - f.BindFlags(cmd.Flags()) +func seedSyntheticData(dbc *db.DB) error { + // Check if data already exists + var count int64 + if err := dbc.DB.Model(&models.ProwJob{}).Count(&count).Error; err != nil { + return fmt.Errorf("failed to check for existing data: %w", err) + } + if count > 0 { + log.Infof("Database already contains %d ProwJobs, skipping seed. Use --init-database to reset.", count) + return nil + } - return cmd + if err := createTestSuite(dbc, "synthetic"); err != nil { + return errors.WithMessage(err, "failed to create test suite") + } + log.Info("Created test suite 'synthetic'") + + if err := seedProwJobs(dbc); err != nil { + return err + } + + if err := seedTestsAndOwnerships(dbc); err != nil { + return err + } + + totalRuns, totalResults, err := seedJobRunsAndResults(dbc) + if err != nil { + return err + } + + if err := createLabelsAndSymptoms(dbc); err != nil { + return errors.WithMessage(err, "failed to create labels and symptoms") + } + + if err := writeSyntheticViewsFile(); err != nil { + return errors.WithMessage(err, "failed to write views file") + } + + log.Info("Refreshing materialized views...") + sippyserver.RefreshData(dbc, nil, false) + + log.Info("Syncing regressions...") + if err := syncRegressions(dbc); err != nil { + return errors.WithMessage(err, "failed to sync regressions") + } + + log.Infof("Seeded synthetic data: %d ProwJobRuns, %d test results across %d releases", + totalRuns, totalResults, len(syntheticReleases)) + return nil } -func createProwJobsForRelease(dbc *db.DB, release string, jobsPerRelease int) error { - for i := 1; i <= jobsPerRelease; i++ { - // Choose JobTier based on whether i is even or odd - var jobTier = "JobTier:standard" // even number job index = standard - if i%2 != 0 { - jobTier = "JobTier:hidden" // odd = hidden +func seedProwJobs(dbc *db.DB) error { + for _, release := range syntheticReleases { + for _, job := range syntheticJobs { + name := fmt.Sprintf(job.nameTemplate, release) + variants := variantMapToArray(job.variants) + prowJob := models.ProwJob{ + Kind: models.ProwKind("periodic"), + Name: name, + Release: release, + Variants: variants, + } 
+ var existing models.ProwJob + if err := dbc.DB.Where("name = ?", name).FirstOrCreate(&existing, prowJob).Error; err != nil { + return fmt.Errorf("failed to create ProwJob %s: %w", name, err) + } } + } + log.Infof("Created ProwJobs for %d releases x %d jobs", len(syntheticReleases), len(syntheticJobs)) + return nil +} + +type testInfo struct { + name string + uniqueID string + component string + capabilities []string +} + +func seedTestsAndOwnerships(dbc *db.DB) error { + var suite models.Suite + if err := dbc.DB.Where("name = ?", "synthetic").First(&suite).Error; err != nil { + return fmt.Errorf("failed to find suite: %w", err) + } - prowJob := models.ProwJob{ - Kind: models.ProwKind("periodic"), - Name: fmt.Sprintf("sippy-test-job-%s-test-%d", release, i), - Release: release, - // TestGridURL, Bugs, and JobRuns are left empty as requested - Variants: []string{"Platform:aws", "Upgrade:none", jobTier}, + seenTests := map[string]testInfo{} + for _, ts := range syntheticTests { + if _, ok := seenTests[ts.testName]; !ok { + seenTests[ts.testName] = testInfo{ + name: ts.testName, + uniqueID: ts.testID, + component: ts.component, + capabilities: ts.capabilities, + } } + } - // Use FirstOrCreate to avoid duplicates - only creates if a ProwJob with this name doesn't exist - var existingJob models.ProwJob - if err := dbc.DB.Where("name = ?", prowJob.Name).FirstOrCreate(&existingJob, prowJob).Error; err != nil { - return fmt.Errorf("failed to create or find ProwJob %s: %v", prowJob.Name, err) + for _, info := range seenTests { + testModel := models.Test{Name: info.name} + var existingTest models.Test + if err := dbc.DB.Where("name = ?", info.name).FirstOrCreate(&existingTest, testModel).Error; err != nil { + return fmt.Errorf("failed to create Test %s: %w", info.name, err) } - // Log whether we created a new job or found an existing one - if existingJob.CreatedAt.IsZero() || existingJob.CreatedAt.Equal(existingJob.UpdatedAt) { - log.Debugf("Created new ProwJob: %s", prowJob.Name) - } else { - log.Debugf("ProwJob already exists: %s", prowJob.Name) + ownership := models.TestOwnership{ + UniqueID: info.uniqueID, + Name: info.name, + TestID: existingTest.ID, + Suite: "synthetic", + SuiteID: &suite.ID, + Component: info.component, + Capabilities: info.capabilities, + } + var existingOwnership models.TestOwnership + if err := dbc.DB.Where("name = ? 
AND suite = ?", info.name, "synthetic").FirstOrCreate(&existingOwnership, ownership).Error; err != nil { + return fmt.Errorf("failed to create TestOwnership for %s: %w", info.name, err) } } - + log.Infof("Created %d tests with ownership records", len(seenTests)) return nil } -func createTestModels(dbc *db.DB, testNames []string) error { - for _, testName := range testNames { - testModel := models.Test{ - Name: testName, +type jobReleaseKey struct { + jobTemplate string + release string +} + +func seedJobRunsAndResults(dbc *db.DB) (int, int, error) { + var suite models.Suite + if err := dbc.DB.Where("name = ?", "synthetic").First(&suite).Error; err != nil { + return 0, 0, fmt.Errorf("failed to find suite: %w", err) + } + + maxRuns := map[jobReleaseKey]int{} + for _, ts := range syntheticTests { + for jobTpl, releaseCounts := range ts.jobCounts { + for release, counts := range releaseCounts { + key := jobReleaseKey{jobTpl, release} + if counts.total > maxRuns[key] { + maxRuns[key] = counts.total + } + } } + } - // Use FirstOrCreate to avoid duplicates - only creates if a Test with this name doesn't exist - var existingTest models.Test - if err := dbc.DB.Where("name = ?", testModel.Name).FirstOrCreate(&existingTest, testModel).Error; err != nil { - return fmt.Errorf("failed to create or find Test %s: %v", testModel.Name, err) + testIDsByName := map[string]uint{} + var allTests []models.Test + if err := dbc.DB.Find(&allTests).Error; err != nil { + return 0, 0, fmt.Errorf("failed to fetch tests: %w", err) + } + for _, t := range allTests { + testIDsByName[t.Name] = t.ID + } + + totalRuns := 0 + totalResults := 0 + for jrKey, runCount := range maxRuns { + if runCount == 0 { + continue } - if existingTest.CreatedAt.IsZero() || existingTest.CreatedAt.Equal(existingTest.UpdatedAt) { - log.Debugf("Created new Test: %s", testModel.Name) - } else { - log.Debugf("Test already exists: %s", testModel.Name) + jobName := fmt.Sprintf(jrKey.jobTemplate, jrKey.release) + var prowJob models.ProwJob + if err := dbc.DB.Where("name = ?", jobName).First(&prowJob).Error; err != nil { + return 0, 0, fmt.Errorf("failed to find ProwJob %s: %w", jobName, err) + } + + runs, results, err := seedRunsForJob(dbc, &suite, prowJob, jrKey, runCount, testIDsByName) + if err != nil { + return 0, 0, err } + totalRuns += runs + totalResults += results + + log.Debugf("Created %d runs for %s", runCount, jobName) } - return nil + return totalRuns, totalResults, nil } -func createTestSuite(dbc *db.DB) error { - suite := models.Suite{ - Name: "ourtests", +func seedRunsForJob(dbc *db.DB, suite *models.Suite, prowJob models.ProwJob, jrKey jobReleaseKey, runCount int, testIDsByName map[string]uint) (int, int, error) { + start, end := releaseTimeWindow(jrKey.release) + window := end.Sub(start) + interval := window / time.Duration(runCount) + + runIDs := make([]uint, runCount) + for i := range runCount { + timestamp := start.Add(time.Duration(i) * interval) + run := models.ProwJobRun{ + ProwJobID: prowJob.ID, + Cluster: "build01", + Timestamp: timestamp, + Duration: 3 * time.Hour, + } + if err := dbc.DB.Create(&run).Error; err != nil { + return 0, 0, fmt.Errorf("failed to create ProwJobRun: %w", err) + } + runIDs[i] = run.ID } - // Use FirstOrCreate to avoid duplicates - var existingSuite models.Suite - if err := dbc.DB.Where("name = ?", suite.Name).FirstOrCreate(&existingSuite, suite).Error; err != nil { - return fmt.Errorf("failed to create or find Suite %s: %v", suite.Name, err) + // Runs that get test results (all except the last 2) + 
testableRuns := runCount + if testableRuns > 2 { + testableRuns = runCount - 2 } - return nil -} + runsWithFailure := map[uint]bool{} + totalResults := 0 + + for _, ts := range syntheticTests { + releaseCounts, hasJob := ts.jobCounts[jrKey.jobTemplate] + if !hasJob { + continue + } + counts, hasRelease := releaseCounts[jrKey.release] + if !hasRelease || counts.total == 0 { + continue + } + + testID, ok := testIDsByName[ts.testName] + if !ok { + return 0, 0, fmt.Errorf("test %q not found in DB", ts.testName) + } -func createProwJobRuns(dbc *db.DB, runsPerJob int) error { - var prowJobs []models.ProwJob - if err := dbc.DB.Find(&prowJobs).Error; err != nil { - return fmt.Errorf("failed to fetch existing ProwJobs: %v", err) + for i := 0; i < counts.total && i < testableRuns; i++ { + var status int + switch { + case i < counts.success-counts.flake: + status = 1 // pass + case i < counts.success: + status = 13 // flake (counts as success too) + default: + status = 12 // failure + runsWithFailure[runIDs[i]] = true + } + + result := models.ProwJobRunTest{ + ProwJobRunID: runIDs[i], + TestID: testID, + SuiteID: &suite.ID, + Status: status, + Duration: 5.0, + CreatedAt: start.Add(time.Duration(i) * interval), + } + if err := dbc.DB.Create(&result).Error; err != nil { + return 0, 0, fmt.Errorf("failed to create ProwJobRunTest: %w", err) + } + totalResults++ + } } - var tests []models.Test - if err := dbc.DB.Find(&tests).Error; err != nil { - return fmt.Errorf("failed to fetch existing Tests: %v", err) + // Set OverallResult on all runs + for i, runID := range runIDs { + var overallResult v1.JobOverallResult + var succeeded, failed bool + + if i >= testableRuns { + overallResult = v1.JobInternalInfrastructureFailure + failed = true + } else if runsWithFailure[runID] { + overallResult = v1.JobTestFailure + failed = true + } else { + overallResult = v1.JobSucceeded + succeeded = true + } + + if err := dbc.DB.Model(&models.ProwJobRun{}).Where("id = ?", runID). 
+ Updates(map[string]any{ + "overall_result": overallResult, + "succeeded": succeeded, + "failed": failed, + }).Error; err != nil { + return 0, 0, fmt.Errorf("failed to update ProwJobRun result: %w", err) + } } - var suite models.Suite - if err := dbc.DB.Where("name = ?", "ourtests").First(&suite).Error; err != nil { - return fmt.Errorf("failed to find Suite 'ourtests': %v", err) + // Update test_failures count + if err := dbc.DB.Exec(` + UPDATE prow_job_runs SET test_failures = COALESCE(( + SELECT COUNT(*) FROM prow_job_run_tests + WHERE prow_job_run_id = prow_job_runs.id AND status = 12 + ), 0) WHERE prow_job_id = ?`, prowJob.ID).Error; err != nil { + return 0, 0, fmt.Errorf("updating test_failures for prow job %s: %w", prowJob.Name, err) } - log.Infof("Found %d ProwJobs, creating %d runs for each", len(prowJobs), runsPerJob) + return runCount, totalResults, nil +} - // Calculate time range: past 2 weeks from now - now := time.Now() - twoWeeksAgo := now.AddDate(0, 0, -14) +func syncRegressions(dbc *db.DB) error { + provider := pgprovider.NewPostgresProvider(dbc, nil) + ctx := context.Background() - // Duration for each run: 3 hours - runDuration := 3 * time.Hour + releases, err := provider.QueryReleases(ctx) + if err != nil { + return fmt.Errorf("querying releases: %w", err) + } - for _, prowJob := range prowJobs { - log.Infof("Creating %d ProwJobRuns for ProwJob: %s", runsPerJob, prowJob.Name) + viewsData, err := os.ReadFile(syntheticViewsFile) + if err != nil { + return fmt.Errorf("reading views file: %w", err) + } + var views apitype.SippyViews + if err := yaml.Unmarshal(viewsData, &views); err != nil { + return fmt.Errorf("parsing views file: %w", err) + } - for i := 0; i < runsPerJob; i++ { - // Log progress every 10 runs to show activity - if (i+1)%10 == 0 { - log.Infof(" Progress: %d/%d runs created for %s", i+1, runsPerJob, prowJob.Name) - } + backend := componentreadiness.NewPostgresRegressionStore(dbc, nil) + rLog := log.WithField("source", "seed-regression-sync") - // Calculate timestamp: spread evenly over the past 2 weeks - totalDuration := 14 * 24 * time.Hour - // Time between runs = total duration / runs - timeBetweenRuns := totalDuration / time.Duration(runsPerJob) - timestamp := twoWeeksAgo.Add(time.Duration(i) * timeBetweenRuns) - - prowJobRun := models.ProwJobRun{ - ProwJobID: prowJob.ID, - Cluster: "build01", - Timestamp: timestamp, - Duration: runDuration, - TestCount: len(tests), - } + for _, view := range views.ComponentReadiness { + baseRelease, err := utils.GetViewReleaseOptions(releases, "basis", view.BaseRelease, 0) + if err != nil { + return fmt.Errorf("error getting base release for view %s: %w", view.Name, err) + } + sampleRelease, err := utils.GetViewReleaseOptions(releases, "sample", view.SampleRelease, 0) + if err != nil { + return fmt.Errorf("error getting sample release for view %s: %w", view.Name, err) + } - if err := dbc.DB.Create(&prowJobRun).Error; err != nil { - return fmt.Errorf("failed to create ProwJobRun for ProwJob %s: %v", prowJob.Name, err) - } + reportOpts := reqopts.RequestOptions{ + BaseRelease: baseRelease, + SampleRelease: sampleRelease, + VariantOption: view.VariantOptions, + AdvancedOption: view.AdvancedOptions, + } - var testFailures int - for _, test := range tests { - // Determine test status based on random chance - // 5% chance of failure, 10% chance of flake, 85% chance of pass - // nolint: gosec - randNum := rand.Float64() - var status int - if randNum < 0.05 { - status = 12 // failure - testFailures++ - } else if randNum < 
0.15 { - status = 13 // flake - } else { - status = 1 // pass - } + report, reportErrs := componentreadiness.GetComponentReport(ctx, provider, dbc, reportOpts, "") + if len(reportErrs) > 0 { + for _, e := range reportErrs { + rLog.WithError(e).Warn("report generation error") + } + return fmt.Errorf("error generating component report for view %s", view.Name) + } - prowJobRunTest := models.ProwJobRunTest{ - ProwJobRunID: prowJobRun.ID, - TestID: test.ID, - SuiteID: &suite.ID, - Status: status, - Duration: 5.0, // 5 seconds - CreatedAt: timestamp, - } + activeRegs, err := componentreadiness.SyncRegressionsForReport(backend, view, rLog, &report) + if err != nil { + return fmt.Errorf("error syncing regressions for view %s: %w", view.Name, err) + } - if err := dbc.DB.Create(&prowJobRunTest).Error; err != nil { - return fmt.Errorf("failed to create ProwJobRunTest for test %s: %v", test.Name, err) + // Close regressions no longer in the report + allRegs, err := backend.ListCurrentRegressionsForRelease(view.SampleRelease.Name) + if err != nil { + return fmt.Errorf("error listing regressions: %w", err) + } + activeIDs := map[uint]bool{} + for _, r := range activeRegs { + activeIDs[r.ID] = true + } + now := time.Now() + for _, reg := range allRegs { + if !activeIDs[reg.ID] && !reg.Closed.Valid { + reg.Closed = sql.NullTime{Valid: true, Time: now} + if err := backend.UpdateRegression(reg); err != nil { + return fmt.Errorf("error closing regression %d: %w", reg.ID, err) } } + } - // Set overall result based on test failures and random factors - var overallResult v1.JobOverallResult - if testFailures > 0 { - prowJobRun.Failed = true - prowJobRun.Succeeded = false - prowJobRun.TestFailures = testFailures - - // Randomly assign different failure types - // nolint: gosec - failureType := rand.Float64() - if failureType < 0.7 { - overallResult = v1.JobTestFailure // 70% test failures - } else if failureType < 0.85 { - overallResult = v1.JobUpgradeFailure // 15% upgrade failures - } else if failureType < 0.92 { - overallResult = v1.JobInstallFailure // 7% install failures - } else { - overallResult = v1.JobExternalInfrastructureFailure // 8% infrastructure failures - } - } else { - prowJobRun.Failed = false - prowJobRun.Succeeded = true - prowJobRun.TestFailures = 0 - overallResult = v1.JobSucceeded - } - prowJobRun.OverallResult = overallResult + rLog.Infof("synced regressions for view %s: %d active", view.Name, len(activeRegs)) + } + + if err := backend.ResolveTriages(); err != nil { + return fmt.Errorf("error resolving triages: %w", err) + } - if err := dbc.DB.Save(&prowJobRun).Error; err != nil { - return fmt.Errorf("failed to update ProwJobRun for ProwJob %s: %v", prowJob.Name, err) + return nil +} + +const syntheticViewsFile = "config/e2e-views.yaml" + +// writeSyntheticViewsFile generates a views file with include_variants matching the seed data. 
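+// The single generated view, 4.22-main, compares a 4.22 sample over now-3d..now
+// against a 4.21 basis over now-60d..now-30d, matching the windows produced by
+// releaseTimeWindow above.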
+func writeSyntheticViewsFile() error { + // Collect all unique variant values from synthetic jobs + allVariants := map[string]map[string]bool{} + for _, job := range syntheticJobs { + for k, v := range job.variants { + if allVariants[k] == nil { + allVariants[k] = map[string]bool{} } + allVariants[k][v] = true } + } + + includeVariants := map[string][]string{} + for k, vals := range allVariants { + sorted := make([]string, 0, len(vals)) + for v := range vals { + sorted = append(sorted, v) + } + sort.Strings(sorted) + includeVariants[k] = sorted + } + + dbGroupBy := sets.NewString("Architecture", "FeatureSet", "Installer", "Network", "Platform", + "Suite", "Topology", "Upgrade", "LayeredProduct") + columnGroupBy := sets.NewString("Network", "Platform", "Topology") + + views := apitype.SippyViews{ + ComponentReadiness: []crview.View{ + { + Name: "4.22-main", + BaseRelease: reqopts.RelativeRelease{ + Release: reqopts.Release{Name: "4.21"}, + RelativeStart: "now-60d", + RelativeEnd: "now-30d", + }, + SampleRelease: reqopts.RelativeRelease{ + Release: reqopts.Release{Name: "4.22"}, + RelativeStart: "now-3d", + RelativeEnd: "now", + }, + VariantOptions: reqopts.Variants{ + ColumnGroupBy: columnGroupBy, + DBGroupBy: dbGroupBy, + IncludeVariants: includeVariants, + }, + AdvancedOptions: reqopts.Advanced{ + Confidence: 95, + PityFactor: 5, + MinimumFailure: 3, + PassRateRequiredNewTests: 90, + IncludeMultiReleaseAnalysis: true, + }, + PrimeCache: crview.PrimeCache{Enabled: true}, + RegressionTracking: crview.RegressionTracking{Enabled: true}, + }, + }, + } + + data, err := yaml.Marshal(views) + if err != nil { + return fmt.Errorf("marshaling views: %w", err) + } + + if err := os.WriteFile(syntheticViewsFile, data, 0o600); err != nil { + return fmt.Errorf("writing %s: %w", syntheticViewsFile, err) + } + + log.Infof("Generated views file: %s", syntheticViewsFile) + return nil +} + +// variantMapToArray converts a variant map to a pq.StringArray. +func variantMapToArray(m map[string]string) pq.StringArray { + result := make([]string, 0, len(m)) + for k, v := range m { + result = append(result, k+":"+v) + } + return result +} - log.Infof("Completed creating %d ProwJobRuns for ProwJob: %s", runsPerJob, prowJob.Name) +func createTestSuite(dbc *db.DB, name string) error { + suite := models.Suite{ + Name: name, + } + + var existingSuite models.Suite + if err := dbc.DB.Where("name = ?", suite.Name).FirstOrCreate(&existingSuite, suite).Error; err != nil { + return fmt.Errorf("failed to create or find Suite %s: %v", suite.Name, err) } return nil @@ -349,13 +861,12 @@ func createLabelsAndSymptoms(dbc *db.DB) error { UpdatedBy: "seed-data", } - // Create sample labels labels := []jobrunscan.Label{ { LabelContent: jobrunscan.LabelContent{ ID: "InfraFailure", LabelTitle: "Infrastructure failure: omit job from CR", - Explanation: "Job failed due to **infrastructure issues** not related to product code. 
See [TRT documentation](https://docs.ci.openshift.org/docs/architecture/ci-operator/) for more details.", + Explanation: "Job failed due to **infrastructure issues** not related to product code.", }, Metadata: metadata, }, @@ -363,7 +874,7 @@ func createLabelsAndSymptoms(dbc *db.DB) error { LabelContent: jobrunscan.LabelContent{ ID: "ClusterDNSFlake", LabelTitle: "Cluster DNS resolution failure(s)", - Explanation: "Job experienced DNS resolution timeouts in the cluster:\n\n- Check for network issues\n- Review DNS server logs\n- Examine cluster network configuration", + Explanation: "Job experienced DNS resolution timeouts in the cluster.", }, Metadata: metadata, }, @@ -371,7 +882,7 @@ func createLabelsAndSymptoms(dbc *db.DB) error { LabelContent: jobrunscan.LabelContent{ ID: "ClusterInstallTimeout", LabelTitle: "Cluster install timeout", - Explanation: "Cluster installation exceeded timeout threshold. This may indicate:\n\n1. Slow infrastructure provisioning\n2. Network connectivity problems\n3. Image pull failures", + Explanation: "Cluster installation exceeded timeout threshold.", }, Metadata: metadata, }, @@ -379,7 +890,7 @@ func createLabelsAndSymptoms(dbc *db.DB) error { LabelContent: jobrunscan.LabelContent{ ID: "IntervalFile", LabelTitle: "Has interval file(s)", - Explanation: "Job produced interval monitoring files. Use the `intervals` tool to analyze timing data.", + Explanation: "Job produced interval monitoring files.", }, HideDisplayContexts: []string{jobrunscan.MetricsContext, jobrunscan.JAQOptsContext}, Metadata: metadata, @@ -388,7 +899,7 @@ func createLabelsAndSymptoms(dbc *db.DB) error { LabelContent: jobrunscan.LabelContent{ ID: "APIServerTimeout", LabelTitle: "API server timeout", - Explanation: "Requests to the API server timed out. 
Common causes:\n\n- High API server load\n- Network latency issues\n- Slow etcd responses", + Explanation: "Requests to the API server timed out.", }, Metadata: metadata, }, @@ -399,14 +910,8 @@ func createLabelsAndSymptoms(dbc *db.DB) error { if err := dbc.DB.Where("id = ?", label.ID).FirstOrCreate(&existing, label).Error; err != nil { return fmt.Errorf("failed to create or find label %s: %v", label.ID, err) } - if existing.CreatedAt.IsZero() || existing.CreatedAt.Equal(existing.UpdatedAt) { - log.Debugf("Created new Label: %s", label.ID) - } else { - log.Debugf("Label already exists: %s", label.ID) - } } - // Create sample symptoms symptoms := []jobrunscan.Symptom{ { SymptomContent: jobrunscan.SymptomContent{ @@ -459,71 +964,7 @@ func createLabelsAndSymptoms(dbc *db.DB) error { if err := dbc.DB.Where("id = ?", symptom.ID).FirstOrCreate(&existing, symptom).Error; err != nil { return fmt.Errorf("failed to create or find symptom %s: %v", symptom.ID, err) } - if existing.CreatedAt.IsZero() || existing.CreatedAt.Equal(existing.UpdatedAt) { - log.Debugf("Created new Symptom: %s", symptom.ID) - } else { - log.Debugf("Symptom already exists: %s", symptom.ID) - } } return nil } - -func applyLabelsToJobRuns(dbc *db.DB) error { - // Fetch all job runs - var jobRuns []models.ProwJobRun - if err := dbc.DB.Find(&jobRuns).Error; err != nil { - return fmt.Errorf("failed to fetch job runs: %v", err) - } - - // Fetch all labels - var labels []jobrunscan.Label - if err := dbc.DB.Find(&labels).Error; err != nil { - return fmt.Errorf("failed to fetch labels: %v", err) - } - - if len(labels) == 0 { - log.Warn("No labels found, skipping label application") - return nil - } - - labelIDs := make([]string, len(labels)) - for i, label := range labels { - labelIDs[i] = label.ID - } - - // Apply labels to approximately 25% of job runs - labeledCount := 0 - for i := range jobRuns { - // nolint: gosec // we do not care that the randomness is weak - if rand.Float64() > 0.25 { - continue - } - // Randomly select 1-3 labels - // nolint: gosec - numLabels := rand.Intn(3) + 1 - selectedLabels := make([]string, 0, numLabels) - - // Randomly pick unique labels - usedIndices := make(map[int]bool) - for len(selectedLabels) < numLabels && len(selectedLabels) < len(labelIDs) { - // nolint: gosec - idx := rand.Intn(len(labelIDs)) - if !usedIndices[idx] { - selectedLabels = append(selectedLabels, labelIDs[idx]) - usedIndices[idx] = true - } - } - - jobRuns[i].Labels = selectedLabels - if err := dbc.DB.Save(&jobRuns[i]).Error; err != nil { - return fmt.Errorf("failed to update job run %d with labels: %v", jobRuns[i].ID, err) - } - labeledCount++ - } - - log.Infof("Applied labels to %d of %d job runs (%.1f%%)", - labeledCount, len(jobRuns), float64(labeledCount)/float64(len(jobRuns))*100) - - return nil -} diff --git a/cmd/sippy/serve.go b/cmd/sippy/serve.go index ee16451e68..631a85d5ea 100644 --- a/cmd/sippy/serve.go +++ b/cmd/sippy/serve.go @@ -19,6 +19,7 @@ import ( resources "github.com/openshift/sippy" "github.com/openshift/sippy/pkg/api/componentreadiness/dataprovider" bqprovider "github.com/openshift/sippy/pkg/api/componentreadiness/dataprovider/bigquery" + pgprovider "github.com/openshift/sippy/pkg/api/componentreadiness/dataprovider/postgres" "github.com/openshift/sippy/pkg/apis/cache" "github.com/openshift/sippy/pkg/bigquery" "github.com/openshift/sippy/pkg/bigquery/bqlabel" @@ -69,10 +70,13 @@ func (f *ServerFlags) BindFlags(flagSet *pflag.FlagSet) { f.ConfigFlags.BindFlags(flagSet) f.APIFlags.BindFlags(flagSet) 
f.JiraFlags.BindFlags(flagSet) - flagSet.StringVar(&f.DataProvider, "data-provider", "bigquery", "Data provider for component readiness: bigquery") + flagSet.StringVar(&f.DataProvider, "data-provider", "bigquery", "Data provider for component readiness: bigquery, postgres") } func (f *ServerFlags) Validate() error { + if f.DataProvider == "postgres" { + return nil + } return f.GoogleCloudFlags.Validate() } @@ -132,8 +136,12 @@ func NewServeCommand() *cobra.Command { crDataProvider = bqprovider.NewBigQueryProvider(bigQueryClient, config.ComponentReadinessConfig.VariantJunitTableOverrides) } + case "postgres": + crDataProvider = pgprovider.NewPostgresProvider(dbc, cacheClient) + log.Info("Using Postgres data provider for component readiness") + default: - return fmt.Errorf("unknown --data-provider %q, must be bigquery", f.DataProvider) + return fmt.Errorf("unknown --data-provider %q, must be bigquery or postgres", f.DataProvider) } gcsClient, err = gcs.NewGCSClient(context.TODO(), diff --git a/config/e2e-views.yaml b/config/e2e-views.yaml index aab16d861c..36b9e3abb6 100644 --- a/config/e2e-views.yaml +++ b/config/e2e-views.yaml @@ -1,81 +1,71 @@ ---- component_readiness: -- name: 4.20-main - base_release: - release: "4.19" - relative_start: ga-30d - relative_end: ga - sample_release: - release: "4.20" - relative_start: now-7d - relative_end: now - variant_options: - column_group_by: - Architecture: {} - Network: {} - Platform: {} - Topology: {} - db_group_by: - Architecture: {} - FeatureSet: {} - Installer: {} - Network: {} - Platform: {} - Suite: {} - Topology: {} - Upgrade: {} - include_variants: - Architecture: - - amd64 - FeatureSet: - - default - - techpreview - Installer: - - ipi - - upi - - hypershift - JobTier: - - blocking - - informing - - standard - LayeredProduct: - - none - - virt - Network: - - ovn - Owner: - - eng - - service-delivery - Platform: - - aws - - azure - - gcp - - metal - - rosa - - vsphere - Topology: - - ha - - microshift - - external - CGroupMode: - - v2 - ContainerRuntime: - - runc - - crun - advanced_options: - minimum_failure: 3 - confidence: 95 - pity_factor: 5 - ignore_missing: false - ignore_disruption: true - flake_as_failure: false - pass_rate_required_new_tests: 95 - include_multi_release_analysis: true - metrics: - enabled: true - regression_tracking: - enabled: true - prime_cache: - enabled: true - automate_jira: - enabled: true + - name: 4.22-main + base_release: + release: "4.21" + relative_start: now-60d + relative_end: now-30d + sample_release: + release: "4.22" + relative_start: now-3d + relative_end: now + test_id_options: {} + test_filters: {} + variant_options: + column_group_by: + Network: {} + Platform: {} + Topology: {} + db_group_by: + Architecture: {} + FeatureSet: {} + Installer: {} + LayeredProduct: {} + Network: {} + Platform: {} + Suite: {} + Topology: {} + Upgrade: {} + include_variants: + Architecture: + - amd64 + - arm64 + FeatureSet: + - default + - techpreview + Installer: + - ipi + LayeredProduct: + - none + Network: + - ovn + Platform: + - aws + - gcp + Suite: + - parallel + - serial + - unknown + Topology: + - ha + Upgrade: + - micro + - minor + - none + advanced_options: + minimum_failure: 3 + confidence: 95 + pity_factor: 5 + pass_rate_required_new_tests: 90 + pass_rate_required_all_tests: 0 + ignore_missing: false + ignore_disruption: false + flake_as_failure: false + include_multi_release_analysis: true + metrics: + enabled: false + regression_tracking: + enabled: true + automate_jira: + enabled: false + prime_cache: + 
enabled: true diff --git a/pkg/api/componentreadiness/dataprovider/postgres/provider.go b/pkg/api/componentreadiness/dataprovider/postgres/provider.go new file mode 100644 index 0000000000..ef09207645 --- /dev/null +++ b/pkg/api/componentreadiness/dataprovider/postgres/provider.go @@ -0,0 +1,791 @@ +package postgres + +import ( + "context" + "fmt" + "math/big" + "slices" + "sort" + "strings" + "time" + + "github.com/lib/pq" + log "github.com/sirupsen/logrus" + + "github.com/openshift/sippy/pkg/api/componentreadiness/dataprovider" + "github.com/openshift/sippy/pkg/api/componentreadiness/utils" + "github.com/openshift/sippy/pkg/apis/api/componentreport/crstatus" + "github.com/openshift/sippy/pkg/apis/api/componentreport/crtest" + "github.com/openshift/sippy/pkg/apis/api/componentreport/reqopts" + "github.com/openshift/sippy/pkg/apis/cache" + v1 "github.com/openshift/sippy/pkg/apis/sippy/v1" + "github.com/openshift/sippy/pkg/db" +) + +var _ dataprovider.DataProvider = &PostgresProvider{} + +// PostgresProvider implements dataprovider.DataProvider using PostgreSQL. +// Designed for local development and testing — not optimized for production scale. +type PostgresProvider struct { + dbc *db.DB + cache cache.Cache +} + +func NewPostgresProvider(dbc *db.DB, c cache.Cache) *PostgresProvider { + if c == nil { + c = &noOpCache{} + } + return &PostgresProvider{dbc: dbc, cache: c} +} + +// noOpCache never stores or returns data; no Redis needed for local dev. +type noOpCache struct{} + +func (n *noOpCache) Get(_ context.Context, _ string, _ time.Duration) ([]byte, error) { + return nil, fmt.Errorf("cache miss") +} +func (n *noOpCache) Set(_ context.Context, _ string, _ []byte, _ time.Duration) error { return nil } + +func (p *PostgresProvider) Cache() cache.Cache { + return p.cache +} + +// --- Variant helpers --- + +// parseVariants splits a pq.StringArray like ["Platform:aws", "Upgrade:none"] into a map. +func parseVariants(variants pq.StringArray) map[string]string { + result := make(map[string]string, len(variants)) + for _, v := range variants { + if k, val, ok := strings.Cut(v, ":"); ok { + result[k] = val + } + } + return result +} + +// variantMapToSlice converts a map to sorted "Key:Value" strings. +func variantMapToSlice(m map[string]string) []string { + result := make([]string, 0, len(m)) + for k, v := range m { + result = append(result, k+":"+v) + } + sort.Strings(result) + return result +} + +// filterByDBGroupBy returns a copy of the variant map keeping only keys in dbGroupBy. +func filterByDBGroupBy(variants map[string]string, dbGroupBy map[string]bool) map[string]string { + filtered := make(map[string]string, len(dbGroupBy)) + for k, v := range variants { + if dbGroupBy[k] { + filtered[k] = v + } + } + return filtered +} + +// matchesIncludeVariants checks if a variant map passes the include filter. +func matchesIncludeVariants(variants map[string]string, includeVariants map[string][]string) bool { + for key, allowed := range includeVariants { + val, exists := variants[key] + if !exists { + return false + } + if !slices.Contains(allowed, val) { + return false + } + } + return true +} + +// --- MetadataQuerier --- + +func (p *PostgresProvider) QueryJobVariants(_ context.Context) (crtest.JobVariants, []error) { + variants := crtest.JobVariants{Variants: map[string][]string{}} + + var pairs []string + err := p.dbc.DB.Raw(`SELECT DISTINCT unnest(variants) AS pair FROM prow_jobs WHERE deleted_at IS NULL`). 
+ Pluck("pair", &pairs).Error + if err != nil { + return variants, []error{fmt.Errorf("querying job variants: %w", err)} + } + + grouped := map[string]map[string]bool{} + for _, pair := range pairs { + k, v, ok := strings.Cut(pair, ":") + if !ok { + continue + } + if grouped[k] == nil { + grouped[k] = map[string]bool{} + } + grouped[k][v] = true + } + + for k, vals := range grouped { + sorted := make([]string, 0, len(vals)) + for v := range vals { + sorted = append(sorted, v) + } + sort.Strings(sorted) + variants.Variants[k] = sorted + } + return variants, nil +} + +// releaseMetadata holds hardcoded release info for known releases. +// This avoids needing a releases table — we derive release names from prow_jobs +// and fill in metadata from this map. +var releaseMetadata = map[string]struct { + previousRelease string + gaOffsetDays int // 0 = no GA date (in development) +}{ + "4.17": {previousRelease: "4.16", gaOffsetDays: -540}, + "4.18": {previousRelease: "4.17", gaOffsetDays: -395}, + "4.19": {previousRelease: "4.18", gaOffsetDays: -289}, + "4.20": {previousRelease: "4.19", gaOffsetDays: -163}, + "4.21": {previousRelease: "4.20", gaOffsetDays: -58}, + "4.22": {previousRelease: "4.21"}, + "5.0": {previousRelease: "4.22"}, +} + +func (p *PostgresProvider) QueryReleases(_ context.Context) ([]v1.Release, error) { + var releaseNames []string + err := p.dbc.DB.Raw(`SELECT DISTINCT release FROM prow_jobs WHERE deleted_at IS NULL ORDER BY release DESC`). + Pluck("release", &releaseNames).Error + if err != nil { + return nil, fmt.Errorf("querying releases: %w", err) + } + + caps := map[v1.ReleaseCapability]bool{ + v1.ComponentReadinessCap: true, + v1.FeatureGatesCap: true, + v1.MetricsCap: true, + v1.PayloadTagsCap: true, + v1.SippyClassicCap: true, + } + + now := time.Now().UTC() + var releases []v1.Release + for _, name := range releaseNames { + rel := v1.Release{ + Release: name, + Capabilities: caps, + } + if meta, ok := releaseMetadata[name]; ok { + rel.PreviousRelease = meta.previousRelease + if meta.gaOffsetDays != 0 { + ga := now.AddDate(0, 0, meta.gaOffsetDays) + rel.GADate = &ga + } + } + releases = append(releases, rel) + } + return releases, nil +} + +func (p *PostgresProvider) QueryReleaseDates(_ context.Context, _ reqopts.RequestOptions) ([]crtest.ReleaseTimeRange, []error) { + // Derive time ranges from actual data in the DB rather than hardcoded GA dates. + // This ensures fallback queries find data where it actually exists. 
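+ // For the seeded data, 4.20 runs sit 90-120 days back and 4.19 runs 150-180 days
+ // back (see releaseTimeWindow in seed_data.go), so windows anchored to the GA
+ // offsets in releaseMetadata would generally not line up with them.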
+ type releaseRange struct { + Release string + Start time.Time + End time.Time + } + var ranges []releaseRange + err := p.dbc.DB.Raw(` + SELECT pj.release, + MIN(pjr.timestamp) AS start, + MAX(pjr.timestamp) AS end + FROM prow_job_runs pjr + JOIN prow_jobs pj ON pj.id = pjr.prow_job_id + WHERE pj.deleted_at IS NULL AND pjr.deleted_at IS NULL + GROUP BY pj.release + ORDER BY pj.release DESC + `).Scan(&ranges).Error + if err != nil { + return nil, []error{fmt.Errorf("querying release dates: %w", err)} + } + + var dates []crtest.ReleaseTimeRange + for _, r := range ranges { + start := r.Start + end := r.End + dates = append(dates, crtest.ReleaseTimeRange{ + Release: r.Release, + Start: &start, + End: &end, + }) + } + return dates, nil +} + +func (p *PostgresProvider) QueryUniqueVariantValues(_ context.Context, field string, nested bool) ([]string, error) { + if nested { + // Return all variant key names + var pairs []string + err := p.dbc.DB.Raw(` + SELECT DISTINCT unnest(variants) AS pair FROM prow_jobs + WHERE deleted_at IS NULL + `).Pluck("pair", &pairs).Error + if err != nil { + return nil, err + } + keys := map[string]bool{} + for _, pair := range pairs { + if k, _, ok := strings.Cut(pair, ":"); ok { + keys[k] = true + } + } + result := make([]string, 0, len(keys)) + for k := range keys { + result = append(result, k) + } + sort.Strings(result) + return result, nil + } + + // Map BQ column names to variant key names + fieldMap := map[string]string{ + "platform": "Platform", + "network": "Network", + "arch": "Architecture", + "upgrade": "Upgrade", + } + variantKey, ok := fieldMap[field] + if !ok { + return []string{}, nil + } + + var pairs []string + err := p.dbc.DB.Raw(` + SELECT DISTINCT unnest(variants) AS pair FROM prow_jobs + WHERE deleted_at IS NULL + `).Pluck("pair", &pairs).Error + if err != nil { + return nil, err + } + + vals := map[string]bool{} + for _, pair := range pairs { + if k, v, ok := strings.Cut(pair, ":"); ok && k == variantKey { + vals[v] = true + } + } + result := make([]string, 0, len(vals)) + for v := range vals { + result = append(result, v) + } + sort.Strings(result) + return result, nil +} + +// --- TestStatusQuerier --- + +// testStatusRow is the result of the aggregation query. +type testStatusRow struct { + TestID string `gorm:"column:test_id"` + TestName string `gorm:"column:test_name"` + TestSuite string `gorm:"column:test_suite"` + Component string `gorm:"column:component"` + Capabilities pq.StringArray `gorm:"column:capabilities;type:text[]"` + ProwJobID uint `gorm:"column:prow_job_id"` + TotalCount int `gorm:"column:total_count"` + SuccessCount int `gorm:"column:success_count"` + FlakeCount int `gorm:"column:flake_count"` + LastFailure *time.Time `gorm:"column:last_failure"` +} + +const testStatusQuery = ` +WITH deduped AS ( + SELECT DISTINCT ON (pjrt.prow_job_run_id, pjrt.test_id, pjrt.suite_id) + pjrt.test_id, pjrt.suite_id, pjrt.status, + pjr.timestamp, pj.id AS prow_job_id + FROM prow_job_run_tests pjrt + JOIN prow_job_runs pjr ON pjr.id = pjrt.prow_job_run_id + JOIN prow_jobs pj ON pj.id = pjr.prow_job_id + WHERE pj.release = ? + AND pjr.timestamp >= ? AND pjr.timestamp < ? 
+ AND pjrt.deleted_at IS NULL AND pjr.deleted_at IS NULL AND pj.deleted_at IS NULL + AND (pjr.labels IS NULL OR NOT pjr.labels @> ARRAY['InfraFailure']) + ORDER BY pjrt.prow_job_run_id, pjrt.test_id, pjrt.suite_id, + CASE WHEN pjrt.status = 13 THEN 0 WHEN pjrt.status = 1 THEN 1 ELSE 2 END +) +SELECT + tow.unique_id AS test_id, + t.name AS test_name, + COALESCE(s.name, '') AS test_suite, + tow.component, + tow.capabilities, + d.prow_job_id, + COUNT(*) AS total_count, + SUM(CASE WHEN d.status IN (1, 13) THEN 1 ELSE 0 END) AS success_count, + SUM(CASE WHEN d.status = 13 THEN 1 ELSE 0 END) AS flake_count, + MAX(CASE WHEN d.status NOT IN (1, 13) THEN d.timestamp ELSE NULL END) AS last_failure +FROM deduped d +JOIN tests t ON t.id = d.test_id +JOIN test_ownerships tow ON tow.test_id = d.test_id + AND (tow.suite_id = d.suite_id OR (tow.suite_id IS NULL AND d.suite_id IS NULL)) +LEFT JOIN suites s ON s.id = d.suite_id +WHERE tow.staff_approved_obsolete = false +GROUP BY tow.unique_id, t.name, s.name, tow.component, tow.capabilities, d.prow_job_id +` + +func (p *PostgresProvider) queryTestStatus(ctx context.Context, release string, start, end time.Time, + _ crtest.JobVariants, includeVariants map[string][]string, + dbGroupBy map[string]bool) (map[string]crstatus.TestStatus, []error) { + + var rows []testStatusRow + if err := p.dbc.DB.WithContext(ctx).Raw(testStatusQuery, release, start, end).Scan(&rows).Error; err != nil { + return nil, []error{fmt.Errorf("querying test status: %w", err)} + } + + // Batch-fetch all ProwJob variants we need + jobVariantMap := p.fetchJobVariants(rows) + + result := map[string]crstatus.TestStatus{} + for _, row := range rows { + variants, ok := jobVariantMap[row.ProwJobID] + if !ok { + continue + } + + if !matchesIncludeVariants(variants, includeVariants) { + continue + } + + filtered := filterByDBGroupBy(variants, dbGroupBy) + key := crtest.KeyWithVariants{ + TestID: row.TestID, + Variants: filtered, + } + keyStr := key.KeyOrDie() + + existing, exists := result[keyStr] + if exists { + // Merge counts for same test+variant combo from different job runs + existing.Count.TotalCount += row.TotalCount + existing.Count.SuccessCount += row.SuccessCount + existing.Count.FlakeCount += row.FlakeCount + if row.LastFailure != nil && (existing.LastFailure.IsZero() || row.LastFailure.After(existing.LastFailure)) { + existing.LastFailure = *row.LastFailure + } + result[keyStr] = existing + } else { + ts := crstatus.TestStatus{ + TestName: row.TestName, + TestSuite: row.TestSuite, + Component: row.Component, + Capabilities: row.Capabilities, + Variants: variantMapToSlice(filtered), + Count: crtest.Count{ + TotalCount: row.TotalCount, + SuccessCount: row.SuccessCount, + FlakeCount: row.FlakeCount, + }, + } + if row.LastFailure != nil { + ts.LastFailure = *row.LastFailure + } + result[keyStr] = ts + } + } + + return result, nil +} + +// fetchJobVariants loads and caches ProwJob variant maps for the given rows. 
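+// The returned map is keyed by prow_job_id, e.g. 17 -> {"Platform": "aws", "Network": "ovn"}.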
+func (p *PostgresProvider) fetchJobVariants(rows []testStatusRow) map[uint]map[string]string { + jobIDs := map[uint]bool{} + for _, r := range rows { + jobIDs[r.ProwJobID] = true + } + + ids := make([]uint, 0, len(jobIDs)) + for id := range jobIDs { + ids = append(ids, id) + } + + type jobRow struct { + ID uint `gorm:"column:id"` + Variants pq.StringArray `gorm:"column:variants;type:text[]"` + } + + var jobRows []jobRow + if err := p.dbc.DB.Raw(`SELECT id, variants FROM prow_jobs WHERE id IN (?)`, ids).Scan(&jobRows).Error; err != nil { + log.WithError(err).Error("error fetching job variants") + return map[uint]map[string]string{} + } + + result := make(map[uint]map[string]string, len(jobRows)) + for _, jr := range jobRows { + result[jr.ID] = parseVariants(jr.Variants) + } + return result +} + +func (p *PostgresProvider) QueryBaseTestStatus(ctx context.Context, reqOptions reqopts.RequestOptions, + allJobVariants crtest.JobVariants) (map[string]crstatus.TestStatus, []error) { + + dbGroupBy := make(map[string]bool, reqOptions.VariantOption.DBGroupBy.Len()) + for _, k := range reqOptions.VariantOption.DBGroupBy.List() { + dbGroupBy[k] = true + } + + includeVariants := reqOptions.VariantOption.IncludeVariants + if includeVariants == nil { + includeVariants = map[string][]string{} + } + + return p.queryTestStatus( + ctx, + reqOptions.BaseRelease.Name, + reqOptions.BaseRelease.Start, + reqOptions.BaseRelease.End, + allJobVariants, + includeVariants, + dbGroupBy, + ) +} + +func (p *PostgresProvider) QuerySampleTestStatus(ctx context.Context, reqOptions reqopts.RequestOptions, + allJobVariants crtest.JobVariants, + includeVariants map[string][]string, + start, end time.Time) (map[string]crstatus.TestStatus, []error) { + + dbGroupBy := make(map[string]bool, reqOptions.VariantOption.DBGroupBy.Len()) + for _, k := range reqOptions.VariantOption.DBGroupBy.List() { + dbGroupBy[k] = true + } + + if includeVariants == nil { + includeVariants = map[string][]string{} + } + + return p.queryTestStatus( + ctx, + reqOptions.SampleRelease.Name, + start, end, + allJobVariants, + includeVariants, + dbGroupBy, + ) +} + +// --- TestDetailsQuerier --- + +type testDetailRow struct { + TestID string `gorm:"column:test_id"` + TestName string `gorm:"column:test_name"` + ProwJobName string `gorm:"column:prowjob_name"` + ProwJobRunID string `gorm:"column:prowjob_run_id"` + ProwJobURL string `gorm:"column:prowjob_url"` + ProwJobStart time.Time `gorm:"column:prowjob_start"` + ProwJobID uint `gorm:"column:prow_job_id"` + Status int `gorm:"column:status"` + JiraComponent string `gorm:"column:jira_component"` + JiraComponentID *uint `gorm:"column:jira_component_id"` + Capabilities pq.StringArray `gorm:"column:capabilities;type:text[]"` +} + +const testDetailQuery = ` +SELECT + tow.unique_id AS test_id, + t.name AS test_name, + pj.name AS prowjob_name, + CAST(pjr.id AS TEXT) AS prowjob_run_id, + COALESCE(pjr.url, '') AS prowjob_url, + pjr.timestamp AS prowjob_start, + pj.id AS prow_job_id, + pjrt.status, + COALESCE(tow.jira_component, '') AS jira_component, + tow.jira_component_id, + tow.capabilities +FROM prow_job_run_tests pjrt +JOIN prow_job_runs pjr ON pjr.id = pjrt.prow_job_run_id +JOIN prow_jobs pj ON pj.id = pjr.prow_job_id +JOIN tests t ON t.id = pjrt.test_id +JOIN test_ownerships tow ON tow.test_id = pjrt.test_id + AND (tow.suite_id = pjrt.suite_id OR (tow.suite_id IS NULL AND pjrt.suite_id IS NULL)) +WHERE pj.release = ? + AND pjr.timestamp >= ? AND pjr.timestamp < ? 
+ AND pjrt.deleted_at IS NULL AND pjr.deleted_at IS NULL AND pj.deleted_at IS NULL + AND tow.staff_approved_obsolete = false + AND (pjr.labels IS NULL OR NOT pjr.labels @> ARRAY['InfraFailure']) +ORDER BY pjr.timestamp +` + +func (p *PostgresProvider) queryTestDetails(release string, start, end time.Time, + reqOptions reqopts.RequestOptions, _ crtest.JobVariants, + includeVariants map[string][]string) (map[string][]crstatus.TestJobRunRows, []error) { + + var rows []testDetailRow + if err := p.dbc.DB.Raw(testDetailQuery, release, start, end).Scan(&rows).Error; err != nil { + return nil, []error{fmt.Errorf("querying test details: %w", err)} + } + + dbGroupBy := make(map[string]bool, reqOptions.VariantOption.DBGroupBy.Len()) + for _, k := range reqOptions.VariantOption.DBGroupBy.List() { + dbGroupBy[k] = true + } + + if includeVariants == nil { + includeVariants = map[string][]string{} + } + + // Batch-fetch job variants + jobIDs := map[uint]bool{} + for _, r := range rows { + jobIDs[r.ProwJobID] = true + } + ids := make([]uint, 0, len(jobIDs)) + for id := range jobIDs { + ids = append(ids, id) + } + type jobRow struct { + ID uint `gorm:"column:id"` + Variants pq.StringArray `gorm:"column:variants;type:text[]"` + } + var jobRows []jobRow + if len(ids) > 0 { + if err := p.dbc.DB.Raw(`SELECT id, variants FROM prow_jobs WHERE id IN (?)`, ids).Scan(&jobRows).Error; err != nil { + return nil, []error{fmt.Errorf("fetching job variants: %w", err)} + } + } + jobVariantMap := make(map[uint]map[string]string, len(jobRows)) + for _, jr := range jobRows { + jobVariantMap[jr.ID] = parseVariants(jr.Variants) + } + + // Filter test IDs if specified + // Build test ID filter and per-test requested variant filters + testIDFilter := map[string]bool{} + requestedVariantsByTestID := map[string]map[string]string{} + for _, tid := range reqOptions.TestIDOptions { + testIDFilter[tid.TestID] = true + if len(tid.RequestedVariants) > 0 { + requestedVariantsByTestID[tid.TestID] = tid.RequestedVariants + } + } + + result := map[string][]crstatus.TestJobRunRows{} + for _, row := range rows { + if len(testIDFilter) > 0 && !testIDFilter[row.TestID] { + continue + } + + variants, ok := jobVariantMap[row.ProwJobID] + if !ok { + continue + } + if !matchesIncludeVariants(variants, includeVariants) { + continue + } + + // Filter by requested variants (exact match for specific test+variant combo) + if rv, ok := requestedVariantsByTestID[row.TestID]; ok { + match := true + for k, v := range rv { + if variants[k] != v { + match = false + break + } + } + if !match { + continue + } + } + + filtered := filterByDBGroupBy(variants, dbGroupBy) + key := crtest.KeyWithVariants{ + TestID: row.TestID, + Variants: filtered, + } + + successCount := 0 + flakeCount := 0 + if row.Status == 1 || row.Status == 13 { + successCount = 1 + } + if row.Status == 13 { + flakeCount = 1 + } + + var jiraComponentID *big.Rat + if row.JiraComponentID != nil { + jiraComponentID = new(big.Rat).SetUint64(uint64(*row.JiraComponentID)) + } + + entry := crstatus.TestJobRunRows{ + TestKey: key, + TestKeyStr: key.KeyOrDie(), + TestName: row.TestName, + ProwJob: utils.NormalizeProwJobName(row.ProwJobName), + ProwJobRunID: row.ProwJobRunID, + ProwJobURL: row.ProwJobURL, + StartTime: row.ProwJobStart, + Count: crtest.Count{TotalCount: 1, SuccessCount: successCount, FlakeCount: flakeCount}, + JiraComponent: row.JiraComponent, + JiraComponentID: jiraComponentID, + } + + normalizedName := utils.NormalizeProwJobName(row.ProwJobName) + result[normalizedName] = 
append(result[normalizedName], entry) + } + + return result, nil +} + +func (p *PostgresProvider) QueryBaseJobRunTestStatus(_ context.Context, reqOptions reqopts.RequestOptions, + allJobVariants crtest.JobVariants) (map[string][]crstatus.TestJobRunRows, []error) { + + return p.queryTestDetails( + reqOptions.BaseRelease.Name, + reqOptions.BaseRelease.Start, reqOptions.BaseRelease.End, + reqOptions, allJobVariants, reqOptions.VariantOption.IncludeVariants, + ) +} + +func (p *PostgresProvider) QuerySampleJobRunTestStatus(_ context.Context, reqOptions reqopts.RequestOptions, + allJobVariants crtest.JobVariants, + includeVariants map[string][]string, + start, end time.Time) (map[string][]crstatus.TestJobRunRows, []error) { + + return p.queryTestDetails( + reqOptions.SampleRelease.Name, + start, end, + reqOptions, allJobVariants, includeVariants, + ) +} + +// --- JobQuerier --- + +func (p *PostgresProvider) QueryJobRuns(_ context.Context, reqOptions reqopts.RequestOptions, + allJobVariants crtest.JobVariants, + release string, start, end time.Time) (map[string]dataprovider.JobRunStats, error) { + + type jobRunRow struct { + JobName string `gorm:"column:job_name"` + TotalRuns int `gorm:"column:total_runs"` + Successful int `gorm:"column:successful_runs"` + } + + var rows []jobRunRow + err := p.dbc.DB.Raw(` + SELECT + pj.name AS job_name, + COUNT(DISTINCT pjr.id) AS total_runs, + COUNT(DISTINCT CASE WHEN pjr.succeeded THEN pjr.id END) AS successful_runs + FROM prow_jobs pj + JOIN prow_job_runs pjr ON pjr.prow_job_id = pj.id + WHERE pj.release = ? + AND pjr.timestamp >= ? AND pjr.timestamp < ? + AND pj.deleted_at IS NULL AND pjr.deleted_at IS NULL + AND (pj.name LIKE 'periodic-%%' OR pj.name LIKE 'release-%%' OR pj.name LIKE 'aggregator-%%') + GROUP BY pj.name + ORDER BY pj.name + `, release, start, end).Scan(&rows).Error + if err != nil { + return nil, fmt.Errorf("querying job runs: %w", err) + } + + // Apply variant filtering in Go + includeVariants := reqOptions.VariantOption.IncludeVariants + if includeVariants == nil { + includeVariants = map[string][]string{} + } + + // Fetch variants for all jobs + jobNames := make([]string, 0, len(rows)) + for _, r := range rows { + jobNames = append(jobNames, r.JobName) + } + jobVariantMap := map[string]map[string]string{} + if len(jobNames) > 0 { + type jvRow struct { + Name string `gorm:"column:name"` + Variants pq.StringArray `gorm:"column:variants;type:text[]"` + } + var jvRows []jvRow + if err := p.dbc.DB.Raw(`SELECT name, variants FROM prow_jobs WHERE name IN (?) 
AND deleted_at IS NULL`, jobNames).Scan(&jvRows).Error; err != nil { + return nil, fmt.Errorf("fetching job variants: %w", err) + } + for _, jr := range jvRows { + jobVariantMap[jr.Name] = parseVariants(jr.Variants) + } + } + + results := map[string]dataprovider.JobRunStats{} + for _, row := range rows { + if variants, ok := jobVariantMap[row.JobName]; ok { + if !matchesIncludeVariants(variants, includeVariants) { + continue + } + } + passRate := 0.0 + if row.TotalRuns > 0 { + passRate = float64(row.Successful) / float64(row.TotalRuns) * 100 + } + results[row.JobName] = dataprovider.JobRunStats{ + JobName: row.JobName, + TotalRuns: row.TotalRuns, + SuccessfulRuns: row.Successful, + PassRate: passRate, + } + } + + return results, nil +} + +func (p *PostgresProvider) QueryJobVariantValues(_ context.Context, jobNames []string, + variantKeys []string) (map[string]map[string]string, error) { + + if len(jobNames) == 0 { + return map[string]map[string]string{}, nil + } + + type jvRow struct { + Name string `gorm:"column:name"` + Variants pq.StringArray `gorm:"column:variants;type:text[]"` + } + + var rows []jvRow + if err := p.dbc.DB.Raw(`SELECT name, variants FROM prow_jobs WHERE name IN (?) AND deleted_at IS NULL`, jobNames).Scan(&rows).Error; err != nil { + return nil, fmt.Errorf("querying job variant values: %w", err) + } + + keyFilter := map[string]bool{} + for _, k := range variantKeys { + keyFilter[k] = true + } + + results := map[string]map[string]string{} + for _, row := range rows { + parsed := parseVariants(row.Variants) + if len(keyFilter) > 0 { + filtered := map[string]string{} + for k, v := range parsed { + if keyFilter[k] { + filtered[k] = v + } + } + results[row.Name] = filtered + } else { + results[row.Name] = parsed + } + } + return results, nil +} + +func (p *PostgresProvider) LookupJobVariants(_ context.Context, jobName string) (map[string]string, error) { + type jvRow struct { + Variants pq.StringArray `gorm:"column:variants;type:text[]"` + } + + var row jvRow + err := p.dbc.DB.Raw(`SELECT variants FROM prow_jobs WHERE name = ? AND deleted_at IS NULL LIMIT 1`, jobName).Scan(&row).Error + if err != nil { + return nil, fmt.Errorf("looking up job variants: %w", err) + } + return parseVariants(row.Variants), nil +} diff --git a/scripts/e2e.sh b/scripts/e2e.sh index 8c8487217c..583b43d318 100755 --- a/scripts/e2e.sh +++ b/scripts/e2e.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/bin/sh # Shell script meant for developers to run the e2e tests locally without impacting # their running postgres container or sippy process. # It's quite quick to import the older releases below, but in theory @@ -11,14 +11,17 @@ PSQL_PORT="23433" REDIS_CONTAINER="sippy-e2e-test-redis" REDIS_PORT="23479" -if [[ -z "$GCS_SA_JSON_PATH" ]]; then - echo "Must provide path to GCS credential in GCS_SA_JSON_PATH env var" 1>&2 - exit 1 +if [ -z "$GCS_SA_JSON_PATH" ]; then + echo "WARNING: GCS_SA_JSON_PATH not set, data sync and BigQuery tests will be skipped" 1>&2 fi +E2E_EXIT_CODE=0 clean_up () { ARG=$? 
+ if [ $ARG -ne 0 ]; then + E2E_EXIT_CODE=$ARG + fi echo "Stopping sippy API child process: $CHILD_PID" kill $CHILD_PID 2>/dev/null && wait $CHILD_PID 2>/dev/null # Generate coverage report from the server's coverage data @@ -27,10 +30,13 @@ clean_up () { go tool covdata percent -i="$COVDIR" go tool covdata textfmt -i="$COVDIR" -o=e2e-coverage.out # Merge test binary coverage (from -coverprofile) into server binary coverage - if [ -f e2e-test-coverage.out ]; then - echo "Merging test binary coverage into server coverage..." - tail -n +2 e2e-test-coverage.out >> e2e-coverage.out - fi + for f in e2e-test-coverage.out e2e-bq-test-coverage.out unit-test-coverage.out; do + if [ -f "$f" ]; then + echo "Merging $f into server coverage..." + tail -n +2 "$f" >> e2e-coverage.out + rm -f "$f" + fi + done echo "Coverage data written to e2e-coverage.out" echo "View HTML report: go tool cover -html=e2e-coverage.out -o=e2e-coverage.html" fi @@ -40,7 +46,23 @@ clean_up () { echo "Tearing down container $REDIS_CONTAINER" $DOCKER stop -i $REDIS_CONTAINER $DOCKER rm -i $REDIS_CONTAINER - exit $ARG + exit $E2E_EXIT_CODE +} + +wait_for_sippy() { + echo "Waiting for sippy API to start on port $SIPPY_API_PORT..." + TIMEOUT=600 + ELAPSED=0 + while [ $ELAPSED -lt $TIMEOUT ]; do + if curl -s "http://localhost:$SIPPY_API_PORT/api/health" > /dev/null 2>&1; then + echo "Sippy API is ready after ${ELAPSED}s" + return 0 + fi + sleep 2 + ELAPSED=$((ELAPSED + 2)) + done + echo "Timeout waiting for sippy API to start after ${TIMEOUT}s" + return 1 } trap clean_up EXIT @@ -65,6 +87,7 @@ sleep 5 export SIPPY_E2E_DSN="postgresql://postgres:password@localhost:$PSQL_PORT/postgres" export REDIS_URL="redis://localhost:$REDIS_PORT" +export SIPPY_E2E_REPO_ROOT="$(pwd)" # Build with coverage instrumentation COVDIR="$(pwd)/e2e-coverage" @@ -76,13 +99,17 @@ go build -cover -coverpkg=./cmd/...,./pkg/... -mod vendor -o ./sippy ./cmd/sippy echo "Loading database..." GOCOVERDIR="$COVDIR" ./sippy seed-data \ --init-database \ - --database-dsn="$SIPPY_E2E_DSN" \ - --release="4.20" + --database-dsn="$SIPPY_E2E_DSN" # Spawn sippy server off into a separate process: export SIPPY_API_PORT="18080" export SIPPY_ENDPOINT="127.0.0.1" +GCS_ARGS="" +if [ -n "$GCS_SA_JSON_PATH" ]; then + GCS_ARGS="--google-service-account-credential-file $GCS_SA_JSON_PATH" +fi + GOCOVERDIR="$COVDIR" ./sippy serve \ --listen ":$SIPPY_API_PORT" \ --listen-metrics ":12112" \ @@ -90,31 +117,22 @@ GOCOVERDIR="$COVDIR" ./sippy serve \ --enable-write-endpoints \ --log-level debug \ --views config/e2e-views.yaml \ - --google-service-account-credential-file $GCS_SA_JSON_PATH \ - --redis-url="$REDIS_URL" > e2e.log 2>&1 & + $GCS_ARGS \ + --redis-url="$REDIS_URL" \ + --data-provider postgres > e2e.log 2>&1 & CHILD_PID=$! -# Give it time to start up, and fill the redis cache -echo "Waiting for sippy API to start on port $SIPPY_API_PORT, see e2e.log for output..." -TIMEOUT=600 -ELAPSED=0 -while [ $ELAPSED -lt $TIMEOUT ]; do - if curl -s "http://localhost:$SIPPY_API_PORT/api/health" > /dev/null 2>&1; then - echo "Sippy API is ready after ${ELAPSED}s" - break - fi - sleep 2 - ELAPSED=$((ELAPSED + 2)) -done - -if [ $ELAPSED -ge $TIMEOUT ]; then - echo "Timeout waiting for sippy API to start after ${TIMEOUT}s" - exit 1 -fi +wait_for_sippy || exit 1 +# Prime the component readiness cache so triage tests can find cached reports +echo "Priming component readiness cache..." 
+VIEWS=$(curl -sf "http://localhost:$SIPPY_API_PORT/api/component_readiness/views") || { echo "Failed to fetch views"; exit 1; } +for VIEW in $(echo "$VIEWS" | jq -r '.[].name'); do + echo " Priming cache for view: $VIEW" + curl -sf "http://localhost:$SIPPY_API_PORT/api/component_readiness?view=$VIEW" > /dev/null || { echo "Failed to prime cache for view: $VIEW"; exit 1; } +done +echo "Cache priming complete" -# Run our tests that request against the API, args ensure serially and fresh test code compile. -# All output is tee'd to e2e-test.log so results can be reviewed without re-running. -gotestsum ./test/e2e/... -count 1 -p 1 -coverprofile=e2e-test-coverage.out -coverpkg=./pkg/...,./cmd/... 2>&1 | tee e2e-test.log -E2E_EXIT=${PIPESTATUS[0]} -exit $E2E_EXIT +# Run e2e tests +gotestsum ./test/e2e/... -count 1 -p 1 -coverprofile=e2e-test-coverage.out -coverpkg=./pkg/...,./cmd/... +E2E_EXIT_CODE=$? From dbe98b23f13d4c211c0040ef8ac43ce728cec438 Mon Sep 17 00:00:00 2001 From: Stephen Benjamin Date: Wed, 29 Apr 2026 06:45:08 -0400 Subject: [PATCH 2/8] Add standalone dev setup script for seeded local environment Adds scripts/dev-setup.sh which stands up PostgreSQL and Redis containers seeded with deterministic Component Readiness data. Starting the sippy server is optional (--serve flag) so developers using devcontainers can connect to the database directly. Accessible via: make dev (seed only) or make dev SERVE=1 Co-Authored-By: Claude Opus 4.6 --- Makefile | 3 ++ scripts/dev-setup.sh | 105 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100755 scripts/dev-setup.sh diff --git a/Makefile b/Makefile index cdd42b6a3b..e3a06eb10b 100644 --- a/Makefile +++ b/Makefile @@ -62,6 +62,9 @@ clean: e2e: ./scripts/e2e.sh +dev: sippy + ./scripts/dev-setup.sh $(if $(SERVE),--serve) + images: $(DOCKER) build . diff --git a/scripts/dev-setup.sh b/scripts/dev-setup.sh new file mode 100755 index 0000000000..9a28579f18 --- /dev/null +++ b/scripts/dev-setup.sh @@ -0,0 +1,105 @@ +#!/bin/sh +# Stand up a seeded PostgreSQL + Redis environment for local development. +# By default only seeds the database and prints connection info. +# Pass --serve to also start the sippy API server. +# +# Usage: +# make dev # build + seed only +# make dev SERVE=1 # build + seed + start sippy +# scripts/dev-setup.sh # seed only (assumes sippy binary exists) +# scripts/dev-setup.sh --serve # seed + start sippy +# +# To tear down: Ctrl-C (containers are cleaned up automatically) + +set -e + +SERVE=false +for arg in "$@"; do + case "$arg" in + --serve) SERVE=true ;; + esac +done + +DOCKER="${DOCKER:-podman}" +PSQL_CONTAINER="sippy-dev-postgresql" +PSQL_PORT="${PSQL_PORT:-25433}" +REDIS_CONTAINER="sippy-dev-redis" +REDIS_PORT="${REDIS_PORT:-25479}" +SIPPY_API_PORT="${SIPPY_API_PORT:-8080}" + +clean_up() { + echo "" + echo "Shutting down..." + if [ -n "$CHILD_PID" ]; then + kill $CHILD_PID 2>/dev/null && wait $CHILD_PID 2>/dev/null + fi + echo "Stopping $PSQL_CONTAINER" + $DOCKER stop $PSQL_CONTAINER 2>/dev/null + $DOCKER rm $PSQL_CONTAINER 2>/dev/null + echo "Stopping $REDIS_CONTAINER" + $DOCKER stop $REDIS_CONTAINER 2>/dev/null + $DOCKER rm $REDIS_CONTAINER 2>/dev/null +} +trap clean_up EXIT + +# Clean up any stale containers from a previous run +$DOCKER stop $PSQL_CONTAINER 2>/dev/null || true +$DOCKER rm $PSQL_CONTAINER 2>/dev/null || true +$DOCKER stop $REDIS_CONTAINER 2>/dev/null || true +$DOCKER rm $REDIS_CONTAINER 2>/dev/null || true + +echo "Starting PostgreSQL on port $PSQL_PORT..." 
+$DOCKER run --name $PSQL_CONTAINER -e POSTGRES_PASSWORD=password -p $PSQL_PORT:5432 -d quay.io/enterprisedb/postgresql + +echo "Starting Redis on port $REDIS_PORT..." +$DOCKER run --name $REDIS_CONTAINER -p $REDIS_PORT:6379 -d quay.io/openshiftci/redis:latest + +echo "Waiting for PostgreSQL to be ready..." +sleep 5 + +DSN="postgresql://postgres:password@localhost:$PSQL_PORT/postgres" +REDIS_URL="redis://localhost:$REDIS_PORT" + +echo "Seeding database..." +./sippy seed-data --init-database --database-dsn="$DSN" + +echo "" +echo "================================================" +echo " Dev environment ready" +echo " PostgreSQL: $DSN" +echo " Redis: $REDIS_URL" +echo "================================================" + +if [ "$SERVE" = true ]; then + GCS_ARGS="" + if [ -n "$GCS_SA_JSON_PATH" ]; then + GCS_ARGS="--google-service-account-credential-file $GCS_SA_JSON_PATH" + fi + + echo "" + echo "Starting sippy on http://localhost:$SIPPY_API_PORT ..." + echo "Press Ctrl-C to stop" + echo "" + + ./sippy serve \ + --listen ":$SIPPY_API_PORT" \ + --listen-metrics ":12112" \ + --database-dsn="$DSN" \ + --enable-write-endpoints \ + --log-level debug \ + --views config/e2e-views.yaml \ + $GCS_ARGS \ + --redis-url="$REDIS_URL" \ + --data-provider postgres & + CHILD_PID=$! + + wait $CHILD_PID +else + echo "" + echo "To start sippy against this database:" + echo " ./sippy serve --database-dsn=\"$DSN\" --redis-url=\"$REDIS_URL\" --data-provider postgres --views config/e2e-views.yaml --log-level debug" + echo "" + echo "Press Ctrl-C to tear down containers" + # Keep containers alive until user hits Ctrl-C + while true; do sleep 60; done +fi From a8be84271b80526e51301298305f00de2f18ebd9 Mon Sep 17 00:00:00 2001 From: Stephen Benjamin Date: Wed, 29 Apr 2026 07:03:40 -0400 Subject: [PATCH 3/8] Use non-default metrics port in dev-setup to avoid conflicts The default metrics port 12112 may already be in use by a running sippy instance. Use 22112 by default, overridable via SIPPY_METRICS_PORT. Co-Authored-By: Claude Opus 4.6 --- scripts/dev-setup.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/dev-setup.sh b/scripts/dev-setup.sh index 9a28579f18..e3f4f72b0a 100755 --- a/scripts/dev-setup.sh +++ b/scripts/dev-setup.sh @@ -26,6 +26,7 @@ PSQL_PORT="${PSQL_PORT:-25433}" REDIS_CONTAINER="sippy-dev-redis" REDIS_PORT="${REDIS_PORT:-25479}" SIPPY_API_PORT="${SIPPY_API_PORT:-8080}" +SIPPY_METRICS_PORT="${SIPPY_METRICS_PORT:-22112}" clean_up() { echo "" @@ -83,7 +84,7 @@ if [ "$SERVE" = true ]; then ./sippy serve \ --listen ":$SIPPY_API_PORT" \ - --listen-metrics ":12112" \ + --listen-metrics ":$SIPPY_METRICS_PORT" \ --database-dsn="$DSN" \ --enable-write-endpoints \ --log-level debug \ From 2f5a0c671a47351d1249fc46fd6c3c5350001919 Mon Sep 17 00:00:00 2001 From: Stephen Benjamin Date: Wed, 29 Apr 2026 07:04:00 -0400 Subject: [PATCH 4/8] Revert "Use non-default metrics port in dev-setup to avoid conflicts" This reverts commit a8be84271b80526e51301298305f00de2f18ebd9. 
--- scripts/dev-setup.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/dev-setup.sh b/scripts/dev-setup.sh index e3f4f72b0a..9a28579f18 100755 --- a/scripts/dev-setup.sh +++ b/scripts/dev-setup.sh @@ -26,7 +26,6 @@ PSQL_PORT="${PSQL_PORT:-25433}" REDIS_CONTAINER="sippy-dev-redis" REDIS_PORT="${REDIS_PORT:-25479}" SIPPY_API_PORT="${SIPPY_API_PORT:-8080}" -SIPPY_METRICS_PORT="${SIPPY_METRICS_PORT:-22112}" clean_up() { echo "" @@ -84,7 +83,7 @@ if [ "$SERVE" = true ]; then ./sippy serve \ --listen ":$SIPPY_API_PORT" \ - --listen-metrics ":$SIPPY_METRICS_PORT" \ + --listen-metrics ":12112" \ --database-dsn="$DSN" \ --enable-write-endpoints \ --log-level debug \ From ce8efb4d75afb903e333182a60c402b3f29783c4 Mon Sep 17 00:00:00 2001 From: Stephen Benjamin Date: Wed, 29 Apr 2026 07:58:06 -0400 Subject: [PATCH 5/8] Fix seed data to create infra runs beyond test runs, refactor provider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create 2 additional infra-only runs beyond runCount instead of stealing from the test result pool — tests with counts.total == runCount now get all their results. Also refactor fetchJobVariants into fetchJobVariantsByIDs to return errors instead of silently logging, and deduplicate the variant fetching in queryTestDetails. Co-Authored-By: Claude Opus 4.6 --- cmd/sippy/seed_data.go | 99 ++----------------- .../dataprovider/postgres/provider.go | 53 +++++----- 2 files changed, 31 insertions(+), 121 deletions(-) diff --git a/cmd/sippy/seed_data.go b/cmd/sippy/seed_data.go index 4526398971..5f7f9d7d04 100644 --- a/cmd/sippy/seed_data.go +++ b/cmd/sippy/seed_data.go @@ -5,7 +5,6 @@ import ( "database/sql" "fmt" "os" - "sort" "strings" "time" @@ -20,7 +19,6 @@ import ( pgprovider "github.com/openshift/sippy/pkg/api/componentreadiness/dataprovider/postgres" "github.com/openshift/sippy/pkg/api/componentreadiness/utils" apitype "github.com/openshift/sippy/pkg/apis/api" - "github.com/openshift/sippy/pkg/apis/api/componentreport/crview" "github.com/openshift/sippy/pkg/apis/api/componentreport/reqopts" v1 "github.com/openshift/sippy/pkg/apis/sippyprocessing/v1" "github.com/openshift/sippy/pkg/db" @@ -28,7 +26,6 @@ import ( "github.com/openshift/sippy/pkg/db/models/jobrunscan" "github.com/openshift/sippy/pkg/flags" "github.com/openshift/sippy/pkg/sippyserver" - "github.com/openshift/sippy/pkg/util/sets" ) type SeedDataFlags struct { @@ -398,7 +395,7 @@ func seedSyntheticData(dbc *db.DB) error { return fmt.Errorf("failed to check for existing data: %w", err) } if count > 0 { - log.Infof("Database already contains %d ProwJobs, skipping seed. Use --init-database to reset.", count) + log.Infof("Database already contains %d ProwJobs, skipping seed. Drop and recreate the database to re-seed (e.g. 
docker compose down -v).", count) return nil } @@ -424,10 +421,6 @@ func seedSyntheticData(dbc *db.DB) error { return errors.WithMessage(err, "failed to create labels and symptoms") } - if err := writeSyntheticViewsFile(); err != nil { - return errors.WithMessage(err, "failed to write views file") - } - log.Info("Refreshing materialized views...") sippyserver.RefreshData(dbc, nil, false) @@ -575,8 +568,10 @@ func seedRunsForJob(dbc *db.DB, suite *models.Suite, prowJob models.ProwJob, jrK window := end.Sub(start) interval := window / time.Duration(runCount) - runIDs := make([]uint, runCount) - for i := range runCount { + infraRuns := 2 + totalRuns := runCount + infraRuns + runIDs := make([]uint, totalRuns) + for i := range totalRuns { timestamp := start.Add(time.Duration(i) * interval) run := models.ProwJobRun{ ProwJobID: prowJob.ID, @@ -590,12 +585,6 @@ func seedRunsForJob(dbc *db.DB, suite *models.Suite, prowJob models.ProwJob, jrK runIDs[i] = run.ID } - // Runs that get test results (all except the last 2) - testableRuns := runCount - if testableRuns > 2 { - testableRuns = runCount - 2 - } - runsWithFailure := map[uint]bool{} totalResults := 0 @@ -614,7 +603,7 @@ func seedRunsForJob(dbc *db.DB, suite *models.Suite, prowJob models.ProwJob, jrK return 0, 0, fmt.Errorf("test %q not found in DB", ts.testName) } - for i := 0; i < counts.total && i < testableRuns; i++ { + for i := 0; i < counts.total && i < runCount; i++ { var status int switch { case i < counts.success-counts.flake: @@ -646,7 +635,7 @@ func seedRunsForJob(dbc *db.DB, suite *models.Suite, prowJob models.ProwJob, jrK var overallResult v1.JobOverallResult var succeeded, failed bool - if i >= testableRuns { + if i >= runCount { overallResult = v1.JobInternalInfrastructureFailure failed = true } else if runsWithFailure[runID] { @@ -676,7 +665,7 @@ func seedRunsForJob(dbc *db.DB, suite *models.Suite, prowJob models.ProwJob, jrK return 0, 0, fmt.Errorf("updating test_failures for prow job %s: %w", prowJob.Name, err) } - return runCount, totalResults, nil + return totalRuns, totalResults, nil } func syncRegressions(dbc *db.DB) error { @@ -761,78 +750,6 @@ func syncRegressions(dbc *db.DB) error { const syntheticViewsFile = "config/e2e-views.yaml" -// writeSyntheticViewsFile generates a views file with include_variants matching the seed data. 
-func writeSyntheticViewsFile() error { - // Collect all unique variant values from synthetic jobs - allVariants := map[string]map[string]bool{} - for _, job := range syntheticJobs { - for k, v := range job.variants { - if allVariants[k] == nil { - allVariants[k] = map[string]bool{} - } - allVariants[k][v] = true - } - } - - includeVariants := map[string][]string{} - for k, vals := range allVariants { - sorted := make([]string, 0, len(vals)) - for v := range vals { - sorted = append(sorted, v) - } - sort.Strings(sorted) - includeVariants[k] = sorted - } - - dbGroupBy := sets.NewString("Architecture", "FeatureSet", "Installer", "Network", "Platform", - "Suite", "Topology", "Upgrade", "LayeredProduct") - columnGroupBy := sets.NewString("Network", "Platform", "Topology") - - views := apitype.SippyViews{ - ComponentReadiness: []crview.View{ - { - Name: "4.22-main", - BaseRelease: reqopts.RelativeRelease{ - Release: reqopts.Release{Name: "4.21"}, - RelativeStart: "now-60d", - RelativeEnd: "now-30d", - }, - SampleRelease: reqopts.RelativeRelease{ - Release: reqopts.Release{Name: "4.22"}, - RelativeStart: "now-3d", - RelativeEnd: "now", - }, - VariantOptions: reqopts.Variants{ - ColumnGroupBy: columnGroupBy, - DBGroupBy: dbGroupBy, - IncludeVariants: includeVariants, - }, - AdvancedOptions: reqopts.Advanced{ - Confidence: 95, - PityFactor: 5, - MinimumFailure: 3, - PassRateRequiredNewTests: 90, - IncludeMultiReleaseAnalysis: true, - }, - PrimeCache: crview.PrimeCache{Enabled: true}, - RegressionTracking: crview.RegressionTracking{Enabled: true}, - }, - }, - } - - data, err := yaml.Marshal(views) - if err != nil { - return fmt.Errorf("marshaling views: %w", err) - } - - if err := os.WriteFile(syntheticViewsFile, data, 0o600); err != nil { - return fmt.Errorf("writing %s: %w", syntheticViewsFile, err) - } - - log.Infof("Generated views file: %s", syntheticViewsFile) - return nil -} - // variantMapToArray converts a variant map to a pq.StringArray. func variantMapToArray(m map[string]string) pq.StringArray { result := make([]string, 0, len(m)) diff --git a/pkg/api/componentreadiness/dataprovider/postgres/provider.go b/pkg/api/componentreadiness/dataprovider/postgres/provider.go index ef09207645..0eee1a0105 100644 --- a/pkg/api/componentreadiness/dataprovider/postgres/provider.go +++ b/pkg/api/componentreadiness/dataprovider/postgres/provider.go @@ -10,7 +10,6 @@ import ( "time" "github.com/lib/pq" - log "github.com/sirupsen/logrus" "github.com/openshift/sippy/pkg/api/componentreadiness/dataprovider" "github.com/openshift/sippy/pkg/api/componentreadiness/utils" @@ -341,7 +340,18 @@ func (p *PostgresProvider) queryTestStatus(ctx context.Context, release string, } // Batch-fetch all ProwJob variants we need - jobVariantMap := p.fetchJobVariants(rows) + jobIDs := make(map[uint]bool, len(rows)) + for _, r := range rows { + jobIDs[r.ProwJobID] = true + } + ids := make([]uint, 0, len(jobIDs)) + for id := range jobIDs { + ids = append(ids, id) + } + jobVariantMap, err := p.fetchJobVariantsByIDs(ids) + if err != nil { + return nil, []error{err} + } result := map[string]crstatus.TestStatus{} for _, row := range rows { @@ -394,16 +404,10 @@ func (p *PostgresProvider) queryTestStatus(ctx context.Context, release string, return result, nil } -// fetchJobVariants loads and caches ProwJob variant maps for the given rows. 
-func (p *PostgresProvider) fetchJobVariants(rows []testStatusRow) map[uint]map[string]string { - jobIDs := map[uint]bool{} - for _, r := range rows { - jobIDs[r.ProwJobID] = true - } - - ids := make([]uint, 0, len(jobIDs)) - for id := range jobIDs { - ids = append(ids, id) +// fetchJobVariantsByIDs loads ProwJob variant maps for the given job IDs. +func (p *PostgresProvider) fetchJobVariantsByIDs(ids []uint) (map[uint]map[string]string, error) { + if len(ids) == 0 { + return map[uint]map[string]string{}, nil } type jobRow struct { @@ -413,15 +417,14 @@ func (p *PostgresProvider) fetchJobVariants(rows []testStatusRow) map[uint]map[s var jobRows []jobRow if err := p.dbc.DB.Raw(`SELECT id, variants FROM prow_jobs WHERE id IN (?)`, ids).Scan(&jobRows).Error; err != nil { - log.WithError(err).Error("error fetching job variants") - return map[uint]map[string]string{} + return nil, fmt.Errorf("fetching job variants: %w", err) } result := make(map[uint]map[string]string, len(jobRows)) for _, jr := range jobRows { result[jr.ID] = parseVariants(jr.Variants) } - return result + return result, nil } func (p *PostgresProvider) QueryBaseTestStatus(ctx context.Context, reqOptions reqopts.RequestOptions, @@ -542,19 +545,9 @@ func (p *PostgresProvider) queryTestDetails(release string, start, end time.Time for id := range jobIDs { ids = append(ids, id) } - type jobRow struct { - ID uint `gorm:"column:id"` - Variants pq.StringArray `gorm:"column:variants;type:text[]"` - } - var jobRows []jobRow - if len(ids) > 0 { - if err := p.dbc.DB.Raw(`SELECT id, variants FROM prow_jobs WHERE id IN (?)`, ids).Scan(&jobRows).Error; err != nil { - return nil, []error{fmt.Errorf("fetching job variants: %w", err)} - } - } - jobVariantMap := make(map[uint]map[string]string, len(jobRows)) - for _, jr := range jobRows { - jobVariantMap[jr.ID] = parseVariants(jr.Variants) + jobVariantMap, err := p.fetchJobVariantsByIDs(ids) + if err != nil { + return nil, []error{err} } // Filter test IDs if specified @@ -616,11 +609,12 @@ func (p *PostgresProvider) queryTestDetails(release string, start, end time.Time jiraComponentID = new(big.Rat).SetUint64(uint64(*row.JiraComponentID)) } + normalizedName := utils.NormalizeProwJobName(row.ProwJobName) entry := crstatus.TestJobRunRows{ TestKey: key, TestKeyStr: key.KeyOrDie(), TestName: row.TestName, - ProwJob: utils.NormalizeProwJobName(row.ProwJobName), + ProwJob: normalizedName, ProwJobRunID: row.ProwJobRunID, ProwJobURL: row.ProwJobURL, StartTime: row.ProwJobStart, @@ -629,7 +623,6 @@ func (p *PostgresProvider) queryTestDetails(release string, start, end time.Time JiraComponentID: jiraComponentID, } - normalizedName := utils.NormalizeProwJobName(row.ProwJobName) result[normalizedName] = append(result[normalizedName], entry) } From 6f4faac70ef4a561e201f30379c081b65136d66d Mon Sep 17 00:00:00 2001 From: Stephen Benjamin Date: Wed, 29 Apr 2026 07:58:57 -0400 Subject: [PATCH 6/8] Replace fixed sleep with PostgreSQL readiness check in dev-setup Poll with psql in a retry loop (1s interval, 30s timeout) instead of a hardcoded sleep 5, so seed-data only runs after PostgreSQL is actually accepting connections. Co-Authored-By: Claude Opus 4.6 --- scripts/dev-setup.sh | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/scripts/dev-setup.sh b/scripts/dev-setup.sh index 9a28579f18..e50f2590e9 100755 --- a/scripts/dev-setup.sh +++ b/scripts/dev-setup.sh @@ -55,7 +55,16 @@ echo "Starting Redis on port $REDIS_PORT..." 
$DOCKER run --name $REDIS_CONTAINER -p $REDIS_PORT:6379 -d quay.io/openshiftci/redis:latest echo "Waiting for PostgreSQL to be ready..." -sleep 5 +timeout=30 +elapsed=0 +until psql -h localhost -p "$PSQL_PORT" -U postgres -d postgres -c '\q' 2>/dev/null; do + if [ "$elapsed" -ge "$timeout" ]; then + echo "ERROR: PostgreSQL did not become ready within ${timeout}s" + exit 1 + fi + sleep 1 + elapsed=$((elapsed + 1)) +done DSN="postgresql://postgres:password@localhost:$PSQL_PORT/postgres" REDIS_URL="redis://localhost:$REDIS_PORT" From 377968a3838f0fbd222e88d9dabd63b025ca7317 Mon Sep 17 00:00:00 2001 From: Stephen Benjamin Date: Wed, 29 Apr 2026 13:49:55 -0400 Subject: [PATCH 7/8] Address coderabbit comments --- cmd/sippy/seed_data.go | 12 +++++++----- scripts/dev-setup.sh | 25 ++++++++++++------------- scripts/e2e.sh | 15 +++++++-------- test/e2e/util/e2erequest.go | 6 +++--- 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/cmd/sippy/seed_data.go b/cmd/sippy/seed_data.go index 5f7f9d7d04..7db0f0d1ad 100644 --- a/cmd/sippy/seed_data.go +++ b/cmd/sippy/seed_data.go @@ -5,6 +5,7 @@ import ( "database/sql" "fmt" "os" + "sort" "strings" "time" @@ -566,10 +567,9 @@ func seedJobRunsAndResults(dbc *db.DB) (int, int, error) { func seedRunsForJob(dbc *db.DB, suite *models.Suite, prowJob models.ProwJob, jrKey jobReleaseKey, runCount int, testIDsByName map[string]uint) (int, int, error) { start, end := releaseTimeWindow(jrKey.release) window := end.Sub(start) - interval := window / time.Duration(runCount) - infraRuns := 2 totalRuns := runCount + infraRuns + interval := window / time.Duration(totalRuns) runIDs := make([]uint, totalRuns) for i := range totalRuns { timestamp := start.Add(time.Duration(i) * interval) @@ -648,9 +648,10 @@ func seedRunsForJob(dbc *db.DB, suite *models.Suite, prowJob models.ProwJob, jrK if err := dbc.DB.Model(&models.ProwJobRun{}).Where("id = ?", runID). Updates(map[string]any{ - "overall_result": overallResult, - "succeeded": succeeded, - "failed": failed, + "overall_result": overallResult, + "succeeded": succeeded, + "failed": failed, + "infrastructure_failure": i >= runCount, }).Error; err != nil { return 0, 0, fmt.Errorf("failed to update ProwJobRun result: %w", err) } @@ -756,6 +757,7 @@ func variantMapToArray(m map[string]string) pq.StringArray { for k, v := range m { result = append(result, k+":"+v) } + sort.Strings(result) return result } diff --git a/scripts/dev-setup.sh b/scripts/dev-setup.sh index e50f2590e9..b1f682dc16 100755 --- a/scripts/dev-setup.sh +++ b/scripts/dev-setup.sh @@ -57,7 +57,7 @@ $DOCKER run --name $REDIS_CONTAINER -p $REDIS_PORT:6379 -d quay.io/openshiftci/r echo "Waiting for PostgreSQL to be ready..." 
timeout=30 elapsed=0 -until psql -h localhost -p "$PSQL_PORT" -U postgres -d postgres -c '\q' 2>/dev/null; do +until $DOCKER exec $PSQL_CONTAINER psql -U postgres -d postgres -c '\q' 2>/dev/null; do if [ "$elapsed" -ge "$timeout" ]; then echo "ERROR: PostgreSQL did not become ready within ${timeout}s" exit 1 @@ -80,9 +80,17 @@ echo " Redis: $REDIS_URL" echo "================================================" if [ "$SERVE" = true ]; then - GCS_ARGS="" + set -- \ + --listen ":$SIPPY_API_PORT" \ + --listen-metrics ":12112" \ + --database-dsn="$DSN" \ + --enable-write-endpoints \ + --log-level debug \ + --views config/e2e-views.yaml \ + --redis-url="$REDIS_URL" \ + --data-provider postgres if [ -n "$GCS_SA_JSON_PATH" ]; then - GCS_ARGS="--google-service-account-credential-file $GCS_SA_JSON_PATH" + set -- "$@" --google-service-account-credential-file "$GCS_SA_JSON_PATH" fi echo "" @@ -90,16 +98,7 @@ if [ "$SERVE" = true ]; then echo "Press Ctrl-C to stop" echo "" - ./sippy serve \ - --listen ":$SIPPY_API_PORT" \ - --listen-metrics ":12112" \ - --database-dsn="$DSN" \ - --enable-write-endpoints \ - --log-level debug \ - --views config/e2e-views.yaml \ - $GCS_ARGS \ - --redis-url="$REDIS_URL" \ - --data-provider postgres & + ./sippy serve "$@" & CHILD_PID=$! wait $CHILD_PID diff --git a/scripts/e2e.sh b/scripts/e2e.sh index 583b43d318..97283b217b 100755 --- a/scripts/e2e.sh +++ b/scripts/e2e.sh @@ -105,21 +105,20 @@ GOCOVERDIR="$COVDIR" ./sippy seed-data \ export SIPPY_API_PORT="18080" export SIPPY_ENDPOINT="127.0.0.1" -GCS_ARGS="" -if [ -n "$GCS_SA_JSON_PATH" ]; then - GCS_ARGS="--google-service-account-credential-file $GCS_SA_JSON_PATH" -fi - -GOCOVERDIR="$COVDIR" ./sippy serve \ +set -- \ --listen ":$SIPPY_API_PORT" \ --listen-metrics ":12112" \ --database-dsn="$SIPPY_E2E_DSN" \ --enable-write-endpoints \ --log-level debug \ --views config/e2e-views.yaml \ - $GCS_ARGS \ --redis-url="$REDIS_URL" \ - --data-provider postgres > e2e.log 2>&1 & + --data-provider postgres +if [ -n "$GCS_SA_JSON_PATH" ]; then + set -- "$@" --google-service-account-credential-file "$GCS_SA_JSON_PATH" +fi + +GOCOVERDIR="$COVDIR" ./sippy serve "$@" > e2e.log 2>&1 & CHILD_PID=$! wait_for_sippy || exit 1 diff --git a/test/e2e/util/e2erequest.go b/test/e2e/util/e2erequest.go index aca9bb91cb..ca8dbd40f2 100644 --- a/test/e2e/util/e2erequest.go +++ b/test/e2e/util/e2erequest.go @@ -12,9 +12,9 @@ import ( ) const ( - // Needs to match what we import in the e2e.sh script - Release = "4.20" - BaseRelease = "4.19" + // Needs to match the releases in config/e2e-views.yaml + Release = "4.22" + BaseRelease = "4.21" // APIPort is the port e2e.sh launches the sippy API on. These values must be kept in sync. APIPort = 18080 From 9513902d3038b2c49fe3e9b836e372115a70f563 Mon Sep 17 00:00:00 2001 From: Stephen Benjamin Date: Wed, 29 Apr 2026 19:30:18 +0000 Subject: [PATCH 8/8] Update CI e2e scripts to use seed-data and postgres provider Replace 'sippy load' (slow GCS import) with 'sippy seed-data' for CI e2e tests. This seeds deterministic synthetic data instead of importing real data from GCS, which is much faster and provides the release data (4.22, 4.21, etc.) that the tests expect. Also adds --data-provider postgres to the serve command, a wait-for-API health check after port-forward setup, and cache priming for component readiness views before running tests. GCS credentials are still provisioned for BigQuery-dependent tests (e.g. TestRegressionCacheLoader). 
Co-Authored-By: Claude Opus 4.6 --- .../sippy-e2e-sippy-e2e-setup-commands.sh | 49 +++++++------------ .../sippy-e2e-sippy-e2e-test-commands.sh | 28 +++++++++++ 2 files changed, 45 insertions(+), 32 deletions(-) diff --git a/e2e-scripts/sippy-e2e-sippy-e2e-setup-commands.sh b/e2e-scripts/sippy-e2e-sippy-e2e-setup-commands.sh index cd210804b5..873f8c21a3 100755 --- a/e2e-scripts/sippy-e2e-sippy-e2e-setup-commands.sh +++ b/e2e-scripts/sippy-e2e-sippy-e2e-setup-commands.sh @@ -283,12 +283,13 @@ spec: storage: 100Mi END -# Make the "sippy loader" pod. +# Seed the database with synthetic data for e2e tests. +# TODO: Add a scoped 'sippy load' test back (e.g. single job) to exercise the GCS loading path. cat << END | ${KUBECTL_CMD} apply -f - apiVersion: batch/v1 kind: Job metadata: - name: sippy-load-job + name: sippy-seed-job namespace: sippy-e2e spec: template: @@ -304,20 +305,9 @@ spec: terminationMessagePolicy: File command: ["/bin/sh", "-c"] args: - - /bin/sippy load --init-database --log-level=debug --release 4.20 --database-dsn=postgresql://postgres:password@postgres.sippy-e2e.svc.cluster.local:5432/postgres --redis-url=redis://redis.sippy-e2e.svc.cluster.local:6379 --mode=ocp --config ./config/e2e-openshift.yaml --google-service-account-credential-file /tmp/secrets/gcs-cred - env: - - name: GCS_SA_JSON_PATH - value: /tmp/secrets/gcs-cred - volumeMounts: - - mountPath: /tmp/secrets - name: gcs-cred - readOnly: true + - /bin/sippy seed-data --init-database --log-level=debug --database-dsn=postgresql://postgres:password@postgres.sippy-e2e.svc.cluster.local:5432/postgres imagePullSecrets: - name: regcred - volumes: - - name: gcs-cred - secret: - secretName: gcs-cred dnsPolicy: ClusterFirst restartPolicy: Never schedulerName: default-scheduler @@ -327,33 +317,28 @@ spec: END date -echo "Waiting for sippy loader job to finish ..." -${KUBECTL_CMD} -n sippy-e2e get job sippy-load-job -${KUBECTL_CMD} -n sippy-e2e describe job sippy-load-job +echo "Waiting for sippy seed job to finish ..." +${KUBECTL_CMD} -n sippy-e2e get job sippy-seed-job -# We set +e to avoid the script aborting before we can retrieve logs. set +e - -echo "Waiting up to ${SIPPY_LOAD_TIMEOUT:=1200s} for the sippy-load-job to complete..." -${KUBECTL_CMD} -n sippy-e2e wait --for=condition=complete job/sippy-load-job --timeout ${SIPPY_LOAD_TIMEOUT} -retVal=$? +echo "Waiting up to 300s for the sippy-seed-job to complete..." +${KUBECTL_CMD} -n sippy-e2e wait --for=condition=complete job/sippy-seed-job --timeout 300s +seedRetVal=$? 
set -e -job_pod=$(${KUBECTL_CMD} -n sippy-e2e get pod --selector=job-name=sippy-load-job --output=jsonpath='{.items[0].metadata.name}') -${KUBECTL_CMD} -n sippy-e2e logs ${job_pod} > ${ARTIFACT_DIR}/sippy-load.log 2>&1 +seed_pod=$(${KUBECTL_CMD} -n sippy-e2e get pod --selector=job-name=sippy-seed-job --output=jsonpath='{.items[0].metadata.name}') +${KUBECTL_CMD} -n sippy-e2e logs ${seed_pod} > ${ARTIFACT_DIR}/sippy-seed.log 2>&1 -if [ ${retVal} -ne 0 ]; then +if [ ${seedRetVal} -ne 0 ]; then echo - echo "=== SIPPY LOAD JOB FAILURE DIAGNOSTICS ===" + echo "=== SIPPY SEED JOB FAILURE DIAGNOSTICS ===" echo "=== Job status ===" - ${KUBECTL_CMD} -n sippy-e2e describe job sippy-load-job + ${KUBECTL_CMD} -n sippy-e2e describe job sippy-seed-job echo "=== Job pod status ===" - ${KUBECTL_CMD} -n sippy-e2e describe pod ${job_pod} - echo "=== Recent namespace events ===" - ${KUBECTL_CMD} -n sippy-e2e get events --sort-by='.lastTimestamp' - echo "=== END SIPPY LOAD JOB FAILURE DIAGNOSTICS ===" + ${KUBECTL_CMD} -n sippy-e2e describe pod ${seed_pod} + echo "=== END SIPPY SEED JOB FAILURE DIAGNOSTICS ===" echo - echo "ERROR: sippy-load-job did not complete within ${SIPPY_LOAD_TIMEOUT}" + echo "ERROR: sippy-seed-job did not complete within 300s" exit 1 fi diff --git a/e2e-scripts/sippy-e2e-sippy-e2e-test-commands.sh b/e2e-scripts/sippy-e2e-sippy-e2e-test-commands.sh index 72b05151d9..6f34382b33 100755 --- a/e2e-scripts/sippy-e2e-sippy-e2e-test-commands.sh +++ b/e2e-scripts/sippy-e2e-sippy-e2e-test-commands.sh @@ -83,6 +83,8 @@ spec: - ocp - --views - ./config/e2e-views.yaml + - --data-provider + - postgres env: - name: GCS_SA_JSON_PATH value: /tmp/secrets/gcs-cred @@ -166,6 +168,32 @@ ${KUBECTL_CMD} -n sippy-e2e port-forward pod/redis1 ${SIPPY_REDIS_PORT}:6379 & ${KUBECTL_CMD} -n sippy-e2e get svc,ep +# Wait for the sippy API to be reachable through the port-forward +echo "Waiting for sippy API to be reachable on port ${SIPPY_API_PORT}..." +TIMEOUT=120 +ELAPSED=0 +while [ $ELAPSED -lt $TIMEOUT ]; do + if curl -s "http://localhost:${SIPPY_API_PORT}/api/health" > /dev/null 2>&1; then + echo "Sippy API is ready after ${ELAPSED}s" + break + fi + sleep 2 + ELAPSED=$((ELAPSED + 2)) +done +if [ $ELAPSED -ge $TIMEOUT ]; then + echo "ERROR: Timed out waiting for sippy API after ${TIMEOUT}s" + exit 1 +fi + +# Prime the component readiness cache so triage tests can find cached reports +echo "Priming component readiness cache..." +VIEWS=$(curl -sf "http://localhost:${SIPPY_API_PORT}/api/component_readiness/views") || { echo "Failed to fetch views"; exit 1; } +for VIEW in $(echo "$VIEWS" | jq -r '.[].name'); do + echo " Priming cache for view: $VIEW" + curl -sf "http://localhost:${SIPPY_API_PORT}/api/component_readiness?view=$VIEW" > /dev/null || { echo "Failed to prime cache for view: $VIEW"; exit 1; } +done +echo "Cache priming complete" + # only 1 in parallel, some tests will clash if run at the same time gotestsum --junitfile ${ARTIFACT_DIR}/junit_e2e.xml -- ./test/e2e/... -v -p 1 -coverprofile=${ARTIFACT_DIR}/e2e-test-coverage.out -coverpkg=./pkg/...,./cmd/... TEST_EXIT=$?