diff --git a/Makefile b/Makefile index 05a0f8386d..a72bc0afe4 100644 --- a/Makefile +++ b/Makefile @@ -74,6 +74,9 @@ verify-apm: apm e2e: ./scripts/e2e.sh +dev: sippy + ./scripts/dev-setup.sh $(if $(SERVE),--serve) + images: $(DOCKER) build . diff --git a/cmd/sippy/seed_data.go b/cmd/sippy/seed_data.go index e6829df337..7db0f0d1ad 100644 --- a/cmd/sippy/seed_data.go +++ b/cmd/sippy/seed_data.go @@ -1,15 +1,26 @@ package main import ( + "context" + "database/sql" "fmt" - "math/rand" + "os" + "sort" + "strings" "time" + "github.com/lib/pq" "github.com/pkg/errors" log "github.com/sirupsen/logrus" "github.com/spf13/cobra" "github.com/spf13/pflag" + "gopkg.in/yaml.v3" + componentreadiness "github.com/openshift/sippy/pkg/api/componentreadiness" + pgprovider "github.com/openshift/sippy/pkg/api/componentreadiness/dataprovider/postgres" + "github.com/openshift/sippy/pkg/api/componentreadiness/utils" + apitype "github.com/openshift/sippy/pkg/apis/api" + "github.com/openshift/sippy/pkg/apis/api/componentreport/reqopts" v1 "github.com/openshift/sippy/pkg/apis/sippyprocessing/v1" "github.com/openshift/sippy/pkg/db" "github.com/openshift/sippy/pkg/db/models" @@ -19,43 +30,19 @@ import ( ) type SeedDataFlags struct { - DBFlags *flags.PostgresFlags - CacheFlags *flags.CacheFlags - InitDatabase bool - Releases []string - JobsPerRelease int - TestNames []string - RunsPerJob int + DBFlags *flags.PostgresFlags + InitDatabase bool } func NewSeedDataFlags() *SeedDataFlags { return &SeedDataFlags{ - DBFlags: flags.NewPostgresDatabaseFlags(), - CacheFlags: flags.NewCacheFlags(), - Releases: []string{"5.0", "4.22", "4.21"}, // Default releases - JobsPerRelease: 3, // Default jobs per release - TestNames: []string{ - "install should succeed: infrastructure", - "install should succeed: overall", - "install should succeed: configuration", - "install should succeed: cluster bootstrap", - "install should succeed: other", - "[sig-cluster-lifecycle] Cluster completes upgrade", - "[sig-sippy] upgrade should work", - "[sig-sippy] openshift-tests should work", - }, - RunsPerJob: 20, // Default runs per job + DBFlags: flags.NewPostgresDatabaseFlags(), } } func (f *SeedDataFlags) BindFlags(fs *pflag.FlagSet) { f.DBFlags.BindFlags(fs) - f.CacheFlags.BindFlags(fs) fs.BoolVar(&f.InitDatabase, "init-database", false, "Initialize the DB schema before seeding data") - fs.StringSliceVar(&f.Releases, "release", f.Releases, "Releases to create ProwJobs for (can be specified multiple times)") - fs.IntVar(&f.JobsPerRelease, "jobs", f.JobsPerRelease, "Number of ProwJobs to create for each release") - fs.StringSliceVar(&f.TestNames, "test", f.TestNames, "Test names to create (can be specified multiple times)") - fs.IntVar(&f.RunsPerJob, "runs", f.RunsPerJob, "Number of ProwJobRuns to create for each ProwJob") } func NewSeedDataCommand() *cobra.Command { @@ -65,16 +52,19 @@ func NewSeedDataCommand() *cobra.Command { Use: "seed-data", Short: "Populate test data in the database", Long: `Populate test data in the database for development purposes. -This command creates sample ProwJob and Test records with realistic test data -that can be used for local development and testing. -Test results are randomized with 85% pass rate, 10% flake rate, and 5% failure rate. -All counts, releases, and test names are configurable via command-line flags. 
+Creates deterministic Component Readiness data covering all CR statuses +(NotSignificant, SignificantRegression, ExtremeRegression, MissingSample, +MissingBasis, BasisOnly, SignificantImprovement, BelowMinFailure) and +fallback scenarios. Use with 'sippy serve --data-provider postgres'. -The command can be re-run as needed to add more runs, or because your old job runs -rolled off the 1 week window. +Drop and recreate the database to re-seed (e.g. docker compose down -v). `, RunE: func(cmd *cobra.Command, args []string) error { + if strings.Contains(f.DBFlags.DSN, "amazonaws.com") { + return fmt.Errorf("refusing to seed synthetic data into a production database") + } + dbc, err := f.DBFlags.GetDBClient() if err != nil { return errors.WithMessage(err, "could not connect to database") @@ -89,255 +79,696 @@ rolled off the 1 week window. log.Info("Database schema initialized successfully") } - cacheClient, cacheErr := f.CacheFlags.GetCacheClient() - if cacheErr != nil { - return fmt.Errorf("failed to get cache client: %v", cacheErr) - } else if cacheClient == nil { - log.Warn("no cache provided; refresh timestamps will not be cached") - } - log.Info("Starting to seed test data...") + return seedSyntheticData(dbc) + }, + } - // Create the test suite - if err := createTestSuite(dbc); err != nil { - return errors.WithMessage(err, "failed to create test suite") - } - log.Info("Created test suite 'ourtests'") + f.BindFlags(cmd.Flags()) - // Create ProwJobs for each release - for _, release := range f.Releases { - if err := createProwJobsForRelease(dbc, release, f.JobsPerRelease); err != nil { - return errors.WithMessagef(err, "failed to create ProwJobs for release %s", release) - } - log.Infof("Processed %d ProwJobs for release %s", f.JobsPerRelease, release) - } + return cmd +} - // Create Test models - if err := createTestModels(dbc, f.TestNames); err != nil { - return errors.WithMessage(err, "failed to create Test models") - } - log.Infof("Processed %d Test models", len(f.TestNames)) +// --- Synthetic data seeding --- - // Create labels and symptoms - if err := createLabelsAndSymptoms(dbc); err != nil { - return errors.WithMessage(err, "failed to create labels and symptoms") - } - log.Info("Created sample labels and symptoms") +// syntheticJobDef defines a job with its full 9-key variant map. +type syntheticJobDef struct { + nameTemplate string + variants map[string]string +} - // Create ProwJobRuns for each ProwJob - if err := createProwJobRuns(dbc, f.RunsPerJob); err != nil { - return errors.WithMessage(err, "failed to create ProwJobRuns") - } - log.Info("Created ProwJobRuns and test results for all ProwJobs") +// syntheticTestSpec defines a test with deterministic pass/fail counts per release per job. +type syntheticTestSpec struct { + testID string + testName string + component string + capabilities []string + // Each entry maps a job name template -> per-release counts. + // The job template determines which variants the test runs with. 
+ jobCounts map[string]map[string]testCount // jobTemplate -> release -> counts +} - // Apply labels to job runs - if err := applyLabelsToJobRuns(dbc); err != nil { - return errors.WithMessage(err, "failed to apply labels to job runs") - } - log.Info("Applied labels to ~25% of job runs") +type testCount struct { + total int + success int + flake int +} - totalProwJobs := len(f.Releases) * f.JobsPerRelease - totalRuns := totalProwJobs * f.RunsPerJob - totalTestResults := totalRuns * len(f.TestNames) +var syntheticReleases = []string{"4.22", "4.21", "4.20", "4.19"} - log.Info("Refreshing materialized views...") - sippyserver.RefreshData(dbc, cacheClient, false) +var syntheticJobs = []syntheticJobDef{ + { + nameTemplate: "periodic-ci-openshift-release-master-ci-%s-upgrade-from-stable-4.21-e2e-aws-ovn-upgrade", + variants: map[string]string{ + "Platform": "aws", "Architecture": "amd64", "Network": "ovn", + "Topology": "ha", "Installer": "ipi", "FeatureSet": "default", + "Suite": "unknown", "Upgrade": "minor", "LayeredProduct": "none", + }, + }, + { + nameTemplate: "periodic-ci-openshift-release-master-ci-%s-e2e-aws-ovn-amd64", + variants: map[string]string{ + "Platform": "aws", "Architecture": "amd64", "Network": "ovn", + "Topology": "ha", "Installer": "ipi", "FeatureSet": "default", + "Suite": "parallel", "Upgrade": "none", "LayeredProduct": "none", + }, + }, + { + nameTemplate: "periodic-ci-openshift-release-master-ci-%s-e2e-aws-ovn-arm64", + variants: map[string]string{ + "Platform": "aws", "Architecture": "arm64", "Network": "ovn", + "Topology": "ha", "Installer": "ipi", "FeatureSet": "default", + "Suite": "parallel", "Upgrade": "none", "LayeredProduct": "none", + }, + }, + { + nameTemplate: "periodic-ci-openshift-release-master-ci-%s-e2e-aws-ovn-techpreview-serial", + variants: map[string]string{ + "Platform": "aws", "Architecture": "amd64", "Network": "ovn", + "Topology": "ha", "Installer": "ipi", "FeatureSet": "techpreview", + "Suite": "serial", "Upgrade": "none", "LayeredProduct": "none", + }, + }, + { + nameTemplate: "periodic-ci-openshift-release-master-ci-%s-e2e-gcp-ovn-amd64", + variants: map[string]string{ + "Platform": "gcp", "Architecture": "amd64", "Network": "ovn", + "Topology": "ha", "Installer": "ipi", "FeatureSet": "default", + "Suite": "parallel", "Upgrade": "none", "LayeredProduct": "none", + }, + }, + { + nameTemplate: "periodic-ci-openshift-release-master-ci-%s-e2e-gcp-ovn-upgrade-micro", + variants: map[string]string{ + "Platform": "gcp", "Architecture": "amd64", "Network": "ovn", + "Topology": "ha", "Installer": "ipi", "FeatureSet": "default", + "Suite": "unknown", "Upgrade": "micro", "LayeredProduct": "none", + }, + }, +} + +// Job template constants for referencing specific jobs in test specs. +const awsAmd64Parallel = "periodic-ci-openshift-release-master-ci-%s-e2e-aws-ovn-amd64" +const awsArm64Parallel = "periodic-ci-openshift-release-master-ci-%s-e2e-aws-ovn-arm64" +const gcpAmd64Parallel = "periodic-ci-openshift-release-master-ci-%s-e2e-gcp-ovn-amd64" + +// allJobTemplates returns name templates from syntheticJobs for use in test specs +// that should run on every job (e.g. install tests). +func allJobTemplates() []string { + templates := make([]string, len(syntheticJobs)) + for i, j := range syntheticJobs { + templates[i] = j.nameTemplate + } + return templates +} - log.Infof("Successfully seeded test data! 
Created %d ProwJobs, %d Tests, %d ProwJobRuns, and %d test results", - totalProwJobs, len(f.TestNames), totalRuns, totalTestResults) - return nil +// allJobCounts builds a jobCounts map that assigns the given per-release counts +// to every synthetic job. Used for tests like install indicators that run everywhere. +func allJobCounts(releaseCounts map[string]testCount) map[string]map[string]testCount { + result := make(map[string]map[string]testCount, len(syntheticJobs)) + for _, tpl := range allJobTemplates() { + result[tpl] = releaseCounts + } + return result +} + +var syntheticTests = []syntheticTestSpec{ + // --- NotSignificant: appears in 3 jobs across 2 platforms --- + { + testID: "test-not-significant", testName: "[sig-arch] Check build pods use all cpu cores", + component: "comp-NotSignificant", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: {"4.21": {100, 95, 0}, "4.22": {100, 93, 0}}, + awsArm64Parallel: {"4.21": {80, 76, 0}, "4.22": {80, 75, 0}}, + gcpAmd64Parallel: {"4.21": {100, 97, 0}, "4.22": {100, 95, 0}}, + }, + }, + + // --- SignificantRegression: regressed on aws/amd64, fine elsewhere --- + { + testID: "test-significant-regression", testName: "[sig-network] Services should serve endpoints on same port and different protocol", + component: "comp-SignificantRegression", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: {"4.21": {200, 190, 0}, "4.22": {200, 170, 0}}, + awsArm64Parallel: {"4.21": {180, 171, 0}, "4.22": {180, 168, 0}}, + gcpAmd64Parallel: {"4.21": {200, 190, 0}, "4.22": {200, 188, 0}}, + }, + }, + + // --- ExtremeRegression: extreme on aws/amd64, significant on others --- + { + testID: "test-extreme-regression", testName: "[sig-etcd] etcd leader changes are not excessive", + component: "comp-ExtremeRegression", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: {"4.21": {200, 190, 0}, "4.22": {200, 140, 0}}, + awsArm64Parallel: {"4.21": {200, 190, 0}, "4.22": {200, 170, 0}}, + gcpAmd64Parallel: {"4.21": {200, 190, 0}, "4.22": {200, 170, 0}}, + }, + }, + + // --- MissingSample: test in base, 0 sample runs --- + { + testID: "test-missing-sample", testName: "[sig-storage] CSI volumes should be mountable", + component: "comp-MissingSample", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: {"4.21": {100, 95, 0}, "4.22": {0, 0, 0}}, + }, + }, + + // --- MissingBasis: test only in sample --- + { + testID: "test-missing-basis", testName: "[sig-node] New pod lifecycle test", + component: "comp-MissingBasis", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: {"4.22": {100, 95, 0}}, + }, + }, + + // --- NewTestPassRateRegression: new test only in sample, below PassRateRequiredNewTests threshold --- + { + testID: "test-new-test-pass-rate-fail", testName: "[sig-node] New flaky pod readiness test", + component: "comp-NewTestPassRate", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: {"4.22": {100, 70, 0}}, + }, + }, + + // --- BasisOnly: test in base, absent from sample --- + { + testID: "test-basis-only", testName: "[sig-apps] Removed deployment test", + component: "comp-BasisOnly", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: {"4.21": {100, 95, 0}}, + }, + }, + + // --- SignificantImprovement: 80% -> 95% --- + { + testID: 
"test-significant-improvement", testName: "[sig-cli] oc adm should handle upgrades gracefully", + component: "comp-SignificantImprovement", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: {"4.21": {200, 160, 0}, "4.22": {200, 190, 0}}, + }, + }, + + // --- BelowMinFailure: only 2 failures, below MinimumFailure=3 --- + { + testID: "test-below-min-failure", testName: "[sig-auth] RBAC should allow access with valid token", + component: "comp-BelowMinFailure", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: {"4.21": {100, 100, 0}, "4.22": {100, 98, 0}}, + }, + }, + + // --- Fallback: 4.21 worse, 4.20 better -> swaps to 4.20 --- + { + testID: "test-fallback-improves", testName: "[sig-instrumentation] Metrics should report accurate cpu usage", + component: "comp-FallbackImproves", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: { + "4.21": {200, 180, 0}, + "4.20": {200, 194, 0}, + "4.22": {200, 160, 0}, + }, + }, + }, + + // --- Double fallback: 4.21->4.20->4.19 --- + { + testID: "test-fallback-double", testName: "[sig-scheduling] Scheduler should spread pods evenly", + component: "comp-FallbackDouble", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: { + "4.21": {200, 180, 0}, + "4.20": {200, 186, 0}, + "4.19": {200, 194, 0}, + "4.22": {200, 160, 0}, + }, + }, + }, + + // --- Fallback insufficient runs: 4.20 has <60% of 4.21 count --- + { + testID: "test-fallback-insufficient-runs", testName: "[sig-network] DNS should resolve cluster services", + component: "comp-FallbackInsufficient", capabilities: []string{"cap1"}, + jobCounts: map[string]map[string]testCount{ + awsAmd64Parallel: { + "4.21": {1000, 940, 0}, + "4.20": {100, 99, 0}, + "4.22": {1000, 850, 0}, + }, }, + }, + + // --- Install / health indicator tests: run on every job, every release --- + { + testID: "test-install-overall", testName: "install should succeed: overall", + component: "comp-Install", capabilities: []string{"install"}, + jobCounts: allJobCounts(map[string]testCount{ + "4.22": {100, 95, 0}, "4.21": {100, 96, 0}, "4.20": {100, 97, 0}, "4.19": {100, 97, 0}, + }), + }, + { + testID: "test-install-config", testName: "install should succeed: configuration", + component: "comp-Install", capabilities: []string{"install"}, + jobCounts: allJobCounts(map[string]testCount{ + "4.22": {100, 97, 0}, "4.21": {100, 98, 0}, "4.20": {100, 98, 0}, "4.19": {100, 98, 0}, + }), + }, + { + testID: "test-install-bootstrap", testName: "install should succeed: cluster bootstrap", + component: "comp-Install", capabilities: []string{"install"}, + jobCounts: allJobCounts(map[string]testCount{ + "4.22": {100, 96, 0}, "4.21": {100, 97, 0}, "4.20": {100, 97, 0}, "4.19": {100, 97, 0}, + }), + }, + { + testID: "test-install-other", testName: "install should succeed: other", + component: "comp-Install", capabilities: []string{"install"}, + jobCounts: allJobCounts(map[string]testCount{ + "4.22": {100, 98, 0}, "4.21": {100, 99, 0}, "4.20": {100, 99, 0}, "4.19": {100, 99, 0}, + }), + }, + { + testID: "test-install-infra", testName: "install should succeed: infrastructure", + component: "comp-Install", capabilities: []string{"install"}, + jobCounts: allJobCounts(map[string]testCount{ + "4.22": {100, 96, 0}, "4.21": {100, 97, 0}, "4.20": {100, 97, 0}, "4.19": {100, 97, 0}, + }), + }, + { + testID: "test-upgrade", testName: "[sig-sippy] upgrade should work", + 
component: "comp-Install", capabilities: []string{"upgrade"}, + jobCounts: allJobCounts(map[string]testCount{ + "4.22": {100, 94, 0}, "4.21": {100, 95, 0}, "4.20": {100, 96, 0}, "4.19": {100, 96, 0}, + }), + }, + { + testID: "test-openshift-tests", testName: "[sig-sippy] openshift-tests should work", + component: "comp-Install", capabilities: []string{"tests"}, + jobCounts: allJobCounts(map[string]testCount{ + "4.22": {100, 90, 0}, "4.21": {100, 92, 0}, "4.20": {100, 93, 0}, "4.19": {100, 93, 0}, + }), + }, +} + +// releaseTimeWindow returns the start/end times for a release's test data. +func releaseTimeWindow(release string) (start, end time.Time) { + now := time.Now().UTC().Truncate(time.Hour) + switch release { + case "4.22": + return now.Add(-3 * 24 * time.Hour), now + case "4.21": + return now.Add(-60 * 24 * time.Hour), now.Add(-30 * 24 * time.Hour) + case "4.20": + return now.Add(-120 * 24 * time.Hour), now.Add(-90 * 24 * time.Hour) + case "4.19": + return now.Add(-180 * 24 * time.Hour), now.Add(-150 * 24 * time.Hour) + default: + return now.Add(-14 * 24 * time.Hour), now } +} - f.BindFlags(cmd.Flags()) +func seedSyntheticData(dbc *db.DB) error { + // Check if data already exists + var count int64 + if err := dbc.DB.Model(&models.ProwJob{}).Count(&count).Error; err != nil { + return fmt.Errorf("failed to check for existing data: %w", err) + } + if count > 0 { + log.Infof("Database already contains %d ProwJobs, skipping seed. Drop and recreate the database to re-seed (e.g. docker compose down -v).", count) + return nil + } - return cmd + if err := createTestSuite(dbc, "synthetic"); err != nil { + return errors.WithMessage(err, "failed to create test suite") + } + log.Info("Created test suite 'synthetic'") + + if err := seedProwJobs(dbc); err != nil { + return err + } + + if err := seedTestsAndOwnerships(dbc); err != nil { + return err + } + + totalRuns, totalResults, err := seedJobRunsAndResults(dbc) + if err != nil { + return err + } + + if err := createLabelsAndSymptoms(dbc); err != nil { + return errors.WithMessage(err, "failed to create labels and symptoms") + } + + log.Info("Refreshing materialized views...") + sippyserver.RefreshData(dbc, nil, false) + + log.Info("Syncing regressions...") + if err := syncRegressions(dbc); err != nil { + return errors.WithMessage(err, "failed to sync regressions") + } + + log.Infof("Seeded synthetic data: %d ProwJobRuns, %d test results across %d releases", + totalRuns, totalResults, len(syntheticReleases)) + return nil } -func createProwJobsForRelease(dbc *db.DB, release string, jobsPerRelease int) error { - for i := 1; i <= jobsPerRelease; i++ { - // Choose JobTier based on whether i is even or odd - var jobTier = "JobTier:standard" // even number job index = standard - if i%2 != 0 { - jobTier = "JobTier:hidden" // odd = hidden +func seedProwJobs(dbc *db.DB) error { + for _, release := range syntheticReleases { + for _, job := range syntheticJobs { + name := fmt.Sprintf(job.nameTemplate, release) + variants := variantMapToArray(job.variants) + prowJob := models.ProwJob{ + Kind: models.ProwKind("periodic"), + Name: name, + Release: release, + Variants: variants, + } + var existing models.ProwJob + if err := dbc.DB.Where("name = ?", name).FirstOrCreate(&existing, prowJob).Error; err != nil { + return fmt.Errorf("failed to create ProwJob %s: %w", name, err) + } } + } + log.Infof("Created ProwJobs for %d releases x %d jobs", len(syntheticReleases), len(syntheticJobs)) + return nil +} + +type testInfo struct { + name string + uniqueID string + 
component string + capabilities []string +} + +func seedTestsAndOwnerships(dbc *db.DB) error { + var suite models.Suite + if err := dbc.DB.Where("name = ?", "synthetic").First(&suite).Error; err != nil { + return fmt.Errorf("failed to find suite: %w", err) + } - prowJob := models.ProwJob{ - Kind: models.ProwKind("periodic"), - Name: fmt.Sprintf("sippy-test-job-%s-test-%d", release, i), - Release: release, - // TestGridURL, Bugs, and JobRuns are left empty as requested - Variants: []string{"Platform:aws", "Upgrade:none", jobTier}, + seenTests := map[string]testInfo{} + for _, ts := range syntheticTests { + if _, ok := seenTests[ts.testName]; !ok { + seenTests[ts.testName] = testInfo{ + name: ts.testName, + uniqueID: ts.testID, + component: ts.component, + capabilities: ts.capabilities, + } } + } - // Use FirstOrCreate to avoid duplicates - only creates if a ProwJob with this name doesn't exist - var existingJob models.ProwJob - if err := dbc.DB.Where("name = ?", prowJob.Name).FirstOrCreate(&existingJob, prowJob).Error; err != nil { - return fmt.Errorf("failed to create or find ProwJob %s: %v", prowJob.Name, err) + for _, info := range seenTests { + testModel := models.Test{Name: info.name} + var existingTest models.Test + if err := dbc.DB.Where("name = ?", info.name).FirstOrCreate(&existingTest, testModel).Error; err != nil { + return fmt.Errorf("failed to create Test %s: %w", info.name, err) } - // Log whether we created a new job or found an existing one - if existingJob.CreatedAt.IsZero() || existingJob.CreatedAt.Equal(existingJob.UpdatedAt) { - log.Debugf("Created new ProwJob: %s", prowJob.Name) - } else { - log.Debugf("ProwJob already exists: %s", prowJob.Name) + ownership := models.TestOwnership{ + UniqueID: info.uniqueID, + Name: info.name, + TestID: existingTest.ID, + Suite: "synthetic", + SuiteID: &suite.ID, + Component: info.component, + Capabilities: info.capabilities, + } + var existingOwnership models.TestOwnership + if err := dbc.DB.Where("name = ? 
AND suite = ?", info.name, "synthetic").FirstOrCreate(&existingOwnership, ownership).Error; err != nil { + return fmt.Errorf("failed to create TestOwnership for %s: %w", info.name, err) } } - + log.Infof("Created %d tests with ownership records", len(seenTests)) return nil } -func createTestModels(dbc *db.DB, testNames []string) error { - for _, testName := range testNames { - testModel := models.Test{ - Name: testName, +type jobReleaseKey struct { + jobTemplate string + release string +} + +func seedJobRunsAndResults(dbc *db.DB) (int, int, error) { + var suite models.Suite + if err := dbc.DB.Where("name = ?", "synthetic").First(&suite).Error; err != nil { + return 0, 0, fmt.Errorf("failed to find suite: %w", err) + } + + maxRuns := map[jobReleaseKey]int{} + for _, ts := range syntheticTests { + for jobTpl, releaseCounts := range ts.jobCounts { + for release, counts := range releaseCounts { + key := jobReleaseKey{jobTpl, release} + if counts.total > maxRuns[key] { + maxRuns[key] = counts.total + } + } } + } - // Use FirstOrCreate to avoid duplicates - only creates if a Test with this name doesn't exist - var existingTest models.Test - if err := dbc.DB.Where("name = ?", testModel.Name).FirstOrCreate(&existingTest, testModel).Error; err != nil { - return fmt.Errorf("failed to create or find Test %s: %v", testModel.Name, err) + testIDsByName := map[string]uint{} + var allTests []models.Test + if err := dbc.DB.Find(&allTests).Error; err != nil { + return 0, 0, fmt.Errorf("failed to fetch tests: %w", err) + } + for _, t := range allTests { + testIDsByName[t.Name] = t.ID + } + + totalRuns := 0 + totalResults := 0 + for jrKey, runCount := range maxRuns { + if runCount == 0 { + continue } - if existingTest.CreatedAt.IsZero() || existingTest.CreatedAt.Equal(existingTest.UpdatedAt) { - log.Debugf("Created new Test: %s", testModel.Name) - } else { - log.Debugf("Test already exists: %s", testModel.Name) + jobName := fmt.Sprintf(jrKey.jobTemplate, jrKey.release) + var prowJob models.ProwJob + if err := dbc.DB.Where("name = ?", jobName).First(&prowJob).Error; err != nil { + return 0, 0, fmt.Errorf("failed to find ProwJob %s: %w", jobName, err) + } + + runs, results, err := seedRunsForJob(dbc, &suite, prowJob, jrKey, runCount, testIDsByName) + if err != nil { + return 0, 0, err } + totalRuns += runs + totalResults += results + + log.Debugf("Created %d runs for %s", runCount, jobName) } - return nil + return totalRuns, totalResults, nil } -func createTestSuite(dbc *db.DB) error { - suite := models.Suite{ - Name: "ourtests", +func seedRunsForJob(dbc *db.DB, suite *models.Suite, prowJob models.ProwJob, jrKey jobReleaseKey, runCount int, testIDsByName map[string]uint) (int, int, error) { + start, end := releaseTimeWindow(jrKey.release) + window := end.Sub(start) + infraRuns := 2 + totalRuns := runCount + infraRuns + interval := window / time.Duration(totalRuns) + runIDs := make([]uint, totalRuns) + for i := range totalRuns { + timestamp := start.Add(time.Duration(i) * interval) + run := models.ProwJobRun{ + ProwJobID: prowJob.ID, + Cluster: "build01", + Timestamp: timestamp, + Duration: 3 * time.Hour, + } + if err := dbc.DB.Create(&run).Error; err != nil { + return 0, 0, fmt.Errorf("failed to create ProwJobRun: %w", err) + } + runIDs[i] = run.ID } - // Use FirstOrCreate to avoid duplicates - var existingSuite models.Suite - if err := dbc.DB.Where("name = ?", suite.Name).FirstOrCreate(&existingSuite, suite).Error; err != nil { - return fmt.Errorf("failed to create or find Suite %s: %v", suite.Name, err) + 
runsWithFailure := map[uint]bool{} + totalResults := 0 + + for _, ts := range syntheticTests { + releaseCounts, hasJob := ts.jobCounts[jrKey.jobTemplate] + if !hasJob { + continue + } + counts, hasRelease := releaseCounts[jrKey.release] + if !hasRelease || counts.total == 0 { + continue + } + + testID, ok := testIDsByName[ts.testName] + if !ok { + return 0, 0, fmt.Errorf("test %q not found in DB", ts.testName) + } + + for i := 0; i < counts.total && i < runCount; i++ { + var status int + switch { + case i < counts.success-counts.flake: + status = 1 // pass + case i < counts.success: + status = 13 // flake (counts as success too) + default: + status = 12 // failure + runsWithFailure[runIDs[i]] = true + } + + result := models.ProwJobRunTest{ + ProwJobRunID: runIDs[i], + TestID: testID, + SuiteID: &suite.ID, + Status: status, + Duration: 5.0, + CreatedAt: start.Add(time.Duration(i) * interval), + } + if err := dbc.DB.Create(&result).Error; err != nil { + return 0, 0, fmt.Errorf("failed to create ProwJobRunTest: %w", err) + } + totalResults++ + } } - return nil -} + // Set OverallResult on all runs + for i, runID := range runIDs { + var overallResult v1.JobOverallResult + var succeeded, failed bool + + if i >= runCount { + overallResult = v1.JobInternalInfrastructureFailure + failed = true + } else if runsWithFailure[runID] { + overallResult = v1.JobTestFailure + failed = true + } else { + overallResult = v1.JobSucceeded + succeeded = true + } -func createProwJobRuns(dbc *db.DB, runsPerJob int) error { - var prowJobs []models.ProwJob - if err := dbc.DB.Find(&prowJobs).Error; err != nil { - return fmt.Errorf("failed to fetch existing ProwJobs: %v", err) + if err := dbc.DB.Model(&models.ProwJobRun{}).Where("id = ?", runID). + Updates(map[string]any{ + "overall_result": overallResult, + "succeeded": succeeded, + "failed": failed, + "infrastructure_failure": i >= runCount, + }).Error; err != nil { + return 0, 0, fmt.Errorf("failed to update ProwJobRun result: %w", err) + } } - var tests []models.Test - if err := dbc.DB.Find(&tests).Error; err != nil { - return fmt.Errorf("failed to fetch existing Tests: %v", err) + // Update test_failures count + if err := dbc.DB.Exec(` + UPDATE prow_job_runs SET test_failures = COALESCE(( + SELECT COUNT(*) FROM prow_job_run_tests + WHERE prow_job_run_id = prow_job_runs.id AND status = 12 + ), 0) WHERE prow_job_id = ?`, prowJob.ID).Error; err != nil { + return 0, 0, fmt.Errorf("updating test_failures for prow job %s: %w", prowJob.Name, err) } - var suite models.Suite - if err := dbc.DB.Where("name = ?", "ourtests").First(&suite).Error; err != nil { - return fmt.Errorf("failed to find Suite 'ourtests': %v", err) + return totalRuns, totalResults, nil +} + +func syncRegressions(dbc *db.DB) error { + provider := pgprovider.NewPostgresProvider(dbc, nil) + ctx := context.Background() + + releases, err := provider.QueryReleases(ctx) + if err != nil { + return fmt.Errorf("querying releases: %w", err) } - log.Infof("Found %d ProwJobs, creating %d runs for each", len(prowJobs), runsPerJob) + viewsData, err := os.ReadFile(syntheticViewsFile) + if err != nil { + return fmt.Errorf("reading views file: %w", err) + } + var views apitype.SippyViews + if err := yaml.Unmarshal(viewsData, &views); err != nil { + return fmt.Errorf("parsing views file: %w", err) + } - // Calculate time range: past 2 weeks from now - now := time.Now() - twoWeeksAgo := now.AddDate(0, 0, -14) + backend := componentreadiness.NewPostgresRegressionStore(dbc, nil) + rLog := log.WithField("source", 
"seed-regression-sync") - // Duration for each run: 3 hours - runDuration := 3 * time.Hour + for _, view := range views.ComponentReadiness { + baseRelease, err := utils.GetViewReleaseOptions(releases, "basis", view.BaseRelease, 0) + if err != nil { + return fmt.Errorf("error getting base release for view %s: %w", view.Name, err) + } + sampleRelease, err := utils.GetViewReleaseOptions(releases, "sample", view.SampleRelease, 0) + if err != nil { + return fmt.Errorf("error getting sample release for view %s: %w", view.Name, err) + } - for _, prowJob := range prowJobs { - log.Infof("Creating %d ProwJobRuns for ProwJob: %s", runsPerJob, prowJob.Name) + reportOpts := reqopts.RequestOptions{ + BaseRelease: baseRelease, + SampleRelease: sampleRelease, + VariantOption: view.VariantOptions, + AdvancedOption: view.AdvancedOptions, + } - for i := 0; i < runsPerJob; i++ { - // Log progress every 10 runs to show activity - if (i+1)%10 == 0 { - log.Infof(" Progress: %d/%d runs created for %s", i+1, runsPerJob, prowJob.Name) + report, reportErrs := componentreadiness.GetComponentReport(ctx, provider, dbc, reportOpts, "") + if len(reportErrs) > 0 { + for _, e := range reportErrs { + rLog.WithError(e).Warn("report generation error") } + return fmt.Errorf("error generating component report for view %s", view.Name) + } - // Calculate timestamp: spread evenly over the past 2 weeks - totalDuration := 14 * 24 * time.Hour - // Time between runs = total duration / runs - timeBetweenRuns := totalDuration / time.Duration(runsPerJob) - timestamp := twoWeeksAgo.Add(time.Duration(i) * timeBetweenRuns) - - prowJobRun := models.ProwJobRun{ - ProwJobID: prowJob.ID, - Cluster: "build01", - Timestamp: timestamp, - Duration: runDuration, - TestCount: len(tests), - } + activeRegs, err := componentreadiness.SyncRegressionsForReport(backend, view, rLog, &report) + if err != nil { + return fmt.Errorf("error syncing regressions for view %s: %w", view.Name, err) + } - if err := dbc.DB.Create(&prowJobRun).Error; err != nil { - return fmt.Errorf("failed to create ProwJobRun for ProwJob %s: %v", prowJob.Name, err) + // Close regressions no longer in the report + allRegs, err := backend.ListCurrentRegressionsForRelease(view.SampleRelease.Name) + if err != nil { + return fmt.Errorf("error listing regressions: %w", err) + } + activeIDs := map[uint]bool{} + for _, r := range activeRegs { + activeIDs[r.ID] = true + } + now := time.Now() + for _, reg := range allRegs { + if !activeIDs[reg.ID] && !reg.Closed.Valid { + reg.Closed = sql.NullTime{Valid: true, Time: now} + if err := backend.UpdateRegression(reg); err != nil { + return fmt.Errorf("error closing regression %d: %w", reg.ID, err) + } } + } - var testFailures int - for _, test := range tests { - // Determine test status based on random chance - // 5% chance of failure, 10% chance of flake, 85% chance of pass - // nolint: gosec - randNum := rand.Float64() - var status int - if randNum < 0.05 { - status = 12 // failure - testFailures++ - } else if randNum < 0.15 { - status = 13 // flake - } else { - status = 1 // pass - } + rLog.Infof("synced regressions for view %s: %d active", view.Name, len(activeRegs)) + } - prowJobRunTest := models.ProwJobRunTest{ - ProwJobRunID: prowJobRun.ID, - TestID: test.ID, - SuiteID: &suite.ID, - Status: status, - Duration: 5.0, // 5 seconds - CreatedAt: timestamp, - } + if err := backend.ResolveTriages(); err != nil { + return fmt.Errorf("error resolving triages: %w", err) + } - if err := dbc.DB.Create(&prowJobRunTest).Error; err != nil { - return 
fmt.Errorf("failed to create ProwJobRunTest for test %s: %v", test.Name, err) - } - } + return nil +} - // Set overall result based on test failures and random factors - var overallResult v1.JobOverallResult - if testFailures > 0 { - prowJobRun.Failed = true - prowJobRun.Succeeded = false - prowJobRun.TestFailures = testFailures - - // Randomly assign different failure types - // nolint: gosec - failureType := rand.Float64() - if failureType < 0.7 { - overallResult = v1.JobTestFailure // 70% test failures - } else if failureType < 0.85 { - overallResult = v1.JobUpgradeFailure // 15% upgrade failures - } else if failureType < 0.92 { - overallResult = v1.JobInstallFailure // 7% install failures - } else { - overallResult = v1.JobExternalInfrastructureFailure // 8% infrastructure failures - } - } else { - prowJobRun.Failed = false - prowJobRun.Succeeded = true - prowJobRun.TestFailures = 0 - overallResult = v1.JobSucceeded - } - prowJobRun.OverallResult = overallResult +const syntheticViewsFile = "config/e2e-views.yaml" - if err := dbc.DB.Save(&prowJobRun).Error; err != nil { - return fmt.Errorf("failed to update ProwJobRun for ProwJob %s: %v", prowJob.Name, err) - } - } +// variantMapToArray converts a variant map to a pq.StringArray. +func variantMapToArray(m map[string]string) pq.StringArray { + result := make([]string, 0, len(m)) + for k, v := range m { + result = append(result, k+":"+v) + } + sort.Strings(result) + return result +} + +func createTestSuite(dbc *db.DB, name string) error { + suite := models.Suite{ + Name: name, + } - log.Infof("Completed creating %d ProwJobRuns for ProwJob: %s", runsPerJob, prowJob.Name) + var existingSuite models.Suite + if err := dbc.DB.Where("name = ?", suite.Name).FirstOrCreate(&existingSuite, suite).Error; err != nil { + return fmt.Errorf("failed to create or find Suite %s: %v", suite.Name, err) } return nil @@ -349,13 +780,12 @@ func createLabelsAndSymptoms(dbc *db.DB) error { UpdatedBy: "seed-data", } - // Create sample labels labels := []jobrunscan.Label{ { LabelContent: jobrunscan.LabelContent{ ID: "InfraFailure", LabelTitle: "Infrastructure failure: omit job from CR", - Explanation: "Job failed due to **infrastructure issues** not related to product code. See [TRT documentation](https://docs.ci.openshift.org/docs/architecture/ci-operator/) for more details.", + Explanation: "Job failed due to **infrastructure issues** not related to product code.", }, Metadata: metadata, }, @@ -363,7 +793,7 @@ func createLabelsAndSymptoms(dbc *db.DB) error { LabelContent: jobrunscan.LabelContent{ ID: "ClusterDNSFlake", LabelTitle: "Cluster DNS resolution failure(s)", - Explanation: "Job experienced DNS resolution timeouts in the cluster:\n\n- Check for network issues\n- Review DNS server logs\n- Examine cluster network configuration", + Explanation: "Job experienced DNS resolution timeouts in the cluster.", }, Metadata: metadata, }, @@ -371,7 +801,7 @@ func createLabelsAndSymptoms(dbc *db.DB) error { LabelContent: jobrunscan.LabelContent{ ID: "ClusterInstallTimeout", LabelTitle: "Cluster install timeout", - Explanation: "Cluster installation exceeded timeout threshold. This may indicate:\n\n1. Slow infrastructure provisioning\n2. Network connectivity problems\n3. 
Image pull failures", + Explanation: "Cluster installation exceeded timeout threshold.", }, Metadata: metadata, }, @@ -379,7 +809,7 @@ func createLabelsAndSymptoms(dbc *db.DB) error { LabelContent: jobrunscan.LabelContent{ ID: "IntervalFile", LabelTitle: "Has interval file(s)", - Explanation: "Job produced interval monitoring files. Use the `intervals` tool to analyze timing data.", + Explanation: "Job produced interval monitoring files.", }, HideDisplayContexts: []string{jobrunscan.MetricsContext, jobrunscan.JAQOptsContext}, Metadata: metadata, @@ -388,7 +818,7 @@ func createLabelsAndSymptoms(dbc *db.DB) error { LabelContent: jobrunscan.LabelContent{ ID: "APIServerTimeout", LabelTitle: "API server timeout", - Explanation: "Requests to the API server timed out. Common causes:\n\n- High API server load\n- Network latency issues\n- Slow etcd responses", + Explanation: "Requests to the API server timed out.", }, Metadata: metadata, }, @@ -399,14 +829,8 @@ func createLabelsAndSymptoms(dbc *db.DB) error { if err := dbc.DB.Where("id = ?", label.ID).FirstOrCreate(&existing, label).Error; err != nil { return fmt.Errorf("failed to create or find label %s: %v", label.ID, err) } - if existing.CreatedAt.IsZero() || existing.CreatedAt.Equal(existing.UpdatedAt) { - log.Debugf("Created new Label: %s", label.ID) - } else { - log.Debugf("Label already exists: %s", label.ID) - } } - // Create sample symptoms symptoms := []jobrunscan.Symptom{ { SymptomContent: jobrunscan.SymptomContent{ @@ -459,71 +883,7 @@ func createLabelsAndSymptoms(dbc *db.DB) error { if err := dbc.DB.Where("id = ?", symptom.ID).FirstOrCreate(&existing, symptom).Error; err != nil { return fmt.Errorf("failed to create or find symptom %s: %v", symptom.ID, err) } - if existing.CreatedAt.IsZero() || existing.CreatedAt.Equal(existing.UpdatedAt) { - log.Debugf("Created new Symptom: %s", symptom.ID) - } else { - log.Debugf("Symptom already exists: %s", symptom.ID) - } - } - - return nil -} - -func applyLabelsToJobRuns(dbc *db.DB) error { - // Fetch all job runs - var jobRuns []models.ProwJobRun - if err := dbc.DB.Find(&jobRuns).Error; err != nil { - return fmt.Errorf("failed to fetch job runs: %v", err) - } - - // Fetch all labels - var labels []jobrunscan.Label - if err := dbc.DB.Find(&labels).Error; err != nil { - return fmt.Errorf("failed to fetch labels: %v", err) - } - - if len(labels) == 0 { - log.Warn("No labels found, skipping label application") - return nil } - labelIDs := make([]string, len(labels)) - for i, label := range labels { - labelIDs[i] = label.ID - } - - // Apply labels to approximately 25% of job runs - labeledCount := 0 - for i := range jobRuns { - // nolint: gosec // we do not care that the randomness is weak - if rand.Float64() > 0.25 { - continue - } - // Randomly select 1-3 labels - // nolint: gosec - numLabels := rand.Intn(3) + 1 - selectedLabels := make([]string, 0, numLabels) - - // Randomly pick unique labels - usedIndices := make(map[int]bool) - for len(selectedLabels) < numLabels && len(selectedLabels) < len(labelIDs) { - // nolint: gosec - idx := rand.Intn(len(labelIDs)) - if !usedIndices[idx] { - selectedLabels = append(selectedLabels, labelIDs[idx]) - usedIndices[idx] = true - } - } - - jobRuns[i].Labels = selectedLabels - if err := dbc.DB.Save(&jobRuns[i]).Error; err != nil { - return fmt.Errorf("failed to update job run %d with labels: %v", jobRuns[i].ID, err) - } - labeledCount++ - } - - log.Infof("Applied labels to %d of %d job runs (%.1f%%)", - labeledCount, len(jobRuns), 
float64(labeledCount)/float64(len(jobRuns))*100) - return nil } diff --git a/cmd/sippy/serve.go b/cmd/sippy/serve.go index ee16451e68..631a85d5ea 100644 --- a/cmd/sippy/serve.go +++ b/cmd/sippy/serve.go @@ -19,6 +19,7 @@ import ( resources "github.com/openshift/sippy" "github.com/openshift/sippy/pkg/api/componentreadiness/dataprovider" bqprovider "github.com/openshift/sippy/pkg/api/componentreadiness/dataprovider/bigquery" + pgprovider "github.com/openshift/sippy/pkg/api/componentreadiness/dataprovider/postgres" "github.com/openshift/sippy/pkg/apis/cache" "github.com/openshift/sippy/pkg/bigquery" "github.com/openshift/sippy/pkg/bigquery/bqlabel" @@ -69,10 +70,13 @@ func (f *ServerFlags) BindFlags(flagSet *pflag.FlagSet) { f.ConfigFlags.BindFlags(flagSet) f.APIFlags.BindFlags(flagSet) f.JiraFlags.BindFlags(flagSet) - flagSet.StringVar(&f.DataProvider, "data-provider", "bigquery", "Data provider for component readiness: bigquery") + flagSet.StringVar(&f.DataProvider, "data-provider", "bigquery", "Data provider for component readiness: bigquery, postgres") } func (f *ServerFlags) Validate() error { + if f.DataProvider == "postgres" { + return nil + } return f.GoogleCloudFlags.Validate() } @@ -132,8 +136,12 @@ func NewServeCommand() *cobra.Command { crDataProvider = bqprovider.NewBigQueryProvider(bigQueryClient, config.ComponentReadinessConfig.VariantJunitTableOverrides) } + case "postgres": + crDataProvider = pgprovider.NewPostgresProvider(dbc, cacheClient) + log.Info("Using Postgres data provider for component readiness") + default: - return fmt.Errorf("unknown --data-provider %q, must be bigquery", f.DataProvider) + return fmt.Errorf("unknown --data-provider %q, must be bigquery or postgres", f.DataProvider) } gcsClient, err = gcs.NewGCSClient(context.TODO(), diff --git a/config/e2e-views.yaml b/config/e2e-views.yaml index aab16d861c..36b9e3abb6 100644 --- a/config/e2e-views.yaml +++ b/config/e2e-views.yaml @@ -1,81 +1,71 @@ ---- component_readiness: -- name: 4.20-main - base_release: - release: "4.19" - relative_start: ga-30d - relative_end: ga - sample_release: - release: "4.20" - relative_start: now-7d - relative_end: now - variant_options: - column_group_by: - Architecture: {} - Network: {} - Platform: {} - Topology: {} - db_group_by: - Architecture: {} - FeatureSet: {} - Installer: {} - Network: {} - Platform: {} - Suite: {} - Topology: {} - Upgrade: {} - include_variants: - Architecture: - - amd64 - FeatureSet: - - default - - techpreview - Installer: - - ipi - - upi - - hypershift - JobTier: - - blocking - - informing - - standard - LayeredProduct: - - none - - virt - Network: - - ovn - Owner: - - eng - - service-delivery - Platform: - - aws - - azure - - gcp - - metal - - rosa - - vsphere - Topology: - - ha - - microshift - - external - CGroupMode: - - v2 - ContainerRuntime: - - runc - - crun - advanced_options: - minimum_failure: 3 - confidence: 95 - pity_factor: 5 - ignore_missing: false - ignore_disruption: true - flake_as_failure: false - pass_rate_required_new_tests: 95 - include_multi_release_analysis: true - metrics: - enabled: true - regression_tracking: - enabled: true - prime_cache: - enabled: true - automate_jira: - enabled: true + - name: 4.22-main + base_release: + release: "4.21" + relative_start: now-60d + relative_end: now-30d + sample_release: + release: "4.22" + relative_start: now-3d + relative_end: now + test_id_options: {} + test_filters: {} + variant_options: + column_group_by: + Network: {} + Platform: {} + Topology: {} + db_group_by: + Architecture: {} + 
FeatureSet: {} + Installer: {} + LayeredProduct: {} + Network: {} + Platform: {} + Suite: {} + Topology: {} + Upgrade: {} + include_variants: + Architecture: + - amd64 + - arm64 + FeatureSet: + - default + - techpreview + Installer: + - ipi + LayeredProduct: + - none + Network: + - ovn + Platform: + - aws + - gcp + Suite: + - parallel + - serial + - unknown + Topology: + - ha + Upgrade: + - micro + - minor + - none + advanced_options: + minimum_failure: 3 + confidence: 95 + pity_factor: 5 + pass_rate_required_new_tests: 90 + pass_rate_required_all_tests: 0 + ignore_missing: false + ignore_disruption: false + flake_as_failure: false + include_multi_release_analysis: true + metrics: + enabled: false + regression_tracking: + enabled: true + automate_jira: + enabled: false + prime_cache: + enabled: true diff --git a/e2e-scripts/sippy-e2e-sippy-e2e-setup-commands.sh b/e2e-scripts/sippy-e2e-sippy-e2e-setup-commands.sh index cd210804b5..873f8c21a3 100755 --- a/e2e-scripts/sippy-e2e-sippy-e2e-setup-commands.sh +++ b/e2e-scripts/sippy-e2e-sippy-e2e-setup-commands.sh @@ -283,12 +283,13 @@ spec: storage: 100Mi END -# Make the "sippy loader" pod. +# Seed the database with synthetic data for e2e tests. +# TODO: Add a scoped 'sippy load' test back (e.g. single job) to exercise the GCS loading path. cat << END | ${KUBECTL_CMD} apply -f - apiVersion: batch/v1 kind: Job metadata: - name: sippy-load-job + name: sippy-seed-job namespace: sippy-e2e spec: template: @@ -304,20 +305,9 @@ spec: terminationMessagePolicy: File command: ["/bin/sh", "-c"] args: - - /bin/sippy load --init-database --log-level=debug --release 4.20 --database-dsn=postgresql://postgres:password@postgres.sippy-e2e.svc.cluster.local:5432/postgres --redis-url=redis://redis.sippy-e2e.svc.cluster.local:6379 --mode=ocp --config ./config/e2e-openshift.yaml --google-service-account-credential-file /tmp/secrets/gcs-cred - env: - - name: GCS_SA_JSON_PATH - value: /tmp/secrets/gcs-cred - volumeMounts: - - mountPath: /tmp/secrets - name: gcs-cred - readOnly: true + - /bin/sippy seed-data --init-database --log-level=debug --database-dsn=postgresql://postgres:password@postgres.sippy-e2e.svc.cluster.local:5432/postgres imagePullSecrets: - name: regcred - volumes: - - name: gcs-cred - secret: - secretName: gcs-cred dnsPolicy: ClusterFirst restartPolicy: Never schedulerName: default-scheduler @@ -327,33 +317,28 @@ spec: END date -echo "Waiting for sippy loader job to finish ..." -${KUBECTL_CMD} -n sippy-e2e get job sippy-load-job -${KUBECTL_CMD} -n sippy-e2e describe job sippy-load-job +echo "Waiting for sippy seed job to finish ..." +${KUBECTL_CMD} -n sippy-e2e get job sippy-seed-job -# We set +e to avoid the script aborting before we can retrieve logs. set +e - -echo "Waiting up to ${SIPPY_LOAD_TIMEOUT:=1200s} for the sippy-load-job to complete..." -${KUBECTL_CMD} -n sippy-e2e wait --for=condition=complete job/sippy-load-job --timeout ${SIPPY_LOAD_TIMEOUT} -retVal=$? +echo "Waiting up to 300s for the sippy-seed-job to complete..." +${KUBECTL_CMD} -n sippy-e2e wait --for=condition=complete job/sippy-seed-job --timeout 300s +seedRetVal=$? 
set -e -job_pod=$(${KUBECTL_CMD} -n sippy-e2e get pod --selector=job-name=sippy-load-job --output=jsonpath='{.items[0].metadata.name}') -${KUBECTL_CMD} -n sippy-e2e logs ${job_pod} > ${ARTIFACT_DIR}/sippy-load.log 2>&1 +seed_pod=$(${KUBECTL_CMD} -n sippy-e2e get pod --selector=job-name=sippy-seed-job --output=jsonpath='{.items[0].metadata.name}') +${KUBECTL_CMD} -n sippy-e2e logs ${seed_pod} > ${ARTIFACT_DIR}/sippy-seed.log 2>&1 -if [ ${retVal} -ne 0 ]; then +if [ ${seedRetVal} -ne 0 ]; then echo - echo "=== SIPPY LOAD JOB FAILURE DIAGNOSTICS ===" + echo "=== SIPPY SEED JOB FAILURE DIAGNOSTICS ===" echo "=== Job status ===" - ${KUBECTL_CMD} -n sippy-e2e describe job sippy-load-job + ${KUBECTL_CMD} -n sippy-e2e describe job sippy-seed-job echo "=== Job pod status ===" - ${KUBECTL_CMD} -n sippy-e2e describe pod ${job_pod} - echo "=== Recent namespace events ===" - ${KUBECTL_CMD} -n sippy-e2e get events --sort-by='.lastTimestamp' - echo "=== END SIPPY LOAD JOB FAILURE DIAGNOSTICS ===" + ${KUBECTL_CMD} -n sippy-e2e describe pod ${seed_pod} + echo "=== END SIPPY SEED JOB FAILURE DIAGNOSTICS ===" echo - echo "ERROR: sippy-load-job did not complete within ${SIPPY_LOAD_TIMEOUT}" + echo "ERROR: sippy-seed-job did not complete within 300s" exit 1 fi diff --git a/e2e-scripts/sippy-e2e-sippy-e2e-test-commands.sh b/e2e-scripts/sippy-e2e-sippy-e2e-test-commands.sh index 72b05151d9..9f8c6db4ff 100755 --- a/e2e-scripts/sippy-e2e-sippy-e2e-test-commands.sh +++ b/e2e-scripts/sippy-e2e-sippy-e2e-test-commands.sh @@ -83,6 +83,8 @@ spec: - ocp - --views - ./config/e2e-views.yaml + - --data-provider + - postgres env: - name: GCS_SA_JSON_PATH value: /tmp/secrets/gcs-cred @@ -166,6 +168,32 @@ ${KUBECTL_CMD} -n sippy-e2e port-forward pod/redis1 ${SIPPY_REDIS_PORT}:6379 & ${KUBECTL_CMD} -n sippy-e2e get svc,ep +# Wait for the sippy API to be reachable through the port-forward +echo "Waiting for sippy API to be reachable on port ${SIPPY_API_PORT}..." +TIMEOUT=120 +ELAPSED=0 +while [ $ELAPSED -lt $TIMEOUT ]; do + if curl -s "http://localhost:${SIPPY_API_PORT}/api/health" > /dev/null 2>&1; then + echo "Sippy API is ready after ${ELAPSED}s" + break + fi + sleep 2 + ELAPSED=$((ELAPSED + 2)) +done +if [ $ELAPSED -ge $TIMEOUT ]; then + echo "ERROR: Timed out waiting for sippy API after ${TIMEOUT}s" + exit 1 +fi + +# Prime the component readiness cache so triage tests can find cached reports +echo "Priming component readiness cache..." +VIEWS=$(curl -sf "http://localhost:${SIPPY_API_PORT}/api/component_readiness/views") || { echo "Failed to fetch views"; exit 1; } +for VIEW in $(echo "$VIEWS" | grep -o '"name":"[^"]*"' | cut -d'"' -f4); do + echo " Priming cache for view: $VIEW" + curl -sf "http://localhost:${SIPPY_API_PORT}/api/component_readiness?view=$VIEW" > /dev/null || { echo "Failed to prime cache for view: $VIEW"; exit 1; } +done +echo "Cache priming complete" + # only 1 in parallel, some tests will clash if run at the same time gotestsum --junitfile ${ARTIFACT_DIR}/junit_e2e.xml -- ./test/e2e/... -v -p 1 -coverprofile=${ARTIFACT_DIR}/e2e-test-coverage.out -coverpkg=./pkg/...,./cmd/... TEST_EXIT=$? 
diff --git a/pkg/api/componentreadiness/dataprovider/postgres/provider.go b/pkg/api/componentreadiness/dataprovider/postgres/provider.go new file mode 100644 index 0000000000..0eee1a0105 --- /dev/null +++ b/pkg/api/componentreadiness/dataprovider/postgres/provider.go @@ -0,0 +1,784 @@ +package postgres + +import ( + "context" + "fmt" + "math/big" + "slices" + "sort" + "strings" + "time" + + "github.com/lib/pq" + + "github.com/openshift/sippy/pkg/api/componentreadiness/dataprovider" + "github.com/openshift/sippy/pkg/api/componentreadiness/utils" + "github.com/openshift/sippy/pkg/apis/api/componentreport/crstatus" + "github.com/openshift/sippy/pkg/apis/api/componentreport/crtest" + "github.com/openshift/sippy/pkg/apis/api/componentreport/reqopts" + "github.com/openshift/sippy/pkg/apis/cache" + v1 "github.com/openshift/sippy/pkg/apis/sippy/v1" + "github.com/openshift/sippy/pkg/db" +) + +var _ dataprovider.DataProvider = &PostgresProvider{} + +// PostgresProvider implements dataprovider.DataProvider using PostgreSQL. +// Designed for local development and testing — not optimized for production scale. +type PostgresProvider struct { + dbc *db.DB + cache cache.Cache +} + +func NewPostgresProvider(dbc *db.DB, c cache.Cache) *PostgresProvider { + if c == nil { + c = &noOpCache{} + } + return &PostgresProvider{dbc: dbc, cache: c} +} + +// noOpCache never stores or returns data; no Redis needed for local dev. +type noOpCache struct{} + +func (n *noOpCache) Get(_ context.Context, _ string, _ time.Duration) ([]byte, error) { + return nil, fmt.Errorf("cache miss") +} +func (n *noOpCache) Set(_ context.Context, _ string, _ []byte, _ time.Duration) error { return nil } + +func (p *PostgresProvider) Cache() cache.Cache { + return p.cache +} + +// --- Variant helpers --- + +// parseVariants splits a pq.StringArray like ["Platform:aws", "Upgrade:none"] into a map. +func parseVariants(variants pq.StringArray) map[string]string { + result := make(map[string]string, len(variants)) + for _, v := range variants { + if k, val, ok := strings.Cut(v, ":"); ok { + result[k] = val + } + } + return result +} + +// variantMapToSlice converts a map to sorted "Key:Value" strings. +func variantMapToSlice(m map[string]string) []string { + result := make([]string, 0, len(m)) + for k, v := range m { + result = append(result, k+":"+v) + } + sort.Strings(result) + return result +} + +// filterByDBGroupBy returns a copy of the variant map keeping only keys in dbGroupBy. +func filterByDBGroupBy(variants map[string]string, dbGroupBy map[string]bool) map[string]string { + filtered := make(map[string]string, len(dbGroupBy)) + for k, v := range variants { + if dbGroupBy[k] { + filtered[k] = v + } + } + return filtered +} + +// matchesIncludeVariants checks if a variant map passes the include filter. +func matchesIncludeVariants(variants map[string]string, includeVariants map[string][]string) bool { + for key, allowed := range includeVariants { + val, exists := variants[key] + if !exists { + return false + } + if !slices.Contains(allowed, val) { + return false + } + } + return true +} + +// --- MetadataQuerier --- + +func (p *PostgresProvider) QueryJobVariants(_ context.Context) (crtest.JobVariants, []error) { + variants := crtest.JobVariants{Variants: map[string][]string{}} + + var pairs []string + err := p.dbc.DB.Raw(`SELECT DISTINCT unnest(variants) AS pair FROM prow_jobs WHERE deleted_at IS NULL`). 
+ Pluck("pair", &pairs).Error + if err != nil { + return variants, []error{fmt.Errorf("querying job variants: %w", err)} + } + + grouped := map[string]map[string]bool{} + for _, pair := range pairs { + k, v, ok := strings.Cut(pair, ":") + if !ok { + continue + } + if grouped[k] == nil { + grouped[k] = map[string]bool{} + } + grouped[k][v] = true + } + + for k, vals := range grouped { + sorted := make([]string, 0, len(vals)) + for v := range vals { + sorted = append(sorted, v) + } + sort.Strings(sorted) + variants.Variants[k] = sorted + } + return variants, nil +} + +// releaseMetadata holds hardcoded release info for known releases. +// This avoids needing a releases table — we derive release names from prow_jobs +// and fill in metadata from this map. +var releaseMetadata = map[string]struct { + previousRelease string + gaOffsetDays int // 0 = no GA date (in development) +}{ + "4.17": {previousRelease: "4.16", gaOffsetDays: -540}, + "4.18": {previousRelease: "4.17", gaOffsetDays: -395}, + "4.19": {previousRelease: "4.18", gaOffsetDays: -289}, + "4.20": {previousRelease: "4.19", gaOffsetDays: -163}, + "4.21": {previousRelease: "4.20", gaOffsetDays: -58}, + "4.22": {previousRelease: "4.21"}, + "5.0": {previousRelease: "4.22"}, +} + +func (p *PostgresProvider) QueryReleases(_ context.Context) ([]v1.Release, error) { + var releaseNames []string + err := p.dbc.DB.Raw(`SELECT DISTINCT release FROM prow_jobs WHERE deleted_at IS NULL ORDER BY release DESC`). + Pluck("release", &releaseNames).Error + if err != nil { + return nil, fmt.Errorf("querying releases: %w", err) + } + + caps := map[v1.ReleaseCapability]bool{ + v1.ComponentReadinessCap: true, + v1.FeatureGatesCap: true, + v1.MetricsCap: true, + v1.PayloadTagsCap: true, + v1.SippyClassicCap: true, + } + + now := time.Now().UTC() + var releases []v1.Release + for _, name := range releaseNames { + rel := v1.Release{ + Release: name, + Capabilities: caps, + } + if meta, ok := releaseMetadata[name]; ok { + rel.PreviousRelease = meta.previousRelease + if meta.gaOffsetDays != 0 { + ga := now.AddDate(0, 0, meta.gaOffsetDays) + rel.GADate = &ga + } + } + releases = append(releases, rel) + } + return releases, nil +} + +func (p *PostgresProvider) QueryReleaseDates(_ context.Context, _ reqopts.RequestOptions) ([]crtest.ReleaseTimeRange, []error) { + // Derive time ranges from actual data in the DB rather than hardcoded GA dates. + // This ensures fallback queries find data where it actually exists. 
+ type releaseRange struct { + Release string + Start time.Time + End time.Time + } + var ranges []releaseRange + err := p.dbc.DB.Raw(` + SELECT pj.release, + MIN(pjr.timestamp) AS start, + MAX(pjr.timestamp) AS end + FROM prow_job_runs pjr + JOIN prow_jobs pj ON pj.id = pjr.prow_job_id + WHERE pj.deleted_at IS NULL AND pjr.deleted_at IS NULL + GROUP BY pj.release + ORDER BY pj.release DESC + `).Scan(&ranges).Error + if err != nil { + return nil, []error{fmt.Errorf("querying release dates: %w", err)} + } + + var dates []crtest.ReleaseTimeRange + for _, r := range ranges { + start := r.Start + end := r.End + dates = append(dates, crtest.ReleaseTimeRange{ + Release: r.Release, + Start: &start, + End: &end, + }) + } + return dates, nil +} + +func (p *PostgresProvider) QueryUniqueVariantValues(_ context.Context, field string, nested bool) ([]string, error) { + if nested { + // Return all variant key names + var pairs []string + err := p.dbc.DB.Raw(` + SELECT DISTINCT unnest(variants) AS pair FROM prow_jobs + WHERE deleted_at IS NULL + `).Pluck("pair", &pairs).Error + if err != nil { + return nil, err + } + keys := map[string]bool{} + for _, pair := range pairs { + if k, _, ok := strings.Cut(pair, ":"); ok { + keys[k] = true + } + } + result := make([]string, 0, len(keys)) + for k := range keys { + result = append(result, k) + } + sort.Strings(result) + return result, nil + } + + // Map BQ column names to variant key names + fieldMap := map[string]string{ + "platform": "Platform", + "network": "Network", + "arch": "Architecture", + "upgrade": "Upgrade", + } + variantKey, ok := fieldMap[field] + if !ok { + return []string{}, nil + } + + var pairs []string + err := p.dbc.DB.Raw(` + SELECT DISTINCT unnest(variants) AS pair FROM prow_jobs + WHERE deleted_at IS NULL + `).Pluck("pair", &pairs).Error + if err != nil { + return nil, err + } + + vals := map[string]bool{} + for _, pair := range pairs { + if k, v, ok := strings.Cut(pair, ":"); ok && k == variantKey { + vals[v] = true + } + } + result := make([]string, 0, len(vals)) + for v := range vals { + result = append(result, v) + } + sort.Strings(result) + return result, nil +} + +// --- TestStatusQuerier --- + +// testStatusRow is the result of the aggregation query. +type testStatusRow struct { + TestID string `gorm:"column:test_id"` + TestName string `gorm:"column:test_name"` + TestSuite string `gorm:"column:test_suite"` + Component string `gorm:"column:component"` + Capabilities pq.StringArray `gorm:"column:capabilities;type:text[]"` + ProwJobID uint `gorm:"column:prow_job_id"` + TotalCount int `gorm:"column:total_count"` + SuccessCount int `gorm:"column:success_count"` + FlakeCount int `gorm:"column:flake_count"` + LastFailure *time.Time `gorm:"column:last_failure"` +} + +const testStatusQuery = ` +WITH deduped AS ( + SELECT DISTINCT ON (pjrt.prow_job_run_id, pjrt.test_id, pjrt.suite_id) + pjrt.test_id, pjrt.suite_id, pjrt.status, + pjr.timestamp, pj.id AS prow_job_id + FROM prow_job_run_tests pjrt + JOIN prow_job_runs pjr ON pjr.id = pjrt.prow_job_run_id + JOIN prow_jobs pj ON pj.id = pjr.prow_job_id + WHERE pj.release = ? + AND pjr.timestamp >= ? AND pjr.timestamp < ? 
+ AND pjrt.deleted_at IS NULL AND pjr.deleted_at IS NULL AND pj.deleted_at IS NULL + AND (pjr.labels IS NULL OR NOT pjr.labels @> ARRAY['InfraFailure']) + ORDER BY pjrt.prow_job_run_id, pjrt.test_id, pjrt.suite_id, + CASE WHEN pjrt.status = 13 THEN 0 WHEN pjrt.status = 1 THEN 1 ELSE 2 END +) +SELECT + tow.unique_id AS test_id, + t.name AS test_name, + COALESCE(s.name, '') AS test_suite, + tow.component, + tow.capabilities, + d.prow_job_id, + COUNT(*) AS total_count, + SUM(CASE WHEN d.status IN (1, 13) THEN 1 ELSE 0 END) AS success_count, + SUM(CASE WHEN d.status = 13 THEN 1 ELSE 0 END) AS flake_count, + MAX(CASE WHEN d.status NOT IN (1, 13) THEN d.timestamp ELSE NULL END) AS last_failure +FROM deduped d +JOIN tests t ON t.id = d.test_id +JOIN test_ownerships tow ON tow.test_id = d.test_id + AND (tow.suite_id = d.suite_id OR (tow.suite_id IS NULL AND d.suite_id IS NULL)) +LEFT JOIN suites s ON s.id = d.suite_id +WHERE tow.staff_approved_obsolete = false +GROUP BY tow.unique_id, t.name, s.name, tow.component, tow.capabilities, d.prow_job_id +` + +func (p *PostgresProvider) queryTestStatus(ctx context.Context, release string, start, end time.Time, + _ crtest.JobVariants, includeVariants map[string][]string, + dbGroupBy map[string]bool) (map[string]crstatus.TestStatus, []error) { + + var rows []testStatusRow + if err := p.dbc.DB.WithContext(ctx).Raw(testStatusQuery, release, start, end).Scan(&rows).Error; err != nil { + return nil, []error{fmt.Errorf("querying test status: %w", err)} + } + + // Batch-fetch all ProwJob variants we need + jobIDs := make(map[uint]bool, len(rows)) + for _, r := range rows { + jobIDs[r.ProwJobID] = true + } + ids := make([]uint, 0, len(jobIDs)) + for id := range jobIDs { + ids = append(ids, id) + } + jobVariantMap, err := p.fetchJobVariantsByIDs(ids) + if err != nil { + return nil, []error{err} + } + + result := map[string]crstatus.TestStatus{} + for _, row := range rows { + variants, ok := jobVariantMap[row.ProwJobID] + if !ok { + continue + } + + if !matchesIncludeVariants(variants, includeVariants) { + continue + } + + filtered := filterByDBGroupBy(variants, dbGroupBy) + key := crtest.KeyWithVariants{ + TestID: row.TestID, + Variants: filtered, + } + keyStr := key.KeyOrDie() + + existing, exists := result[keyStr] + if exists { + // Merge counts for same test+variant combo from different job runs + existing.Count.TotalCount += row.TotalCount + existing.Count.SuccessCount += row.SuccessCount + existing.Count.FlakeCount += row.FlakeCount + if row.LastFailure != nil && (existing.LastFailure.IsZero() || row.LastFailure.After(existing.LastFailure)) { + existing.LastFailure = *row.LastFailure + } + result[keyStr] = existing + } else { + ts := crstatus.TestStatus{ + TestName: row.TestName, + TestSuite: row.TestSuite, + Component: row.Component, + Capabilities: row.Capabilities, + Variants: variantMapToSlice(filtered), + Count: crtest.Count{ + TotalCount: row.TotalCount, + SuccessCount: row.SuccessCount, + FlakeCount: row.FlakeCount, + }, + } + if row.LastFailure != nil { + ts.LastFailure = *row.LastFailure + } + result[keyStr] = ts + } + } + + return result, nil +} + +// fetchJobVariantsByIDs loads ProwJob variant maps for the given job IDs. 
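+// Variants are stored on prow_jobs as a text[] of "Key:Value" strings and are
+// converted back into a map by parseVariants.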
+func (p *PostgresProvider) fetchJobVariantsByIDs(ids []uint) (map[uint]map[string]string, error) { + if len(ids) == 0 { + return map[uint]map[string]string{}, nil + } + + type jobRow struct { + ID uint `gorm:"column:id"` + Variants pq.StringArray `gorm:"column:variants;type:text[]"` + } + + var jobRows []jobRow + if err := p.dbc.DB.Raw(`SELECT id, variants FROM prow_jobs WHERE id IN (?)`, ids).Scan(&jobRows).Error; err != nil { + return nil, fmt.Errorf("fetching job variants: %w", err) + } + + result := make(map[uint]map[string]string, len(jobRows)) + for _, jr := range jobRows { + result[jr.ID] = parseVariants(jr.Variants) + } + return result, nil +} + +func (p *PostgresProvider) QueryBaseTestStatus(ctx context.Context, reqOptions reqopts.RequestOptions, + allJobVariants crtest.JobVariants) (map[string]crstatus.TestStatus, []error) { + + dbGroupBy := make(map[string]bool, reqOptions.VariantOption.DBGroupBy.Len()) + for _, k := range reqOptions.VariantOption.DBGroupBy.List() { + dbGroupBy[k] = true + } + + includeVariants := reqOptions.VariantOption.IncludeVariants + if includeVariants == nil { + includeVariants = map[string][]string{} + } + + return p.queryTestStatus( + ctx, + reqOptions.BaseRelease.Name, + reqOptions.BaseRelease.Start, + reqOptions.BaseRelease.End, + allJobVariants, + includeVariants, + dbGroupBy, + ) +} + +func (p *PostgresProvider) QuerySampleTestStatus(ctx context.Context, reqOptions reqopts.RequestOptions, + allJobVariants crtest.JobVariants, + includeVariants map[string][]string, + start, end time.Time) (map[string]crstatus.TestStatus, []error) { + + dbGroupBy := make(map[string]bool, reqOptions.VariantOption.DBGroupBy.Len()) + for _, k := range reqOptions.VariantOption.DBGroupBy.List() { + dbGroupBy[k] = true + } + + if includeVariants == nil { + includeVariants = map[string][]string{} + } + + return p.queryTestStatus( + ctx, + reqOptions.SampleRelease.Name, + start, end, + allJobVariants, + includeVariants, + dbGroupBy, + ) +} + +// --- TestDetailsQuerier --- + +type testDetailRow struct { + TestID string `gorm:"column:test_id"` + TestName string `gorm:"column:test_name"` + ProwJobName string `gorm:"column:prowjob_name"` + ProwJobRunID string `gorm:"column:prowjob_run_id"` + ProwJobURL string `gorm:"column:prowjob_url"` + ProwJobStart time.Time `gorm:"column:prowjob_start"` + ProwJobID uint `gorm:"column:prow_job_id"` + Status int `gorm:"column:status"` + JiraComponent string `gorm:"column:jira_component"` + JiraComponentID *uint `gorm:"column:jira_component_id"` + Capabilities pq.StringArray `gorm:"column:capabilities;type:text[]"` +} + +const testDetailQuery = ` +SELECT + tow.unique_id AS test_id, + t.name AS test_name, + pj.name AS prowjob_name, + CAST(pjr.id AS TEXT) AS prowjob_run_id, + COALESCE(pjr.url, '') AS prowjob_url, + pjr.timestamp AS prowjob_start, + pj.id AS prow_job_id, + pjrt.status, + COALESCE(tow.jira_component, '') AS jira_component, + tow.jira_component_id, + tow.capabilities +FROM prow_job_run_tests pjrt +JOIN prow_job_runs pjr ON pjr.id = pjrt.prow_job_run_id +JOIN prow_jobs pj ON pj.id = pjr.prow_job_id +JOIN tests t ON t.id = pjrt.test_id +JOIN test_ownerships tow ON tow.test_id = pjrt.test_id + AND (tow.suite_id = pjrt.suite_id OR (tow.suite_id IS NULL AND pjrt.suite_id IS NULL)) +WHERE pj.release = ? + AND pjr.timestamp >= ? AND pjr.timestamp < ? 
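+ -- skip soft-deleted rows, tests staff-approved as obsolete, and InfraFailure runs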
+ AND pjrt.deleted_at IS NULL AND pjr.deleted_at IS NULL AND pj.deleted_at IS NULL + AND tow.staff_approved_obsolete = false + AND (pjr.labels IS NULL OR NOT pjr.labels @> ARRAY['InfraFailure']) +ORDER BY pjr.timestamp +` + +func (p *PostgresProvider) queryTestDetails(release string, start, end time.Time, + reqOptions reqopts.RequestOptions, _ crtest.JobVariants, + includeVariants map[string][]string) (map[string][]crstatus.TestJobRunRows, []error) { + + var rows []testDetailRow + if err := p.dbc.DB.Raw(testDetailQuery, release, start, end).Scan(&rows).Error; err != nil { + return nil, []error{fmt.Errorf("querying test details: %w", err)} + } + + dbGroupBy := make(map[string]bool, reqOptions.VariantOption.DBGroupBy.Len()) + for _, k := range reqOptions.VariantOption.DBGroupBy.List() { + dbGroupBy[k] = true + } + + if includeVariants == nil { + includeVariants = map[string][]string{} + } + + // Batch-fetch job variants + jobIDs := map[uint]bool{} + for _, r := range rows { + jobIDs[r.ProwJobID] = true + } + ids := make([]uint, 0, len(jobIDs)) + for id := range jobIDs { + ids = append(ids, id) + } + jobVariantMap, err := p.fetchJobVariantsByIDs(ids) + if err != nil { + return nil, []error{err} + } + + // Filter test IDs if specified + // Build test ID filter and per-test requested variant filters + testIDFilter := map[string]bool{} + requestedVariantsByTestID := map[string]map[string]string{} + for _, tid := range reqOptions.TestIDOptions { + testIDFilter[tid.TestID] = true + if len(tid.RequestedVariants) > 0 { + requestedVariantsByTestID[tid.TestID] = tid.RequestedVariants + } + } + + result := map[string][]crstatus.TestJobRunRows{} + for _, row := range rows { + if len(testIDFilter) > 0 && !testIDFilter[row.TestID] { + continue + } + + variants, ok := jobVariantMap[row.ProwJobID] + if !ok { + continue + } + if !matchesIncludeVariants(variants, includeVariants) { + continue + } + + // Filter by requested variants (exact match for specific test+variant combo) + if rv, ok := requestedVariantsByTestID[row.TestID]; ok { + match := true + for k, v := range rv { + if variants[k] != v { + match = false + break + } + } + if !match { + continue + } + } + + filtered := filterByDBGroupBy(variants, dbGroupBy) + key := crtest.KeyWithVariants{ + TestID: row.TestID, + Variants: filtered, + } + + successCount := 0 + flakeCount := 0 + if row.Status == 1 || row.Status == 13 { + successCount = 1 + } + if row.Status == 13 { + flakeCount = 1 + } + + var jiraComponentID *big.Rat + if row.JiraComponentID != nil { + jiraComponentID = new(big.Rat).SetUint64(uint64(*row.JiraComponentID)) + } + + normalizedName := utils.NormalizeProwJobName(row.ProwJobName) + entry := crstatus.TestJobRunRows{ + TestKey: key, + TestKeyStr: key.KeyOrDie(), + TestName: row.TestName, + ProwJob: normalizedName, + ProwJobRunID: row.ProwJobRunID, + ProwJobURL: row.ProwJobURL, + StartTime: row.ProwJobStart, + Count: crtest.Count{TotalCount: 1, SuccessCount: successCount, FlakeCount: flakeCount}, + JiraComponent: row.JiraComponent, + JiraComponentID: jiraComponentID, + } + + result[normalizedName] = append(result[normalizedName], entry) + } + + return result, nil +} + +func (p *PostgresProvider) QueryBaseJobRunTestStatus(_ context.Context, reqOptions reqopts.RequestOptions, + allJobVariants crtest.JobVariants) (map[string][]crstatus.TestJobRunRows, []error) { + + return p.queryTestDetails( + reqOptions.BaseRelease.Name, + reqOptions.BaseRelease.Start, reqOptions.BaseRelease.End, + reqOptions, allJobVariants, 
reqOptions.VariantOption.IncludeVariants, + ) +} + +func (p *PostgresProvider) QuerySampleJobRunTestStatus(_ context.Context, reqOptions reqopts.RequestOptions, + allJobVariants crtest.JobVariants, + includeVariants map[string][]string, + start, end time.Time) (map[string][]crstatus.TestJobRunRows, []error) { + + return p.queryTestDetails( + reqOptions.SampleRelease.Name, + start, end, + reqOptions, allJobVariants, includeVariants, + ) +} + +// --- JobQuerier --- + +func (p *PostgresProvider) QueryJobRuns(_ context.Context, reqOptions reqopts.RequestOptions, + allJobVariants crtest.JobVariants, + release string, start, end time.Time) (map[string]dataprovider.JobRunStats, error) { + + type jobRunRow struct { + JobName string `gorm:"column:job_name"` + TotalRuns int `gorm:"column:total_runs"` + Successful int `gorm:"column:successful_runs"` + } + + var rows []jobRunRow + err := p.dbc.DB.Raw(` + SELECT + pj.name AS job_name, + COUNT(DISTINCT pjr.id) AS total_runs, + COUNT(DISTINCT CASE WHEN pjr.succeeded THEN pjr.id END) AS successful_runs + FROM prow_jobs pj + JOIN prow_job_runs pjr ON pjr.prow_job_id = pj.id + WHERE pj.release = ? + AND pjr.timestamp >= ? AND pjr.timestamp < ? + AND pj.deleted_at IS NULL AND pjr.deleted_at IS NULL + AND (pj.name LIKE 'periodic-%%' OR pj.name LIKE 'release-%%' OR pj.name LIKE 'aggregator-%%') + GROUP BY pj.name + ORDER BY pj.name + `, release, start, end).Scan(&rows).Error + if err != nil { + return nil, fmt.Errorf("querying job runs: %w", err) + } + + // Apply variant filtering in Go + includeVariants := reqOptions.VariantOption.IncludeVariants + if includeVariants == nil { + includeVariants = map[string][]string{} + } + + // Fetch variants for all jobs + jobNames := make([]string, 0, len(rows)) + for _, r := range rows { + jobNames = append(jobNames, r.JobName) + } + jobVariantMap := map[string]map[string]string{} + if len(jobNames) > 0 { + type jvRow struct { + Name string `gorm:"column:name"` + Variants pq.StringArray `gorm:"column:variants;type:text[]"` + } + var jvRows []jvRow + if err := p.dbc.DB.Raw(`SELECT name, variants FROM prow_jobs WHERE name IN (?) AND deleted_at IS NULL`, jobNames).Scan(&jvRows).Error; err != nil { + return nil, fmt.Errorf("fetching job variants: %w", err) + } + for _, jr := range jvRows { + jobVariantMap[jr.Name] = parseVariants(jr.Variants) + } + } + + results := map[string]dataprovider.JobRunStats{} + for _, row := range rows { + if variants, ok := jobVariantMap[row.JobName]; ok { + if !matchesIncludeVariants(variants, includeVariants) { + continue + } + } + passRate := 0.0 + if row.TotalRuns > 0 { + passRate = float64(row.Successful) / float64(row.TotalRuns) * 100 + } + results[row.JobName] = dataprovider.JobRunStats{ + JobName: row.JobName, + TotalRuns: row.TotalRuns, + SuccessfulRuns: row.Successful, + PassRate: passRate, + } + } + + return results, nil +} + +func (p *PostgresProvider) QueryJobVariantValues(_ context.Context, jobNames []string, + variantKeys []string) (map[string]map[string]string, error) { + + if len(jobNames) == 0 { + return map[string]map[string]string{}, nil + } + + type jvRow struct { + Name string `gorm:"column:name"` + Variants pq.StringArray `gorm:"column:variants;type:text[]"` + } + + var rows []jvRow + if err := p.dbc.DB.Raw(`SELECT name, variants FROM prow_jobs WHERE name IN (?) 
AND deleted_at IS NULL`, jobNames).Scan(&rows).Error; err != nil { + return nil, fmt.Errorf("querying job variant values: %w", err) + } + + keyFilter := map[string]bool{} + for _, k := range variantKeys { + keyFilter[k] = true + } + + results := map[string]map[string]string{} + for _, row := range rows { + parsed := parseVariants(row.Variants) + if len(keyFilter) > 0 { + filtered := map[string]string{} + for k, v := range parsed { + if keyFilter[k] { + filtered[k] = v + } + } + results[row.Name] = filtered + } else { + results[row.Name] = parsed + } + } + return results, nil +} + +func (p *PostgresProvider) LookupJobVariants(_ context.Context, jobName string) (map[string]string, error) { + type jvRow struct { + Variants pq.StringArray `gorm:"column:variants;type:text[]"` + } + + var row jvRow + err := p.dbc.DB.Raw(`SELECT variants FROM prow_jobs WHERE name = ? AND deleted_at IS NULL LIMIT 1`, jobName).Scan(&row).Error + if err != nil { + return nil, fmt.Errorf("looking up job variants: %w", err) + } + return parseVariants(row.Variants), nil +} diff --git a/scripts/dev-setup.sh b/scripts/dev-setup.sh new file mode 100755 index 0000000000..b1f682dc16 --- /dev/null +++ b/scripts/dev-setup.sh @@ -0,0 +1,113 @@ +#!/bin/sh +# Stand up a seeded PostgreSQL + Redis environment for local development. +# By default only seeds the database and prints connection info. +# Pass --serve to also start the sippy API server. +# +# Usage: +# make dev # build + seed only +# make dev SERVE=1 # build + seed + start sippy +# scripts/dev-setup.sh # seed only (assumes sippy binary exists) +# scripts/dev-setup.sh --serve # seed + start sippy +# +# To tear down: Ctrl-C (containers are cleaned up automatically) + +set -e + +SERVE=false +for arg in "$@"; do + case "$arg" in + --serve) SERVE=true ;; + esac +done + +DOCKER="${DOCKER:-podman}" +PSQL_CONTAINER="sippy-dev-postgresql" +PSQL_PORT="${PSQL_PORT:-25433}" +REDIS_CONTAINER="sippy-dev-redis" +REDIS_PORT="${REDIS_PORT:-25479}" +SIPPY_API_PORT="${SIPPY_API_PORT:-8080}" + +clean_up() { + echo "" + echo "Shutting down..." + if [ -n "$CHILD_PID" ]; then + kill $CHILD_PID 2>/dev/null && wait $CHILD_PID 2>/dev/null + fi + echo "Stopping $PSQL_CONTAINER" + $DOCKER stop $PSQL_CONTAINER 2>/dev/null + $DOCKER rm $PSQL_CONTAINER 2>/dev/null + echo "Stopping $REDIS_CONTAINER" + $DOCKER stop $REDIS_CONTAINER 2>/dev/null + $DOCKER rm $REDIS_CONTAINER 2>/dev/null +} +trap clean_up EXIT + +# Clean up any stale containers from a previous run +$DOCKER stop $PSQL_CONTAINER 2>/dev/null || true +$DOCKER rm $PSQL_CONTAINER 2>/dev/null || true +$DOCKER stop $REDIS_CONTAINER 2>/dev/null || true +$DOCKER rm $REDIS_CONTAINER 2>/dev/null || true + +echo "Starting PostgreSQL on port $PSQL_PORT..." +$DOCKER run --name $PSQL_CONTAINER -e POSTGRES_PASSWORD=password -p $PSQL_PORT:5432 -d quay.io/enterprisedb/postgresql + +echo "Starting Redis on port $REDIS_PORT..." +$DOCKER run --name $REDIS_CONTAINER -p $REDIS_PORT:6379 -d quay.io/openshiftci/redis:latest + +echo "Waiting for PostgreSQL to be ready..." +timeout=30 +elapsed=0 +until $DOCKER exec $PSQL_CONTAINER psql -U postgres -d postgres -c '\q' 2>/dev/null; do + if [ "$elapsed" -ge "$timeout" ]; then + echo "ERROR: PostgreSQL did not become ready within ${timeout}s" + exit 1 + fi + sleep 1 + elapsed=$((elapsed + 1)) +done + +DSN="postgresql://postgres:password@localhost:$PSQL_PORT/postgres" +REDIS_URL="redis://localhost:$REDIS_PORT" + +echo "Seeding database..." 
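+# Initialize the schema and seed the brand-new database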
+./sippy seed-data --init-database --database-dsn="$DSN" + +echo "" +echo "================================================" +echo " Dev environment ready" +echo " PostgreSQL: $DSN" +echo " Redis: $REDIS_URL" +echo "================================================" + +if [ "$SERVE" = true ]; then + set -- \ + --listen ":$SIPPY_API_PORT" \ + --listen-metrics ":12112" \ + --database-dsn="$DSN" \ + --enable-write-endpoints \ + --log-level debug \ + --views config/e2e-views.yaml \ + --redis-url="$REDIS_URL" \ + --data-provider postgres + if [ -n "$GCS_SA_JSON_PATH" ]; then + set -- "$@" --google-service-account-credential-file "$GCS_SA_JSON_PATH" + fi + + echo "" + echo "Starting sippy on http://localhost:$SIPPY_API_PORT ..." + echo "Press Ctrl-C to stop" + echo "" + + ./sippy serve "$@" & + CHILD_PID=$! + + wait $CHILD_PID +else + echo "" + echo "To start sippy against this database:" + echo " ./sippy serve --database-dsn=\"$DSN\" --redis-url=\"$REDIS_URL\" --data-provider postgres --views config/e2e-views.yaml --log-level debug" + echo "" + echo "Press Ctrl-C to tear down containers" + # Keep containers alive until user hits Ctrl-C + while true; do sleep 60; done +fi diff --git a/scripts/e2e.sh b/scripts/e2e.sh index 8c8487217c..2a9672f4cb 100755 --- a/scripts/e2e.sh +++ b/scripts/e2e.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/bin/sh # Shell script meant for developers to run the e2e tests locally without impacting # their running postgres container or sippy process. # It's quite quick to import the older releases below, but in theory @@ -11,14 +11,17 @@ PSQL_PORT="23433" REDIS_CONTAINER="sippy-e2e-test-redis" REDIS_PORT="23479" -if [[ -z "$GCS_SA_JSON_PATH" ]]; then - echo "Must provide path to GCS credential in GCS_SA_JSON_PATH env var" 1>&2 - exit 1 +if [ -z "$GCS_SA_JSON_PATH" ]; then + echo "WARNING: GCS_SA_JSON_PATH not set, data sync and BigQuery tests will be skipped" 1>&2 fi +E2E_EXIT_CODE=0 clean_up () { ARG=$? + if [ $ARG -ne 0 ]; then + E2E_EXIT_CODE=$ARG + fi echo "Stopping sippy API child process: $CHILD_PID" kill $CHILD_PID 2>/dev/null && wait $CHILD_PID 2>/dev/null # Generate coverage report from the server's coverage data @@ -27,10 +30,13 @@ clean_up () { go tool covdata percent -i="$COVDIR" go tool covdata textfmt -i="$COVDIR" -o=e2e-coverage.out # Merge test binary coverage (from -coverprofile) into server binary coverage - if [ -f e2e-test-coverage.out ]; then - echo "Merging test binary coverage into server coverage..." - tail -n +2 e2e-test-coverage.out >> e2e-coverage.out - fi + for f in e2e-test-coverage.out e2e-bq-test-coverage.out unit-test-coverage.out; do + if [ -f "$f" ]; then + echo "Merging $f into server coverage..." + tail -n +2 "$f" >> e2e-coverage.out + rm -f "$f" + fi + done echo "Coverage data written to e2e-coverage.out" echo "View HTML report: go tool cover -html=e2e-coverage.out -o=e2e-coverage.html" fi @@ -40,7 +46,23 @@ clean_up () { echo "Tearing down container $REDIS_CONTAINER" $DOCKER stop -i $REDIS_CONTAINER $DOCKER rm -i $REDIS_CONTAINER - exit $ARG + exit $E2E_EXIT_CODE +} + +wait_for_sippy() { + echo "Waiting for sippy API to start on port $SIPPY_API_PORT..." 
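+ # Poll the health endpoint every 2s, for up to 10 minutes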
+ TIMEOUT=600 + ELAPSED=0 + while [ $ELAPSED -lt $TIMEOUT ]; do + if curl -s "http://localhost:$SIPPY_API_PORT/api/health" > /dev/null 2>&1; then + echo "Sippy API is ready after ${ELAPSED}s" + return 0 + fi + sleep 2 + ELAPSED=$((ELAPSED + 2)) + done + echo "Timeout waiting for sippy API to start after ${TIMEOUT}s" + return 1 } trap clean_up EXIT @@ -65,6 +87,7 @@ sleep 5 export SIPPY_E2E_DSN="postgresql://postgres:password@localhost:$PSQL_PORT/postgres" export REDIS_URL="redis://localhost:$REDIS_PORT" +export SIPPY_E2E_REPO_ROOT="$(pwd)" # Build with coverage instrumentation COVDIR="$(pwd)/e2e-coverage" @@ -76,45 +99,39 @@ go build -cover -coverpkg=./cmd/...,./pkg/... -mod vendor -o ./sippy ./cmd/sippy echo "Loading database..." GOCOVERDIR="$COVDIR" ./sippy seed-data \ --init-database \ - --database-dsn="$SIPPY_E2E_DSN" \ - --release="4.20" + --database-dsn="$SIPPY_E2E_DSN" # Spawn sippy server off into a separate process: export SIPPY_API_PORT="18080" export SIPPY_ENDPOINT="127.0.0.1" -GOCOVERDIR="$COVDIR" ./sippy serve \ +set -- \ --listen ":$SIPPY_API_PORT" \ --listen-metrics ":12112" \ --database-dsn="$SIPPY_E2E_DSN" \ --enable-write-endpoints \ --log-level debug \ --views config/e2e-views.yaml \ - --google-service-account-credential-file $GCS_SA_JSON_PATH \ - --redis-url="$REDIS_URL" > e2e.log 2>&1 & -CHILD_PID=$! + --redis-url="$REDIS_URL" \ + --data-provider postgres +if [ -n "$GCS_SA_JSON_PATH" ]; then + set -- "$@" --google-service-account-credential-file "$GCS_SA_JSON_PATH" +fi -# Give it time to start up, and fill the redis cache -echo "Waiting for sippy API to start on port $SIPPY_API_PORT, see e2e.log for output..." -TIMEOUT=600 -ELAPSED=0 -while [ $ELAPSED -lt $TIMEOUT ]; do - if curl -s "http://localhost:$SIPPY_API_PORT/api/health" > /dev/null 2>&1; then - echo "Sippy API is ready after ${ELAPSED}s" - break - fi - sleep 2 - ELAPSED=$((ELAPSED + 2)) -done +GOCOVERDIR="$COVDIR" ./sippy serve "$@" > e2e.log 2>&1 & +CHILD_PID=$! -if [ $ELAPSED -ge $TIMEOUT ]; then - echo "Timeout waiting for sippy API to start after ${TIMEOUT}s" - exit 1 -fi +wait_for_sippy || exit 1 +# Prime the component readiness cache so triage tests can find cached reports +echo "Priming component readiness cache..." +VIEWS=$(curl -sf "http://localhost:$SIPPY_API_PORT/api/component_readiness/views") || { echo "Failed to fetch views"; exit 1; } +for VIEW in $(echo "$VIEWS" | grep -o '"name":"[^"]*"' | cut -d'"' -f4); do + echo " Priming cache for view: $VIEW" + curl -sf "http://localhost:$SIPPY_API_PORT/api/component_readiness?view=$VIEW" > /dev/null || { echo "Failed to prime cache for view: $VIEW"; exit 1; } +done +echo "Cache priming complete" -# Run our tests that request against the API, args ensure serially and fresh test code compile. -# All output is tee'd to e2e-test.log so results can be reviewed without re-running. -gotestsum ./test/e2e/... -count 1 -p 1 -coverprofile=e2e-test-coverage.out -coverpkg=./pkg/...,./cmd/... 2>&1 | tee e2e-test.log -E2E_EXIT=${PIPESTATUS[0]} -exit $E2E_EXIT +# Run e2e tests +gotestsum ./test/e2e/... -count 1 -p 1 -coverprofile=e2e-test-coverage.out -coverpkg=./pkg/...,./cmd/... +E2E_EXIT_CODE=$? 
diff --git a/test/e2e/util/e2erequest.go b/test/e2e/util/e2erequest.go index aca9bb91cb..ca8dbd40f2 100644 --- a/test/e2e/util/e2erequest.go +++ b/test/e2e/util/e2erequest.go @@ -12,9 +12,9 @@ import ( ) const ( - // Needs to match what we import in the e2e.sh script - Release = "4.20" - BaseRelease = "4.19" + // Needs to match the releases in config/e2e-views.yaml + Release = "4.22" + BaseRelease = "4.21" // APIPort is the port e2e.sh launches the sippy API on. These values must be kept in sync. APIPort = 18080