From 4dc6ffef8b83fb320ee451d733a2d6c65e11a034 Mon Sep 17 00:00:00 2001 From: "ivan.hladush" Date: Tue, 4 Nov 2025 15:54:10 -0700 Subject: [PATCH 1/3] Add functionality for old jobs removal --- configs/local.yaml | 3 + internal/pkg/janitor/janitor.go | 18 ++++-- internal/pkg/janitor/job.go | 57 +++++++++++++++++++ .../queries/old_jobs_cluster_tags_delete.sql | 4 ++ .../queries/old_jobs_command_tags_delete.sql | 5 ++ .../pkg/janitor/queries/old_jobs_delete.sql | 3 + .../janitor/queries/old_jobs_tags_delete.sql | 4 ++ 7 files changed, 89 insertions(+), 5 deletions(-) create mode 100644 internal/pkg/janitor/queries/old_jobs_cluster_tags_delete.sql create mode 100644 internal/pkg/janitor/queries/old_jobs_command_tags_delete.sql create mode 100644 internal/pkg/janitor/queries/old_jobs_delete.sql create mode 100644 internal/pkg/janitor/queries/old_jobs_tags_delete.sql diff --git a/configs/local.yaml b/configs/local.yaml index b420f30..a61c8da 100644 --- a/configs/local.yaml +++ b/configs/local.yaml @@ -11,6 +11,9 @@ pool: # plugins location plugin_directory: ./plugins +janitor: + finished_job_retention_days: 14 + # auth plugin auth: plugin: ./plugins/auth_header.so diff --git a/internal/pkg/janitor/janitor.go b/internal/pkg/janitor/janitor.go index 165d5ef..9a39094 100644 --- a/internal/pkg/janitor/janitor.go +++ b/internal/pkg/janitor/janitor.go @@ -1,16 +1,21 @@ package janitor import ( - "fmt" "time" + "github.com/hladush/go-telemetry/pkg/telemetry" "github.com/patterninc/heimdall/internal/pkg/database" ) +var ( + startMethod = telemetry.NewMethod("Start", "janitor") +) + type Janitor struct { - Keepalive int `yaml:"keepalive,omitempty" json:"keepalive,omitempty"` - StaleJob int `yaml:"stale_job,omitempty" json:"stale_job,omitempty"` - db *database.Database + Keepalive int `yaml:"keepalive,omitempty" json:"keepalive,omitempty"` + StaleJob int `yaml:"stale_job,omitempty" json:"stale_job,omitempty"` + FinishedJobRetentionDays int 
`yaml:"finished_job_retention_days,omitempty" json:"finished_job_retention_days,omitempty"` + db *database.Database } func (j *Janitor) Start(d *database.Database) error { @@ -29,9 +34,12 @@ func (j *Janitor) Start(d *database.Database) error { for { if err := j.cleanupStaleJobs(); err != nil { - fmt.Println(`Janitor error:`, err) + startMethod.LogAndCountError(err, "cleanup_stale_jobs") } + if err := j.cleanupFinishedJobs(); err != nil { + startMethod.LogAndCountError(err, "cleanup_finished_jobs") + } time.Sleep(60 * time.Second) } diff --git a/internal/pkg/janitor/job.go b/internal/pkg/janitor/job.go index 2e4ea80..4de8819 100644 --- a/internal/pkg/janitor/job.go +++ b/internal/pkg/janitor/job.go @@ -2,6 +2,7 @@ package janitor import ( _ "embed" + "fmt" "github.com/patterninc/heimdall/internal/pkg/database" ) @@ -15,6 +16,27 @@ var queryFailStaleJobs string //go:embed queries/stale_jobs_delete.sql var queryStaleJobsDelete string +//go:embed queries/old_jobs_cluster_tags_delete.sql +var queryOldJobsClusterTagsDelete string + +//go:embed queries/old_jobs_command_tags_delete.sql +var queryOldJobsCommandTagsDelete string + +//go:embed queries/old_jobs_tags_delete.sql +var queryOldJobsTagsDelete string + +//go:embed queries/old_jobs_delete.sql +var queryOldJobsDelete string + +var ( + queriesForOldJobsCleanup = []string{ + queryOldJobsClusterTagsDelete, + queryOldJobsCommandTagsDelete, + queryOldJobsTagsDelete, + queryOldJobsDelete, + } +) + func (j *Janitor) cleanupStaleJobs() error { // let's find the jobs we'll be cleaning up... 
@@ -74,3 +96,38 @@ func (j *Janitor) cleanupStaleJobs() error { return nil } + +func (j *Janitor) cleanupFinishedJobs() error { + if j.FinishedJobRetentionDays == 0 { + return nil + } + // Start transactional session + sess, err := j.db.NewSession(true) + if err != nil { + return err + } + defer sess.Close() + + defer func() { + _ = sess.Rollback() + }() + + exec := func(query string, args ...any) error { + if _, err := sess.Exec(query, args...); err != nil { + return fmt.Errorf("failed to exec query %q: %w", query, err) + } + return nil + } + + for _, q := range queriesForOldJobsCleanup { + if err := exec(q, j.FinishedJobRetentionDays); err != nil { + return err + } + } + + if err := sess.Commit(); err != nil { + return fmt.Errorf("failed to commit cleanup transaction: %w", err) + } + + return nil +} diff --git a/internal/pkg/janitor/queries/old_jobs_cluster_tags_delete.sql b/internal/pkg/janitor/queries/old_jobs_cluster_tags_delete.sql new file mode 100644 index 0000000..8ce92ae --- /dev/null +++ b/internal/pkg/janitor/queries/old_jobs_cluster_tags_delete.sql @@ -0,0 +1,4 @@ +DELETE FROM job_cluster_tags +WHERE system_job_id IN ( + SELECT system_job_id FROM jobs WHERE updated_at < extract(epoch FROM now() - ($1 || ' days')::interval)::int +); diff --git a/internal/pkg/janitor/queries/old_jobs_command_tags_delete.sql b/internal/pkg/janitor/queries/old_jobs_command_tags_delete.sql new file mode 100644 index 0000000..73d02ce --- /dev/null +++ b/internal/pkg/janitor/queries/old_jobs_command_tags_delete.sql @@ -0,0 +1,5 @@ + +DELETE FROM job_command_tags +WHERE system_job_id IN ( + SELECT system_job_id FROM jobs WHERE updated_at < extract(epoch FROM now() - ($1 || ' days')::interval)::int +); diff --git a/internal/pkg/janitor/queries/old_jobs_delete.sql b/internal/pkg/janitor/queries/old_jobs_delete.sql new file mode 100644 index 0000000..d0614a5 --- /dev/null +++ b/internal/pkg/janitor/queries/old_jobs_delete.sql @@ -0,0 +1,3 @@ + +DELETE FROM jobs +WHERE updated_at 
< extract(epoch FROM now() - ($1 || ' days')::interval)::int; diff --git a/internal/pkg/janitor/queries/old_jobs_tags_delete.sql b/internal/pkg/janitor/queries/old_jobs_tags_delete.sql new file mode 100644 index 0000000..31c80dd --- /dev/null +++ b/internal/pkg/janitor/queries/old_jobs_tags_delete.sql @@ -0,0 +1,4 @@ +DELETE FROM job_tags +WHERE system_job_id IN ( + SELECT system_job_id FROM jobs WHERE updated_at < extract(epoch FROM now() - ($1 || ' days')::interval)::int +); From 3dad5431398bed776e28cc22a5df2fb9f261f822 Mon Sep 17 00:00:00 2001 From: "ivan.hladush" Date: Tue, 11 Nov 2025 12:20:17 -0700 Subject: [PATCH 2/3] Improve code remove transaction management from it --- internal/pkg/janitor/janitor.go | 2 +- internal/pkg/janitor/job.go | 34 ++++++++++++------- .../janitor/queries/old_job_biggest_id.sql | 1 + .../queries/old_jobs_cluster_tags_delete.sql | 5 ++- .../queries/old_jobs_command_tags_delete.sql | 7 ++-- .../pkg/janitor/queries/old_jobs_delete.sql | 7 +++- .../janitor/queries/old_jobs_tags_delete.sql | 7 ++-- 7 files changed, 43 insertions(+), 20 deletions(-) create mode 100644 internal/pkg/janitor/queries/old_job_biggest_id.sql diff --git a/internal/pkg/janitor/janitor.go b/internal/pkg/janitor/janitor.go index 9a39094..81c847c 100644 --- a/internal/pkg/janitor/janitor.go +++ b/internal/pkg/janitor/janitor.go @@ -8,7 +8,7 @@ import ( ) var ( - startMethod = telemetry.NewMethod("Start", "janitor") + startMethod = telemetry.NewMethod("Start", "Janitor") ) type Janitor struct { diff --git a/internal/pkg/janitor/job.go b/internal/pkg/janitor/job.go index 4de8819..62fda8d 100644 --- a/internal/pkg/janitor/job.go +++ b/internal/pkg/janitor/job.go @@ -1,6 +1,7 @@ package janitor import ( + "database/sql" _ "embed" "fmt" @@ -28,6 +29,9 @@ var queryOldJobsTagsDelete string //go:embed queries/old_jobs_delete.sql var queryOldJobsDelete string +//go:embed queries/old_job_biggest_id.sql +var queryOldJobsBiggestID string + var ( queriesForOldJobsCleanup = 
[]string{ queryOldJobsClusterTagsDelete, @@ -101,33 +105,37 @@ func (j *Janitor) cleanupFinishedJobs() error { if j.FinishedJobRetentionDays == 0 { return nil } - // Start transactional session - sess, err := j.db.NewSession(true) + // open session + sess, err := j.db.NewSession(false) if err != nil { return err } defer sess.Close() - defer func() { - _ = sess.Rollback() - }() + // get biggest ID of old jobs + row, err := sess.QueryRow(queryOldJobsBiggestID, j.FinishedJobRetentionDays) + if err != nil { + return fmt.Errorf("failed to get biggest ID of old jobs: %w", err) + } - exec := func(query string, args ...any) error { - if _, err := sess.Exec(query, args...); err != nil { - return fmt.Errorf("failed to exec query %q: %w", query, err) + var biggestID sql.NullInt64 + if err := row.Scan(&biggestID); err != nil { + if err == sql.ErrNoRows { + return nil } + return fmt.Errorf("failed to get biggest ID of old jobs: %w", err) + } + + if !biggestID.Valid || biggestID.Int64 == 0 { return nil } + // remove old jobs data for _, q := range queriesForOldJobsCleanup { - if err := exec(q, j.FinishedJobRetentionDays); err != nil { + if _, err := sess.Exec(q, biggestID.Int64); err != nil { return err } } - if err := sess.Commit(); err != nil { - return fmt.Errorf("failed to commit cleanup transaction: %w", err) - } - return nil } diff --git a/internal/pkg/janitor/queries/old_job_biggest_id.sql b/internal/pkg/janitor/queries/old_job_biggest_id.sql new file mode 100644 index 0000000..ce949e4 --- /dev/null +++ b/internal/pkg/janitor/queries/old_job_biggest_id.sql @@ -0,0 +1 @@ +SELECT MAX(system_job_id) FROM jobs WHERE updated_at < extract(epoch FROM now() - ($1 || ' days')::interval)::int; \ No newline at end of file diff --git a/internal/pkg/janitor/queries/old_jobs_cluster_tags_delete.sql b/internal/pkg/janitor/queries/old_jobs_cluster_tags_delete.sql index 8ce92ae..dd67f40 100644 --- a/internal/pkg/janitor/queries/old_jobs_cluster_tags_delete.sql +++ 
b/internal/pkg/janitor/queries/old_jobs_cluster_tags_delete.sql @@ -1,4 +1,7 @@ DELETE FROM job_cluster_tags WHERE system_job_id IN ( - SELECT system_job_id FROM jobs WHERE updated_at < extract(epoch FROM now() - ($1 || ' days')::interval)::int + SELECT system_job_id + FROM job_cluster_tags + WHERE system_job_id <= $1 + LIMIT 1000 ); diff --git a/internal/pkg/janitor/queries/old_jobs_command_tags_delete.sql b/internal/pkg/janitor/queries/old_jobs_command_tags_delete.sql index 73d02ce..aa8a573 100644 --- a/internal/pkg/janitor/queries/old_jobs_command_tags_delete.sql +++ b/internal/pkg/janitor/queries/old_jobs_command_tags_delete.sql @@ -1,5 +1,8 @@ DELETE FROM job_command_tags WHERE system_job_id IN ( - SELECT system_job_id FROM jobs WHERE updated_at < extract(epoch FROM now() - ($1 || ' days')::interval)::int -); + SELECT system_job_id + FROM job_command_tags + WHERE system_job_id <= $1 + LIMIT 1000 +); \ No newline at end of file diff --git a/internal/pkg/janitor/queries/old_jobs_delete.sql b/internal/pkg/janitor/queries/old_jobs_delete.sql index d0614a5..ca76c50 100644 --- a/internal/pkg/janitor/queries/old_jobs_delete.sql +++ b/internal/pkg/janitor/queries/old_jobs_delete.sql @@ -1,3 +1,8 @@ DELETE FROM jobs -WHERE updated_at < extract(epoch FROM now() - ($1 || ' days')::interval)::int; +WHERE system_job_id IN ( + SELECT system_job_id + FROM jobs + WHERE system_job_id <= $1 + LIMIT 1000 +); \ No newline at end of file diff --git a/internal/pkg/janitor/queries/old_jobs_tags_delete.sql b/internal/pkg/janitor/queries/old_jobs_tags_delete.sql index 31c80dd..55be5b1 100644 --- a/internal/pkg/janitor/queries/old_jobs_tags_delete.sql +++ b/internal/pkg/janitor/queries/old_jobs_tags_delete.sql @@ -1,4 +1,7 @@ DELETE FROM job_tags WHERE system_job_id IN ( - SELECT system_job_id FROM jobs WHERE updated_at < extract(epoch FROM now() - ($1 || ' days')::interval)::int -); + SELECT system_job_id + FROM job_tags + WHERE system_job_id <= $1 + LIMIT 1000 +); \ No newline at end 
of file From be2aa816a13a217c2a739fc8d9e70307514e3a71 Mon Sep 17 00:00:00 2001 From: "ivan.hladush" Date: Thu, 13 Nov 2025 12:27:59 -0700 Subject: [PATCH 3/3] Fix review comments --- internal/pkg/janitor/job.go | 17 +++++++++++++---- .../pkg/janitor/queries/old_job_biggest_id.sql | 6 +++++- .../queries/old_jobs_cluster_tags_delete.sql | 2 +- .../queries/old_jobs_command_tags_delete.sql | 2 +- .../pkg/janitor/queries/old_jobs_delete.sql | 2 +- .../janitor/queries/old_jobs_tags_delete.sql | 2 +- 6 files changed, 22 insertions(+), 9 deletions(-) diff --git a/internal/pkg/janitor/job.go b/internal/pkg/janitor/job.go index 62fda8d..290644d 100644 --- a/internal/pkg/janitor/job.go +++ b/internal/pkg/janitor/job.go @@ -4,6 +4,7 @@ import ( "database/sql" _ "embed" "fmt" + "time" "github.com/patterninc/heimdall/internal/pkg/database" ) @@ -112,8 +113,10 @@ func (j *Janitor) cleanupFinishedJobs() error { } defer sess.Close() + retentionTimestamp := time.Now().AddDate(0, 0, -j.FinishedJobRetentionDays).Unix() + // get biggest ID of old jobs - row, err := sess.QueryRow(queryOldJobsBiggestID, j.FinishedJobRetentionDays) + row, err := sess.QueryRow(queryOldJobsBiggestID, retentionTimestamp) if err != nil { return fmt.Errorf("failed to get biggest ID of old jobs: %w", err) } @@ -125,15 +128,21 @@ func (j *Janitor) cleanupFinishedJobs() error { } return fmt.Errorf("failed to get biggest ID of old jobs: %w", err) } - + if !biggestID.Valid || biggestID.Int64 == 0 { return nil } // remove old jobs data for _, q := range queriesForOldJobsCleanup { - if _, err := sess.Exec(q, biggestID.Int64); err != nil { - return err + for { + affectedRows, err := sess.Exec(q, biggestID.Int64) + if err != nil { + return err + } + if affectedRows == 0 { + break + } } } diff --git a/internal/pkg/janitor/queries/old_job_biggest_id.sql b/internal/pkg/janitor/queries/old_job_biggest_id.sql index ce949e4..a49bb73 100644 --- a/internal/pkg/janitor/queries/old_job_biggest_id.sql +++ 
b/internal/pkg/janitor/queries/old_job_biggest_id.sql @@ -1 +1,5 @@ -SELECT MAX(system_job_id) FROM jobs WHERE updated_at < extract(epoch FROM now() - ($1 || ' days')::interval)::int; \ No newline at end of file +SELECT system_job_id +FROM jobs +WHERE updated_at < $1 +ORDER BY updated_at desc +LIMIT 1 \ No newline at end of file diff --git a/internal/pkg/janitor/queries/old_jobs_cluster_tags_delete.sql b/internal/pkg/janitor/queries/old_jobs_cluster_tags_delete.sql index dd67f40..bcd0305 100644 --- a/internal/pkg/janitor/queries/old_jobs_cluster_tags_delete.sql +++ b/internal/pkg/janitor/queries/old_jobs_cluster_tags_delete.sql @@ -3,5 +3,5 @@ WHERE system_job_id IN ( SELECT system_job_id FROM job_cluster_tags WHERE system_job_id <= $1 - LIMIT 1000 + LIMIT 100 ); diff --git a/internal/pkg/janitor/queries/old_jobs_command_tags_delete.sql b/internal/pkg/janitor/queries/old_jobs_command_tags_delete.sql index aa8a573..5b2d570 100644 --- a/internal/pkg/janitor/queries/old_jobs_command_tags_delete.sql +++ b/internal/pkg/janitor/queries/old_jobs_command_tags_delete.sql @@ -4,5 +4,5 @@ WHERE system_job_id IN ( SELECT system_job_id FROM job_command_tags WHERE system_job_id <= $1 - LIMIT 1000 + LIMIT 100 ); \ No newline at end of file diff --git a/internal/pkg/janitor/queries/old_jobs_delete.sql b/internal/pkg/janitor/queries/old_jobs_delete.sql index ca76c50..d248e5b 100644 --- a/internal/pkg/janitor/queries/old_jobs_delete.sql +++ b/internal/pkg/janitor/queries/old_jobs_delete.sql @@ -4,5 +4,5 @@ WHERE system_job_id IN ( SELECT system_job_id FROM jobs WHERE system_job_id <= $1 - LIMIT 1000 + LIMIT 100 ); \ No newline at end of file diff --git a/internal/pkg/janitor/queries/old_jobs_tags_delete.sql b/internal/pkg/janitor/queries/old_jobs_tags_delete.sql index 55be5b1..110b778 100644 --- a/internal/pkg/janitor/queries/old_jobs_tags_delete.sql +++ b/internal/pkg/janitor/queries/old_jobs_tags_delete.sql @@ -3,5 +3,5 @@ WHERE system_job_id IN ( SELECT system_job_id FROM 
job_tags WHERE system_job_id <= $1 - LIMIT 1000 + LIMIT 100 ); \ No newline at end of file