Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion api-service/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,8 @@ func main() {
if err != nil {
log.Fatalf("Invalid cleanup interval: %v", err)
}
cleanManager := service.NewAEnvCleanManager(service.NewKubeCleaner(scheduleClient), interval)
cleanManager := service.NewAEnvCleanManager(scheduleClient, interval).
WithMetrics(middleware.IncrementCleanupSuccess, middleware.IncrementCleanupFailure)
go cleanManager.Start()
defer cleanManager.Stop()

Expand Down
25 changes: 25 additions & 0 deletions api-service/middleware/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,33 @@ var (
},
[]string{"method", "endpoint"},
)

// Auto cleanup metrics
cleanupSuccessCount = promauto.NewCounter(
prometheus.CounterOpts{
Name: "auto_cleanup_success_total",
Help: "Total number of successfully auto-cleaned instances",
},
)

cleanupFailureCount = promauto.NewCounter(
prometheus.CounterOpts{
Name: "auto_cleanup_failure_total",
Help: "Total number of failed auto-cleanup attempts",
},
)
)

// IncrementCleanupSuccess increments the cleanup success counter
func IncrementCleanupSuccess() {
cleanupSuccessCount.Inc()
}

// IncrementCleanupFailure increments the cleanup failure counter
func IncrementCleanupFailure() {
cleanupFailureCount.Inc()
}

// MetricsMiddleware metrics collection middleware
func MetricsMiddleware() gin.HandlerFunc {
return func(c *gin.Context) {
Expand Down
114 changes: 91 additions & 23 deletions api-service/service/cleanup_service.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,40 +17,52 @@ limitations under the License.
package service

import (
"api-service/models"
"context"
"log"
"time"
)

type AEnvCleaner interface {
cleanup()
}

type AEnvCleanManager struct {
cleaner AEnvCleaner
envInstanceService EnvInstanceService

interval time.Duration
ctx context.Context
cancel context.CancelFunc

// Metrics counters
incrementCleanupSuccess func()
incrementCleanupFailure func()
}

func NewAEnvCleanManager(cleaner AEnvCleaner, duration time.Duration) *AEnvCleanManager {
func NewAEnvCleanManager(envInstanceService EnvInstanceService, duration time.Duration) *AEnvCleanManager {
ctx, cancel := context.WithCancel(context.Background())
AEnvCleanManager := &AEnvCleanManager{
cleaner: cleaner,
envInstanceService: envInstanceService,

interval: duration,
ctx: ctx,
cancel: cancel,

// Default metrics functions
incrementCleanupSuccess: func() {},
incrementCleanupFailure: func() {},
}
return AEnvCleanManager
}

// WithMetrics sets the metrics functions for the clean manager
func (cm *AEnvCleanManager) WithMetrics(incrementSuccess, incrementFailure func()) *AEnvCleanManager {
cm.incrementCleanupSuccess = incrementSuccess
cm.incrementCleanupFailure = incrementFailure
return cm
}

// Start starts the cleanup service
func (cm *AEnvCleanManager) Start() {
log.Printf("Starting cleanup service with interval: %v", cm.interval)
// Execute cleanup immediately
cm.cleaner.cleanup()
cm.performCleanup()

// Start timer
ticker := time.NewTicker(cm.interval)
Expand All @@ -59,7 +71,7 @@ func (cm *AEnvCleanManager) Start() {
for {
select {
case <-ticker.C:
cm.cleaner.cleanup()
cm.performCleanup()
case <-cm.ctx.Done():
log.Println("Cleanup service stopped")
return
Expand All @@ -68,24 +80,80 @@ func (cm *AEnvCleanManager) Start() {
}()
}

// Stop stops the cleanup service
func (cm *AEnvCleanManager) Stop() {
cm.cancel()
}
// performCleanup performs the actual cleanup task by checking TTL expiration
func (cm *AEnvCleanManager) performCleanup() {
log.Println("Starting TTL-based cleanup task...")

// Get all environment instances
envInstances, err := cm.envInstanceService.ListEnvInstances("")
if err != nil {
log.Printf("Failed to list environment instances: %v", err)
return
}
Comment on lines +88 to +92
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The current implementation of performCleanup fetches all environment instances and then filters for expired ones on the client side. This could lead to performance issues if the number of instances is large, as it puts a load on both the network and the api-service memory. The previous implementation delegated filtering to the backend (.../pods?filter=expired), which is a more scalable approach. Consider re-introducing server-side filtering to avoid fetching all instances into memory.


if len(envInstances) == 0 {
log.Println("No environment instances found")
return
}

var deletedCount int

// KubeCleaner cleanup service responsible for periodically cleaning expired EnvInstances
type KubeCleaner struct {
scheduleClient EnvInstanceService
// Check each instance for TTL expiration
for _, instance := range envInstances {
// Skip already terminated instances
if instance.Status == "Terminated" {
continue
}

// Check if TTL is set and has expired
if cm.isExpired(instance) {
log.Printf("Instance %s has expired (TTL: %s), deleting...", instance.ID, instance.TTL)
err := cm.envInstanceService.DeleteEnvInstance(instance.ID)
if err != nil {
log.Printf("Failed to delete expired instance %s: %v", instance.ID, err)
cm.incrementCleanupFailure()
continue
}
deletedCount++
cm.incrementCleanupSuccess()
log.Printf("Successfully deleted expired instance %s", instance.ID)
}
}

log.Printf("TTL-based cleanup task completed. Deleted %d expired instances", deletedCount)
}

// NewCleanupService
func NewKubeCleaner(scheduleClient EnvInstanceService) *KubeCleaner {
return &KubeCleaner{
scheduleClient: scheduleClient,
// isExpired checks if an environment instance has expired based on its TTL and creation time
func (cm *AEnvCleanManager) isExpired(instance *models.EnvInstance) bool {
// If TTL is not set, consider it as non-expiring
if instance.TTL == "" {
return false
}

// Parse TTL duration
ttlDuration, err := time.ParseDuration(instance.TTL)
if err != nil {
log.Printf("Failed to parse TTL '%s' for instance %s: %v", instance.TTL, instance.ID, err)
return false
}

// Parse creation time using time.DateTime format (2006-01-02 15:04:05)
createdAt, err := time.Parse(time.DateTime, instance.CreatedAt)
if err != nil {
// Fallback to RFC3339 if DateTime parsing fails
createdAt, err = time.Parse(time.RFC3339, instance.CreatedAt)
if err != nil {
log.Printf("Failed to parse creation time '%s' for instance %s: %v", instance.CreatedAt, instance.ID, err)
return false
}
}

// Check if the instance has expired
expirationTime := createdAt.Add(ttlDuration)
return time.Now().After(expirationTime)
}

// cleanup executes cleanup task
func (cs *KubeCleaner) cleanup() {
_ = cs.scheduleClient.Cleanup()
// Stop stops the cleanup service
func (cm *AEnvCleanManager) Stop() {
cm.cancel()
}
Loading
Loading