From 434ef85cd0e28d3253eb37a91315ee7321ed6c54 Mon Sep 17 00:00:00 2001
From: "W. Trevor King"
Date: Thu, 14 Feb 2019 21:46:16 -0800
Subject: [PATCH] cmd/ci-operator: Add --grace-period

While two seconds may be sufficient teardown time for unit tests and
such, it is too short for e2e teardown, where cluster logs must be
collected, cluster resources reaped, and the collected assets
uploaded.  The two-second timeout is from c2ac3abf (When interrupted,
mark any in progress pods/templates as deleted, 2018-05-04, #16).
Actually, I'm not sure how the two-second default worked even for
unit tests, because Kubernetes pods have a 30-second termination
grace period by default [1].

A recent aborted e2e job that did not run teardown to completion or
collect any cluster assets is [2,3].  A recent successful e2e job
showing current teardown timing is [4,5].  From [4]:

  2019/02/14 18:19:54 Container setup in pod e2e-aws completed successfully
  2019/02/14 18:46:08 Container test in pod e2e-aws completed successfully
  2019/02/14 18:51:38 Container teardown in pod e2e-aws completed successfully

So teardown took 5.5 minutes.  And from [5]:

  time="2019-02-14T18:47:40Z" level=debug msg="search for and delete matching resources by tag in us-east-1 matching aws.Filter{\"openshiftClusterID\":\"1341ccc2-66ac-46bb-ae15-0295d4a126ba\"}"
  ...
  time="2019-02-14T18:51:38Z" level=debug msg="Purging asset \"Cluster\" from disk"

So about 4 minutes of that was resource reaping, with the preceding
1.5 minutes spent on log collection.

[1]: https://kubernetes.io/docs/concepts/workloads/pods/pod/#termination-of-pods
[2]: https://storage.googleapis.com/origin-ci-test/pr-logs/pull/openshift_installer/1255/pull-ci-openshift-installer-master-e2e-aws/3707/build-log.txt
[3]: https://gcsweb-ci.svc.ci.openshift.org/gcs/origin-ci-test/pr-logs/pull/openshift_installer/1255/pull-ci-openshift-installer-master-e2e-aws/3707/artifacts/
[4]: https://storage.googleapis.com/origin-ci-test/pr-logs/pull/openshift_installer/1045/pull-ci-openshift-installer-master-e2e-aws/3706/build-log.txt
[5]: https://storage.googleapis.com/origin-ci-test/pr-logs/pull/openshift_installer/1045/pull-ci-openshift-installer-master-e2e-aws/3706/artifacts/e2e-aws/installer/.openshift_install.log
---
 cmd/ci-operator/main.go | 9 +++++++--
 pkg/api/job_spec.go     | 3 +++
 pkg/steps/artifacts.go  | 2 +-
 pkg/steps/pod.go        | 8 +++++++-
 4 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/cmd/ci-operator/main.go b/cmd/ci-operator/main.go
index a688c01e..fcdb0a57 100644
--- a/cmd/ci-operator/main.go
+++ b/cmd/ci-operator/main.go
@@ -191,6 +191,7 @@ type options struct {
 	extraInputHash      stringSlice
 	idleCleanupDuration time.Duration
 	cleanupDuration     time.Duration
+	graceDuration       time.Duration
 
 	inputHash string
 	secrets   []*coreapi.Secret
@@ -207,6 +208,7 @@ func bindOptions(flag *flag.FlagSet) *options {
 	opt := &options{
 		idleCleanupDuration: time.Duration(1 * time.Hour),
 		cleanupDuration:     time.Duration(12 * time.Hour),
+		graceDuration:       time.Duration(coreapi.DefaultTerminationGracePeriodSeconds * time.Second),
 	}
 
 	// command specific options
@@ -229,6 +231,7 @@
 	flag.StringVar(&opt.baseNamespace, "base-namespace", "stable", "Namespace to read builds from, defaults to stable.")
 	flag.DurationVar(&opt.idleCleanupDuration, "delete-when-idle", opt.idleCleanupDuration, "If no pod is running for longer than this interval, delete the namespace. Set to zero to retain the contents. Requires the namespace TTL controller to be deployed.")
 	flag.DurationVar(&opt.cleanupDuration, "delete-after", opt.cleanupDuration, "If namespace exists for longer than this interval, delete the namespace. Set to zero to retain the contents. Requires the namespace TTL controller to be deployed.")
+	flag.DurationVar(&opt.graceDuration, "grace-period", opt.graceDuration, "Interval to wait after requesting pod termination before killing the pod. ci-operator itself will wait an additional 30 seconds to try to receive the successful deletion response.")
 
 	// actions to add to the graph
 	flag.BoolVar(&opt.promote, "promote", false, "When all other targets complete, publish the set of images built by this job into the release configuration.")
@@ -296,6 +299,7 @@ func (o *options) Complete() error {
 		jobSpec.Refs = spec.Refs
 	}
 	jobSpec.BaseNamespace = o.baseNamespace
+	jobSpec.GracePeriod = &o.graceDuration
 	o.jobSpec = jobSpec
 
 	if o.dry && o.verbose {
@@ -392,9 +396,10 @@ func (o *options) Run() error {
 		if o.dry {
 			os.Exit(0)
 		}
-		log.Printf("error: Process interrupted with signal %s, exiting in 2s ...", s)
+		duration := o.graceDuration + 30*time.Second
+		log.Printf("error: Process interrupted with signal %s, exiting in %v ...", s, duration)
 		cancel()
-		time.Sleep(2 * time.Second)
+		time.Sleep(duration)
 		os.Exit(1)
 	}
 
diff --git a/pkg/api/job_spec.go b/pkg/api/job_spec.go
index dafee768..c67d8a23 100644
--- a/pkg/api/job_spec.go
+++ b/pkg/api/job_spec.go
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"os"
 	"strings"
+	"time"
 
 	meta "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
@@ -28,6 +29,8 @@ type JobSpec struct {
 	Namespace     string
 	BaseNamespace string
 
+	GracePeriod *time.Duration
+
 	// if set, any new artifacts will be a child of this object
 	owner *meta.OwnerReference
 }
diff --git a/pkg/steps/artifacts.go b/pkg/steps/artifacts.go
index 2a2cc6dd..15e66e64 100644
--- a/pkg/steps/artifacts.go
+++ b/pkg/steps/artifacts.go
@@ -384,7 +384,7 @@ func (w *ArtifactWorker) downloadArtifacts(podName string, hasArtifacts bool) er
 	}
 
 	defer func() {
-		// signal to artifacts container to gracefully shut don
+		// signal to artifacts container to gracefully shut down
 		err := removeFile(w.podClient, w.namespace, podName, "artifacts", []string{"/tmp/done"})
 		if err == nil {
 			return
diff --git a/pkg/steps/pod.go b/pkg/steps/pod.go
index f7252f78..e98f5150 100644
--- a/pkg/steps/pod.go
+++ b/pkg/steps/pod.go
@@ -5,6 +5,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"log"
+	"math"
 	"path/filepath"
 
 	coreapi "k8s.io/api/core/v1"
@@ -93,7 +94,12 @@ func (s *podStep) Run(ctx context.Context, dry bool) error {
 		<-ctx.Done()
 		notifier.Cancel()
 		log.Printf("cleanup: Deleting %s pod %s", s.name, s.config.As)
-		if err := s.podClient.Pods(s.jobSpec.Namespace).Delete(s.config.As, nil); err != nil && !errors.IsNotFound(err) {
+		opts := &meta.DeleteOptions{}
+		if s.jobSpec.GracePeriod != nil {
+			sec := int64(math.Ceil(s.jobSpec.GracePeriod.Seconds()))
+			opts.GracePeriodSeconds = &sec
+		}
+		if err := s.podClient.Pods(s.jobSpec.Namespace).Delete(s.config.As, opts); err != nil && !errors.IsNotFound(err) {
 			log.Printf("error: Could not delete %s pod: %v", s.name, err)
 		}
 	}()
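
For illustration, a minimal, self-contained sketch of the timing
arithmetic this patch introduces.  The ten-minute grace period is an
assumed example value, chosen to comfortably cover the ~5.5-minute
teardown measured in [4]; it is not a value from the patch itself:

  package main

  import (
  	"fmt"
  	"math"
  	"time"
  )

  func main() {
  	// Assumed example value, not from the patch: a budget that
  	// covers the ~5.5-minute e2e teardown observed in [4].
  	gracePeriod := 10 * time.Minute

  	// Mirrors pkg/steps/pod.go: round the duration up to whole
  	// seconds before sending it as DeleteOptions.GracePeriodSeconds.
  	sec := int64(math.Ceil(gracePeriod.Seconds()))
  	fmt.Printf("DeleteOptions.GracePeriodSeconds: %d\n", sec) // 600

  	// Mirrors cmd/ci-operator/main.go: on interrupt, sleep for the
  	// grace period plus 30 seconds so the pod-deletion response has
  	// time to arrive before the process exits.
  	fmt.Printf("exit delay after signal: %v\n", gracePeriod+30*time.Second) // 10m30s
  }

With a budget like that, a job would presumably be invoked with
"--grace-period 10m" (flag.DurationVar accepts Go duration strings),
and an interrupted ci-operator would wait 10m30s before exiting.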