From 434ef85cd0e28d3253eb37a91315ee7321ed6c54 Mon Sep 17 00:00:00 2001
From: "W. Trevor King"
Date: Thu, 14 Feb 2019 21:46:16 -0800
Subject: [PATCH] cmd/ci-operator: Add --grace-period

While two seconds may be sufficient teardown time for unit tests and
such, it is too short for e2e teardown, where cluster logs must be
collected, cluster resources reaped, and the collected assets
uploaded.  The two-second timeout is from c2ac3abf (When interrupted,
mark any in progress pods/templates as deleted, 2018-05-04, #16).
Actually, I'm not sure how the two-second default worked even for
unit tests, because Kubernetes pods have a 30-second termination
grace period by default [1].

A recent aborted e2e job that did not run teardown to completion or
collect any cluster assets is [2,3].  A recent successful e2e job
showing current teardown timing is [4,5].  From [4]:

  2019/02/14 18:19:54 Container setup in pod e2e-aws completed successfully
  2019/02/14 18:46:08 Container test in pod e2e-aws completed successfully
  2019/02/14 18:51:38 Container teardown in pod e2e-aws completed successfully

So teardown took 5.5 minutes.  And from [5]:

  time="2019-02-14T18:47:40Z" level=debug msg="search for and delete matching resources by tag in us-east-1 matching aws.Filter{\"openshiftClusterID\":\"1341ccc2-66ac-46bb-ae15-0295d4a126ba\"}"
  ...
  time="2019-02-14T18:51:38Z" level=debug msg="Purging asset \"Cluster\" from disk"

So about 4 minutes of that was resource reaping, with the preceding
1.5 minutes spent on log collection.

[1]: https://kubernetes.io/docs/concepts/workloads/pods/pod/#termination-of-pods
[2]: https://storage.googleapis.com/origin-ci-test/pr-logs/pull/openshift_installer/1255/pull-ci-openshift-installer-master-e2e-aws/3707/build-log.txt
[3]: https://gcsweb-ci.svc.ci.openshift.org/gcs/origin-ci-test/pr-logs/pull/openshift_installer/1255/pull-ci-openshift-installer-master-e2e-aws/3707/artifacts/
[4]: https://storage.googleapis.com/origin-ci-test/pr-logs/pull/openshift_installer/1045/pull-ci-openshift-installer-master-e2e-aws/3706/build-log.txt
[5]: https://storage.googleapis.com/origin-ci-test/pr-logs/pull/openshift_installer/1045/pull-ci-openshift-installer-master-e2e-aws/3706/artifacts/e2e-aws/installer/.openshift_install.log
---
 cmd/ci-operator/main.go | 9 +++++++--
 pkg/api/job_spec.go     | 3 +++
 pkg/steps/artifacts.go  | 2 +-
 pkg/steps/pod.go        | 8 +++++++-
 4 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/cmd/ci-operator/main.go b/cmd/ci-operator/main.go
index a688c01e..fcdb0a57 100644
--- a/cmd/ci-operator/main.go
+++ b/cmd/ci-operator/main.go
@@ -191,6 +191,7 @@ type options struct {
 	extraInputHash      stringSlice
 	idleCleanupDuration time.Duration
 	cleanupDuration     time.Duration
+	graceDuration       time.Duration
 
 	inputHash string
 	secrets   []*coreapi.Secret
@@ -207,6 +208,7 @@ func bindOptions(flag *flag.FlagSet) *options {
 	opt := &options{
 		idleCleanupDuration: time.Duration(1 * time.Hour),
 		cleanupDuration:     time.Duration(12 * time.Hour),
+		graceDuration:       time.Duration(coreapi.DefaultTerminationGracePeriodSeconds * time.Second),
 	}
 
 	// command specific options
@@ -229,6 +231,7 @@
 	flag.StringVar(&opt.baseNamespace, "base-namespace", "stable", "Namespace to read builds from, defaults to stable.")
 	flag.DurationVar(&opt.idleCleanupDuration, "delete-when-idle", opt.idleCleanupDuration, "If no pod is running for longer than this interval, delete the namespace. Set to zero to retain the contents. Requires the namespace TTL controller to be deployed.")
 	flag.DurationVar(&opt.cleanupDuration, "delete-after", opt.cleanupDuration, "If namespace exists for longer than this interval, delete the namespace. Set to zero to retain the contents. Requires the namespace TTL controller to be deployed.")
+	flag.DurationVar(&opt.graceDuration, "grace-period", opt.graceDuration, "Interval to wait after requesting pod termination before killing the pod. ci-operator itself will wait an additional 30 seconds to try to receive the successful deletion response.")
 
 	// actions to add to the graph
 	flag.BoolVar(&opt.promote, "promote", false, "When all other targets complete, publish the set of images built by this job into the release configuration.")
@@ -296,6 +299,7 @@ func (o *options) Complete() error {
 		jobSpec.Refs = spec.Refs
 	}
 	jobSpec.BaseNamespace = o.baseNamespace
+	jobSpec.GracePeriod = &o.graceDuration
 	o.jobSpec = jobSpec
 
 	if o.dry && o.verbose {
@@ -392,9 +396,10 @@ func (o *options) Run() error {
 		if o.dry {
 			os.Exit(0)
 		}
-		log.Printf("error: Process interrupted with signal %s, exiting in 2s ...", s)
+		duration := o.graceDuration + 30*time.Second
+		log.Printf("error: Process interrupted with signal %s, exiting in %v ...", s, duration)
 		cancel()
-		time.Sleep(2 * time.Second)
+		time.Sleep(duration)
 		os.Exit(1)
 	}
 
diff --git a/pkg/api/job_spec.go b/pkg/api/job_spec.go
index dafee768..c67d8a23 100644
--- a/pkg/api/job_spec.go
+++ b/pkg/api/job_spec.go
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"os"
 	"strings"
+	"time"
 
 	meta "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
@@ -28,6 +29,8 @@ type JobSpec struct {
 	Namespace     string
 	BaseNamespace string
 
+	GracePeriod *time.Duration
+
 	// if set, any new artifacts will be a child of this object
 	owner *meta.OwnerReference
 }
diff --git a/pkg/steps/artifacts.go b/pkg/steps/artifacts.go
index 2a2cc6dd..15e66e64 100644
--- a/pkg/steps/artifacts.go
+++ b/pkg/steps/artifacts.go
@@ -384,7 +384,7 @@ func (w *ArtifactWorker) downloadArtifacts(podName string, hasArtifacts bool) er
 	}
 
 	defer func() {
-		// signal to artifacts container to gracefully shut don
+		// signal to artifacts container to gracefully shut down
 		err := removeFile(w.podClient, w.namespace, podName, "artifacts", []string{"/tmp/done"})
 		if err == nil {
 			return
diff --git a/pkg/steps/pod.go b/pkg/steps/pod.go
index f7252f78..e98f5150 100644
--- a/pkg/steps/pod.go
+++ b/pkg/steps/pod.go
@@ -5,6 +5,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"log"
+	"math"
 	"path/filepath"
 
 	coreapi "k8s.io/api/core/v1"
@@ -93,7 +94,12 @@ func (s *podStep) Run(ctx context.Context, dry bool) error {
 		<-ctx.Done()
 		notifier.Cancel()
 		log.Printf("cleanup: Deleting %s pod %s", s.name, s.config.As)
-		if err := s.podClient.Pods(s.jobSpec.Namespace).Delete(s.config.As, nil); err != nil && !errors.IsNotFound(err) {
+		opts := &meta.DeleteOptions{}
+		if s.jobSpec.GracePeriod != nil {
+			sec := int64(math.Ceil(s.jobSpec.GracePeriod.Seconds()))
+			opts.GracePeriodSeconds = &sec
+		}
+		if err := s.podClient.Pods(s.jobSpec.Namespace).Delete(s.config.As, opts); err != nil && !errors.IsNotFound(err) {
 			log.Printf("error: Could not delete %s pod: %v", s.name, err)
 		}
 	}()
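
For illustration, a minimal, self-contained sketch of the timing
arithmetic this patch introduces.  The ten-minute grace period is an
assumed example value, chosen to comfortably cover the ~5.5-minute
teardown measured in [4]; it is not a value from the patch itself:

  package main

  import (
  	"fmt"
  	"math"
  	"time"
  )

  func main() {
  	// Assumed example value, not from the patch: a budget that
  	// covers the ~5.5-minute e2e teardown observed in [4].
  	gracePeriod := 10 * time.Minute

  	// Mirrors pkg/steps/pod.go: round the duration up to whole
  	// seconds before sending it as DeleteOptions.GracePeriodSeconds.
  	sec := int64(math.Ceil(gracePeriod.Seconds()))
  	fmt.Printf("DeleteOptions.GracePeriodSeconds: %d\n", sec) // 600

  	// Mirrors cmd/ci-operator/main.go: on interrupt, sleep for the
  	// grace period plus 30 seconds so the pod-deletion response has
  	// time to arrive before the process exits.
  	fmt.Printf("exit delay after signal: %v\n", gracePeriod+30*time.Second) // 10m30s
  }

With a budget like that, a job would presumably be invoked with
"--grace-period 10m" (flag.DurationVar accepts Go duration strings),
and an interrupted ci-operator would wait 10m30s before exiting.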