Skip to content

Commit 7b47a4b

Browse files
committed
[core] New cleanup facilities for ODC
1 parent 55aadad commit 7b47a4b

2 files changed

Lines changed: 146 additions & 7 deletions

File tree

core/integration/odc/handlers.go

Lines changed: 95 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import (
3232

3333
"github.com/AliceO2Group/Control/common/logger/infologger"
3434
"github.com/AliceO2Group/Control/common/utils"
35+
"github.com/AliceO2Group/Control/core/environment"
3536
"github.com/AliceO2Group/Control/core/integration/odc/odcutils"
3637
odcpb "github.com/AliceO2Group/Control/core/integration/odc/protos"
3738
"github.com/sirupsen/logrus"
@@ -243,7 +244,7 @@ func handleReset(ctx context.Context, odcClient *RpcClient, arguments map[string
243244
return nil
244245
}
245246

246-
func handleCleanup(ctx context.Context, odcClient *RpcClient, arguments map[string]string, envId string) error {
247+
func handleCleanupLegacy(ctx context.Context, odcClient *RpcClient, arguments map[string]string, envId string) error {
247248
defer utils.TimeTrackFunction(time.Now(), log.WithPrefix("odcclient"))
248249
if envId == "" {
249250
return errors.New("cannot proceed with empty environment id")
@@ -277,6 +278,93 @@ func handleCleanup(ctx context.Context, odcClient *RpcClient, arguments map[stri
277278
return nil // We clobber the error because nothing can be done for a failed cleanup
278279
}
279280

281+
func handleCleanup(ctx context.Context, odcClient *RpcClient, arguments map[string]string, envId string) error {
282+
defer utils.TimeTrackFunction(time.Now(), log.WithPrefix("odcclient"))
283+
284+
// First we query ODC for the full list of active partitions
285+
req := &odcpb.StatusRequest{}
286+
287+
var err error = nil
288+
var rep *odcpb.StatusReply
289+
290+
rep, err = odcClient.Status(ctx, req, grpc.EmptyCallOption{})
291+
if err != nil {
292+
return printGrpcError(err)
293+
}
294+
295+
if rep == nil || rep.GetStatus() == odcpb.ReplyStatus_UNKNOWN {
296+
// We got a nil response with nil error, this should never happen
297+
return errors.New("nil response error")
298+
}
299+
300+
if odcErr := rep.GetError(); odcErr != nil {
301+
return fmt.Errorf("code %d from ODC: %s", odcErr.GetCode(), odcErr.GetMsg())
302+
}
303+
if replyStatus := rep.GetStatus(); replyStatus != odcpb.ReplyStatus_SUCCESS {
304+
return fmt.Errorf("status %s from ODC", replyStatus.String())
305+
}
306+
log.WithFields(logrus.Fields{
307+
"odcMsg": rep.GetMsg(),
308+
"odcStatus": rep.GetStatus().String(),
309+
"odcExectime": rep.GetExectime(),
310+
}).
311+
Trace("call to ODC complete")
312+
313+
knownEnvs := environment.ManagerInstance().Ids()
314+
partitionsToClean := make(map[string]struct{})
315+
for _, odcPartition := range rep.GetPartitions() {
316+
isOrphan := true
317+
for _, knownEnv := range knownEnvs {
318+
if odcPartition.Partitionid == knownEnv.String() { // found a matching env
319+
isOrphan = false
320+
break
321+
}
322+
}
323+
if isOrphan { // no env was found for the given ODC partition
324+
partitionsToClean[odcPartition.Partitionid] = struct{}{}
325+
}
326+
}
327+
328+
// The present function can in principle be called with envId = "", if the cleanup is triggered from
329+
// outside of an active environment.
330+
// If an envId is passed, we append it to the list of partitions to clean up just in case, otherwise we
331+
// ignore it.
332+
if envId != "" {
333+
partitionsToClean[envId] = struct {}{}
334+
}
335+
336+
// Then the actual cleanup calls begin, one partition at a time...
337+
for odcPartitionId, _ := range partitionsToClean {
338+
// This block tries to perform the regular teardown sequence.
339+
// Since Shutdown is supposed to work in any state, we don't bail on error.
340+
err := doReset(ctx, odcClient, arguments, odcPartitionId)
341+
if err != nil {
342+
log.WithError(printGrpcError(err)).
343+
WithField("level", infologger.IL_Devel).
344+
WithField("partition", odcPartitionId).
345+
Warn("ODC Reset call failed")
346+
}
347+
348+
err = doTerminate(ctx, odcClient, arguments, odcPartitionId)
349+
if err != nil {
350+
log.WithError(printGrpcError(err)).
351+
WithField("level", infologger.IL_Devel).
352+
WithField("partition", odcPartitionId).
353+
Warn("ODC Terminate call failed")
354+
}
355+
356+
err = doShutdown(ctx, odcClient, arguments, odcPartitionId)
357+
if err != nil {
358+
log.WithError(printGrpcError(err)).
359+
WithField("level", infologger.IL_Devel).
360+
WithField("partition", odcPartitionId).
361+
Warn("ODC Shutdown call failed")
362+
}
363+
}
364+
365+
return nil // We clobber the error because nothing can be done for a failed cleanup
366+
}
367+
280368
func doReset(ctx context.Context, odcClient *RpcClient, arguments map[string]string, envId string) error {
281369
// RESET
282370
req := &odcpb.ResetRequest{
@@ -307,12 +395,12 @@ func doReset(ctx context.Context, odcClient *RpcClient, arguments map[string]str
307395
return fmt.Errorf("status %s from ODC", replyStatus.String())
308396
}
309397
log.WithFields(logrus.Fields{
310-
"odcMsg": rep.Reply.Msg,
311-
"odcStatus": rep.Reply.Status.String(),
312-
"odcExectime": rep.Reply.Exectime,
313-
"odcRunid": rep.Reply.Partitionid,
314-
"odcSessionid": rep.Reply.Sessionid,
315-
}).
398+
"odcMsg": rep.Reply.Msg,
399+
"odcStatus": rep.Reply.Status.String(),
400+
"odcExectime": rep.Reply.Exectime,
401+
"odcRunid": rep.Reply.Partitionid,
402+
"odcSessionid": rep.Reply.Sessionid,
403+
}).
316404
Debug("call to ODC complete")
317405
return err
318406
}

core/integration/odc/plugin.go

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,8 @@ func (p *Plugin) ObjectStack(data interface{}) (stack map[string]interface{}) {
154154

155155
stack = make(map[string]interface{})
156156
stack["Configure"] = func() (out string) {
157+
// ODC Run + SetProperties + Configure
158+
157159
var topology, plugin, resources string
158160
ok := false
159161
topology, ok = varStack["odc_topology"]
@@ -209,6 +211,8 @@ func (p *Plugin) ObjectStack(data interface{}) (stack map[string]interface{}) {
209211
return
210212
}
211213
stack["Start"] = func() (out string) { // must formally return string even when we return nothing
214+
// ODC SetProperties + Start
215+
212216
rn, ok := varStack["run_number"]
213217
if !ok {
214218
log.WithField("partition", envId).
@@ -238,6 +242,8 @@ func (p *Plugin) ObjectStack(data interface{}) (stack map[string]interface{}) {
238242
return
239243
}
240244
stack["Stop"] = func() (out string) {
245+
// ODC Stop
246+
241247
timeout := callable.AcquireTimeout(ODC_STOP_TIMEOUT, varStack, "Stop", envId)
242248

243249
ctx, cancel := context.WithTimeout(context.Background(), timeout)
@@ -256,6 +262,8 @@ func (p *Plugin) ObjectStack(data interface{}) (stack map[string]interface{}) {
256262
return
257263
}
258264
stack["Reset"] = func() (out string) {
265+
// ODC Reset + Terminate + Shutdown
266+
259267
timeout := callable.AcquireTimeout(ODC_RESET_TIMEOUT, varStack, "Reset", envId)
260268

261269
ctx, cancel := context.WithTimeout(context.Background(), timeout)
@@ -273,7 +281,30 @@ func (p *Plugin) ObjectStack(data interface{}) (stack map[string]interface{}) {
273281
}
274282
return
275283
}
284+
stack["EnsureCleanupLegacy"] = func() (out string) {
285+
// ODC Reset + Terminate + Shutdown for current env
286+
287+
timeout := callable.AcquireTimeout(ODC_GENERAL_OP_TIMEOUT, varStack, "EnsureCleanup", envId)
288+
289+
ctx, cancel := context.WithTimeout(context.Background(), timeout)
290+
defer cancel()
291+
err := handleCleanupLegacy(ctx, p.odcClient, nil, envId)
292+
if err != nil {
293+
log.WithError(err).
294+
WithField("level", infologger.IL_Support).
295+
WithField("partition", envId).
296+
WithField("call", "EnsureCleanupLegacy").
297+
298+
Error("ODC error")
299+
log.WithField("partition", envId).
300+
WithField("call", "EnsureCleanupLegacy").
301+
Error("EPN Cleanup sequence failed")
302+
}
303+
return
304+
}
276305
stack["EnsureCleanup"] = func() (out string) {
306+
// ODC Reset + Terminate + Shutdown for currend env + all orphans
307+
277308
timeout := callable.AcquireTimeout(ODC_GENERAL_OP_TIMEOUT, varStack, "EnsureCleanup", envId)
278309

279310
ctx, cancel := context.WithTimeout(context.Background(), timeout)
@@ -292,7 +323,27 @@ func (p *Plugin) ObjectStack(data interface{}) (stack map[string]interface{}) {
292323
}
293324
return
294325
}
326+
stack["PreDeploymentCleanup"] = func() (out string) {
327+
// ODC Reset + Terminate + Shutdown for all orphans
328+
329+
timeout := callable.AcquireTimeout(ODC_GENERAL_OP_TIMEOUT, varStack, "EnsureCleanup", envId)
295330

331+
ctx, cancel := context.WithTimeout(context.Background(), timeout)
332+
defer cancel()
333+
err := handleCleanup(ctx, p.odcClient, nil, "")
334+
if err != nil {
335+
log.WithError(err).
336+
WithField("level", infologger.IL_Support).
337+
WithField("partition", envId).
338+
WithField("call", "PreDeploymentCleanup").
339+
340+
Error("ODC error")
341+
log.WithField("partition", envId).
342+
WithField("call", "PreDeploymentCleanup").
343+
Error("EPN PreDeploymentCleanup sequence failed")
344+
}
345+
return
346+
}
296347
return
297348
}
298349

0 commit comments

Comments
 (0)