Skip to content

Commit c9c9467

Browse files
author
Michal Tichák
committed
OCTRL-1081 kubectl task to create rudimentary bridge to kubernetes
user infor setup properly... kubectl passes arguments properly to the kubernetes attempt for fairmq bla
1 parent adbed10 commit c9c9467

File tree

9 files changed

+599
-15
lines changed

9 files changed

+599
-15
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,8 @@ There are two ways of interacting with AliECS:
193193
* [Sampling reservoir](/docs/metrics.md#sampling-reservoir)
194194
* [OCC API debugging with grpcc](/docs/using_grpcc_occ.md#occ-api-debugging-with-grpcc)
195195
* [Running tasks inside docker](/docs/running_docker.md#running-a-task-inside-a-docker-container)
196+
* Kubernetes
197+
* [ECS bridge to Kubernetes](/docs/kubernetes_ecs.md)
196198
* Resources
197199
* T. Mrnjavac et al., [AliECS: A New Experiment Control System for the ALICE Experiment](https://doi.org/10.1051/epjconf/202429502027), CHEP23
198200

common/controlmode/controlmode.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ const (
3939
FAIRMQ
4040
BASIC
4141
HOOK
42+
KUBECTL_DIRECT
43+
KUBECTL_FAIRMQ
4244
)
4345

4446
func (cm ControlMode) String() string {
@@ -51,6 +53,10 @@ func (cm ControlMode) String() string {
5153
return "basic"
5254
case HOOK:
5355
return "hook"
56+
case KUBECTL_DIRECT:
57+
return "kubectl_direct"
58+
case KUBECTL_FAIRMQ:
59+
return "kubectl_fairmq"
5460
}
5561
return "direct"
5662
}
@@ -71,6 +77,10 @@ func (cm *ControlMode) UnmarshalText(b []byte) error {
7177
*cm = BASIC
7278
case "hook":
7379
*cm = HOOK
80+
case "kubectl_direct":
81+
*cm = KUBECTL_DIRECT
82+
case "kubectl_fairmq":
83+
*cm = KUBECTL_FAIRMQ
7484
default:
7585
*cm = DIRECT
7686
}

core/task/scheduler.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1432,7 +1432,8 @@ func makeTaskForMesosResources(
14321432
cmd.Env = append(cmd.Env, fmt.Sprintf("%s=%d", "OCC_CONTROL_PORT", controlPort))
14331433
}
14341434

1435-
if cmd.ControlMode == controlmode.FAIRMQ {
1435+
if cmd.ControlMode == controlmode.FAIRMQ ||
1436+
cmd.ControlMode == controlmode.KUBECTL_FAIRMQ {
14361437
cmd.Arguments = append(cmd.Arguments, "--control-port", strconv.FormatUint(controlPort, 10))
14371438
}
14381439

core/task/task.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,9 @@ func (t *Task) BuildTaskCommand(role parentRole) (err error) {
286286
if class.Control.Mode == controlmode.BASIC ||
287287
class.Control.Mode == controlmode.HOOK ||
288288
class.Control.Mode == controlmode.DIRECT ||
289-
class.Control.Mode == controlmode.FAIRMQ {
289+
class.Control.Mode == controlmode.FAIRMQ ||
290+
class.Control.Mode == controlmode.KUBECTL_DIRECT ||
291+
class.Control.Mode == controlmode.KUBECTL_FAIRMQ {
290292
var varStack map[string]string
291293

292294
// First we get the full varStack from the parent role, and
@@ -393,7 +395,8 @@ func (t *Task) BuildTaskCommand(role parentRole) (err error) {
393395
}
394396
}
395397

396-
if class.Control.Mode == controlmode.FAIRMQ {
398+
if class.Control.Mode == controlmode.FAIRMQ ||
399+
class.Control.Mode == controlmode.KUBECTL_FAIRMQ {
397400
// FIXME read this from configuration
398401
// if the task class doesn't provide an id, we generate one ourselves
399402
if !utils.StringSliceContains(cmd.Arguments, "--id") {
@@ -635,7 +638,9 @@ func (t *Task) BuildPropertyMap(bindMap channel.BindMap) (propMap controlcommand
635638

636639
// For FAIRMQ tasks, we append FairMQ channel configuration
637640
if class.Control.Mode == controlmode.FAIRMQ ||
638-
class.Control.Mode == controlmode.DIRECT {
641+
class.Control.Mode == controlmode.DIRECT ||
642+
class.Control.Mode == controlmode.KUBECTL_DIRECT ||
643+
class.Control.Mode == controlmode.KUBECTL_FAIRMQ {
639644
for _, inbCh := range channel.MergeInbound(parent.CollectInboundChannels(), class.Bind) {
640645
// We get the FairMQ-formatted propertyMap from the inbound channel spec
641646
var chanProps controlcommands.PropertyMap

core/task/taskclass/class.go

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,6 @@ func (c *Class) UnmarshalYAML(unmarshal func(interface{}) error) (err error) {
123123
}
124124
}
125125
return
126-
127126
}
128127

129128
func (c *Class) MarshalYAML() (interface{}, error) {
@@ -154,13 +153,17 @@ func (c *Class) MarshalYAML() (interface{}, error) {
154153
Command: c.Command,
155154
}
156155

157-
if c.Control.Mode == controlmode.FAIRMQ {
158-
aux.Control.Mode = "fairmq"
159-
} else if c.Control.Mode == controlmode.BASIC {
160-
aux.Control.Mode = "basic"
161-
} else {
162-
aux.Control.Mode = "direct"
163-
}
156+
// if c.Control.Mode == controlmode.FAIRMQ {
157+
// aux.Control.Mode = "fairmq"
158+
// } else if c.Control.Mode == controlmode.BASIC {
159+
// aux.Control.Mode = "basic"
160+
// } else if c.Control.Mode == controlmode.KUBECTL {
161+
// aux.Control.Mode = "kubectl"
162+
// } else {
163+
// aux.Control.Mode = "direct"
164+
// }
165+
166+
aux.Control.Mode = c.Control.Mode.String()
164167

165168
return aux, nil
166169
}

docs/kubernetes_ecs.md

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# ECS with Kubernetes
2+
3+
> ⚠️ **Warning**
4+
> All Kubernetes work done is in a stage of prototype.
5+
6+
## Kubernetes Cluster
7+
8+
While prototyping we used several Kubernetes distributions, namely [`kind`](https://kind.sigs.k8s.io/), [`minikube`](https://minikube.sigs.k8s.io/docs/) and [`k3s`](https://k3s.io/)
9+
in both local and remote cluster deployments. We used OpenStack for remote deployment.
10+
Follow the guides of the individual distributions to create the desired cluster setup.
11+
For now we chose `k3s` for most of the activities performed because it is a lightweight
12+
and easily installed distribution which is also [`CNCF`](https://www.cncf.io/training/certification/) certified.
13+
14+
All settings of `k3s` were used as default except one: locked-in-memory size. Use `ulimit -l` to learn
15+
what the limit is for the current user and `LimitMEMLOCK` inside the k3s systemd service config
16+
to set it to the correct value. Right now the `flp` user has unlimited size (`LimitMEMLOCK=infinity`).
17+
This config is necessary because even if you are running PODs with the privileged security context
18+
under user flp, Kubernetes still sets limits according to its internal settings and doesn't
19+
respect Linux settings.
20+
21+
Another setup we expect at this moment to be present at the target nodes
22+
is the ability to run PODs with privileged permissions and also under user `flp`.
23+
This means that the machine has to have the `flp` user set up the same way as
24+
if you had done the installation with [`o2-flp-setup`](https://alice-flp.docs.cern.ch/Operations/Experts/system-configuration/utils/o2-flp-setup/).
25+
26+
## Running tasks (`KubectlTask`)
27+
28+
ECS is set up to run tasks through Mesos on bare metal on all required hosts, with active
29+
task management (see [`ControllableTask`](/executor/executable/controllabletask.go))
30+
and OCC gRPC communication. When running a Docker task through ECS we could easily
31+
wrap the command to be run in a Docker container with proper settings
32+
([see](/docs/running_docker.md)). This is however not possible for Kubernetes
33+
workloads as the PODs are "hidden" inside the cluster. So we plan
34+
to deploy our own Task Controller which will connect to and guide
35+
the OCC state machine of the required tasks. Thus we need to create a custom
36+
proof-of-concept (POC) way to communicate with the Kubernetes cluster from the Mesos executor.
37+
38+
The reason why we don't call the Kubernetes cluster directly from the ECS core
39+
is that ECS does a lot of heavy lifting while deploying workloads,
40+
monitoring workloads and generating a lot of configuration which
41+
is not trivial to replicate manually. However, if we create some class
42+
that would be able to deploy one task into Kubernetes and monitor its
43+
state we could replicate the `ControllableTask` workflow and leave ECS
44+
mostly intact for now, save a lot of work and focus on prototyping
45+
the Kubernetes operator pattern.
46+
47+
Thus [`KubectlTask`](/executor/executable/kubectltask.go) was created. This class
48+
is written as a wrapper around the `kubectl` utility to manage a Kubernetes cluster.
49+
It is based on the following `kubectl` commands:
50+
51+
* `apply` => `kubectl apply -f manifest.yaml` - deploys the resource described in the given manifest
52+
* `delete` => `kubectl delete -f manifest.yaml` - deletes the resource from the cluster
53+
* `patch` => `kubectl patch -f exampletask.yaml --type='json' -p='[{"op": "replace", "path": "/spec/state", "value": "running"}]'` - changes the state of the resource inside the cluster
54+
* `get` => `kubectl get -f manifest.yaml -o jsonpath='{.spec.state}'` - queries a specific field of the resource (`state` in the example) inside the cluster.
55+
56+
These four commands allow us to deploy and monitor the status of the deployed
57+
resource without the need to interact with it directly. However, `KubectlTask`
58+
expects the resource to be the CRD [Task](/control-operator/api/v1alpha1/task_types.go).
59+
60+
In order to activate `KubectlTask` you need to change the YAML template
61+
inside the `ControlWorkflows` directory. Namely:
62+
63+
* add the path to the kubectl manifest as the first argument in the `.command.arguments` field
64+
* change `.control.mode` to either `kubectl_direct` or `kubectl_fairmq`
65+
You can find working templates inside `control-operator/ecs-manifests/control-workflows/*_kube.yaml`.
66+
67+
Working kubectl manifests are to be found in `control-operator/ecs-manifests/kubernetes-manifests`.
68+
See `*test.yaml` for concrete manifests deployable with `kubectl apply`; the rest
69+
are templates with variables to be filled in, written in `${var}` format. `KubectlTask`
70+
fills these variables from env vars.

0 commit comments

Comments
 (0)