From ece31d20b6f1238bb2525dc6216c173f153b4809 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EB=B0=95=ED=98=9C=EB=A6=B0?= Date: Thu, 19 Mar 2026 12:03:05 +0900 Subject: [PATCH 1/3] Change namespace from 'cssh' to 'ailab-infra' --- .github/workflows/deploy-config-server.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy-config-server.yaml b/.github/workflows/deploy-config-server.yaml index 97e1999..19db8da 100644 --- a/.github/workflows/deploy-config-server.yaml +++ b/.github/workflows/deploy-config-server.yaml @@ -66,7 +66,7 @@ jobs: # --set image.pullPolicy=Always: 무조건 최신 이미지를 다운로드 받도록 설정 helm upgrade --install containerssh-config-server ~/deploy-temp/config-server/Chart \ - --namespace cssh \ + --namespace ailab-infra \ --create-namespace \ --set image.repository=${{ secrets.DOCKER_USERNAME }}/config-server \ --set image.tag=$IMAGE_TAG \ From 1ef53a23a1d365de03a331bbb871b6dda2db6706 Mon Sep 17 00:00:00 2001 From: dongmin0204 Date: Thu, 2 Apr 2026 04:10:36 +0900 Subject: [PATCH 2/3] =?UTF-8?q?hotfix:=20=EB=8C=80=EC=86=8C=EB=AC=B8?= =?UTF-8?q?=EC=9E=90=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config-server/main.py | 6 +++++- config-server/utils.py | 17 ++++++++--------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/config-server/main.py b/config-server/main.py index 0e339d7..79aa3ed 100644 --- a/config-server/main.py +++ b/config-server/main.py @@ -421,7 +421,11 @@ def create_pod(): app.logger.debug("[CREATE POD] pod does not exist yet") # Prometheus 기반 노드 선택 - node_list = [n["node_name"] for n in user_info["gpu_nodes"]] + node_list = [ + str(n["node_name"]).strip().lower() + for n in user_info["gpu_nodes"] + if n.get("node_name") + ] app.logger.info(f"[CREATE POD] candidate nodes: {node_list}") best_node = select_best_node_from_prometheus( diff --git a/config-server/utils.py b/config-server/utils.py index cd6c03c..13b7306 100644 --- a/config-server/utils.py +++ b/config-server/utils.py @@ -44,12 +44,12 @@ def load_k8s(): def resolve_k8s_node_name(candidate: Optional[str]) -> Optional[str]: """ - WAS/Prometheus 등에서 온 node 이름과 실제 Node.metadata.name 대소문자가 - 달라도 cluster에 등록된 정식 이름으로 맞추자 nodeName 바인딩은 대소문자까지 일치해야 한다. + WAS/Prometheus 등에서 온 node 이름을 클러스터 Node와 대소문자 무시로 매칭하고, + 반환은 항상 소문자로 정규화한다(운영 노드명이 소문자인 환경 기준). """ if candidate is None: return None - s = str(candidate).strip() + s = str(candidate).strip().lower() if not s: return None load_k8s() @@ -59,13 +59,12 @@ def resolve_k8s_node_name(candidate: Optional[str]) -> Optional[str]: except Exception: app.logger.exception("[NODE] list_node failed while resolving %r", s) return None - key = s.lower() for n in resp.items or []: - if n.metadata.name.lower() == key: - real = n.metadata.name - if real != s: - app.logger.info("[NODE] resolved node name %r -> %r", s, real) - return real + if n.metadata.name.lower() == s: + out = n.metadata.name.lower() + if n.metadata.name != out: + app.logger.info("[NODE] normalized node name %r -> %r", n.metadata.name, out) + return out app.logger.warning("[NODE] no cluster node matches %r (case-insensitive)", s) return None From f9abdd768847f930e727617a88394b1a0fb3e559 Mon Sep 17 00:00:00 2001 From: dongmin0204 Date: Thu, 2 Apr 2026 04:15:43 +0900 Subject: [PATCH 3/3] =?UTF-8?q?hotfix:=20node=20list=20=EA=B6=8C=ED=95=9C?= =?UTF-8?q?=20=EB=B6=80=EC=97=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config-server/Chart/templates/rbac.yaml | 27 +++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/config-server/Chart/templates/rbac.yaml b/config-server/Chart/templates/rbac.yaml index f5ff44b..eaf718f 100644 --- a/config-server/Chart/templates/rbac.yaml +++ b/config-server/Chart/templates/rbac.yaml @@ -28,3 +28,30 @@ roleRef: kind: Role name: config-server-role apiGroup: rbac.authorization.k8s.io +--- +# nodes는 cluster-scoped — resolve_k8s_node_name() 등에서 list/get node 필요 +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "containerssh-config-server.fullname" . }}-node-reader + labels: + app: containerssh-config-server +rules: + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "containerssh-config-server.fullname" . }}-node-reader-binding + labels: + app: containerssh-config-server +subjects: + - kind: ServiceAccount + name: config-server + namespace: {{ .Values.config.namespace }} +roleRef: + kind: ClusterRole + name: {{ include "containerssh-config-server.fullname" . }}-node-reader + apiGroup: rbac.authorization.k8s.io