diff --git a/config-server/Chart/templates/rbac.yaml b/config-server/Chart/templates/rbac.yaml index f5ff44b..eaf718f 100644 --- a/config-server/Chart/templates/rbac.yaml +++ b/config-server/Chart/templates/rbac.yaml @@ -28,3 +28,30 @@ roleRef: kind: Role name: config-server-role apiGroup: rbac.authorization.k8s.io +--- +# nodes는 cluster-scoped — resolve_k8s_node_name() 등에서 list/get node 필요 +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "containerssh-config-server.fullname" . }}-node-reader + labels: + app: containerssh-config-server +rules: + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "containerssh-config-server.fullname" . }}-node-reader-binding + labels: + app: containerssh-config-server +subjects: + - kind: ServiceAccount + name: config-server + namespace: {{ .Values.config.namespace }} +roleRef: + kind: ClusterRole + name: {{ include "containerssh-config-server.fullname" . }}-node-reader + apiGroup: rbac.authorization.k8s.io diff --git a/config-server/main.py b/config-server/main.py index 0e339d7..79aa3ed 100644 --- a/config-server/main.py +++ b/config-server/main.py @@ -421,7 +421,11 @@ def create_pod(): app.logger.debug("[CREATE POD] pod does not exist yet") # Prometheus 기반 노드 선택 - node_list = [n["node_name"] for n in user_info["gpu_nodes"]] + node_list = [ + str(n["node_name"]).strip().lower() + for n in user_info["gpu_nodes"] + if n.get("node_name") + ] app.logger.info(f"[CREATE POD] candidate nodes: {node_list}") best_node = select_best_node_from_prometheus( diff --git a/config-server/utils.py b/config-server/utils.py index cd6c03c..13b7306 100644 --- a/config-server/utils.py +++ b/config-server/utils.py @@ -44,12 +44,12 @@ def load_k8s(): def resolve_k8s_node_name(candidate: Optional[str]) -> Optional[str]: """ - WAS/Prometheus 등에서 온 node 이름과 실제 Node.metadata.name 대소문자가 - 달라도 cluster에 등록된 정식 이름으로 맞추자 nodeName 바인딩은 대소문자까지 일치해야 한다. + WAS/Prometheus 등에서 온 node 이름을 클러스터 Node와 대소문자 무시로 매칭하고, + 반환은 항상 소문자로 정규화한다(운영 노드명이 소문자인 환경 기준). """ if candidate is None: return None - s = str(candidate).strip() + s = str(candidate).strip().lower() if not s: return None load_k8s() @@ -59,13 +59,12 @@ def resolve_k8s_node_name(candidate: Optional[str]) -> Optional[str]: except Exception: app.logger.exception("[NODE] list_node failed while resolving %r", s) return None - key = s.lower() for n in resp.items or []: - if n.metadata.name.lower() == key: - real = n.metadata.name - if real != s: - app.logger.info("[NODE] resolved node name %r -> %r", s, real) - return real + if n.metadata.name.lower() == s: + out = n.metadata.name.lower() + if n.metadata.name != out: + app.logger.info("[NODE] normalized node name %r -> %r", n.metadata.name, out) + return out app.logger.warning("[NODE] no cluster node matches %r (case-insensitive)", s) return None