Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions config-server/Chart/templates/rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,3 +28,30 @@ roleRef:
kind: Role
name: config-server-role
apiGroup: rbac.authorization.k8s.io
---
# Nodes are cluster-scoped, hence a ClusterRole rather than a namespaced Role.
# resolve_k8s_node_name() and similar helpers need to list/get Node objects.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: {{ include "containerssh-config-server.fullname" . }}-node-reader
labels:
app: containerssh-config-server
rules:
# Read-only access to Node objects in the core ("") API group.
- apiGroups: [""]
resources: ["nodes"]
# NOTE(review): the rationale above mentions only list/get — confirm "watch" is actually required.
verbs: ["get", "list", "watch"]
---
# Grants the node-reader ClusterRole to the config-server ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: {{ include "containerssh-config-server.fullname" . }}-node-reader-binding
labels:
app: containerssh-config-server
subjects:
# NOTE(review): the ServiceAccount name is hard-coded while the resource names
# use the fullname helper — confirm it matches the ServiceAccount that is deployed,
# and that .Values.config.namespace is the namespace that ServiceAccount lives in.
- kind: ServiceAccount
name: config-server
namespace: {{ .Values.config.namespace }}
roleRef:
# Must reference the ClusterRole by the exact same templated name.
kind: ClusterRole
name: {{ include "containerssh-config-server.fullname" . }}-node-reader
apiGroup: rbac.authorization.k8s.io
6 changes: 5 additions & 1 deletion config-server/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,7 +421,11 @@ def create_pod():
app.logger.debug("[CREATE POD] pod does not exist yet")

# Prometheus 기반 노드 선택
node_list = [n["node_name"] for n in user_info["gpu_nodes"]]
node_list = [
str(n["node_name"]).strip().lower()
for n in user_info["gpu_nodes"]
if n.get("node_name")
]
app.logger.info(f"[CREATE POD] candidate nodes: {node_list}")

best_node = select_best_node_from_prometheus(
Expand Down
17 changes: 8 additions & 9 deletions config-server/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,12 @@ def load_k8s():

def resolve_k8s_node_name(candidate: Optional[str]) -> Optional[str]:
"""
WAS/Prometheus 등에서 온 node 이름과 실제 Node.metadata.name 대소문자가
달라도 cluster에 등록된 정식 이름으로 맞추자 nodeName 바인딩은 대소문자까지 일치해야 한다.
WAS/Prometheus 등에서 온 node 이름을 클러스터 Node와 대소문자 무시로 매칭하고,
반환은 항상 소문자로 정규화한다(운영 노드명이 소문자인 환경 기준).
"""
if candidate is None:
return None
s = str(candidate).strip()
s = str(candidate).strip().lower()
if not s:
return None
load_k8s()
Expand All @@ -59,13 +59,12 @@ def resolve_k8s_node_name(candidate: Optional[str]) -> Optional[str]:
except Exception:
app.logger.exception("[NODE] list_node failed while resolving %r", s)
return None
key = s.lower()
for n in resp.items or []:
if n.metadata.name.lower() == key:
real = n.metadata.name
if real != s:
app.logger.info("[NODE] resolved node name %r -> %r", s, real)
return real
if n.metadata.name.lower() == s:
out = n.metadata.name.lower()
if n.metadata.name != out:
app.logger.info("[NODE] normalized node name %r -> %r", n.metadata.name, out)
return out
app.logger.warning("[NODE] no cluster node matches %r (case-insensitive)", s)
return None

Expand Down
Loading