Skip to content

Commit 7d61085

Browse files
committed
fix: address CI failures and review feedback
- Auto-format all Python files with yapf (pep8 style, 120 col limit)
- Fix ruff lint error (1 auto-fixed issue)
- Remove redundant langchain-core and langgraph deps from pyproject.toml per reviewer feedback (already included via nvidia-nat[langchain])

Signed-off-by: futhgar <jmaldonado.rosa@gmail.com>
1 parent e8ad305 commit 7d61085

7 files changed

Lines changed: 146 additions & 125 deletions

File tree

examples/k8s_infra_monitor/pyproject.toml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,6 @@ requires-python = ">=3.11,<3.14"
3131
description = "Kubernetes Infrastructure Monitor using NeMo Agent Toolkit"
3232
dependencies = [
3333
"nvidia-nat[eval,langchain,profiler,test]~=1.5",
34-
"langchain-core",
35-
"langgraph>=0.0.10",
3634
]
3735
keywords = ["ai", "kubernetes", "monitoring", "agents"]
3836
classifiers = ["Programming Language :: Python"]

examples/k8s_infra_monitor/src/nat_k8s_infra_monitor/event_collector_tool.py

Lines changed: 42 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
15-
1615
"""Tool for collecting and analyzing Kubernetes cluster events."""
1716

1817
import json
@@ -87,17 +86,23 @@ def _run_live(kubeconfig_path: str | None, event_limit: int) -> str:
8786
# Get warning events
8887
try:
8988
result = subprocess.run(
90-
[*cmd_base, "get", "events", "--all-namespaces",
91-
"--field-selector=type=Warning",
92-
"--sort-by=.lastTimestamp",
93-
"--no-headers"],
94-
capture_output=True, text=True, timeout=30, check=False,
89+
[
90+
*cmd_base,
91+
"get",
92+
"events",
93+
"--all-namespaces",
94+
"--field-selector=type=Warning",
95+
"--sort-by=.lastTimestamp",
96+
"--no-headers"
97+
],
98+
capture_output=True,
99+
text=True,
100+
timeout=30,
101+
check=False,
95102
)
96103
if result.returncode != 0:
97-
sections.append(
98-
"Error: kubectl failed while fetching warning events\n"
99-
f"```\n{(result.stderr or result.stdout).strip()}\n```"
100-
)
104+
sections.append("Error: kubectl failed while fetching warning events\n"
105+
f"```\n{(result.stderr or result.stdout).strip()}\n```")
101106
else:
102107
warnings = result.stdout.strip()
103108
if warnings:
@@ -111,27 +116,32 @@ def _run_live(kubeconfig_path: str | None, event_limit: int) -> str:
111116
# Get recent events summary
112117
try:
113118
result = subprocess.run(
114-
[*cmd_base, "get", "events", "--all-namespaces",
115-
"--sort-by=.lastTimestamp",
116-
"-o", "custom-columns="
117-
"NAMESPACE:.metadata.namespace,"
118-
"TYPE:.type,"
119-
"REASON:.reason,"
120-
"OBJECT:.involvedObject.kind/.involvedObject.name,"
121-
"MESSAGE:.message",
122-
"--no-headers"],
123-
capture_output=True, text=True, timeout=30, check=False,
119+
[
120+
*cmd_base,
121+
"get",
122+
"events",
123+
"--all-namespaces",
124+
"--sort-by=.lastTimestamp",
125+
"-o",
126+
"custom-columns="
127+
"NAMESPACE:.metadata.namespace,"
128+
"TYPE:.type,"
129+
"REASON:.reason,"
130+
"OBJECT:.involvedObject.kind/.involvedObject.name,"
131+
"MESSAGE:.message",
132+
"--no-headers"
133+
],
134+
capture_output=True,
135+
text=True,
136+
timeout=30,
137+
check=False,
124138
)
125139
if result.returncode != 0:
126-
sections.append(
127-
"Error: kubectl failed while fetching recent events\n"
128-
f"```\n{(result.stderr or result.stdout).strip()}\n```"
129-
)
140+
sections.append("Error: kubectl failed while fetching recent events\n"
141+
f"```\n{(result.stderr or result.stdout).strip()}\n```")
130142
elif result.stdout.strip():
131143
lines = result.stdout.strip().split("\n")[:event_limit]
132-
sections.append(
133-
f"## Recent Events ({len(lines)} most recent)\n```\n" + "\n".join(lines) + "\n```"
134-
)
144+
sections.append(f"## Recent Events ({len(lines)} most recent)\n```\n" + "\n".join(lines) + "\n```")
135145
except subprocess.TimeoutExpired:
136146
pass
137147

@@ -140,10 +150,8 @@ def _run_live(kubeconfig_path: str | None, event_limit: int) -> str:
140150

141151
def _get_default_healthy_response() -> str:
142152
"""Return a default healthy event response for offline mode."""
143-
return (
144-
"## Warning Events\n"
145-
"No warning events found.\n\n"
146-
"## Recent Events\n"
147-
"Recent events are routine: Pulled, Created, Started, Scheduled. "
148-
"No abnormal patterns detected."
149-
)
153+
return ("## Warning Events\n"
154+
"No warning events found.\n\n"
155+
"## Recent Events\n"
156+
"Recent events are routine: Pulled, Created, Started, Scheduled. "
157+
"No abnormal patterns detected.")

examples/k8s_infra_monitor/src/nat_k8s_infra_monitor/node_status_tool.py

Lines changed: 21 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
15-
1615
"""Tool for retrieving Kubernetes node status and resource utilization."""
1716

1817
import json
@@ -32,10 +31,8 @@ class NodeStatusToolConfig(FunctionBaseConfig, name="node_status_check"):
3231
"""Configuration for the Kubernetes node status check tool."""
3332

3433
offline_mode: bool = Field(default=True, description="Whether to run in offline mode")
35-
kubeconfig_path: str | None = Field(
36-
default=None,
37-
description="Path to kubeconfig file. If None, uses default kubectl config."
38-
)
34+
kubeconfig_path: str | None = Field(default=None,
35+
description="Path to kubeconfig file. If None, uses default kubectl config.")
3936

4037

4138
@register_function(config_type=NodeStatusToolConfig)
@@ -88,13 +85,14 @@ def _run_live(kubeconfig_path: str | None) -> str:
8885
try:
8986
result = subprocess.run(
9087
[*cmd_base, "get", "nodes", "-o", "wide", "--no-headers"],
91-
capture_output=True, text=True, timeout=30, check=False,
88+
capture_output=True,
89+
text=True,
90+
timeout=30,
91+
check=False,
9292
)
9393
if result.returncode != 0:
94-
sections.append(
95-
"Error: kubectl failed while fetching node status\n"
96-
f"```\n{(result.stderr or result.stdout).strip()}\n```"
97-
)
94+
sections.append("Error: kubectl failed while fetching node status\n"
95+
f"```\n{(result.stderr or result.stdout).strip()}\n```")
9896
else:
9997
sections.append(f"## Node Status\n```\n{result.stdout.strip()}\n```")
10098
except subprocess.TimeoutExpired:
@@ -104,13 +102,14 @@ def _run_live(kubeconfig_path: str | None) -> str:
104102
try:
105103
result = subprocess.run(
106104
[*cmd_base, "top", "nodes", "--no-headers"],
107-
capture_output=True, text=True, timeout=30, check=False,
105+
capture_output=True,
106+
text=True,
107+
timeout=30,
108+
check=False,
108109
)
109110
if result.returncode != 0:
110-
sections.append(
111-
"Error: kubectl top failed\n"
112-
f"```\n{(result.stderr or result.stdout).strip()}\n```"
113-
)
111+
sections.append("Error: kubectl top failed\n"
112+
f"```\n{(result.stderr or result.stdout).strip()}\n```")
114113
else:
115114
sections.append(f"## Node Resource Usage\n```\n{result.stdout.strip()}\n```")
116115
except subprocess.TimeoutExpired:
@@ -121,12 +120,10 @@ def _run_live(kubeconfig_path: str | None) -> str:
121120

122121
def _get_default_healthy_response() -> str:
123122
"""Return a default healthy node status response for offline mode."""
124-
return (
125-
"## Node Status\n"
126-
"All 3 nodes are in Ready state.\n"
127-
"- control-plane-1: Ready, SchedulingDisabled (control-plane taint)\n"
128-
"- worker-1: Ready, 6 vCPU, 30Gi RAM, 74% CPU, 62% memory\n"
129-
"- worker-2: Ready, 6 vCPU, 20Gi RAM, 45% CPU, 51% memory\n\n"
130-
"## Node Resource Usage\n"
131-
"No resource pressure conditions detected on any node."
132-
)
123+
return ("## Node Status\n"
124+
"All 3 nodes are in Ready state.\n"
125+
"- control-plane-1: Ready, SchedulingDisabled (control-plane taint)\n"
126+
"- worker-1: Ready, 6 vCPU, 30Gi RAM, 74% CPU, 62% memory\n"
127+
"- worker-2: Ready, 6 vCPU, 20Gi RAM, 45% CPU, 51% memory\n\n"
128+
"## Node Resource Usage\n"
129+
"No resource pressure conditions detected on any node.")

examples/k8s_infra_monitor/src/nat_k8s_infra_monitor/pod_health_tool.py

Lines changed: 42 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
15-
1615
"""Tool for checking Kubernetes pod health across namespaces."""
1716

1817
import json
@@ -33,10 +32,8 @@ class PodHealthToolConfig(FunctionBaseConfig, name="pod_health_check"):
3332

3433
offline_mode: bool = Field(default=True, description="Whether to run in offline mode")
3534
kubeconfig_path: str | None = Field(default=None, description="Path to kubeconfig file")
36-
namespaces: list[str] | None = Field(
37-
default=None,
38-
description="Specific namespaces to check. If None, checks all namespaces."
39-
)
35+
namespaces: list[str] | None = Field(default=None,
36+
description="Specific namespaces to check. If None, checks all namespaces.")
4037

4138

4239
@register_function(config_type=PodHealthToolConfig)
@@ -93,29 +90,39 @@ def _run_live(kubeconfig_path: str | None, namespaces: list[str] | None) -> str:
9390
try:
9491
result = subprocess.run(
9592
[*cmd_base, "get", "pods", "-n", ns, "-o", "wide", "--no-headers"],
96-
capture_output=True, text=True, timeout=30, check=False,
93+
capture_output=True,
94+
text=True,
95+
timeout=30,
96+
check=False,
9797
)
9898
if result.returncode != 0:
99-
sections.append(
100-
f"### Namespace: {ns}\nError: kubectl failed\n"
101-
f"```\n{(result.stderr or result.stdout).strip()}\n```"
102-
)
99+
sections.append(f"### Namespace: {ns}\nError: kubectl failed\n"
100+
f"```\n{(result.stderr or result.stdout).strip()}\n```")
103101
else:
104102
sections.append(f"### Namespace: {ns}\n```\n{result.stdout.strip()}\n```")
105103
except subprocess.TimeoutExpired:
106104
sections.append(f"### Namespace: {ns}\nError: kubectl timed out")
107105
else:
108106
try:
109107
result = subprocess.run(
110-
[*cmd_base, "get", "pods", *ns_flag, "-o", "wide", "--no-headers",
111-
"--field-selector=status.phase!=Running,status.phase!=Succeeded"],
112-
capture_output=True, text=True, timeout=30, check=False,
108+
[
109+
*cmd_base,
110+
"get",
111+
"pods",
112+
*ns_flag,
113+
"-o",
114+
"wide",
115+
"--no-headers",
116+
"--field-selector=status.phase!=Running,status.phase!=Succeeded"
117+
],
118+
capture_output=True,
119+
text=True,
120+
timeout=30,
121+
check=False,
113122
)
114123
if result.returncode != 0:
115-
sections.append(
116-
"Error: kubectl failed while fetching pod status\n"
117-
f"```\n{(result.stderr or result.stdout).strip()}\n```"
118-
)
124+
sections.append("Error: kubectl failed while fetching pod status\n"
125+
f"```\n{(result.stderr or result.stdout).strip()}\n```")
119126
else:
120127
unhealthy = result.stdout.strip()
121128
if unhealthy:
@@ -128,11 +135,20 @@ def _run_live(kubeconfig_path: str | None, namespaces: list[str] | None) -> str:
128135
# Check for pods with high restart counts (> 5)
129136
try:
130137
result = subprocess.run(
131-
[*cmd_base, "get", "pods", "--all-namespaces", "-o",
132-
"jsonpath={range .items[*]}{.metadata.namespace}{' '}"
133-
"{.metadata.name}{' '}{range .status.containerStatuses[*]}"
134-
"{.restartCount}{' '}{end}{'\\n'}{end}"],
135-
capture_output=True, text=True, timeout=30, check=False,
138+
[
139+
*cmd_base,
140+
"get",
141+
"pods",
142+
"--all-namespaces",
143+
"-o",
144+
"jsonpath={range .items[*]}{.metadata.namespace}{' '}"
145+
"{.metadata.name}{' '}{range .status.containerStatuses[*]}"
146+
"{.restartCount}{' '}{end}{'\\n'}{end}"
147+
],
148+
capture_output=True,
149+
text=True,
150+
timeout=30,
151+
check=False,
136152
)
137153
if result.returncode == 0:
138154
high_restarts = []
@@ -153,9 +169,7 @@ def _run_live(kubeconfig_path: str | None, namespaces: list[str] | None) -> str:
153169

154170
def _get_default_healthy_response() -> str:
155171
"""Return a default healthy pod status response for offline mode."""
156-
return (
157-
"## Pod Health Summary\n"
158-
"All pods are in Running or Succeeded state across all namespaces.\n\n"
159-
"## High Restart Pods\n"
160-
"No pods with excessive restart counts detected."
161-
)
172+
return ("## Pod Health Summary\n"
173+
"All pods are in Running or Succeeded state across all namespaces.\n\n"
174+
"## High Restart Pods\n"
175+
"No pods with excessive restart counts detected.")

examples/k8s_infra_monitor/src/nat_k8s_infra_monitor/register.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -121,11 +121,9 @@ async def _analyze_cluster(input_message: str) -> str:
121121

122122
if not result:
123123
utils.logger.warning("Agent returned empty report (input_length=%d)", len(input_message))
124-
result = (
125-
"The agent was unable to generate a diagnostic report for this query. "
126-
"This may indicate the LLM model is insufficient for the task complexity. "
127-
"Consider using a larger model (e.g. meta/llama-3.3-70b-instruct).\n\n"
128-
)
124+
result = ("The agent was unable to generate a diagnostic report for this query. "
125+
"This may indicate the LLM model is insufficient for the task complexity. "
126+
"Consider using a larger model (e.g. meta/llama-3.3-70b-instruct).\n\n")
129127

130128
# Append severity classification
131129
severity = await severity_tool.arun(result)

0 commit comments

Comments (0)