From dd7e07213c8923819b9fbc189e25e2f1cfc65f2b Mon Sep 17 00:00:00 2001 From: lkolluru05 Date: Thu, 15 Jan 2026 00:48:06 +0000 Subject: [PATCH 1/3] updated retry and timeout in lws --- axlearn/cloud/gcp/k8s_service.py | 39 ++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/axlearn/cloud/gcp/k8s_service.py b/axlearn/cloud/gcp/k8s_service.py index a27c25046..ac88a0a89 100644 --- a/axlearn/cloud/gcp/k8s_service.py +++ b/axlearn/cloud/gcp/k8s_service.py @@ -159,10 +159,41 @@ def _build_service(self) -> Nested[Any]: lws_name = self.name.split("-service")[0] custom_api = k8s.client.CustomObjectsApi() - # Fetch the CR object - lws = custom_api.get_namespaced_custom_object( - group=group, version=version, namespace=namespace, plural=plural, name=lws_name - ) + from kubernetes.client.exceptions import ApiException + import time + + max_tries = 5 + retry_delay = 20 + + for attempt in range(max_tries): + try: + lws = custom_api.get_namespaced_custom_object( + group=group, + version=version, + namespace=namespace, + plural=plural, + name=lws_name + ) + print(f"Successfully retrieved {lws_name}") + break + except ApiException as e: + # Check if it's a 404 error + if e.status == 404: + print(f"Attempt {attempt + 1}: Resource '{lws_name}' not found yet.") + + if attempt < max_tries - 1: + print(f"Waiting {retry_delay} seconds...") + time.sleep(retry_delay) + else: + print("Max retries reached. Resource was never found.") + raise + else: + print(f"An unexpected Kubernetes API error occurred: {e}") + raise + + except Exception as e: + print(f"An unexpected error occurred: {e}") + raise ports_map_list = [] for i in range(len(self.ports)): From 6112b2898139f1b5f0032a6982f598263b0e290f Mon Sep 17 00:00:00 2001 From: lkolluru05 Date: Thu, 15 Jan 2026 01:25:54 +0000 Subject: [PATCH 2/3] imports on top --- axlearn/cloud/gcp/k8s_service.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/axlearn/cloud/gcp/k8s_service.py b/axlearn/cloud/gcp/k8s_service.py index ac88a0a89..8048b027f 100644 --- a/axlearn/cloud/gcp/k8s_service.py +++ b/axlearn/cloud/gcp/k8s_service.py @@ -12,6 +12,8 @@ from axlearn.cloud.gcp.utils import custom_leaderworkerset_kwargs from axlearn.common.config import REQUIRED, Required, config_class from axlearn.common.utils import Nested +from kubernetes.client.exceptions import ApiException +import time class Service(FlagConfigurable): @@ -159,9 +161,6 @@ def _build_service(self) -> Nested[Any]: lws_name = self.name.split("-service")[0] custom_api = k8s.client.CustomObjectsApi() - from kubernetes.client.exceptions import ApiException - import time - max_tries = 5 retry_delay = 20 From cebc5ed5f48dade4033c72c294af5d1e74682c5e Mon Sep 17 00:00:00 2001 From: lkolluru05 Date: Thu, 15 Jan 2026 19:24:10 +0000 Subject: [PATCH 3/3] logging added --- axlearn/cloud/gcp/k8s_service.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/axlearn/cloud/gcp/k8s_service.py b/axlearn/cloud/gcp/k8s_service.py index 8048b027f..9b4b9bb58 100644 --- a/axlearn/cloud/gcp/k8s_service.py +++ b/axlearn/cloud/gcp/k8s_service.py @@ -173,25 +173,25 @@ def _build_service(self) -> Nested[Any]: plural=plural, name=lws_name ) - print(f"Successfully retrieved {lws_name}") + logging.info("Successfully retrieved %s", lws_name) break except ApiException as e: # Check if it's a 404 error if e.status == 404: - print(f"Attempt {attempt + 1}: Resource '{lws_name}' not found yet.") + logging.info("Attempt %s: Resource %s not found yet.",str(attempt + 1),lws_name) if attempt < max_tries - 1: - print(f"Waiting {retry_delay} seconds...") + logging.info("Waiting %s seconds...",str(retry_delay)) time.sleep(retry_delay) else: - print("Max retries reached. Resource was never found.") + logging.info("Max retries reached. Resource was never found.") raise else: - print(f"An unexpected Kubernetes API error occurred: {e}") + logging.info("An unexpected Kubernetes API error occurred") raise except Exception as e: - print(f"An unexpected error occurred: {e}") + logging.info("An unexpected error occurred: %s",str(e)) raise ports_map_list = []