Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 34 additions & 4 deletions axlearn/cloud/gcp/k8s_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from axlearn.cloud.gcp.utils import custom_leaderworkerset_kwargs
from axlearn.common.config import REQUIRED, Required, config_class
from axlearn.common.utils import Nested
from kubernetes.client.exceptions import ApiException
import time


class Service(FlagConfigurable):
Expand Down Expand Up @@ -159,10 +161,38 @@ def _build_service(self) -> Nested[Any]:
lws_name = self.name.split("-service")[0]
custom_api = k8s.client.CustomObjectsApi()

# Fetch the CR object
lws = custom_api.get_namespaced_custom_object(
group=group, version=version, namespace=namespace, plural=plural, name=lws_name
)
max_tries = 5
retry_delay = 20

for attempt in range(max_tries):
try:
lws = custom_api.get_namespaced_custom_object(
group=group,
version=version,
namespace=namespace,
plural=plural,
name=lws_name
)
logging.info("Successfully retrieved %s", lws_name)
break
except ApiException as e:
# Check if it's a 404 error
if e.status == 404:
logging.info("Attempt %s: Resource %s not found yet.",str(attempt + 1),lws_name)

if attempt < max_tries - 1:
logging.info("Waiting %s seconds...",str(retry_delay))
time.sleep(retry_delay)
else:
logging.info("Max retries reached. Resource was never found.")
raise
else:
logging.info("An unexpected Kubernetes API error occurred")
raise

except Exception as e:
logging.info("An unexpected error occurred: %s",str(e))
raise

ports_map_list = []
for i in range(len(self.ports)):
Expand Down
Loading