From 26640314fdacf3fb1edab46dd717414adff49f3a Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Wed, 4 Feb 2026 16:51:34 -0800 Subject: [PATCH 1/6] Fix numa parsing in job exporter (#143) Fix numa parsing in job exporter on GB200. --- .../src/Moneo/src/worker/exporters/node_exporter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/job-exporter/src/Moneo/src/worker/exporters/node_exporter.py b/src/job-exporter/src/Moneo/src/worker/exporters/node_exporter.py index b0ee72af..0fb3fb1a 100644 --- a/src/job-exporter/src/Moneo/src/worker/exporters/node_exporter.py +++ b/src/job-exporter/src/Moneo/src/worker/exporters/node_exporter.py @@ -273,7 +273,7 @@ def get_core_numa_mapping(core_count): numa_mapping = {} lines = output.split('\n') for line in lines: - if 'node ' in line and 'cpus' in line: + if 'node ' in line and 'cpus' in line and not line.strip().endswith('cpus:'): current_numa_domain = int(re.search(r'node (\d+)', line).group(1)) if ':' in line: cpus_str = line.split(': ')[1].split() From 2110283539f2a76dd5c03b479cee8e2d1533a30c Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Wed, 4 Feb 2026 17:28:51 -0800 Subject: [PATCH 2/6] Fix kubespray deployment on bare metal (#144) Fix kubespray deployment on bare metal when cffi package is installed by package manager. 
--- contrib/kubespray/script/environment.sh | 2 +- contrib/kubespray/script/requirements.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/contrib/kubespray/script/environment.sh b/contrib/kubespray/script/environment.sh index 75d8ca38..5575e71a 100644 --- a/contrib/kubespray/script/environment.sh +++ b/contrib/kubespray/script/environment.sh @@ -46,7 +46,7 @@ echo "Install sshpass" sudo apt-get -y install sshpass echo "Install kubespray's requirements and ansible is included" -sudo apt-get -y remove python3-cryptography # avoid conflict with pip +sudo apt-get -y remove python3-cryptography python3-cffi # avoid conflict with pip sudo python3 -m pip install -r ${HOME}/pai-deploy/kubespray/requirements.txt # workaround python3-apt issue diff --git a/contrib/kubespray/script/requirements.txt b/contrib/kubespray/script/requirements.txt index 5982f934..6cb444d7 100644 --- a/contrib/kubespray/script/requirements.txt +++ b/contrib/kubespray/script/requirements.txt @@ -4,3 +4,4 @@ jinja2==3.1.4 pyOpenSSL==24.1.0 requests==2.32.3 oauthlib==3.3.1 +cffi==2.0.0 From 51dd2f13b7ece39d8248668c74eab368ce5ba384 Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Thu, 5 Feb 2026 12:14:58 +0800 Subject: [PATCH 3/6] fix the circular dependency (#142) Co-authored-by: Rui Gao --- src/rest-server/src/models/v2/job/k8s.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/rest-server/src/models/v2/job/k8s.js b/src/rest-server/src/models/v2/job/k8s.js index 2711cd14..f21ccc8f 100644 --- a/src/rest-server/src/models/v2/job/k8s.js +++ b/src/rest-server/src/models/v2/job/k8s.js @@ -23,7 +23,6 @@ const runtimeEnv = require('./runtime-env'); const launcherConfig = require('@pai/config/launcher'); const createError = require('@pai/utils/error'); const protocolSecret = require('@pai/utils/protocolSecret'); -const userModel = require('@pai/models/v2/user'); const tokenModel = require('@pai/models/token'); const storageModel = require('@pai/models/v2/storage'); 
const logger = require('@pai/config/logger'); @@ -1135,6 +1134,8 @@ const get = async (frameworkName, jobAttemptId) => { }; const put = async (frameworkName, config, rawConfig) => { + // Lazy load to avoid circular dependency + const userModel = require('@pai/models/v2/user'); const [userName] = frameworkName.split(/~(.+)/); const virtualCluster = From 534c8b418a05bef93bdcddac882bc730c6796396 Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Thu, 5 Feb 2026 17:01:00 -0800 Subject: [PATCH 4/6] fix more circular dependencies in rest server (#145) Co-authored-by: Rui Gao Co-authored-by: zhogu <57975490+zhogu@users.noreply.github.com> --- src/rest-server/src/models/v2/storage.js | 7 ++++++- src/rest-server/src/utils/manager/user/crudK8sSecret.js | 4 +++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/rest-server/src/models/v2/storage.js b/src/rest-server/src/models/v2/storage.js index 8002e37a..8bf884d5 100644 --- a/src/rest-server/src/models/v2/storage.js +++ b/src/rest-server/src/models/v2/storage.js @@ -18,7 +18,6 @@ // module dependencies const status = require('statuses'); const createError = require('@pai/utils/error'); -const user = require('@pai/models/v2/user'); const secret = require('@pai/models/kubernetes/k8s-secret'); const kubernetes = require('@pai/models/kubernetes/kubernetes'); const logger = require('@pai/config/logger'); @@ -178,6 +177,9 @@ const convertVolumeDetail = async (pvc) => { }; const list = async (userName, filterDefault = false) => { + // Lazy require to avoid circular dependency + const user = require('@pai/models/v2/user'); + let response; if (pvcCache.has('storageList')) { logger.info('Read persistant volume claim list from cache'); @@ -247,6 +249,9 @@ const list = async (userName, filterDefault = false) => { }; const get = async (storageName, userName) => { + // Lazy require to avoid circular dependency + const user = require('@pai/models/v2/user'); + let response; if (pvcCache.has(storageName)) { logger.info(`Read 
persistant volume claim from cache: ${storageName}`); diff --git a/src/rest-server/src/utils/manager/user/crudK8sSecret.js b/src/rest-server/src/utils/manager/user/crudK8sSecret.js index 07ed4532..83933cc1 100644 --- a/src/rest-server/src/utils/manager/user/crudK8sSecret.js +++ b/src/rest-server/src/utils/manager/user/crudK8sSecret.js @@ -20,7 +20,6 @@ const logger = require('@pai/config/logger'); const groupModel = require('@pai/models/v2/group'); const k8sModel = require('@pai/models/kubernetes/kubernetes'); const { Mutex } = require('async-mutex'); -const { job } = require('@pai/models/v2/job'); const USER_NAMESPACE = process.env.PAI_USER_NAMESPACE || 'pai-user-v2'; @@ -44,6 +43,9 @@ const cache = new Map(); const readMutex = new Mutex(); async function getHistoryVCs(name, grouplist, retrieveFromHistory=true) { + // Lazy require to avoid circular dependency + const { job } = require('@pai/models/v2/job'); + // Retrieve VC list from the user's job history let vcsFromJob = []; if (retrieveFromHistory) { From 9173b41f3310bcbd664ef7868bb98e35989250c7 Mon Sep 17 00:00:00 2001 From: zhogu <57975490+zhogu@users.noreply.github.com> Date: Fri, 6 Feb 2026 15:29:31 +0800 Subject: [PATCH 5/6] Update the workflow tigger (#146) * change trigger of github workflow * update * update * update --- .github/workflows/build-all.yaml | 6 +++--- .github/workflows/build-deploy-changes.yaml | 2 -- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-all.yaml b/.github/workflows/build-all.yaml index 020ba918..89ecdd43 100644 --- a/.github/workflows/build-all.yaml +++ b/.github/workflows/build-all.yaml @@ -4,9 +4,8 @@ permissions: contents: read on: - push: - branches: ["release/*"] pull_request: + types: [opened, reopened, closed] branches: ["release/*"] release: types: [published] @@ -15,7 +14,7 @@ on: branch: description: 'The branch name or tag to run the workflow on' required: true - default: 'main' + default: 'dev' type: string env: @@ -27,6 +26,7 @@ 
jobs: runs-on: [self-hosted, paicicd] timeout-minutes: 120 environment: auto-test + if: github.event_name != 'pull_request' || ( github.event.action == 'opened' || github.event.action == 'reopened' || github.event.pull_request.merged == true) container: image: ubuntu:latest volumes: diff --git a/.github/workflows/build-deploy-changes.yaml b/.github/workflows/build-deploy-changes.yaml index 7dafb03f..6c8987b5 100644 --- a/.github/workflows/build-deploy-changes.yaml +++ b/.github/workflows/build-deploy-changes.yaml @@ -5,8 +5,6 @@ permissions: contents: read on: - push: - branches: [main, dev, "release/*"] pull_request: branches: [main, dev, "release/*"] From 9a5df0cd3b6f80d1cec888485878ec4a30b56d95 Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Mon, 9 Feb 2026 14:06:28 +0800 Subject: [PATCH 6/6] add release note for v1.5 (#148) Co-authored-by: Rui Gao --- .../blog/2026-02-06-release-1-5.md | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 docs/LuciaTrainingPlatform/blog/2026-02-06-release-1-5.md diff --git a/docs/LuciaTrainingPlatform/blog/2026-02-06-release-1-5.md b/docs/LuciaTrainingPlatform/blog/2026-02-06-release-1-5.md new file mode 100644 index 00000000..56b4aa1f --- /dev/null +++ b/docs/LuciaTrainingPlatform/blog/2026-02-06-release-1-5.md @@ -0,0 +1,39 @@ +--- +slug: release-ltp-v1.5 +title: Releasing Lucia Training Platform v1.5 +author: Lucia Training Platform Team +tags: [ltp, announcement, release] +--- + +We are pleased to announce the official release of **Lucia Training Platform v1.5.0**! + +## Lucia Training Platform v1.5.0 Release Notes + +This release focuses on platform stability improvements, deployment enhancements, hardware compatibility, and security updates. 
+ +## Platform Features & Stability +- Added a feature to track users' historical jobs in non-existent virtual clusters, including job status and logs +- Added support for assigning custom job names +- Updated the workflow trigger conditions so that the workflow is triggered and executed only when a pull request targeting the main, dev, or release/* branch is closed + +## Deployment & Hardware Support +- Fixed kubespray deployment on bare metal when the cffi package is installed by the package manager +- Fixed NUMA parsing in job exporter on GB200 hardware +- Fixed job exporter compatibility issue on ARM nodes +- Enhanced multi-architecture support across platform components + +## Security +- Updated Kubernetes scheduler version to 1.33.1 +- Updated Go version to 1.24.9 for framework controller, hivedscheduler, and watchdog +- Updated Node.js packages for alert-handler, job-status-change-notification, and rest-server +- Updated Python packages for cluster local storage, copilot-chat, and dashboard-data-backup +- Updated RPM packages for database-controller +- Updated Docker version for webportal-dind +- Fixed security issues in multiple component dependencies +- Fixed the missing "logger" module error when running with newer DCGM versions + +## Storage & Infrastructure +- Mounted SSH key pairs for cluster local storage +- Enhanced cluster local storage security and stability +- Fixed the Docker pull problem after the Docker version update +