From 0a3d654c20e7e542ea46d827e58136d105ca97b5 Mon Sep 17 00:00:00 2001 From: frostebite Date: Thu, 14 May 2026 15:55:43 +0100 Subject: [PATCH 1/2] fix: add image-level admin retry actions --- .../docs/versions/image-job-admin-actions.tsx | 153 ++++++++++++++++++ .../docs/versions/unity-version.tsx | 2 + 2 files changed, 155 insertions(+) create mode 100644 src/components/docs/versions/image-job-admin-actions.tsx diff --git a/src/components/docs/versions/image-job-admin-actions.tsx b/src/components/docs/versions/image-job-admin-actions.tsx new file mode 100644 index 00000000..3235f4a4 --- /dev/null +++ b/src/components/docs/versions/image-job-admin-actions.tsx @@ -0,0 +1,153 @@ +import React, { useState } from 'react'; +import { HiOutlineRefresh } from 'react-icons/hi'; +import { MdRestartAlt } from 'react-icons/md'; +import { useFirestore, useFirestoreCollectionData, useUser } from 'reactfire'; +import { SimpleAuthCheck } from '@site/src/components/auth/safe-auth-check'; +import config from '@site/src/core/config'; +import { useNotification } from '@site/src/core/hooks/use-notification'; +import Spinner from '@site/src/components/molecules/spinner'; +import Tooltip from '@site/src/components/molecules/tooltip/tooltip'; + +interface Props { + ciJobId: string; + status: string; +} + +type BuildRecord = { + buildId: string; + relatedJobId: string; + status: string; + meta?: { + failureCount?: number; + }; +}; + +const buttonStyle: React.CSSProperties = { + padding: 0, + border: 0, + outline: 0, + cursor: 'pointer', + background: 'transparent', + display: 'inline-flex', + alignItems: 'center', +}; + +const ImageJobAdminActions = ({ ciJobId, status }: Props) => { + const firestore = useFirestore(); + const { data: user } = useUser(); + const notify = useNotification(); + const [runningAction, setRunningAction] = useState<'reset' | 'retry' | null>(null); + + const ciBuilds = firestore.collection('ciBuilds').where('relatedJobId', '==', ciJobId); + const { status: buildStatus, data = [] } = useFirestoreCollectionData(ciBuilds); + + if (status !== 'failed' || buildStatus === 'loading') return null; + + const failedBuilds = data.filter((build) => build.status === 'failed'); + if (failedBuilds.length === 0) return null; + + const maxedOutBuilds = failedBuilds.filter((build) => (build.meta?.failureCount || 0) >= 15); + + const callEndpoint = async (endpoint: string, payload: object) => { + if (!user) { + throw new Error('User not authenticated'); + } + + const token = await user.getIdToken(); + const response = await fetch(`${config.backendUrl}/${endpoint}`, { + headers: { + Authorization: `Bearer ${token}`, + 'Content-Type': 'application/json', + }, + mode: 'cors', + method: 'POST', + body: JSON.stringify(payload), + }); + + const body = await response.json(); + if (!response.ok) { + const detail = body.error ? `${body.message}: ${body.error}` : body.message; + throw new Error(detail || `Request failed (${response.status})`); + } + return body; + }; + + const runAction = async ( + action: 'reset' | 'retry', + builds: BuildRecord[], + endpoint: 'resetFailedBuilds' | 'retryBuild', + ) => { + if (builds.length === 0) return; + + setRunningAction(action); + try { + let succeeded = 0; + let failed = 0; + + for (const build of builds) { + try { + const payload = + endpoint === 'retryBuild' + ? { buildId: build.buildId, relatedJobId: build.relatedJobId } + : { buildId: build.buildId }; + // Sequential calls avoid hammering the backend for a single image row action. + await callEndpoint(endpoint, payload); + succeeded += 1; + } catch { + failed += 1; + } + } + + if (failed > 0) { + notify.error( + `${action === 'retry' ? 'Retried' : 'Reset'} ${succeeded}/${builds.length} builds. ${failed} failed.`, + ); + } else { + notify.success( + `${action === 'retry' ? 'Retried' : 'Reset'} ${succeeded} build${succeeded === 1 ? '' : 's'} for ${ciJobId}.`, + ); + } + } finally { + setRunningAction(null); + } + }; + + return ( + } requiredClaims={{ admin: true }}> + + {maxedOutBuilds.length > 0 && ( + + + + )} + + + + + + ); +}; + +export default ImageJobAdminActions; diff --git a/src/components/docs/versions/unity-version.tsx b/src/components/docs/versions/unity-version.tsx index cffa609e..2ed1d00c 100644 --- a/src/components/docs/versions/unity-version.tsx +++ b/src/components/docs/versions/unity-version.tsx @@ -1,6 +1,7 @@ import React, { useRef, useState } from 'react'; import Builds from '@site/src/components/docs/versions/builds/builds'; import DateTime from '@site/src/components/docs/versions/date-time'; +import ImageJobAdminActions from '@site/src/components/docs/versions/image-job-admin-actions'; import ShowAndCopyChangeSetHashButton from '@site/src/components/docs/versions/show-and-copy-change-set-hash-button'; import Spinner from '@site/src/components/molecules/spinner'; import styles from './unity-version.module.scss'; @@ -65,6 +66,7 @@ const UnityVersion = ({ data }: Props) => { {ciJobStatusToIconMap[status]} {id} + - Last updated: From bd011e12fa553cb603a3bf48e460045f0d89a4ad Mon Sep 17 00:00:00 2001 From: frostebite Date: Thu, 14 May 2026 16:32:40 +0100 Subject: [PATCH 2/2] feat: improve admin queue diagnostics ui --- .../docs/versions/build-status-dashboard.tsx | 32 ++ .../docs/versions/image-versions.tsx | 67 +++- .../docs/versions/queue-management-panel.tsx | 346 ++++++++++++++++++ 3 files changed, 428 insertions(+), 17 deletions(-) diff --git a/src/components/docs/versions/build-status-dashboard.tsx b/src/components/docs/versions/build-status-dashboard.tsx index d314e2f3..d86ecb57 100644 --- a/src/components/docs/versions/build-status-dashboard.tsx +++ b/src/components/docs/versions/build-status-dashboard.tsx @@ -6,6 +6,24 @@ interface Props { selectedRepoVersion: string | undefined; } +const minutesSinceTimestamp = ( + timestamp?: { + seconds?: number; + } | null, +): number | null => { + if (!timestamp?.seconds) return null; + const diffMs = Date.now() - timestamp.seconds * 1000; + return diffMs < 0 ? 0 : Math.floor(diffMs / 60000); +}; + +const formatAgeMinutes = (minutes: number | null): string => { + if (minutes === null) return 'n/a'; + if (minutes < 60) return `${minutes}m`; + const hours = Math.floor(minutes / 60); + const remainingMinutes = minutes % 60; + return remainingMinutes === 0 ? `${hours}h` : `${hours}h ${remainingMinutes}m`; +}; + const BuildStatusDashboard = ({ selectedRepoVersion }: Props) => { const firestore = useFirestore(); const ciBuilds = firestore @@ -31,6 +49,12 @@ const BuildStatusDashboard = ({ selectedRepoVersion }: Props) => { const maxedOut = builds.filter( (b) => b.status === 'failed' && (b.meta?.failureCount || 0) >= 15, ).length; + const startedAges = builds + .filter((b) => b.status === 'started') + .map((b) => minutesSinceTimestamp(b.meta?.lastBuildStart)) + .filter((minutes): minutes is number => minutes !== null); + const staleStarted = startedAges.filter((minutes) => minutes >= 45).length; + const oldestStarted = startedAges.length > 0 ? Math.max(...startedAges) : null; const total = builds.length; const statStyle = (color: string): React.CSSProperties => ({ @@ -74,6 +98,14 @@ const BuildStatusDashboard = ({ selectedRepoVersion }: Props) => { Stuck (15+): {maxedOut} )} + {staleStarted > 0 && ( + + Started 45m+: {staleStarted} + + )} + + Oldest started: {formatAgeMinutes(oldestStarted)} + { minWidth: 220, }; + const headerCardStyle: React.CSSProperties = { + display: 'grid', + gap: 12, + padding: '14px 16px', + borderRadius: 10, + border: '1px solid #33333322', + background: '#fafafa08', + marginBottom: 12, + }; + + const toolbarStyle: React.CSSProperties = { + display: 'flex', + flexWrap: 'wrap', + gap: 10, + alignItems: 'center', + justifyContent: 'space-between', + }; + return (

Supported Editor Versions

-
- Docker repo version: +
+
+
+ Docker repo version + setSelectedVersion(event.target.value)}> - {versions.map((version) => { - const { NO_ID_FIELD: id } = version; + return ( + + ); + })} + +
- return ( - - ); - })} - - - - - - +
+ } requiredClaims={{ admin: true }}> + + + + +
+
+ +

+ Admin controls for queue recovery, retry management, and selected-repo diagnostics live + below. Use the dashboard first, then the queue panel for root-cause detail. +

diff --git a/src/components/docs/versions/queue-management-panel.tsx b/src/components/docs/versions/queue-management-panel.tsx index d010528a..6388364a 100644 --- a/src/components/docs/versions/queue-management-panel.tsx +++ b/src/components/docs/versions/queue-management-panel.tsx @@ -12,6 +12,12 @@ type QueueJob = { repoVersionInfo?: { version?: string; }; + addedDate?: { + seconds?: number; + }; + modifiedDate?: { + seconds?: number; + }; }; type QueueBuild = { @@ -28,6 +34,15 @@ type QueueBuild = { dockerInfo?: { digest?: string; } | null; + meta?: { + lastBuildStart?: { + seconds?: number; + } | null; + publishedDate?: { + seconds?: number; + } | null; + failureCount?: number; + }; }; type QueueStatusResponse = { @@ -74,6 +89,24 @@ const cellStyle: React.CSSProperties = { const getJobRepoVersion = (job: QueueJob | undefined): string => job?.repoVersionInfo?.version || ''; +const minutesSinceTimestamp = ( + timestamp?: { + seconds?: number; + } | null, +): number | null => { + if (!timestamp?.seconds) return null; + const diffMs = Date.now() - timestamp.seconds * 1000; + return diffMs < 0 ? 0 : Math.floor(diffMs / 60000); +}; + +const formatAgeMinutes = (minutes: number | null): string => { + if (minutes === null) return 'n/a'; + if (minutes < 60) return `${minutes}m`; + const hours = Math.floor(minutes / 60); + const remainingMinutes = minutes % 60; + return remainingMinutes === 0 ? `${hours}h` : `${hours}h ${remainingMinutes}m`; +}; + const buildDiagnosticsPrompt = ( selectedRepoVersion: string, diagnostics: { @@ -86,6 +119,32 @@ const buildDiagnosticsPrompt = ( buildInfo: { repoVersion: string }; }>; failedWithDockerInfo: Array<{ buildId: string; dockerInfo?: { digest?: string } | null }>; + selectedRepoStaleCreatedJobs: Array<{ jobId?: string; ageMinutes: number }>; + selectedRepoStaleStartedBuilds: Array<{ + buildId: string; + relatedJobId: string; + ageMinutes: number; + }>; + selectedRepoJobCounts: { + created: number; + scheduled: number; + inProgress: number; + completed: number; + failed: number; + }; + selectedRepoBuildCounts: { + started: number; + failed: number; + published: number; + }; + selectedRepoBaseJobStatus: string; + selectedRepoHubJobStatus: string; + selectedRepoCreatedJobsOlderThan60m: number; + selectedRepoOldestCreatedJobAgeMinutes: number | null; + selectedRepoStartedBuildsOlderThan45m: number; + selectedRepoStartedBuildsOlderThan6h: number; + selectedRepoOldestStartedBuildAgeMinutes: number | null; + selectedRepoMaxedOutFailedBuilds: number; totals: { jobs: number; builds: number; @@ -110,8 +169,44 @@ const buildDiagnosticsPrompt = ( `- Older-version created jobs: ${diagnostics.staleCreatedJobs.length}`, `- Repo-version drift builds (build repo == ${selectedRepoVersion}, job repo != ${selectedRepoVersion}): ${diagnostics.repoDriftBuilds.length}`, `- Failed builds with Docker metadata: ${diagnostics.failedWithDockerInfo.length}`, + `- Selected repo base job status: ${diagnostics.selectedRepoBaseJobStatus}`, + `- Selected repo hub job status: ${diagnostics.selectedRepoHubJobStatus}`, + `- Selected repo editor jobs by status: created=${diagnostics.selectedRepoJobCounts.created}, scheduled=${diagnostics.selectedRepoJobCounts.scheduled}, inProgress=${diagnostics.selectedRepoJobCounts.inProgress}, completed=${diagnostics.selectedRepoJobCounts.completed}, failed=${diagnostics.selectedRepoJobCounts.failed}`, + `- Selected repo builds by status: started=${diagnostics.selectedRepoBuildCounts.started}, failed=${diagnostics.selectedRepoBuildCounts.failed}, published=${diagnostics.selectedRepoBuildCounts.published}`, + `- Selected repo created jobs older than 60m: ${diagnostics.selectedRepoCreatedJobsOlderThan60m}`, + `- Selected repo oldest created job age: ${formatAgeMinutes(diagnostics.selectedRepoOldestCreatedJobAgeMinutes)}`, + `- Selected repo started builds older than 45m: ${diagnostics.selectedRepoStartedBuildsOlderThan45m}`, + `- Selected repo started builds older than 6h: ${diagnostics.selectedRepoStartedBuildsOlderThan6h}`, + `- Selected repo oldest started build age: ${formatAgeMinutes(diagnostics.selectedRepoOldestStartedBuildAgeMinutes)}`, + `- Selected repo maxed-out failed builds (15+ failures): ${diagnostics.selectedRepoMaxedOutFailedBuilds}`, ]; + if (diagnostics.selectedRepoStaleCreatedJobs.length > 0) { + const total = diagnostics.selectedRepoStaleCreatedJobs.length; + const label = + total > 10 + ? `Selected repo created jobs older than 60m: (showing first 10 of ${total})` + : 'Selected repo created jobs older than 60m:'; + lines.push('', label); + diagnostics.selectedRepoStaleCreatedJobs.slice(0, 10).forEach((job) => { + lines.push(`- ${job.jobId} (${formatAgeMinutes(job.ageMinutes)} old)`); + }); + } + + if (diagnostics.selectedRepoStaleStartedBuilds.length > 0) { + const total = diagnostics.selectedRepoStaleStartedBuilds.length; + const label = + total > 10 + ? `Selected repo started builds older than 45m: (showing first 10 of ${total})` + : 'Selected repo started builds older than 45m:'; + lines.push('', label); + diagnostics.selectedRepoStaleStartedBuilds.slice(0, 10).forEach((build) => { + lines.push( + `- ${build.buildId} linked to ${build.relatedJobId} (${formatAgeMinutes(build.ageMinutes)} old)`, + ); + }); + } + if (diagnostics.staleFailedJobs.length > 0) { const total = diagnostics.staleFailedJobs.length; const label = @@ -225,6 +320,10 @@ const QueueManagementPanel = ({ selectedRepoVersion }: Props) => { } const editorJobs = jobs.filter((job) => job.imageType === 'editor'); + const selectedRepoJobs = jobs.filter((job) => getJobRepoVersion(job) === selectedRepoVersion); + const selectedRepoEditorJobs = selectedRepoJobs.filter((job) => job.imageType === 'editor'); + const selectedRepoBaseJob = selectedRepoJobs.find((job) => job.imageType === 'base'); + const selectedRepoHubJob = selectedRepoJobs.find((job) => job.imageType === 'hub'); const staleFailedJobs = editorJobs .filter( (job) => @@ -269,12 +368,66 @@ const QueueManagementPanel = ({ selectedRepoVersion }: Props) => { const failedWithDockerInfo = builds.filter( (build) => build.status === 'failed' && build.dockerInfo?.digest, ); + const selectedRepoStartedBuilds = builds.filter((build) => build.status === 'started'); + const selectedRepoStartedBuildAges = selectedRepoStartedBuilds + .map((build) => minutesSinceTimestamp(build.meta?.lastBuildStart)) + .filter((minutes): minutes is number => minutes !== null); + const selectedRepoCreatedJobAges = selectedRepoEditorJobs + .filter((job) => job.status === 'created') + .map((job) => minutesSinceTimestamp(job.addedDate || job.modifiedDate)) + .filter((minutes): minutes is number => minutes !== null); return { staleFailedJobs, staleCreatedJobs, repoDriftBuilds, failedWithDockerInfo, + selectedRepoStaleCreatedJobs: selectedRepoEditorJobs + .filter((job) => job.status === 'created') + .map((job) => ({ + jobId: job.NO_ID_FIELD || job.id, + ageMinutes: minutesSinceTimestamp(job.addedDate || job.modifiedDate) || 0, + })) + .filter((job) => job.ageMinutes >= 60) + .sort((a, b) => b.ageMinutes - a.ageMinutes), + selectedRepoStaleStartedBuilds: selectedRepoStartedBuilds + .map((build) => ({ + buildId: build.buildId, + relatedJobId: build.relatedJobId, + ageMinutes: minutesSinceTimestamp(build.meta?.lastBuildStart) || 0, + })) + .filter((build) => build.ageMinutes >= 45) + .sort((a, b) => b.ageMinutes - a.ageMinutes), + selectedRepoJobCounts: { + created: selectedRepoEditorJobs.filter((job) => job.status === 'created').length, + scheduled: selectedRepoEditorJobs.filter((job) => job.status === 'scheduled').length, + inProgress: selectedRepoEditorJobs.filter((job) => job.status === 'inProgress').length, + completed: selectedRepoEditorJobs.filter((job) => job.status === 'completed').length, + failed: selectedRepoEditorJobs.filter((job) => job.status === 'failed').length, + }, + selectedRepoBuildCounts: { + started: builds.filter((build) => build.status === 'started').length, + failed: builds.filter((build) => build.status === 'failed').length, + published: builds.filter((build) => build.status === 'published').length, + }, + selectedRepoBaseJobStatus: selectedRepoBaseJob?.status || 'missing', + selectedRepoHubJobStatus: selectedRepoHubJob?.status || 'missing', + selectedRepoCreatedJobsOlderThan60m: selectedRepoCreatedJobAges.filter( + (minutes) => minutes >= 60, + ).length, + selectedRepoOldestCreatedJobAgeMinutes: + selectedRepoCreatedJobAges.length > 0 ? Math.max(...selectedRepoCreatedJobAges) : null, + selectedRepoStartedBuildsOlderThan45m: selectedRepoStartedBuildAges.filter( + (minutes) => minutes >= 45, + ).length, + selectedRepoStartedBuildsOlderThan6h: selectedRepoStartedBuildAges.filter( + (minutes) => minutes >= 360, + ).length, + selectedRepoOldestStartedBuildAgeMinutes: + selectedRepoStartedBuildAges.length > 0 ? Math.max(...selectedRepoStartedBuildAges) : null, + selectedRepoMaxedOutFailedBuilds: builds.filter( + (build) => build.status === 'failed' && (build.meta?.failureCount || 0) >= 15, + ).length, totals: { jobs: jobs.length, builds: builds.length, @@ -295,6 +448,52 @@ const QueueManagementPanel = ({ selectedRepoVersion }: Props) => { ); }, [clipboard, diagnostics, notify, selectedRepoVersion]); + const likelyBlockers = useMemo(() => { + const blockers: string[] = []; + + if (diagnostics.selectedRepoBaseJobStatus !== 'completed') { + blockers.push( + `Base image job for ${selectedRepoVersion} is ${diagnostics.selectedRepoBaseJobStatus}, so editor scheduling may be waiting on base-image completion.`, + ); + } + + if (diagnostics.selectedRepoHubJobStatus !== 'completed') { + blockers.push( + `Hub image job for ${selectedRepoVersion} is ${diagnostics.selectedRepoHubJobStatus}, so editor scheduling may be waiting on hub-image completion.`, + ); + } + + if ( + diagnostics.selectedRepoJobCounts.created > 0 && + diagnostics.selectedRepoJobCounts.scheduled === 0 && + diagnostics.selectedRepoJobCounts.inProgress === 0 + ) { + blockers.push( + `Selected repo has created editor jobs but no scheduled or in-progress editor jobs, which points to scheduling starvation rather than per-build failure recovery.`, + ); + } + + if (diagnostics.selectedRepoStaleStartedBuilds.length > 0) { + blockers.push( + `Selected repo has started builds older than 45 minutes, so queue slots may be occupied by stale workflows that need backend reconciliation.`, + ); + } + + if (diagnostics.selectedRepoMaxedOutFailedBuilds > 0) { + blockers.push( + `Selected repo still has builds at max retry count, so automatic reset/retry coverage is relevant for this repo version.`, + ); + } + + if (blockers.length === 0) { + blockers.push( + 'No obvious blocker from the current snapshot. Focus next on scheduled/in-progress job throughput and whether workers are actually reporting back to the backend.', + ); + } + + return blockers; + }, [diagnostics, selectedRepoVersion]); + return (
{
+
+ + Base job: {diagnostics.selectedRepoBaseJobStatus} + + + Hub job: {diagnostics.selectedRepoHubJobStatus} + + + Repo created 60m+: {diagnostics.selectedRepoCreatedJobsOlderThan60m} + + + Started 45m+: {diagnostics.selectedRepoStartedBuildsOlderThan45m} + + + Started 6h+: {diagnostics.selectedRepoStartedBuildsOlderThan6h} + + + Maxed failed: {diagnostics.selectedRepoMaxedOutFailedBuilds} + +
+

Use this panel to identify queue states that need intervention. Existing admin actions on this page remain the operational controls: reset failed builds, retry builds, and @@ -388,6 +608,132 @@ const QueueManagementPanel = ({ selectedRepoVersion }: Props) => {

+
+

Likely Blockers

+
+ {likelyBlockers.map((blocker) => ( +

+ {blocker} +

+ ))} +
+
+ +
+

Selected Repo Queue Health

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
MetricValue
Editor jobs: created / scheduled / in progress + {diagnostics.selectedRepoJobCounts.created} /{' '} + {diagnostics.selectedRepoJobCounts.scheduled} /{' '} + {diagnostics.selectedRepoJobCounts.inProgress} +
Editor jobs: completed / failed + {diagnostics.selectedRepoJobCounts.completed} /{' '} + {diagnostics.selectedRepoJobCounts.failed} +
Builds: started / failed / published + {diagnostics.selectedRepoBuildCounts.started} /{' '} + {diagnostics.selectedRepoBuildCounts.failed} /{' '} + {diagnostics.selectedRepoBuildCounts.published} +
Oldest created job + {formatAgeMinutes(diagnostics.selectedRepoOldestCreatedJobAgeMinutes)} +
Oldest started build + {formatAgeMinutes(diagnostics.selectedRepoOldestStartedBuildAgeMinutes)} +
+
+ +
+

Selected Repo Created Jobs Older Than 60m

+ + + + + + + + + + {diagnostics.selectedRepoStaleCreatedJobs.slice(0, 12).map((job) => ( + + + + + + ))} + {diagnostics.selectedRepoStaleCreatedJobs.length === 0 && ( + + + + )} + +
Job IDAgeAction
{job.jobId}{formatAgeMinutes(job.ageMinutes)} + Scheduling has not promoted this job out of created state. Check whether + base/hub prerequisites or queue capacity are blocking dispatch. +
+ No selected-repo created jobs older than 60 minutes detected. +
+
+ +
+

Selected Repo Started Builds Older Than 45m

+ + + + + + + + + + + {diagnostics.selectedRepoStaleStartedBuilds.slice(0, 12).map((build) => ( + + + + + + + ))} + {diagnostics.selectedRepoStaleStartedBuilds.length === 0 && ( + + + + )} + +
Build IDJob IDAgeAction
{build.buildId}{build.relatedJobId}{formatAgeMinutes(build.ageMinutes)} + If DockerHub already has the image, this should be auto-reconciled back to + published. If not, this build may still be occupying queue capacity. +
+ No selected-repo started builds older than 45 minutes detected. +
+
+

Repo-Version Drift Builds