Skip to content

Commit 83404d9

Browse files
Collect events and upgrade state in must-gather.sh
* Collect Kubernetes events in operator namespace.
* Collect per-GPU-node upgrade state (annotations, labels, cordon status, node events).
* Collect controller revisions for driver and other operand DaemonSets.

Signed-off-by: Rajath Agasthya <ragasthya@nvidia.com>
1 parent 532f2f0 commit 83404d9

1 file changed

Lines changed: 85 additions & 0 deletions

File tree

hack/must-gather.sh

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,91 @@ do
275275
> "${ARTIFACT_DIR}/gpu_operand_ds_$(echo "$ds" | cut -d/ -f2).descr"
276276
done
277277

278+
# Section banner for the events collected from the operator namespace.
printf '%s\n' "" "#" "# Kubernetes Events (operator namespace)" "#" ""

printf '%s\n' "Get events in ${OPERATOR_NAMESPACE} (sorted by last timestamp, default TTL is 1h)"
# Best effort: stdout and stderr both land in the artifact, and a failing
# query must not abort the rest of the collection.
$K get events --sort-by='.lastTimestamp' -n "${OPERATOR_NAMESPACE}" \
  > "${ARTIFACT_DIR}/events_operator_namespace.log" 2>&1 || true
289+
290+
echo ""
echo "#"
echo "# GPU Node Upgrade State"
echo "#"
echo ""

echo "Get upgrade-related annotations and labels for GPU nodes"
# Writes one report section per GPU node into gpu_nodes.upgrade_state.
#
# Reads:  gpu_pci_nodes (whitespace-separated "<kind>/<name>" node references),
#         K (kubectl-style client command), OPERATOR_NAMESPACE, ARTIFACT_DIR.
# Writes: ${ARTIFACT_DIR}/gpu_nodes.upgrade_state
#
# The file is opened once by the redirect on the loop, so a re-run into the
# same ARTIFACT_DIR replaces the report instead of appending duplicate
# sections. Every query is best effort: a failing lookup leaves its section
# empty rather than aborting the collection.
# shellcheck disable=SC2086 # word splitting of the node list is intended
for node in ${gpu_pci_nodes}; do
  # Drop the "<kind>/" prefix to get the bare node name.
  node_name=${node#*/}
  echo "=== ${node_name} ==="

  echo "# Upgrade annotations:"
  # Emit one "key=value" line per annotation via a go-template. Splitting a
  # dump of the whole annotation map on ',' breaks values that themselves
  # contain commas and leaves brace/quote residue on the matched lines.
  # grep -F matches the prefix literally (the dots are not regex wildcards).
  $K get "${node}" \
    -o go-template='{{range $k, $v := .metadata.annotations}}{{printf "%s=%s\n" $k $v}}{{end}}' \
    2>/dev/null \
    | grep -F 'nvidia.com/gpu-driver' \
    || echo " (none)"
  echo ""

  echo "# Upgrade state label:"
  $K get "${node}" -ojsonpath='{.metadata.labels.nvidia\.com/gpu-driver-upgrade-state}' 2>/dev/null \
    || true
  echo ""

  # Prints every entry of .status.conditions as "<type>=<status>"; the cordon
  # flag is reported separately below from .spec.unschedulable.
  echo "# Node conditions (Ready, SchedulingDisabled, etc.):"
  $K get "${node}" -o jsonpath='{range .status.conditions[*]}{.type}={.status} {end}' 2>/dev/null \
    || true
  echo ""

  echo "# Unschedulable:"
  $K get "${node}" -ojsonpath='{.spec.unschedulable}' 2>/dev/null \
    || true
  echo ""

  # Only the first matching driver pod on the node is inspected (.items[0]);
  # when no pod matches, the jsonpath error is discarded and the section
  # stays empty.
  echo "# Driver pod controller-revision-hash:"
  $K get pods -n "${OPERATOR_NAMESPACE}" -lapp=nvidia-driver-daemonset --field-selector "spec.nodeName=${node_name}" \
    -ojsonpath='{.items[0].metadata.labels.controller-revision-hash}' 2>/dev/null \
    || true
  echo ""

  # All events whose involved object is this Node are collected; no
  # upgrade-specific filter is applied beyond the field selector.
  echo "# Events on node (upgrade-related):"
  $K get events -A --field-selector "involvedObject.name=${node_name},involvedObject.kind=Node" \
    --sort-by='.lastTimestamp' \
    2>/dev/null \
    || true
  echo ""
done > "${ARTIFACT_DIR}/gpu_nodes.upgrade_state"
336+
337+
# Section banner for the ControllerRevision collection.
printf '%s\n' "" "#" "# Controller Revisions (operand DaemonSets)" "#" ""

printf '%s\n' "Get controller revisions in ${OPERATOR_NAMESPACE}"
# Best effort: the listing (or the client's error output) is stored in the
# artifact, and a failure does not stop the remaining collection steps.
$K get controllerrevisions --sort-by='.revision' -n "${OPERATOR_NAMESPACE}" \
  > "${ARTIFACT_DIR}/controller_revisions.log" 2>&1 || true
348+
349+
echo "Get controller revision details (driver and other operands)"
# Append the YAML of every ControllerRevision in the operator namespace to one
# of two artifacts, chosen by the name of the revision's first owner reference:
# owners whose name contains "driver" (case-insensitive) go to the driver file,
# everything else to the "other" file. Documents are separated by "---".
for revision in $($K get controllerrevisions -n "${OPERATOR_NAMESPACE}" -oname 2>/dev/null); do
  owner_name=$($K get "${revision}" -n "${OPERATOR_NAMESPACE}" -ojsonpath='{.metadata.ownerReferences[0].name}' 2>/dev/null || true)

  # Default destination; switched when the owner looks like a driver DaemonSet.
  revisions_file="${ARTIFACT_DIR}/controller_revisions_other.yaml"
  if echo "${owner_name}" | grep -qi 'driver'; then
    revisions_file="${ARTIFACT_DIR}/controller_revisions_driver.yaml"
  fi

  # Best effort: client errors are captured in the artifact rather than
  # aborting the loop.
  $K get "${revision}" -n "${OPERATOR_NAMESPACE}" -oyaml \
    >> "${revisions_file}" 2>&1 || true
  echo "---" >> "${revisions_file}"
done
362+
278363
echo ""
279364
echo "#"
280365
echo "# nvidia-bug-report.sh"

0 commit comments

Comments
 (0)