|
275 | 275 | > "${ARTIFACT_DIR}/gpu_operand_ds_$(echo "$ds" | cut -d/ -f2).descr" |
276 | 276 | done |
277 | 277 |
|
# ---------------------------------------------------------------------------
# Kubernetes events of the operator namespace.
# Reads:  K, OPERATOR_NAMESPACE, ARTIFACT_DIR (all set outside this section).
# Writes: ${ARTIFACT_DIR}/events_operator_namespace.log (overwritten).
# ---------------------------------------------------------------------------

# Section banner for the console log: blank line, three '#' lines, blank line.
printf '%s\n' "" "#" "# Kubernetes Events (operator namespace)" "#" ""

echo "Get events in ${OPERATOR_NAMESPACE} (sorted by last timestamp, default TTL is 1h)"

events_log="${ARTIFACT_DIR}/events_operator_namespace.log"

# Best effort: stderr is captured in the same log, and a failing call must not
# abort the rest of the collection.
$K get events --namespace "${OPERATOR_NAMESPACE}" --sort-by='.lastTimestamp' \
  > "${events_log}" 2>&1 || true
| 289 | + |
echo ""
echo "#"
echo "# GPU Node Upgrade State"
echo "#"
echo ""

# ---------------------------------------------------------------------------
# Per GPU node, record the driver-upgrade related state in one report file:
# upgrade annotations, the upgrade-state label, node conditions, the
# unschedulable flag, the driver pod(s) with their controller-revision-hash,
# and the events that reference the node.
#
# Reads:  K, OPERATOR_NAMESPACE, ARTIFACT_DIR, gpu_pci_nodes (all set outside
#         this section).
#         NOTE(review): gpu_pci_nodes is presumably a whitespace-separated
#         list of "node/<name>" references -- confirm against its definition.
# Writes: appends to ${ARTIFACT_DIR}/gpu_nodes.upgrade_state.
#
# Every query is best effort: kubectl errors are discarded and never abort
# the collection.
# ---------------------------------------------------------------------------
echo "Get upgrade-related annotations and labels for GPU nodes"
upgrade_state_file="${ARTIFACT_DIR}/gpu_nodes.upgrade_state"

# Word splitting of the node list is intentional (one resource ref per word).
# shellcheck disable=SC2086
for node in ${gpu_pci_nodes}; do
  # Strip the "<kind>/" prefix to get the bare node name.
  node_name="${node##*/}"

  # One redirection for the whole per-node report instead of one per command.
  {
    echo "=== ${node_name} ==="

    echo "# Upgrade annotations:"
    # Emit one "key=value" line per annotation through a go-template, so that
    # annotation values containing commas are not split and no JSON
    # punctuation leaks into the report. grep -F matches the key prefix
    # literally; when nothing matches (or kubectl fails) print a placeholder.
    $K get "${node}" \
        -o go-template='{{range $k, $v := .metadata.annotations}}{{$k}}={{$v}}{{"\n"}}{{end}}' \
        2>/dev/null \
      | grep -F 'nvidia.com/gpu-driver' 2>/dev/null \
      || echo " (none)"
    echo ""

    echo "# Upgrade state label:"
    $K get "${node}" \
      -ojsonpath='{.metadata.labels.nvidia\.com/gpu-driver-upgrade-state}' \
      2>/dev/null || true
    echo ""

    # Cordon state is not a node condition; it is reported separately below
    # from .spec.unschedulable.
    echo "# Node conditions (Ready, MemoryPressure, etc.):"
    $K get "${node}" \
      -o jsonpath='{range .status.conditions[*]}{.type}={.status} {end}' \
      2>/dev/null || true
    echo ""

    echo "# Unschedulable:"
    $K get "${node}" -ojsonpath='{.spec.unschedulable}' 2>/dev/null || true
    echo ""

    echo "# Driver pod controller-revision-hash:"
    # List every driver pod scheduled on the node: during a rolling upgrade an
    # old and a new pod can coexist, and ranging over the items also avoids an
    # index error when no pod is present.
    $K get pods -n "${OPERATOR_NAMESPACE}" -lapp=nvidia-driver-daemonset \
      --field-selector "spec.nodeName=${node_name}" \
      -ojsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.labels.controller-revision-hash}{"\n"}{end}' \
      2>/dev/null || true
    echo ""

    echo "# Events on node (upgrade-related):"
    $K get events -A \
      --field-selector "involvedObject.name=${node_name},involvedObject.kind=Node" \
      --sort-by='.lastTimestamp' \
      2>/dev/null || true
    echo ""
  } >> "${upgrade_state_file}"
done
| 336 | + |
echo ""
echo "#"
echo "# Controller Revisions (operand DaemonSets)"
echo "#"
echo ""

# ---------------------------------------------------------------------------
# Controller revisions of the operator namespace.
# Reads:  K, OPERATOR_NAMESPACE, ARTIFACT_DIR (all set outside this section).
# Writes: ${ARTIFACT_DIR}/controller_revisions.log (overwritten) and appends
#         YAML documents to controller_revisions_driver.yaml /
#         controller_revisions_other.yaml in the same directory.
# ---------------------------------------------------------------------------
echo "Get controller revisions in ${OPERATOR_NAMESPACE}"
$K get controllerrevisions \
  -n "${OPERATOR_NAMESPACE}" \
  --sort-by='.revision' \
  > "${ARTIFACT_DIR}/controller_revisions.log" 2>&1 || true

echo "Get controller revision details (driver and other operands)"
# A single list call yields "<name> <first owner name>" per revision, instead
# of one extra kubectl call per revision just to look up its owner. Kubernetes
# object names cannot contain whitespace, so a space is a safe separator; the
# owner field is empty when the revision has no ownerReferences.
$K get controllerrevisions -n "${OPERATOR_NAMESPACE}" \
    -ojsonpath='{range .items[*]}{.metadata.name}{" "}{.metadata.ownerReferences[0].name}{"\n"}{end}' \
    2>/dev/null \
  | while read -r cr_name cr_owner; do
      [ -n "${cr_name}" ] || continue

      # Revisions whose owner name mentions "driver" (case-insensitive) go to
      # the driver dump; everything else goes to the "other" dump.
      if printf '%s\n' "${cr_owner}" | grep -qi 'driver'; then
        cr_dump="${ARTIFACT_DIR}/controller_revisions_driver.yaml"
      else
        cr_dump="${ARTIFACT_DIR}/controller_revisions_other.yaml"
      fi

      {
        # stdin is detached so kubectl cannot consume the list being read by
        # the loop; stderr is kept in the dump for troubleshooting.
        $K get controllerrevision "${cr_name}" -n "${OPERATOR_NAMESPACE}" -oyaml \
          < /dev/null 2>&1 || true
        echo "---"
      } >> "${cr_dump}"
    done || true
| 362 | + |
278 | 363 | echo "" |
279 | 364 | echo "#" |
280 | 365 | echo "# nvidia-bug-report.sh" |
|
0 commit comments