From e581ce17a617024f99feef57bc0a09613ba2b550 Mon Sep 17 00:00:00 2001 From: Paul Smith Date: Sat, 6 Dec 2025 08:14:52 +1100 Subject: [PATCH 1/2] Fix macOS launchctl pmcd fork issue (#2307) Run pmcd in foreground mode when launched by launchctl to prevent the parent process exit after fork from being interpreted as a crash. - Add PMCD_LAUNCHED_BY_LAUNCHD env var in io.pcp.pmcd.plist - Enable KeepAlive for proper process monitoring - Detect env var in rc_pmcd and add -f flag to run in foreground --- build/mac/io.pcp.pmcd.plist | 7 ++++++- src/pmcd/rc_pmcd | 6 ++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/build/mac/io.pcp.pmcd.plist b/build/mac/io.pcp.pmcd.plist index 7dec183f65f..35eecc4125d 100644 --- a/build/mac/io.pcp.pmcd.plist +++ b/build/mac/io.pcp.pmcd.plist @@ -9,6 +9,11 @@ <string>/etc/init.d/pmcd</string> <string>start</string> </array> + <key>EnvironmentVariables</key> + <dict> + <key>PMCD_LAUNCHED_BY_LAUNCHD</key> + <string>1</string> + </dict> <key>Disabled</key> <false/> <key>UserName</key> @@ -16,7 +21,7 @@ <key>UserGroup</key> <string>wheel</string> <key>KeepAlive</key> - <false/> + <true/> <key>ThrottleInterval</key> <integer>30</integer> <key>RunAtLoad</key> diff --git a/src/pmcd/rc_pmcd b/src/pmcd/rc_pmcd index 01bb1b38d98..e2918a95637 100644 --- a/src/pmcd/rc_pmcd +++ b/src/pmcd/rc_pmcd @@ -602,6 +602,12 @@ Error: pmcd log directory $PCP_LOG_DIR/pmcd ("'"$LOGDIR"'") is missing, cannot s echo $prog: pmcd --verify $OPTS failed, cannot start pmcd. exit fi + # When launched by launchctl on macOS, run in foreground mode to prevent + # fork issues that cause launchctl to think pmcd has crashed + if [ -n "$PMCD_LAUNCHED_BY_LAUNCHD" ] + then + OPTS="$OPTS -f" + fi $PMCD $OPTS _start_pmcheck $RC_STATUS -v From f4c96a5bf713a138e751e2a00a9baf08dd101b7f Mon Sep 17 00:00:00 2001 From: Paul Smith Date: Sat, 6 Dec 2025 08:54:25 +1100 Subject: [PATCH 2/2] Add macOS-specific test suite for launchctl pmcd fix Adds automated test script and manual testing guide for validating the launchctl pmcd fork issue fix on macOS systems. 
- test-pmcd-launchctl.sh: Comprehensive automated test suite with proper timeout handling for slow VMs (3-minute timeouts) - TESTING.md: Manual testing procedures and troubleshooting guide Tests validate foreground mode operation, KeepAlive functionality, and proper launchctl service management. Placed in build/mac/qa/ to keep macOS-specific tests separate from the main qa/ suite. Not included in macOS package distribution (consistent with macOS installer excluding test infrastructure). --- build/mac/qa/TESTING.md | 244 +++++++++++++++++++++ build/mac/qa/test-pmcd-launchctl.sh | 315 ++++++++++++++++++++++++++++ 2 files changed, 559 insertions(+) create mode 100644 build/mac/qa/TESTING.md create mode 100755 build/mac/qa/test-pmcd-launchctl.sh diff --git a/build/mac/qa/TESTING.md b/build/mac/qa/TESTING.md new file mode 100644 index 00000000000..81d16df7ce7 --- /dev/null +++ b/build/mac/qa/TESTING.md @@ -0,0 +1,244 @@ +# macOS launchctl pmcd Test Plan + +## Automated Testing (Recommended) + +The automated test script handles all timing issues for slow VMs: + +```bash +# Make executable and run +chmod +x test-pmcd-launchctl.sh +sudo ./test-pmcd-launchctl.sh +``` + +**Note**: The script is configured for slow VMs with 3-minute timeouts. You can adjust the timeout values at the top of the script if needed. + +--- + +## Manual Testing + +For manual validation on slow VMs, use these commands with proper wait strategies: + +### Prerequisites +```bash +# Build and install PCP with the fix +cd /path/to/pcp-pmcd-daemon-launchctl +./configure --prefix=/usr --libexecdir=/usr/lib --sysconfdir=/etc --localstatedir=/var +make +sudo make install +``` + +### Helper Function for Waiting + +Add this to your shell session for easier testing: + +```bash +# Wait for pmcd to be ready (use PCP's built-in tool) +wait_pmcd() { + echo "Waiting for pmcd to respond..." 
+ if command -v pmcd_wait >/dev/null 2>&1; then + # Use PCP's pmcd_wait with 3 minute timeout + pmcd_wait -t 180 + else + # Fallback: poll until pmcd responds + local count=0 + while ! pminfo -f pmcd.version >/dev/null 2>&1; do + sleep 5 + count=$((count + 1)) + echo "Still waiting... (${count}x5s)" + if [ $count -gt 36 ]; then # 3 minutes + echo "Timeout waiting for pmcd" + return 1 + fi + done + fi + echo "pmcd is ready!" +} +``` + +### Test 1: Verify Configuration Changes + +```bash +# Check plist has environment variable and KeepAlive +cat /Library/LaunchDaemons/io.pcp.pmcd.plist | grep -A4 EnvironmentVariables +# Should show: PMCD_LAUNCHED_BY_LAUNCHD = 1 + +cat /Library/LaunchDaemons/io.pcp.pmcd.plist | grep -A1 KeepAlive +# Should show: <true/> +``` + +### Test 2: Clean Start and Foreground Mode Verification + +```bash +# Stop pmcd if running +sudo launchctl unload /Library/LaunchDaemons/io.pcp.pmcd.plist 2>/dev/null + +# Wait for pmcd to fully stop +while pgrep pmcd >/dev/null 2>&1; do + echo "Waiting for pmcd to stop..." 
+ sleep 2 +done + +# Load and start pmcd via launchctl +sudo launchctl load /Library/LaunchDaemons/io.pcp.pmcd.plist + +# Wait for pmcd to be ready (this is the slow part - 1-2 minutes on slow VM) +wait_pmcd + +# Verify pmcd is running +ps aux | grep pmcd | grep -v grep + +# Check that pmcd is running with -f flag (foreground mode) +ps aux | grep '[p]mcd' | grep -- '-f' +# Should see the -f flag in the command line + +# Verify exactly one pmcd process (no fork issues) +echo "pmcd process count: $(ps aux | grep '[p]mcd' | wc -l | tr -d ' ')" +# Should be 1 + +# Check launchctl service status +sudo launchctl list | grep io.pcp.pmcd +# Should show running status with PID +``` + +### Test 3: Verify pmcd Functionality + +```bash +# Test basic metric queries +pminfo -f hinv.ncpu +pminfo -f kernel.all.load +pminfo -f pmcd.version + +# Verify pmcd responds +pmprobe -v pmcd.numclients +# Should return: pmcd.numclients 1 + +# Check pmcd log for errors +sudo tail -50 /var/log/pcp/pmcd/pmcd.log +# Should not show fork-related errors or startup failures +``` + +### Test 4: Verify KeepAlive Behavior (Crash Recovery) + +```bash +# Get pmcd PID +PMCD_PID=$(pgrep pmcd) +echo "pmcd PID: $PMCD_PID" + +# Forcefully kill pmcd to simulate crash +sudo kill -9 $PMCD_PID + +# Wait for launchctl to restart pmcd (can take 1-2 minutes on slow VM) +echo "Waiting for KeepAlive to restart pmcd..." +sleep 10 + +# Poll for new pmcd process +for i in {1..24}; do # 2 minutes max + NEW_PID=$(pgrep pmcd) + if [ -n "$NEW_PID" ] && [ "$PMCD_PID" != "$NEW_PID" ]; then + echo "✓ pmcd restarted with new PID: $NEW_PID (after ${i}x5s)" + break + fi + echo "Attempt $i: waiting..." 
+ sleep 5 +done + +# Verify new PID is different +NEW_PID=$(pgrep pmcd) +if [ "$PMCD_PID" != "$NEW_PID" ] && [ -n "$NEW_PID" ]; then + echo "✓ KeepAlive working - pmcd was restarted" +else + echo "✗ KeepAlive failed - pmcd was not restarted" +fi + +# Wait for pmcd to be ready after restart +wait_pmcd + +# Verify pmcd still works after restart +pminfo -f pmcd.version +``` + +### Test 5: Check Logs + +```bash +# Check system log for launchctl messages about pmcd (macOS 10.12+) +log show --predicate 'process == "launchd"' --last 10m | grep pmcd + +# Check pmcd logs +sudo tail -100 /var/log/pcp/pmcd/pmcd.log + +# Check launchctl stderr/stdout +sudo cat /var/log/pcp/pmcd/plist.stderr +sudo cat /var/log/pcp/pmcd/plist.stdout +``` + +### Test 6: Clean Shutdown and Restart + +```bash +# Stop pmcd cleanly via launchctl +sudo launchctl unload /Library/LaunchDaemons/io.pcp.pmcd.plist + +# Wait for pmcd to stop (should be quick) +while pgrep pmcd >/dev/null 2>&1; do + echo "Waiting for pmcd to stop..." + sleep 2 +done +echo "pmcd stopped" + +# Verify pmcd stopped +pgrep pmcd +# Should return nothing + +# Restart pmcd +sudo launchctl load /Library/LaunchDaemons/io.pcp.pmcd.plist + +# Wait for pmcd to be ready (1-2 minutes on slow VM) +wait_pmcd + +# Verify it started successfully +pminfo -f pmcd.version +``` + +--- + +## Expected Results Summary + +✅ **Configuration**: PMCD_LAUNCHED_BY_LAUNCHD env var present, KeepAlive = true +✅ **Foreground mode**: pmcd runs with `-f` flag +✅ **No fork issues**: Only one pmcd process exists +✅ **launchctl tracking**: Service shows in `launchctl list` with PID +✅ **KeepAlive works**: pmcd automatically restarts after crash +✅ **Functionality**: pmcd responds to metric queries normally +✅ **Clean logs**: No errors about fork failures or unexpected exits + +--- + +## Troubleshooting + +### pmcd takes a very long time to start +This is normal on slow VMs. The automated test script has 3-minute timeouts. You can increase them if needed. 
+
+### KeepAlive doesn't restart pmcd
+Check launchctl status:
+```bash
+sudo launchctl list | grep io.pcp.pmcd
+```
+
+Check for errors:
+```bash
+sudo tail -100 /var/log/pcp/pmcd/pmcd.log
+sudo cat /var/log/pcp/pmcd/plist.stderr
+```
+
+### pmcd not running in foreground mode
+Verify the environment variable is set:
+```bash
+grep -A4 EnvironmentVariables /Library/LaunchDaemons/io.pcp.pmcd.plist
+```
+
+Check if rc_pmcd is detecting it:
+```bash
+sudo launchctl unload /Library/LaunchDaemons/io.pcp.pmcd.plist
+sudo launchctl load /Library/LaunchDaemons/io.pcp.pmcd.plist
+wait_pmcd
+ps aux | grep '[p]mcd'  # Should show -f flag
+```
diff --git a/build/mac/qa/test-pmcd-launchctl.sh b/build/mac/qa/test-pmcd-launchctl.sh
new file mode 100755
index 00000000000..77cec9b3cb6
--- /dev/null
+++ b/build/mac/qa/test-pmcd-launchctl.sh
@@ -0,0 +1,352 @@
+#!/bin/bash
+# Test script for macOS pmcd launchctl fix
+# Designed for slow VMs where pmcd startup can take 1-2 minutes
+#
+# Usage: sudo ./test-pmcd-launchctl.sh
+# Exits 0 when every test passes and 1 as soon as one test fails.
+
+set -eu
+
+# Configuration - adjust these for your VM's speed.  All values are in
+# seconds; the timeouts may also be supplied through the environment.
+PMCD_START_TIMEOUT=${PMCD_START_TIMEOUT:-180}  # for pmcd to start responding
+PMCD_STOP_TIMEOUT=${PMCD_STOP_TIMEOUT:-60}     # for pmcd to stop
+KEEPALIVE_TIMEOUT=${KEEPALIVE_TIMEOUT:-180}    # for KeepAlive to restart pmcd
+POLL_INTERVAL=2                                # pause between polls
+
+PLIST_PATH="/Library/LaunchDaemons/io.pcp.pmcd.plist"
+PMCD_LOG="/var/log/pcp/pmcd/pmcd.log"
+
+echo "=== macOS pmcd launchctl Test Suite ==="
+echo "Timeouts: start=${PMCD_START_TIMEOUT}s, stop=${PMCD_STOP_TIMEOUT}s, keepalive=${KEEPALIVE_TIMEOUT}s"
+
+# Print a failure message and abort the test run.
+#   $* - description of what failed
+fail() {
+    echo "✗ $*"
+    exit 1
+}
+
+# Print the PIDs of processes named exactly "pmcd", one per line.
+# The exact match (-x) keeps helpers such as pmcd_wait out of the list.
+# Prints nothing, and still succeeds, when there is no such process.
+pmcd_pids() {
+    pgrep -x pmcd 2>/dev/null || true
+}
+
+# Wait for pmcd to respond to queries.
+#   $1 - timeout in seconds
+# Returns 0 once pmcd answers, 1 if the timeout expires first.
+wait_for_pmcd_ready() {
+    local timeout=$1
+    local start_time elapsed polls=0
+    start_time=$(date +%s)
+
+    printf 'Waiting for pmcd to respond (timeout %ss)...' "$timeout"
+
+    # Try using pmcd_wait if available
+    if command -v pmcd_wait >/dev/null 2>&1; then
+        if pmcd_wait -t "$timeout" >/dev/null 2>&1; then
+            echo " ready!"
+            return 0
+        fi
+        echo " timeout!"
+        return 1
+    fi
+
+    # Fallback: poll with pminfo
+    while true; do
+        if pminfo -f pmcd.version >/dev/null 2>&1; then
+            elapsed=$(($(date +%s) - start_time))
+            echo " ready! (${elapsed}s)"
+            return 0
+        fi
+
+        elapsed=$(($(date +%s) - start_time))
+        if [ "$elapsed" -ge "$timeout" ]; then
+            echo " timeout!"
+            return 1
+        fi
+
+        # Show progress every fifth poll; counting polls instead of testing
+        # the elapsed time means a slow pminfo cannot skip the tick
+        polls=$((polls + 1))
+        if [ $((polls % 5)) -eq 0 ]; then
+            printf '.'
+        fi
+
+        sleep "$POLL_INTERVAL"
+    done
+}
+
+# Wait for every pmcd process to exit.
+#   $1 - timeout in seconds
+# Returns 0 once no pmcd process is left, 1 if the timeout expires first.
+wait_for_pmcd_stopped() {
+    local timeout=$1
+    local start_time elapsed polls=0
+    start_time=$(date +%s)
+
+    printf 'Waiting for pmcd to stop (timeout %ss)...' "$timeout"
+
+    while [ -n "$(pmcd_pids)" ]; do
+        elapsed=$(($(date +%s) - start_time))
+        if [ "$elapsed" -ge "$timeout" ]; then
+            echo " timeout!"
+            return 1
+        fi
+
+        polls=$((polls + 1))
+        if [ $((polls % 5)) -eq 0 ]; then
+            printf '.'
+        fi
+
+        sleep 1
+    done
+
+    elapsed=$(($(date +%s) - start_time))
+    echo " stopped! (${elapsed}s)"
+    return 0
+}
+
+# Test 1: Configuration
+printf '\n[Test 1] Checking plist configuration...\n'
+if [ ! -f "$PLIST_PATH" ]; then
+    fail "Plist file not found at $PLIST_PATH"
+fi
+
+if grep -q "PMCD_LAUNCHED_BY_LAUNCHD" "$PLIST_PATH"; then
+    echo "✓ PMCD_LAUNCHED_BY_LAUNCHD environment variable present"
+else
+    fail "PMCD_LAUNCHED_BY_LAUNCHD environment variable missing"
+fi
+
+# The value of a plist key sits on the line after the key, so look for
+# <true/> there; an empty pattern would match any line and always pass
+if grep -A1 "KeepAlive" "$PLIST_PATH" | grep -q "<true/>"; then
+    echo "✓ KeepAlive enabled"
+else
+    fail "KeepAlive not enabled"
+fi
+
+# Test 2: Clean start via launchctl
+printf '\n[Test 2] Starting pmcd via launchctl...\n'
+
+# Unload first if loaded
+echo "Unloading pmcd service..."
+sudo launchctl unload "$PLIST_PATH" 2>/dev/null || true
+
+if ! wait_for_pmcd_stopped "$PMCD_STOP_TIMEOUT"; then
+    echo "✗ pmcd did not stop in time, forcing kill..."
+    sudo pkill -9 -x pmcd || true
+    sleep 2
+fi
+
+# Load the service
+echo "Loading pmcd service..."
+sudo launchctl load "$PLIST_PATH"
+
+# Wait for pmcd to be ready
+if wait_for_pmcd_ready "$PMCD_START_TIMEOUT"; then
+    echo "✓ pmcd started successfully"
+else
+    echo "✗ pmcd failed to start or respond"
+    echo "Checking logs:"
+    sudo tail -20 "$PMCD_LOG" 2>/dev/null || echo "No log file found"
+    exit 1
+fi
+
+# Verify pmcd is running
+RUNNING_PIDS=$(pmcd_pids | xargs)
+if [ -n "$RUNNING_PIDS" ]; then
+    echo "✓ pmcd process is running (PID: $RUNNING_PIDS)"
+else
+    fail "pmcd process not found"
+fi
+
+# Test 3: Verify foreground mode
+printf '\n[Test 3] Verifying foreground mode...\n'
+
+# Check for only one pmcd process (no fork issues)
+PMCD_COUNT=$(pmcd_pids | wc -l | tr -d ' ')
+if [ "$PMCD_COUNT" -eq 1 ]; then
+    echo "✓ Exactly one pmcd process running (no fork issues)"
+else
+    echo "✗ Found $PMCD_COUNT pmcd processes (expected 1)"
+    echo "pmcd processes:"
+    pgrep -lx pmcd || true
+    echo "All processes matching pmcd:"
+    ps aux | grep '[p]mcd' || true
+    exit 1
+fi
+
+# Inspect the argument list of the pmcd process itself, so that a "-f" on
+# some other command line that merely mentions pmcd cannot satisfy the check
+PMCD_ARGS=$(ps -ww -o args= -p "$(pmcd_pids)" 2>/dev/null || true)
+case " $PMCD_ARGS " in
+    *" -f "*)
+        foreground_ok=true
+        echo "✓ pmcd running in foreground mode (-f flag present)"
+        ;;
+    *)
+        foreground_ok=false
+        echo "⚠ Warning: pmcd not running with -f flag"
+        echo "Command line: $PMCD_ARGS"
+        ;;
+esac
+
+# Test 4: Verify launchctl tracking
+printf '\n[Test 4] Verifying launchctl service status...\n'
+
+LAUNCHCTL_STATUS=$(sudo launchctl list | grep "io.pcp.pmcd" || true)
+if [ -n "$LAUNCHCTL_STATUS" ]; then
+    echo "✓ launchctl tracking pmcd service"
+    echo "  Status: $LAUNCHCTL_STATUS"
+else
+    fail "launchctl not tracking pmcd service"
+fi
+
+# Test 5: Functionality
+printf '\n[Test 5] Testing pmcd functionality...\n'
+
+if VERSION_INFO=$(pminfo -f pmcd.version 2>/dev/null); then
+    VERSION=$(printf '%s\n' "$VERSION_INFO" | awk '/value/ {print $2}')
+    echo "✓ pmcd responds to queries (version: $VERSION)"
+else
+    fail "pmcd not responding to queries"
+fi
+
+if CLIENTS=$(pmprobe -v pmcd.numclients 2>/dev/null); then
+    echo "✓ pmcd metrics accessible ($CLIENTS)"
+else
+    fail "pmcd metrics not accessible"
+fi
+
+# Test 6: KeepAlive (crash recovery)
+printf '\n[Test 6] Testing KeepAlive crash recovery...\n'
+
+OLD_PID=$(pmcd_pids | head -n 1)
+if [ -z "$OLD_PID" ]; then
+    fail "pmcd is not running, cannot simulate a crash"
+fi
+echo "Current pmcd PID: $OLD_PID"
+
+echo "Simulating crash (kill -9)..."
+sudo kill -9 "$OLD_PID"
+
+# Wait a moment for the kill to take effect
+sleep 2
+
+# Wait for launchctl to restart pmcd
+printf 'Waiting for KeepAlive to restart pmcd (timeout %ss)...' "$KEEPALIVE_TIMEOUT"
+start_time=$(date +%s)
+restarted=false
+polls=0
+
+while true; do
+    NEW_PID=$(pmcd_pids | head -n 1)
+    elapsed=$(($(date +%s) - start_time))
+
+    if [ -n "$NEW_PID" ] && [ "$OLD_PID" != "$NEW_PID" ]; then
+        echo " restarted! (${elapsed}s, new PID: $NEW_PID)"
+        restarted=true
+        break
+    fi
+
+    if [ "$elapsed" -ge "$KEEPALIVE_TIMEOUT" ]; then
+        echo " timeout!"
+        break
+    fi
+
+    polls=$((polls + 1))
+    if [ $((polls % 5)) -eq 0 ]; then
+        printf '.'
+    fi
+
+    sleep "$POLL_INTERVAL"
+done
+
+if [ "$restarted" = false ]; then
+    echo "✗ KeepAlive failed to restart pmcd"
+    echo "Checking launchctl status:"
+    sudo launchctl list | grep io.pcp.pmcd || echo "Service not found in launchctl"
+    exit 1
+fi
+
+echo "✓ KeepAlive successfully restarted pmcd"
+
+# Test 7: Functionality after restart
+printf '\n[Test 7] Verifying pmcd functionality after crash recovery...\n'
+
+if wait_for_pmcd_ready "$PMCD_START_TIMEOUT"; then
+    echo "✓ pmcd responding after KeepAlive restart"
+else
+    fail "pmcd not responding after restart"
+fi
+
+if pminfo -f pmcd.version >/dev/null 2>&1; then
+    echo "✓ pmcd fully functional after recovery"
+else
+    fail "pmcd not functional after recovery"
+fi
+
+# Test 8: Clean shutdown
+printf '\n[Test 8] Testing clean shutdown...\n'
+
+echo "Unloading pmcd service..."
+sudo launchctl unload "$PLIST_PATH"
+
+if wait_for_pmcd_stopped "$PMCD_STOP_TIMEOUT"; then
+    echo "✓ pmcd stopped cleanly"
+else
+    fail "pmcd did not stop cleanly"
+fi
+
+if [ -z "$(pmcd_pids)" ]; then
+    echo "✓ No pmcd processes remaining"
+else
+    echo "✗ pmcd process still running"
+    ps aux | grep '[p]mcd' || true
+    exit 1
+fi
+
+# Test 9: Restart capability
+printf '\n[Test 9] Testing restart capability...\n'
+
+echo "Reloading pmcd service..."
+sudo launchctl load "$PLIST_PATH"
+
+if wait_for_pmcd_ready "$PMCD_START_TIMEOUT"; then
+    echo "✓ pmcd restarted successfully"
+else
+    fail "pmcd failed to restart"
+fi
+
+if pminfo -f pmcd.version >/dev/null 2>&1; then
+    echo "✓ pmcd functional after restart"
+else
+    fail "pmcd not functional after restart"
+fi
+
+# Summary
+printf '\n=== All tests passed! ===\n'
+echo ""
+echo "Summary:"
+echo "  ✓ Configuration correct (PMCD_LAUNCHED_BY_LAUNCHD + KeepAlive)"
+if [ "$foreground_ok" = true ]; then
+    echo "  ✓ pmcd runs in foreground mode (-f flag)"
+else
+    echo "  ⚠ pmcd was not seen running with the -f flag"
+fi
+echo "  ✓ No fork issues (single pmcd process)"
+echo "  ✓ launchctl properly tracks pmcd"
+echo "  ✓ pmcd responds to queries"
+echo "  ✓ KeepAlive restarts pmcd on crash"
+echo "  ✓ pmcd functions correctly after recovery"
+echo "  ✓ Clean shutdown/restart works"
+echo ""
+if [ "$foreground_ok" = true ]; then
+    echo "The fix is working correctly!"
+else
+    echo "All checks passed, but foreground mode could not be confirmed."
+fi