From a2d4dabf4c578d6443dc457abfd4b3b7387ed7e4 Mon Sep 17 00:00:00 2001 From: businesscurry123 Date: Fri, 15 May 2026 22:23:15 +0900 Subject: [PATCH] Fix Android USB missed-cycle stalls --- README.md | 5 + docs/android_usb_host_missed_cycles.md | 141 ++++++++++++++++++ .../android_usb_host_missed_cycles_triage.sh | 107 +++++++++++++ selfdrive/boardd/boardd.cc | 17 ++- selfdrive/boardd/panda.cc | 6 +- 5 files changed, 269 insertions(+), 7 deletions(-) create mode 100644 docs/android_usb_host_missed_cycles.md create mode 100755 scripts/android_usb_host_missed_cycles_triage.sh diff --git a/README.md b/README.md index e7df7f10..585ae9f1 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,11 @@ For running flowpilot on your car, you need: - One of the [200+ supported cars](https://github.com/commaai/openpilot/blob/master/docs/CARS.md). The community supports Honda, Toyota, Hyundai, Nissan, Kia, Chrysler, Lexus, Acura, Audi, VW, and more. If your car is not supported but has adaptive cruise control and lane-keeping assist, it's likely able to run flowpilot. For a more detailed overview, see the [wiring and hardware wiki](https://github.com/flowdriveai/flowpilot/wiki/Connecting-to-Car). + +Android phones or ROMs that show Panda USB cut-outs, slow USB access, or +`boardd` missed-cycle warnings can use the +[Android USB host missed-cycles guide](docs/android_usb_host_missed_cycles.md) +and `scripts/android_usb_host_missed_cycles_triage.sh` to collect diagnostics. # Installation: See the [installation wiki](https://github.com/flowdriveai/flowpilot/wiki/Installation). 
diff --git a/docs/android_usb_host_missed_cycles.md b/docs/android_usb_host_missed_cycles.md new file mode 100644 index 00000000..17f8bffc --- /dev/null +++ b/docs/android_usb_host_missed_cycles.md @@ -0,0 +1,141 @@ +# Android USB Host Missed Cycles + +This note explains the `missed cycles` warning seen in `boardd` on Android USB +host setups, the most likely causes, and the checks that help separate a bad +USB path from normal scheduler jitter. + +## What the warning means + +`selfdrive/boardd/boardd.cc` runs `boardd_can_recv` at 100 Hz. Each loop has a +10 ms budget to read CAN data from panda over USB, publish the `can` message, +and sleep until the next frame deadline. + +When ignition is on and a loop ends at least 20 ms past its deadline, `boardd` logs: + +```text +missed cycles (<count>) <remaining> recv=<recv> +``` + +`remaining` is negative nanoseconds relative to the expected 100 Hz deadline. +`recv` is the time spent inside the panda CAN USB receive calls for that loop. +If `recv` is close to or above 10 ms, the USB read path is consuming the cycle +budget by itself. + +## Android no-root path + +Flowpilot has two panda connection paths: + +- Normal desktop/rooted path: `pandad.py` finds pandas by serial, then runs + `./boardd <serials>`. +- Android no-root path: `pandad.py` calls `termux-usb -l`, probes devices with + `termux-usb -r -e ./ispanda <device>`, then launches + `termux-usb -r -e ./boardd <device>`. + +The no-root path passes an Android USB permission file descriptor into +`boardd`. `Panda::Panda(const int fd, ...)` wraps that descriptor with +`libusb_wrap_sys_device`, and `Panda::can_receive()` reads endpoint `0x81` with +`libusb_bulk_transfer`. + +That means Android no-root operation depends on: + +- the phone ROM's USB host implementation, +- Termux/Android permission and descriptor handoff, +- libusb behavior on the wrapped descriptor, +- Android scheduler latency while `boardd_can_recv` tries to hold a 100 Hz + loop. 
+ +## Why missed cycles happen + +The direct code-level risk was an unbounded CAN receive bulk transfer. The repo +default `TIMEOUT` is `0`, and `Panda::can_receive()` previously called +`usb_bulk_read()` without overriding it. In libusb, a zero timeout means the +bulk transfer may wait indefinitely. On Android ROMs where USB host access is +slow, suspended, power-throttled, or delayed by the permission FD path, one read +can consume more than the 10 ms loop budget. + +Other contributors can make the same symptom worse: + +- poor OTG cable, loose connector, or underpowered panda/phone USB path, +- Android battery optimization, doze, thermal throttling, or screen-off CPU + throttling, +- ROM-specific USB host bugs or aggressive power policy, +- multiple pandas read sequentially in one 100 Hz loop, +- other boardd threads holding the same per-panda USB lock for control + transfers, +- missing root-level real-time scheduling, CPU affinity, and IRQ affinity on + Android no-root devices. + +The old tici-specific hardware path explicitly kept a boardd core online and +assigned xHCI USB IRQs to that core. Android no-root devices usually cannot +recreate that setup. + +## Code fix in this PR + +This PR bounds the CAN receive bulk read: + +```cpp +usb_bulk_read(0x81, data, RECV_SIZE, 5); +``` + +A timeout is not treated as a communication failure in the existing +`usb_bulk_read()` implementation; it returns the bytes received so far, or zero +bytes if no CAN data arrived during the timeout. `Panda::can_receive()` already +handles `recv <= 0` as a healthy empty receive, so this keeps behavior +compatible while preventing one slow Android USB read from blocking the 100 Hz +loop without a limit. + +The receive timeout is set to 5 ms to match the existing CAN transmit bulk +write timeout. That leaves part of the 10 ms boardd cycle for publishing and +sleep recovery on the common single-panda Android path. 
+ +The PR also makes the loop deadline arithmetic signed and logs `recv=` with +missed-cycle warnings. That makes it clear whether the loop was late because +USB receive consumed the budget or because later scheduler/publish work delayed +the thread. + +## Field checklist + +Use this order when diagnosing a phone that still logs missed cycles: + +1. Confirm that the log includes `recv=`. +2. If `recv` is near or above `10000000`, focus on USB host latency. +3. If `recv` is small but missed cycles continue, focus on Android scheduling, + CPU governor, thermal throttling, or other high-priority processes. +4. Keep the phone awake, disable battery optimization for Termux/Flowpilot, and + avoid screen-off power saving while testing. +5. Test a short known-good OTG cable, then a powered hub if the phone cannot + keep panda powered reliably. +6. Disconnect other USB devices from the phone while testing. +7. Compare the same panda and cable on Linux/desktop. If the warning disappears, + the phone/ROM USB host path is the likely source. +8. On rooted setups, test performance governor, real-time priority, and CPU/IRQ + affinity for boardd. Do not apply these blindly on no-root Termux; this repo + already disables some real-time helpers because they can freeze Android + chroot setups. + +To collect a first diagnostic bundle, run this either inside the Android +Termux/flowpilot environment or from a desktop with `adb` connected: + +```sh +./scripts/android_usb_host_missed_cycles_triage.sh +``` + +The script auto-detects local Android/Termux mode versus `adb` mode. It records +USB host features, `dumpsys usb`, Type-C or USB role sysfs files, `termux-usb -l` +when available, filtered USB/boardd logcat lines, thermal state, and a summary +template. Review the generated directory before posting it publicly because +device logs can contain private local details. 
+ +## When a guide is enough + +If the patched build no longer logs large `recv` values but a specific ROM +still drops USB, this is likely outside `boardd` itself. The practical fix is a +device/ROM/cable/power change, or running Flowpilot through a rooted/native +environment where USB scheduling and power policy can be controlled. + +## References + +- Android USB host mode lets an Android device power the bus and enumerate USB + devices: <https://developer.android.com/develop/connectivity/usb/host> +- libusb synchronous bulk transfer uses milliseconds for `timeout`, and `0` + means unlimited: <https://libusb.sourceforge.io/api-1.0/group__libusb__syncio.html> diff --git a/scripts/android_usb_host_missed_cycles_triage.sh b/scripts/android_usb_host_missed_cycles_triage.sh new file mode 100755 index 00000000..32556dbc --- /dev/null +++ b/scripts/android_usb_host_missed_cycles_triage.sh @@ -0,0 +1,107 @@ +#!/usr/bin/env bash +set -euo pipefail + +out_dir="${1:-android-usb-host-missed-cycles-$(date +%Y%m%d-%H%M%S)}" +mkdir -p "$out_dir" + +quote_cmd() { + printf '$' + printf ' %q' "$@" + printf '\n\n' +} + +run_cmd() { + local name="$1" + shift + { + quote_cmd "$@" + "$@" 2>&1 || true + } > "$out_dir/$name.txt" +} + +run_local_shell() { + local name="$1" + shift + run_cmd "$name" sh -c "$*" +} + +run_adb_shell() { + local name="$1" + shift + run_cmd "$name" adb shell "$*" +} + +have_cmd() { + command -v "$1" >/dev/null 2>&1 +} + +mode="local" +if ! have_cmd getprop && have_cmd adb && adb get-state >/dev/null 2>&1; then + mode="adb" +fi + +{ + echo "Flowpilot Android USB host missed-cycles triage" + echo "mode: $mode" + echo "created_at: $(date -Iseconds 2>/dev/null || date)" + echo + echo "Review these files before sharing them publicly. Device logs can contain" + echo "serial numbers, nearby network names, account names, and vehicle details." 
+} > "$out_dir/README.txt" + +if [ "$mode" = "local" ]; then + run_local_shell build_props 'getprop ro.build.fingerprint; getprop ro.product.manufacturer; getprop ro.product.model; getprop ro.build.version.release; uname -a' + run_local_shell host_features 'pm list features | grep -i "android.hardware.usb.host" || true' + run_local_shell usb_state 'dumpsys usb' + run_local_shell usb_roles 'for f in /sys/class/usb_role/*/role /sys/class/typec/*/data_role; do [ -e "$f" ] && echo "$f=$(cat "$f")"; done' + run_local_shell usb_role_paths 'find /sys -path "*usb*" \( -name role -o -name mode \) 2>/dev/null | head -100' + run_local_shell processes 'ps -A 2>/dev/null | grep -Ei "boardd|pandad|termux|flowpilot" || true' + run_local_shell thermal 'dumpsys thermalservice 2>/dev/null || true' + run_local_shell top_threads 'top -H -n 1 2>/dev/null || top -n 1 2>/dev/null || true' + if have_cmd termux-usb; then + run_cmd termux_usb_list termux-usb -l + else + echo "termux-usb not found in PATH" > "$out_dir/termux_usb_list.txt" + fi + if have_cmd logcat; then + run_cmd logcat_full logcat -d + else + echo "logcat not found in PATH" > "$out_dir/logcat_full.txt" + fi +else + run_adb_shell build_props 'getprop ro.build.fingerprint; getprop ro.product.manufacturer; getprop ro.product.model; getprop ro.build.version.release; uname -a' + run_adb_shell host_features 'pm list features | grep -i "android.hardware.usb.host" || true' + run_adb_shell usb_state 'dumpsys usb' + run_adb_shell usb_roles 'for f in /sys/class/usb_role/*/role /sys/class/typec/*/data_role; do [ -e "$f" ] && echo "$f=$(cat "$f")"; done' + run_adb_shell usb_role_paths 'find /sys -path "*usb*" \( -name role -o -name mode \) 2>/dev/null | head -100' + run_adb_shell processes 'ps -A 2>/dev/null | grep -Ei "boardd|pandad|termux|flowpilot" || true' + run_adb_shell thermal 'dumpsys thermalservice 2>/dev/null || true' + run_adb_shell top_threads 'top -H -n 1 2>/dev/null || top -n 1 2>/dev/null || true' + run_cmd logcat_full 
adb logcat -d + run_cmd termux_usb_list adb shell 'command -v termux-usb >/dev/null 2>&1 && termux-usb -l || echo "termux-usb not found in adb shell PATH"' +fi + +grep -Eai 'UsbHost|UsbDevice|UsbPort|libusb|boardd|pandad|panda|termux-usb|missed cycles|overflow|timeout|disconnect|reset|vbus|typec' \ + "$out_dir/logcat_full.txt" > "$out_dir/logcat_usb_filtered.txt" || true + +cat > "$out_dir/summary_template.md" <<'EOF' +# Flowpilot Android USB Host Triage Summary + +- Device: +- ROM/build fingerprint: +- Android version: +- Panda type: +- Cable, OTG adapter, hub, and power path: +- Local Termux mode or adb mode: +- `android.hardware.usb.host` present: +- `dumpsys usb` host/source role while panda attached: +- `termux-usb -l` sees panda: +- First `missed cycles` log line: +- Nearby USB timeout/overflow/disconnect/reset lines: +- Thermal or CPU throttling evidence: +- Does the same panda/cable work on Linux or desktop: + +Attach the full output directory only after reviewing it for private data. 
+EOF + +printf 'Wrote %s\n' "$out_dir" diff --git a/selfdrive/boardd/boardd.cc b/selfdrive/boardd/boardd.cc index 94d89312..01c882bd 100644 --- a/selfdrive/boardd/boardd.cc +++ b/selfdrive/boardd/boardd.cc @@ -273,16 +273,18 @@ void can_recv_thread(std::vector<Panda *> pandas) { PubMaster pm({"can"}); // run at 100hz - const uint64_t dt = 10000000ULL; - uint64_t next_frame_time = nanos_since_boot() + dt; + const int64_t dt = 10000000LL; + int64_t next_frame_time = static_cast<int64_t>(nanos_since_boot()) + dt; std::vector<can_frame> raw_can_data; while (!do_exit && check_all_connected(pandas)) { bool comms_healthy = true; raw_can_data.clear(); + const int64_t receive_start_time = static_cast<int64_t>(nanos_since_boot()); for (const auto& panda : pandas) { comms_healthy &= panda->can_receive(raw_can_data); } + const int64_t receive_time = static_cast<int64_t>(nanos_since_boot()) - receive_start_time; MessageBuilder msg; auto evt = msg.initEvent(); @@ -296,14 +298,17 @@ void can_recv_thread(std::vector<Panda *> pandas) { } pm.send("can", msg); - uint64_t cur_time = nanos_since_boot(); + int64_t cur_time = static_cast<int64_t>(nanos_since_boot()); int64_t remaining = next_frame_time - cur_time; if (remaining > 0) { - std::this_thread::sleep_for(std::chrono::nanoseconds(remaining)); + std::this_thread::sleep_for(std::chrono::nanoseconds(remaining)); } else { if (ignition) { - if ((int)-1*remaining/dt > 1){ - LOGW("missed cycles (%d) %lld", (int)-1*remaining/dt, remaining); + const int64_t missed_cycles = (-remaining) / dt; + if (missed_cycles > 1) { + LOGW("missed cycles (%lld) %lld recv=%lld", (long long)missed_cycles, (long long)remaining, (long long)receive_time); + } else if (receive_time > dt) { + LOGW_100("CAN receive over 100Hz budget: recv=%lld remaining=%lld", (long long)receive_time, (long long)remaining); } } next_frame_time = cur_time; diff --git a/selfdrive/boardd/panda.cc b/selfdrive/boardd/panda.cc index 1f1ae25c..7e712bc3 100644 --- a/selfdrive/boardd/panda.cc +++ b/selfdrive/boardd/panda.cc @@ -13,6 +13,10 @@ 
#include "common/swaglog.h" #include "common/util.h" +namespace { +constexpr unsigned int CAN_RECV_TIMEOUT_MS = 5U; +} + static int init_usb_ctx(libusb_context **context) { assert(context != nullptr); @@ -473,7 +477,7 @@ void Panda::can_send(capnp::List<cereal::CanData>::Reader can_data_list) { bool Panda::can_receive(std::vector<can_frame>& out_vec) { uint8_t data[RECV_SIZE]; - int recv = usb_bulk_read(0x81, (uint8_t*)data, RECV_SIZE); + int recv = usb_bulk_read(0x81, (uint8_t*)data, RECV_SIZE, CAN_RECV_TIMEOUT_MS); if (!comms_healthy) { return false; }