diff --git a/.github/workflows/android-tests.yml b/.github/workflows/android-tests.yml
index fbb45f5..0e7ea67 100644
--- a/.github/workflows/android-tests.yml
+++ b/.github/workflows/android-tests.yml
@@ -194,16 +194,53 @@ jobs:
             -noaudio -no-boot-anim
           disable-animations: false
           script: |
-            # Wait for system services after snapshot restore —
-            # sys.boot_completed can flip before the `input` service publishes,
-            # which makes emulator-runner's own `input keyevent 82` crash the
-            # emulator. We run this loop ourselves and skip the keyevent
-            # (disable-animations: false).
-            adb wait-for-device shell 'while [[ -z $(getprop sys.boot_completed) ]]; do sleep 1; done'
-            # Disable animations via settings (more reliable than input keyevent)
-            adb shell settings put global window_animation_scale 0.0
-            adb shell settings put global transition_animation_scale 0.0
-            adb shell settings put global animator_duration_scale 0.0
+            # Post-snapshot service-readiness wait.
+            #
+            # On a snapshot restore `sys.boot_completed` comes back
+            # already set to 1, before `system_server` has finished
+            # re-binding the `settings`, `package`, and `activity`
+            # services. The previous wait-for-device + `getprop` loop
+            # therefore returned almost instantly, and the very next
+            # `adb shell settings put` crashed with
+            # `cmd: Failure calling service settings: Broken pipe (32)`
+            # and exit code 224 — the failure mode this is fixing.
+            #
+            # We poll each service we're about to call until it actually
+            # responds, with a hard 120s ceiling. A single retry on each
+            # `settings put` afterwards covers any final transient hiccup.
+            wait_for_emulator() {
+              local deadline=$((SECONDS + 120))
+              while [[ $SECONDS -lt $deadline ]]; do
+                # `tr -d '\r'` strips the CR from the CRLF line endings
+                # adb shell sends back; without it the [[ == "1" ]]
+                # check fails on the stray \r.
+                bc=$(adb shell getprop sys.boot_completed 2>/dev/null | tr -d '\r')
+                if [[ "$bc" == "1" ]] \
+                  && adb shell 'pm path android' >/dev/null 2>&1 \
+                  && adb shell 'cmd settings list global' >/dev/null 2>&1; then
+                  return 0
+                fi
+                sleep 1
+              done
+              echo "::error::Emulator services did not become ready within 120s" >&2
+              adb shell getprop | grep -E 'boot|svc' >&2 || true
+              return 1
+            }
+            wait_for_emulator
+            # Disable animations via settings (more reliable than input
+            # keyevent — emulator-runner's keyevent path crashes on some
+            # snapshots, which is why disable-animations is left at
+            # `false` and we drive the settings here directly).
+            #
+            # One retry on each `settings put` because the service can
+            # still hiccup briefly even after the readiness probe passes
+            # (the probe verifies it answers a `list`, but `put` opens a
+            # write transaction, which can race with first-time
+            # initialisation).
+            for s in window_animation_scale transition_animation_scale animator_duration_scale; do
+              adb shell settings put global "$s" 0.0 \
+                || { sleep 2; adb shell settings put global "$s" 0.0; }
+            done
             # -i prints per-test STARTED/PASSED/FAILED so a hanging test is
             # identifiable
             cd apps/example/android && ./gradlew :comapeo-core-react-native:connectedDebugAndroidTest :app:connectedDebugAndroidTest --no-daemon -i -PreactNativeArchitectures=x86_64
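Note on the probe above: it deliberately exercises each service end to end (`pm path`, `cmd settings list`) rather than just checking registration. A lighter alternative, sketched here under the assumption that the AVD system image ships the stock `/system/bin/service` binary, would ask servicemanager whether the binder services are registered at all:

    # Assumption: `service check` is present on the emulator image.
    # Prints "Service settings: found" once the service is registered.
    adb shell 'service check settings' | grep -q ': found'
    adb shell 'service check package'  | grep -q ': found'

Registration alone does not prove the service answers binder calls yet, which is why the end-to-end probe is what the workflow actually uses.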
diff --git a/android/src/androidTest/java/com/comapeo/core/NodeJSIPCTest.kt b/android/src/androidTest/java/com/comapeo/core/NodeJSIPCTest.kt
index 65ef888..3620493 100644
--- a/android/src/androidTest/java/com/comapeo/core/NodeJSIPCTest.kt
+++ b/android/src/androidTest/java/com/comapeo/core/NodeJSIPCTest.kt
@@ -288,6 +288,76 @@ class NodeJSIPCTest {
         Thread.sleep(500)
     }
 
+    /**
+     * `close()` is the synchronous teardown path used from
+     * `ComapeoCoreModule.OnDestroy` (the Expo lifecycle hook that fires on
+     * React Native JS context reload). The contract — distinct from
+     * `disconnect()` — is that the underlying socket FD is released on the
+     * calling thread before `close()` returns, so the peer (the Node.js
+     * backend over AF_UNIX) observes EOF before the next `OnCreate` opens a
+     * fresh connection. That ordering is what lets the backend's
+     * per-connection rpc-reflector cleanup run against the prior session
+     * rather than the new one.
+     *
+     * This test pins that contract: after `close()` returns, the server's
+     * blocking read on the same connection unblocks immediately with EOF
+     * (`-1` from `DataInputStream.read()`).
+     */
+    @Test
+    fun closeReleasesSocketSynchronously() {
+        val connected = CountDownLatch(1)
+        val readResult = java.util.concurrent.atomic.AtomicInteger(Int.MIN_VALUE)
+        val readReturned = CountDownLatch(1)
+
+        startMockServer { input, _ ->
+            connected.countDown()
+            // Block until the client closes its end. Without a
+            // synchronous close on the client, this read would only
+            // unblock once the OS eventually reclaims the orphaned
+            // fd — racy, and slow enough to be user-visible.
+            try {
+                val n = input.read()
+                readResult.set(n)
+            } catch (_: IOException) {
+                // Some kernels surface peer-close as an IOException
+                // rather than EOF; either is a valid EOF signal, so
+                // treat both as "peer closed".
+                readResult.set(-1)
+            }
+            readReturned.countDown()
+        }
+
+        val ipc = NodeJSIPC(socketFile) { msg -> receivedMessages.add(msg) }
+        assertTrue("Should connect within 10s", connected.await(10, TimeUnit.SECONDS))
+        // Let the IPC settle into Connected and start its receive loop
+        // before we close — otherwise we're testing a connect-cancel
+        // race rather than steady-state synchronous close.
+        Thread.sleep(200)
+
+        // The synchronous-close contract: the moment close() returns,
+        // the server's blocking read must already have observed EOF.
+        // We give the kernel a generous 1s window to deliver the EOF —
+        // far longer than necessary on a local AF_UNIX socket, but
+        // short enough that a regression to the old fire-and-forget
+        // close (which depends on the launched coroutine eventually
+        // running on Dispatchers.IO) shows up as a test failure.
+        ipc.close()
+
+        assertTrue(
+            "Server-side read must unblock with EOF within 1s of close() returning. " +
" + + "Regression: if close() reverts to a launched-coroutine teardown, the " + + "FD release happens after the JS reload has already opened a new " + + "connection and the rpc-reflector cleanup runs against the wrong " + + "connection.", + readReturned.await(1, TimeUnit.SECONDS) + ) + assertEquals( + "Server's read must return -1 (EOF) — peer closed cleanly.", + -1, + readResult.get() + ) + } + @Test fun handlesServerDisconnect() { val connected = CountDownLatch(1) diff --git a/android/src/main/java/com/comapeo/core/ComapeoCoreModule.kt b/android/src/main/java/com/comapeo/core/ComapeoCoreModule.kt index 8692a31..6496ae9 100644 --- a/android/src/main/java/com/comapeo/core/ComapeoCoreModule.kt +++ b/android/src/main/java/com/comapeo/core/ComapeoCoreModule.kt @@ -100,6 +100,17 @@ class ComapeoCoreModule : Module() { } override fun definition() = ModuleDefinition { + // OnCreate / OnDestroy are bound to the Expo `AppContext`, whose + // lifetime is the React Native JS runtime — not the Android + // Activity. They fire on every JS context tear-down/rebuild + // (dev reload, `DevSettings.reload()`, fast-refresh full + // reload), which is exactly the reconnect boundary we want for + // the RPC sockets: the previous JS session's rpc-reflector + // client is gone, but its event-listener subscriptions are + // still attached on the backend's MapeoManager. Closing the + // socket here forces the backend to observe the disconnect and + // tear those subscriptions down before the next JS session + // opens a fresh connection from a new OnCreate. OnCreate { val socketFile = File(appContext.persistentFilesDirectory, ComapeoCoreService.COMAPEO_SOCKET_FILENAME) @@ -190,8 +201,17 @@ class ComapeoCoreModule : Module() { } OnDestroy { - ipc.disconnect() - controlIpc.disconnect() + // Synchronous close so the AF_UNIX socket FD is released on + // this thread before OnDestroy returns. The fire-and-forget + // disconnect() would let the next OnCreate open a new + // connection while the old socket is still alive in a + // launched coroutine; a synchronous close means the backend + // sees EOF on the previous session before it accepts the + // new connection, so rpc-reflector's per-connection cleanup + // path (server.close → removeListener for every prior + // subscription) runs against the right connection. + ipc.close() + controlIpc.close() } OnActivityEntersForeground { diff --git a/android/src/main/java/com/comapeo/core/NodeJSIPC.kt b/android/src/main/java/com/comapeo/core/NodeJSIPC.kt index f49ab27..79828bd 100644 --- a/android/src/main/java/com/comapeo/core/NodeJSIPC.kt +++ b/android/src/main/java/com/comapeo/core/NodeJSIPC.kt @@ -8,6 +8,7 @@ import kotlinx.coroutines.CoroutineScope import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.Job import kotlinx.coroutines.SupervisorJob +import kotlinx.coroutines.cancel import kotlinx.coroutines.cancelAndJoin import kotlinx.coroutines.channels.Channel import kotlinx.coroutines.delay @@ -231,6 +232,40 @@ class NodeJSIPC( connect() sendChannel.trySend(message) } + + /** + * Synchronous, terminal teardown for module-destroy lifecycle. + * + * Unlike [disconnect] — which launches a coroutine and returns + * immediately — [close] closes the underlying [LocalSocket] on the + * calling thread before returning, so the peer (the Node.js backend + * over the AF_UNIX socket) observes EOF before [close] returns. + * That ordering is the point: it lets a caller running in a tight + * lifecycle window (e.g. 
+     * before a fresh `OnCreate` opens a new connection) guarantee the
+     * old socket is gone before the new one is opened, so the backend's
+     * per-connection state (notably any rpc-reflector event-listener
+     * subscriptions registered against the long-lived handler) is torn
+     * down on the same reload that starts the fresh client.
+     *
+     * After [close] the instance must not be reused: the scope is
+     * cancelled, and a fresh [NodeJSIPC] should be constructed instead.
+     */
+    fun close() {
+        // Cancel the scope first so the receive/send coroutines stop
+        // touching the streams as we close them. socket.close() also
+        // unblocks any in-flight read/write with an IOException; the
+        // cancelled scope means those coroutines exit without
+        // re-entering disconnect().
+        scope.cancel()
+        sendChannel.close()
+        try { dataOutputStream?.close() } catch (_: Exception) {}
+        try { dataInputStream?.close() } catch (_: Exception) {}
+        if (::socket.isInitialized) {
+            try { socket.close() } catch (_: Exception) {}
+        }
+        state.value = State.Disconnected
+    }
 }
 
 private suspend fun connectWithRetry(
diff --git a/backend/lib/message-port.js b/backend/lib/message-port.js
index d77621d..5da64bf 100644
--- a/backend/lib/message-port.js
+++ b/backend/lib/message-port.js
@@ -100,5 +100,16 @@ export class SocketMessagePort extends TypedEmitter {
     this.#state = "closed";
     this.#queue.length = 0;
     this.#framedStream.destroy();
+    // Emit after `#state = "closed"` and `#framedStream.destroy()` so a
+    // listener that synchronously inspects state or writes a final
+    // postMessage observes the "closed" state and the destroyed
+    // stream. Consumers (notably `ComapeoRpcServer`, which wires
+    // `messagePort.on("close", () => server.close())` to remove every
+    // RPC event-listener subscription registered against
+    // `MapeoManager` for this connection) rely on this event firing
+    // exactly once when the underlying socket goes away — without it
+    // the rpc-reflector cleanup never runs, and listeners accumulate
+    // across client reconnects (e.g. on every React Native JS reload).
+    this.emit("close");
   }
 }
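The hunk above shows only the tail of `close()`; the idempotency that the double-close test below relies on implies an early-return guard higher up in the method. Assumed full shape (only the last four statements are confirmed by this diff; the guard is an assumption):

    close() {
      // Assumed guard: a second close() is a no-op, so "close" is
      // emitted at most once.
      if (this.#state === "closed") return;
      this.#state = "closed";
      this.#queue.length = 0;
      this.#framedStream.destroy();
      this.emit("close");
    }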
diff --git a/backend/package.json b/backend/package.json
index 7aec498..6552377 100644
--- a/backend/package.json
+++ b/backend/package.json
@@ -5,6 +5,7 @@
   "type": "module",
   "scripts": {
     "build": "rollup -c",
+    "test": "node --test \"test/*.test.js\"",
     "types": "tsc"
   },
   "keywords": [],
diff --git a/backend/test/message-port.test.js b/backend/test/message-port.test.js
new file mode 100644
index 0000000..010fe7b
--- /dev/null
+++ b/backend/test/message-port.test.js
@@ -0,0 +1,173 @@
+import { test } from "node:test";
+import assert from "node:assert/strict";
+import { Duplex } from "node:stream";
+import FramedStream from "framed-stream";
+import { Buffer } from "node:buffer";
+
+import { SocketMessagePort } from "../lib/message-port.js";
+
+/**
+ * Builds a pair of connected duplex streams to stand in for a real
+ * net.Socket pair. Anything written to `a` is readable from `b` and vice
+ * versa, and destroying either side propagates EOF + close to the peer —
+ * the same as a real AF_UNIX socket pair, which is what the backend
+ * connects to in production.
+ *
+ * Hand-rolled (rather than using `net.createServer` over a temp socket
+ * file) so the test stays in-process, runs without filesystem
+ * permissions, and finishes in milliseconds.
+ *
+ * @returns {[Duplex, Duplex]}
+ */
+function pair() {
+  /** @type {Duplex} */
+  let a;
+  /** @type {Duplex} */
+  let b;
+  a = new Duplex({
+    read() {},
+    write(chunk, _enc, cb) {
+      b.push(chunk);
+      cb();
+    },
+    final(cb) {
+      b.push(null);
+      cb();
+    },
+    destroy(_err, cb) {
+      // Mirrors AF_UNIX behaviour: closing one side delivers EOF to
+      // the peer, which then sees its readable end finish and closes.
+      if (!b.destroyed) b.destroy();
+      cb(null);
+    },
+  });
+  b = new Duplex({
+    read() {},
+    write(chunk, _enc, cb) {
+      a.push(chunk);
+      cb();
+    },
+    final(cb) {
+      a.push(null);
+      cb();
+    },
+    destroy(_err, cb) {
+      if (!a.destroyed) a.destroy();
+      cb(null);
+    },
+  });
+  return [a, b];
+}
+
+test("SocketMessagePort emits 'close' once when the underlying socket closes", async () => {
+  const [serverSide, clientSide] = pair();
+  const port = new SocketMessagePort(serverSide);
+  port.start();
+
+  let closeCount = 0;
+  port.on("close", () => {
+    closeCount++;
+  });
+
+  // Simulate the peer disconnecting (e.g. an RN reload tears down the
+  // socket from the client side). The framed stream observes 'close',
+  // and our SocketMessagePort.close() must propagate it.
+  clientSide.destroy();
+
+  // Wait one turn of the event loop for the close cascade to complete.
+  await new Promise((resolve) => setImmediate(resolve));
+
+  assert.equal(closeCount, 1, "close must fire exactly once");
+});
+
+test("SocketMessagePort emits 'close' when close() is called directly", async () => {
+  const [serverSide] = pair();
+  const port = new SocketMessagePort(serverSide);
+  port.start();
+
+  let closeCount = 0;
+  port.on("close", () => {
+    closeCount++;
+  });
+
+  port.close();
+  // Calling close again must not emit another event — the close handler
+  // in ComapeoRpcServer attaches `server.close()`, which is idempotent,
+  // but we still don't want spurious double-fires.
+  port.close();
+
+  await new Promise((resolve) => setImmediate(resolve));
+
+  assert.equal(closeCount, 1, "close must be idempotent and fire at most once");
+});
+
+test(
+  "ComapeoRpcServer-style wiring: handler listeners are removed when the socket closes",
+  async () => {
+    // This is the contract the backend relies on for stale-listener cleanup
+    // across RN reloads. We don't pull in @comapeo/ipc here — its
+    // createMapeoServer wraps rpc-reflector, but the relevant invariant
+    // (server.close() removes listeners attached to the handler) is a
+    // property of rpc-reflector itself, exercised here directly so the
+    // test runs in isolation without the full Mapeo manager surface.
+    const { createServer } = await import("rpc-reflector");
+    const { EventEmitter } = await import("node:events");
+
+    const handler = new EventEmitter();
+    const [serverSide, clientSide] = pair();
+
+    const messagePort = new SocketMessagePort(serverSide);
+    messagePort.start();
+
+    // The shape ComapeoRpcServer constructs.
+    const server = createServer(handler, messagePort);
+    messagePort.on("close", () => server.close());
+
+    // Drive a subscription onto `handler` by sending an [ON, ...] frame
+    // through the SocketMessagePort, the same way an rpc-reflector
+    // client would after a `clientApi.on('progress', cb)` call.
+    // msgType.ON === 2 in rpc-reflector/lib/constants.js
+    // (REQUEST=0, RESPONSE=1, ON=2, OFF=3, EMIT=4).
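With the `test` script added above, the suite runs on Node's built-in runner with no extra dev dependency:

    cd backend && npm test    # runs: node --test "test/*.test.js"

(Assumes a Node version whose test runner accepts glob patterns for `--test`; Node 22+ does.)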
+    const ON = 2;
+    const onFrame = Buffer.from(JSON.stringify([ON, "progress", []]));
+    const lengthPrefix = Buffer.alloc(4);
+    lengthPrefix.writeUInt32LE(onFrame.length, 0);
+    clientSide.write(Buffer.concat([lengthPrefix, onFrame]));
+
+    // Give the server a tick to install its listener on `handler`.
+    await new Promise((resolve) => setImmediate(resolve));
+    assert.equal(
+      handler.listenerCount("progress"),
+      1,
+      "rpc-reflector should attach exactly one listener after [ON, ...]",
+    );
+
+    // Now simulate the client going away — what an RN reload looks like
+    // from the backend's perspective once the IPC socket closes.
+    clientSide.destroy();
+    await new Promise((resolve) => setImmediate(resolve));
+
+    assert.equal(
+      handler.listenerCount("progress"),
+      0,
+      "listener must be removed once the socket closes (regression: " +
+        "without SocketMessagePort.emit('close'), the cleanup path " +
+        "in ComapeoRpcServer never runs and listeners leak)",
+    );
+  },
+);
+
+// FramedStream is the one piece we lean on directly — make sure the
+// test harness's pair() actually exercises it the way production does
+// (length-prefixed framing, both sides observing close).
+test("smoke: FramedStream wired through pair() round-trips a message", async () => {
+  const [a, b] = pair();
+  const fa = new FramedStream(a);
+  const fb = new FramedStream(b);
+
+  const received = new Promise((resolve) => fb.once("data", resolve));
+  fa.write(Buffer.from("hello"));
+
+  const buf = await received;
+  assert.equal(buf.toString(), "hello");
+});
diff --git a/ios/ComapeoCoreModule.swift b/ios/ComapeoCoreModule.swift
index 030b55d..b57591a 100644
--- a/ios/ComapeoCoreModule.swift
+++ b/ios/ComapeoCoreModule.swift
@@ -65,6 +65,40 @@ public class ComapeoCoreModule: Module {
     }
 
     OnDestroy {
+      // OnDestroy fires reliably on iOS JS reload as of
+      // expo-modules-core's PR #33760 (merged Dec 2024, shipped in
+      // SDK 53+). Previously, a strong-reference cycle through
+      // `MainValueConverter` kept `AppContext` alive across
+      // reloads — and as long as `AppContext` stayed alive, its
+      // `ModuleHolder`s stayed alive, their deinits never ran,
+      // and the `.moduleDestroy` event that triggers OnDestroy
+      // was never posted. PR #33760 changed
+      // `MainValueConverter.appContext` to a weak reference,
+      // which breaks the cycle so `AppContext` can be released
+      // on reload. Verified against the installed
+      // `expo-modules-core@55.0.23`:
+      //   ios/Core/MainValueConverter.swift:7
+      //     `private(set) weak var appContext: AppContext?`
+      //   ios/Core/ModuleHolder.swift:140
+      //     `deinit { post(event: .moduleDestroy) }`
+      // On reload: AppContext deinits → its module registry
+      // releases each ModuleHolder → each ModuleHolder's deinit
+      // fires `.moduleDestroy` → this block runs.
+      //
+      // `disconnect()` is already synchronous on iOS
+      // (`shutdown(2)` → join receive loop → `close(2)`), so the
+      // backend observes EOF before this OnDestroy block returns,
+      // and the rpc-reflector subscription cleanup runs against
+      // the prior session's connection.
+      //
+      // If a future SDK upgrade or a third-party module
+      // reintroduces a strong reference that pins `AppContext`,
+      // the fallback is to subscribe to
+      // `RCTBridgeWillReloadNotification` /
+      // `RCTJavaScriptWillStartLoadingNotification` in OnCreate
+      // and call `disconnect()` from the notification handler.
+      // We don't pre-emptively wire that here — a single
+      // teardown path keeps the lifecycle easier to reason about.
       self.ipc?.disconnect()
       self.ipc = nil
     }
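For reference, a sketch of the fallback described in the comment above, in case a future regression pins `AppContext` again. This is not part of the PR and is untested; the notification name is the one React Native's `RCTBridge` posts before tearing down the JS context, and the placement in `OnCreate` is an assumption:

    OnCreate {
      // Hypothetical fallback: disconnect ahead of the JS reload even if
      // OnDestroy never fires because AppContext is being kept alive.
      NotificationCenter.default.addObserver(
        forName: NSNotification.Name("RCTBridgeWillReloadNotification"),
        object: nil,
        queue: nil
      ) { [weak self] _ in
        self?.ipc?.disconnect()
        self?.ipc = nil
      }
      // ... existing OnCreate body ...
    }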