From ed695283fa962a9a2adf9215fdc48d6d522becf6 Mon Sep 17 00:00:00 2001 From: Gregor MacLennan Date: Thu, 30 Apr 2026 23:11:19 +0100 Subject: [PATCH 1/3] android: fold waitForFile into connect retry loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two NodeJSIPC instances are constructed back-to-back in ComapeoCoreModule.OnCreate, each calling waitForFile() with FileObserver on the same parent directory. Android's FileObserver$ObserverThread maps watch descriptors to observers via SparseArray.put(wd, ref); inotify_add_watch returns the same wd for repeat watches on the same path, so the second registration silently overwrites the first (issuetracker.google.com/37017033). In release builds Module.OnCreate fires fast enough that both observers register before either socket file is bound — the comapeo.sock observer loses the wd slot and never receives events, hanging the IPC for the full 30 s timeout. JS-side RPCs then time out with no socket connection to deliver them. Drop waitForFile entirely. LocalSocket.connect throws IOException for both ENOENT (file missing) and ECONNREFUSED (file exists, server not yet accepting) — same primitive handles both phases of backend startup. Replace connectWithRetry's 5-attempt exponential backoff with a fixed 50 ms cadence bounded by a 30 s deadline (matching the prior cumulative budget). One race-prone primitive instead of two; no FileObserver dependency. Verified end-to-end on emulator release build: state reaches STARTED in ~2 s; all 36 jasmine RPC tests pass. --- .../java/com/comapeo/core/WatchForFileTest.kt | 180 ------------------ .../com/comapeo/core/ComapeoCoreModule.kt | 2 +- .../main/java/com/comapeo/core/NodeJSIPC.kt | 53 +++--- .../java/com/comapeo/core/watchForFile.kt | 57 ------ .../comapeo/core/WatchForFileTimeoutTest.kt | 86 --------- apps/example/tests/android/WaitForFileTest.kt | 112 ----------- 6 files changed, 27 insertions(+), 463 deletions(-) delete mode 100644 android/src/androidTest/java/com/comapeo/core/WatchForFileTest.kt delete mode 100644 android/src/main/java/com/comapeo/core/watchForFile.kt delete mode 100644 android/src/test/java/com/comapeo/core/WatchForFileTimeoutTest.kt delete mode 100644 apps/example/tests/android/WaitForFileTest.kt diff --git a/android/src/androidTest/java/com/comapeo/core/WatchForFileTest.kt b/android/src/androidTest/java/com/comapeo/core/WatchForFileTest.kt deleted file mode 100644 index e0fcfa4..0000000 --- a/android/src/androidTest/java/com/comapeo/core/WatchForFileTest.kt +++ /dev/null @@ -1,180 +0,0 @@ -package com.comapeo.core - -import androidx.test.ext.junit.runners.AndroidJUnit4 -import androidx.test.platform.app.InstrumentationRegistry -import kotlinx.coroutines.async -import kotlinx.coroutines.cancel -import kotlinx.coroutines.delay -import kotlinx.coroutines.launch -import kotlinx.coroutines.runBlocking -import kotlinx.coroutines.withTimeout -import kotlinx.coroutines.withTimeoutOrNull -import org.junit.After -import org.junit.Assert.assertEquals -import org.junit.Assert.assertNotNull -import org.junit.Assert.assertNull -import org.junit.Before -import org.junit.Test -import org.junit.runner.RunWith -import java.io.File - -/** - * Instrumented tests for [waitForFile]. - * - * Tests the file watcher that uses Android's [android.os.FileObserver] - * to suspend until a file is created. Must run on a device/emulator - * because FileObserver requires the Android OS. - */ -@RunWith(AndroidJUnit4::class) -class WatchForFileTest { - - private lateinit var testDir: File - private lateinit var testFile: File - - @Before - fun setUp() { - val context = InstrumentationRegistry.getInstrumentation().targetContext - testDir = File(context.cacheDir, "watchforfile_test_${System.nanoTime()}") - testDir.mkdirs() - testFile = File(testDir, "test.sock") - testFile.delete() - } - - @After - fun tearDown() { - testDir.deleteRecursively() - } - - @Test - fun returnsImmediatelyIfFileExists() = runBlocking { - // Create the file first - testFile.createNewFile() - - val result = withTimeout(2000) { - waitForFile(testFile) - } - assertEquals(testFile, result) - } - - @Test - fun suspendsUntilFileIsCreated() = runBlocking { - val result = async { - waitForFile(testFile) - } - - // File doesn't exist yet — waitForFile should be suspended - delay(500) - - // Create the file after a delay - testFile.createNewFile() - - val file = withTimeout(5000) { - result.await() - } - assertEquals(testFile, file) - } - - @Test - fun handlesFileCreatedAfterShortDelay() = runBlocking { - val result = async { - waitForFile(testFile) - } - - // Create the file after 2 seconds - launch { - delay(2000) - testFile.createNewFile() - } - - val file = withTimeout(5000) { - result.await() - } - assertEquals(testFile, file) - } - - @Test - fun cancellationStopsWatching() = runBlocking { - val result = async { - waitForFile(testFile) - } - - // Cancel before the file is created - delay(500) - result.cancel() - - // Verify the deferred was cancelled - try { - result.await() - // Should not reach here - assert(false) { "Should have thrown CancellationException" } - } catch (e: kotlinx.coroutines.CancellationException) { - // Expected - } - } - - @Test - fun doesNotResumeForDifferentFile() = runBlocking { - val result = async { - waitForFile(testFile) - } - - // Create a DIFFERENT file in the same directory - delay(500) - File(testDir, "other_file.sock").createNewFile() - - // waitForFile should NOT have resumed - delay(1000) - val nullResult = withTimeoutOrNull(1000) { - result.await() - } - assertNull("Should not have resumed for different file", nullResult) - - // Now create the actual file - testFile.createNewFile() - - val file = withTimeout(5000) { - result.await() - } - assertEquals(testFile, file) - } - - @Test - fun createsParentDirectoryIfMissing() = runBlocking { - val nestedFile = File(File(testDir, "nested/deep"), "test.sock") - - val result = async { - waitForFile(nestedFile) - } - - // The parent directory should have been created by waitForFile - delay(500) - assert(nestedFile.parentFile!!.exists()) { "Parent directory should be created" } - - // Create the file - nestedFile.createNewFile() - - val file = withTimeout(5000) { - result.await() - } - assertEquals(nestedFile, file) - } - - @Test - fun handlesRapidFileCreateDelete() = runBlocking { - // Test the TOCTOU scenario: create and delete the file rapidly, - // then create it again. The watcher should eventually detect it. - val result = async { - waitForFile(testFile) - } - - delay(500) - - // Create the file — this should trigger the observer - testFile.createNewFile() - - val file = withTimeout(5000) { - result.await() - } - assertEquals(testFile, file) - } -} diff --git a/android/src/main/java/com/comapeo/core/ComapeoCoreModule.kt b/android/src/main/java/com/comapeo/core/ComapeoCoreModule.kt index 8692a31..d5eed55 100644 --- a/android/src/main/java/com/comapeo/core/ComapeoCoreModule.kt +++ b/android/src/main/java/com/comapeo/core/ComapeoCoreModule.kt @@ -27,7 +27,7 @@ class ComapeoCoreModule : Module() { * * The control socket file lives in the app's filesDir, accessible from * any process that shares the same UID (i.e. the FGS and the main - * process). The IPC's `waitForFile` polls until the FGS / Node binds. + * process). The IPC's connect loop retries until the FGS / Node binds. */ private lateinit var controlIpc: NodeJSIPC diff --git a/android/src/main/java/com/comapeo/core/NodeJSIPC.kt b/android/src/main/java/com/comapeo/core/NodeJSIPC.kt index f49ab27..cc1aaf0 100644 --- a/android/src/main/java/com/comapeo/core/NodeJSIPC.kt +++ b/android/src/main/java/com/comapeo/core/NodeJSIPC.kt @@ -15,6 +15,7 @@ import kotlinx.coroutines.flow.MutableStateFlow import kotlinx.coroutines.flow.first import kotlinx.coroutines.isActive import kotlinx.coroutines.launch +import kotlinx.coroutines.withTimeout import java.io.DataInputStream import java.io.DataOutputStream import java.io.EOFException @@ -110,7 +111,6 @@ class NodeJSIPC( } } } - waitForFile(socketFile) if (::socket.isInitialized) { try { socket.close() @@ -233,37 +233,36 @@ class NodeJSIPC( } } +/** + * Connect with a fixed-cadence retry loop bounded by an overall deadline. + * + * Retries fire on every `IOException` from `LocalSocket.connect`, which covers + * both "socket file does not exist yet" (`ENOENT`) and "file exists but the + * server is not yet `accept`ing" (`ECONNREFUSED`) — the same primitive handles + * both phases of backend startup. The 50 ms cadence is fast enough to be + * invisible to TTI; the 30 s deadline matches the prior `waitForFile` timeout + * so the cumulative startup wait budget is unchanged. + * + * No exponential backoff: this is a one-shot startup wait, not a network call, + * and the failure mode we're tolerating is "backend not finished booting yet" + * — it doesn't get worse from retrying tightly. + */ private suspend fun connectWithRetry( socketAddress: LocalSocketAddress, - maxRetries: Int = 5, - initialDelayMs: Long = 100, - maxDelayMs: Long = 5000, - backoffMultiplier: Double = 2.0 -): LocalSocket { - var currentDelay = initialDelayMs - var lastException: IOException? = null - - repeat(maxRetries) { attempt -> + deadlineMs: Long = 30_000, + intervalMs: Long = 50, +): LocalSocket = withTimeout(deadlineMs) { + var attempt = 0 + while (true) { + attempt++ try { val socket = LocalSocket() socket.connect(socketAddress) - log("Connected on attempt ${attempt + 1}") - return socket - } catch (e: IOException) { - lastException = e - - if (attempt < maxRetries - 1) { - delay(currentDelay) - currentDelay = minOf( - (currentDelay * backoffMultiplier).toLong(), - maxDelayMs - ) - } + log("Connected on attempt $attempt") + return@withTimeout socket + } catch (_: IOException) { + delay(intervalMs) } } - - throw IOException( - "Failed to connect after $maxRetries attempts", - lastException - ) + @Suppress("UNREACHABLE_CODE") error("unreachable") } diff --git a/android/src/main/java/com/comapeo/core/watchForFile.kt b/android/src/main/java/com/comapeo/core/watchForFile.kt deleted file mode 100644 index 9b1178f..0000000 --- a/android/src/main/java/com/comapeo/core/watchForFile.kt +++ /dev/null @@ -1,57 +0,0 @@ -package com.comapeo.core - -import android.os.FileObserver -import kotlinx.coroutines.suspendCancellableCoroutine -import kotlinx.coroutines.withTimeout -import java.io.File - -/** - * A suspendable function that watches for file creation at the specified path. - * The function will suspend until the specified file is created or timeout occurs. - * - * @param file The file to watch for creation - * @param timeoutMs Maximum time to wait for the file to appear (default: 30 seconds) - * @return The created File object - * @throws IllegalArgumentException if the file has no parent directory - * @throws kotlinx.coroutines.TimeoutCancellationException if the file is not created within the timeout - */ -suspend fun waitForFile(file: File, timeoutMs: Long = 30_000L): File = withTimeout(timeoutMs) { - suspendCancellableCoroutine { continuation -> - - // Get parent directory and throw if null - val parentDir = file.parentFile ?: throw IllegalArgumentException("File must have a parent directory") - - // Create the directory if it doesn't exist - if (!parentDir.exists()) { - parentDir.mkdirs() - } - - // Create file observer BEFORE checking existence to avoid TOCTOU race. - val observer = object : FileObserver(parentDir, CREATE) { - override fun onEvent(event: Int, path: String?) { - if (path == file.name) { - stopWatching() - if (!continuation.isCompleted) { - continuation.resumeWith(Result.success(file)) - } - } - } - } - - observer.startWatching() - - // Check AFTER starting the observer — if the file was created between - // observer setup and this check, the observer will have caught it. - // If it existed before, we catch it here. - if (file.exists()) { - observer.stopWatching() - if (!continuation.isCompleted) { - continuation.resumeWith(Result.success(file)) - } - } - - continuation.invokeOnCancellation { - observer.stopWatching() - } - } -} diff --git a/android/src/test/java/com/comapeo/core/WatchForFileTimeoutTest.kt b/android/src/test/java/com/comapeo/core/WatchForFileTimeoutTest.kt deleted file mode 100644 index f2e3723..0000000 --- a/android/src/test/java/com/comapeo/core/WatchForFileTimeoutTest.kt +++ /dev/null @@ -1,86 +0,0 @@ -package com.comapeo.core - -import kotlinx.coroutines.TimeoutCancellationException -import kotlinx.coroutines.suspendCancellableCoroutine -import kotlinx.coroutines.test.runTest -import kotlinx.coroutines.withTimeout -import org.junit.Assert.assertEquals -import org.junit.Assert.assertTrue -import org.junit.Assert.fail -import org.junit.Test -import kotlin.coroutines.resume - -/** - * JVM unit tests for the timeout pattern used in [waitForFile]. - * - * The actual waitForFile function uses Android's FileObserver and can't run - * on JVM. These tests verify the coroutine timeout wrapper pattern: - * - withTimeout throws TimeoutCancellationException when file never appears - * - suspendCancellableCoroutine resumes immediately when condition is met - * - Cancellation propagates correctly - */ -class WatchForFileTimeoutTest { - - @Test(expected = TimeoutCancellationException::class) - fun timeoutThrowsWhenFileNeverAppears() = runTest { - // Simulates waitForFile when the file is never created - withTimeout(100) { - suspendCancellableCoroutine { /* never resumed */ } - } - } - - @Test - fun immediateResumeWhenAlreadyExists() = runTest { - // Simulates the "check after observer start" path where file already exists - val result = withTimeout(1000) { - suspendCancellableCoroutine { continuation -> - // Simulates: observer.startWatching() then file.exists() == true - val fileAlreadyExists = true - if (fileAlreadyExists) { - continuation.resume("found") - } - } - } - assertEquals("found", result) - } - - @Test - fun cancellationCallbackInvoked() = runTest { - var cleanupCalled = false - - try { - withTimeout(100) { - suspendCancellableCoroutine { continuation -> - continuation.invokeOnCancellation { - // Simulates: observer.stopWatching() - cleanupCalled = true - } - // Never resume — simulates file never appearing - } - } - } catch (_: TimeoutCancellationException) { - // Expected - } - - assertTrue("Cleanup (observer.stopWatching) should be called on timeout", cleanupCalled) - } - - @Test - fun doubleResumeIsIgnored() = runTest { - // Simulates the race where both observer and exists-check try to resume - // The fix uses continuation.isCompleted to guard against double resume - val result = withTimeout(1000) { - suspendCancellableCoroutine { continuation -> - // First resume (from observer callback) - if (!continuation.isCompleted) { - continuation.resume("from-observer") - } - // Second resume (from exists check) — should be safely skipped - if (!continuation.isCompleted) { - fail("Should not reach second resume after first completed") - } - } - } - assertEquals("from-observer", result) - } -} diff --git a/apps/example/tests/android/WaitForFileTest.kt b/apps/example/tests/android/WaitForFileTest.kt deleted file mode 100644 index 1b3d08c..0000000 --- a/apps/example/tests/android/WaitForFileTest.kt +++ /dev/null @@ -1,112 +0,0 @@ -package com.comapeo.core.example - -import androidx.test.ext.junit.runners.AndroidJUnit4 -import androidx.test.platform.app.InstrumentationRegistry -import com.comapeo.core.waitForFile -import kotlinx.coroutines.TimeoutCancellationException -import kotlinx.coroutines.launch -import kotlinx.coroutines.test.runTest -import org.junit.After -import org.junit.Assert.assertEquals -import org.junit.Assert.assertTrue -import org.junit.Assert.fail -import org.junit.Before -import org.junit.Test -import org.junit.runner.RunWith -import java.io.File - -/** - * Instrumented tests for [waitForFile]. - * - * These tests verify: - * - Timeout when file never appears (fix: added withTimeout wrapper) - * - Immediate return when file already exists - * - Observer detects file creation (TOCTOU fix: observer starts before exists check) - */ -@RunWith(AndroidJUnit4::class) -class WaitForFileTest { - - private lateinit var testDir: File - - @Before - fun setUp() { - val context = InstrumentationRegistry.getInstrumentation().targetContext - testDir = File(context.cacheDir, "waitForFile-test-${System.currentTimeMillis()}") - testDir.mkdirs() - } - - @After - fun tearDown() { - testDir.deleteRecursively() - } - - @Test(expected = TimeoutCancellationException::class) - fun timeoutWhenFileNeverAppears() = runTest { - val file = File(testDir, "never-created.txt") - waitForFile(file, timeoutMs = 500) - } - - @Test - fun returnsImmediatelyWhenFileAlreadyExists() = runTest { - val file = File(testDir, "already-exists.txt") - file.createNewFile() - - val result = waitForFile(file, timeoutMs = 2000) - assertEquals(file, result) - } - - @Test - fun detectsFileCreatedAfterWatchStarts() = runTest { - val file = File(testDir, "created-later.txt") - - // Launch waitForFile in background - val job = launch { - val result = waitForFile(file, timeoutMs = 5000) - assertEquals(file, result) - } - - // Wait a bit then create the file - kotlinx.coroutines.delay(500) - file.createNewFile() - - job.join() - assertTrue("Job should complete successfully", job.isCompleted) - } - - @Test - fun createsParentDirectoryIfMissing() = runTest { - val nestedDir = File(testDir, "nested/deep") - val file = File(nestedDir, "target.txt") - - // Parent doesn't exist yet - assertTrue(!nestedDir.exists()) - - val job = launch { - waitForFile(file, timeoutMs = 3000) - } - - // waitForFile should create the parent dir — give it a moment - kotlinx.coroutines.delay(200) - assertTrue("Parent directory should be created", nestedDir.exists()) - - // Create the file so the coroutine completes - file.createNewFile() - job.join() - } - - @Test - fun throwsForFileWithNoParent() = runTest { - // A root-level File object on Android won't have a null parent in practice, - // but we can test the error path by using a File with explicit empty parent - try { - // File("/") has parentFile == null on some systems - val rootFile = File("/") - if (rootFile.parentFile == null) { - waitForFile(rootFile, timeoutMs = 500) - fail("Should throw IllegalArgumentException for file with no parent") - } - } catch (_: IllegalArgumentException) { - // Expected - } - } -} From fcb069193051cb4c4e09ff3f1a2662dd10b313da Mon Sep 17 00:00:00 2001 From: Gregor MacLennan Date: Thu, 30 Apr 2026 23:23:33 +0100 Subject: [PATCH 2/3] review nits: drop unreachable suppress; update Android docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `connectWithRetry`: replace `return@withTimeout` + `@Suppress("UNREACHABLE_CODE") error(...)` with a `lateinit var connected` + `break` pattern. Same semantics, the type checker no longer needs the suppression. - Android-specific doc references to `waitForFile`/`watchForFile` removed from agents.md (file tree), docs/ARCHITECTURE.md (table row + ascii diagram + prose), and e2e/README.md (instrumented test list). iOS-side mentions are preserved — iOS still uses `waitForFile`. --- agents.md | 1 - .../src/main/java/com/comapeo/core/NodeJSIPC.kt | 8 ++++---- docs/ARCHITECTURE.md | 10 +++++----- e2e/README.md | 15 --------------- 4 files changed, 9 insertions(+), 25 deletions(-) diff --git a/agents.md b/agents.md index d91ad67..828de76 100644 --- a/agents.md +++ b/agents.md @@ -96,7 +96,6 @@ Messages are framed with a **4-byte little-endian length prefix** followed by a │ │ ├── ComapeoCoreReactActivityLifecycleListener.kt │ │ ├── ComapeoCorePackage.kt │ │ ├── Actions.kt -│ │ ├── watchForFile.kt │ │ └── log.kt │ ├── src/main/cpp/ │ │ ├── jni-bridge.cpp # JNI bridge to libnode.so + stdout/stderr → logcat diff --git a/android/src/main/java/com/comapeo/core/NodeJSIPC.kt b/android/src/main/java/com/comapeo/core/NodeJSIPC.kt index cc1aaf0..3023cfe 100644 --- a/android/src/main/java/com/comapeo/core/NodeJSIPC.kt +++ b/android/src/main/java/com/comapeo/core/NodeJSIPC.kt @@ -252,17 +252,17 @@ private suspend fun connectWithRetry( deadlineMs: Long = 30_000, intervalMs: Long = 50, ): LocalSocket = withTimeout(deadlineMs) { + lateinit var connected: LocalSocket var attempt = 0 while (true) { attempt++ try { - val socket = LocalSocket() - socket.connect(socketAddress) + connected = LocalSocket().also { it.connect(socketAddress) } log("Connected on attempt $attempt") - return@withTimeout socket + break } catch (_: IOException) { delay(intervalMs) } } - @Suppress("UNREACHABLE_CODE") error("unreachable") + connected } diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 63873df..e4ed047 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -112,7 +112,7 @@ has been constructed. The main app process connects and uses A late connection (i.e. the React Native module connects after Node has already bound this socket) is the steady state: the backend always binds this socket some hundreds of milliseconds after process start. The -`NodeJSIPC.waitForFile()` poll handles the gap. +`NodeJSIPC.connectWithRetry()` 50 ms-cadence retry loop handles the gap. #### `control.sock` — lifecycle + handshake @@ -198,8 +198,8 @@ frames (each `type` is its own protocol). │ │ │ │ │ │ │ connect via │ │ │ ◄───────────────── │ NodeJSIPC │ - │ │ │ (waitForFile + │ - │ │ │ retry) │ + │ │ │ (50 ms retry │ + │ │ │ loop) │ │ │ │ │ │ │ │ │ broadcast │ │ ──{started}──► │ │ @@ -480,7 +480,7 @@ codebases. | 5 | Android `startupTimeoutMs` | 30 s | Mirrors #1 | Sends `error-native` to backend (best-effort; see #7) **and** sets local `BackendState.Error(starting-timeout)` | | 6 | Android `ComapeoCoreService.onDestroy` `withTimeout` | 10 s | `nodeJSService.stop()` hangs | Catches `TimeoutCancellationException` → `Process.killProcess`. This is the only outer bound on Android `stop()` — it has no internal timeout | | 7 | Android `SEND_ERROR_NATIVE_TIMEOUT_MS` | 2 s | `ipcDeferred` never completes (FGS init threw before IPC was constructed) | Frame logged as dropped, no error thrown — the FGS still sets local `ERROR` regardless | -| 8 | Android `waitForFile` | 30 s | Same as #3 | Throws `TimeoutCancellationException` | +| 8 | Android `connectWithRetry` (`NodeJSIPC`) | 30 s | Backend never binds the socket OR file exists but `accept()` never ready | IPC `State.Error` | **Backend (Node):** @@ -572,7 +572,7 @@ The current approach (§4.4) is a second `NodeJSIPC` connection to | **Second `NodeJSIPC` (current)** | Replay in Node | Yes | Low (~30 LOC) | **Selected.** Reuses existing socket client class; replay logic lives in JS where the rest of the lifecycle lives; iOS uses the in-process equivalent so the "Node owns the truth" abstraction holds across platforms. | The strongest case against the current approach is "it's a second -persistent socket paying connection overhead and a `waitForFile` poll +persistent socket paying connection overhead and a 50 ms retry poll for a channel that carries ~6 messages per process lifetime." That cost is real but small. If profiling later shows it matters, **ContentProvider + ContentObserver** is the runner-up. diff --git a/e2e/README.md b/e2e/README.md index d29f382..8d8b3bb 100644 --- a/e2e/README.md +++ b/e2e/README.md @@ -56,7 +56,6 @@ android/src/androidTest/java/com/comapeo/core/ # Instrumented tests (device) NodeJSIPCTest.kt # IPC: connect, send, receive, framing, disconnect ServiceLifecycleTest.kt # Service: start, stop, process isolation, notification ShutdownPathTest.kt # Shutdown: graceful stop, process kill, recovery cycles - WatchForFileTest.kt # FileObserver: creation, cancellation, TOCTOU e2e/.maestro/ # Maestro UI flows app-launch.yaml # App launches and displays all UI sections @@ -113,20 +112,6 @@ Tests the critical shutdown and recovery path. | `multipleStopStartCycles` | 3 start/stop cycles work (fresh process each) | | `appForceStopCleansUpService` | `am force-stop` kills both processes | -### WatchForFileTest - -Tests the `waitForFile` suspending function. - -| Test | What it verifies | -|---|---| -| `returnsImmediatelyIfFileExists` | No suspension if file exists | -| `suspendsUntilFileIsCreated` | Suspends, resumes on file creation | -| `handlesFileCreatedAfterShortDelay` | 2s delay before creation | -| `cancellationStopsWatching` | Coroutine cancellation cleans up | -| `doesNotResumeForDifferentFile` | Only the target filename triggers resume | -| `createsParentDirectoryIfMissing` | Creates missing parent dirs | -| `handlesRapidFileCreateDelete` | TOCTOU scenario | - ## Writing new tests ### Instrumented tests From c83a949bb9418cfabbd8bb5b456abec1f77c4bc0 Mon Sep 17 00:00:00 2001 From: Gregor MacLennan Date: Fri, 1 May 2026 11:17:15 +0100 Subject: [PATCH 3/3] android: close failed sockets and surface real cause on timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes from PR review: - `LocalSocket.connect` opens an fd before it can throw, so each failed retry leaks one until GC. Over a 30s × 50ms window that's hundreds of unclosed sockets — close on IOException before the next attempt. - On timeout, `withTimeout` discards the underlying IOException, so `State.Error` only carries "Timed out for 30000 ms" with no hint of which syscall was failing. Translate to an IOException with the last failure as the cause and the attempt count in the message. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../main/java/com/comapeo/core/NodeJSIPC.kt | 49 ++++++++++++++----- 1 file changed, 37 insertions(+), 12 deletions(-) diff --git a/android/src/main/java/com/comapeo/core/NodeJSIPC.kt b/android/src/main/java/com/comapeo/core/NodeJSIPC.kt index 3023cfe..11cb650 100644 --- a/android/src/main/java/com/comapeo/core/NodeJSIPC.kt +++ b/android/src/main/java/com/comapeo/core/NodeJSIPC.kt @@ -15,6 +15,7 @@ import kotlinx.coroutines.flow.MutableStateFlow import kotlinx.coroutines.flow.first import kotlinx.coroutines.isActive import kotlinx.coroutines.launch +import kotlinx.coroutines.TimeoutCancellationException import kotlinx.coroutines.withTimeout import java.io.DataInputStream import java.io.DataOutputStream @@ -251,18 +252,42 @@ private suspend fun connectWithRetry( socketAddress: LocalSocketAddress, deadlineMs: Long = 30_000, intervalMs: Long = 50, -): LocalSocket = withTimeout(deadlineMs) { - lateinit var connected: LocalSocket - var attempt = 0 - while (true) { - attempt++ - try { - connected = LocalSocket().also { it.connect(socketAddress) } - log("Connected on attempt $attempt") - break - } catch (_: IOException) { - delay(intervalMs) +): LocalSocket { + var lastFailure: IOException? = null + var attempts = 0 + val connected = try { + withTimeout(deadlineMs) { + // `LocalSocket.connect` opens a real fd before it can throw + // (`LocalSocketImpl.create` runs before `connectLocal`), so + // each failed attempt's socket has to be closed before the + // next iteration — otherwise we'd accumulate hundreds of + // file descriptors over the deadline window. + lateinit var s: LocalSocket + while (true) { + attempts++ + val candidate = LocalSocket() + try { + candidate.connect(socketAddress) + s = candidate + break + } catch (e: IOException) { + try { candidate.close() } catch (_: Exception) {} + lastFailure = e + delay(intervalMs) + } + } + s } + } catch (e: TimeoutCancellationException) { + // Translate the timeout into an IOException carrying the last + // connect failure as the cause; otherwise `State.Error` would + // surface only "Timed out for 30000 ms" with no hint of which + // syscall was failing or how many attempts ran. + throw IOException( + "Timed out connecting to socket after ${deadlineMs}ms across $attempts attempts", + lastFailure, + ) } - connected + log("Connected on attempt $attempts") + return connected }