diff --git a/agents.md b/agents.md index d91ad67..828de76 100644 --- a/agents.md +++ b/agents.md @@ -96,7 +96,6 @@ Messages are framed with a **4-byte little-endian length prefix** followed by a │ │ ├── ComapeoCoreReactActivityLifecycleListener.kt │ │ ├── ComapeoCorePackage.kt │ │ ├── Actions.kt -│ │ ├── watchForFile.kt │ │ └── log.kt │ ├── src/main/cpp/ │ │ ├── jni-bridge.cpp # JNI bridge to libnode.so + stdout/stderr → logcat diff --git a/android/src/androidTest/java/com/comapeo/core/WatchForFileTest.kt b/android/src/androidTest/java/com/comapeo/core/WatchForFileTest.kt deleted file mode 100644 index e0fcfa4..0000000 --- a/android/src/androidTest/java/com/comapeo/core/WatchForFileTest.kt +++ /dev/null @@ -1,180 +0,0 @@ -package com.comapeo.core - -import androidx.test.ext.junit.runners.AndroidJUnit4 -import androidx.test.platform.app.InstrumentationRegistry -import kotlinx.coroutines.async -import kotlinx.coroutines.cancel -import kotlinx.coroutines.delay -import kotlinx.coroutines.launch -import kotlinx.coroutines.runBlocking -import kotlinx.coroutines.withTimeout -import kotlinx.coroutines.withTimeoutOrNull -import org.junit.After -import org.junit.Assert.assertEquals -import org.junit.Assert.assertNotNull -import org.junit.Assert.assertNull -import org.junit.Before -import org.junit.Test -import org.junit.runner.RunWith -import java.io.File - -/** - * Instrumented tests for [waitForFile]. - * - * Tests the file watcher that uses Android's [android.os.FileObserver] - * to suspend until a file is created. Must run on a device/emulator - * because FileObserver requires the Android OS. - */ -@RunWith(AndroidJUnit4::class) -class WatchForFileTest { - - private lateinit var testDir: File - private lateinit var testFile: File - - @Before - fun setUp() { - val context = InstrumentationRegistry.getInstrumentation().targetContext - testDir = File(context.cacheDir, "watchforfile_test_${System.nanoTime()}") - testDir.mkdirs() - testFile = File(testDir, "test.sock") - testFile.delete() - } - - @After - fun tearDown() { - testDir.deleteRecursively() - } - - @Test - fun returnsImmediatelyIfFileExists() = runBlocking { - // Create the file first - testFile.createNewFile() - - val result = withTimeout(2000) { - waitForFile(testFile) - } - assertEquals(testFile, result) - } - - @Test - fun suspendsUntilFileIsCreated() = runBlocking { - val result = async { - waitForFile(testFile) - } - - // File doesn't exist yet — waitForFile should be suspended - delay(500) - - // Create the file after a delay - testFile.createNewFile() - - val file = withTimeout(5000) { - result.await() - } - assertEquals(testFile, file) - } - - @Test - fun handlesFileCreatedAfterShortDelay() = runBlocking { - val result = async { - waitForFile(testFile) - } - - // Create the file after 2 seconds - launch { - delay(2000) - testFile.createNewFile() - } - - val file = withTimeout(5000) { - result.await() - } - assertEquals(testFile, file) - } - - @Test - fun cancellationStopsWatching() = runBlocking { - val result = async { - waitForFile(testFile) - } - - // Cancel before the file is created - delay(500) - result.cancel() - - // Verify the deferred was cancelled - try { - result.await() - // Should not reach here - assert(false) { "Should have thrown CancellationException" } - } catch (e: kotlinx.coroutines.CancellationException) { - // Expected - } - } - - @Test - fun doesNotResumeForDifferentFile() = runBlocking { - val result = async { - waitForFile(testFile) - } - - // Create a DIFFERENT file in the same directory - delay(500) - File(testDir, "other_file.sock").createNewFile() - - // waitForFile should NOT have resumed - delay(1000) - val nullResult = withTimeoutOrNull(1000) { - result.await() - } - assertNull("Should not have resumed for different file", nullResult) - - // Now create the actual file - testFile.createNewFile() - - val file = withTimeout(5000) { - result.await() - } - assertEquals(testFile, file) - } - - @Test - fun createsParentDirectoryIfMissing() = runBlocking { - val nestedFile = File(File(testDir, "nested/deep"), "test.sock") - - val result = async { - waitForFile(nestedFile) - } - - // The parent directory should have been created by waitForFile - delay(500) - assert(nestedFile.parentFile!!.exists()) { "Parent directory should be created" } - - // Create the file - nestedFile.createNewFile() - - val file = withTimeout(5000) { - result.await() - } - assertEquals(nestedFile, file) - } - - @Test - fun handlesRapidFileCreateDelete() = runBlocking { - // Test the TOCTOU scenario: create and delete the file rapidly, - // then create it again. The watcher should eventually detect it. - val result = async { - waitForFile(testFile) - } - - delay(500) - - // Create the file — this should trigger the observer - testFile.createNewFile() - - val file = withTimeout(5000) { - result.await() - } - assertEquals(testFile, file) - } -} diff --git a/android/src/main/java/com/comapeo/core/ComapeoCoreModule.kt b/android/src/main/java/com/comapeo/core/ComapeoCoreModule.kt index 8692a31..d5eed55 100644 --- a/android/src/main/java/com/comapeo/core/ComapeoCoreModule.kt +++ b/android/src/main/java/com/comapeo/core/ComapeoCoreModule.kt @@ -27,7 +27,7 @@ class ComapeoCoreModule : Module() { * * The control socket file lives in the app's filesDir, accessible from * any process that shares the same UID (i.e. the FGS and the main - * process). The IPC's `waitForFile` polls until the FGS / Node binds. + * process). The IPC's connect loop retries until the FGS / Node binds. */ private lateinit var controlIpc: NodeJSIPC diff --git a/android/src/main/java/com/comapeo/core/NodeJSIPC.kt b/android/src/main/java/com/comapeo/core/NodeJSIPC.kt index f49ab27..11cb650 100644 --- a/android/src/main/java/com/comapeo/core/NodeJSIPC.kt +++ b/android/src/main/java/com/comapeo/core/NodeJSIPC.kt @@ -15,6 +15,8 @@ import kotlinx.coroutines.flow.MutableStateFlow import kotlinx.coroutines.flow.first import kotlinx.coroutines.isActive import kotlinx.coroutines.launch +import kotlinx.coroutines.TimeoutCancellationException +import kotlinx.coroutines.withTimeout import java.io.DataInputStream import java.io.DataOutputStream import java.io.EOFException @@ -110,7 +112,6 @@ class NodeJSIPC( } } } - waitForFile(socketFile) if (::socket.isInitialized) { try { socket.close() @@ -233,37 +234,60 @@ class NodeJSIPC( } } +/** + * Connect with a fixed-cadence retry loop bounded by an overall deadline. + * + * Retries fire on every `IOException` from `LocalSocket.connect`, which covers + * both "socket file does not exist yet" (`ENOENT`) and "file exists but the + * server is not yet `accept`ing" (`ECONNREFUSED`) — the same primitive handles + * both phases of backend startup. The 50 ms cadence is fast enough to be + * invisible to TTI; the 30 s deadline matches the prior `waitForFile` timeout + * so the cumulative startup wait budget is unchanged. + * + * No exponential backoff: this is a one-shot startup wait, not a network call, + * and the failure mode we're tolerating is "backend not finished booting yet" + * — it doesn't get worse from retrying tightly. + */ private suspend fun connectWithRetry( socketAddress: LocalSocketAddress, - maxRetries: Int = 5, - initialDelayMs: Long = 100, - maxDelayMs: Long = 5000, - backoffMultiplier: Double = 2.0 + deadlineMs: Long = 30_000, + intervalMs: Long = 50, ): LocalSocket { - var currentDelay = initialDelayMs - var lastException: IOException? = null - - repeat(maxRetries) { attempt -> - try { - val socket = LocalSocket() - socket.connect(socketAddress) - log("Connected on attempt ${attempt + 1}") - return socket - } catch (e: IOException) { - lastException = e - - if (attempt < maxRetries - 1) { - delay(currentDelay) - currentDelay = minOf( - (currentDelay * backoffMultiplier).toLong(), - maxDelayMs - ) + var lastFailure: IOException? = null + var attempts = 0 + val connected = try { + withTimeout(deadlineMs) { + // `LocalSocket.connect` opens a real fd before it can throw + // (`LocalSocketImpl.create` runs before `connectLocal`), so + // each failed attempt's socket has to be closed before the + // next iteration — otherwise we'd accumulate hundreds of + // file descriptors over the deadline window. + lateinit var s: LocalSocket + while (true) { + attempts++ + val candidate = LocalSocket() + try { + candidate.connect(socketAddress) + s = candidate + break + } catch (e: IOException) { + try { candidate.close() } catch (_: Exception) {} + lastFailure = e + delay(intervalMs) + } } + s } + } catch (e: TimeoutCancellationException) { + // Translate the timeout into an IOException carrying the last + // connect failure as the cause; otherwise `State.Error` would + // surface only "Timed out for 30000 ms" with no hint of which + // syscall was failing or how many attempts ran. + throw IOException( + "Timed out connecting to socket after ${deadlineMs}ms across $attempts attempts", + lastFailure, + ) } - - throw IOException( - "Failed to connect after $maxRetries attempts", - lastException - ) + log("Connected on attempt $attempts") + return connected } diff --git a/android/src/main/java/com/comapeo/core/watchForFile.kt b/android/src/main/java/com/comapeo/core/watchForFile.kt deleted file mode 100644 index 9b1178f..0000000 --- a/android/src/main/java/com/comapeo/core/watchForFile.kt +++ /dev/null @@ -1,57 +0,0 @@ -package com.comapeo.core - -import android.os.FileObserver -import kotlinx.coroutines.suspendCancellableCoroutine -import kotlinx.coroutines.withTimeout -import java.io.File - -/** - * A suspendable function that watches for file creation at the specified path. - * The function will suspend until the specified file is created or timeout occurs. - * - * @param file The file to watch for creation - * @param timeoutMs Maximum time to wait for the file to appear (default: 30 seconds) - * @return The created File object - * @throws IllegalArgumentException if the file has no parent directory - * @throws kotlinx.coroutines.TimeoutCancellationException if the file is not created within the timeout - */ -suspend fun waitForFile(file: File, timeoutMs: Long = 30_000L): File = withTimeout(timeoutMs) { - suspendCancellableCoroutine { continuation -> - - // Get parent directory and throw if null - val parentDir = file.parentFile ?: throw IllegalArgumentException("File must have a parent directory") - - // Create the directory if it doesn't exist - if (!parentDir.exists()) { - parentDir.mkdirs() - } - - // Create file observer BEFORE checking existence to avoid TOCTOU race. - val observer = object : FileObserver(parentDir, CREATE) { - override fun onEvent(event: Int, path: String?) { - if (path == file.name) { - stopWatching() - if (!continuation.isCompleted) { - continuation.resumeWith(Result.success(file)) - } - } - } - } - - observer.startWatching() - - // Check AFTER starting the observer — if the file was created between - // observer setup and this check, the observer will have caught it. - // If it existed before, we catch it here. - if (file.exists()) { - observer.stopWatching() - if (!continuation.isCompleted) { - continuation.resumeWith(Result.success(file)) - } - } - - continuation.invokeOnCancellation { - observer.stopWatching() - } - } -} diff --git a/android/src/test/java/com/comapeo/core/WatchForFileTimeoutTest.kt b/android/src/test/java/com/comapeo/core/WatchForFileTimeoutTest.kt deleted file mode 100644 index f2e3723..0000000 --- a/android/src/test/java/com/comapeo/core/WatchForFileTimeoutTest.kt +++ /dev/null @@ -1,86 +0,0 @@ -package com.comapeo.core - -import kotlinx.coroutines.TimeoutCancellationException -import kotlinx.coroutines.suspendCancellableCoroutine -import kotlinx.coroutines.test.runTest -import kotlinx.coroutines.withTimeout -import org.junit.Assert.assertEquals -import org.junit.Assert.assertTrue -import org.junit.Assert.fail -import org.junit.Test -import kotlin.coroutines.resume - -/** - * JVM unit tests for the timeout pattern used in [waitForFile]. - * - * The actual waitForFile function uses Android's FileObserver and can't run - * on JVM. These tests verify the coroutine timeout wrapper pattern: - * - withTimeout throws TimeoutCancellationException when file never appears - * - suspendCancellableCoroutine resumes immediately when condition is met - * - Cancellation propagates correctly - */ -class WatchForFileTimeoutTest { - - @Test(expected = TimeoutCancellationException::class) - fun timeoutThrowsWhenFileNeverAppears() = runTest { - // Simulates waitForFile when the file is never created - withTimeout(100) { - suspendCancellableCoroutine { /* never resumed */ } - } - } - - @Test - fun immediateResumeWhenAlreadyExists() = runTest { - // Simulates the "check after observer start" path where file already exists - val result = withTimeout(1000) { - suspendCancellableCoroutine { continuation -> - // Simulates: observer.startWatching() then file.exists() == true - val fileAlreadyExists = true - if (fileAlreadyExists) { - continuation.resume("found") - } - } - } - assertEquals("found", result) - } - - @Test - fun cancellationCallbackInvoked() = runTest { - var cleanupCalled = false - - try { - withTimeout(100) { - suspendCancellableCoroutine { continuation -> - continuation.invokeOnCancellation { - // Simulates: observer.stopWatching() - cleanupCalled = true - } - // Never resume — simulates file never appearing - } - } - } catch (_: TimeoutCancellationException) { - // Expected - } - - assertTrue("Cleanup (observer.stopWatching) should be called on timeout", cleanupCalled) - } - - @Test - fun doubleResumeIsIgnored() = runTest { - // Simulates the race where both observer and exists-check try to resume - // The fix uses continuation.isCompleted to guard against double resume - val result = withTimeout(1000) { - suspendCancellableCoroutine { continuation -> - // First resume (from observer callback) - if (!continuation.isCompleted) { - continuation.resume("from-observer") - } - // Second resume (from exists check) — should be safely skipped - if (!continuation.isCompleted) { - fail("Should not reach second resume after first completed") - } - } - } - assertEquals("from-observer", result) - } -} diff --git a/apps/example/tests/android/WaitForFileTest.kt b/apps/example/tests/android/WaitForFileTest.kt deleted file mode 100644 index 1b3d08c..0000000 --- a/apps/example/tests/android/WaitForFileTest.kt +++ /dev/null @@ -1,112 +0,0 @@ -package com.comapeo.core.example - -import androidx.test.ext.junit.runners.AndroidJUnit4 -import androidx.test.platform.app.InstrumentationRegistry -import com.comapeo.core.waitForFile -import kotlinx.coroutines.TimeoutCancellationException -import kotlinx.coroutines.launch -import kotlinx.coroutines.test.runTest -import org.junit.After -import org.junit.Assert.assertEquals -import org.junit.Assert.assertTrue -import org.junit.Assert.fail -import org.junit.Before -import org.junit.Test -import org.junit.runner.RunWith -import java.io.File - -/** - * Instrumented tests for [waitForFile]. - * - * These tests verify: - * - Timeout when file never appears (fix: added withTimeout wrapper) - * - Immediate return when file already exists - * - Observer detects file creation (TOCTOU fix: observer starts before exists check) - */ -@RunWith(AndroidJUnit4::class) -class WaitForFileTest { - - private lateinit var testDir: File - - @Before - fun setUp() { - val context = InstrumentationRegistry.getInstrumentation().targetContext - testDir = File(context.cacheDir, "waitForFile-test-${System.currentTimeMillis()}") - testDir.mkdirs() - } - - @After - fun tearDown() { - testDir.deleteRecursively() - } - - @Test(expected = TimeoutCancellationException::class) - fun timeoutWhenFileNeverAppears() = runTest { - val file = File(testDir, "never-created.txt") - waitForFile(file, timeoutMs = 500) - } - - @Test - fun returnsImmediatelyWhenFileAlreadyExists() = runTest { - val file = File(testDir, "already-exists.txt") - file.createNewFile() - - val result = waitForFile(file, timeoutMs = 2000) - assertEquals(file, result) - } - - @Test - fun detectsFileCreatedAfterWatchStarts() = runTest { - val file = File(testDir, "created-later.txt") - - // Launch waitForFile in background - val job = launch { - val result = waitForFile(file, timeoutMs = 5000) - assertEquals(file, result) - } - - // Wait a bit then create the file - kotlinx.coroutines.delay(500) - file.createNewFile() - - job.join() - assertTrue("Job should complete successfully", job.isCompleted) - } - - @Test - fun createsParentDirectoryIfMissing() = runTest { - val nestedDir = File(testDir, "nested/deep") - val file = File(nestedDir, "target.txt") - - // Parent doesn't exist yet - assertTrue(!nestedDir.exists()) - - val job = launch { - waitForFile(file, timeoutMs = 3000) - } - - // waitForFile should create the parent dir — give it a moment - kotlinx.coroutines.delay(200) - assertTrue("Parent directory should be created", nestedDir.exists()) - - // Create the file so the coroutine completes - file.createNewFile() - job.join() - } - - @Test - fun throwsForFileWithNoParent() = runTest { - // A root-level File object on Android won't have a null parent in practice, - // but we can test the error path by using a File with explicit empty parent - try { - // File("/") has parentFile == null on some systems - val rootFile = File("/") - if (rootFile.parentFile == null) { - waitForFile(rootFile, timeoutMs = 500) - fail("Should throw IllegalArgumentException for file with no parent") - } - } catch (_: IllegalArgumentException) { - // Expected - } - } -} diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 63873df..e4ed047 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -112,7 +112,7 @@ has been constructed. The main app process connects and uses A late connection (i.e. the React Native module connects after Node has already bound this socket) is the steady state: the backend always binds this socket some hundreds of milliseconds after process start. The -`NodeJSIPC.waitForFile()` poll handles the gap. +`NodeJSIPC.connectWithRetry()` 50 ms-cadence retry loop handles the gap. #### `control.sock` — lifecycle + handshake @@ -198,8 +198,8 @@ frames (each `type` is its own protocol). │ │ │ │ │ │ │ connect via │ │ │ ◄───────────────── │ NodeJSIPC │ - │ │ │ (waitForFile + │ - │ │ │ retry) │ + │ │ │ (50 ms retry │ + │ │ │ loop) │ │ │ │ │ │ │ │ │ broadcast │ │ ──{started}──► │ │ @@ -480,7 +480,7 @@ codebases. | 5 | Android `startupTimeoutMs` | 30 s | Mirrors #1 | Sends `error-native` to backend (best-effort; see #7) **and** sets local `BackendState.Error(starting-timeout)` | | 6 | Android `ComapeoCoreService.onDestroy` `withTimeout` | 10 s | `nodeJSService.stop()` hangs | Catches `TimeoutCancellationException` → `Process.killProcess`. This is the only outer bound on Android `stop()` — it has no internal timeout | | 7 | Android `SEND_ERROR_NATIVE_TIMEOUT_MS` | 2 s | `ipcDeferred` never completes (FGS init threw before IPC was constructed) | Frame logged as dropped, no error thrown — the FGS still sets local `ERROR` regardless | -| 8 | Android `waitForFile` | 30 s | Same as #3 | Throws `TimeoutCancellationException` | +| 8 | Android `connectWithRetry` (`NodeJSIPC`) | 30 s | Backend never binds the socket OR file exists but `accept()` never ready | IPC `State.Error` | **Backend (Node):** @@ -572,7 +572,7 @@ The current approach (§4.4) is a second `NodeJSIPC` connection to | **Second `NodeJSIPC` (current)** | Replay in Node | Yes | Low (~30 LOC) | **Selected.** Reuses existing socket client class; replay logic lives in JS where the rest of the lifecycle lives; iOS uses the in-process equivalent so the "Node owns the truth" abstraction holds across platforms. | The strongest case against the current approach is "it's a second -persistent socket paying connection overhead and a `waitForFile` poll +persistent socket paying connection overhead and a 50 ms retry poll for a channel that carries ~6 messages per process lifetime." That cost is real but small. If profiling later shows it matters, **ContentProvider + ContentObserver** is the runner-up. diff --git a/e2e/README.md b/e2e/README.md index d29f382..8d8b3bb 100644 --- a/e2e/README.md +++ b/e2e/README.md @@ -56,7 +56,6 @@ android/src/androidTest/java/com/comapeo/core/ # Instrumented tests (device) NodeJSIPCTest.kt # IPC: connect, send, receive, framing, disconnect ServiceLifecycleTest.kt # Service: start, stop, process isolation, notification ShutdownPathTest.kt # Shutdown: graceful stop, process kill, recovery cycles - WatchForFileTest.kt # FileObserver: creation, cancellation, TOCTOU e2e/.maestro/ # Maestro UI flows app-launch.yaml # App launches and displays all UI sections @@ -113,20 +112,6 @@ Tests the critical shutdown and recovery path. | `multipleStopStartCycles` | 3 start/stop cycles work (fresh process each) | | `appForceStopCleansUpService` | `am force-stop` kills both processes | -### WatchForFileTest - -Tests the `waitForFile` suspending function. - -| Test | What it verifies | -|---|---| -| `returnsImmediatelyIfFileExists` | No suspension if file exists | -| `suspendsUntilFileIsCreated` | Suspends, resumes on file creation | -| `handlesFileCreatedAfterShortDelay` | 2s delay before creation | -| `cancellationStopsWatching` | Coroutine cancellation cleans up | -| `doesNotResumeForDifferentFile` | Only the target filename triggers resume | -| `createsParentDirectoryIfMissing` | Creates missing parent dirs | -| `handlesRapidFileCreateDelete` | TOCTOU scenario | - ## Writing new tests ### Instrumented tests