From c65ec8c2857704ff6c40daadc3d73a951d00e486 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20M=C3=BCller?= <magnus@browser-use.com>
Date: Sun, 10 May 2026 15:41:07 +0000
Subject: [PATCH 1/2] docs: explain self-modifying harness workflow

---
 README.md                      |   5 +
 docs/self-modifying-harness.md | 207 +++++++++++++++++++++++++++++++++
 2 files changed, 212 insertions(+)
 create mode 100644 docs/self-modifying-harness.md
diff --git a/README.md b/README.md
index ab7f6a5d..d1d6a0ba 100644
--- a/README.md
+++ b/README.md
@@ -38,6 +38,11 @@ Click Allow when the per-attach popup appears (Chrome 144+):
 
 See [agent-workspace/domain-skills/](agent-workspace/domain-skills/) for example tasks.
 
+Read [Self-modifying Browser Harness](docs/self-modifying-harness.md) for the
+core workflow: when a task hits an edge case, the agent patches the harness and
+retries. The guide walks through file uploads, drag-and-drop, signature/canvas
+drawing, and coordinate-only controls.
+
 ## Free Browser Use Cloud browsers
 
 Stealth, sub-agents, or headless deployment.<br>
diff --git a/docs/self-modifying-harness.md b/docs/self-modifying-harness.md
new file mode 100644
index 00000000..f64f061f
--- /dev/null
+++ b/docs/self-modifying-harness.md
@@ -0,0 +1,207 @@
+# Self-modifying Browser Harness
+
+Browser Harness is intentionally small. The core helpers cover navigation,
+screenshots, raw CDP, JavaScript, coordinates, keyboard input, and files. When a
+site needs something more specific, the agent should extend the harness while it
+works instead of treating the site as an unsolved edge case.
+
+The loop is:
+
+1. Inspect the page with `capture_screenshot()` and `page_info()`.
+2. Try the smallest built-in primitive: `click_at_xy`, `type_text`,
+   `press_key`, `upload_file`, `js`, or raw `cdp`.
+3. If the missing operation is reusable, add a helper to
+   `agent-workspace/agent_helpers.py`.
+4. Call the helper from `browser-harness -c`.
+5. Verify with another screenshot or direct page state check.
+6. If the helper teaches a durable site-specific trick, save it as a domain
+   skill under `agent-workspace/domain-skills/<site>/`.
+
+This is the main difference from a fixed browser automation wrapper: the agent
+can patch the harness during the task, then immediately retry with the new
+primitive.
+
+## Example 1: file upload behind a styled button
+
+Many sites hide the real `<input type="file">` and expose a styled button. If a
+visual click opens the file picker, do not ask the user to handle it. Find the
+input and set the file through CDP.
+
+```python
+# browser-harness -c '...'
+from pathlib import Path
+
+new_tab("https://example.com/profile")
+wait_for_load()
+
+avatar = Path("/tmp/avatar.png").resolve()
+upload_file("input[type=file]", str(avatar))
+
+print(js("""
+const input = document.querySelector('input[type=file]');
+return input && input.files.length;
+"""))
+capture_screenshot("/tmp/uploaded.png", max_dim=1800)
+```
+
+If the selector is unstable, add a task helper:
+
+```python
+# agent-workspace/agent_helpers.py
+def upload_first_file_input(path):  # path: file to attach, passed to upload_file
+    inputs = js("""
+    return [...document.querySelectorAll('input[type=file]')]
+      .map((el, i) => ({ i, accept: el.accept, name: el.name, id: el.id }));
+    """)  # one summary dict per file input; only emptiness is checked here
+    if not inputs:
+        raise RuntimeError("no file input on page")  # fail before upload_file runs
+    upload_file("input[type=file]", path)
+```
+
+The helper is intentionally narrow: it solves the current page shape without
+adding a framework around uploads.
+
+## Example 2: drag-and-drop that ignores simple clicks
+
+For visual drag handles, use compositor-level mouse events first. These pass
+through shadow DOM, iframes, and framework wrappers because Chrome receives a
+real pointer sequence.
+
+```python
+# browser-harness -c '...'
+capture_screenshot("/tmp/before-drag.png", max_dim=1800)
+
+cdp("Input.dispatchMouseEvent", type="mousePressed", x=180, y=420,
+    button="left", clickCount=1)
+cdp("Input.dispatchMouseEvent", type="mouseMoved", x=340, y=420,
+    button="left")
+cdp("Input.dispatchMouseEvent", type="mouseMoved", x=520, y=420,
+    button="left")
+cdp("Input.dispatchMouseEvent", type="mouseReleased", x=520, y=420,
+    button="left", clickCount=1)
+
+capture_screenshot("/tmp/after-drag.png", max_dim=1800)
+```
+
+If the app only responds to DOM `dragover` / `drop`, add a helper that performs
+that site's expected event sequence:
+
+```python
+# agent-workspace/agent_helpers.py
+def dom_drag_between(source_selector, target_selector):  # both are CSS selectors
+    return js(f"""
+    const source = document.querySelector({source_selector!r});
+    const target = document.querySelector({target_selector!r});
+    if (!source || !target) throw new Error("missing drag source or target");
+
+    const data = new DataTransfer();
+    for (const type of ["dragstart", "dragenter", "dragover", "drop", "dragend"]) {{
+      const node = type === "dragstart" || type === "dragend" ? source : target;
+      node.dispatchEvent(new DragEvent(type, {{
+        bubbles: true,
+        cancelable: true,
+        dataTransfer: data
+      }}));
+    }}
+    return true;
+    """)  # selectors are embedded via repr(); script returns true after all five events
+```
+
+Use the DOM path only after the low-level pointer path fails; the pointer path
+is closer to what a user does.
+
+## Example 3: signature field or canvas drawing
+
+Canvas widgets often have no useful DOM children. Treat them as coordinate
+surfaces. Locate the canvas, draw through CDP mouse events, and verify with a
+screenshot or by checking the backing canvas pixels.
+
+```python
+# agent-workspace/agent_helpers.py
+def sign_canvas(selector="canvas", points=None):
+    points = points or [
+        (0.15, 0.65), (0.30, 0.35), (0.45, 0.70),
+        (0.62, 0.32), (0.80, 0.58),
+    ]
+    rect = js(f"""
+    const el = document.querySelector({selector!r});
+    if (!el) throw new Error("canvas not found");
+    const r = el.getBoundingClientRect();
+    return {{ x: r.x, y: r.y, w: r.width, h: r.height }};
+    """)
+    absolute = [
+        (rect["x"] + x * rect["w"], rect["y"] + y * rect["h"])
+        for x, y in points
+    ]
+    first = absolute[0]
+    cdp("Input.dispatchMouseEvent", type="mousePressed", x=first[0], y=first[1],
+        button="left", clickCount=1)
+    for x, y in absolute[1:]:
+        cdp("Input.dispatchMouseEvent", type="mouseMoved", x=x, y=y,
+            button="left")
+    last = absolute[-1]
+    cdp("Input.dispatchMouseEvent", type="mouseReleased", x=last[0], y=last[1],
+        button="left", clickCount=1)
+```
+
+Then call it:
+
+```python
+# browser-harness -c '...'
+sign_canvas("canvas.signature")
+capture_screenshot("/tmp/signature.png", max_dim=1800)
+```
+
+The reusable part is not the exact signature shape. It is the conversion from a
+canvas selector to real pointer coordinates.
+
+## Example 4: coordinate-only controls
+
+Some controls are visible but hostile to selectors: canvas maps, custom sliders,
+SVG editors, image crop boxes, or cross-origin iframe buttons. Use the
+screenshot as the source of truth, convert device pixels to CSS pixels when
+needed, and click the visible target.
+
+```python
+# browser-harness -c '...'
+path = capture_screenshot("/tmp/target.png", max_dim=1800)
+info = page_info()
+dpr = js("window.devicePixelRatio") or 1
+print({"screenshot": path, "viewport": info, "devicePixelRatio": dpr})
+
+# If the target was measured at image pixel (960, 540) on a 2x screenshot:
+click_at_xy(960 / dpr, 540 / dpr)
+capture_screenshot("/tmp/clicked.png", max_dim=1800)
+```
+
+If the same coordinate pattern repeats, add a helper with semantic names:
+
+```python
+# agent-workspace/agent_helpers.py
+def click_canvas_percent(selector, x_pct, y_pct):
+    rect = js(f"""
+    const el = document.querySelector({selector!r});
+    if (!el) throw new Error("target not found");
+    const r = el.getBoundingClientRect();
+    return {{ x: r.x, y: r.y, w: r.width, h: r.height }};
+    """)
+    click_at_xy(rect["x"] + rect["w"] * x_pct, rect["y"] + rect["h"] * y_pct)
+```
+
+Now the task can say `click_canvas_percent("canvas.map", 0.72, 0.41)` instead of
+carrying brittle absolute coordinates through the rest of the run.
+
+## What to commit back
+
+Commit helpers that are durable and general enough to help the next run:
+
+- site-specific login, upload, checkout, export, or scraping flows go in
+  `agent-workspace/domain-skills/<site>/`;
+- reusable interaction mechanics go in `interaction-skills/`;
+- one-off task glue can stay in `agent-workspace/agent_helpers.py` on the
+  user's machine.
+
+Do not commit secrets, user data, screenshots of private sessions, or fixed
+pixel coordinates from one viewport. Commit the durable map of how the site
+works, not the diary of one run.
+

From 2101f01c13e0cc9e247eec2ce1ca4c4c73e70c10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20M=C3=BCller?= <magnus@browser-use.com>
Date: Sun, 10 May 2026 15:47:47 +0000
Subject: [PATCH 2/2] docs: add edge-case benchmark page

---
 README.md                      |   2 +-
 SKILL.md                       |   1 +
 docs/edge-case-benchmark.html  | 294 +++++++++++++++++++++++++++++++++
 docs/self-modifying-harness.md |  34 +++-
 4 files changed, 329 insertions(+), 2 deletions(-)
 create mode 100644 docs/edge-case-benchmark.html

diff --git a/README.md b/README.md
index d1d6a0ba..6917ceb9 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ See [agent-workspace/domain-skills/](agent-workspace/domain-skills/) for example
 Read [Self-modifying Browser Harness](docs/self-modifying-harness.md) for the
 core workflow: when a task hits an edge case, the agent patches the harness and
 retries. The guide walks through file uploads, drag-and-drop, signature/canvas
-drawing, and coordinate-only controls.
+drawing, coordinate-only controls, and a local edge-case benchmark page.
 
 ## Free Browser Use Cloud browsers
 
diff --git a/SKILL.md b/SKILL.md
index 531d0ab9..c81f652d 100644
--- a/SKILL.md
+++ b/SKILL.md
@@ -81,6 +81,7 @@ If you start struggling with a specific mechanic while navigating, look in inter
 ## What actually works
 
 - Screenshots first: use capture_screenshot() to understand the current page quickly, find visible targets, and decide whether you need a click, a selector, or more navigation.
+- If an interaction helper is missing, treat that as editable harness work, not a task failure. Reproduce, inspect, add the smallest helper in `agent-workspace/agent_helpers.py`, retry, and keep the reusable pattern. See `docs/self-modifying-harness.md` and `docs/edge-case-benchmark.html` for upload, drag/drop, signature canvas, and coordinate-only examples.
 - Clicking: capture_screenshot() → read the pixel off the image → click_at_xy(x, y) → capture_screenshot() to verify. Suppress the Playwright-habit reflex of "locate first, then click" — no getBoundingClientRect, no selector hunt. Drop to DOM only when the target has no visible geometry (hidden input, 0×0 node). Hit-testing happens in Chrome's browser process, so clicks go through iframes / shadow DOM / cross-origin without extra work.
 - Bulk HTTP: http_get(url) + ThreadPoolExecutor. No browser for static pages (249 Netflix pages in 2.8s).
 - After goto: wait_for_load().
diff --git a/docs/edge-case-benchmark.html b/docs/edge-case-benchmark.html
new file mode 100644
index 00000000..8e37af84
--- /dev/null
+++ b/docs/edge-case-benchmark.html
@@ -0,0 +1,294 @@
+<!doctype html>
+<html lang="en">
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <title>Browser Harness edge-case benchmark</title>
+  <style>
+    :root {
+      color-scheme: light;
+      font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
+      background: #f7faf9;
+      color: #17211f;
+    }
+    body {
+      margin: 0;
+      padding: 32px;
+    }
+    main {
+      max-width: 1120px;
+      margin: 0 auto;
+    }
+    h1 {
+      font-size: 32px;
+      margin: 0 0 8px;
+    }
+    .intro {
+      margin: 0 0 28px;
+      color: #4b5d58;
+      max-width: 760px;
+      line-height: 1.45;
+    }
+    .grid {
+      display: grid;
+      grid-template-columns: repeat(2, minmax(0, 1fr));
+      gap: 18px;
+    }
+    .task {
+      background: #ffffff;
+      border: 1px solid #d8e3df;
+      border-radius: 8px;
+      padding: 20px;
+      min-height: 250px;
+      box-shadow: 0 1px 2px rgba(11, 36, 31, 0.06);
+    }
+    .task h2 {
+      font-size: 18px;
+      margin: 0 0 12px;
+    }
+    .status {
+      display: inline-flex;
+      align-items: center;
+      min-height: 28px;
+      margin-top: 14px;
+      padding: 4px 10px;
+      border-radius: 999px;
+      background: #edf2f0;
+      color: #4b5d58;
+      font-size: 13px;
+      font-weight: 700;
+    }
+    .status.pass {
+      background: #d9f8e7;
+      color: #126236;
+    }
+    .upload-label, .drag-card {
+      display: inline-flex;
+      align-items: center;
+      justify-content: center;
+      min-width: 150px;
+      height: 46px;
+      border-radius: 6px;
+      background: #17211f;
+      color: white;
+      font-weight: 800;
+      cursor: pointer;
+      user-select: none;
+    }
+    input[type="file"] {
+      position: absolute;
+      width: 1px;
+      height: 1px;
+      opacity: 0.01;
+    }
+    .drop-zone {
+      display: flex;
+      align-items: center;
+      justify-content: center;
+      min-height: 92px;
+      margin-top: 14px;
+      border: 2px dashed #87a49d;
+      border-radius: 8px;
+      color: #587069;
+      background: #f1f6f4;
+      font-weight: 700;
+    }
+    .drop-zone.hot {
+      border-color: #168a54;
+      background: #e4f9ed;
+      color: #126236;
+    }
+    canvas {
+      display: block;
+      width: 100%;
+      max-width: 420px;
+      height: 140px;
+      border: 1px solid #b9c8c3;
+      border-radius: 6px;
+      background: white;
+      touch-action: none;
+    }
+    #coordinate-canvas {
+      cursor: crosshair;
+    }
+    .summary {
+      margin-top: 22px;
+      padding: 16px 18px;
+      background: #eaf2ef;
+      border-radius: 8px;
+      color: #31423e;
+      font-family: ui-monospace, SFMono-Regular, Menlo, Consolas, monospace;
+      font-size: 14px;
+    }
+    @media (max-width: 760px) {
+      body {
+        padding: 18px;
+      }
+      .grid {
+        grid-template-columns: 1fr;
+      }
+    }
+  </style>
+</head>
+<body>
+  <main>
+    <h1>Browser Harness edge-case benchmark</h1>
+    <p class="intro">
+      Four small browser tasks that usually force agents to switch tactics:
+      hidden file input, drag/drop event payloads, canvas signature input, and a
+      coordinate-only canvas target.
+    </p>
+
+    <section class="grid">
+      <article class="task" data-task="file">
+        <h2>1. File upload</h2>
+        <label class="upload-label" for="file-input">Choose file</label>
+        <input id="file-input" type="file">
+        <div id="file-status" class="status">waiting for file</div>
+      </article>
+
+      <article class="task" data-task="drag">
+        <h2>2. Drag and drop</h2>
+        <div id="drag-source" class="drag-card" draggable="true">drag token</div>
+        <div id="drop-target" class="drop-zone">drop token here</div>
+        <div id="drag-status" class="status">waiting for drop</div>
+      </article>
+
+      <article class="task" data-task="signature">
+        <h2>3. Canvas signature</h2>
+        <canvas id="signature-canvas" width="420" height="140"></canvas>
+        <div id="signature-status" class="status">waiting for stroke</div>
+      </article>
+
+      <article class="task" data-task="coordinate">
+        <h2>4. Coordinate target</h2>
+        <canvas id="coordinate-canvas" width="420" height="140"></canvas>
+        <div id="coordinate-status" class="status">waiting for target click</div>
+      </article>
+    </section>
+
+    <pre id="summary" class="summary">window.bhBenchmarkResults() -> pending</pre>
+  </main>
+
+  <script>
+    (() => {
+    const state = {
+      file: false,
+      drag: false,
+      signature: false,
+      coordinate: false,
+    };
+
+    function mark(name, detail) {
+      state[name] = true;
+      const node = document.getElementById(`${name}-status`);
+      node.textContent = detail || "passed";
+      node.classList.add("pass");
+      renderSummary();
+    }
+
+    function renderSummary() {
+      document.getElementById("summary").textContent =
+        JSON.stringify(window.bhBenchmarkResults(), null, 2);
+    }
+
+    window.bhBenchmarkResults = () => ({
+      ...state,
+      passed: Object.values(state).every(Boolean),
+    });
+
+    document.getElementById("file-input").addEventListener("change", (event) => {
+      const file = event.target.files && event.target.files[0];
+      if (file && file.name) mark("file", `uploaded: ${file.name}`);
+    });
+
+    const source = document.getElementById("drag-source");
+    const target = document.getElementById("drop-target");
+    source.addEventListener("dragstart", (event) => {
+      event.dataTransfer.setData("text/plain", "browser-harness-token");
+    });
+    target.addEventListener("dragenter", () => target.classList.add("hot"));
+    target.addEventListener("dragover", (event) => event.preventDefault());
+    target.addEventListener("dragleave", () => target.classList.remove("hot"));
+    target.addEventListener("drop", (event) => {
+      event.preventDefault();
+      target.classList.remove("hot");
+      if (event.dataTransfer.getData("text/plain") === "browser-harness-token") {
+        target.textContent = "token dropped";
+        mark("drag", "dropped token");
+      }
+    });
+
+    const signature = document.getElementById("signature-canvas");
+    const sigCtx = signature.getContext("2d");
+    sigCtx.lineWidth = 4;
+    sigCtx.lineCap = "round";
+    sigCtx.strokeStyle = "#17211f";
+    let drawing = false;
+    let last = null;
+    let pathLength = 0;
+    let pointCount = 0;
+
+    function sigPoint(event) {
+      const rect = signature.getBoundingClientRect();
+      return {
+        x: (event.clientX - rect.left) * (signature.width / rect.width),
+        y: (event.clientY - rect.top) * (signature.height / rect.height),
+      };
+    }
+
+    function startSignature(event) {
+      drawing = true;
+      last = sigPoint(event);
+      pointCount = 1;
+      pathLength = 0;
+      sigCtx.beginPath();
+      sigCtx.moveTo(last.x, last.y);
+    }
+    function moveSignature(event) {
+      if (!drawing) return;
+      const next = sigPoint(event);
+      sigCtx.lineTo(next.x, next.y);
+      sigCtx.stroke();
+      pathLength += Math.hypot(next.x - last.x, next.y - last.y);
+      pointCount += 1;
+      last = next;
+    }
+    function endSignature() {
+      drawing = false;
+      if (pathLength > 120 && pointCount >= 5) mark("signature", "signature captured");
+    }
+    signature.addEventListener("pointerdown", startSignature);
+    signature.addEventListener("pointermove", moveSignature);
+    signature.addEventListener("pointerup", endSignature);
+    signature.addEventListener("mousedown", startSignature);
+    signature.addEventListener("mousemove", moveSignature);
+    window.addEventListener("mouseup", endSignature);
+
+    const coordinate = document.getElementById("coordinate-canvas");
+    const coordCtx = coordinate.getContext("2d");
+    const targetPoint = { x: 312, y: 82, radius: 18 };
+    coordCtx.fillStyle = "#f1f6f4";
+    coordCtx.fillRect(0, 0, coordinate.width, coordinate.height);
+    coordCtx.fillStyle = "#168a54";
+    coordCtx.beginPath();
+    coordCtx.arc(targetPoint.x, targetPoint.y, targetPoint.radius, 0, Math.PI * 2);
+    coordCtx.fill();
+    coordCtx.fillStyle = "#17211f";
+    coordCtx.font = "700 16px system-ui";
+    coordCtx.fillText("click the green target", 24, 42);
+
+    coordinate.addEventListener("click", (event) => {
+      const rect = coordinate.getBoundingClientRect();
+      const x = (event.clientX - rect.left) * (coordinate.width / rect.width);
+      const y = (event.clientY - rect.top) * (coordinate.height / rect.height);
+      if (Math.hypot(x - targetPoint.x, y - targetPoint.y) <= targetPoint.radius) {
+        mark("coordinate", "target clicked");
+      }
+    });
+
+    renderSummary();
+    })();
+  </script>
+</body>
+</html>
diff --git a/docs/self-modifying-harness.md b/docs/self-modifying-harness.md
index f64f061f..1a9412a5 100644
--- a/docs/self-modifying-harness.md
+++ b/docs/self-modifying-harness.md
@@ -191,6 +191,39 @@ def click_canvas_percent(selector, x_pct, y_pct):
 Now the task can say `click_canvas_percent("canvas.map", 0.72, 0.41)` instead of
 carrying brittle absolute coordinates through the rest of the run.
 
+## Edge-case benchmark
+
+Use [edge-case-benchmark.html](edge-case-benchmark.html) to test the four
+patterns on a single page: hidden file upload, drag/drop event payloads,
+signature canvas, and a coordinate-only canvas target.
+
+For a normal local browser session, run from the repository root:
+
+```python
+# browser-harness -c '...'
+from pathlib import Path
+
+new_tab(Path("docs/edge-case-benchmark.html").resolve().as_uri())
+wait_for_load()
+print(page_info())
+```
+
+On a remote Browser Use Cloud browser, local `file://` URLs are not available.
+Load the HTML into `about:blank` instead:
+
+```python
+# browser-harness -c '...'
+from pathlib import Path
+
+html = Path("docs/edge-case-benchmark.html").read_text()
+new_tab("about:blank")
+frame_id = cdp("Page.getFrameTree")["frameTree"]["frame"]["id"]
+cdp("Page.setDocumentContent", frameId=frame_id, html=html)
+```
+
+The benchmark exposes `window.bhBenchmarkResults()`; its `passed` field turns
+true once all four tasks pass, so an agent can verify without a visual guess.
+
 ## What to commit back
 
 Commit helpers that are durable and general enough to help the next run:
@@ -204,4 +237,3 @@ Commit helpers that are durable and general enough to help the next run:
 Do not commit secrets, user data, screenshots of private sessions, or fixed
 pixel coordinates from one viewport. Commit the durable map of how the site
 works, not the diary of one run.
-