Skip to content

Commit a378941

Browse files
hyperpolymathclaude
andcommitted
docs(res-to-affine): corpus run + regex precision fixes (Refs #57)
First end-to-end run of the Phase-1 scanner against the estate's 491 deduplicated .res files (idaptik + gitbot-fleet) surfaced two high-impact false-positive sources in the top-level regexes: - `side-effect-import` fired on in-function `let _ = X.foo(...)`, ReScript's normal "discard a chained call's return value" idiom (1,181 hits, vast majority indented; LESSONS.md's anti-pattern is the top-level form only). - `mutable-global` fired on any `:=`, including local ref mutation inside function bodies (653 hits, none at column 0 in the corpus). Both regexes are now anchored at column 0, matching the LESSONS.md anti-pattern shape (module-load side effect, module-scoped mutable). `re_untyped_exn` also gains a `(^|...)` alternation so `raise` / `try` at column 0 is no longer missed. Total markers across the corpus: 2,146 → 348 (−84%). Files with any marker: 216 → 94. The 3-test snapshot suite is unchanged — the synthetic fixture covers column-0 cases only. Adds `tools/res-to-affine/CORPUS-RUN.md` (human report with corpus stats, regex change-log, top-hotspots table, and Phase-2 follow-ups) plus `CORPUS-RUN.json` (machine-readable sidecar). README links both. The trade-off: we no longer flag module-load side effects nested inside `module X = { ... }` blocks, nor `let x = ref(...)` top-level declarations paired with their later `:=`. Those are scoped to Phase 2's AST walker and listed in CORPUS-RUN.md as follow-ups. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 0b04511 commit a378941

4 files changed

Lines changed: 249 additions & 4 deletions

File tree

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
{
2+
"schema_version": 1,
3+
"tool": "res-to-affine",
4+
"tool_phase": 1,
5+
"run_date": "2026-05-21",
6+
"issue": "https://github.com/hyperpolymath/affinescript/issues/57",
7+
"merged_pr": "https://github.com/hyperpolymath/affinescript/pull/314",
8+
"corpus": {
9+
"total_files": 491,
10+
"by_repo": {
11+
"idaptik": 475,
12+
"gitbot-fleet": 16
13+
},
14+
"exclusions": [
15+
"**/node_modules/**",
16+
"**/_build/**",
17+
"**/lib/ocaml/**",
18+
"**/lib/bs/**"
19+
]
20+
},
21+
"before_regex_fix": {
22+
"total_markers": 2146,
23+
"files_with_markers": 216,
24+
"by_kind": {
25+
"side-effect-import": 1181,
26+
"mutable-global": 653,
27+
"raw-js": 198,
28+
"untyped-exception": 114
29+
}
30+
},
31+
"after_regex_fix": {
32+
"total_markers": 348,
33+
"files_with_markers": 94,
34+
"noise_reduction_pct": 84,
35+
"by_kind": {
36+
"raw-js": 198,
37+
"untyped-exception": 114,
38+
"side-effect-import": 36,
39+
"mutable-global": 0
40+
}
41+
},
42+
"regex_changes": {
43+
"re_side_effect_import": {
44+
"old": "^[ \\t]*let[ \\t]+_[ \\t]*=[ \\t]*[A-Z][a-zA-Z0-9_]*\\.",
45+
"new": "^let[ \\t]+_[ \\t]*=[ \\t]*[A-Z][a-zA-Z0-9_]*\\.",
46+
"rationale": "Column-0 anchor; in-function `let _ = X.f(...)` is normal ReScript idiom, not the LESSONS.md module-load anti-pattern."
47+
},
48+
"re_mutable_global": {
49+
"old": ":=",
50+
"new": "^[a-zA-Z_][a-zA-Z0-9_]*[ \\t]*:=",
51+
"rationale": "Column-0 anchor; intra-function `counter := ...` is local mutation, not module-scoped."
52+
},
53+
"re_untyped_exn": {
54+
"old": "Promise\\.catch\\|Js\\.Exn\\|[^a-zA-Z_]raise[ (]\\|[^a-zA-Z_]try[ {]",
55+
"new": "Promise\\.catch\\|Js\\.Exn\\|\\(^\\|[^a-zA-Z_]\\)raise[ (]\\|\\(^\\|[^a-zA-Z_]\\)try[ {]",
56+
"rationale": "Old form required a char before `raise`/`try`; column-0 occurrences were missed."
57+
}
58+
},
59+
"top_hotspots": [
60+
{ "markers": 31, "file": "idaptik/.affinescript-src/packages/affine-res/src/AffineScriptValue.res", "dominant_kind": "raw-js" },
61+
{ "markers": 29, "file": "idaptik/idaptik-ums/src/App.res", "dominant_kind": "raw-js" },
62+
{ "markers": 24, "file": "idaptik/src/Main.res", "dominant_kind": "side-effect-import" },
63+
{ "markers": 15, "file": "idaptik/src/app/screens/training/TrainingBase.res", "dominant_kind": "raw-js" },
64+
{ "markers": 13, "file": "idaptik/src/app/screens/WorldBuilder.res", "dominant_kind": "mixed" }
65+
],
66+
"phase_2_followups": [
67+
"Block-comment (`/* … */`) awareness in `is_codeish`",
68+
"Pair top-level `let x = ref(...)` with `x := ...`",
69+
"Nested-module side-effect detection via AST",
70+
"Callback-record + oversized-function detection (already scoped)",
71+
"String-literal anti-pattern filtering"
72+
]
73+
}

tools/res-to-affine/CORPUS-RUN.md

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
<!-- SPDX-License-Identifier: MPL-2.0 -->
2+
<!-- SPDX-FileCopyrightText: 2026 Jonathan D.A. Jewell -->
3+
4+
# `res-to-affine` — Phase-1 corpus run (2026-05-21)
5+
6+
First end-to-end exercise of the Phase-1 scanner against the estate's
7+
real ReScript surface. Run after [#314] (Phase-1 skeleton merge) on
8+
behalf of [#57]. Surfaced two high-impact false-positive sources in the
9+
top-level regexes and one false-negative; this run records the fixes
10+
and the new baseline.
11+
12+
[#57]: https://github.com/hyperpolymath/affinescript/issues/57
13+
[#314]: https://github.com/hyperpolymath/affinescript/pull/314
14+
15+
## Corpus
16+
17+
| Repo | Dedup'd `.res` files | Notes |
18+
|---|---:|---|
19+
| `idaptik` | 475 | excludes `lib/bs/**`, `lib/ocaml/**` (copies of `src/**`) |
20+
| `gitbot-fleet` | 16 | sustainabot + 3 SafeDOM examples; `lib/ocaml/**` excluded |
21+
| **Total** | **491** | run via `dune exec tools/res-to-affine/main.exe -- <path>` |
22+
23+
`_wt-lic1-idaptik/` is a git worktree of `idaptik` and was skipped.
24+
25+
## Findings
26+
27+
### Before the regex fix
28+
29+
| Kind | Hits |
30+
|---|---:|
31+
| `side-effect-import` | 1,181 |
32+
| `mutable-global` | 653 |
33+
| `raw-js` | 198 |
34+
| `untyped-exception` | 114 |
35+
| **Total** | **2,146** across 216 files |
36+
37+
Spot-check of the top file (`idaptik/src/app/devices/LaptopGUI.res`, 105
38+
markers) showed the 63 `side-effect-import` hits there were all
39+
**indented** `let _ = Container.addChild(parent, child)` — i.e. ReScript's
40+
normal "discard a chained call's return value" idiom inside a function
41+
body, not LESSONS.md's "module-load side effect" anti-pattern.
42+
43+
The same problem applied to `mutable-global`: line 496 of
44+
`NetworkDesktop.res` is ` currentY := currentY.contents +. ...`,
45+
local `ref` mutation inside a function, not a top-level module-scoped
46+
mutable.
47+
48+
### Fixes (this PR)
49+
50+
`scanner.ml`:
51+
52+
- `re_side_effect_import` — drop the leading `[ \t]*`; anchor at column
53+
0. Module-load side effects only fire at top level; in-function
54+
`let _ = X.f(...)` is a normal ReScript idiom.
55+
- `re_mutable_global` — replace bare `:=` with
56+
`^[a-zA-Z_][a-zA-Z0-9_]*[ \t]*:=`. Same logic: top-level assignment
57+
to a module-scoped ref is the anti-pattern; intra-function
58+
`counter := ...` is local mutation.
59+
- `re_untyped_exn` — replace `[^a-zA-Z_]raise[ (]` /
60+
`[^a-zA-Z_]try[ {]` with `\(^\|[^a-zA-Z_]\)…`. Previous form
61+
required at least one character before `raise` / `try`, missing
62+
column-0 occurrences.
63+
64+
The trade-off is that we no longer flag module-load side effects or
65+
top-level mutable globals nested inside a `module X = { ... }` block.
66+
Those are the Phase 2 (AST) walker's job. The benefit is a clean
67+
signal that the migrator can trust.
68+
69+
### After the regex fix
70+
71+
| Kind | Hits | Δ |
72+
|---|---:|---|
73+
| `raw-js` | 198 | unchanged |
74+
| `untyped-exception` | 114 | unchanged |
75+
| `side-effect-import` | 36 | −1,145 |
76+
| `mutable-global` | 0 | −653 |
77+
| **Total** | **348** across 94 files | **−84%** |
78+
79+
The 397 files (81%) now reporting zero markers do **not** mean those
80+
files are clean — Phase-1 only sees 4 of 6 anti-patterns, and the
81+
column-0 anchoring trades some recall for sharply improved precision.
82+
A "no findings" skeleton already calls this out:
83+
84+
> A clean `.res` surface does not mean the port is mechanical —
85+
> re-decomposition still applies (see PILOT.md upstream).
86+
87+
### Top remaining hot-spots
88+
89+
| Markers | File | Dominant kind |
90+
|---:|---|---|
91+
| 31 | `.affinescript-src/packages/affine-res/src/AffineScriptValue.res` | `raw-js` |
92+
| 29 | `idaptik-ums/src/App.res` | `raw-js` |
93+
| 24 | `src/Main.res` | `side-effect-import` (real, column-0 `let _ = X.constructor`) |
94+
| 15 | `src/app/screens/training/TrainingBase.res` | `raw-js` |
95+
| 13 | `src/app/screens/WorldBuilder.res` | mixed |
96+
97+
`AffineScriptValue.res` and `App.res` are heavy `%raw` users — the
98+
expected shape for a value-encoding interop layer. `Main.res`
99+
top-level `let _ = X.constructor` is the canonical
100+
"explicit-registration" candidate.
101+
102+
## Validation
103+
104+
- `dune test tools/res-to-affine/` — 3/3 OK (synthetic fixture
105+
unchanged; the snapshot covers column-0 cases only, so the regex
106+
tightening leaves the snapshot byte-identical).
107+
- Spot-check confirmed each `side-effect-import` hit is genuinely at
108+
column 0 (e.g. `idaptik/src/Main.res:5:let _ = PixiSound.sound`).
109+
- Spot-check confirmed each `raw-js` and `untyped-exception` hit
110+
corresponds to a real `%raw(…)` block or `try`/`Js.Exn`/
111+
`Promise.catch` occurrence.
112+
113+
## Follow-ups (deferred to Phase 2 / separate issues)
114+
115+
These are noise sources the AST walker can fix that the line-regex
116+
scanner cannot, plus a few small Phase-1 robustness items:
117+
118+
1. **Block-comment awareness**`is_codeish` filters `//` but not
119+
`/* … */`. A `%raw(…)` reference inside a block comment would
120+
currently flag.
121+
2. **Top-level ref declarations** — Phase 1 flags top-level `x := …`
122+
assignments but does not flag the top-level declaration
123+
`let x = ref(…)`. Phase 2 should surface both, paired.
124+
3. **Nested-module side effects**`module Foo = { let _ = X.bar }`
125+
is a module-load side effect inside a sub-module; needs AST.
126+
4. **Callback-record + oversized-function** — already scoped to
127+
Phase 2 in the ADR; surfacing here for completeness.
128+
5. **String-literal hits** — a `:=` or `%raw` inside a string literal
129+
would currently flag. Not seen in the corpus; flagged for awareness.
130+
131+
## Reproducing
132+
133+
```sh
134+
# from a clone of affinescript at the commit landing this PR:
135+
dune build tools/res-to-affine
136+
BIN=$PWD/_build/default/tools/res-to-affine/main.exe
137+
138+
# from a directory containing idaptik/ and gitbot-fleet/ clones:
139+
find idaptik gitbot-fleet -name '*.res' \
140+
-not -path '*/node_modules/*' \
141+
-not -path '*/_build/*' \
142+
-not -path '*/lib/ocaml/*' \
143+
-not -path '*/lib/bs/*' > corpus.txt
144+
145+
mkdir corpus-out
146+
while IFS= read -r f; do
147+
safe=$(echo "$f" | tr '/' '_')
148+
$BIN "$f" > "corpus-out/$safe.affine"
149+
done < corpus.txt
150+
151+
# tally
152+
cat corpus-out/*.affine \
153+
| grep -oE '\[(side-effect-import|raw-js|untyped-exception|mutable-global)\]' \
154+
| sort | uniq -c | sort -rn
155+
```

tools/res-to-affine/README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,14 @@ where re-decomposition is genuinely required.
104104

105105
Phase 3 is when the tool earns its keep on idaptik's 542 files.
106106

107+
## Corpus run
108+
109+
[`CORPUS-RUN.md`](CORPUS-RUN.md) records the first end-to-end run
110+
against the estate's 491 deduplicated `.res` files. It documents the
111+
false-positive sources that the corpus surfaced (and the regex fixes
112+
that landed alongside it) plus the Phase-2 follow-ups it identified.
113+
A machine-readable sidecar lives at [`CORPUS-RUN.json`](CORPUS-RUN.json).
114+
107115
## Testing
108116

109117
```sh

tools/res-to-affine/scanner.ml

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,18 +36,27 @@ type finding = {
3636
excerpt : string;
3737
}
3838

39-
(* ---- regexes, compiled once ---- *)
39+
(* ---- regexes, compiled once ----
40+
41+
The LESSONS.md anti-patterns these regexes target are *top-level*
42+
shapes (module-load side effects, module-scoped mutable globals).
43+
The two top-level regexes therefore anchor at column 0 to keep
44+
in-function `let _ = X.foo(...)` (a normal "discard the return
45+
value of a chained call" idiom) and intra-function `x := ...`
46+
(normal local ref mutation) from generating noise. The trade-off
47+
is that we miss module-load side effects nested inside a
48+
`module X = { ... }` block, which Phase 2's AST walker recovers. *)
4049

4150
let re_side_effect_import =
42-
Str.regexp "^[ \t]*let[ \t]+_[ \t]*=[ \t]*[A-Z][a-zA-Z0-9_]*\\."
51+
Str.regexp "^let[ \t]+_[ \t]*=[ \t]*[A-Z][a-zA-Z0-9_]*\\."
4352

4453
let re_raw_js = Str.regexp_case_fold "%raw\\|\\[%bs\\.raw"
4554

4655
let re_untyped_exn =
4756
Str.regexp
48-
"Promise\\.catch\\|Js\\.Exn\\|[^a-zA-Z_]raise[ (]\\|[^a-zA-Z_]try[ {]"
57+
"Promise\\.catch\\|Js\\.Exn\\|\\(^\\|[^a-zA-Z_]\\)raise[ (]\\|\\(^\\|[^a-zA-Z_]\\)try[ {]"
4958

50-
let re_mutable_global = Str.regexp ":="
59+
let re_mutable_global = Str.regexp "^[a-zA-Z_][a-zA-Z0-9_]*[ \t]*:="
5160

5261
let trim s =
5362
let n = String.length s in

0 commit comments

Comments
 (0)