From b802475d4d014eb1fa71652570763fda2fe82e48 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 10 Apr 2026 14:20:41 +0000
Subject: [PATCH 1/3] Initial plan


From 1dda66bf5b65c8ba9560fe85e0ec86f20c8ba9c0 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 10 Apr 2026 14:25:14 +0000
Subject: [PATCH 2/3] Add unit tests for aligned_utf32_match and compute_diff
 code paths

Agent-Logs-Url: https://github.com/zotonic/diffy/sessions/bab29408-b264-43b5-8436-6af3fae6e96a

Co-authored-by: mmzeeman <1024972+mmzeeman@users.noreply.github.com>
---
 test/diffy_tests.erl | 90 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)

diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl
index 42065a4..a4175d6 100644
--- a/test/diffy_tests.erl
+++ b/test/diffy_tests.erl
@@ -320,6 +320,96 @@ diff_test() ->
                             <<"cat mouse dog ">>)),
     ok.
 
+compute_diff_aligned_utf32_match_test() ->
+    %% "foo" is a common suffix of "barfoo" and "foo".
+    %% split_pre_and_suffix strips "foo" as the common suffix,
+    %% so compute_diff sees OldText = <<"bar">>, NewText = <<>>, yielding
+    %% [{delete, <<"bar">>}], which is combined with the equal suffix.
+    ?assertEqual([{delete, <<"bar">>}, {equal, <<"foo">>}],
+                 diffy:diff(<<"barfoo">>, <<"foo">>)),
+
+    %% "prefoo" and "foo" have no common prefix ('p' /= 'f').
+    %% split_pre_and_suffix strips "foo" as common suffix, so compute_diff sees
+    %% OldText = <<"pre">>, NewText = <<>>.
+    ?assertEqual([{delete, <<"pre">>}, {equal, <<"foo">>}],
+                 diffy:diff(<<"prefoo">>, <<"foo">>)),
+
+    %% ShortText ("test") found inside LongText ("a-test-b") — the
+    %% {Start, Length} arm of compute_diff fires directly.
+    ?assertEqual([{insert, <<"a-">>}, {equal, <<"test">>}, {insert, <<"-b">>}],
+                 diffy:diff(<<"test">>, <<"a-test-b">>)),
+
+    %% With non-ASCII: the first character of NewText is U+0100 (Ā), which
+    %% differs from the first character 't' of OldText ("test"), so
+    %% split_pre_and_suffix strips "test" as the common suffix, leaving
+    %% OldText = <<>>, NewText = <<$\x{100}/utf8>>.
+    ?assertEqual([{insert, <<$\x{100}/utf8>>}, {equal, <<"test">>}],
+                 diffy:diff(<<"test">>, <<$\x{100}/utf8, "test">>)),
+
+    ok.
+
+aligned_utf32_match_realignment_test() ->
+    %% This test verifies that the diff engine correctly handles cases where
+    %% byte-level pattern matching could hit a non-codepoint-boundary offset
+    %% before the true aligned match.
+    %%
+    %% In UTF-32, U+0100 (Ā) encodes as <<0,0,1,0>> and $a (U+0061) as
+    %% <<0,0,0,97>>. In the UTF-32 sequence for [U+0100, U+0061]:
+    %%   <<0,0,1,0, 0,0,0,97>>
+    %% the bytes <<0,0,0,97>> appear at byte offset 3 (misaligned) AND at
+    %% byte offset 4 (aligned). The aligned_utf32_match retry logic must skip
+    %% the misaligned hit at offset 3 and return the aligned match at offset 4.
+    %%
+    %% We verify this indirectly: without correct realignment the engine would
+    %% try to split the binary at a non-codepoint boundary, causing a crash or
+    %% wrong result. The correct result is [{insert, <<Ā/utf8>>}, {equal, <<"a">>}].
+    ?assertEqual(
+        [{insert, <<$\x{100}/utf8>>}, {equal, <<"a">>}],
+        diffy:diff(<<"a">>, <<$\x{100}/utf8, "a">>)),
+
+    %% A longer variant: two U+0100 codepoints precede "ab".
+    ?assertEqual(
+        [{insert, <<$\x{100}/utf8, $\x{100}/utf8>>}, {equal, <<"ab">>}],
+        diffy:diff(<<"ab">>, <<$\x{100}/utf8, $\x{100}/utf8, "ab">>)),
+
+    ok.
+
+compute_diff_test() ->
+    %% Branch 1: OldText is empty -> pure insert
+    ?assertEqual([{insert, <<"hello">>}], diffy:diff(<<>>, <<"hello">>)),
+
+    %% Branch 2: NewText is empty -> pure delete
+    ?assertEqual([{delete, <<"hello">>}], diffy:diff(<<"hello">>, <<>>)),
+
+    %% Branch 3: ShortText is a substring of LongText.
+    %% OldText longer, shared suffix: "foo" ends "barfoo" (common-suffix
+    %% stripping, then compute_diff on the remainder).
+    ?assertEqual([{delete, <<"bar">>}, {equal, <<"foo">>}],
+                 diffy:diff(<<"barfoo">>, <<"foo">>)),
+
+    %% OldText longer: "foobar" and "foo" share the common prefix "foo",
+    %% which split_pre_and_suffix strips. compute_diff then processes
+    %% "bar" vs <<>>, yielding [{delete,<<"bar">>}].
+    ?assertEqual([{equal, <<"foo">>}, {delete, <<"bar">>}],
+                 diffy:diff(<<"foobar">>, <<"foo">>)),
+
+    %% Branch 4a: single-codepoint ShortText with no match in LongText
+    %% -> [{delete, OldText}, {insert, NewText}]
+    ?assertEqual([{delete, <<"x">>}, {insert, <<"test">>}],
+                 diffy:diff(<<"x">>, <<"test">>)),
+    ?assertEqual([{delete, <<"test">>}, {insert, <<"x">>}],
+                 diffy:diff(<<"test">>, <<"x">>)),
+
+    %% Branch 4b: no substring relationship, length > 1 codepoint each —
+    %% falls through to try_half_match / bisect. Check round-trip correctness.
+    Old = <<"the cat sat on the mat">>,
+    New = <<"the dog sat on the rug">>,
+    Diffs = diffy:diff(Old, New),
+    ?assertEqual(Old, diffy:source_text(Diffs)),
+    ?assertEqual(New, diffy:destination_text(Diffs)),
+
+    ok.
+
 
 %%
 %% Helpers

From dd8596caf750671359c735291e7e39be53f85e93 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 10 Apr 2026 18:15:50 +0000
Subject: [PATCH 3/3] Fix misleading aligned_utf32_match test names and
 comments

The aligned_utf32_match function does not exist in src/diffy.erl.
The code operates on UTF-8 binaries via split_pre_and_suffix +
compute_diff.  Renamed tests and rewrote comments to describe the
actual code paths exercised.

Agent-Logs-Url: https://github.com/zotonic/diffy/sessions/be84a1d4-231a-43b9-b640-c7706cc46d43

Co-authored-by: mmzeeman <1024972+mmzeeman@users.noreply.github.com>
---
 test/diffy_tests.erl | 72 +++++++++++++++++++++++---------------------
 1 file changed, 37 insertions(+), 35 deletions(-)

diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl
index a4175d6..755281a 100644
--- a/test/diffy_tests.erl
+++ b/test/diffy_tests.erl
@@ -320,58 +320,60 @@ diff_test() ->
                             <<"cat mouse dog ">>)),
     ok.
 
-compute_diff_aligned_utf32_match_test() ->
-    %% "foo" is a common suffix of "barfoo" and "foo".
-    %% split_pre_and_suffix strips "foo" as the common suffix,
-    %% so compute_diff sees OldText = <<"bar">>, NewText = <<>>, yielding
-    %% [{delete, <<"bar">>}], which is combined with the equal suffix.
+compute_diff_substring_match_test() ->
+    %% The first two cases exercise the {Start, Length} branch of
+    %% compute_diff/3, where binary:match(LongText, ShortText) succeeds.
+    %% The last two strip a common suffix first, leaving an empty NewText.
+
+    %% "test" found inside "a-test-b": no common prefix ('t' /= 'a') and no
+    %% common suffix ('t' /= 'b'), so split_pre_and_suffix leaves both texts
+    %% unchanged.  compute_diff sees ShortText = <<"test">>, LongText =
+    %% <<"a-test-b">>, binary:match finds "test" at byte 2, producing:
+    %%   [{insert, <<"a-">>}, {equal, <<"test">>}, {insert, <<"-b">>}]
+    ?assertEqual([{insert, <<"a-">>}, {equal, <<"test">>}, {insert, <<"-b">>}],
+                 diffy:diff(<<"test">>, <<"a-test-b">>)),
+
+    %% Reversed direction: "a-test-b" vs "test".
+    ?assertEqual([{delete, <<"a-">>}, {equal, <<"test">>}, {delete, <<"-b">>}],
+                 diffy:diff(<<"a-test-b">>, <<"test">>)),
+
+    %% "barfoo" vs "foo": split_pre_and_suffix strips "foo" as common suffix,
+    %% compute_diff sees <<"bar">> vs <<>>, yielding [{delete, <<"bar">>}].
+    %% Combined with suffix: [{delete, <<"bar">>}, {equal, <<"foo">>}].
     ?assertEqual([{delete, <<"bar">>}, {equal, <<"foo">>}],
                  diffy:diff(<<"barfoo">>, <<"foo">>)),
 
-    %% "prefoo" and "foo" have no common prefix ('p' /= 'f').
-    %% split_pre_and_suffix strips "foo" as common suffix, so compute_diff sees
-    %% OldText = <<"pre">>, NewText = <<>>.
+    %% "prefoo" vs "foo": no common prefix ('p' /= 'f'), common suffix "foo"
+    %% stripped.  compute_diff sees <<"pre">> vs <<>>.
     ?assertEqual([{delete, <<"pre">>}, {equal, <<"foo">>}],
                  diffy:diff(<<"prefoo">>, <<"foo">>)),
 
-    %% ShortText ("test") found inside LongText ("a-test-b") — the
-    %% {Start, Length} arm of compute_diff fires directly.
-    ?assertEqual([{insert, <<"a-">>}, {equal, <<"test">>}, {insert, <<"-b">>}],
-                 diffy:diff(<<"test">>, <<"a-test-b">>)),
-
-    %% With non-ASCII: the first character of NewText is U+0100 (Ā), which
-    %% differs from the first character 't' of OldText ("test"), so
-    %% split_pre_and_suffix strips "test" as the common suffix, leaving
-    %% OldText = <<>>, NewText = <<$\x{100}/utf8>>.
-    ?assertEqual([{insert, <<$\x{100}/utf8>>}, {equal, <<"test">>}],
-                 diffy:diff(<<"test">>, <<$\x{100}/utf8, "test">>)),
-
     ok.
 
-aligned_utf32_match_realignment_test() ->
-    %% This test verifies that the diff engine correctly handles cases where
-    %% byte-level pattern matching could hit a non-codepoint-boundary offset
-    %% before the true aligned match.
-    %%
-    %% In UTF-32, U+0100 (Ā) encodes as <<0,0,1,0>> and $a (U+0061) as
-    %% <<0,0,0,97>>. In the UTF-32 sequence for [U+0100, U+0061]:
-    %%   <<0,0,1,0, 0,0,0,97>>
-    %% the bytes <<0,0,0,97>> appear at byte offset 3 (misaligned) AND at
-    %% byte offset 4 (aligned). The aligned_utf32_match retry logic must skip
-    %% the misaligned hit at offset 3 and return the aligned match at offset 4.
+diff_non_ascii_prefix_test() ->
+    %% Verify that diff/2 handles non-ASCII characters correctly when they
+    %% precede an ASCII common suffix.
     %%
-    %% We verify this indirectly: without correct realignment the engine would
-    %% try to split the binary at a non-codepoint boundary, causing a crash or
-    %% wrong result. The correct result is [{insert, <<Ā/utf8>>}, {equal, <<"a">>}].
+    %% diff(<<"a">>, <<Ā/utf8, "a">>):
+    %%   split_pre_and_suffix finds no common prefix (first bytes differ:
+    %%   97 vs 196), but "a" is a common suffix.  After stripping the suffix
+    %%   compute_diff sees <<>> vs <<196,128>> (Ā in UTF-8).
+    %%   Result: [{insert, <<Ā/utf8>>}, {equal, <<"a">>}].
     ?assertEqual(
         [{insert, <<$\x{100}/utf8>>}, {equal, <<"a">>}],
         diffy:diff(<<"a">>, <<$\x{100}/utf8, "a">>)),
 
-    %% A longer variant: two U+0100 codepoints precede "ab".
+    %% Longer variant: two Ā codepoints precede "ab".
+    %%   Common suffix "ab" stripped; compute_diff sees <<>> vs <<Ā/utf8, Ā/utf8>>.
     ?assertEqual(
         [{insert, <<$\x{100}/utf8, $\x{100}/utf8>>}, {equal, <<"ab">>}],
         diffy:diff(<<"ab">>, <<$\x{100}/utf8, $\x{100}/utf8, "ab">>)),
 
+    %% Non-ASCII: U+0100 (Ā) before "test".  No common prefix (196 /= 116),
+    %% common suffix "test" stripped; compute_diff sees <<>> vs <<Ā/utf8>>.
+    ?assertEqual([{insert, <<$\x{100}/utf8>>}, {equal, <<"test">>}],
+                 diffy:diff(<<"test">>, <<$\x{100}/utf8, "test">>)),
+
     ok.
 
 compute_diff_test() ->