From b802475d4d014eb1fa71652570763fda2fe82e48 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 10 Apr 2026 14:20:41 +0000 Subject: [PATCH 1/3] Initial plan From 1dda66bf5b65c8ba9560fe85e0ec86f20c8ba9c0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 10 Apr 2026 14:25:14 +0000 Subject: [PATCH 2/3] Add unit tests for aligned_utf32_match and compute_diff code paths Agent-Logs-Url: https://github.com/zotonic/diffy/sessions/bab29408-b264-43b5-8436-6af3fae6e96a Co-authored-by: mmzeeman <1024972+mmzeeman@users.noreply.github.com> --- test/diffy_tests.erl | 90 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl index 42065a4..a4175d6 100644 --- a/test/diffy_tests.erl +++ b/test/diffy_tests.erl @@ -320,6 +320,96 @@ diff_test() -> <<"cat mouse dog ">>)), ok. +compute_diff_aligned_utf32_match_test() -> + %% "foo" is a common suffix of "barfoo" and "foo". + %% split_pre_and_suffix strips "foo" as the common suffix, + %% so compute_diff sees OldText = <<"bar">>, NewText = <<>>, yielding + %% [{delete, <<"bar">>}], which is combined with the equal suffix. + ?assertEqual([{delete, <<"bar">>}, {equal, <<"foo">>}], + diffy:diff(<<"barfoo">>, <<"foo">>)), + + %% "prefoo" and "foo" have no common prefix ('p' /= 'f'). + %% split_pre_and_suffix strips "foo" as common suffix, so compute_diff sees + %% OldText = <<"pre">>, NewText = <<>>. + ?assertEqual([{delete, <<"pre">>}, {equal, <<"foo">>}], + diffy:diff(<<"prefoo">>, <<"foo">>)), + + %% ShortText ("test") found inside LongText ("a-test-b") — the + %% {Start, Length} arm of compute_diff fires directly. 
+ ?assertEqual([{insert, <<"a-">>}, {equal, <<"test">>}, {insert, <<"-b">>}], + diffy:diff(<<"test">>, <<"a-test-b">>)), + + %% With non-ASCII: the first character of NewText is U+0100 (Ā), which + %% differs from the first character 't' of OldText ("test"), so + %% split_pre_and_suffix strips "test" as the common suffix, leaving + %% OldText = <<>>, NewText = <<$\x{100}/utf8>>. + ?assertEqual([{insert, <<$\x{100}/utf8>>}, {equal, <<"test">>}], + diffy:diff(<<"test">>, <<$\x{100}/utf8, "test">>)), + + ok. + +aligned_utf32_match_realignment_test() -> + %% This test verifies that the diff engine correctly handles cases where + %% byte-level pattern matching could hit a non-codepoint-boundary offset + %% before the true aligned match. + %% + %% In UTF-32, U+0100 (Ā) encodes as <<0,0,1,0>> and $a (U+0061) as + %% <<0,0,0,97>>. In the UTF-32 sequence for [U+0100, U+0061]: + %% <<0,0,1,0, 0,0,0,97>> + %% the bytes <<0,0,0,97>> appear at byte offset 3 (misaligned) AND at + %% byte offset 4 (aligned). The aligned_utf32_match retry logic must skip + %% the misaligned hit at offset 3 and return the aligned match at offset 4. + %% + %% We verify this indirectly: without correct realignment the engine would + %% try to split the binary at a non-codepoint boundary, causing a crash or + %% wrong result. The correct result is [{insert, <<Ā/utf8>>}, {equal, <<"a">>}]. + ?assertEqual( + [{insert, <<$\x{100}/utf8>>}, {equal, <<"a">>}], + diffy:diff(<<"a">>, <<$\x{100}/utf8, "a">>)), + + %% A longer variant: two U+0100 codepoints precede "ab". + ?assertEqual( + [{insert, <<$\x{100}/utf8, $\x{100}/utf8>>}, {equal, <<"ab">>}], + diffy:diff(<<"ab">>, <<$\x{100}/utf8, $\x{100}/utf8, "ab">>)), + + ok. 
+ +compute_diff_test() -> + %% Branch 1: OldText is empty -> pure insert + ?assertEqual([{insert, <<"hello">>}], diffy:diff(<<>>, <<"hello">>)), + + %% Branch 2: NewText is empty -> pure delete + ?assertEqual([{delete, <<"hello">>}], diffy:diff(<<"hello">>, <<>>)), + + %% Branch 3: ShortText is a substring of LongText. + %% OldText longer: "foo" is the common suffix of "barfoo" (common-suffix stripping + %% then compute_diff on the remainder, "bar" vs <<>>). + ?assertEqual([{delete, <<"bar">>}, {equal, <<"foo">>}], + diffy:diff(<<"barfoo">>, <<"foo">>)), + + %% OldText longer: "foobar" and "foo" share the common prefix "foo", + %% which split_pre_and_suffix strips. compute_diff then processes + %% "bar" vs <<>>, yielding [{delete,<<"bar">>}]. + ?assertEqual([{equal, <<"foo">>}, {delete, <<"bar">>}], + diffy:diff(<<"foobar">>, <<"foo">>)), + + %% Branch 4a: single-codepoint ShortText with no match in LongText + %% -> [{delete, OldText}, {insert, NewText}] + ?assertEqual([{delete, <<"x">>}, {insert, <<"test">>}], + diffy:diff(<<"x">>, <<"test">>)), + ?assertEqual([{delete, <<"test">>}, {insert, <<"x">>}], + diffy:diff(<<"test">>, <<"x">>)), + + %% Branch 4b: no substring relationship, length > 1 codepoint each — + %% falls through to try_half_match / bisect. Check round-trip correctness. + Old = <<"the cat sat on the mat">>, + New = <<"the dog sat on the rug">>, + Diffs = diffy:diff(Old, New), + ?assertEqual(Old, diffy:source_text(Diffs)), + ?assertEqual(New, diffy:destination_text(Diffs)), + + ok. + %% %% Helpers From dd8596caf750671359c735291e7e39be53f85e93 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 10 Apr 2026 18:15:50 +0000 Subject: [PATCH 3/3] Fix misleading aligned_utf32_match test names and comments The aligned_utf32_match function does not exist in src/diffy.erl. The code operates on UTF-8 binaries via split_pre_and_suffix + compute_diff. 
Renamed tests and rewrote comments to describe the actual code paths exercised. Agent-Logs-Url: https://github.com/zotonic/diffy/sessions/be84a1d4-231a-43b9-b640-c7706cc46d43 Co-authored-by: mmzeeman <1024972+mmzeeman@users.noreply.github.com> --- test/diffy_tests.erl | 72 +++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl index a4175d6..755281a 100644 --- a/test/diffy_tests.erl +++ b/test/diffy_tests.erl @@ -320,58 +320,60 @@ diff_test() -> <<"cat mouse dog ">>)), ok. -compute_diff_aligned_utf32_match_test() -> - %% "foo" is a common suffix of "barfoo" and "foo". - %% split_pre_and_suffix strips "foo" as the common suffix, - %% so compute_diff sees OldText = <<"bar">>, NewText = <<>>, yielding - %% [{delete, <<"bar">>}], which is combined with the equal suffix. +compute_diff_substring_match_test() -> + %% Exercise the {Start, Length} branch of compute_diff/3 where + %% binary:match(LongText, ShortText) succeeds — i.e. the short text + %% is a verbatim substring of the long text. + + %% "test" found inside "a-test-b": no common prefix ('t' /= 'a') and no + %% common suffix ('t' /= 'b'), so split_pre_and_suffix leaves both texts + %% unchanged. compute_diff sees ShortText = <<"test">>, LongText = + %% <<"a-test-b">>, binary:match finds "test" at byte 2, producing: + %% [{insert, <<"a-">>}, {equal, <<"test">>}, {insert, <<"-b">>}] + ?assertEqual([{insert, <<"a-">>}, {equal, <<"test">>}, {insert, <<"-b">>}], + diffy:diff(<<"test">>, <<"a-test-b">>)), + + %% Reversed direction: "a-test-b" vs "test". + ?assertEqual([{delete, <<"a-">>}, {equal, <<"test">>}, {delete, <<"-b">>}], + diffy:diff(<<"a-test-b">>, <<"test">>)), + + %% "barfoo" vs "foo": split_pre_and_suffix strips "foo" as common suffix, + %% compute_diff sees <<"bar">> vs <<>>, yielding [{delete, <<"bar">>}]. + %% Combined with suffix: [{delete, <<"bar">>}, {equal, <<"foo">>}]. 
?assertEqual([{delete, <<"bar">>}, {equal, <<"foo">>}], diffy:diff(<<"barfoo">>, <<"foo">>)), - %% "prefoo" and "foo" have no common prefix ('p' /= 'f'). - %% split_pre_and_suffix strips "foo" as common suffix, so compute_diff sees - %% OldText = <<"pre">>, NewText = <<>>. + %% "prefoo" vs "foo": no common prefix ('p' /= 'f'), common suffix "foo" + %% stripped. compute_diff sees <<"pre">> vs <<>>. ?assertEqual([{delete, <<"pre">>}, {equal, <<"foo">>}], diffy:diff(<<"prefoo">>, <<"foo">>)), - %% ShortText ("test") found inside LongText ("a-test-b") — the - %% {Start, Length} arm of compute_diff fires directly. - ?assertEqual([{insert, <<"a-">>}, {equal, <<"test">>}, {insert, <<"-b">>}], - diffy:diff(<<"test">>, <<"a-test-b">>)), - - %% With non-ASCII: the first character of NewText is U+0100 (Ā), which - %% differs from the first character 't' of OldText ("test"), so - %% split_pre_and_suffix strips "test" as the common suffix, leaving - %% OldText = <<>>, NewText = <<$\x{100}/utf8>>. - ?assertEqual([{insert, <<$\x{100}/utf8>>}, {equal, <<"test">>}], - diffy:diff(<<"test">>, <<$\x{100}/utf8, "test">>)), - ok. -aligned_utf32_match_realignment_test() -> - %% This test verifies that the diff engine correctly handles cases where - %% byte-level pattern matching could hit a non-codepoint-boundary offset - %% before the true aligned match. - %% - %% In UTF-32, U+0100 (Ā) encodes as <<0,0,1,0>> and $a (U+0061) as - %% <<0,0,0,97>>. In the UTF-32 sequence for [U+0100, U+0061]: - %% <<0,0,1,0, 0,0,0,97>> - %% the bytes <<0,0,0,97>> appear at byte offset 3 (misaligned) AND at - %% byte offset 4 (aligned). The aligned_utf32_match retry logic must skip - %% the misaligned hit at offset 3 and return the aligned match at offset 4. +diff_non_ascii_prefix_test() -> + %% Verify that diff/2 handles non-ASCII characters correctly when they + %% precede an ASCII common suffix. 
%% - %% We verify this indirectly: without correct realignment the engine would - %% try to split the binary at a non-codepoint boundary, causing a crash or - %% wrong result. The correct result is [{insert, <<Ā/utf8>>}, {equal, <<"a">>}]. + %% diff(<<"a">>, <<Ā/utf8, "a">>): + %% split_pre_and_suffix finds no common prefix (first bytes differ: + %% 97 vs 196), but "a" is a common suffix. After stripping the suffix + %% compute_diff sees <<>> vs <<196,128>> (Ā in UTF-8). + %% Result: [{insert, <<Ā/utf8>>}, {equal, <<"a">>}]. ?assertEqual( [{insert, <<$\x{100}/utf8>>}, {equal, <<"a">>}], diffy:diff(<<"a">>, <<$\x{100}/utf8, "a">>)), - %% A longer variant: two U+0100 codepoints precede "ab". + %% Longer variant: two Ā codepoints precede "ab". + %% Common suffix "ab" stripped; compute_diff sees <<>> vs <<Ā/utf8, Ā/utf8>>. ?assertEqual( [{insert, <<$\x{100}/utf8, $\x{100}/utf8>>}, {equal, <<"ab">>}], diffy:diff(<<"ab">>, <<$\x{100}/utf8, $\x{100}/utf8, "ab">>)), + %% Non-ASCII: U+0100 (Ā) before "test". No common prefix (196 /= 116), + %% common suffix "test" stripped; compute_diff sees <<>> vs <<Ā/utf8>>. + ?assertEqual([{insert, <<$\x{100}/utf8>>}, {equal, <<"test">>}], + diffy:diff(<<"test">>, <<$\x{100}/utf8, "test">>)), + ok. compute_diff_test() ->