From 40e0e09a85c08a1d0821541c7d94432287406122 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Mon, 6 Apr 2026 20:57:08 +0200
Subject: [PATCH 01/47] Added semantic diff cleanup

---
 src/diffy.erl        | 270 ++++++++++++++++++++++++++++++++++++++++++-
 test/diffy_tests.erl |  16 +--
 2 files changed, 272 insertions(+), 14 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index fc75a87..16cebb8 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -581,7 +581,8 @@ levenshtein([{equal, _Data}|T], Insertions, Deletions, Levenshtein) ->
 %
 -spec cleanup_merge(diffs()) -> diffs().
 cleanup_merge(Diffs) ->
-    cleanup_merge(Diffs, []). 
+    Diffs1 = cleanup_merge(Diffs, []),
+    canonicalize_edits(Diffs1, []).
 
 %% Done
 cleanup_merge([], Acc) ->
@@ -620,16 +621,273 @@ cleanup_merge([{equal, E1}=H|T], [{Op, I}, {equal, E2}|AccTail]=Acc) when Op =:=
 cleanup_merge([H|T], Acc) ->
     cleanup_merge(T, [H|Acc]).
 
+canonicalize_edits([{insert, I}, {delete, D} | T], Acc) ->
+    canonicalize_edits(T, [{insert, I}, {delete, D} | Acc]);
+canonicalize_edits([H | T], Acc) ->
+    canonicalize_edits(T, [H | Acc]);
+canonicalize_edits([], Acc) ->
+    lists:reverse(Acc).
+
 % @doc Do semantic cleanup of diffs
 %
 -spec cleanup_semantic(diffs()) -> diffs().
 cleanup_semantic(Diffs) ->
-    cleanup_semantic(Diffs, []).
+    Diffs1 = cleanup_semantic_breakpoints(Diffs),
+    Diffs2 = cleanup_merge(Diffs1),
+    Diffs3 = cleanup_semantic_lossless(Diffs2),
+    cleanup_semantic_overlaps(Diffs3).
+
+cleanup_semantic_breakpoints(Diffs) ->
+    case find_breakpoint(Diffs, [], 0, 0, 0, 0, undefined) of
+        {found, NewDiffs} -> cleanup_semantic_breakpoints(NewDiffs);
+        not_found -> Diffs
+    end.
 
-cleanup_semantic([], Acc) ->
-    lists:reverse(Acc);
-cleanup_semantic([H|T], Acc) ->
-    cleanup_semantic(T, [H|Acc]).
+find_breakpoint([], _Acc, _LI1, _LD1, _LI2, _LD2, _LE) ->
+    not_found;
+find_breakpoint([{equal, Data} | T], Acc, _LI1, _LD1, LI2, LD2, _LE) ->
+    find_breakpoint(T, [{equal, Data} | Acc], LI2, LD2, 0, 0, Data);
+find_breakpoint([{insert, Data} | T], Acc, LI1, LD1, LI2, LD2, LE) ->
+    NewLI2 = LI2 + text_size(Data),
+    case is_breakpoint(LE, LI1, LD1, NewLI2, LD2) of
+        true -> {found, apply_breakpoint(LE, Acc, [{insert, Data} | T])};
+        false -> find_breakpoint(T, [{insert, Data} | Acc], LI1, LD1, NewLI2, LD2, LE)
+    end;
+find_breakpoint([{delete, Data} | T], Acc, LI1, LD1, LI2, LD2, LE) ->
+    NewLD2 = LD2 + text_size(Data),
+    case is_breakpoint(LE, LI1, LD1, LI2, NewLD2) of
+        true -> {found, apply_breakpoint(LE, Acc, [{delete, Data} | T])};
+        false -> find_breakpoint(T, [{delete, Data} | Acc], LI1, LD1, LI2, NewLD2, LE)
+    end.
+
+is_breakpoint(undefined, _, _, _, _) -> false;
+is_breakpoint(LE, LI1, LD1, LI2, LD2) ->
+    LEN = text_size(LE),
+    LEN =< max(LI1, LD1) andalso LEN =< max(LI2, LD2).
+
+apply_breakpoint(LE, Acc, T) ->
+    replace_equality(LE, Acc, T).
+
+replace_equality(LE, [{equal, LE} | T_Acc], T) ->
+    lists:reverse(T_Acc) ++ [{delete, LE}, {insert, LE} | T];
+replace_equality(LE, [H | T_Acc], T) ->
+    replace_equality(LE, T_Acc, [H | T]).
+
+cleanup_semantic_lossless(Diffs) ->
+    cleanup_semantic_lossless(Diffs, []).
+
+cleanup_semantic_lossless([{equal, E1}, {Op, Edit}, {equal, E2} | T], Acc) when ?IS_INS_OR_DEL(Op) ->
+    {NewE1, NewEdit, NewE2} = slide_edit(E1, Edit, E2),
+    case NewE1 of
+        <<>> ->
+            cleanup_semantic_lossless(lists:reverse(Acc, [{Op, NewEdit}, {equal, NewE2} | T]), []);
+        _ ->
+            case NewE2 of
+                <<>> ->
+                    cleanup_semantic_lossless(lists:reverse(Acc, [{equal, NewE1}, {Op, NewEdit} | T]), []);
+                _ ->
+                    cleanup_semantic_lossless([{Op, NewEdit}, {equal, NewE2} | T], [{equal, NewE1} | Acc])
+            end
+    end;
+cleanup_semantic_lossless([H | T], Acc) ->
+    cleanup_semantic_lossless(T, [H | Acc]);
+cleanup_semantic_lossless([], Acc) ->
+    lists:reverse(Acc).
+
+slide_edit(E1, Edit, E2) ->
+    Suffix = common_suffix(E1, Edit),
+    {E1_1, Edit_1, E2_1} = case Suffix of
+        <<>> -> {E1, Edit, E2};
+        _ ->
+            SLen = size(Suffix),
+            { binary:part(E1, 0, size(E1) - SLen),
+              <<Suffix/binary, (binary:part(Edit, 0, size(Edit) - SLen))/binary>>,
+              <<Suffix/binary, E2/binary>> }
+    end,
+    find_best_slide(E1_1, Edit_1, E2_1).
+
+find_best_slide(E1, Edit, E2) ->
+    Score = cleanup_semantic_score(E1, Edit) + cleanup_semantic_score(Edit, E2),
+    find_best_slide(E1, Edit, E2, Score, E1, Edit, E2).
+
+find_best_slide(E1, Edit, E2, BestScore, BestE1, BestEdit, BestE2) ->
+    case can_slide_right(Edit, E2) of
+        {true, Char, RestEdit, RestE2} ->
+            NewE1 = <<E1/binary, Char/binary>>,
+            NewEdit = <<RestEdit/binary, Char/binary>>,
+            NewE2 = RestE2,
+            NewScore = cleanup_semantic_score(NewE1, NewEdit) + cleanup_semantic_score(NewEdit, NewE2),
+            if
+                NewScore >= BestScore ->
+                    find_best_slide(NewE1, NewEdit, NewE2, NewScore, NewE1, NewEdit, NewE2);
+                true ->
+                    find_best_slide(NewE1, NewEdit, NewE2, BestScore, BestE1, BestEdit, BestE2)
+            end;
+        false ->
+            {BestE1, BestEdit, BestE2}
+    end.
+
+can_slide_right(<<C/utf8, RestEdit/binary>>, <<C/utf8, RestE2/binary>>) ->
+    {true, <<C/utf8>>, RestEdit, RestE2};
+can_slide_right(_, _) ->
+    false.
+
+cleanup_semantic_score(<<>>, _) -> 6;
+cleanup_semantic_score(_, <<>>) -> 6;
+cleanup_semantic_score(One, Two) ->
+    Char1 = last_char(One),
+    Char2 = first_char(Two),
+    NonAlphaNumeric1 = is_non_alphanumeric(Char1),
+    NonAlphaNumeric2 = is_non_alphanumeric(Char2),
+    Whitespace1 = NonAlphaNumeric1 andalso is_whitespace(Char1),
+    Whitespace2 = NonAlphaNumeric2 andalso is_whitespace(Char2),
+    LineBreak1 = Whitespace1 andalso is_linebreak(Char1),
+    LineBreak2 = Whitespace2 andalso is_linebreak(Char2),
+    BlankLine1 = LineBreak1 andalso is_blankline_end(One),
+    BlankLine2 = LineBreak2 andalso is_blankline_start(Two),
+    if
+        BlankLine1 orelse BlankLine2 -> 5;
+        LineBreak1 orelse LineBreak2 -> 4;
+        NonAlphaNumeric1 andalso (not Whitespace1) andalso Whitespace2 -> 3;
+        Whitespace1 orelse Whitespace2 -> 2;
+        NonAlphaNumeric1 orelse NonAlphaNumeric2 -> 1;
+        true -> 0
+    end.
+
+cleanup_semantic_overlaps(Diffs) ->
+    cleanup_semantic_overlaps(Diffs, []).
+
+cleanup_semantic_overlaps([{delete, Del}, {insert, Ins} | T], Acc) ->
+    Overlap1 = common_overlap(Del, Ins),
+    Overlap2 = common_overlap(Ins, Del),
+    if
+        Overlap1 >= Overlap2 ->
+            TDel = text_size(Del),
+            TIns = text_size(Ins),
+            case Overlap1 >= TDel / 2 orelse Overlap1 >= TIns / 2 of
+                true ->
+                    Common = substring_start(Ins, Overlap1),
+                    NewDel = substring_start(Del, TDel - Overlap1),
+                    NewIns = skip_chars(Ins, Overlap1),
+                    cleanup_semantic_overlaps([{insert, NewIns} | T], [{equal, Common}, {delete, NewDel} | Acc]);
+                false ->
+                    cleanup_semantic_overlaps([{insert, Ins} | T], [{delete, Del} | Acc])
+            end;
+        true ->
+            TDel = text_size(Del),
+            TIns = text_size(Ins),
+            case Overlap2 >= TDel / 2 orelse Overlap2 >= TIns / 2 of
+                true ->
+                    Common = substring_start(Del, Overlap2),
+                    NewIns = substring_start(Ins, TIns - Overlap2),
+                    NewDel = skip_chars(Del, Overlap2),
+                    cleanup_semantic_overlaps([{delete, NewDel} | T], [{equal, Common}, {insert, NewIns} | Acc]);
+                false ->
+                    cleanup_semantic_overlaps([{insert, Ins} | T], [{delete, Del} | Acc])
+            end
+    end;
+cleanup_semantic_overlaps([H | T], Acc) ->
+    cleanup_semantic_overlaps(T, [H | Acc]);
+cleanup_semantic_overlaps([], Acc) ->
+    lists:reverse(Acc).
+
+%% Helper functions for semantic cleanup
+
+common_overlap(<<>>, _) -> 0;
+common_overlap(_, <<>>) -> 0;
+common_overlap(Text1, Text2) ->
+    T1Len = text_size(Text1),
+    T2Len = text_size(Text2),
+    {T1, T2} = if
+        T1Len > T2Len -> {substring_end(Text1, T2Len), Text2};
+        T1Len < T2Len -> {Text1, substring_start(Text2, T1Len)};
+        true -> {Text1, Text2}
+    end,
+    TMin = min(T1Len, T2Len),
+    if
+        T1 =:= T2 -> TMin;
+        true -> common_overlap_loop(T1, T2, TMin, 0, 1)
+    end.
+
+common_overlap_loop(T1, T2, TMin, Best, Length) when Length =< TMin ->
+    Pattern = substring_end(T1, Length),
+    case binary:match(T2, Pattern) of
+        nomatch -> Best;
+        {FoundByteOffset, _} ->
+            FoundCharCount = text_size(binary:part(T2, 0, FoundByteOffset)),
+            NewLength = Length + FoundCharCount,
+            case NewLength > TMin of
+                true -> Best;
+                false ->
+                    case FoundCharCount =:= 0 orelse substring_end(T1, NewLength) =:= substring_start(T2, NewLength) of
+                        true ->
+                            common_overlap_loop(T1, T2, TMin, NewLength, NewLength + 1);
+                        false ->
+                            common_overlap_loop(T1, T2, TMin, Best, NewLength + 1)
+                    end
+            end
+    end;
+common_overlap_loop(_T1, _T2, _TMin, Best, _Length) ->
+    Best.
+
+first_char(<<C/utf8, _/binary>>) -> C;
+first_char(_) -> undefined.
+
+last_char(Bin) ->
+    last_char(Bin, undefined).
+last_char(<<C/utf8, Rest/binary>>, _Last) -> last_char(Rest, C);
+last_char(<<>>, Last) -> Last.
+
+substring_start(Bin, Len) ->
+    substring_start(Bin, Len, <<>>).
+substring_start(_, 0, Acc) -> Acc;
+substring_start(<<C/utf8, Rest/binary>>, Len, Acc) ->
+    substring_start(Rest, Len - 1, <<Acc/binary, C/utf8>>);
+substring_start(<<>>, _, Acc) -> Acc.
+
+substring_end(Bin, Len) ->
+    TotalLen = text_size(Bin),
+    if
+        TotalLen =< Len -> Bin;
+        true -> skip_chars(Bin, TotalLen - Len)
+    end.
+
+skip_chars(Bin, 0) -> Bin;
+skip_chars(<<_/utf8, Rest/binary>>, N) -> skip_chars(Rest, N - 1);
+skip_chars(<<>>, _) -> <<>>.
+
+is_non_alphanumeric(undefined) -> true;
+is_non_alphanumeric(C) ->
+    not ((C >= $a andalso C =< $z) orelse
+         (C >= $A andalso C =< $Z) orelse
+         (C >= $0 andalso C =< $9)).
+
+is_whitespace(undefined) -> false;
+is_whitespace(C) ->
+    case C of
+        $\s -> true;
+        $\t -> true;
+        $\n -> true;
+        $\r -> true;
+        $\f -> true;
+        $\v -> true;
+        _ -> false
+    end.
+
+is_linebreak(C) ->
+    C =:= $\n orelse C =:= $\r.
+
+is_blankline_end(Bin) ->
+    case re:run(Bin, <<"\n\r?\n$">> ) of
+        {match, _} -> true;
+        nomatch -> false
+    end.
+
+is_blankline_start(Bin) ->
+    case re:run(Bin, <<"^\r?\n\r?\n">> ) of
+        {match, _} -> true;
+        nomatch -> false
+    end.
 
 % @doc Do efficiency cleanup of diffs.
 %
diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl
index 42065a4..4fb2bcc 100644
--- a/test/diffy_tests.erl
+++ b/test/diffy_tests.erl
@@ -225,14 +225,14 @@ cleanup_semantic_test() ->
     ?assertEqual([{delete, <<"abc">>}, {insert, <<"ABC">>}, {equal, <<"1234">>}, {delete, <<"wxyz">>}], 
         cleanup_semantic([{delete, <<"abc">>}, {insert, <<"ABC">>}, {equal, <<"1234">>}, {delete, <<"wxyz">>}])),
 
-    % % Simple elimination.
-    % ?assertEqual([{delete, <<"abc">>}, {insert, <<"b">>}], 
-    %     cleanup_semantic([{delete, <<"a">>}, {equal, <<"b">>}, {delete, <<"c">>}])),
+    % Simple elimination.
+    ?assertEqual([{delete, <<"abc">>}, {insert, <<"b">>}], 
+        cleanup_semantic([{delete, <<"a">>}, {equal, <<"b">>}, {delete, <<"c">>}])),
 
-    % % Multiple eliminations.
-    % ?assertEqual([{delete, <<"AB_AB">>}, {insert, <<"1A2_1A2">>}], 
-    %     cleanup_semantic([{insert, <<"1">>}, {equal, <<"A">>}, {delete, <<"B">>}, {insert, <<"2">>}, 
-    %         {equal, <<"_">>}, {insert, <<"1">>}, {equal, <<"A">>}, {delete, <<"B">>}, {insert, <<"2">>}])),
+    % Multiple eliminations.
+    ?assertEqual([{delete, <<"AB_AB">>}, {insert, <<"1A2_1A2">>}], 
+        cleanup_semantic([{insert, <<"1">>}, {equal, <<"A">>}, {delete, <<"B">>}, {insert, <<"2">>}, 
+            {equal, <<"_">>}, {insert, <<"1">>}, {equal, <<"A">>}, {delete, <<"B">>}, {insert, <<"2">>}])),
 
     ok.
 
@@ -261,7 +261,7 @@ cleanup_efficiency_test() ->
         cleanup_efficiency([{delete, <<"ab">>}, {insert, <<"12">>}, {equal, <<"xyz">>}, {delete, <<"cd">>}, {insert, <<"34">>}])),
 
     % Three-edit elimination
-    ?assertEqual([{insert, <<"12x34">>}, {delete, <<"xcd">>}], 
+    ?assertEqual([{delete, <<"xcd">>}, {insert, <<"12x34">>}], 
         cleanup_efficiency([{insert, <<"12">>}, {equal, <<"x">>}, {delete, <<"cd">>}, {insert, <<"34">>}])),
 
     % Backpass elimination

From 8b504d5f26efd7760a51667b5a7497066c5efa46 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Tue, 7 Apr 2026 10:34:12 +0200
Subject: [PATCH 02/47] Upgraded deps, and use proper generator to make utf8
 binaries

---
 rebar.config         |  4 ++--
 rebar.lock           | 23 +++++++++++++++++------
 src/diffy.erl        | 31 ++++---------------------------
 test/diffy_tests.erl | 27 ++++++++++++++++++++++-----
 4 files changed, 45 insertions(+), 40 deletions(-)

diff --git a/rebar.config b/rebar.config
index adb9366..304974c 100644
--- a/rebar.config
+++ b/rebar.config
@@ -1,13 +1,13 @@
 {erl_opts, [debug_info, warn_unused, warn_shadow_vars]}.
 {deps, [
-    {zotonic_stdlib, "1.2.3"}
+    {zotonic_stdlib, "1.27.0"}
 ]}.
 
 
 {profiles, [
     {test, [
         {deps, [
-            {proper, "1.2.0"}
+            {proper, "1.5.0"}
         ]},
 
         {xref_checks, [
diff --git a/rebar.lock b/rebar.lock
index 05bb477..98901fd 100644
--- a/rebar.lock
+++ b/rebar.lock
@@ -1,11 +1,22 @@
 {"1.2.0",
-[{<<"proper">>,{pkg,<<"proper">>,<<"1.2.0">>},0},
- {<<"zotonic_stdlib">>,{pkg,<<"zotonic_stdlib">>,<<"1.2.3">>},0}]}.
+[{<<"cowlib">>,{pkg,<<"cowlib">>,<<"2.16.0">>},1},
+ {<<"qdate_localtime">>,{pkg,<<"qdate_localtime">>,<<"1.2.2">>},1},
+ {<<"ssl_verify_fun">>,{pkg,<<"ssl_verify_fun">>,<<"1.1.7">>},2},
+ {<<"tls_certificate_check">>,
+  {pkg,<<"tls_certificate_check">>,<<"1.31.0">>},
+  1},
+ {<<"zotonic_stdlib">>,{pkg,<<"zotonic_stdlib">>,<<"1.27.0">>},0}]}.
 [
 {pkg_hash,[
- {<<"proper">>, <<"1466492385959412A02871505434E72E92765958C60DBA144B43863554B505A4">>},
- {<<"zotonic_stdlib">>, <<"4A33B60C82379169C9934CCD1FC9E512CA16B922E131AD6B6D26E562F66DF9CC">>}]},
+ {<<"cowlib">>, <<"54592074EBBBB92EE4746C8A8846E5605052F29309D3A873468D76CDF932076F">>},
+ {<<"qdate_localtime">>, <<"43E1B20102F50A8B2A2BE7042C2F6BE989AD96CA2CC319DB5DF56E122E8873F6">>},
+ {<<"ssl_verify_fun">>, <<"354C321CF377240C7B8716899E182CE4890C5938111A1296ADD3EC74CF1715DF">>},
+ {<<"tls_certificate_check">>, <<"9A910B54D8CB96CC810CABF4C0129F21360F82022B20180849F1442A25CCBB04">>},
+ {<<"zotonic_stdlib">>, <<"36D6F7A1004DEE169A61ADB57FDE8175F39F59634B5FFFD4AA0C1D0985D2A74E">>}]},
 {pkg_hash_ext,[
- {<<"proper">>, <<"CBC3766C08337806741343D330BF4BCB826155D2141BE8514C4B02858AA19FD3">>},
- {<<"zotonic_stdlib">>, <<"4712DD7A0C0C600AFEDAFDA738D40FEBF10CFC2485E62D109361FCC190F7381A">>}]}
+ {<<"cowlib">>, <<"7F478D80D66B747344F0EA7708C187645CFCC08B11AA424632F78E25BF05DB51">>},
+ {<<"qdate_localtime">>, <<"A38D5F1C5AE14B22F471E442B262AECCAFB915B664C7C364443DC73179C50FDA">>},
+ {<<"ssl_verify_fun">>, <<"FE4C190E8F37401D30167C8C405EDA19469F34577987C76DDE613E838BBC67F8">>},
+ {<<"tls_certificate_check">>, <<"9D2B41B128D5507BD8AD93E1A998E06D0AB2F9A772AF343F4C00BF76C6BE1532">>},
+ {<<"zotonic_stdlib">>, <<"B9555F50717F2F8FBD3D4156CE7F4E2DF380441D942DE54789466940929B08C3">>}]}
 ].
diff --git a/src/diffy.erl b/src/diffy.erl
index 16cebb8..df712b4 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -53,7 +53,7 @@
 
 -type for_fun() :: fun((integer(), term()) -> {continue, term()} | {break, term()}).
 
--export_type([diffs/0]).
+-export_type([diff_op/0, diff/0, diffs/0]).
 
 -define(PATCH_MARGIN, 4).
 -define(PATCH_MAX_PATCH_LEN, 32).
@@ -388,7 +388,7 @@ diff_bisect(A, B) when is_binary(A) andalso is_binary(B) ->
 compute_diff_bisect1(A, B, M, N) ->
     %% TODO, add deadline... 
     
-    MaxD = int_ceil((M + N) / 2),
+    MaxD = ceil((M + N) / 2),
 
     VOffset = MaxD,
     VLength = 2 * MaxD,
@@ -1138,14 +1138,7 @@ common_suffix(Text1, Text2) ->
 
 % @doc Count the number of characters in a utf8 binary.
 text_size(Text) when is_binary(Text) ->
-    text_size(Text, 0).
-
-text_size(<<>>, Count) ->
-    Count;
-text_size(<<_C/utf8, Rest/binary>>, Count) ->
-    text_size(Rest, Count+1);
-text_size(_, _) ->
-    error(badarg).
+    string:length(Text).
 
 %%
 %% Array utilities
@@ -1153,12 +1146,7 @@ text_size(_, _) ->
 
 % @doc Create an array from a utf8 binary.
 array_from_binary(Bin) when is_binary(Bin) ->
-    array_from_binary(Bin, 0, array:new()).
-
-array_from_binary(<<>>, _N, Array) ->
-    array:fix(Array);
-array_from_binary(<<C/utf8, Rest/binary>>, N, Array) ->
-    array_from_binary(Rest, N+1, array:set(N, C, Array)).
+    array:from_list(unicode:characters_to_list(Bin, utf8)).
 
 % @doc Create a binary from an array containing unicode characters.
 binary_from_array(Start, End, Array) ->
@@ -1246,17 +1234,6 @@ repair_head(Bin) ->
     %% Illegal sequence, can't repair it.
     {<<>>, Bin}.
 
-
-%% This function can go away when we support OTP 20 and up.
-%%
-int_ceil(Number) ->
-    T = trunc(Number),
-    case (Number - T) of
-        Neg when Neg < 0 -> T;
-        Pos when Pos > 0 -> T + 1;
-        _ -> T
-    end.
-
 %%
 %% Tests
 %%
diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl
index 4fb2bcc..aff3b24 100644
--- a/test/diffy_tests.erl
+++ b/test/diffy_tests.erl
@@ -28,19 +28,19 @@
 %%
 
 prop_cleanup_merge() ->
-    ?FORALL(Diffs, diffy:diffs(),
+    ?FORALL(Diffs, list({diff_op(), proper_unicode:utf8()}),
         begin
             SourceText = diffy:source_text(Diffs),
             DestinationText = diffy:destination_text(Diffs),
 
             CleanDiffs = cleanup_merge(Diffs),
 
-            SourceText == diffy:source_text(CleanDiffs) andalso
-            DestinationText == diffy:destination_text(CleanDiffs)
+            SourceText == diffy:source_text(CleanDiffs)
+            andalso DestinationText == diffy:destination_text(CleanDiffs)
         end).
 
 prop_cleanup_efficiency() ->
-    ?FORALL(Diffs, diffy:diffs(),
+    ?FORALL(Diffs, list({diff_op(), proper_unicode:utf8()}),
         begin
             SourceText = diffy:source_text(Diffs),
             DestinationText = diffy:destination_text(Diffs),
@@ -51,6 +51,16 @@ prop_cleanup_efficiency() ->
             DestinationText == diffy:destination_text(EfficientDiffs)
         end).
 
+prop_cleanup_semantic() ->
+    ?FORALL(Diffs, list({diff_op(), proper_unicode:utf8()}),
+        begin
+            SourceText = diffy:source_text(Diffs),
+            DestinationText = diffy:destination_text(Diffs),
+            EfficientDiffs = cleanup_semantic(Diffs),
+            SourceText =:= diffy:source_text(EfficientDiffs) andalso
+            DestinationText =:= diffy:destination_text(EfficientDiffs)
+        end).
+
 html_like() ->
     proper_types:resize(200,
                         list(frequency([{70, range($a, $z)},       % letters
@@ -240,6 +250,10 @@ cleanup_efficiency_prop_test() ->
     ?assertEqual(true, proper:quickcheck(prop_cleanup_efficiency(), [{numtests, 500}, {to_file, user}])),
     ok.
 
+cleanup_semantic_prop_test() ->
+    ?assertEqual(true, proper:quickcheck(prop_cleanup_semantic(), [{numtests, 500}, {to_file, user}])),
+    ok.
+
 random_diffs_prop_test() ->
     ?assertEqual(true, proper:quickcheck(prop_make_diff(), [{numtests, 500}, {to_file, user}])),
     ok.
@@ -279,7 +293,7 @@ text_size_test() ->
     ?assertEqual(4, diffy:text_size(<<1046/utf8, 1011/utf8, 1022/utf8, 127/utf8>>)),
 
     %% Bad utf-8 input results in a badarg.
-    ?assertError(badarg, diffy:text_size(<<149,157,112,8>>)),
+    ?assertError({badarg, _}, diffy:text_size(<<149,157,112,8>>)),
 
     ok.
 
@@ -325,6 +339,9 @@ diff_test() ->
 %% Helpers
 %%
 
+diff_op() ->
+    oneof([insert, delete, equal]).
+
 pretty_html(Diffs) ->
     iolist_to_binary(diffy:pretty_html(Diffs)).
 

From 4a3afb30e2ed3bd153902861358d3184d0a09263 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Tue, 7 Apr 2026 10:37:55 +0200
Subject: [PATCH 03/47] Increased the number of tests proper does

---
 test/diffy_tests.erl | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl
index aff3b24..bc06050 100644
--- a/test/diffy_tests.erl
+++ b/test/diffy_tests.erl
@@ -32,11 +32,10 @@ prop_cleanup_merge() ->
         begin
             SourceText = diffy:source_text(Diffs),
             DestinationText = diffy:destination_text(Diffs),
-
             CleanDiffs = cleanup_merge(Diffs),
 
-            SourceText == diffy:source_text(CleanDiffs)
-            andalso DestinationText == diffy:destination_text(CleanDiffs)
+            SourceText =:= diffy:source_text(CleanDiffs)
+            andalso DestinationText =:= diffy:destination_text(CleanDiffs)
         end).
 
 prop_cleanup_efficiency() ->
@@ -44,11 +43,10 @@ prop_cleanup_efficiency() ->
         begin
             SourceText = diffy:source_text(Diffs),
             DestinationText = diffy:destination_text(Diffs),
-
             EfficientDiffs = cleanup_efficiency(Diffs),
 
-            SourceText == diffy:source_text(EfficientDiffs) andalso
-            DestinationText == diffy:destination_text(EfficientDiffs)
+            SourceText =:= diffy:source_text(EfficientDiffs)
+            andalso DestinationText =:= diffy:destination_text(EfficientDiffs)
         end).
 
 prop_cleanup_semantic() ->
@@ -57,8 +55,9 @@ prop_cleanup_semantic() ->
             SourceText = diffy:source_text(Diffs),
             DestinationText = diffy:destination_text(Diffs),
             EfficientDiffs = cleanup_semantic(Diffs),
-            SourceText =:= diffy:source_text(EfficientDiffs) andalso
-            DestinationText =:= diffy:destination_text(EfficientDiffs)
+
+            SourceText =:= diffy:source_text(EfficientDiffs)
+            andalso DestinationText =:= diffy:destination_text(EfficientDiffs)
         end).
 
 html_like() ->
@@ -247,19 +246,19 @@ cleanup_semantic_test() ->
     ok.
 
 cleanup_efficiency_prop_test() ->
-    ?assertEqual(true, proper:quickcheck(prop_cleanup_efficiency(), [{numtests, 500}, {to_file, user}])),
+    ?assertEqual(true, proper:quickcheck(prop_cleanup_efficiency(), [{numtests, 800}, {to_file, user}])),
     ok.
 
 cleanup_semantic_prop_test() ->
-    ?assertEqual(true, proper:quickcheck(prop_cleanup_semantic(), [{numtests, 500}, {to_file, user}])),
+    ?assertEqual(true, proper:quickcheck(prop_cleanup_semantic(), [{numtests, 800}, {to_file, user}])),
     ok.
 
 random_diffs_prop_test() ->
-    ?assertEqual(true, proper:quickcheck(prop_make_diff(), [{numtests, 500}, {to_file, user}])),
+    ?assertEqual(true, proper:quickcheck(prop_make_diff(), [{numtests, 800}, {to_file, user}])),
     ok.
 
 random_inner_diff_prop_test() ->
-    ?assertEqual(true, proper:quickcheck(prop_inner_diff(), [{numtests, 500}, {to_file, user}])),
+    ?assertEqual(true, proper:quickcheck(prop_inner_diff(), [{numtests, 800}, {to_file, user}])),
     ok.
 
 cleanup_efficiency_test() ->

From faed45a6904a5a94f0114a72602fbd2fde46124c Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Tue, 7 Apr 2026 10:59:31 +0200
Subject: [PATCH 04/47] Fix a utf8 matching problem which caused
 cleanup_semantic to result in different destination text output

---
 src/diffy.erl        | 71 +++++++++++++++++++++++++-------------------
 test/diffy_tests.erl | 13 ++++++--
 2 files changed, 51 insertions(+), 33 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index df712b4..c889369 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -762,27 +762,31 @@ cleanup_semantic_overlaps([{delete, Del}, {insert, Ins} | T], Acc) ->
     Overlap2 = common_overlap(Ins, Del),
     if
         Overlap1 >= Overlap2 ->
-            TDel = text_size(Del),
-            TIns = text_size(Ins),
-            case Overlap1 >= TDel / 2 orelse Overlap1 >= TIns / 2 of
-                true ->
-                    Common = substring_start(Ins, Overlap1),
-                    NewDel = substring_start(Del, TDel - Overlap1),
-                    NewIns = skip_chars(Ins, Overlap1),
+            TDel = size(Del),
+            TIns = size(Ins),
+            Overlap1BytesDel = overlap_to_bytes_end(Del, Overlap1),
+            Overlap1BytesIns = overlap_to_bytes_start(Ins, Overlap1),
+            if
+                Overlap1BytesDel >= TDel / 2 orelse Overlap1BytesIns >= TIns / 2 ->
+                    Common = binary:part(Ins, 0, Overlap1BytesIns),
+                    NewDel = binary:part(Del, 0, TDel - Overlap1BytesDel),
+                    NewIns = binary:part(Ins, Overlap1BytesIns, TIns - Overlap1BytesIns),
                     cleanup_semantic_overlaps([{insert, NewIns} | T], [{equal, Common}, {delete, NewDel} | Acc]);
-                false ->
+                true ->
                     cleanup_semantic_overlaps([{insert, Ins} | T], [{delete, Del} | Acc])
             end;
         true ->
-            TDel = text_size(Del),
-            TIns = text_size(Ins),
-            case Overlap2 >= TDel / 2 orelse Overlap2 >= TIns / 2 of
-                true ->
-                    Common = substring_start(Del, Overlap2),
-                    NewIns = substring_start(Ins, TIns - Overlap2),
-                    NewDel = skip_chars(Del, Overlap2),
+            TDel = size(Del),
+            TIns = size(Ins),
+            Overlap2BytesIns = overlap_to_bytes_end(Ins, Overlap2),
+            Overlap2BytesDel = overlap_to_bytes_start(Del, Overlap2),
+            if
+                Overlap2BytesIns >= TIns / 2 orelse Overlap2BytesDel >= TDel / 2 ->
+                    Common = binary:part(Ins, TIns - Overlap2BytesIns, Overlap2BytesIns),
+                    NewIns = binary:part(Ins, 0, TIns - Overlap2BytesIns),
+                    NewDel = binary:part(Del, Overlap2BytesDel, TDel - Overlap2BytesDel),
                     cleanup_semantic_overlaps([{delete, NewDel} | T], [{equal, Common}, {insert, NewIns} | Acc]);
-                false ->
+                true ->
                     cleanup_semantic_overlaps([{insert, Ins} | T], [{delete, Del} | Acc])
             end
     end;
@@ -793,17 +797,28 @@ cleanup_semantic_overlaps([], Acc) ->
 
 %% Helper functions for semantic cleanup
 
+overlap_to_bytes_start(_Bin, 0) -> 0;
+overlap_to_bytes_start(<<C/utf8, Rest/binary>>, N) ->
+    size(<<C/utf8>>) + overlap_to_bytes_start(Rest, N - 1).
+
+overlap_to_bytes_end(Bin, N) ->
+    Skip = text_size(Bin) - N,
+    skip_n_chars(Bin, Skip).
+
+skip_n_chars(Rest, 0) -> size(Rest);
+skip_n_chars(<<_/utf8, Rest/binary>>, N) ->
+    skip_n_chars(Rest, N - 1).
+
 common_overlap(<<>>, _) -> 0;
 common_overlap(_, <<>>) -> 0;
 common_overlap(Text1, Text2) ->
     T1Len = text_size(Text1),
     T2Len = text_size(Text2),
-    {T1, T2} = if
-        T1Len > T2Len -> {substring_end(Text1, T2Len), Text2};
-        T1Len < T2Len -> {Text1, substring_start(Text2, T1Len)};
-        true -> {Text1, Text2}
+    {T1, T2, TMin} = if
+        T1Len > T2Len -> {substring_end(Text1, T2Len), Text2, T2Len};
+        T1Len < T2Len -> {Text1, substring_start(Text2, T1Len), T1Len};
+        true -> {Text1, Text2, T1Len}
     end,
-    TMin = min(T1Len, T2Len),
     if
         T1 =:= T2 -> TMin;
         true -> common_overlap_loop(T1, T2, TMin, 0, 1)
@@ -839,22 +854,18 @@ last_char(<<C/utf8, Rest/binary>>, _Last) -> last_char(Rest, C);
 last_char(<<>>, Last) -> Last.
 
 substring_start(Bin, Len) ->
-    substring_start(Bin, Len, <<>>).
-substring_start(_, 0, Acc) -> Acc;
-substring_start(<<C/utf8, Rest/binary>>, Len, Acc) ->
-    substring_start(Rest, Len - 1, <<Acc/binary, C/utf8>>);
-substring_start(<<>>, _, Acc) -> Acc.
+    binary:part(Bin, 0, overlap_to_bytes_start(Bin, Len)).
 
 substring_end(Bin, Len) ->
     TotalLen = text_size(Bin),
     if
         TotalLen =< Len -> Bin;
-        true -> skip_chars(Bin, TotalLen - Len)
+        true -> 
+            SkipChars = TotalLen - Len,
+            SkipBytes = overlap_to_bytes_start(Bin, SkipChars),
+            binary:part(Bin, SkipBytes, size(Bin) - SkipBytes)
     end.
 
-skip_chars(Bin, 0) -> Bin;
-skip_chars(<<_/utf8, Rest/binary>>, N) -> skip_chars(Rest, N - 1);
-skip_chars(<<>>, _) -> <<>>.
 
 is_non_alphanumeric(undefined) -> true;
 is_non_alphanumeric(C) ->
diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl
index bc06050..81c8f71 100644
--- a/test/diffy_tests.erl
+++ b/test/diffy_tests.erl
@@ -239,12 +239,19 @@ cleanup_semantic_test() ->
         cleanup_semantic([{delete, <<"a">>}, {equal, <<"b">>}, {delete, <<"c">>}])),
 
     % Multiple eliminations.
-    ?assertEqual([{delete, <<"AB_AB">>}, {insert, <<"1A2_1A2">>}], 
-        cleanup_semantic([{insert, <<"1">>}, {equal, <<"A">>}, {delete, <<"B">>}, {insert, <<"2">>}, 
+    ?assertEqual([{delete, <<"AB_AB">>}, {insert, <<"1A2_1A2">>}],
+        cleanup_semantic([{insert, <<"1">>}, {equal, <<"A">>}, {delete, <<"B">>}, {insert, <<"2">>},
             {equal, <<"_">>}, {insert, <<"1">>}, {equal, <<"A">>}, {delete, <<"B">>}, {insert, <<"2">>}])),
 
-    ok.
+    % Regression test for UTF-8 data loss in cleanup_semantic_overlaps
+    % Ins1 = <<0,32,204,128,0,0>> (size 6, text_size 5)
+    % Ins2 = <<0,0,0,0,0,0,0,0>> (size 8, text_size 8)
+    % Total Dest size 14, text_size 13
+    Diffs = [{delete,<<0,0,0,0,0,0,0,0>>},{insert,<<0,32,204,128,0,0>>},{insert,<<0,0,0,0,0,0,0,0>>}],
+    Cleaned = cleanup_semantic(Diffs),
+    ?assertEqual(diffy:destination_text(Diffs), diffy:destination_text(Cleaned)),
 
+    ok.
 cleanup_efficiency_prop_test() ->
     ?assertEqual(true, proper:quickcheck(prop_cleanup_efficiency(), [{numtests, 800}, {to_file, user}])),
     ok.

From 222bfadb1a28b83ff6a3489e7a3420b176a63620 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Tue, 7 Apr 2026 11:24:45 +0200
Subject: [PATCH 05/47] Robustly handle grapheme counting problems

---
 src/diffy.erl        | 16 +++++++---------
 test/diffy_tests.erl | 12 +++++++-----
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index c889369..e5c54c2 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -797,17 +797,15 @@ cleanup_semantic_overlaps([], Acc) ->
 
 %% Helper functions for semantic cleanup
 
-overlap_to_bytes_start(_Bin, 0) -> 0;
-overlap_to_bytes_start(<<C/utf8, Rest/binary>>, N) ->
-    size(<<C/utf8>>) + overlap_to_bytes_start(Rest, N - 1).
+overlap_to_bytes_start(Bin, N) ->
+    Prefix = string:slice(Bin, 0, N),
+    size(Prefix).
 
 overlap_to_bytes_end(Bin, N) ->
-    Skip = text_size(Bin) - N,
-    skip_n_chars(Bin, Skip).
-
-skip_n_chars(Rest, 0) -> size(Rest);
-skip_n_chars(<<_/utf8, Rest/binary>>, N) ->
-    skip_n_chars(Rest, N - 1).
+    TotalLen = text_size(Bin),
+    Skip = TotalLen - N,
+    Rest = string:slice(Bin, Skip),
+    size(Rest).
 
 common_overlap(<<>>, _) -> 0;
 common_overlap(_, <<>>) -> 0;
diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl
index 81c8f71..c3ebe45 100644
--- a/test/diffy_tests.erl
+++ b/test/diffy_tests.erl
@@ -23,6 +23,8 @@
 -include_lib("proper/include/proper.hrl").
 -include_lib("eunit/include/eunit.hrl").
 
+-define(NUM_TESTS, 800).
+
 %%
 %% Properties
 %%
@@ -219,7 +221,7 @@ cleanup_merge_test() ->
     ok.
 
 cleanup_merge_prop_test() ->
-    ?assertEqual(true, proper:quickcheck(prop_cleanup_merge(), [{numtests, 500}, {to_file, user}])),
+    ?assertEqual(true, proper:quickcheck(prop_cleanup_merge(), [{numtests, ?NUM_TESTS}, {to_file, user}])),
     ok.
 
 cleanup_semantic_test() ->
@@ -253,19 +255,19 @@ cleanup_semantic_test() ->
 
     ok.
 cleanup_efficiency_prop_test() ->
-    ?assertEqual(true, proper:quickcheck(prop_cleanup_efficiency(), [{numtests, 800}, {to_file, user}])),
+    ?assertEqual(true, proper:quickcheck(prop_cleanup_efficiency(), [{numtests, ?NUM_TESTS}, {to_file, user}])),
     ok.
 
 cleanup_semantic_prop_test() ->
-    ?assertEqual(true, proper:quickcheck(prop_cleanup_semantic(), [{numtests, 800}, {to_file, user}])),
+    ?assertEqual(true, proper:quickcheck(prop_cleanup_semantic(), [{numtests, ?NUM_TESTS}, {to_file, user}])),
     ok.
 
 random_diffs_prop_test() ->
-    ?assertEqual(true, proper:quickcheck(prop_make_diff(), [{numtests, 800}, {to_file, user}])),
+    ?assertEqual(true, proper:quickcheck(prop_make_diff(), [{numtests, ?NUM_TESTS}, {to_file, user}])),
     ok.
 
 random_inner_diff_prop_test() ->
-    ?assertEqual(true, proper:quickcheck(prop_inner_diff(), [{numtests, 800}, {to_file, user}])),
+    ?assertEqual(true, proper:quickcheck(prop_inner_diff(), [{numtests, ?NUM_TESTS}, {to_file, user}])),
     ok.
 
 cleanup_efficiency_test() ->

From 7c0ec5d57765c961320ffa4c349c8c952edd424c Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Tue, 7 Apr 2026 12:05:15 +0200
Subject: [PATCH 06/47] Robustly handle grapheme counting problems

---
 src/diffy.erl        | 26 +++++++++++++++++---------
 test/diffy_tests.erl |  4 ++--
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index e5c54c2..81787a0 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -1,9 +1,9 @@
 %% @author Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
-%% @copyright 2014-2019 Maas-Maarten Zeeman
+%% @copyright 2014-2026 Maas-Maarten Zeeman
 %%
 %% @doc Diffy, an erlang diff match and patch implementation 
 %%
-%% Copyright 2014-2019 Maas-Maarten Zeeman
+%% Copyright 2014-2026 Maas-Maarten Zeeman
 %%
 %% Licensed under the Apache License, Version 2.0 (the "License");
 %% you may not use this file except in compliance with the License.
@@ -799,13 +799,13 @@ cleanup_semantic_overlaps([], Acc) ->
 
 overlap_to_bytes_start(Bin, N) ->
     Prefix = string:slice(Bin, 0, N),
-    size(Prefix).
+    string:length(Prefix).
 
 overlap_to_bytes_end(Bin, N) ->
     TotalLen = text_size(Bin),
     Skip = TotalLen - N,
     Rest = string:slice(Bin, Skip),
-    size(Rest).
+    string:length(Rest).
 
 common_overlap(<<>>, _) -> 0;
 common_overlap(_, <<>>) -> 0;
@@ -829,10 +829,10 @@ common_overlap_loop(T1, T2, TMin, Best, Length) when Length =< TMin ->
         {FoundByteOffset, _} ->
             FoundCharCount = text_size(binary:part(T2, 0, FoundByteOffset)),
             NewLength = Length + FoundCharCount,
-            case NewLength > TMin of
-                true -> Best;
-                false ->
-                    case FoundCharCount =:= 0 orelse substring_end(T1, NewLength) =:= substring_start(T2, NewLength) of
+            if
+                NewLength > TMin -> Best;
+                true ->
+                    case substring_end(T1, NewLength) =:= substring_start(T2, NewLength) of
                         true ->
                             common_overlap_loop(T1, T2, TMin, NewLength, NewLength + 1);
                         false ->
@@ -1147,7 +1147,15 @@ common_suffix(Text1, Text2) ->
 
 % @doc Count the number of characters in a utf8 binary.
 text_size(Text) when is_binary(Text) ->
-    string:length(Text).
+    % string:length(Text).
+    text_size(Text, 0).
+
+text_size(<<>>, Count) ->
+    Count;
+text_size(<<_C/utf8, Rest/binary>>, Count) ->
+    text_size(Rest, Count+1);
+text_size(_, _) ->
+    error(badarg).
 
 %%
 %% Array utilities
diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl
index c3ebe45..3e6aac2 100644
--- a/test/diffy_tests.erl
+++ b/test/diffy_tests.erl
@@ -1,9 +1,9 @@
 %% @author Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
-%% @copyright 2014 Maas-Maarten Zeeman
+%% @copyright 2014-2026 Maas-Maarten Zeeman
 %%
 %% @doc Diffy, an erlang diff match and patch implementation 
 %%
-%% Copyright 2014 Maas-Maarten Zeeman
+%% Copyright 2014-2026 Maas-Maarten Zeeman
 %%
 %% Licensed under the Apache License, Version 2.0 (the "License");
 %% you may not use this file except in compliance with the License.

From 4907ec12e4e2fb2a33e904abd65edfda454fcc59 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Tue, 7 Apr 2026 13:29:08 +0200
Subject: [PATCH 07/47] Decode line diffs with a tuple instead of an array

---
 src/diffy.erl        | 85 ++++++++++++++++++++++++++------------------
 test/diffy_tests.erl |  2 +-
 2 files changed, 52 insertions(+), 35 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index 81787a0..8688f42 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -284,7 +284,7 @@ diff_linemode(Text1, Text2) ->
     Diffs = diff(CharText1, CharText2, false),
 
     %% Transform the diffs back to lines.
-    Diffs1 = chars_to_lines(Diffs, Lines),
+    Diffs1 = decode_lines(Diffs, Lines),
 
     Cleaned = cleanup_merge(Diffs1),
     cleanup_line_diff(Cleaned, <<>>, <<>>, [], []).
@@ -351,16 +351,15 @@ insert_line(Line, Lines, Dict, NextChar) ->
             {NextChar, NextChar+1, [Line|Lines], dict:store(Line, NextChar, Dict)}
     end.
 
-%%
-chars_to_lines(Diffs, Lines) when is_list(Lines) ->
-    A = array:from_list(Lines),
-    chars_to_lines(Diffs, A, []).
+decode_lines(Diffs, Lines) when is_list(Lines) ->
+    LinesTuple = list_to_tuple(Lines),
+    decode_lines(Diffs, LinesTuple, []).
 
-chars_to_lines([], _A, Acc) ->
+decode_lines([], _LinesTuple, Acc) ->
     lists:reverse(Acc);
-chars_to_lines([{Op, Data}|Rest], LineArray, Acc) ->
-    Data1 = << <<(array:get(C, LineArray))/binary>> || <<C/utf8>> <= Data >>,
-    chars_to_lines(Rest, LineArray, [{Op, Data1}|Acc]).
+decode_lines([{Op, Data} | Rest], LinesTuple, Acc) ->
+    Data1 = << <<(element(C + 1, LinesTuple))/binary>> || <<C/utf8>> <= Data >>,
+    decode_lines(Rest, LinesTuple, [{Op, Data1} | Acc]).
 
 
 % Find the 'middle snake' of a diff, split the problem in two
@@ -797,15 +796,39 @@ cleanup_semantic_overlaps([], Acc) ->
 
 %% Helper functions for semantic cleanup
 
+%% @doc Convert N codepoints from the start of Bin to a byte offset.
+%% This is consistent with text_size/1 which counts codepoints (not grapheme clusters).
 overlap_to_bytes_start(Bin, N) ->
-    Prefix = string:slice(Bin, 0, N),
-    string:length(Prefix).
+    codepoints_to_bytes(Bin, N, 0).
+
+codepoints_to_bytes(_Bin, 0, Acc) ->
+    Acc;
+codepoints_to_bytes(<<C/utf8, Rest/binary>>, N, Acc) ->
+    codepoints_to_bytes(Rest, N - 1, Acc + byte_size(<<C/utf8>>));
+codepoints_to_bytes(<<_C, Rest/binary>>, N, Acc) ->
+    %% Invalid utf-8 byte, count as 1
+    codepoints_to_bytes(Rest, N - 1, Acc + 1);
+codepoints_to_bytes(<<>>, _N, Acc) ->
+    Acc.
 
+%% @doc Convert N codepoints from the END of Bin to a byte count of that suffix.
 overlap_to_bytes_end(Bin, N) ->
+    SkipChars = text_size(Bin) - N,
+    SkipBytes = codepoints_to_bytes(Bin, SkipChars, 0),
+    byte_size(Bin) - SkipBytes.
+
+substring_start(Bin, Len) ->
+    binary:part(Bin, 0, overlap_to_bytes_start(Bin, Len)).
+
+substring_end(Bin, Len) ->
     TotalLen = text_size(Bin),
-    Skip = TotalLen - N,
-    Rest = string:slice(Bin, Skip),
-    string:length(Rest).
+    case TotalLen =< Len of
+        true -> Bin;
+        false ->
+            SkipChars = TotalLen - Len,
+            SkipBytes = codepoints_to_bytes(Bin, SkipChars, 0),
+            binary:part(Bin, SkipBytes, byte_size(Bin) - SkipBytes)
+    end.
 
 common_overlap(<<>>, _) -> 0;
 common_overlap(_, <<>>) -> 0;
@@ -817,9 +840,9 @@ common_overlap(Text1, Text2) ->
         T1Len < T2Len -> {Text1, substring_start(Text2, T1Len), T1Len};
         true -> {Text1, Text2, T1Len}
     end,
-    if
-        T1 =:= T2 -> TMin;
-        true -> common_overlap_loop(T1, T2, TMin, 0, 1)
+    case T1 =:= T2 of
+        true -> TMin;
+        false -> common_overlap_loop(T1, T2, TMin, 0, 1)
     end.
 
 common_overlap_loop(T1, T2, TMin, Best, Length) when Length =< TMin ->
@@ -846,25 +869,20 @@ common_overlap_loop(_T1, _T2, _TMin, Best, _Length) ->
 first_char(<<C/utf8, _/binary>>) -> C;
 first_char(_) -> undefined.
 
-last_char(Bin) ->
-    last_char(Bin, undefined).
-last_char(<<C/utf8, Rest/binary>>, _Last) -> last_char(Rest, C);
-last_char(<<>>, Last) -> Last.
-
-substring_start(Bin, Len) ->
-    binary:part(Bin, 0, overlap_to_bytes_start(Bin, Len)).
+last_char(<<>>) -> undefined;
+last_char(Bin) when is_binary(Bin) ->
+    last_char(Bin, byte_size(Bin) - 1).
 
-substring_end(Bin, Len) ->
-    TotalLen = text_size(Bin),
-    if
-        TotalLen =< Len -> Bin;
-        true -> 
-            SkipChars = TotalLen - Len,
-            SkipBytes = overlap_to_bytes_start(Bin, SkipChars),
-            binary:part(Bin, SkipBytes, size(Bin) - SkipBytes)
+last_char(Bin, Pos) ->
+    case binary:at(Bin, Pos) band 16#C0 of
+        16#80 ->
+            % continuation byte, keep scanning back
+            last_char(Bin, Pos - 1);
+        _ ->
+            <<_:Pos/binary, C/utf8, _/binary>> = Bin,
+            C
     end.
 
-
 is_non_alphanumeric(undefined) -> true;
 is_non_alphanumeric(C) ->
     not ((C >= $a andalso C =< $z) orelse
@@ -1147,7 +1165,6 @@ common_suffix(Text1, Text2) ->
 
 % @doc Count the number of characters in a utf8 binary.
 text_size(Text) when is_binary(Text) ->
-    % string:length(Text).
     text_size(Text, 0).
 
 text_size(<<>>, Count) ->
diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl
index 3e6aac2..27454ea 100644
--- a/test/diffy_tests.erl
+++ b/test/diffy_tests.erl
@@ -301,7 +301,7 @@ text_size_test() ->
     ?assertEqual(4, diffy:text_size(<<1046/utf8, 1011/utf8, 1022/utf8, 127/utf8>>)),
 
     %% Bad utf-8 input results in a badarg.
-    ?assertError({badarg, _}, diffy:text_size(<<149,157,112,8>>)),
+    ?assertError(badarg, diffy:text_size(<<149,157,112,8>>)),
 
     ok.
 

From 37d50fa14537acdb22a131df66dae8c515283221 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Tue, 7 Apr 2026 19:28:42 +0200
Subject: [PATCH 08/47] Remove array routines for finding the middle snake

---
 src/diffy.erl        | 100 +++++++++++++++----------------------------
 test/diffy_tests.erl |   2 +-
 2 files changed, 35 insertions(+), 67 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index 8688f42..5c96c6d 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -375,13 +375,15 @@ decode_lines([{Op, Data} | Rest], LinesTuple, Acc) ->
 %%      Array of diff tuples.
 %%    """
 diff_bisect(A, B) when is_binary(A) andalso is_binary(B) ->
-    ArrA = array_from_binary(A),
-    ArrB = array_from_binary(B),
-    try compute_diff_bisect1(ArrA, ArrB, array:size(ArrA), array:size(ArrB)) of
-        no_overlap -> [{delete, A}, {insert, B}] 
+    A32 = unicode:characters_to_binary(A, utf8, utf32),
+    B32 = unicode:characters_to_binary(B, utf8, utf32),
+    M = byte_size(A32) div 4,
+    N = byte_size(B32) div 4,
+    try compute_diff_bisect1(A32, B32, M, N) of
+        no_overlap -> [{delete, A}, {insert, B}]
     catch
-        throw:{overlap, A1, B1, X, Y} ->
-            diff_bisect_split(A1, B1, X, Y)
+        throw:{overlap, X, Y} ->
+            diff_bisect_split(A, B, A32, B32, X, Y)
     end.
 
 compute_diff_bisect1(A, B, M, N) ->
@@ -441,7 +443,7 @@ compute_diff_bisect1(A, B, M, N) ->
                                     if 
                                         X1_1 >= X2 ->
                                             % Overlap detected
-                                            throw({overlap, A, B, X1_1, Y1_1});
+                                            throw({overlap, X1_1, Y1_1});
                                         true ->
                                             {continue, S2_1}
                                     end;
@@ -491,7 +493,7 @@ compute_diff_bisect1(A, B, M, N) ->
                                         % Mirror x2 onto top-left coordinate system.
                                         X1 >= M - X2_1 ->
                                             % Overlap detected
-                                            throw({overlap, A, B, X1, Y1});
+                                            throw({overlap, X1, Y1});
                                         true ->
                                             {continue, S4_1}
                                     end;
@@ -507,12 +509,12 @@ compute_diff_bisect1(A, B, M, N) ->
     no_overlap.
 
 % @doc Split A and B and process the parts.
-diff_bisect_split(A, B, X, Y) ->
-    A1 = binary_from_array(0, X, A),
-    A2 = binary_from_array(0, Y, B),
+diff_bisect_split(A, B, A32, B32, X, Y) ->
+    A1 = utf32_prefix_to_utf8(A32, X),
+    A2 = utf32_prefix_to_utf8(B32, Y),
 
-    B1 = binary_from_array(X, array:size(A), A),
-    B2 = binary_from_array(Y, array:size(B), B),
+    B1 = binary:part(A, byte_size(A1), byte_size(A) - byte_size(A1)),
+    B2 = binary:part(B, byte_size(A2), byte_size(B) - byte_size(A2)),
 
     Diffs = diff(A1, A2, false),
     DiffsB = diff(B1, B2, false),
@@ -540,26 +542,11 @@ pretty_html([{Op, Data}|T], Acc) ->
 
 % @doc Compute the source text from a list of diffs.
 source_text(Diffs) ->
-    source_text(Diffs, <<>>).
-
-source_text([], Acc) ->
-    Acc;
-source_text([{insert, _Data}|T], Acc) ->
-    source_text(T, Acc);
-source_text([{_Op, Data}|T], Acc) ->
-    source_text(T, <<Acc/binary, Data/binary>>).
-    
+    iolist_to_binary([Data || {Op, Data} <- Diffs, Op =/= insert]).
 
 % @doc Compute the destination text from a list of diffs.
 destination_text(Diffs) ->
-    destination_text(Diffs, <<>>).
-    
-destination_text([], Acc) -> 
-    Acc;
-destination_text([{delete, _Data}|T], Acc) ->
-    destination_text(T, Acc);
-destination_text([{_Op, Data}|T], Acc) ->
-    destination_text(T, <<Acc/binary, Data/binary>>).
+    iolist_to_binary([Data || {Op, Data} <- Diffs, Op =/= delete]).
     
 % @doc Compute the Levenshtein distance, the number of inserted, deleted or substituted characters.
 levenshtein(Diffs) ->
@@ -1091,26 +1078,22 @@ is_suffix(A, B) ->
     size(A) =:= binary:longest_common_suffix([A, B]).
 
 %
-match_front(X1, Y1, A, M, B, N) when X1 < M andalso Y1 < N ->
-    case array:get(X1, A) =:= array:get(Y1, B) of
-        true -> 
-	    match_front(X1+1, Y1+1, A, M, B, N);
-        false -> 
-	    {X1, Y1}
-    end;
+match_front(X1, Y1, A32, M, B32, N) when X1 < M andalso Y1 < N ->
+    APart = binary:part(A32, X1 * 4, (M - X1) * 4),
+    BPart = binary:part(B32, Y1 * 4, (N - Y1) * 4),
+    Steps = binary:longest_common_prefix([APart, BPart]) div 4,
+    {X1 + Steps, Y1 + Steps};
 match_front(X1, Y1, _, _, _, _) ->
     {X1, Y1}.
 
 %
-match_reverse(X1, Y1, A, M, B, N) when X1 < M andalso Y1 < N ->
-    case array:get(M-X1-1, A) =:= array:get(N-Y1-1, B) of
-        true -> 
-	    match_reverse(X1+1, Y1+1, A, M, B, N);
-        false -> 
-	    {X1, Y1}
-    end;
-match_reverse(X1, Y1, _, _, _, _) ->
-    {X1, Y1}.
+match_reverse(X2, Y2, A32, M, B32, N) when X2 < M andalso Y2 < N ->
+    APart = binary:part(A32, 0, (M - X2) * 4),
+    BPart = binary:part(B32, 0, (N - Y2) * 4),
+    Steps = binary:longest_common_suffix([APart, BPart]) div 4,
+    {X2 + Steps, Y2 + Steps};
+match_reverse(X2, Y2, _, _, _, _) ->
+    {X2, Y2}.
 
 
 %% Implementation of the for statement
@@ -1175,22 +1158,13 @@ text_size(_, _) ->
     error(badarg).
 
 %%
-%% Array utilities
+%% UTF-32 utilities
 %%
 
-% @doc Create an array from a utf8 binary.
-array_from_binary(Bin) when is_binary(Bin) ->
-    array:from_list(unicode:characters_to_list(Bin, utf8)).
-
-% @doc Create a binary from an array containing unicode characters.
-binary_from_array(Start, End, Array) ->
-    binary_from_array(Start, End, Array, <<>>).
-    
-binary_from_array(N, End, Array, Acc) when N < End ->
-    C = array:get(N, Array),
-    binary_from_array(N+1, End, Array, <<Acc/binary, C/utf8>>);
-binary_from_array(_, _, _, Acc) ->
-    Acc.
+% @doc Convert the first N codepoints of a UTF-32BE binary to a UTF-8 binary.
+utf32_prefix_to_utf8(Utf32, CodepointCount) ->
+    Prefix32 = binary:part(Utf32, 0, CodepointCount * 4),
+    unicode:characters_to_binary(Prefix32, utf32, utf8).
 
 %% @doc Checks the trailing bytes for utf8 prefix bytes.
 repair_tail(<<>>) ->
@@ -1313,12 +1287,6 @@ for_test() ->
     ?assertEqual(0, for(0, 10, fun(I, _N) -> {break, I} end, undefined)),
     ok.
 
-array_test() ->
-    ?assertEqual(20, array:size(array_from_binary(<<"de apen eten bananen">>))),
-    ?assertEqual(<<"broodje aap">>, binary_from_array(0, 11, array_from_binary(<<"broodje aap">>))),
-    ?assertEqual(<<"aa">>, binary_from_array(0, 2, array_from_binary(<<"aap">>))),
-    ?assertEqual(<<"ap">>, binary_from_array(1, 3, array_from_binary(<<"aap">>))),
-    ok.
 
 diff_utf8_test() ->
     ?assertEqual([{equal, <<208,174, 208,189, 208,184, 208,186, 208,190, 208,180>>}], 
diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl
index 27454ea..a0b8849 100644
--- a/test/diffy_tests.erl
+++ b/test/diffy_tests.erl
@@ -23,7 +23,7 @@
 -include_lib("proper/include/proper.hrl").
 -include_lib("eunit/include/eunit.hrl").
 
--define(NUM_TESTS, 800).
+-define(NUM_TESTS, 500).
 
 %%
 %% Properties

From b822b7b751f92d7704326cec66532e46bb490624 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Tue, 7 Apr 2026 19:35:56 +0200
Subject: [PATCH 09/47] Update otp_versions for testing

---
 .github/workflows/test.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index ca85721..0fed54b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -18,14 +18,14 @@ jobs:
 
     strategy:
       matrix:
-        otp_version: [22,23,24]
+        otp_version: [25,26,27]
         os: [ubuntu-latest]
 
     container:
       image: erlang:${{ matrix.otp_version }}
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - name: Compile
         run: make
       - name: Test

From 6fc9c9be41a27d480d57acf3211b971cd5249046 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Tue, 7 Apr 2026 19:37:24 +0200
Subject: [PATCH 10/47] Update otp_versions for testing

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 0fed54b..7adb427 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -18,7 +18,7 @@ jobs:
 
     strategy:
       matrix:
-        otp_version: [25,26,27]
+        otp_version: [26,27,28]
         os: [ubuntu-latest]
 
     container:

From 336a95a7c3bb6819b4e34003f3ae48a2f91c6858 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Tue, 7 Apr 2026 19:57:36 +0200
Subject: [PATCH 11/47] Supress dialyzer warning

---
 src/diffy.app.src | 2 +-
 src/diffy.erl     | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/diffy.app.src b/src/diffy.app.src
index ec9688e..4e9d5b5 100644
--- a/src/diffy.app.src
+++ b/src/diffy.app.src
@@ -2,7 +2,7 @@
     {description, "Diff, match patch implementation"},
     {vsn, "git"},
     {registered, []},
-    {applications, [kernel, stdlib]},
+    {applications, [kernel, stdlib, zotonic_stdlib]},
     {env, []},
     {maintainers, ["Maas-Maarten Zeeman", "Zotonic Team"]},
     {licenses, ["Apache 2.0"]},
diff --git a/src/diffy.erl b/src/diffy.erl
index 5c96c6d..46234d8 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -79,6 +79,8 @@
     length2 = 0
 }).
 
+-dialyzer({no_match, for/5}).
+
 % @doc Compute the difference between two binary texts
 %
 -spec diff(unicode:unicode_binary(), unicode:unicode_binary()) -> diffs().
@@ -1105,10 +1107,8 @@ for(From, To, _Step, _Fun, State) when From >= To ->
     State;
 for(From, To, Step, Fun, State) ->
     case Fun(From, State) of
-        {continue, S1} ->
-            for(From + Step, To, Step, Fun, S1);
-        {break, S1} ->
-            S1
+        {continue, S1} -> for(From + Step, To, Step, Fun, S1);
+        {break, S1} -> S1
     end.
 
 split_pre_and_suffix(Text1, Text2) ->

From 1c2c8e8c38358d9af89ccb01923a424094d590f9 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Tue, 7 Apr 2026 20:04:46 +0200
Subject: [PATCH 12/47] Extra plt apss for dialyzer

---
 rebar.config | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/rebar.config b/rebar.config
index 304974c..6425ad6 100644
--- a/rebar.config
+++ b/rebar.config
@@ -26,3 +26,8 @@
         ]}
     ]}
 ]}.
+
+{dialyzer, [
+    {plt_extra_apps, [eunit]}
+]}.
+

From 82122f10d647635cc7bd3034945adc0fdc8430fb Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Tue, 7 Apr 2026 20:21:47 +0200
Subject: [PATCH 13/47] More dialyzer fixes in test

---
 rebar.config         | 8 ++++----
 test/diffy_tests.erl | 8 ++++++++
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/rebar.config b/rebar.config
index 6425ad6..08a0aed 100644
--- a/rebar.config
+++ b/rebar.config
@@ -22,12 +22,12 @@
         {dialyzer, [
           {warnings, [
               no_return
+          ]},
+          {plt_extra_apps, [
+              eunit,
+              proper
           ]}
         ]}
     ]}
 ]}.
 
-{dialyzer, [
-    {plt_extra_apps, [eunit]}
-]}.
-
diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl
index a0b8849..d016669 100644
--- a/test/diffy_tests.erl
+++ b/test/diffy_tests.erl
@@ -23,6 +23,14 @@
 -include_lib("proper/include/proper.hrl").
 -include_lib("eunit/include/eunit.hrl").
 
+-dialyzer({no_opaque, [
+    cleanup_merge_prop_test/0,
+    cleanup_efficiency_prop_test/0,
+    cleanup_semantic_prop_test/0,
+    random_inner_diff_prop_test/0,
+    random_diffs_prop_test/0
+]}).
+
 -define(NUM_TESTS, 500).
 
 %%

From 9e05a94aee8877a76a63a5a82bb3e54da25920df Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Tue, 7 Apr 2026 20:39:16 +0200
Subject: [PATCH 14/47] More strict dialyzer and xref settings

---
 rebar.config | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/rebar.config b/rebar.config
index 08a0aed..bbf548a 100644
--- a/rebar.config
+++ b/rebar.config
@@ -12,8 +12,10 @@
 
         {xref_checks, [
             undefined_function_calls,
+            undefined_functions,
             locals_not_used,
-            deprecated_function_calls
+            deprecated_function_calls,
+            deprecated_functions
         ]},
 
         {xref_ignores, [
@@ -21,7 +23,12 @@
 
         {dialyzer, [
           {warnings, [
-              no_return
+              no_return,
+              extra_return,
+              missing_return,
+              specdiffs,
+              overspecs,
+              underspecs
           ]},
           {plt_extra_apps, [
               eunit,

From 0af6dbbb1c736cae9ac3592293f56becad4eb4cb Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Tue, 7 Apr 2026 22:06:51 +0200
Subject: [PATCH 15/47] More strict dialyzer and xref settings

---
 rebar.config | 2 --
 1 file changed, 2 deletions(-)

diff --git a/rebar.config b/rebar.config
index bbf548a..03ca5c6 100644
--- a/rebar.config
+++ b/rebar.config
@@ -26,8 +26,6 @@
               no_return,
               extra_return,
               missing_return,
-              specdiffs,
-              overspecs,
               underspecs
           ]},
           {plt_extra_apps, [

From 51f623157b38101a5efa79ef21ab8bffb3e8c080 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Wed, 8 Apr 2026 09:25:07 +0200
Subject: [PATCH 16/47] Added make doc and introduce ex_doc

---
 Makefile                   | 13 +++++++++----
 rebar.config               | 10 ++++++++++
 src/diffy.erl              |  1 +
 src/diffy_simple_patch.erl |  3 ++-
 src/diffy_term.erl         |  3 ++-
 5 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/Makefile b/Makefile
index 82eaca1..b3e31ac 100644
--- a/Makefile
+++ b/Makefile
@@ -28,13 +28,18 @@ xref: $(REBAR)
 dialyzer: $(REBAR)
 	$(REBAR) as test dialyzer
 
-clean: $(REBAR)
+clean: $(REBAR) clean_doc
 	$(REBAR) clean
 
-distclean:
-	rm -rf _build
-	rm $(REBAR)
+clean_doc:
+	@rm -rf doc
 
+distclean: clean_doc
+	@rm -rf _build
+	@rm $(REBAR)
+
+doc: $(REBAR)
+	$(REBAR) ex_doc --output doc --formatter html
 
 # dializer 
 
diff --git a/rebar.config b/rebar.config
index 03ca5c6..b8dd080 100644
--- a/rebar.config
+++ b/rebar.config
@@ -1,8 +1,18 @@
 {erl_opts, [debug_info, warn_unused, warn_shadow_vars]}.
+
 {deps, [
     {zotonic_stdlib, "1.27.0"}
 ]}.
 
+{project_plugins, [rebar3_ex_doc]}.
+
+{hex, [{doc, ex_doc}]}.
+
+{ex_doc, [
+    {source_url, <<"https://github.com/zotonic/diffy">>},
+    {extras, [<<"README.md">>, <<"LICENSE">>]},
+    {main, <<"readme">>}
+]}.
 
 {profiles, [
     {test, [
diff --git a/src/diffy.erl b/src/diffy.erl
index 46234d8..1de37b2 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -2,6 +2,7 @@
 %% @copyright 2014-2026 Maas-Maarten Zeeman
 %%
 %% @doc Diffy, an erlang diff match and patch implementation 
+%% @end
 %%
 %% Copyright 2014-2026 Maas-Maarten Zeeman
 %%
diff --git a/src/diffy_simple_patch.erl b/src/diffy_simple_patch.erl
index 66a6ae8..29b5753 100644
--- a/src/diffy_simple_patch.erl
+++ b/src/diffy_simple_patch.erl
@@ -2,8 +2,9 @@
 %% @copyright 2014 Maas-Maarten Zeeman
 %%
 %% @doc Diffy, an erlang diff match and patch implementation 
+%% @end
 %%
-%% Copyright 2014 Maas-Maarten Zeeman
+%% Copyright 2014-2026 Maas-Maarten Zeeman
 %%
 %% Licensed under the Apache License, Version 2.0 (the "License");
 %% you may not use this file except in compliance with the License.
diff --git a/src/diffy_term.erl b/src/diffy_term.erl
index ed25387..2639bf6 100644
--- a/src/diffy_term.erl
+++ b/src/diffy_term.erl
@@ -3,8 +3,9 @@
 %%
 %% @doc Diffy, an erlang diff match and patch implementation 
 %%      Adapted from diffy.erl for simple diff on a list of Erlang terms
+%% @end
 %%
-%% Copyright 2014-2015 Maas-Maarten Zeeman, Marc Worrell
+%% Copyright 2014-2026 Maas-Maarten Zeeman, Marc Worrell
 %%
 %% Licensed under the Apache License, Version 2.0 (the "License");
 %% you may not use this file except in compliance with the License.

From 6e2a5dff7dd58e5f217aa061a6c3a2dad0a4568e Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Wed, 8 Apr 2026 12:26:07 +0200
Subject: [PATCH 17/47] Remove dep zotonic_stdlib and implement html escaping

---
 rebar.config         |  4 +---
 rebar.lock           | 23 +----------------------
 src/diffy.app.src    |  2 +-
 src/diffy.erl        | 24 +++++++++++++++++-------
 test/diffy_tests.erl |  5 +++++
 5 files changed, 25 insertions(+), 33 deletions(-)

diff --git a/rebar.config b/rebar.config
index b8dd080..ce7ffd3 100644
--- a/rebar.config
+++ b/rebar.config
@@ -1,8 +1,6 @@
 {erl_opts, [debug_info, warn_unused, warn_shadow_vars]}.
 
-{deps, [
-    {zotonic_stdlib, "1.27.0"}
-]}.
+{deps, [ ]}.
 
 {project_plugins, [rebar3_ex_doc]}.
 
diff --git a/rebar.lock b/rebar.lock
index 98901fd..57afcca 100644
--- a/rebar.lock
+++ b/rebar.lock
@@ -1,22 +1 @@
-{"1.2.0",
-[{<<"cowlib">>,{pkg,<<"cowlib">>,<<"2.16.0">>},1},
- {<<"qdate_localtime">>,{pkg,<<"qdate_localtime">>,<<"1.2.2">>},1},
- {<<"ssl_verify_fun">>,{pkg,<<"ssl_verify_fun">>,<<"1.1.7">>},2},
- {<<"tls_certificate_check">>,
-  {pkg,<<"tls_certificate_check">>,<<"1.31.0">>},
-  1},
- {<<"zotonic_stdlib">>,{pkg,<<"zotonic_stdlib">>,<<"1.27.0">>},0}]}.
-[
-{pkg_hash,[
- {<<"cowlib">>, <<"54592074EBBBB92EE4746C8A8846E5605052F29309D3A873468D76CDF932076F">>},
- {<<"qdate_localtime">>, <<"43E1B20102F50A8B2A2BE7042C2F6BE989AD96CA2CC319DB5DF56E122E8873F6">>},
- {<<"ssl_verify_fun">>, <<"354C321CF377240C7B8716899E182CE4890C5938111A1296ADD3EC74CF1715DF">>},
- {<<"tls_certificate_check">>, <<"9A910B54D8CB96CC810CABF4C0129F21360F82022B20180849F1442A25CCBB04">>},
- {<<"zotonic_stdlib">>, <<"36D6F7A1004DEE169A61ADB57FDE8175F39F59634B5FFFD4AA0C1D0985D2A74E">>}]},
-{pkg_hash_ext,[
- {<<"cowlib">>, <<"7F478D80D66B747344F0EA7708C187645CFCC08B11AA424632F78E25BF05DB51">>},
- {<<"qdate_localtime">>, <<"A38D5F1C5AE14B22F471E442B262AECCAFB915B664C7C364443DC73179C50FDA">>},
- {<<"ssl_verify_fun">>, <<"FE4C190E8F37401D30167C8C405EDA19469F34577987C76DDE613E838BBC67F8">>},
- {<<"tls_certificate_check">>, <<"9D2B41B128D5507BD8AD93E1A998E06D0AB2F9A772AF343F4C00BF76C6BE1532">>},
- {<<"zotonic_stdlib">>, <<"B9555F50717F2F8FBD3D4156CE7F4E2DF380441D942DE54789466940929B08C3">>}]}
-].
+[].
diff --git a/src/diffy.app.src b/src/diffy.app.src
index 4e9d5b5..ec9688e 100644
--- a/src/diffy.app.src
+++ b/src/diffy.app.src
@@ -2,7 +2,7 @@
     {description, "Diff, match patch implementation"},
     {vsn, "git"},
     {registered, []},
-    {applications, [kernel, stdlib, zotonic_stdlib]},
+    {applications, [kernel, stdlib]},
     {env, []},
     {maintainers, ["Maas-Maarten Zeeman", "Zotonic Team"]},
     {licenses, ["Apache 2.0"]},
diff --git a/src/diffy.erl b/src/diffy.erl
index 1de37b2..37b9dcf 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -525,23 +525,33 @@ diff_bisect_split(A, B, A32, B32, X, Y) ->
     Diffs ++ DiffsB.
 
 % @doc Convert the diffs into a pretty html report
--spec pretty_html(diffs()) -> iolist().
 pretty_html(Diffs) ->
     pretty_html(Diffs, []).
 
 pretty_html([], Acc) ->
     lists:reverse(Acc);
-pretty_html([{Op, Data}|T], Acc) ->
-    Text = z_html:escape(Data),
+pretty_html([{Op, Data} | T], Acc) ->
+    Safe = html_escape(Data),
     HTML = case Op of
         insert ->
-            [<<"<ins style='background:#e6ffe6;'>">>, Text, <<"</ins>">>];
+            [<<"<ins style='background:#e6ffe6;'>">>, Safe, <<"</ins>">>];
         delete ->
-            [<<"<del style='background:#ffe6e6;'>">>, Text, <<"</del>">>];
+            [<<"<del style='background:#ffe6e6;'>">>, Safe, <<"</del>">>];
         equal ->
-            [<<"<span>">>, Text, <<"</span>">>]
+            [<<"<span>">>, Safe, <<"</span>">>]
     end,
-    pretty_html(T, [HTML|Acc]).
+    pretty_html(T, [HTML | Acc]).
+
+html_escape(B) when is_binary(B) ->
+    binary:replace(B,
+                   [<<"&">>, <<"<">>, <<">">>, <<"\"">>, <<"'">>],
+                   fun (<<"&">>)   -> <<"&amp;">>;
+                       (<<"<">>)   -> <<"&lt;">>;
+                       (<<">">>)   -> <<"&gt;">>;
+                       (<<"\"">>)  -> <<"&quot;">>;
+                       (<<"'">>)   -> <<"&#39;">>
+                   end,
+                   [global]).
 
 % @doc Compute the source text from a list of diffs.
 source_text(Diffs) ->
diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl
index d016669..694db66 100644
--- a/test/diffy_tests.erl
+++ b/test/diffy_tests.erl
@@ -142,8 +142,13 @@ pretty_html_test() ->
     ?assertEqual(<<"<span>test</span>">>, pretty_html([{equal, <<"test">>}])),
     ?assertEqual(<<"<del style='background:#ffe6e6;'>foo</del><span>test</span>">>, 
         pretty_html([{delete, <<"foo">>}, {equal, <<"test">>}])),
+
     ?assertEqual(<<"<ins style='background:#e6ffe6;'>foo</ins><span>test</span>">>, 
         pretty_html([{insert, <<"foo">>}, {equal, <<"test">>}])),
+
+    %% escaping.
+    ?assertEqual(<<"<ins style='background:#e6ffe6;'>&lt;span&gt;foo&lt;/span&gt;</ins><span>&amp; &lt; &gt; &quot; &#39;</span>">>, 
+        pretty_html([{insert, <<"<span>foo</span>">>}, {equal, <<"& < > \" '">>}])),
     ok.
 
 source_text_test() ->

From 53d29fb73206743a7abee46188f6fb812402c624 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Wed, 8 Apr 2026 14:03:15 +0200
Subject: [PATCH 18/47] binary:replace with replacement function is only
 available from otp 27

---
 src/diffy.erl | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/src/diffy.erl b/src/diffy.erl
index 37b9dcf..dab2972 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -542,6 +542,7 @@ pretty_html([{Op, Data} | T], Acc) ->
     end,
     pretty_html(T, [HTML | Acc]).
 
+-if(?OTP_RELEASE >= 27).
 html_escape(B) when is_binary(B) ->
     binary:replace(B,
                    [<<"&">>, <<"<">>, <<">">>, <<"\"">>, <<"'">>],
@@ -552,7 +553,23 @@ html_escape(B) when is_binary(B) ->
                        (<<"'">>)   -> <<"&#39;">>
                    end,
                    [global]).
+-else.
+html_escape(B) when is_binary(B) ->
+    lists:foldl(fun({From, To}, Acc) ->
+                        binary:replace(Acc, From, To, [global])
+                end,
+                B,
+                [
+                 {<<"&">>,  <<"&amp;">>},
+                 {<<"<">>,  <<"&lt;">>},
+                 {<<">">>,  <<"&gt;">>},
+                 {<<"\"">>, <<"&quot;">>},
+                 {<<"'">>,  <<"&#39;">>}
+                ]).
+-endif.
+
 
+% Above function can be replaced with this when OTP 27 is the lowest supported 
 % @doc Compute the source text from a list of diffs.
 source_text(Diffs) ->
     iolist_to_binary([Data || {Op, Data} <- Diffs, Op =/= insert]).

From 8d05c22579c62d809847111d55495fb6d98f0ad6 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Thu, 9 Apr 2026 22:39:45 +0200
Subject: [PATCH 19/47] Changed the internal functions to use utf32 binaries
 instead of trying to work with utf8 and repair the result

---
 src/diffy.erl        | 877 ++++++++++++++++++++-----------------------
 test/diffy_tests.erl |  99 ++++-
 2 files changed, 503 insertions(+), 473 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index dab2972..57d04ce 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -23,6 +23,7 @@
 
 -export([
     diff/2,
+    diff/3,
     diff_bisect/2,
     diff_linemode/2,
 
@@ -52,16 +53,19 @@
 -type diff() :: {diff_op(), unicode:unicode_binary()}.
 -type diffs() :: list(diff()).
 
+-type diff_option() ::
+    semantic |
+    efficiency |
+    {efficiency, EditCost :: pos_integer()} |
+    no_linemode.
+
 -type for_fun() :: fun((integer(), term()) -> {continue, term()} | {break, term()}).
 
--export_type([diff_op/0, diff/0, diffs/0]).
+-export_type([diff_op/0, diff/0, diffs/0, diff_option/0]).
 
 -define(PATCH_MARGIN, 4).
--define(PATCH_MAX_PATCH_LEN, 32).
-
--define(MATCH_MAXBITS, 31).
-
 -define(IS_INS_OR_DEL(Op), (Op =:= insert orelse Op =:= delete)).
+-define(PHASH2_RANGE, (1 bsl 32)).
 
 -record(bisect_state, {
     k1start = 0, k1end = 0,
@@ -82,34 +86,72 @@
 
 -dialyzer({no_match, for/5}).
 
-% @doc Compute the difference between two binary texts
-%
+% @doc Compute the difference between two binary texts.
 -spec diff(unicode:unicode_binary(), unicode:unicode_binary()) -> diffs().
 diff(Text1, Text2) ->
-    diff(Text1, Text2, true).
+    diff(Text1, Text2, []).
+
+% @doc Compute the difference between two binary texts with options.
+%
+% Options:
+%   semantic             - run cleanup_semantic/1 on the result
+%   efficiency           - run cleanup_efficiency/1 on the result (default edit cost 4)
+%   {efficiency, Cost}   - run cleanup_efficiency/2 with a custom edit cost
+%   no_linemode          - disable the linemode optimization for large texts
+%
+% Cleanups are always applied in the correct order: semantic first, then efficiency.
+-spec diff(unicode:unicode_binary(), unicode:unicode_binary(), [diff_option()]) -> diffs().
+diff(Text1, Text2, Options) when is_list(Options) ->
+    CheckLines = not lists:member(no_linemode, Options),
+    T1 = to_utf32(Text1),
+    T2 = to_utf32(Text2),
+    Diffs32 = diff32(T1, T2, CheckLines),
+    Diffs1 = case lists:member(semantic, Options) of
+        true  -> cleanup_semantic32(Diffs32);
+        false -> Diffs32
+    end,
+    Diffs2 = case efficiency_opt(Options) of
+        none           -> Diffs1;
+        default        -> cleanup_efficiency32(Diffs1);
+        {custom, Cost} -> cleanup_efficiency32(Diffs1, Cost)
+    end,
+    %% Single conversion at the exit boundary.
+    [{Op, to_utf8(D)} || {Op, D} <- Diffs2].
+
+%% Extract the efficiency option, preferring {efficiency, Cost} over plain efficiency.
+efficiency_opt(Options) ->
+    case lists:keyfind(efficiency, 1, Options) of
+        {efficiency, Cost} -> {custom, Cost};
+        false ->
+            case lists:member(efficiency, Options) of
+                true  -> default;
+                false -> none
+            end
+    end.
 
-diff(<<>>, <<>>, _CheckLines) ->
+%% Internal diff working entirely in UTF-32 binaries.
+diff32(<<>>, <<>>, _CheckLines) ->
     [];
-diff(Text1, Text2, _CheckLines) when Text1 =:= Text2 ->
+diff32(Text1, Text2, _CheckLines) when Text1 =:= Text2 ->
     [{equal, Text1}];
-diff(Text1, Text2, CheckLines) ->
+diff32(Text1, Text2, CheckLines) ->
     {Prefix, MText1, MText2, Suffix} = split_pre_and_suffix(Text1, Text2),
 
     Diffs = compute_diff(MText1, MText2, CheckLines),
 
     Diffs1 = case Suffix of
-        <<>> -> Diffs;
-        _ -> Diffs ++ [{equal, Suffix}]
-    end,
+                 <<>> -> Diffs;
+                 _ -> Diffs ++ [{equal, Suffix}]
+             end,
 
-    Diffs2 = case Prefix of 
-        <<>> -> Diffs1;
-        _ -> [{equal, Prefix} | Diffs1]
-    end,
+    Diffs2 = case Prefix of
+                 <<>> -> Diffs1;
+                 _ -> [{equal, Prefix} | Diffs1]
+             end,
 
-    cleanup_merge(Diffs2).
+    cleanup_merge32(Diffs2).
 
-%% This assumes Text1 and Text2 don't have a common prefix
+%% This assumes Text1 and Text2 don't have a common prefix. Operates on UTF-32.
 compute_diff(<<>>, NewText, _CheckLines) ->
     [{insert, NewText}];
 compute_diff(OldText, <<>>, _CheckLines) ->
@@ -118,22 +160,23 @@ compute_diff(OldText, NewText, CheckLines) ->
     OldStNew = size(OldText) < size(NewText),
 
     {ShortText, LongText} = case OldStNew of
-        true -> {OldText, NewText};
-        false -> {NewText, OldText}
-    end,
+                                true -> {OldText, NewText};
+                                false -> {NewText, OldText}
+                            end,
 
     case binary:match(LongText, ShortText) of
         {Start, Length} ->
             <<Pre:Start/binary, _:Length/binary, Suf/binary>> = LongText,
             Op = diff_op(OldStNew),
-            [{Op, Pre}, {equal, ShortText}, {Op, Suf}]; 
+            [{Op, Pre}, {equal, ShortText}, {Op, Suf}];
         nomatch ->
-            case single_char(ShortText) of
+            %% In UTF-32, a single codepoint is exactly 4 bytes.
+            case size(ShortText) =:= 4 of
                 true ->
                     [{delete, OldText}, {insert, NewText}];
                 false ->
                     try_half_match(OldText, NewText, CheckLines)
-             end
+            end
     end.
 
 diff_op(true) -> insert;
@@ -143,14 +186,15 @@ diff_op(false) -> delete.
 try_half_match(OldText, NewText, CheckLines) ->
     case half_match(OldText, NewText) of
         {half_match, A1, A2, B1, B2, Common} ->
-            Diffs1 = diff(A1, B1, CheckLines),
-            Diffs2 = diff(A2, B2, CheckLines),
+            Diffs1 = diff32(A1, B1, CheckLines),
+            Diffs2 = diff32(A2, B2, CheckLines),
             Diffs1 ++ [{equal, Common} | Diffs2];
         undefined ->
             compute_diff1(OldText, NewText, CheckLines)
     end.
 
 %% Check if we can do a half-match diff, returns undefined if it is not advantageous.
+%% Operates on UTF-32 binaries — size comparisons are in bytes (4 bytes per codepoint).
 half_match(A, B) ->
     AGtB = size(A) > size(B),
     {Short, Long} = case AGtB of
@@ -158,28 +202,28 @@ half_match(A, B) ->
         false -> {A, B}
     end,
 
-    case text_smaller_than(Long, 4) orelse size(Short) * 2 < size(Long) of
+    %% text_smaller_than(Long, 4) becomes size(Long) < 4*4 in UTF-32.
+    case size(Long) < 16 orelse size(Short) * 2 < size(Long) of
         true ->
             %% No point in looking.
             undefined;
         false ->
-            %% Note: this could split through a utf8 byte sequence.
             Hm1 = half_match_i(Long, Short, (size(Long) + 3) div 4),
             Hm2 = half_match_i(Long, Short, (size(Long) + 1) div 2),
 
             %% Select the longest half-match.
             Hm = case {Hm1, Hm2} of
-                {undefined, undefined} -> 
-                    undefined;
-                {undefined, _} -> 
-                    Hm2;
-                {_, undefined} -> 
-                    Hm1;
-                {{half_match, _, _, _, _, C1}, {half_match, _, _, _, _, C2}} when size(C1) > size(C2) ->
-                    Hm1;
-                {_, _} ->
-                    Hm2
-            end,
+                     {undefined, undefined} -> 
+                         undefined;
+                     {undefined, _} -> 
+                         Hm2;
+                     {_, undefined} -> 
+                         Hm1;
+                     {{half_match, _, _, _, _, C1}, {half_match, _, _, _, _, C2}} when size(C1) > size(C2) ->
+                         Hm1;
+                     {_, _} ->
+                         Hm2
+                 end,
 
             %% Swap values if A was smaller than B
             case Hm of
@@ -193,19 +237,14 @@ half_match(A, B) ->
             end
     end.
 
-
 % Find the best common overlap at location I.
 half_match_i(Long, Short, I) ->
     {NewI, Seed} = seed(Long, I),
     case Seed of
-        <<>> -> 
-            undefined;
-        _ ->
-            best_common(Long, Short, Seed, NewI, 0, 
-                undefined, undefined, undefined, undefined, <<>>) 
+        <<>> -> undefined;
+        _ -> best_common(Long, Short, Seed, NewI, 0, <<>>, <<>>, <<>>, <<>>, <<>>) 
     end.
 
-
 %% Find the best common overlap inside two texts.
 best_common(Long, Short, Seed, SeedLoc, Start, 
         BestLongA, BestLongB, BestShortA, BestShortB, BestCommon) ->
@@ -251,45 +290,49 @@ best_common(Long, Short, Seed, SeedLoc, Start,
             end
     end.
 
-%% @doc Return the position of the next character.
-next_char(Bin, Pos) ->
-    <<_:Pos/binary, C/utf8, _Rest/binary>> = Bin,
-    %% The next char is at binary position...
-    Pos + size(<<C/utf8>>). 
+%% @doc Return the byte position of the next codepoint in a UTF-32 binary.
+next_char(_Bin, Pos) ->
+    Pos + 4.
 
-%% 
+%%
+%% In UTF-32 every codepoint is exactly 4 bytes, so any 4-byte-aligned slice
+%% is a valid codepoint boundary — no repair_head/repair_tail needed.
 seed(Long, Start) ->
     SeedSize = size(Long) div 4,
 
-    %% Note, need to split on utf8 character boundary here.
-    <<_Pre:Start/binary, Seed:SeedSize/binary, _Post/binary>> = Long,
-
-    %% Utf-8 repair the seed's head and tail. 
-    {Pre, Seed1} = repair_head(Seed),
-    {Seed2, _} = repair_tail(Seed1),
+    %% Align Start to a 4-byte (codepoint) boundary.
+    AlignedStart = (Start div 4) * 4,
+    <<_Pre:AlignedStart/binary, Seed:SeedSize/binary, _Post/binary>> = Long,
 
-    %% return the start position of the seed and the seed itself.
-    {Start - size(Pre), Seed2}.
+    {AlignedStart, Seed}.
 
 
 %% Line diff
 compute_diff1(Text1, Text2, true) ->
-    diff_linemode(Text1, Text2);
-compute_diff1(Text1, Text2, false) when size(Text1) > 100 orelse size(Text2) > 100 ->
-    diff_linemode(Text1, Text2);
+    diff_linemode32(Text1, Text2);
+compute_diff1(Text1, Text2, false) when size(Text1) > 400 orelse size(Text2) > 400 ->
+    %% 100 UTF-8 bytes ≈ 400 UTF-32 bytes (conservative upper bound)
+    diff_linemode32(Text1, Text2);
 compute_diff1(Text1, Text2, false) ->
-    diff_bisect(Text1, Text2).
+    diff_bisect32(Text1, Text2).
 
 
-%% Compute diff in linemode
+%% Public entry: accepts UTF-8, converts at boundary.
 diff_linemode(Text1, Text2) ->
+    T1 = to_utf32(Text1),
+    T2 = to_utf32(Text2),
+    Diffs32 = diff_linemode32(T1, T2),
+    [{Op, to_utf8(D)} || {Op, D} <- Diffs32].
+
+%% Internal: operates entirely on UTF-32 binaries.
+diff_linemode32(Text1, Text2) ->
     {CharText1, CharText2, Lines} = lines_to_chars(Text1, Text2),
-    Diffs = diff(CharText1, CharText2, false),
+    Diffs = diff32(CharText1, CharText2, false),
 
     %% Transform the diffs back to lines.
     Diffs1 = decode_lines(Diffs, Lines),
 
-    Cleaned = cleanup_merge(Diffs1),
+    Cleaned = cleanup_merge32(Diffs1),
     cleanup_line_diff(Cleaned, <<>>, <<>>, [], []).
 
 
@@ -313,45 +356,51 @@ cleanup_line_diff([{equal, _}=E|Rest], DeleteData, InsertData, TmpAcc, Acc)
 
 %% Found leading insert and delete data, diff the texts and replace the operations.
 cleanup_line_diff([{equal, _}=E|Rest], DeleteData, InsertData, _TmpAcc, Acc) ->
-    %% rediff the delete and insert data.
-    Diffs = diff(DeleteData, InsertData, false),
+    %% Data is already UTF-32 — pass directly to diff32.
+    Diffs = diff32(DeleteData, InsertData, false),
     Acc1 = lists:reverse(Diffs) ++ Acc,
     cleanup_line_diff(Rest, <<>>, <<>>, [], [E|Acc1]).
 
 
-%% Diff lines
+%% Diff lines.
+%% Text1 and Text2 are UTF-32 binaries. Lines are stored as UTF-32 binaries.
+%% CharText1/CharText2 are UTF-32 binaries where each 4-byte word is a line index.
 lines_to_chars(Text1, Text2) ->
-    {CharText1, NextChar, Lines1, Dict1} = lines_to_chars(Text1, 0, <<>>, 0, [], dict:new()),
-    {CharText2, _, Lines2, _Dict2} = lines_to_chars(Text2, 0, <<>>, NextChar, Lines1, Dict1),
+    Utf8Text1 = to_utf8(Text1),
+    Utf8Text2 = to_utf8(Text2),
+    {CharText1, NextChar, Lines1, Map1} = lines_to_chars(Utf8Text1, 0, <<>>, 0, [], #{}),
+    {CharText2, _, Lines2, _Map2} = lines_to_chars(Utf8Text2, 0, <<>>, NextChar, Lines1, Map1),
 
     {CharText1, CharText2, lists:reverse(Lines2)}.
 
-% Transform each unique line into a single char
-lines_to_chars(Text, Idx, CharText, NextChar, Lines, D) when Idx >= size(Text) ->
-    {CharText, NextChar, Lines, D};
-lines_to_chars(Text, Idx, CharText, NextChar, Lines, D) ->
-    case binary:match(Text, <<"\n">>, [{scope, {Idx, size(Text)-Idx}}]) of
+%% Transform each unique line into a 4-byte index; store line content as UTF-32.
+lines_to_chars(Text, Idx, CharText, NextChar, Lines, Map) when Idx >= byte_size(Text) ->
+    {CharText, NextChar, Lines, Map};
+lines_to_chars(Text, Idx, CharText, NextChar, Lines, Map) ->
+    case binary:match(Text, <<"\n">>, [{scope, {Idx, byte_size(Text)-Idx}}]) of
         nomatch ->
             <<_:Idx/binary, Line/binary>> = Text,
-            {Char, NextChar1, Lines1, D1} = insert_line(Line, Lines, D, NextChar),
-            CharText1 = <<CharText/binary, Char/utf8>>,
-            {CharText1, NextChar1, Lines1, D1};
+            {Char, NextChar1, Lines1, Map1} = insert_line(to_utf32(Line), Lines, Map, NextChar),
+            CharText1 = <<CharText/binary, Char:32>>,
+            {CharText1, NextChar1, Lines1, Map1};
         {Start, _} ->
             LineLength = Start - Idx + 1,
             <<_:Idx/binary, Line:LineLength/binary, _/binary>> = Text,
-
-            {Char, NextChar1, Lines1, D1} = insert_line(Line, Lines, D, NextChar),
-            CharText1 = <<CharText/binary, Char/utf8>>,
-
-            lines_to_chars(Text, Idx + LineLength, CharText1, NextChar1, Lines1, D1) 
+            {Char, NextChar1, Lines1, Map1} = insert_line(to_utf32(Line), Lines, Map, NextChar),
+            CharText1 = <<CharText/binary, Char:32>>,
+            lines_to_chars(Text, Idx + LineLength, CharText1, NextChar1, Lines1, Map1)
     end.
 
-insert_line(Line, Lines, Dict, NextChar) ->
-    case dict:find(Line, Dict) of
-        {ok, Char} ->
-            {Char, NextChar, Lines, Dict};
-        error ->
-            {NextChar, NextChar+1, [Line|Lines], dict:store(Line, NextChar, Dict)}
+
+insert_line(Line, Lines, Map, NextChar) ->
+    Hash = erlang:phash2(Line, ?PHASH2_RANGE),
+    case Map of
+        %% Hash hit — verify the stored line matches to guard against collisions.
+        #{Hash := {Char, Line}} ->
+            {Char, NextChar, Lines, Map};
+        %% Hash miss or collision with a different line — assign a new index.
+        _ ->
+            {NextChar, NextChar + 1, [Line | Lines], Map#{Hash => {NextChar, Line}}}
     end.
 
 decode_lines(Diffs, Lines) when is_list(Lines) ->
@@ -361,7 +410,8 @@ decode_lines(Diffs, Lines) when is_list(Lines) ->
 decode_lines([], _LinesTuple, Acc) ->
     lists:reverse(Acc);
 decode_lines([{Op, Data} | Rest], LinesTuple, Acc) ->
-    Data1 = << <<(element(C + 1, LinesTuple))/binary>> || <<C/utf8>> <= Data >>,
+    %% Each index is a 32-bit word; lines are already UTF-32 — just concatenate.
+    Data1 = << <<(element(C + 1, LinesTuple))/binary>> || <<C:32>> <= Data >>,
     decode_lines(Rest, LinesTuple, [{Op, Data1} | Acc]).
 
 
@@ -377,16 +427,20 @@ decode_lines([{Op, Data} | Rest], LinesTuple, Acc) ->
 %%    Returns:
 %%      Array of diff tuples.
 %%    """
+%% Public entry point — converts UTF-8 inputs to UTF-32, runs bisect, converts back.
 diff_bisect(A, B) when is_binary(A) andalso is_binary(B) ->
-    A32 = unicode:characters_to_binary(A, utf8, utf32),
-    B32 = unicode:characters_to_binary(B, utf8, utf32),
-    M = byte_size(A32) div 4,
-    N = byte_size(B32) div 4,
-    try compute_diff_bisect1(A32, B32, M, N) of
+    Diffs32 = diff_bisect32(to_utf32(A), to_utf32(B)),
+    [{Op, to_utf8(D)} || {Op, D} <- Diffs32].
+
+%% Internal bisect working entirely on UTF-32 binaries.
+diff_bisect32(A, B) ->
+    M = byte_size(A) div 4,
+    N = byte_size(B) div 4,
+    try compute_diff_bisect1(A, B, M, N) of
         no_overlap -> [{delete, A}, {insert, B}]
     catch
         throw:{overlap, X, Y} ->
-            diff_bisect_split(A, B, A32, B32, X, Y)
+            diff_bisect_split(A, B, X, Y)
     end.
 
 compute_diff_bisect1(A, B, M, N) ->
@@ -414,11 +468,13 @@ compute_diff_bisect1(A, B, M, N) ->
         S3 = for(-D + S1#bisect_state.k1start, D + 1 - S1#bisect_state.k1end, 2, fun(K1, S2) ->
             K1Offset = VOffset + K1,
 
-            X1 = case K1 =:= -D orelse (K1 =/= D andalso 
-                    (array:get(K1Offset-1, S2#bisect_state.v1) < array:get(K1Offset+1, S2#bisect_state.v1))) of
-                true -> array:get(K1Offset + 1, S2#bisect_state.v1);
-                false -> array:get(K1Offset - 1, S2#bisect_state.v1) + 1
-            end,
+            X1 = case K1 =:= -D
+                      orelse (K1 =/= D
+                              andalso (array:get(K1Offset-1, S2#bisect_state.v1) < array:get(K1Offset+1, S2#bisect_state.v1)))
+                 of
+                     true -> array:get(K1Offset + 1, S2#bisect_state.v1);
+                     false -> array:get(K1Offset - 1, S2#bisect_state.v1) + 1
+                 end,
 
             Y1 = X1 - K1,
             {X1_1, Y1_1} = match_front(X1, Y1, A, M, B, N),
@@ -460,13 +516,13 @@ compute_diff_bisect1(A, B, M, N) ->
         %% Walk the reverse path one step. (verdacht hetzelfde als het ding hierboven...)
         S5 = for(-D + S3#bisect_state.k2start, D + 1 - S3#bisect_state.k2end, 2, fun(K2, S4) ->
             K2Offset = VOffset + K2,
-            X2 = case K2 =:= -D orelse (K2 =/= D andalso 
-                        array:get(K2Offset-1, S4#bisect_state.v2) < array:get(K2Offset+1, S4#bisect_state.v2)) of
-                true -> 
-                    array:get(K2Offset + 1, S4#bisect_state.v2);
-                false -> 
-                    array:get(K2Offset - 1, S4#bisect_state.v2) + 1
-            end,
+            X2 = case K2 =:= -D
+                      orelse (K2 =/= D
+                              andalso array:get(K2Offset-1, S4#bisect_state.v2) < array:get(K2Offset+1, S4#bisect_state.v2))
+                 of
+                     true -> array:get(K2Offset + 1, S4#bisect_state.v2);
+                     false -> array:get(K2Offset - 1, S4#bisect_state.v2) + 1
+                 end,
 
             Y2 = X2 - K2,
 
@@ -511,18 +567,14 @@ compute_diff_bisect1(A, B, M, N) ->
 
     no_overlap.
 
-% @doc Split A and B and process the parts.
-diff_bisect_split(A, B, A32, B32, X, Y) ->
-    A1 = utf32_prefix_to_utf8(A32, X),
-    A2 = utf32_prefix_to_utf8(B32, Y),
-
-    B1 = binary:part(A, byte_size(A1), byte_size(A) - byte_size(A1)),
-    B2 = binary:part(B, byte_size(A2), byte_size(B) - byte_size(A2)),
+% @doc Split A and B at the overlap point and recursively diff each half.
+diff_bisect_split(A, B, X, Y) ->
+    A1 = binary:part(A, 0, X * 4),
+    A2 = binary:part(B, 0, Y * 4),
+    B1 = binary:part(A, X * 4, byte_size(A) - X * 4),
+    B2 = binary:part(B, Y * 4, byte_size(B) - Y * 4),
 
-    Diffs = diff(A1, A2, false),
-    DiffsB = diff(B1, B2, false),
-
-    Diffs ++ DiffsB.
+    diff32(A1, A2, false) ++ diff32(B1, B2, false).
 
 % @doc Convert the diffs into a pretty html report
 pretty_html(Diffs) ->
@@ -597,45 +649,50 @@ levenshtein([{equal, _Data}|T], Insertions, Deletions, Levenshtein) ->
 %
 -spec cleanup_merge(diffs()) -> diffs().
 cleanup_merge(Diffs) ->
-    Diffs1 = cleanup_merge(Diffs, []),
+    Diffs32 = [{Op, to_utf32(D)} || {Op, D} <- Diffs],
+    [{Op, to_utf8(D)} || {Op, D} <- cleanup_merge32(Diffs32)].
+
+%% Internal cleanup_merge operating on UTF-32 diffs.
+cleanup_merge32(Diffs) ->
+    Diffs1 = cleanup_merge32(Diffs, []),
     canonicalize_edits(Diffs1, []).
 
 %% Done
-cleanup_merge([], Acc) ->
+cleanup_merge32([], Acc) ->
     lists:reverse(Acc);
 %% Remove operations without data.
-cleanup_merge([{_Op, <<>>}|T], Acc) ->
-    cleanup_merge(T, Acc);
+cleanup_merge32([{_Op, <<>>}|T], Acc) ->
+    cleanup_merge32(T, Acc);
 %% Merge data from equal operations
-cleanup_merge([{Op2, Data2}|T], [{Op1, Data1}|Acc]) when Op1 =:= Op2 ->
-    cleanup_merge(T, [{Op1, <<Data1/binary, Data2/binary>>}|Acc]);
+cleanup_merge32([{Op2, Data2}|T], [{Op1, Data1}|Acc]) when Op1 =:= Op2 ->
+    cleanup_merge32(T, [{Op1, <<Data1/binary, Data2/binary>>}|Acc]);
 %% Cleanup edits before equal operation
-cleanup_merge([{Op1, Data1}|T], [{Op2, _}=I, {Op3, Data3}|Acc]) when Op1 =/= Op2 andalso Op1 =:= Op3 andalso Op2 =/= equal andalso Op3 =/= equal ->
-    cleanup_merge(T, [I, {Op3, <<Data3/binary, Data1/binary>>}|Acc]);
+cleanup_merge32([{Op1, Data1}|T], [{Op2, _}=I, {Op3, Data3}|Acc]) when Op1 =/= Op2 andalso Op1 =:= Op3 andalso Op2 =/= equal andalso Op3 =/= equal ->
+    cleanup_merge32(T, [I, {Op3, <<Data3/binary, Data1/binary>>}|Acc]);
 %% Check if Op1Data and Op2Data have common prefixes.
-cleanup_merge([{equal, E1}|T], [{Op1, Op1Data}, {Op2, Op2Data}, {equal, E2}|Acc]) when Op1 =/= Op2 andalso Op1 =/= equal andalso Op2 =/= equal ->
+cleanup_merge32([{equal, E1}|T], [{Op1, Op1Data}, {Op2, Op2Data}, {equal, E2}|Acc]) when Op1 =/= Op2 andalso Op1 =/= equal andalso Op2 =/= equal ->
     {Prefix, Op1DataD, Op2DataD, Suffix} = split_pre_and_suffix(Op1Data, Op2Data),
-    cleanup_merge(T, [{equal, <<Suffix/binary, E1/binary>>}, 
+    cleanup_merge32(T, [{equal, <<Suffix/binary, E1/binary>>},
         {Op1, Op1DataD}, {Op2, Op2DataD}, {equal, <<E2/binary, Prefix/binary>>}|Acc]);
 %% Check for slide left and slide right edits
-cleanup_merge([{equal, E1}=H|T], [{Op, I}, {equal, E2}|AccTail]=Acc) when Op =:= insert orelse Op =:= delete ->
+cleanup_merge32([{equal, E1}=H|T], [{Op, I}, {equal, E2}|AccTail]=Acc) when Op =:= insert orelse Op =:= delete ->
     case is_suffix(E2, I) of
         false ->
             case is_prefix(E1, I) of
                 false ->
-                    cleanup_merge(T, [H|Acc]);
+                    cleanup_merge32(T, [H|Acc]);
                 true ->
                     P = size(E1),
                     <<_:P/binary, Post/binary>> = I,
-                    cleanup_merge([{equal, <<E2/binary, E1/binary>>}, {Op, <<Post/binary, E1/binary>>}|T], AccTail)
+                    cleanup_merge32([{equal, <<E2/binary, E1/binary>>}, {Op, <<Post/binary, E1/binary>>}|T], AccTail)
             end;
         true ->
             R = size(I) - size(E2),
-            <<Pre:R/binary,  Post/binary>> = I,
-            cleanup_merge([{Op, <<E2/binary, Pre/binary>>}, {equal, <<Post/binary, E1/binary>>}|T], AccTail)
+            <<Pre:R/binary, Post/binary>> = I,
+            cleanup_merge32([{Op, <<E2/binary, Pre/binary>>}, {equal, <<Post/binary, E1/binary>>}|T], AccTail)
     end;
-cleanup_merge([H|T], Acc) ->
-    cleanup_merge(T, [H|Acc]).
+cleanup_merge32([H|T], Acc) ->
+    cleanup_merge32(T, [H|Acc]).
 
 canonicalize_edits([{insert, I}, {delete, D} | T], Acc) ->
     canonicalize_edits(T, [{insert, I}, {delete, D} | Acc]);
@@ -648,8 +705,13 @@ canonicalize_edits([], Acc) ->
 %
 -spec cleanup_semantic(diffs()) -> diffs().
 cleanup_semantic(Diffs) ->
+    Diffs32 = [{Op, to_utf32(D)} || {Op, D} <- Diffs],
+    [{Op, to_utf8(D)} || {Op, D} <- cleanup_semantic32(Diffs32)].
+
+%% Internal semantic cleanup operating on UTF-32 diffs.
+cleanup_semantic32(Diffs) ->
     Diffs1 = cleanup_semantic_breakpoints(Diffs),
-    Diffs2 = cleanup_merge(Diffs1),
+    Diffs2 = cleanup_merge32(Diffs1),
     Diffs3 = cleanup_semantic_lossless(Diffs2),
     cleanup_semantic_overlaps(Diffs3).
 
@@ -664,13 +726,13 @@ find_breakpoint([], _Acc, _LI1, _LD1, _LI2, _LD2, _LE) ->
 find_breakpoint([{equal, Data} | T], Acc, _LI1, _LD1, LI2, LD2, _LE) ->
     find_breakpoint(T, [{equal, Data} | Acc], LI2, LD2, 0, 0, Data);
 find_breakpoint([{insert, Data} | T], Acc, LI1, LD1, LI2, LD2, LE) ->
-    NewLI2 = LI2 + text_size(Data),
+    NewLI2 = LI2 + text_size32(Data),
     case is_breakpoint(LE, LI1, LD1, NewLI2, LD2) of
         true -> {found, apply_breakpoint(LE, Acc, [{insert, Data} | T])};
         false -> find_breakpoint(T, [{insert, Data} | Acc], LI1, LD1, NewLI2, LD2, LE)
     end;
 find_breakpoint([{delete, Data} | T], Acc, LI1, LD1, LI2, LD2, LE) ->
-    NewLD2 = LD2 + text_size(Data),
+    NewLD2 = LD2 + text_size32(Data),
     case is_breakpoint(LE, LI1, LD1, LI2, NewLD2) of
         true -> {found, apply_breakpoint(LE, Acc, [{delete, Data} | T])};
         false -> find_breakpoint(T, [{delete, Data} | Acc], LI1, LD1, LI2, NewLD2, LE)
@@ -678,7 +740,7 @@ find_breakpoint([{delete, Data} | T], Acc, LI1, LD1, LI2, LD2, LE) ->
 
 is_breakpoint(undefined, _, _, _, _) -> false;
 is_breakpoint(LE, LI1, LD1, LI2, LD2) ->
-    LEN = text_size(LE),
+    LEN = text_size32(LE),
     LEN =< max(LI1, LD1) andalso LEN =< max(LI2, LD2).
 
 apply_breakpoint(LE, Acc, T) ->
@@ -743,8 +805,9 @@ find_best_slide(E1, Edit, E2, BestScore, BestE1, BestEdit, BestE2) ->
             {BestE1, BestEdit, BestE2}
     end.
 
-can_slide_right(<<C/utf8, RestEdit/binary>>, <<C/utf8, RestE2/binary>>) ->
-    {true, <<C/utf8>>, RestEdit, RestE2};
+%% In UTF-32 each codepoint is exactly 4 bytes — no pattern matching on variable-width needed.
+can_slide_right(<<Char:32, RestEdit/binary>>, <<Char:32, RestE2/binary>>) ->
+    {true, <<Char:32>>, RestEdit, RestE2};
 can_slide_right(_, _) ->
     false.
 
@@ -776,31 +839,25 @@ cleanup_semantic_overlaps(Diffs) ->
 cleanup_semantic_overlaps([{delete, Del}, {insert, Ins} | T], Acc) ->
     Overlap1 = common_overlap(Del, Ins),
     Overlap2 = common_overlap(Ins, Del),
+    TDel = text_size32(Del),
+    TIns = text_size32(Ins),
     if
         Overlap1 >= Overlap2 ->
-            TDel = size(Del),
-            TIns = size(Ins),
-            Overlap1BytesDel = overlap_to_bytes_end(Del, Overlap1),
-            Overlap1BytesIns = overlap_to_bytes_start(Ins, Overlap1),
             if
-                Overlap1BytesDel >= TDel / 2 orelse Overlap1BytesIns >= TIns / 2 ->
-                    Common = binary:part(Ins, 0, Overlap1BytesIns),
-                    NewDel = binary:part(Del, 0, TDel - Overlap1BytesDel),
-                    NewIns = binary:part(Ins, Overlap1BytesIns, TIns - Overlap1BytesIns),
+                Overlap1 * 2 >= TDel orelse Overlap1 * 2 >= TIns ->
+                    Common = binary:part(Ins, 0, Overlap1 * 4),
+                    NewDel = binary:part(Del, 0, (TDel - Overlap1) * 4),
+                    NewIns = binary:part(Ins, Overlap1 * 4, (TIns - Overlap1) * 4),
                     cleanup_semantic_overlaps([{insert, NewIns} | T], [{equal, Common}, {delete, NewDel} | Acc]);
                 true ->
                     cleanup_semantic_overlaps([{insert, Ins} | T], [{delete, Del} | Acc])
             end;
         true ->
-            TDel = size(Del),
-            TIns = size(Ins),
-            Overlap2BytesIns = overlap_to_bytes_end(Ins, Overlap2),
-            Overlap2BytesDel = overlap_to_bytes_start(Del, Overlap2),
             if
-                Overlap2BytesIns >= TIns / 2 orelse Overlap2BytesDel >= TDel / 2 ->
-                    Common = binary:part(Ins, TIns - Overlap2BytesIns, Overlap2BytesIns),
-                    NewIns = binary:part(Ins, 0, TIns - Overlap2BytesIns),
-                    NewDel = binary:part(Del, Overlap2BytesDel, TDel - Overlap2BytesDel),
+                Overlap2 * 2 >= TIns orelse Overlap2 * 2 >= TDel ->
+                    Common = binary:part(Ins, (TIns - Overlap2) * 4, Overlap2 * 4),
+                    NewIns = binary:part(Ins, 0, (TIns - Overlap2) * 4),
+                    NewDel = binary:part(Del, Overlap2 * 4, (TDel - Overlap2) * 4),
                     cleanup_semantic_overlaps([{delete, NewDel} | T], [{equal, Common}, {insert, NewIns} | Acc]);
                 true ->
                     cleanup_semantic_overlaps([{insert, Ins} | T], [{delete, Del} | Acc])
@@ -811,47 +868,26 @@ cleanup_semantic_overlaps([H | T], Acc) ->
 cleanup_semantic_overlaps([], Acc) ->
     lists:reverse(Acc).
 
-%% Helper functions for semantic cleanup
-
-%% @doc Convert N codepoints from the start of Bin to a byte offset.
-%% This is consistent with text_size/1 which counts codepoints (not grapheme clusters).
-overlap_to_bytes_start(Bin, N) ->
-    codepoints_to_bytes(Bin, N, 0).
-
-codepoints_to_bytes(_Bin, 0, Acc) ->
-    Acc;
-codepoints_to_bytes(<<C/utf8, Rest/binary>>, N, Acc) ->
-    codepoints_to_bytes(Rest, N - 1, Acc + byte_size(<<C/utf8>>));
-codepoints_to_bytes(<<_C, Rest/binary>>, N, Acc) ->
-    %% Invalid utf-8 byte, count as 1
-    codepoints_to_bytes(Rest, N - 1, Acc + 1);
-codepoints_to_bytes(<<>>, _N, Acc) ->
-    Acc.
-
-%% @doc Convert N codepoints from the END of Bin to a byte count of that suffix.
-overlap_to_bytes_end(Bin, N) ->
-    SkipChars = text_size(Bin) - N,
-    SkipBytes = codepoints_to_bytes(Bin, SkipChars, 0),
-    byte_size(Bin) - SkipBytes.
+%% In UTF-32 every codepoint is exactly 4 bytes, so all byte/codepoint conversions
+%% are simple multiplications and binary:part calls.
 
+%% @doc Return the first Len codepoints of Bin as a binary.
 substring_start(Bin, Len) ->
-    binary:part(Bin, 0, overlap_to_bytes_start(Bin, Len)).
+    binary:part(Bin, 0, Len * 4).
 
+%% @doc Return the last Len codepoints of Bin as a binary.
 substring_end(Bin, Len) ->
-    TotalLen = text_size(Bin),
+    TotalLen = text_size32(Bin),
     case TotalLen =< Len of
         true -> Bin;
-        false ->
-            SkipChars = TotalLen - Len,
-            SkipBytes = codepoints_to_bytes(Bin, SkipChars, 0),
-            binary:part(Bin, SkipBytes, byte_size(Bin) - SkipBytes)
+        false -> binary:part(Bin, (TotalLen - Len) * 4, Len * 4)
     end.
 
 common_overlap(<<>>, _) -> 0;
 common_overlap(_, <<>>) -> 0;
 common_overlap(Text1, Text2) ->
-    T1Len = text_size(Text1),
-    T2Len = text_size(Text2),
+    T1Len = text_size32(Text1),
+    T2Len = text_size32(Text2),
     {T1, T2, TMin} = if
         T1Len > T2Len -> {substring_end(Text1, T2Len), Text2, T2Len};
         T1Len < T2Len -> {Text1, substring_start(Text2, T1Len), T1Len};
@@ -867,7 +903,8 @@ common_overlap_loop(T1, T2, TMin, Best, Length) when Length =< TMin ->
     case binary:match(T2, Pattern) of
         nomatch -> Best;
         {FoundByteOffset, _} ->
-            FoundCharCount = text_size(binary:part(T2, 0, FoundByteOffset)),
+            %% In UTF-32, byte offset maps directly to codepoint count.
+            FoundCharCount = FoundByteOffset div 4,
             NewLength = Length + FoundCharCount,
             if
                 NewLength > TMin -> Best;
@@ -883,22 +920,15 @@ common_overlap_loop(T1, T2, TMin, Best, Length) when Length =< TMin ->
 common_overlap_loop(_T1, _T2, _TMin, Best, _Length) ->
     Best.
 
-first_char(<<C/utf8, _/binary>>) -> C;
+%% In UTF-32 the first and last codepoints are always at fixed byte offsets.
+first_char(<<C:32, _/binary>>) -> C;
 first_char(_) -> undefined.
 
 last_char(<<>>) -> undefined;
-last_char(Bin) when is_binary(Bin) ->
-    last_char(Bin, byte_size(Bin) - 1).
-
-last_char(Bin, Pos) ->
-    case binary:at(Bin, Pos) band 16#C0 of
-        16#80 ->
-            % continuation byte, keep scanning back
-            last_char(Bin, Pos - 1);
-        _ ->
-            <<_:Pos/binary, C/utf8, _/binary>> = Bin,
-            C
-    end.
+last_char(Bin) ->
+    Size = byte_size(Bin),
+    <<_:(Size-4)/binary, C:32>> = Bin,
+    C.
 
 is_non_alphanumeric(undefined) -> true;
 is_non_alphanumeric(C) ->
@@ -921,17 +951,25 @@ is_whitespace(C) ->
 is_linebreak(C) ->
     C =:= $\n orelse C =:= $\r.
 
-is_blankline_end(Bin) ->
-    case re:run(Bin, <<"\n\r?\n$">> ) of
-        {match, _} -> true;
-        nomatch -> false
-    end.
+%% In UTF-32 each codepoint is 4 bytes, so newline patterns are fixed-width.
+is_blankline_end(Bin) when byte_size(Bin) >= 8 ->
+    Size = byte_size(Bin),
+    case Bin of
+        <<_:(Size-8)/binary,  $\n:32, $\n:32>>       -> true;
+        <<_:(Size-12)/binary, $\n:32, $\r:32, $\n:32>> -> true;
+        _ -> false
+    end;
+is_blankline_end(_) -> false.
 
-is_blankline_start(Bin) ->
-    case re:run(Bin, <<"^\r?\n\r?\n">> ) of
-        {match, _} -> true;
-        nomatch -> false
-    end.
+is_blankline_start(Bin) when byte_size(Bin) >= 8 ->
+    case Bin of
+        <<$\n:32, $\n:32, _/binary>>             -> true;
+        <<$\n:32, $\r:32, $\n:32, _/binary>>     -> true;
+        <<$\r:32, $\n:32, $\n:32, _/binary>>     -> true;
+        <<$\r:32, $\n:32, $\r:32, $\n:32, _/binary>> -> true;
+        _ -> false
+    end;
+is_blankline_start(_) -> false.
 
 % @doc Do efficiency cleanup of diffs.
 %
@@ -939,60 +977,53 @@ is_blankline_start(Bin) ->
 cleanup_efficiency(Diffs) ->
     cleanup_efficiency(Diffs, 4).
 
+-spec cleanup_efficiency(diffs(), pos_integer()) -> diffs().
 cleanup_efficiency(Diffs, EditCost) ->
-    cleanup_efficiency(Diffs, false, EditCost, []).
+    Diffs32 = [{Op, to_utf32(D)} || {Op, D} <- Diffs],
+    [{Op, to_utf8(D)} || {Op, D} <- cleanup_efficiency32(Diffs32, EditCost)].
+
+%% Internal efficiency cleanup operating on UTF-32 diffs.
+cleanup_efficiency32(Diffs) ->
+    cleanup_efficiency32(Diffs, 4).
+
+cleanup_efficiency32(Diffs, EditCost) ->
+    cleanup_efficiency32(Diffs, false, EditCost, []).
 
 %% Done.
-cleanup_efficiency([], Changed, _EditCost, Acc) ->
+cleanup_efficiency32([], Changed, _EditCost, Acc) ->
     Diffs = lists:reverse(Acc),
     case Changed of
         false -> Diffs;
-        true -> cleanup_merge(Diffs)
+        true -> cleanup_merge32(Diffs)
     end;
 %% Any equality which is surrounded on both sides by an insertion and deletion need less then 
 %% EditCost characters for it to be advantageous to split.
-cleanup_efficiency([{O1, _}=A, {equal, XY}=E, {O2, _}=B | T], Changed, EditCost, Acc) when 
+cleanup_efficiency32([{O1, _}=A, {equal, XY}=E, {O2, _}=B | T], Changed, EditCost, Acc) when 
         O1 =/= O2 andalso ?IS_INS_OR_DEL(O1) andalso ?IS_INS_OR_DEL(O2) ->
     case text_smaller_than(XY, EditCost) of
         true ->
-            %% Split
             Del = {delete, XY},
             Ins = {insert, XY},
-
-            cleanup_efficiency([Ins, B | T], true, EditCost, [Del, A | Acc]);
+            cleanup_efficiency32([Ins, B | T], true, EditCost, [Del, A | Acc]);
         false ->
-            %% Equal is big enough, move A and equal out of the way.
-            cleanup_efficiency([B | T], Changed, EditCost, [E, A |Acc])
+            cleanup_efficiency32([B | T], Changed, EditCost, [E, A | Acc])
     end;
 %% Any equality which is surrounded on one side by an existing insertion and deletion and on the 
-%% other side by an exisiting insertion or deletion needs by less than half C characters long for it 
-%% to be advantagous to split.
-cleanup_efficiency([{O1, _}=A, {O2, _}=B, {equal, X}=E, {O3, _}=C | T], Changed, EditCost, Acc) when
+%% other side by an existing insertion or deletion needs less than half C characters long for it 
+%% to be advantageous to split.
+cleanup_efficiency32([{O1, _}=A, {O2, _}=B, {equal, X}=E, {O3, _}=C | T], Changed, EditCost, Acc) when
     O1 =/= O2 andalso ?IS_INS_OR_DEL(O1) andalso ?IS_INS_OR_DEL(O2) andalso ?IS_INS_OR_DEL(O3) ->
     case text_smaller_than(X, EditCost div 2 + 1) of
         true ->
-            %% Split
             Del = {delete, X},
             Ins = {insert, X},
-            cleanup_efficiency([Ins, C | T], true, EditCost, [Del, B, A | Acc]);
+            cleanup_efficiency32([Ins, C | T], true, EditCost, [Del, B, A | Acc]);
         false ->
-            %% Equal is big enough, move delete and equal out of the way.
-            cleanup_efficiency([B, E, C | T], Changed, EditCost, [A |Acc])
+            cleanup_efficiency32([B, E, C | T], Changed, EditCost, [A | Acc])
     end;
-cleanup_efficiency([H|T], Changed, EditCost, Acc) ->
-    cleanup_efficiency(T, Changed, EditCost, [H|Acc]).
-
+cleanup_efficiency32([H | T], Changed, EditCost, Acc) ->
+    cleanup_efficiency32(T, Changed, EditCost, [H | Acc]).
 
-% @doc Return true iff the text is smaller than specified 
-text_smaller_than(_, 0) ->
-    false;
-text_smaller_than(<<>>, _Size) ->
-    true;
-text_smaller_than(<<_C/utf8, Rest/binary>>, Size) when Size > 0 ->
-    text_smaller_than(Rest, Size-1);
-text_smaller_than(<<_C, Rest/binary>>, Size) when Size > 0 ->
-    %% Illegal utf-8 string, just count this as a single character and continue
-    text_smaller_than(Rest, Size-1).
 
 % @doc create a patch from a list of diffs
 make_patch(Diffs) when is_list(Diffs) ->
@@ -1012,10 +1043,8 @@ make_patch(Diffs, SourceText) when is_list(Diffs) andalso is_binary(SourceText)
 
 make_patch([], _PrePatchText, _PostPatchText, _Count1, _Count2, [Patch|Rest]=Patches) ->
     case Patch#patch.diffs of
-        [] -> 
-            lists:reverse(Rest);
-        _ -> 
-            lists:reverse(Patches)
+        [] -> lists:reverse(Rest);
+        _ -> lists:reverse(Patches)
     end;
     
 make_patch([{insert, Data}=D|T], PrePatchText, PostPatchText, Count1, Count2, [Patch|Rest]) ->
@@ -1090,11 +1119,6 @@ unique_match(Pattern, Text) ->
 %% Helpers
 %%
 
-% @doc Return true iff binary is a single character.
-single_char(<<>>) -> false;
-single_char(<<_C/utf8>>) -> true;
-single_char(Bin) when is_binary(Bin) -> false.
-
 % @doc Return true iff A is a prefix of B
 is_prefix(A, B) when size(A) > size(B) ->
     false;
@@ -1155,120 +1179,45 @@ split_pre_and_suffix(Text1, Text2) ->
     {Prefix, MiddleText1, MiddleText2, Suffix}.
 
     
-% @doc Return the common prefix of Text1 and Text2. (utf8 aware)
+% @doc Return the common prefix of Text1 and Text2. Works on UTF-32 — always codepoint-aligned.
 common_prefix(Text1, Text2) ->
     Length = binary:longest_common_prefix([Text1, Text2]),
-    Prefix = binary:part(Text1, 0, Length),
-    
-    %% Utf-8 repair the tail of the prefix. It could contain a half utf-8 char.
-    {Prefix1, _} = repair_tail(Prefix),
-    Prefix1.
+    %% Round down to 4-byte boundary (should already be aligned for valid UTF-32).
+    binary:part(Text1, 0, (Length div 4) * 4).
 
-% @doc Return the common prefix of Text1 and Text2 (utf8 aware)
+% @doc Return the common suffix of Text1 and Text2. Works on UTF-32 — always codepoint-aligned.
 common_suffix(Text1, Text2) ->
     Length = binary:longest_common_suffix([Text1, Text2]),
-    Suffix = binary:part(Text1, size(Text1), -Length),
-
-    %% Utf-8 repair the head of the suffix. Could contain a half utf8 char
-    {_, Suffix1} = repair_head(Suffix),
-    Suffix1.
+    binary:part(Text1, byte_size(Text1), -((Length div 4) * 4)).
 
 
-% @doc Count the number of characters in a utf8 binary.
+% @doc Count the number of codepoints in a UTF-8 binary.
+% @deprecated Use text_size32/1 internally. This public function may be removed in a future version.
+-spec text_size(unicode:unicode_binary()) -> non_neg_integer().
 text_size(Text) when is_binary(Text) ->
-    text_size(Text, 0).
+    string:length(Text).
 
-text_size(<<>>, Count) ->
-    Count;
-text_size(<<_C/utf8, Rest/binary>>, Count) ->
-    text_size(Rest, Count+1);
-text_size(_, _) ->
-    error(badarg).
+% @doc Count the number of codepoints in a UTF-32 binary. O(1).
+text_size32(Text) when is_binary(Text) ->
+    byte_size(Text) div 4.
+
+% @doc Return true iff Text has fewer than Size codepoints. O(1) for UTF-32.
+text_smaller_than(_, 0) ->
+    false;
+text_smaller_than(Text, Size) ->
+    byte_size(Text) < Size * 4.
 
 %%
-%% UTF-32 utilities
+%% UTF-32 boundary helpers
 %%
 
-% @doc Convert the first N codepoints of a UTF-32BE binary to a UTF-8 binary.
-utf32_prefix_to_utf8(Utf32, CodepointCount) ->
-    Prefix32 = binary:part(Utf32, 0, CodepointCount * 4),
-    unicode:characters_to_binary(Prefix32, utf32, utf8).
-
-%% @doc Checks the trailing bytes for utf8 prefix bytes.
-repair_tail(<<>>) ->
-    {<<>>, <<>>};
-%% Checks 
-repair_tail(Bin) ->
-    Size = size(Bin),
-    Size1 = Size-1, Size2 = Size-2, Size3 = Size-3, Size4 = Size-4,
-    case Bin of
-        %% Valid 1 -byte
-        <<_:Size1/binary, 2#0:1, _A:7>> ->
-             {Bin, <<>>}; 
-
-        %% Invalid 1-byte
-        <<Pre:Size1/binary, 2#110:3, A:5>> ->
-            {Pre, <<2#110:3, A:5>>};
-        <<Pre:Size1/binary, 2#1110:4, A:4>> ->
-            {Pre, <<2#1110:4, A:4>>};
-        <<Pre:Size1/binary, 2#11110:5, A:3>> ->
-            {Pre, <<2#11110:5, A:3>>};
-
-        %% Valid 2-byte ending
-        <<_:Size2/binary, 2#110:3, _A:5, 2#10:2, _B:6>> ->
-             {Bin, <<>>};
-
-        %% Invalid 2-byte ending
-        <<Pre:Size2/binary, 2#1110:4, A:4, 2#10:2, B:6>> ->
-            {Pre, <<2#1110:4, A:4, 2#10:2, B:6>>};
-        <<Pre:Size2/binary, 2#11110:5, A:3, 2#10:2, B:6>> ->
-            {Pre, <<2#11110:5, A:3, 2#10:2, B:6>>};
-
-        %% Valid 3-byte ending
-        <<_:Size3/binary, 2#1110:4, _A:4,  2#10:2, _B:6,  2#10:2, _C:6>> ->
-             {Bin, <<>>};
-
-        %% Invalid 3-byte ending
-        <<Pre:Size3/binary, 2#11110:5, A:3,  2#10:2, B:6, 2#10:2, C:6>> ->
-            {Pre, <<2#11110:5, A:3, 2#10:2, B:6, 2#10:2, C:6>>};
-
-        %% Valid 4-byte ending
-        <<_:Size4/binary, 2#11110:5, _A:3,  2#10:2, _B:6,   2#10:2, _C:6,  2#10:2, _D:6>> ->
-             {Bin, <<>>};
-
-        %% Illegal utf-8 sequence.
-        _ ->
-	    %% Can't repair it, just return
-	    {Bin, <<>>}
-    end.
+% @doc Convert a UTF-8 binary to UTF-32, crashing on invalid input.
+to_utf32(Bin) ->
+    <<_/binary>> = unicode:characters_to_binary(Bin, utf8, utf32).
 
-% @doc Checks the beginning of a binary and strips of partial utf-8 encoded bytes.
-repair_head(<<>>) ->
-    {<<>>, <<>>};
-% valid 1-byte beginning
-repair_head(<<2#0:1, _A:7, _Rest/binary>>=Bin) ->
-    {<<>>, Bin};
-% valid 4-byte beginning
-repair_head(<<2#11110:5, _A:3,  2#10:2, _B:6, 2#10:2, _C:6,  2#10:2, _D:6, _Rest/binary>>=Bin) ->
-    {<<>>, Bin};
-% valid 3-byte beginning
-repair_head(<<2#1110:4, _A:4,  2#10:2, _B:6,  2#10:2, _C:6, _Rest/binary>>=Bin) ->
-    {<<>>, Bin};
-% invalid 3-byte beginning
-repair_head(<<2#10:2, A:6, 2#10:2, B:6, 2#10:2, C:6, Rest/binary>>) ->
-    {<<2#10:2, A:6, 2#10:2, B:6, 2#10:2, C:6>>, Rest};
-% valid 2-byte beginning
-repair_head(<<2#110:3, _A:5, 2#10:2, _B:6, _Rest/binary>>=Bin) ->
-    {<<>>, Bin};
-% invalid 2-byte beginnings
-repair_head(<<2#10:2, A:6, 2#10:2, B:6, Rest/binary>>) ->
-    {<<2#10:2, A:6, 2#10:2, B:6>>, Rest};
-% invalid 1-byte beginning
-repair_head(<<2#10:2, A:6, Rest/binary>>) ->
-    {<<2#10:2, A:6>>, Rest};
-repair_head(Bin) ->
-    %% Illegal sequence, can't repair it.
-    {<<>>, Bin}.
+% @doc Convert a UTF-32 binary to UTF-8, crashing on invalid input.
+to_utf8(Bin) ->
+    <<_/binary>> = unicode:characters_to_binary(Bin, utf32, utf8).
 
 %%
 %% Tests
@@ -1278,44 +1227,11 @@ repair_head(Bin) ->
 
 -include_lib("eunit/include/eunit.hrl").
 
-repair_tail_test() ->
-    ?assertEqual({<<>>, <<>>}, repair_tail(<<>>)),
-    ?assertEqual({<<"aap">>, <<>>}, repair_tail(<<"aap">>)),
-    ?assertEqual({<<200/utf8>>, <<>>}, repair_tail(<<200/utf8>>)),
-    ?assertEqual({<<600/utf8>>, <<>>}, repair_tail(<<600/utf8>>)),
-    ?assertEqual({<<1000/utf8>>, <<>>}, repair_tail(<<1000/utf8>>)),
-
-    ?assertEqual({<<"aap">>, <<200>>}, repair_tail(<<"aap", 200>>)),
-
-    ?assertEqual({<<"test">>, <<240, 159, 159>>}, repair_tail(<<116,101,115,116,240,159,159>>)),
-
-    ok.
-
-repair_head_test() -> 
-    ?assertEqual({<<>>, <<>>}, repair_head(<<>>)),
-    ?assertEqual({<<>>, <<"a">>}, repair_head(<<"a">>)),
-    ?assertEqual({<<>>, <<"aap">>}, repair_head(<<"aap">>)),
-    ?assertEqual({<<>>, <<200/utf8>>}, repair_head(<<200/utf8>>)),
-    ?assertEqual({<<>>, <<600/utf8>>}, repair_head(<<600/utf8>>)),
-    ?assertEqual({<<>>, <<1000/utf8>>}, repair_head(<<1000/utf8>>)),
-
-    %%
-    ?assertEqual({<<2#10:2, 10:6>>, <<"aap">>}, 
-        repair_head(<<2#10:2, 10:6, "aap">>)),
-    ?assertEqual({<<2#10:2, 60:6, 2#10:2, 10:6>>, <<"aap">>}, 
-        repair_head(<<2#10:2, 60:6, 2#10:2, 10:6, "aap">>)),
-    ?assertEqual({<<2#10:2, 60:6, 2#10:2, 10:6, 2#10:2, 13:6>>, <<"aap">>}, 
-        repair_head(<<2#10:2, 60:6, 2#10:2, 10:6, 2#10:2, 13:6, "aap">>)),
-
-    ok.
-    
-
 for_test() ->
     ?assertEqual(9, for(0, 10, fun(I, _N) -> {continue, I} end, undefined)),
     ?assertEqual(0, for(0, 10, fun(I, _N) -> {break, I} end, undefined)),
     ok.
 
-
 diff_utf8_test() ->
     ?assertEqual([{equal, <<208,174, 208,189, 208,184, 208,186, 208,190, 208,180>>}], 
         diff(<<208,174,208,189,208,184,208,186,208,190,208,180>>, 
@@ -1342,10 +1258,6 @@ diff_bisect_test() ->
                   {equal,<<" a banana">>}], diff_bisect(<<"fruit flies like a banana">>, 
                                                         <<"fruit flies eat a banana">>)),
 
-
-    %?assertEqual([{delete,<<"cat">>},
-    %              {insert,<<"map">>}], diff_bisect(<<"cat">>, <<"map">>)), 
-
     ?assertEqual([{delete,<<"c">>},
                   {insert,<<"m">>},
                   {equal,<<"a">>},
@@ -1361,84 +1273,87 @@ diff_bisect_test() ->
 
     ?assertEqual([{equal, <<"text">>}],
                  diff_bisect(<<"text">>, <<"text">>)),
-                 
 
     ok.
 
+%% half_match operates on UTF-32 internally; wrap inputs/outputs for testing.
+half_match_utf8(A, B) ->
+    case half_match(to_utf32(A), to_utf32(B)) of
+        undefined -> undefined;
+        {half_match, A1, A2, B1, B2, C} ->
+            {half_match, to_utf8(A1), to_utf8(A2), to_utf8(B1), to_utf8(B2), to_utf8(C)}
+    end.
+
 half_match_test() ->
-    ?assertEqual(undefined, half_match(<<"1234567890">>, <<"abcdef">>)),
-    ?assertEqual(undefined, half_match(<<"12345">>, <<"23">>)),
+    ?assertEqual(undefined, half_match_utf8(<<"1234567890">>, <<"abcdef">>)),
+    ?assertEqual(undefined, half_match_utf8(<<"12345">>, <<"23">>)),
 
     %% Single Match
     ?assertEqual({half_match, <<"12">>, <<"90">>, <<"a">>, <<"z">>, <<"345678">>}, 
-        half_match(<<"1234567890">>, <<"a345678z">>)),
+        half_match_utf8(<<"1234567890">>, <<"a345678z">>)),
     ?assertEqual({half_match, <<"a">>, <<"z">>, <<"12">>, <<"90">>, <<"345678">>}, 
-        half_match(<<"a345678z">>, <<"1234567890">>)),
+        half_match_utf8(<<"a345678z">>, <<"1234567890">>)),
     ?assertEqual({half_match, <<"abc">>, <<"z">>, <<"1234">>, <<"0">>, <<"56789">>}, 
-        half_match(<<"abc56789z">>, <<"1234567890">>)),
+        half_match_utf8(<<"abc56789z">>, <<"1234567890">>)),
     ?assertEqual({half_match, <<"a">>, <<"xyz">>, <<"1">>, <<"7890">>, <<"23456">>}, 
-        half_match(<<"a23456xyz">>, <<"1234567890">>)),
+        half_match_utf8(<<"a23456xyz">>, <<"1234567890">>)),
 
     %% Multiple Matches
     ?assertEqual({half_match, <<"12123">>, <<"123121">>, <<"a">>, <<"z">>, <<"1234123451234">>}, 
-        half_match(<<"121231234123451234123121">>, <<"a1234123451234z">>)),
+        half_match_utf8(<<"121231234123451234123121">>, <<"a1234123451234z">>)),
 
     ?assertEqual({half_match, <<"">>, <<"-=-=-=-=-=">>, <<"x">>, <<"">>, <<"x-=-=-=-=-=-=-=">>}, 
-        half_match(<<"x-=-=-=-=-=-=-=-=-=-=-=-=">>, <<"xx-=-=-=-=-=-=-=">>)),
+        half_match_utf8(<<"x-=-=-=-=-=-=-=-=-=-=-=-=">>, <<"xx-=-=-=-=-=-=-=">>)),
 
     ?assertEqual({half_match, <<"-=-=-=-=-=">>, <<"">>, <<"">>, <<"y">>, <<"-=-=-=-=-=-=-=y">>}, 
-        half_match(<<"-=-=-=-=-=-=-=-=-=-=-=-=y">>, <<"-=-=-=-=-=-=-=yy">>)),
+        half_match_utf8(<<"-=-=-=-=-=-=-=-=-=-=-=-=y">>, <<"-=-=-=-=-=-=-=yy">>)),
 
-    % Non-optimal halfmatch.
-    % Optimal diff would be -q+x=H-i+e=lloHe+Hu=llo-Hew+y not -qHillo+x=HelloHe-w+Hulloy
     ?assertEqual({half_match, <<"qHillo">>, <<"w">>, <<"x">>, <<"Hulloy">>, <<"HelloHe">>}, 
-        half_match(<<"qHilloHelloHew">>, <<"xHelloHeHulloy">>)),
+        half_match_utf8(<<"qHilloHelloHew">>, <<"xHelloHeHulloy">>)),
 
     ok.
 
-
+%% common_prefix/suffix operate on UTF-32; wrap for testing.
 common_prefix_test() ->
-    ?assertEqual(<<>>, common_prefix(<<"Text">>, <<"Next">>)),
-    ?assertEqual(<<"T">>, common_prefix(<<"Text">>, <<"Tax">>)),
-    ?assertEqual(<<"text">>, common_prefix(<<"text">>, <<"text">>)),
-
-    ?assertEqual(<<"test🟡"/utf8>>, common_prefix(<<"test🟡123"/utf8>>, <<"test🟡456"/utf8>>)),
+    Prefix = fun(A, B) -> to_utf8(common_prefix(to_utf32(A), to_utf32(B))) end,
 
-    ?assertEqual(<<"test">>, common_prefix(<<"test🟢123"/utf8>>, <<"test🟡123"/utf8>>)),
-    ?assertEqual(<<"test">>, common_prefix(<<"test🟡123"/utf8>>, <<"test🟢123"/utf8>>)),
-    
-    ?assertEqual(<<"test">>, common_prefix(<<"test🟡123"/utf8>>, <<"test🔵123"/utf8>>)),
-    ?assertEqual(<<"test">>, common_prefix(<<"test🔵123"/utf8>>, <<"test🟡123"/utf8>>)),
-
-    ?assertEqual(<<"test">>, common_prefix(<<"test🟡123"/utf8>>, <<"test⚫️123"/utf8>>)),
-    ?assertEqual(<<"test">>, common_prefix(<<"test⚫️123"/utf8>>, <<"test🟡123"/utf8>>)),
+    ?assertEqual(<<>>, Prefix(<<"Text">>, <<"Next">>)),
+    ?assertEqual(<<"T">>, Prefix(<<"Text">>, <<"Tax">>)),
+    ?assertEqual(<<"text">>, Prefix(<<"text">>, <<"text">>)),
 
+    ?assertEqual(<<"test🟡"/utf8>>, Prefix(<<"test🟡123"/utf8>>, <<"test🟡456"/utf8>>)),
+    ?assertEqual(<<"test">>, Prefix(<<"test🟢123"/utf8>>, <<"test🟡123"/utf8>>)),
+    ?assertEqual(<<"test">>, Prefix(<<"test🟡123"/utf8>>, <<"test🟢123"/utf8>>)),
+    ?assertEqual(<<"test">>, Prefix(<<"test🟡123"/utf8>>, <<"test🔵123"/utf8>>)),
+    ?assertEqual(<<"test">>, Prefix(<<"test🔵123"/utf8>>, <<"test🟡123"/utf8>>)),
+    ?assertEqual(<<"test">>, Prefix(<<"test🟡123"/utf8>>, <<"test⚫️123"/utf8>>)),
+    ?assertEqual(<<"test">>, Prefix(<<"test⚫️123"/utf8>>, <<"test🟡123"/utf8>>)),
 
     ok.
 
-
 common_suffix_test() ->
-    ?assertEqual(<<"ext">>, common_suffix(<<"Text">>, <<"Next">>)),
-    ?assertEqual(<<>>, common_suffix(<<"Text">>, <<"Tax">>)),
-    ?assertEqual(<<"text">>, common_suffix(<<"text">>, <<"text">>)),
+    Suffix = fun(A, B) -> to_utf8(common_suffix(to_utf32(A), to_utf32(B))) end,
+
+    ?assertEqual(<<"ext">>, Suffix(<<"Text">>, <<"Next">>)),
+    ?assertEqual(<<>>, Suffix(<<"Text">>, <<"Tax">>)),
+    ?assertEqual(<<"text">>, Suffix(<<"text">>, <<"text">>)),
     ok.
 
+%% split_pre_and_suffix operates on UTF-32; wrap for testing.
 split_pre_and_suffix_test() ->
-    ?assertEqual({<<>>, <<>>, <<>>, <<>>}, split_pre_and_suffix(<<>>, <<>>)),
-
-    ?assertEqual({<<>>, <<"a">>, <<"b">>, <<>>}, split_pre_and_suffix(<<"a">>, <<"b">>)),
-    
-    ?assertEqual({<<"a">>, <<"b">>, <<"c">>, <<"d">>}, 
-       split_pre_and_suffix(<<"abd">>, <<"acd">>)),
-    ?assertEqual({<<"aa">>, <<"bb">>, <<"cc">>, <<"dd">>}, 
-       split_pre_and_suffix(<<"aabbdd">>, <<"aaccdd">>)),
-    ?assertEqual({<<"aa">>, <<"bb">>, <<"c">>, <<"dd">>}, 
-       split_pre_and_suffix(<<"aabbdd">>, <<"aacdd">>)),
+    Split = fun(A, B) ->
+        {P, M1, M2, S} = split_pre_and_suffix(to_utf32(A), to_utf32(B)),
+        {to_utf8(P), to_utf8(M1), to_utf8(M2), to_utf8(S)}
+    end,
 
+    ?assertEqual({<<>>, <<>>, <<>>, <<>>}, Split(<<>>, <<>>)),
+    ?assertEqual({<<>>, <<"a">>, <<"b">>, <<>>}, Split(<<"a">>, <<"b">>)),
+    ?assertEqual({<<"a">>, <<"b">>, <<"c">>, <<"d">>}, Split(<<"abd">>, <<"acd">>)),
+    ?assertEqual({<<"aa">>, <<"bb">>, <<"cc">>, <<"dd">>}, Split(<<"aabbdd">>, <<"aaccdd">>)),
+    ?assertEqual({<<"aa">>, <<"bb">>, <<"c">>, <<"dd">>}, Split(<<"aabbdd">>, <<"aacdd">>)),
     ?assertEqual({<<"cat ">>, <<>>, <<"mouse dog ">>, <<>>},
-                 split_pre_and_suffix(<<"cat ">>, <<"cat mouse dog ">>)),
-
-    ok. 
+                 Split(<<"cat ">>, <<"cat mouse dog ">>)),
+    ok.
 
 unique_match_test() ->
     ?assertEqual(true, unique_match(<<"a">>, <<"abc">>)),
@@ -1447,54 +1362,74 @@ unique_match_test() ->
     ?assertEqual(false, unique_match(<<"ab">>, <<"abab">>)),
     ok.
 
-
 text_smaller_than_test() ->
-    ?assertEqual(true, text_smaller_than(<<>>, 5)),
-    ?assertEqual(true, text_smaller_than(<<>>, 1)),
-
-    ?assertEqual(false, text_smaller_than(<<>>, 0)),
-
-    ?assertEqual(false, text_smaller_than(<<"abc">>, 0)),
-    ?assertEqual(false, text_smaller_than(<<"abc">>, 1)),
-    ?assertEqual(true, text_smaller_than(<<"abc">>, 4)),
-
-    %% Test if we count characters.
-    Utf8Binary = <<1046/utf8, 1011/utf8, 1022/utf8, 127/utf8>>,
-    ?assertEqual(true, size(Utf8Binary) > 5), % binary is larger due to utf8 encoding
-    ?assertEqual(true, text_smaller_than(Utf8Binary, 5)),
-    ?assertEqual(false, text_smaller_than(Utf8Binary, 4)),
-
-    %% Test illegal utf8 sequence, the chars are counted as normal chars
-    ?assertEqual(false, text_smaller_than(<<149,157,112,8>>, 4)),
+    %% text_smaller_than now works on UTF-32 binaries.
+    ?assertEqual(true,  text_smaller_than(to_utf32(<<>>), 5)),
+    ?assertEqual(true,  text_smaller_than(to_utf32(<<>>), 1)),
+    ?assertEqual(false, text_smaller_than(to_utf32(<<>>), 0)),
+    ?assertEqual(false, text_smaller_than(to_utf32(<<"abc">>), 0)),
+    ?assertEqual(false, text_smaller_than(to_utf32(<<"abc">>), 1)),
+    ?assertEqual(true,  text_smaller_than(to_utf32(<<"abc">>), 4)),
+
+    %% Multi-byte UTF-8 characters each become exactly 4 bytes in UTF-32.
+    Utf32 = to_utf32(<<1046/utf8, 1011/utf8, 1022/utf8, 127/utf8>>),
+    ?assertEqual(true,  text_smaller_than(Utf32, 5)),
+    ?assertEqual(false, text_smaller_than(Utf32, 4)),
 
     ok.
 
 lines_to_chars_test() ->
-    ?assertEqual({<<>>, <<>>, []}, lines_to_chars(<<>>, <<>>)),
-
-    %% Simple text
-    ?assertEqual({<<0, 1>>, <<0, 2>>, [<<"hello\n">>, <<"world\n">>, <<"maas\n">>]}, 
-        lines_to_chars(<<"hello\n\world\n">>, <<"hello\nmaas\n">>)),
-
-    %% No newline at the end.
-    ?assertEqual({<<0, 1>>, <<0, 2>>, [<<"hello\n">>, <<"world\n">>, <<"maas">>]}, 
-        lines_to_chars(<<"hello\n\world\n">>, <<"hello\nmaas">>)),
-   
-    %% No newline at the end.
-    ?assertEqual({<<0, 1>>, <<0, 2>>, [<<"hello\n">>, <<"world\n">>, <<"maas">>]}, 
-        lines_to_chars(<<"hello\n\world\n">>, <<"hello\nmaas">>)),
-    
-    %% With empty lines 
-    ?assertEqual({<<0, 1, 2>>, <<0, 1, 3>>, [<<"hello\n">>, <<"\n">>, <<"world\n">>, <<"maas">>]}, 
-        lines_to_chars(<<"hello\n\nworld\n">>, <<"hello\n\nmaas">>)),
+    %% lines_to_chars takes UTF-32 input, returns UTF-32 index sequences and UTF-32 lines.
+    {C1, C2, Lines} = lines_to_chars(to_utf32(<<>>), to_utf32(<<>>)),
+    ?assertEqual(<<>>, C1),
+    ?assertEqual(<<>>, C2),
+    ?assertEqual([], Lines),
+
+    {C3, C4, Lines2} = lines_to_chars(to_utf32(<<"hello\nworld\n">>), to_utf32(<<"hello\nmaas\n">>)),
+    %% Lines are stored as UTF-32 binaries.
+    ?assertEqual([to_utf32(<<"hello\n">>), to_utf32(<<"world\n">>), to_utf32(<<"maas\n">>)], Lines2),
+    ?assertEqual(<<0:32, 1:32>>, C3),
+    ?assertEqual(<<0:32, 2:32>>, C4),
 
     ok.
 
-
 diff_linemode_test() ->
     ?assertEqual([{equal, <<"hello\n">>}, {delete, <<"world\n">>}, {insert, <<"maas\n">>}], 
         diff_linemode(<<"hello\nworld\n">>, <<"hello\nmaas\n">>)),
 
     ok.
 
+diff_options_test() ->
+    A = <<"cat">>,
+    B = <<"map">>,
+
+    %% No options — same as diff/2.
+    ?assertEqual(diff(A, B), diff(A, B, [])),
+
+    %% no_linemode: result is structurally equivalent (same source/dest text).
+    NoLinemode = diff(A, B, [no_linemode]),
+    ?assertEqual(source_text(diff(A, B)),      source_text(NoLinemode)),
+    ?assertEqual(destination_text(diff(A, B)), destination_text(NoLinemode)),
+
+    %% semantic option applies cleanup_semantic to the raw diff.
+    ?assertEqual(cleanup_semantic(diff(A, B)), diff(A, B, [semantic])),
+
+    %% efficiency option applies cleanup_efficiency to the raw diff.
+    ?assertEqual(cleanup_efficiency(diff(A, B)), diff(A, B, [efficiency])),
+
+    %% {efficiency, Cost} applies cleanup_efficiency/2 with the given cost.
+    ?assertEqual(cleanup_efficiency(diff(A, B), 2), diff(A, B, [{efficiency, 2}])),
+
+    %% Both: semantic first, then efficiency.
+    ?assertEqual(
+        cleanup_efficiency(cleanup_semantic(diff(A, B))),
+        diff(A, B, [semantic, efficiency])),
+
+    %% Order of options in list does not affect cleanup order.
+    ?assertEqual(
+        diff(A, B, [semantic, efficiency]),
+        diff(A, B, [efficiency, semantic])),
+
+    ok.
+
 -endif.
diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl
index 694db66..06f9e86 100644
--- a/test/diffy_tests.erl
+++ b/test/diffy_tests.erl
@@ -77,7 +77,7 @@ html_like() ->
                                         {2, utf8(4)},              % Some small portions of unicode chars.
                                         {2, range($0, $9)},        % numbers
                                         {2, $\s},                  % whitespace
-                                        {4,  $\n},                 % linebreaks
+                                        {4, $\n},                 % linebreaks
                                         {2, oneof([$., $-, $!, $?, $,])}   % punctuation
                                        ]))).
 
@@ -314,7 +314,7 @@ text_size_test() ->
     ?assertEqual(4, diffy:text_size(<<1046/utf8, 1011/utf8, 1022/utf8, 127/utf8>>)),
 
     %% Bad utf-8 input results in a badarg.
-    ?assertError(badarg, diffy:text_size(<<149,157,112,8>>)),
+    ?assertError({badarg, _}, diffy:text_size(<<149,157,112,8>>)),
 
     ok.
 
@@ -355,6 +355,101 @@ diff_test() ->
                             <<"cat mouse dog ">>)),
     ok.
 
+  
+diff_linemode_corners_test() ->
+    %% Empty inputs.
+    ?assertEqual([], diffy:diff_linemode(<<>>, <<>>)),
+    ?assertEqual([{insert, <<"hello\n">>}], diffy:diff_linemode(<<>>, <<"hello\n">>)),
+    ?assertEqual([{delete, <<"hello\n">>}], diffy:diff_linemode(<<"hello\n">>, <<>>)),
+
+    %% Identical input — single equal op.
+    ?assertEqual([{equal, <<"hello\nworld\n">>}],
+        diffy:diff_linemode(<<"hello\nworld\n">>, <<"hello\nworld\n">>)),
+
+    %% No newline at end of file — last line treated as its own token.
+    ?assertEqual(
+        [{equal, <<"hello\n">>}, {delete, <<"world">>}, {insert, <<"maas">>}],
+        diffy:diff_linemode(<<"hello\nworld">>, <<"hello\nmaas">>)),
+
+    %% Blank lines — exercise is_blankline_start/end and the \n\n pattern.
+    %% The rediff within cleanup_line_diff splits b\n vs c\n at character level.
+    ?assertEqual(
+        [{equal, <<"a\n\n">>}, {delete, <<"b">>}, {insert, <<"c">>}, {equal, <<"\nd\n">>}],
+        diffy:diff_linemode(<<"a\n\nb\nd\n">>, <<"a\n\nc\nd\n">>)),
+
+    %% \r\n line endings — exercises the \r\n\r\n blankline pattern.
+    ?assertEqual(
+        [{equal, <<"hello\r\n">>}, {delete, <<"world\r\n">>}, {insert, <<"maas\r\n">>}],
+        diffy:diff_linemode(<<"hello\r\nworld\r\n">>, <<"hello\r\nmaas\r\n">>)),
+
+    %% Repeated lines — the same line appearing multiple times should reuse the same index.
+    ?assertEqual(
+        [{equal, <<"a\nb\na\n">>}, {insert, <<"b\n">>}],
+        diffy:diff_linemode(<<"a\nb\na\n">>, <<"a\nb\na\nb\n">>)),
+
+    %% Large enough to trigger linemode via compute_diff1 size threshold.
+    %% Build two texts that differ only in one line buried in > 100 chars of context.
+    Prefix = binary:copy(<<"padding line\n">>, 10),
+    Suffix = binary:copy(<<"trailing line\n">>, 10),
+    Text1 = <<Prefix/binary, "old line\n", Suffix/binary>>,
+    Text2 = <<Prefix/binary, "new line\n", Suffix/binary>>,
+    Diffs = diffy:diff(Text1, Text2),
+    %% Source and destination text must be preserved exactly.
+    ?assertEqual(Text1, diffy:source_text(Diffs)),
+    ?assertEqual(Text2, diffy:destination_text(Diffs)),
+    %% Must contain at least one delete and one insert — the changed line.
+    ?assert(lists:any(fun({delete, _}) -> true; (_) -> false end, Diffs)),
+    ?assert(lists:any(fun({insert, _}) -> true; (_) -> false end, Diffs)),
+
+    %% Multi-byte UTF-8 lines — verify encoding survives the linemode round-trip.
+    ?assertEqual(
+        [{equal, <<"héllo\n"/utf8>>}, {delete, <<"wörld\n"/utf8>>}, {insert, <<"wörlt\n"/utf8>>}],
+        diffy:diff_linemode(<<"héllo\nwörld\n"/utf8>>, <<"héllo\nwörlt\n"/utf8>>)),
+
+    %% cleanup_line_diff rediff path — two changed lines adjacent to an equal trigger
+    %% the rediff of accumulated delete+insert data.
+    T1 = <<"aaa\nbbb\nccc\n">>,
+    T2 = <<"aab\nbbc\nccc\n">>,
+    RediffDiffs = diffy:diff_linemode(T1, T2),
+    ?assertEqual(T1, diffy:source_text(RediffDiffs)),
+    ?assertEqual(T2, diffy:destination_text(RediffDiffs)),
+
+    ok.
+
+diff_options_test() ->
+    A = <<"one two x four five">>,
+    B = <<"one TWO x FOUR five">>,
+
+    %% No options — same as diff/2.
+    ?assertEqual(diffy:diff(A, B), diffy:diff(A, B, [])),
+
+    %% no_linemode: result is structurally equivalent (same source/dest text).
+    NoLinemode = diffy:diff(A, B, [no_linemode]),
+    ?assertEqual(diffy:source_text(diffy:diff(A, B)), diffy:source_text(NoLinemode)),
+    ?assertEqual(diffy:destination_text(diffy:diff(A, B)), diffy:destination_text(NoLinemode)),
+
+    %% semantic option applies cleanup_semantic to the raw diff.
+    ?assertEqual(diffy:cleanup_semantic(diffy:diff(A, B)), diffy:diff(A, B, [semantic])),
+
+    %% efficiency option applies cleanup_efficiency to the raw diff.
+    ?assertEqual(diffy:cleanup_efficiency(diffy:diff(A, B)), diffy:diff(A, B, [efficiency])),
+
+    %% {efficiency, Cost} applies cleanup_efficiency/2 with the given cost.
+    ?assertEqual(diffy:cleanup_efficiency(diffy:diff(A, B), 2), diffy:diff(A, B, [{efficiency, 2}])),
+
+    %% Both: semantic first, then efficiency.
+    ?assertEqual(
+        diffy:cleanup_efficiency(diffy:cleanup_semantic(diffy:diff(A, B))),
+        diffy:diff(A, B, [semantic, efficiency])),
+
+    %% Order of options in list does not affect cleanup order.
+    ?assertEqual(
+        diffy:diff(A, B, [semantic, efficiency]),
+        diffy:diff(A, B, [efficiency, semantic])),
+
+    ok.
+
+
 
 %%
 %% Helpers

From 99e22aa885981e5a24e4abe382ac0707565698e3 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Fri, 10 Apr 2026 09:03:38 +0200
Subject: [PATCH 20/47] Reuse long and short size

---
 src/diffy.erl | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index 57d04ce..312feab 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -196,20 +196,23 @@ try_half_match(OldText, NewText, CheckLines) ->
 %% Check if we can do a half-match diff, returns undefined if it is not advantageous.
 %% Operates on UTF-32 binaries — size comparisons are in bytes (4 bytes per codepoint).
 half_match(A, B) ->
-    AGtB = size(A) > size(B),
-    {Short, Long} = case AGtB of
-        true -> {B, A};
-        false -> {A, B}
-    end,
+    AgtB = size(A) > size(B),
+    {Short, Long} = case AgtB of
+                        true -> {B, A};
+                        false -> {A, B}
+                    end,
+
+    LongSize = size(Long),
+    ShortSize = size(Short),
 
     %% text_smaller_than(Long, 4) becomes size(Long) < 4*4 in UTF-32.
-    case size(Long) < 16 orelse size(Short) * 2 < size(Long) of
+    case LongSize < 16 orelse ShortSize * 2 < LongSize of
         true ->
             %% No point in looking.
             undefined;
         false ->
-            Hm1 = half_match_i(Long, Short, (size(Long) + 3) div 4),
-            Hm2 = half_match_i(Long, Short, (size(Long) + 1) div 2),
+            Hm1 = half_match_i(Long, Short, (LongSize + 3) div 4),
+            Hm2 = half_match_i(Long, Short, (LongSize + 1) div 2),
 
             %% Select the longest half-match.
             Hm = case {Hm1, Hm2} of
@@ -229,7 +232,7 @@ half_match(A, B) ->
             case Hm of
                 undefined -> undefined;
                 {half_match, T1A, T1B, T2A, T2B, MidCommon} ->
-                    case AGtB of
+                    case AgtB of
                         true -> Hm;
                         false ->
                             {half_match, T2A, T2B, T1A, T1B, MidCommon}

From 4d1df305164c422bfcaaf29781d903ae4d2362e3 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Fri, 10 Apr 2026 09:45:15 +0200
Subject: [PATCH 21/47] Using phash2 for keys in the index map can lead to
 collisions which can be better handled by the low level map implementation

---
 src/diffy.erl | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index 312feab..70d1afc 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -396,14 +396,11 @@ lines_to_chars(Text, Idx, CharText, NextChar, Lines, Map) ->
 
 
 insert_line(Line, Lines, Map, NextChar) ->
-    Hash = erlang:phash2(Line, ?PHASH2_RANGE),
     case Map of
-        %% Hash hit — verify the stored line matches to guard against collisions.
-        #{Hash := {Char, Line}} ->
+        #{Line := Char} ->
             {Char, NextChar, Lines, Map};
-        %% Hash miss or collision with a different line — assign a new index.
         _ ->
-            {NextChar, NextChar + 1, [Line | Lines], Map#{Hash => {NextChar, Line}}}
+            {NextChar, NextChar + 1, [Line | Lines], Map#{Line => NextChar}}
     end.
 
 decode_lines(Diffs, Lines) when is_list(Lines) ->

From a311a71a1039d379e29729000326ce097f5d1c00 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Fri, 10 Apr 2026 14:49:02 +0200
Subject: [PATCH 22/47] Fix utf32 size boundary problem:

---
 src/diffy.erl | 43 +++++++++++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index 70d1afc..b952cf2 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -211,8 +211,11 @@ half_match(A, B) ->
             %% No point in looking.
             undefined;
         false ->
-            Hm1 = half_match_i(Long, Short, (LongSize + 3) div 4),
-            Hm2 = half_match_i(Long, Short, (LongSize + 1) div 2),
+            %% Seed positions are quarter-way and half-way through Long,
+            %% expressed as byte offsets (codepoints * 4).
+            LongLen = LongSize div 4,  %% codepoint count
+            Hm1 = half_match_i(Long, Short, ((LongLen + 3) div 4) * 4),
+            Hm2 = half_match_i(Long, Short, ((LongLen + 1) div 2) * 4),
 
             %% Select the longest half-match.
             Hm = case {Hm1, Hm2} of
@@ -298,16 +301,12 @@ next_char(_Bin, Pos) ->
     Pos + 4.
 
 %%
-%% In UTF-32 every codepoint is exactly 4 bytes, so any 4-byte-aligned slice
-%% is a valid codepoint boundary — no repair_head/repair_tail needed.
+%% In UTF-32 every codepoint is exactly 4 bytes. Start is always a 4-byte-aligned
+%% byte offset, so no alignment step is needed.
 seed(Long, Start) ->
     SeedSize = size(Long) div 4,
-
-    %% Align Start to a 4-byte (codepoint) boundary.
-    AlignedStart = (Start div 4) * 4,
-    <<_Pre:AlignedStart/binary, Seed:SeedSize/binary, _Post/binary>> = Long,
-
-    {AlignedStart, Seed}.
+    <<_Pre:Start/binary, Seed:SeedSize/binary, _Post/binary>> = Long,
+    {Start, Seed}.
 
 
 %% Line diff
@@ -1285,12 +1284,10 @@ half_match_utf8(A, B) ->
     end.
 
 half_match_test() ->
-    ?assertEqual(undefined, half_match_utf8(<<"1234567890">>, <<"abcdef">>)),
-    ?assertEqual(undefined, half_match_utf8(<<"12345">>, <<"23">>)),
+    ?assertEqual(undefined, half_match_utf8(<<"1234567890">>, <<"abcdef">>)), ?assertEqual(undefined, half_match_utf8(<<"12345">>, <<"23">>)),
 
     %% Single Match
-    ?assertEqual({half_match, <<"12">>, <<"90">>, <<"a">>, <<"z">>, <<"345678">>}, 
-        half_match_utf8(<<"1234567890">>, <<"a345678z">>)),
+    ?assertEqual({half_match, <<"12">>, <<"90">>, <<"a">>, <<"z">>, <<"345678">>}, half_match_utf8(<<"1234567890">>, <<"a345678z">>)),
     ?assertEqual({half_match, <<"a">>, <<"z">>, <<"12">>, <<"90">>, <<"345678">>}, 
         half_match_utf8(<<"a345678z">>, <<"1234567890">>)),
     ?assertEqual({half_match, <<"abc">>, <<"z">>, <<"1234">>, <<"0">>, <<"56789">>}, 
@@ -1311,6 +1308,24 @@ half_match_test() ->
     ?assertEqual({half_match, <<"qHillo">>, <<"w">>, <<"x">>, <<"Hulloy">>, <<"HelloHe">>}, 
         half_match_utf8(<<"qHilloHelloHew">>, <<"xHelloHeHulloy">>)),
 
+    ?assertEqual({half_match, <<"qHillo"/utf8>>, <<"w"/utf8>>, <<"x"/utf8>>, <<"eHull💯y"/utf8>>, <<"🐶🐱🐭🐹🐰H❤️"/utf8>>}, 
+        half_match_utf8(<<"qHillo🐶🐱🐭🐹🐰H❤️w"/utf8>>, <<"x🐶🐱🐭🐹🐰H❤️eHull💯y"/utf8>>)),
+
+    %% Unicode: é is 2 UTF-8 bytes but 1 codepoint (4 UTF-32 bytes).
+    %% With the old bug, size(Long) div 4 gave the wrong seed position
+    %% because byte_size in UTF-32 ≠ codepoint_count for multi-byte UTF-8 chars.
+    %% Long = éééééééééé (10 chars), Short = a + éééééééé + z (10 chars).
+    %% half_match should find the 8-char common section of é's.
+    E = <<233/utf8>>,
+    ULong = binary:copy(E, 10),
+    UShort = <<"a", (binary:copy(E, 8))/binary, "z">>,
+    UDiff = diff(ULong, UShort),
+    ?assertEqual(ULong, source_text(UDiff)),
+    ?assertEqual(UShort, destination_text(UDiff)),
+    %% The 8-char run of é must appear as a single equal op.
+    Equal8 = binary:copy(E, 8),
+    ?assert(lists:member({equal, Equal8}, UDiff)),
+
     ok.
 
 %% common_prefix/suffix operate on UTF-32; wrap for testing.

From 9b09fb8988fe2b53860dcf7a7dde6976d62beb33 Mon Sep 17 00:00:00 2001
From: MM Zeeman <mmzeeman@xs4all.nl>
Date: Fri, 10 Apr 2026 15:49:37 +0200
Subject: [PATCH 23/47] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 src/diffy.erl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index b952cf2..282f3cd 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -304,7 +304,9 @@ next_char(_Bin, Pos) ->
 %% In UTF-32 every codepoint is exactly 4 bytes. Start is always a 4-byte-aligned
 %% byte offset, so no alignment step is needed.
 seed(Long, Start) ->
-    SeedSize = size(Long) div 4,
+    TotalCodepoints = size(Long) div 4,
+    SeedCodepoints = TotalCodepoints div 4,
+    SeedSize = SeedCodepoints * 4,
     <<_Pre:Start/binary, Seed:SeedSize/binary, _Post/binary>> = Long,
     {Start, Seed}.
 

From e2815378130327f77ba41359d30a62cc741000c9 Mon Sep 17 00:00:00 2001
From: MM Zeeman <mmzeeman@xs4all.nl>
Date: Fri, 10 Apr 2026 15:52:38 +0200
Subject: [PATCH 24/47] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 src/diffy.erl | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index 282f3cd..af32fe6 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -164,7 +164,7 @@ compute_diff(OldText, NewText, CheckLines) ->
                                 false -> {NewText, OldText}
                             end,
 
-    case binary:match(LongText, ShortText) of
+    case aligned_utf32_match(LongText, ShortText, 0) of
         {Start, Length} ->
             <<Pre:Start/binary, _:Length/binary, Suf/binary>> = LongText,
             Op = diff_op(OldStNew),
@@ -255,7 +255,7 @@ half_match_i(Long, Short, I) ->
 best_common(Long, Short, Seed, SeedLoc, Start, 
         BestLongA, BestLongB, BestShortA, BestShortB, BestCommon) ->
     %% Check if we can find a match for Seed2 inside the shorttext.
-    case binary:match(Short, Seed, [{scope, {Start, size(Short)-Start}}]) of
+    case aligned_utf32_match(Short, Seed, Start) of
         nomatch -> 
             case size(BestCommon) * 2 >= size(Long) of
                 false -> 
@@ -296,6 +296,29 @@ best_common(Long, Short, Seed, SeedLoc, Start,
             end
     end.
 
+%% @doc Round a byte offset up to the next UTF-32 codepoint boundary.
+align_utf32_offset(Offset) when Offset rem 4 =:= 0 ->
+    Offset;
+align_utf32_offset(Offset) ->
+    Offset + (4 - (Offset rem 4)).
+
+%% @doc Find a match whose start offset is aligned to a UTF-32 codepoint boundary.
+aligned_utf32_match(Bin, Pattern, Start) ->
+    AlignedStart = align_utf32_offset(Start),
+    case AlignedStart >= size(Bin) of
+        true ->
+            nomatch;
+        false ->
+            case binary:match(Bin, Pattern, [{scope, {AlignedStart, size(Bin) - AlignedStart}}]) of
+                nomatch ->
+                    nomatch;
+                {MatchStart, Length} when MatchStart rem 4 =:= 0 ->
+                    {MatchStart, Length};
+                {MatchStart, _Length} ->
+                    aligned_utf32_match(Bin, Pattern, MatchStart + 1)
+            end
+    end.
+
 %% @doc Return the byte position of the next codepoint in a UTF-32 binary.
 next_char(_Bin, Pos) ->
     Pos + 4.

From 0f049ea102b1f2087ddd86bdb62c4d470f241890 Mon Sep 17 00:00:00 2001
From: MM Zeeman <mmzeeman@xs4all.nl>
Date: Fri, 10 Apr 2026 15:53:18 +0200
Subject: [PATCH 25/47] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 src/diffy.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index af32fe6..c32b2f0 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -1219,7 +1219,7 @@ common_suffix(Text1, Text2) ->
 % @deprecated Use text_size32/1 internally. This public function may be removed in a future version.
 -spec text_size(unicode:unicode_binary()) -> non_neg_integer().
 text_size(Text) when is_binary(Text) ->
-    string:length(Text).
+    byte_size(to_utf32(Text)) div 4.
 
 % @doc Count the number of codepoints in a UTF-32 binary. O(1).
 text_size32(Text) when is_binary(Text) ->

From 7fc3b2328418c2016efbc4986b0594bf54c87e1f Mon Sep 17 00:00:00 2001
From: MM Zeeman <mmzeeman@xs4all.nl>
Date: Fri, 10 Apr 2026 15:56:57 +0200
Subject: [PATCH 26/47] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 src/diffy.erl | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index c32b2f0..f34b6fd 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -1237,11 +1237,25 @@ text_smaller_than(Text, Size) ->
 
 % @doc Convert a UTF-8 binary to UTF-32, crashing on invalid input.
 to_utf32(Bin) ->
-    <<_/binary>> = unicode:characters_to_binary(Bin, utf8, utf32).
+    case unicode:characters_to_binary(Bin, utf8, utf32) of
+        Out when is_binary(Out) ->
+            Out;
+        {error, _, _} ->
+            error(badarg);
+        {incomplete, _, _} ->
+            error(badarg)
+    end.
 
 % @doc Convert a UTF-32 binary to UTF-8, crashing on invalid input.
 to_utf8(Bin) ->
-    <<_/binary>> = unicode:characters_to_binary(Bin, utf32, utf8).
+    case unicode:characters_to_binary(Bin, utf32, utf8) of
+        Out when is_binary(Out) ->
+            Out;
+        {error, _, _} ->
+            error(badarg);
+        {incomplete, _, _} ->
+            error(badarg)
+    end.
 
 %%
 %% Tests

From 001b75c9c796fe7e331701fb61a6316968a9347f Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Fri, 10 Apr 2026 15:57:54 +0200
Subject: [PATCH 27/47] Fix text_size test

---
 test/diffy_tests.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl
index 06f9e86..a642f2c 100644
--- a/test/diffy_tests.erl
+++ b/test/diffy_tests.erl
@@ -314,7 +314,7 @@ text_size_test() ->
     ?assertEqual(4, diffy:text_size(<<1046/utf8, 1011/utf8, 1022/utf8, 127/utf8>>)),
 
     %% Bad utf-8 input results in a badarg.
-    ?assertError({badarg, _}, diffy:text_size(<<149,157,112,8>>)),
+    ?assertError(badarg, diffy:text_size(<<149,157,112,8>>)),
 
     ok.
 

From b1033019881b91401e5c1119598486444889e4e3 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 10 Apr 2026 14:02:14 +0000
Subject: [PATCH 28/47] Initial plan


From 356304437c061ce2c394a9c6e3029bbef4fddcfd Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 10 Apr 2026 14:04:59 +0000
Subject: [PATCH 29/47] Add seed_test/0 EUnit tests for UTF-32 alignment
 invariants

Agent-Logs-Url: https://github.com/zotonic/diffy/sessions/2c2ca16d-02fc-4d4c-b55a-570d0b9ac07e

Co-authored-by: mmzeeman <1024972+mmzeeman@users.noreply.github.com>
---
 src/diffy.erl | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/src/diffy.erl b/src/diffy.erl
index f34b6fd..441584d 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -1486,4 +1486,65 @@ diff_options_test() ->
 
     ok.
 
+seed_test() ->
+    %% 1. Empty binary: no codepoints, seed is empty.
+    ?assertEqual({0, <<>>}, seed(<<>>, 0)),
+
+    %% 2. Binary shorter than 4 codepoints (3 codepoints): 3 div 4 = 0, seed is empty.
+    Short3 = to_utf32(<<"abc">>),
+    ?assertEqual({0, <<>>}, seed(Short3, 0)),
+
+    %% 3. Exactly 4 codepoints, Start=0: seed is 1 codepoint (the first one).
+    Exact4 = to_utf32(<<"abcd">>),
+    ?assertEqual({0, to_utf32(<<"a">>)}, seed(Exact4, 0)),
+
+    %% 4. 8 codepoints, Start=0: seed is 2 codepoints starting at offset 0.
+    Long8 = to_utf32(<<"12345678">>),
+    ?assertEqual({0, to_utf32(<<"12">>)}, seed(Long8, 0)),
+
+    %% 5. 16 codepoints, Start=8 (byte offset = 2 codepoints in):
+    %%    seed is 4 codepoints; returned Start equals 8 and seed bytes are the correct slice.
+    Long16 = to_utf32(<<"abcdefghijklmnop">>),
+    {S5, Seed5} = seed(Long16, 8),
+    ?assertEqual(8, S5),
+    ?assertEqual(to_utf32(<<"cdef">>), Seed5),
+
+    %% 6. ASCII text round-trip: "1234567890" (10 chars), seed at quarter-way offset.
+    Ascii10 = to_utf32(<<"1234567890">>),
+    %% TotalCodepoints=10, SeedCodepoints=2; Start=0 (quarter-way = 0 for simplicity).
+    {_, SeedAscii} = seed(Ascii10, 0),
+    ?assertEqual(<<"12">>, to_utf8(SeedAscii)),
+
+    %% 7. Multi-byte codepoint alignment: 10 Greek letters (2 UTF-8 bytes each, 4 UTF-32 bytes each).
+    Greek10 = to_utf32(<<"αβγδεζηθικ"/utf8>>),
+    {Start7, Seed7} = seed(Greek10, 0),
+    %% Returned Start is 0.
+    ?assertEqual(0, Start7),
+    %% Seed is 4-byte-aligned.
+    ?assertEqual(0, byte_size(Seed7) rem 4),
+    %% Seed length = (10 div 4) * 4 = 8 bytes = 2 codepoints.
+    ?assertEqual((10 div 4) * 4, byte_size(Seed7)),
+    %% Seed decodes back to the first 2 Greek letters.
+    ?assertEqual(<<"αβ"/utf8>>, to_utf8(Seed7)),
+
+    %% 8. Emoji (4-byte UTF-8 codepoints): 10 emoji, seed is first 2.
+    Emoji10 = to_utf32(<<"🐶🐱🐭🐹🐰🐨🐯🦁🐮🐷"/utf8>>),
+    {_, SeedEmoji} = seed(Emoji10, 0),
+    %% Seed length = (10 div 4) * 4 = 8 bytes = 2 codepoints.
+    ?assertEqual((10 div 4) * 4, byte_size(SeedEmoji)),
+    %% Seed decodes back to the first 2 emoji.
+    ?assertEqual(<<"🐶🐱"/utf8>>, to_utf8(SeedEmoji)),
+
+    %% 9. Seed start offset preserved: non-zero Start is returned unchanged.
+    Long12 = to_utf32(<<"abcdefghijkl">>),
+    {Start9, _} = seed(Long12, 8),
+    ?assertEqual(8, Start9),
+
+    %% 10. Seed is a contiguous slice of Long: binary:part(Long, Start, byte_size(Seed)) =:= Seed.
+    Long20 = to_utf32(<<"abcdefghijklmnopqrst">>),
+    {Start10, Seed10} = seed(Long20, 8),
+    ?assertEqual(Seed10, binary:part(Long20, Start10, byte_size(Seed10))),
+
+    ok.
+
 -endif.

From 83179221c27db91fca9bbb164548eed7605990ba Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 10 Apr 2026 14:05:36 +0000
Subject: [PATCH 30/47] Fix misleading comments in seed_test/0

Agent-Logs-Url: https://github.com/zotonic/diffy/sessions/2c2ca16d-02fc-4d4c-b55a-570d0b9ac07e

Co-authored-by: mmzeeman <1024972+mmzeeman@users.noreply.github.com>
---
 src/diffy.erl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index 441584d..33817fe 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -1502,7 +1502,7 @@ seed_test() ->
     Long8 = to_utf32(<<"12345678">>),
     ?assertEqual({0, to_utf32(<<"12">>)}, seed(Long8, 0)),
 
-    %% 5. 16 codepoints, Start=8 (byte offset = 2 codepoints in):
+    %% 5. 16 codepoints, Start=8 (8 bytes = 2 codepoints * 4 bytes/codepoint):
     %%    seed is 4 codepoints; returned Start equals 8 and seed bytes are the correct slice.
     Long16 = to_utf32(<<"abcdefghijklmnop">>),
     {S5, Seed5} = seed(Long16, 8),
@@ -1511,7 +1511,7 @@ seed_test() ->
 
     %% 6. ASCII text round-trip: "1234567890" (10 chars), seed at quarter-way offset.
     Ascii10 = to_utf32(<<"1234567890">>),
-    %% TotalCodepoints=10, SeedCodepoints=2; Start=0 (quarter-way = 0 for simplicity).
+    %% TotalCodepoints=10, SeedCodepoints=2; Start=0 to keep the offset 4-byte-aligned.
     {_, SeedAscii} = seed(Ascii10, 0),
     ?assertEqual(<<"12">>, to_utf8(SeedAscii)),
 

From 3fbdd22c4658dfb58c9961a899ae6a956492a90f Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Fri, 10 Apr 2026 22:08:03 +0200
Subject: [PATCH 31/47] Fix the aligned_utf32_match function and added tests

---
 src/diffy.erl | 54 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 41 insertions(+), 13 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index 33817fe..b088681 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -65,7 +65,7 @@
 
 -define(PATCH_MARGIN, 4).
 -define(IS_INS_OR_DEL(Op), (Op =:= insert orelse Op =:= delete)).
--define(PHASH2_RANGE, (1 bsl 32)).
+-define(IS_UTF32_ALIGNED(Offset), (Offset rem 4 =:= 0)).
 
 -record(bisect_state, {
     k1start = 0, k1end = 0,
@@ -296,26 +296,24 @@ best_common(Long, Short, Seed, SeedLoc, Start,
             end
     end.
 
-%% @doc Round a byte offset up to the next UTF-32 codepoint boundary.
-align_utf32_offset(Offset) when Offset rem 4 =:= 0 ->
-    Offset;
-align_utf32_offset(Offset) ->
-    Offset + (4 - (Offset rem 4)).
-
 %% @doc Find a match whose start offset is aligned to a UTF-32 codepoint boundary.
-aligned_utf32_match(Bin, Pattern, Start) ->
-    AlignedStart = align_utf32_offset(Start),
-    case AlignedStart >= size(Bin) of
+aligned_utf32_match(Bin, Pattern, Start)
+  when ?IS_UTF32_ALIGNED(Start) andalso Start >= 0 ->
+    case Start + size(Pattern) > size(Bin) of
         true ->
             nomatch;
         false ->
-            case binary:match(Bin, Pattern, [{scope, {AlignedStart, size(Bin) - AlignedStart}}]) of
+            case binary:match(Bin, Pattern, [{scope, {Start, size(Bin) - Start}}]) of
                 nomatch ->
                     nomatch;
-                {MatchStart, Length} when MatchStart rem 4 =:= 0 ->
+                {MatchStart, Length} when ?IS_UTF32_ALIGNED(MatchStart) ->
+                    %% Match found, and it is correctly aligned.
                     {MatchStart, Length};
                 {MatchStart, _Length} ->
-                    aligned_utf32_match(Bin, Pattern, MatchStart + 1)
+                    %% Misaligned hit. binary:match found the first byte-level match,
+                    %% so there is no aligned match before MatchStart. Skip directly
+                    %% to the next aligned boundary after MatchStart.
+                    aligned_utf32_match(Bin, Pattern, MatchStart + (4 - MatchStart rem 4))
             end
     end.
 
@@ -1547,4 +1545,34 @@ seed_test() ->
 
     ok.
 
+aligned_utf32_match_test() ->
+    ?assertEqual(nomatch, aligned_utf32_match(<<>>, <<0,0,0,0>>, 0)),
+    ?assertEqual(nomatch, aligned_utf32_match(<<>>, <<0,0,0,0>>, 4)),
+
+    ?assertError(function_clause, aligned_utf32_match(<<>>, <<0,0,0,0>>, 3)),
+    ?assertError(function_clause, aligned_utf32_match(<<>>, <<0,0,0,0>>, -4)),
+
+    ?assertEqual({0, 4}, aligned_utf32_match(<<1,2,3,4>>, <<1,2,3,4>>, 0)),
+    ?assertEqual({4, 4}, aligned_utf32_match(<<0,0,0,0, 1,2,3,4>>, <<1,2,3,4>>, 0)),
+
+    %% These will binary match, but the match is not on a utf32 boundary
+    ?assertEqual(nomatch, aligned_utf32_match(<<0,0,1,2, 3,4,5,6>>, <<1,2,3,4>>, 0)),
+    ?assertEqual({8,4}, aligned_utf32_match(<<0,0,1,2, 3,4,5,6, 1,2,3,4>>, <<1,2,3,4>>, 0)),
+    ?assertEqual({8,4}, aligned_utf32_match(<<0,0,1,2, 3,4,5,6, 1,2,3,4>>, <<1,2,3,4>>, 4)),
+    ?assertEqual(nomatch, aligned_utf32_match(<<0,0,1,2, 3,4,5,1, 2,3,4,0>>, <<1,2,3,4>>, 4)),
+
+    %% Some longer matches
+    ?assertEqual({40, 20}, aligned_utf32_match(to_utf32(<<"the quick brown fox jumps over the lazy dog"/utf8>>),
+                                               to_utf32(<<"brown"/utf8>>), 0)),
+    ?assertEqual(nomatch, aligned_utf32_match(to_utf32(<<"the quick brown fox jumps over the lazy dog"/utf8>>),
+                                              to_utf32(<<"blue"/utf8>>), 0)),
+
+    %% All emoticon matches emoticons
+    ?assertEqual(nomatch, aligned_utf32_match(to_utf32(<<"😔😟😕🙁☹️😣😖😫😩🥺🥶"/utf8>>),
+                                              to_utf32(<<"💩"/utf8>>), 0)),
+    ?assertEqual({16,12}, aligned_utf32_match(to_utf32(<<"😔😟😕🙁☹️💩😣😖😫😩🥺🥶"/utf8>>),
+                                              to_utf32(<<"☹️💩"/utf8>>), 0)),
+
+    ok.
+
 -endif.

From e19afc27fa552b84d0a7c084381a370ca33074a7 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Fri, 10 Apr 2026 23:08:29 +0200
Subject: [PATCH 32/47] Fix an issue where the overlap could match outside
 utf32 char boundaries

---
 src/diffy.erl        | 33 ++++++++++++++++++++++++---------
 test/diffy_tests.erl |  1 +
 2 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index b088681..2734ce6 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -911,10 +911,13 @@ common_overlap(Text1, Text2) ->
     T1Len = text_size32(Text1),
     T2Len = text_size32(Text2),
     {T1, T2, TMin} = if
-        T1Len > T2Len -> {substring_end(Text1, T2Len), Text2, T2Len};
-        T1Len < T2Len -> {Text1, substring_start(Text2, T1Len), T1Len};
-        true -> {Text1, Text2, T1Len}
-    end,
+                         T1Len > T2Len ->
+                             {substring_end(Text1, T2Len), Text2, T2Len};
+                         T1Len < T2Len ->
+                             {Text1, substring_start(Text2, T1Len), T1Len};
+                         true ->
+                             {Text1, Text2, T1Len}
+                     end,
     case T1 =:= T2 of
         true -> TMin;
         false -> common_overlap_loop(T1, T2, TMin, 0, 1)
@@ -922,18 +925,18 @@ common_overlap(Text1, Text2) ->
 
 common_overlap_loop(T1, T2, TMin, Best, Length) when Length =< TMin ->
     Pattern = substring_end(T1, Length),
-    case binary:match(T2, Pattern) of
+    case aligned_utf32_match(T2, Pattern, 0) of
         nomatch -> Best;
         {FoundByteOffset, _} ->
             %% In UTF-32, byte offset maps directly to codepoint count.
             FoundCharCount = FoundByteOffset div 4,
             NewLength = Length + FoundCharCount,
-            if
-                NewLength > TMin -> Best;
-                true ->
+            case NewLength > TMin of
+                true -> Best;
+                false ->
                     case substring_end(T1, NewLength) =:= substring_start(T2, NewLength) of
                         true ->
-                            common_overlap_loop(T1, T2, TMin, NewLength, NewLength + 1);
+                           common_overlap_loop(T1, T2, TMin, NewLength, NewLength + 1);
                         false ->
                             common_overlap_loop(T1, T2, TMin, Best, NewLength + 1)
                     end
@@ -1575,4 +1578,16 @@ aligned_utf32_match_test() ->
 
     ok.
 
+common_overlap_loop_test() ->
+    Abc = to_utf32(<<"abc">>),
+    Cde = to_utf32(<<"cde">>),
+    ?assertEqual(1, common_overlap_loop(Abc, Cde, size(Cde), 0, 1)),
+
+    Abcdef = to_utf32(<<"abcdef">>),
+    Efde = to_utf32(<<"efde">>),
+    ?assertEqual(2, common_overlap_loop(Abcdef, Efde, size(Cde), 0, 1)),
+
+    ok.
+
+
 -endif.
diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl
index a642f2c..0e1980e 100644
--- a/test/diffy_tests.erl
+++ b/test/diffy_tests.erl
@@ -267,6 +267,7 @@ cleanup_semantic_test() ->
     ?assertEqual(diffy:destination_text(Diffs), diffy:destination_text(Cleaned)),
 
     ok.
+
 cleanup_efficiency_prop_test() ->
     ?assertEqual(true, proper:quickcheck(prop_cleanup_efficiency(), [{numtests, ?NUM_TESTS}, {to_file, user}])),
     ok.

From 953afd4d4a7284971749e23859e5855f54ac091a Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Sat, 11 Apr 2026 00:13:32 +0200
Subject: [PATCH 33/47] Fix an issue where the overlap could match outside
 utf32 char boundaries

---
 src/diffy.erl | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index 2734ce6..6498ac2 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -926,7 +926,8 @@ common_overlap(Text1, Text2) ->
 common_overlap_loop(T1, T2, TMin, Best, Length) when Length =< TMin ->
     Pattern = substring_end(T1, Length),
     case aligned_utf32_match(T2, Pattern, 0) of
-        nomatch -> Best;
+        nomatch ->
+            Best;
         {FoundByteOffset, _} ->
             %% In UTF-32, byte offset maps directly to codepoint count.
             FoundCharCount = FoundByteOffset div 4,
@@ -936,7 +937,7 @@ common_overlap_loop(T1, T2, TMin, Best, Length) when Length =< TMin ->
                 false ->
                     case substring_end(T1, NewLength) =:= substring_start(T2, NewLength) of
                         true ->
-                           common_overlap_loop(T1, T2, TMin, NewLength, NewLength + 1);
+                            common_overlap_loop(T1, T2, TMin, NewLength, NewLength + 1);
                         false ->
                             common_overlap_loop(T1, T2, TMin, Best, NewLength + 1)
                     end
@@ -1553,7 +1554,6 @@ aligned_utf32_match_test() ->
     ?assertEqual(nomatch, aligned_utf32_match(<<>>, <<0,0,0,0>>, 4)),
 
     ?assertError(function_clause, aligned_utf32_match(<<>>, <<0,0,0,0>>, 3)),
-    ?assertError(function_clause, aligned_utf32_match(<<>>, <<0,0,0,0>>, -4)),
 
     ?assertEqual({0, 4}, aligned_utf32_match(<<1,2,3,4>>, <<1,2,3,4>>, 0)),
     ?assertEqual({4, 4}, aligned_utf32_match(<<0,0,0,0, 1,2,3,4>>, <<1,2,3,4>>, 0)),
@@ -1578,6 +1578,15 @@ aligned_utf32_match_test() ->
 
     ok.
 
+common_overlap_test() ->
+    A = to_utf32(<<"Fire at Will">>),
+    B = to_utf32(<<"William Riker is number one">>),
+
+    ?assertEqual(4, common_overlap(A, B)),
+
+    ok.
+
+
 common_overlap_loop_test() ->
     Abc = to_utf32(<<"abc">>),
     Cde = to_utf32(<<"cde">>),

From d760a29682aa33871158fb8693ffb77be9c4e59a Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Sat, 11 Apr 2026 15:25:07 +0200
Subject: [PATCH 34/47] Removed unique match function, not relevant for the api
 and confusing

---
 src/diffy.erl | 28 +---------------------------
 1 file changed, 1 insertion(+), 27 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index 6498ac2..68f03a8 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -45,8 +45,7 @@
 
     text_size/1,
 
-    split_pre_and_suffix/2,
-    unique_match/2
+    split_pre_and_suffix/2
 ]).
 
 -type diff_op() :: delete | equal | insert.
@@ -1123,24 +1122,6 @@ make_patch([{equal, Data}|T], PrePatchText, PostPatchText, Count1, Count2, [Patc
         
     make_patch(T, PrePatchText, PostPatchText, Count1+Size, Count2+Size, [P|Rest]).
 
-    
-% @doc Returns true iff Pattern is a unique match inside Text.
-unique_match(Pattern, Text) ->
-    TextSize = size(Text),
-    case binary:match(Text, Pattern) of
-        nomatch -> 
-            error(nomatch);
-        {Start, Length} when Start + 1 + Length < TextSize ->
-            %% We have a match, and we can search..
-            case binary:match(Text, Pattern, [{scope, {Start+1, TextSize-Start-1}}]) of
-                nomatch -> true;
-                {_, _} -> false
-            end;
-        {_, _} ->
-            true
-    end.
-
-
 %%
 %% Helpers
 %%
@@ -1411,13 +1392,6 @@ split_pre_and_suffix_test() ->
                  Split(<<"cat ">>, <<"cat mouse dog ">>)),
     ok.
 
-unique_match_test() ->
-    ?assertEqual(true, unique_match(<<"a">>, <<"abc">>)),
-    ?assertEqual(true, unique_match(<<"b">>, <<"abc">>)),
-    ?assertEqual(true, unique_match(<<"c">>, <<"abc">>)),
-    ?assertEqual(false, unique_match(<<"ab">>, <<"abab">>)),
-    ok.
-
 text_smaller_than_test() ->
     %% text_smaller_than now works on UTF-32 binaries.
     ?assertEqual(true,  text_smaller_than(to_utf32(<<>>), 5)),

From b462e21969b5244b5800a4a6fe8bf0209f9e9ac3 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Sat, 11 Apr 2026 21:47:22 +0200
Subject: [PATCH 35/47] Use aligned utf32 match to do line_diff. Also removed
 unneeded conversion from an to utf8

---
 src/diffy.erl | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index 68f03a8..07756b9 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -390,32 +390,28 @@ cleanup_line_diff([{equal, _}=E|Rest], DeleteData, InsertData, _TmpAcc, Acc) ->
 %% Text1 and Text2 are UTF-32 binaries. Lines are stored as UTF-32 binaries.
 %% CharText1/CharText2 are UTF-32 binaries where each 4-byte word is a line index.
 lines_to_chars(Text1, Text2) ->
-    Utf8Text1 = to_utf8(Text1),
-    Utf8Text2 = to_utf8(Text2),
-    {CharText1, NextChar, Lines1, Map1} = lines_to_chars(Utf8Text1, 0, <<>>, 0, [], #{}),
-    {CharText2, _, Lines2, _Map2} = lines_to_chars(Utf8Text2, 0, <<>>, NextChar, Lines1, Map1),
-
+    {CharText1, NextChar, Lines1, Map1} = lines_to_chars(Text1, 0, <<>>, 0, [], #{}),
+    {CharText2, _, Lines2, _Map2} = lines_to_chars(Text2, 0, <<>>, NextChar, Lines1, Map1),
     {CharText1, CharText2, lists:reverse(Lines2)}.
 
 %% Transform each unique line into a 4-byte index; store line content as UTF-32.
 lines_to_chars(Text, Idx, CharText, NextChar, Lines, Map) when Idx >= byte_size(Text) ->
     {CharText, NextChar, Lines, Map};
-lines_to_chars(Text, Idx, CharText, NextChar, Lines, Map) ->
-    case binary:match(Text, <<"\n">>, [{scope, {Idx, byte_size(Text)-Idx}}]) of
+lines_to_chars(Text, Idx, CharText, NextChar, Lines, Map) when ?IS_UTF32_ALIGNED(Idx) ->
+    case aligned_utf32_match(Text, <<$\n:32>>, Idx) of
         nomatch ->
             <<_:Idx/binary, Line/binary>> = Text,
-            {Char, NextChar1, Lines1, Map1} = insert_line(to_utf32(Line), Lines, Map, NextChar),
+            {Char, NextChar1, Lines1, Map1} = insert_line(Line, Lines, Map, NextChar),
             CharText1 = <<CharText/binary, Char:32>>,
             {CharText1, NextChar1, Lines1, Map1};
         {Start, _} ->
-            LineLength = Start - Idx + 1,
+            LineLength = Start - Idx + 4,
             <<_:Idx/binary, Line:LineLength/binary, _/binary>> = Text,
-            {Char, NextChar1, Lines1, Map1} = insert_line(to_utf32(Line), Lines, Map, NextChar),
+            {Char, NextChar1, Lines1, Map1} = insert_line(Line, Lines, Map, NextChar),
             CharText1 = <<CharText/binary, Char:32>>,
             lines_to_chars(Text, Idx + LineLength, CharText1, NextChar1, Lines1, Map1)
     end.
 
-
 insert_line(Line, Lines, Map, NextChar) ->
     case Map of
         #{Line := Char} ->
@@ -1555,12 +1551,9 @@ aligned_utf32_match_test() ->
 common_overlap_test() ->
     A = to_utf32(<<"Fire at Will">>),
     B = to_utf32(<<"William Riker is number one">>),
-
     ?assertEqual(4, common_overlap(A, B)),
-
     ok.
 
-
 common_overlap_loop_test() ->
     Abc = to_utf32(<<"abc">>),
     Cde = to_utf32(<<"cde">>),
@@ -1572,5 +1565,4 @@ common_overlap_loop_test() ->
 
     ok.
 
-
 -endif.

From 23ce7dc9b2ede03f656506a319f772f40dea916b Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Sat, 11 Apr 2026 21:59:30 +0200
Subject: [PATCH 36/47] Replaced some functions with macro's

---
 src/diffy.erl | 36 +++++++++---------------------------
 1 file changed, 9 insertions(+), 27 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index 07756b9..f5e513a 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -65,6 +65,9 @@
 -define(PATCH_MARGIN, 4).
 -define(IS_INS_OR_DEL(Op), (Op =:= insert orelse Op =:= delete)).
 -define(IS_UTF32_ALIGNED(Offset), (Offset rem 4 =:= 0)).
+-define(IS_WS(C), (C =:= $\s orelse C =:= $\t orelse C =:= $\n orelse C =:= $\r orelse C =:= $\f orelse C =:= $\v)).
+-define(IS_LB(C), (C =:= $\n orelse C =:= $\r)).
+-define(IS_ALPHA(C), ((C >= $a andalso C =< $z) orelse (C >= $A andalso C =< $Z) orelse (C >= $0 andalso C =< $9))).
 
 -record(bisect_state, {
     k1start = 0, k1end = 0,
@@ -833,12 +836,12 @@ cleanup_semantic_score(_, <<>>) -> 6;
 cleanup_semantic_score(One, Two) ->
     Char1 = last_char(One),
     Char2 = first_char(Two),
-    NonAlphaNumeric1 = is_non_alphanumeric(Char1),
-    NonAlphaNumeric2 = is_non_alphanumeric(Char2),
-    Whitespace1 = NonAlphaNumeric1 andalso is_whitespace(Char1),
-    Whitespace2 = NonAlphaNumeric2 andalso is_whitespace(Char2),
-    LineBreak1 = Whitespace1 andalso is_linebreak(Char1),
-    LineBreak2 = Whitespace2 andalso is_linebreak(Char2),
+    NonAlphaNumeric1 = not ?IS_ALPHA(Char1),
+    NonAlphaNumeric2 = not ?IS_ALPHA(Char2),
+    Whitespace1 = NonAlphaNumeric1 andalso ?IS_WS(Char1),
+    Whitespace2 = NonAlphaNumeric2 andalso ?IS_WS(Char2),
+    LineBreak1 = Whitespace1 andalso ?IS_LB(Char1),
+    LineBreak2 = Whitespace2 andalso ?IS_LB(Char2),
     BlankLine1 = LineBreak1 andalso is_blankline_end(One),
     BlankLine2 = LineBreak2 andalso is_blankline_start(Two),
     if
@@ -951,27 +954,6 @@ last_char(Bin) ->
     <<_:(Size-4)/binary, C:32>> = Bin,
     C.
 
-is_non_alphanumeric(undefined) -> true;
-is_non_alphanumeric(C) ->
-    not ((C >= $a andalso C =< $z) orelse
-         (C >= $A andalso C =< $Z) orelse
-         (C >= $0 andalso C =< $9)).
-
-is_whitespace(undefined) -> false;
-is_whitespace(C) ->
-    case C of
-        $\s -> true;
-        $\t -> true;
-        $\n -> true;
-        $\r -> true;
-        $\f -> true;
-        $\v -> true;
-        _ -> false
-    end.
-
-is_linebreak(C) ->
-    C =:= $\n orelse C =:= $\r.
-
 %% In UTF-32 each codepoint is 4 bytes, so newline patterns are fixed-width.
 is_blankline_end(Bin) when byte_size(Bin) >= 8 ->
     Size = byte_size(Bin),

From cff9bcd6a2f9164090cc826358578897c61bd0b2 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Sat, 11 Apr 2026 22:14:37 +0200
Subject: [PATCH 37/47] Some reformatting

---
 src/diffy.erl | 92 ++++++++++++++++++++++++---------------------------
 1 file changed, 43 insertions(+), 49 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index f5e513a..d017107 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -109,14 +109,14 @@ diff(Text1, Text2, Options) when is_list(Options) ->
     T2 = to_utf32(Text2),
     Diffs32 = diff32(T1, T2, CheckLines),
     Diffs1 = case lists:member(semantic, Options) of
-        true  -> cleanup_semantic32(Diffs32);
-        false -> Diffs32
-    end,
+                 true  -> cleanup_semantic32(Diffs32);
+                 false -> Diffs32
+             end,
     Diffs2 = case efficiency_opt(Options) of
-        none           -> Diffs1;
-        default        -> cleanup_efficiency32(Diffs1);
-        {custom, Cost} -> cleanup_efficiency32(Diffs1, Cost)
-    end,
+                 none           -> Diffs1;
+                 default        -> cleanup_efficiency32(Diffs1);
+                 {custom, Cost} -> cleanup_efficiency32(Diffs1, Cost)
+             end,
     %% Single conversion at the exit boundary.
     [{Op, to_utf8(D)} || {Op, D} <- Diffs2].
 
@@ -519,12 +519,10 @@ compute_diff_bisect1(A, B, M, N) ->
                                 true ->
                                     % Mirror x2 onto top-left coordinate system.
                                     X2 = M - V2AtOffset,
-                                    if 
-                                        X1_1 >= X2 ->
-                                            % Overlap detected
-                                            throw({overlap, X1_1, Y1_1});
-                                        true ->
-                                            {continue, S2_1}
+                                    case X1_1 >= X2 of 
+                                        % Overlap detected
+                                        true -> throw({overlap, X1_1, Y1_1});
+                                        false -> {continue, S2_1}
                                     end;
                                 false -> {continue, S2_1}
                             end
@@ -568,13 +566,11 @@ compute_diff_bisect1(A, B, M, N) ->
                                 true ->
                                     X1 = V1AtOffset,
                                     Y1 = VOffset + X1 - K1Offset,
-                                    if 
-                                        % Mirror x2 onto top-left coordinate system.
-                                        X1 >= M - X2_1 ->
-                                            % Overlap detected
-                                            throw({overlap, X1, Y1});
-                                        true ->
-                                            {continue, S4_1}
+                                    % Mirror x2 onto top-left coordinate system.
+                                    case X1 >= M - X2_1 of
+                                        % Overlap detected
+                                        true -> throw({overlap, X1, Y1});
+                                        false -> {continue, S4_1}
                                     end;
                                 false -> {continue, S4_1}
                             end
@@ -795,13 +791,13 @@ cleanup_semantic_lossless([], Acc) ->
 slide_edit(E1, Edit, E2) ->
     Suffix = common_suffix(E1, Edit),
     {E1_1, Edit_1, E2_1} = case Suffix of
-        <<>> -> {E1, Edit, E2};
-        _ ->
-            SLen = size(Suffix),
-            { binary:part(E1, 0, size(E1) - SLen),
-              <<Suffix/binary, (binary:part(Edit, 0, size(Edit) - SLen))/binary>>,
-              <<Suffix/binary, E2/binary>> }
-    end,
+                               <<>> -> {E1, Edit, E2};
+                               _ ->
+                                   SLen = size(Suffix),
+                                   { binary:part(E1, 0, size(E1) - SLen),
+                                     <<Suffix/binary, (binary:part(Edit, 0, size(Edit) - SLen))/binary>>,
+                                     <<Suffix/binary, E2/binary>> }
+                           end,
     find_best_slide(E1_1, Edit_1, E2_1).
 
 find_best_slide(E1, Edit, E2) ->
@@ -815,11 +811,9 @@ find_best_slide(E1, Edit, E2, BestScore, BestE1, BestEdit, BestE2) ->
             NewEdit = <<RestEdit/binary, Char/binary>>,
             NewE2 = RestE2,
             NewScore = cleanup_semantic_score(NewE1, NewEdit) + cleanup_semantic_score(NewEdit, NewE2),
-            if
-                NewScore >= BestScore ->
-                    find_best_slide(NewE1, NewEdit, NewE2, NewScore, NewE1, NewEdit, NewE2);
-                true ->
-                    find_best_slide(NewE1, NewEdit, NewE2, BestScore, BestE1, BestEdit, BestE2)
+            case NewScore >= BestScore of 
+                true -> find_best_slide(NewE1, NewEdit, NewE2, NewScore, NewE1, NewEdit, NewE2);
+                false -> find_best_slide(NewE1, NewEdit, NewE2, BestScore, BestE1, BestEdit, BestE2)
             end;
         false ->
             {BestE1, BestEdit, BestE2}
@@ -861,25 +855,25 @@ cleanup_semantic_overlaps([{delete, Del}, {insert, Ins} | T], Acc) ->
     Overlap2 = common_overlap(Ins, Del),
     TDel = text_size32(Del),
     TIns = text_size32(Ins),
-    if
-        Overlap1 >= Overlap2 ->
-            if
-                Overlap1 * 2 >= TDel orelse Overlap1 * 2 >= TIns ->
+    case Overlap1 >= Overlap2 of
+        true ->
+            case Overlap1 * 2 >= TDel orelse Overlap1 * 2 >= TIns of
+                true ->
                     Common = binary:part(Ins, 0, Overlap1 * 4),
                     NewDel = binary:part(Del, 0, (TDel - Overlap1) * 4),
                     NewIns = binary:part(Ins, Overlap1 * 4, (TIns - Overlap1) * 4),
                     cleanup_semantic_overlaps([{insert, NewIns} | T], [{equal, Common}, {delete, NewDel} | Acc]);
-                true ->
+                false ->
                     cleanup_semantic_overlaps([{insert, Ins} | T], [{delete, Del} | Acc])
             end;
-        true ->
-            if
-                Overlap2 * 2 >= TIns orelse Overlap2 * 2 >= TDel ->
+        false ->
+            case Overlap2 * 2 >= TIns orelse Overlap2 * 2 >= TDel of
+                true ->
                     Common = binary:part(Ins, (TIns - Overlap2) * 4, Overlap2 * 4),
                     NewIns = binary:part(Ins, 0, (TIns - Overlap2) * 4),
                     NewDel = binary:part(Del, Overlap2 * 4, (TDel - Overlap2) * 4),
                     cleanup_semantic_overlaps([{delete, NewDel} | T], [{equal, Common}, {insert, NewIns} | Acc]);
-                true ->
+                false ->
                     cleanup_semantic_overlaps([{insert, Ins} | T], [{delete, Del} | Acc])
             end
     end;
@@ -958,7 +952,7 @@ last_char(Bin) ->
 is_blankline_end(Bin) when byte_size(Bin) >= 8 ->
     Size = byte_size(Bin),
     case Bin of
-        <<_:(Size-8)/binary,  $\n:32, $\n:32>>       -> true;
+        <<_:(Size-8)/binary,  $\n:32, $\n:32>>         -> true;
         <<_:(Size-12)/binary, $\n:32, $\r:32, $\n:32>> -> true;
         _ -> false
     end;
@@ -966,9 +960,9 @@ is_blankline_end(_) -> false.
 
 is_blankline_start(Bin) when byte_size(Bin) >= 8 ->
     case Bin of
-        <<$\n:32, $\n:32, _/binary>>             -> true;
-        <<$\n:32, $\r:32, $\n:32, _/binary>>     -> true;
-        <<$\r:32, $\n:32, $\n:32, _/binary>>     -> true;
+        <<$\n:32, $\n:32, _/binary>>                 -> true;
+        <<$\n:32, $\r:32, $\n:32, _/binary>>         -> true;
+        <<$\r:32, $\n:32, $\n:32, _/binary>>         -> true;
         <<$\r:32, $\n:32, $\r:32, $\n:32, _/binary>> -> true;
         _ -> false
     end;
@@ -1001,8 +995,8 @@ cleanup_efficiency32([], Changed, _EditCost, Acc) ->
     end;
 %% Any equality which is surrounded on both sides by an insertion and deletion need less then 
 %% EditCost characters for it to be advantageous to split.
-cleanup_efficiency32([{O1, _}=A, {equal, XY}=E, {O2, _}=B | T], Changed, EditCost, Acc) when 
-        O1 =/= O2 andalso ?IS_INS_OR_DEL(O1) andalso ?IS_INS_OR_DEL(O2) ->
+cleanup_efficiency32([{O1, _}=A, {equal, XY}=E, {O2, _}=B | T], Changed, EditCost, Acc)
+  when O1 =/= O2 andalso ?IS_INS_OR_DEL(O1) andalso ?IS_INS_OR_DEL(O2) ->
     case text_smaller_than(XY, EditCost) of
         true ->
             Del = {delete, XY},
@@ -1014,8 +1008,8 @@ cleanup_efficiency32([{O1, _}=A, {equal, XY}=E, {O2, _}=B | T], Changed, EditCos
 %% Any equality which is surrounded on one side by an existing insertion and deletion and on the 
 %% other side by an existing insertion or deletion needs less than half C characters long for it 
 %% to be advantageous to split.
-cleanup_efficiency32([{O1, _}=A, {O2, _}=B, {equal, X}=E, {O3, _}=C | T], Changed, EditCost, Acc) when
-    O1 =/= O2 andalso ?IS_INS_OR_DEL(O1) andalso ?IS_INS_OR_DEL(O2) andalso ?IS_INS_OR_DEL(O3) ->
+cleanup_efficiency32([{O1, _}=A, {O2, _}=B, {equal, X}=E, {O3, _}=C | T], Changed, EditCost, Acc)
+  when O1 =/= O2 andalso ?IS_INS_OR_DEL(O1) andalso ?IS_INS_OR_DEL(O2) andalso ?IS_INS_OR_DEL(O3) ->
     case text_smaller_than(X, EditCost div 2 + 1) of
         true ->
             Del = {delete, X},

From 15360775d8ff76e1d4df72b57f807257c840aa60 Mon Sep 17 00:00:00 2001
From: MM Zeeman <mmzeeman@xs4all.nl>
Date: Sun, 12 Apr 2026 10:51:28 +0200
Subject: [PATCH 38/47] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index b3e31ac..716ad98 100644
--- a/Makefile
+++ b/Makefile
@@ -36,7 +36,7 @@ clean_doc:
 
 distclean: clean_doc
 	@rm -rf _build
-	@rm $(REBAR)
+	@rm -f $(REBAR)
 
 doc: $(REBAR)
 	$(REBAR) ex_doc --output doc --formatter html

From 554a227d586a2573ec5d1fb33555965590a12343 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Sun, 12 Apr 2026 10:56:44 +0200
Subject: [PATCH 39/47] Fix size parameter in common_overlap_loop test

---
 src/diffy.erl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index d017107..be93a86 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -1533,11 +1533,11 @@ common_overlap_test() ->
 common_overlap_loop_test() ->
     Abc = to_utf32(<<"abc">>),
     Cde = to_utf32(<<"cde">>),
-    ?assertEqual(1, common_overlap_loop(Abc, Cde, size(Cde), 0, 1)),
+    ?assertEqual(1, common_overlap_loop(Abc, Cde, text_size(Cde), 0, 1)),
 
     Abcdef = to_utf32(<<"abcdef">>),
     Efde = to_utf32(<<"efde">>),
-    ?assertEqual(2, common_overlap_loop(Abcdef, Efde, size(Cde), 0, 1)),
+    ?assertEqual(2, common_overlap_loop(Abcdef, Efde, text_size(Efde), 0, 1)),
 
     ok.
 

From 023124c450ad8679ee2ff7efaf105a2151bd26f9 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Sun, 12 Apr 2026 12:06:40 +0200
Subject: [PATCH 40/47] Minor refactoring

---
 src/diffy.erl | 42 +++++++++++++++---------------------------
 1 file changed, 15 insertions(+), 27 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index be93a86..3aee7ea 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -62,6 +62,7 @@
 
 -export_type([diff_op/0, diff/0, diffs/0, diff_option/0]).
 
+-define(DEFAULT_EDIT_COST, 4).
 -define(PATCH_MARGIN, 4).
 -define(IS_INS_OR_DEL(Op), (Op =:= insert orelse Op =:= delete)).
 -define(IS_UTF32_ALIGNED(Offset), (Offset rem 4 =:= 0)).
@@ -104,38 +105,27 @@ diff(Text1, Text2) ->
 % Cleanups are always applied in the correct order: semantic first, then efficiency.
 -spec diff(unicode:unicode_binary(), unicode:unicode_binary(), [diff_option()]) -> diffs().
 diff(Text1, Text2, Options) when is_list(Options) ->
-    CheckLines = not lists:member(no_linemode, Options),
     T1 = to_utf32(Text1),
     T2 = to_utf32(Text2),
+    CheckLines = not proplists:get_value(no_linemode, Options, false),
     Diffs32 = diff32(T1, T2, CheckLines),
-    Diffs1 = case lists:member(semantic, Options) of
+    Diffs1 = case proplists:get_value(semantic, Options) of
                  true  -> cleanup_semantic32(Diffs32);
-                 false -> Diffs32
+                 _ -> Diffs32
              end,
-    Diffs2 = case efficiency_opt(Options) of
-                 none           -> Diffs1;
-                 default        -> cleanup_efficiency32(Diffs1);
-                 {custom, Cost} -> cleanup_efficiency32(Diffs1, Cost)
+    Diffs2 = case proplists:get_value(efficiency, Options) of
+                 true -> cleanup_efficiency32(Diffs1);
+                 Cost when is_integer(Cost) andalso Cost > 0 -> cleanup_efficiency32(Diffs1, Cost);
+                 _ -> Diffs1
              end,
     %% Single conversion at the exit boundary.
     [{Op, to_utf8(D)} || {Op, D} <- Diffs2].
 
-%% Extract the efficiency option, preferring {efficiency, Cost} over plain efficiency.
-efficiency_opt(Options) ->
-    case lists:keyfind(efficiency, 1, Options) of
-        {efficiency, Cost} -> {custom, Cost};
-        false ->
-            case lists:member(efficiency, Options) of
-                true  -> default;
-                false -> none
-            end
-    end.
-
 %% Internal diff working entirely in UTF-32 binaries.
-diff32(<<>>, <<>>, _CheckLines) ->
-    [];
-diff32(Text1, Text2, _CheckLines) when Text1 =:= Text2 ->
-    [{equal, Text1}];
+diff32(<<>>, <<>>, _CheckLines) -> [];
+diff32(<<>>, Text2, _CheckLines) -> [{insert, Text2}];
+diff32(Text1, <<>>, _CheckLines) -> [{delete, Text1}];
+diff32(Text1, Text2, _CheckLines) when Text1 =:= Text2 -> [{equal, Text1}];
 diff32(Text1, Text2, CheckLines) ->
     {Prefix, MText1, MText2, Suffix} = split_pre_and_suffix(Text1, Text2),
 
@@ -154,10 +144,8 @@ diff32(Text1, Text2, CheckLines) ->
     cleanup_merge32(Diffs2).
 
 %% This assumes Text1 and Text2 don't have a common prefix. Operates on UTF-32.
-compute_diff(<<>>, NewText, _CheckLines) ->
-    [{insert, NewText}];
-compute_diff(OldText, <<>>, _CheckLines) ->
-    [{delete, OldText}];
+compute_diff(<<>>, NewText, _CheckLines) -> [{insert, NewText}];
+compute_diff(OldText, <<>>, _CheckLines) -> [{delete, OldText}];
 compute_diff(OldText, NewText, CheckLines) ->
     OldStNew = size(OldText) < size(NewText),
 
@@ -981,7 +969,7 @@ cleanup_efficiency(Diffs, EditCost) ->
 
 %% Internal efficiency cleanup operating on UTF-32 diffs.
 cleanup_efficiency32(Diffs) ->
-    cleanup_efficiency32(Diffs, 4).
+    cleanup_efficiency32(Diffs, ?DEFAULT_EDIT_COST).
 
 cleanup_efficiency32(Diffs, EditCost) ->
     cleanup_efficiency32(Diffs, false, EditCost, []).

From 5c7759481bf2dac53e2afb319285ec97a76bcd46 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Sun, 12 Apr 2026 20:35:01 +0200
Subject: [PATCH 41/47] Fix a problem where cleanup_merge did not collapse all
 possible outcomes

---
 src/diffy.erl        | 32 +++++++++++++++-----------------
 test/diffy_tests.erl | 25 +++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index 3aee7ea..b47f868 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -658,8 +658,7 @@ cleanup_merge(Diffs) ->
 
 %% Internal cleanup_merge operating on UTF-32 diffs.
 cleanup_merge32(Diffs) ->
-    Diffs1 = cleanup_merge32(Diffs, []),
-    canonicalize_edits(Diffs1, []).
+    cleanup_merge32(Diffs, []).
 
 %% Done
 cleanup_merge32([], Acc) ->
@@ -667,19 +666,25 @@ cleanup_merge32([], Acc) ->
 %% Remove operations without data.
 cleanup_merge32([{_Op, <<>>}|T], Acc) ->
     cleanup_merge32(T, Acc);
-%% Merge data from equal operations
+%% Ensure delete/insert ordering: if insert is on top and a delete arrives, sink the insert.
+cleanup_merge32([{delete, _}=D|T], [{insert, _}=I|Acc]) ->
+    cleanup_merge32([D, I|T], Acc);
+%% Merge data from equal operations.
 cleanup_merge32([{Op2, Data2}|T], [{Op1, Data1}|Acc]) when Op1 =:= Op2 ->
     cleanup_merge32(T, [{Op1, <<Data1/binary, Data2/binary>>}|Acc]);
-%% Cleanup edits before equal operation
-cleanup_merge32([{Op1, Data1}|T], [{Op2, _}=I, {Op3, Data3}|Acc]) when Op1 =/= Op2 andalso Op1 =:= Op3 andalso Op2 =/= equal andalso Op3 =/= equal ->
-    cleanup_merge32(T, [I, {Op3, <<Data3/binary, Data1/binary>>}|Acc]);
-%% Check if Op1Data and Op2Data have common prefixes.
-cleanup_merge32([{equal, E1}|T], [{Op1, Op1Data}, {Op2, Op2Data}, {equal, E2}|Acc]) when Op1 =/= Op2 andalso Op1 =/= equal andalso Op2 =/= equal ->
+%% Cleanup edits before equal operation — re-queue merged op for further processing.
+cleanup_merge32([{Op1, Data1}|T], [{Op2, _}=I, {Op3, Data3}|Acc])
+        when Op1 =/= Op2 andalso Op1 =:= Op3 andalso Op2 =/= equal andalso Op3 =/= equal ->
+    cleanup_merge32([I, {Op3, <<Data3/binary, Data1/binary>>} | T], Acc);
+%% Factor out common prefixes and suffixes from adjacent insert/delete pairs.
+cleanup_merge32([{equal, E1}|T], [{Op1, Op1Data}, {Op2, Op2Data}, {equal, E2}|Acc])
+        when Op1 =/= Op2 andalso Op1 =/= equal andalso Op2 =/= equal ->
     {Prefix, Op1DataD, Op2DataD, Suffix} = split_pre_and_suffix(Op1Data, Op2Data),
     cleanup_merge32(T, [{equal, <<Suffix/binary, E1/binary>>},
         {Op1, Op1DataD}, {Op2, Op2DataD}, {equal, <<E2/binary, Prefix/binary>>}|Acc]);
-%% Check for slide left and slide right edits
-cleanup_merge32([{equal, E1}=H|T], [{Op, I}, {equal, E2}|AccTail]=Acc) when Op =:= insert orelse Op =:= delete ->
+%% Slide edits left and right.
+cleanup_merge32([{equal, E1}=H|T], [{Op, I}, {equal, E2}|AccTail]=Acc)
+        when Op =:= insert orelse Op =:= delete ->
     case is_suffix(E2, I) of
         false ->
             case is_prefix(E1, I) of
@@ -698,13 +703,6 @@ cleanup_merge32([{equal, E1}=H|T], [{Op, I}, {equal, E2}|AccTail]=Acc) when Op =
 cleanup_merge32([H|T], Acc) ->
     cleanup_merge32(T, [H|Acc]).
 
-canonicalize_edits([{insert, I}, {delete, D} | T], Acc) ->
-    canonicalize_edits(T, [{insert, I}, {delete, D} | Acc]);
-canonicalize_edits([H | T], Acc) ->
-    canonicalize_edits(T, [H | Acc]);
-canonicalize_edits([], Acc) ->
-    lists:reverse(Acc).
-
 % @doc Do semantic cleanup of diffs
 %
 -spec cleanup_semantic(diffs()) -> diffs().
diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl
index 0e1980e..293ca46 100644
--- a/test/diffy_tests.erl
+++ b/test/diffy_tests.erl
@@ -48,6 +48,13 @@ prop_cleanup_merge() ->
             andalso DestinationText =:= diffy:destination_text(CleanDiffs)
         end).
 
+prop_cleanup_merge_idempotent() ->
+    ?FORALL(Diffs, list({diff_op(), proper_unicode:utf8()}),
+        begin
+            Cleaned = cleanup_merge(Diffs),
+            Cleaned =:= cleanup_merge(Cleaned)
+        end).
+
 prop_cleanup_efficiency() ->
     ?FORALL(Diffs, list({diff_op(), proper_unicode:utf8()}),
         begin
@@ -233,6 +240,24 @@ cleanup_merge_test() ->
 
     ok.
 
+%% delete/insert/delete — the two deletes merge, then insert must be re-checked
+%% against the equal below it, which should then slide
+requeue_i_test() ->
+    ?assertEqual([{delete, <<"aXa">>}, {insert, <<"b">>}],
+                 cleanup_merge([{delete, <<"a">>}, {insert, <<"b">>}, {delete, <<"Xa">>}])).
+
+%% Three consecutive deletes separated by inserts collapse correctly
+triple_delete_test() ->
+    ?assertEqual([{delete, <<"abc">>}, {insert, <<"xyz">>}],
+                 cleanup_merge([{delete, <<"a">>}, {insert, <<"x">>},
+                                {delete, <<"b">>}, {insert, <<"y">>},
+                                {delete, <<"c">>}, {insert, <<"z">>}])).
+
+%% After sliding, the two equals on either side should merge into one
+slide_merge_test() ->
+    ?assertEqual([{insert, <<"aX">>}, {equal, <<"abc">>}],
+                 cleanup_merge([{equal, <<"a">>}, {insert, <<"Xa">>}, {equal, <<"bc">>}])).
+
 cleanup_merge_prop_test() ->
     ?assertEqual(true, proper:quickcheck(prop_cleanup_merge(), [{numtests, ?NUM_TESTS}, {to_file, user}])),
     ok.

From 0cb1e7ff2350f391e164601f88137bb989cd56b1 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Sun, 12 Apr 2026 21:24:48 +0200
Subject: [PATCH 42/47] Don't fall through to default when efficiency cost is
 mis-configured

---
 src/diffy.erl        | 4 ++--
 test/diffy_tests.erl | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index b47f868..c4af0a0 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -114,9 +114,9 @@ diff(Text1, Text2, Options) when is_list(Options) ->
                  _ -> Diffs32
              end,
     Diffs2 = case proplists:get_value(efficiency, Options) of
+                 NoEfficiency when NoEfficiency =:= undefined orelse NoEfficiency =:= false  -> Diffs1;
                  true -> cleanup_efficiency32(Diffs1);
-                 Cost when is_integer(Cost) andalso Cost > 0 -> cleanup_efficiency32(Diffs1, Cost);
-                 _ -> Diffs1
+                 Cost when is_integer(Cost) andalso Cost > 0 -> cleanup_efficiency32(Diffs1, Cost)
              end,
     %% Single conversion at the exit boundary.
     [{Op, to_utf8(D)} || {Op, D} <- Diffs2].
diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl
index 293ca46..4f7370e 100644
--- a/test/diffy_tests.erl
+++ b/test/diffy_tests.erl
@@ -51,6 +51,7 @@ prop_cleanup_merge() ->
 prop_cleanup_merge_idempotent() ->
     ?FORALL(Diffs, list({diff_op(), proper_unicode:utf8()}),
         begin
+            % Cleaning the diffs again shoul not result in more changes
             Cleaned = cleanup_merge(Diffs),
             Cleaned =:= cleanup_merge(Cleaned)
         end).

From 5c8bb4cd80348264ef02c0f11c2e81fae6b20cb7 Mon Sep 17 00:00:00 2001
From: Maas-Maarten Zeeman <mmzeeman@xs4all.nl>
Date: Mon, 13 Apr 2026 20:34:29 +0200
Subject: [PATCH 43/47] Don't attempt to do linemode diffs for terms

---
 src/diffy.erl      |  1 -
 src/diffy_term.erl | 10 +++-------
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index c4af0a0..273d636 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -17,7 +17,6 @@
 %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 %% See the License for the specific language governing permissions and
 %% limitations under the License.
-%% Erlang diff-match-patch implementation
 
 -module(diffy).
 
diff --git a/src/diffy_term.erl b/src/diffy_term.erl
index 2639bf6..23c23b6 100644
--- a/src/diffy_term.erl
+++ b/src/diffy_term.erl
@@ -26,11 +26,10 @@
     diff/2
 ]).
 
--type diff_op() :: delete | equal | insert.
--type diff() :: {diff_op(), term()}.
+-type diff() :: {diffy:diff_op(), term()}.
 -type diffs() :: list(diff()).
 
--export_type([ diffs/0 ]).
+-export_type([ diff/0, diffs/0 ]).
 
 -spec diff(list(), list()) -> diffs().
 diff(A, A) ->
@@ -42,7 +41,7 @@ diff([], B) ->
 diff(A, B) when is_list(A), is_list(B) ->
     {Dict0, N} = term_dict(A, dict:new(), 0),
     {Dict, _N} = term_dict(B, Dict0, N),
-    Diff = diffy:diff(map_terms(A, Dict), map_terms(B, Dict)),
+    Diff = diffy:diff(map_terms(A, Dict), map_terms(B, Dict), [no_linemode]),
     unmap_diff(Diff, Dict).
 
 term_dict([], D, N) ->
@@ -69,7 +68,6 @@ unmap_diff_1({Op, B}, RDict) ->
     {Op, [ dict:fetch(C, RDict) || C <- Cs ]}.
 
 
-
 -ifdef(TEST).
 
 -include_lib("eunit/include/eunit.hrl").
@@ -89,7 +87,5 @@ diffy_term_test() ->
         diffy_term:diff([a,b,c,d,e], [a,e,b,c,d])),
     ok.
 
-
 -endif.
 
-

From 9000ce3fa4c01ba46a2692308cd6e4373d1ce4cb Mon Sep 17 00:00:00 2001
From: MM Zeeman <mmzeeman@xs4all.nl>
Date: Mon, 13 Apr 2026 21:08:00 +0200
Subject: [PATCH 44/47] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 test/diffy_tests.erl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl
index 4f7370e..b2ecd4f 100644
--- a/test/diffy_tests.erl
+++ b/test/diffy_tests.erl
@@ -72,10 +72,10 @@ prop_cleanup_semantic() ->
         begin
             SourceText = diffy:source_text(Diffs),
             DestinationText = diffy:destination_text(Diffs),
-            EfficientDiffs = cleanup_semantic(Diffs),
+            SemanticDiffs = cleanup_semantic(Diffs),
 
-            SourceText =:= diffy:source_text(EfficientDiffs)
-            andalso DestinationText =:= diffy:destination_text(EfficientDiffs)
+            SourceText =:= diffy:source_text(SemanticDiffs)
+            andalso DestinationText =:= diffy:destination_text(SemanticDiffs)
         end).
 
 html_like() ->

From 762fc294503193ceaf5f6fc4e8ef5050819d07ab Mon Sep 17 00:00:00 2001
From: MM Zeeman <mmzeeman@xs4all.nl>
Date: Mon, 13 Apr 2026 21:08:36 +0200
Subject: [PATCH 45/47] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 test/diffy_tests.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl
index b2ecd4f..c7c9781 100644
--- a/test/diffy_tests.erl
+++ b/test/diffy_tests.erl
@@ -51,7 +51,7 @@ prop_cleanup_merge() ->
 prop_cleanup_merge_idempotent() ->
     ?FORALL(Diffs, list({diff_op(), proper_unicode:utf8()}),
         begin
-            % Cleaning the diffs again shoul not result in more changes
+            % Cleaning the diffs again should not result in more changes
             Cleaned = cleanup_merge(Diffs),
             Cleaned =:= cleanup_merge(Cleaned)
         end).

From 2350f64369afd35c637dd056f212a747c2b664ed Mon Sep 17 00:00:00 2001
From: MM Zeeman <mmzeeman@xs4all.nl>
Date: Mon, 13 Apr 2026 21:09:20 +0200
Subject: [PATCH 46/47] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 src/diffy.erl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index 273d636..8de8864 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -957,7 +957,7 @@ is_blankline_start(_) -> false.
 %
 -spec cleanup_efficiency(diffs()) -> diffs().
 cleanup_efficiency(Diffs) ->
-    cleanup_efficiency(Diffs, 4).
+    cleanup_efficiency(Diffs, ?DEFAULT_EDIT_COST).
 
 -spec cleanup_efficiency(diffs(), pos_integer()) -> diffs().
 cleanup_efficiency(Diffs, EditCost) ->

From 4385ab8448535ea878052729c14df5d60dd3db22 Mon Sep 17 00:00:00 2001
From: MM Zeeman <mmzeeman@xs4all.nl>
Date: Mon, 13 Apr 2026 21:09:52 +0200
Subject: [PATCH 47/47] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 src/diffy.erl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/diffy.erl b/src/diffy.erl
index 8de8864..d9b7f5d 100644
--- a/src/diffy.erl
+++ b/src/diffy.erl
@@ -1518,11 +1518,11 @@ common_overlap_test() ->
 common_overlap_loop_test() ->
     Abc = to_utf32(<<"abc">>),
     Cde = to_utf32(<<"cde">>),
-    ?assertEqual(1, common_overlap_loop(Abc, Cde, text_size(Cde), 0, 1)),
+    ?assertEqual(1, common_overlap_loop(Abc, Cde, text_size32(Cde), 0, 1)),
 
     Abcdef = to_utf32(<<"abcdef">>),
     Efde = to_utf32(<<"efde">>),
-    ?assertEqual(2, common_overlap_loop(Abcdef, Efde, text_size(Efde), 0, 1)),
+    ?assertEqual(2, common_overlap_loop(Abcdef, Efde, text_size32(Efde), 0, 1)),
 
     ok.