From 40e0e09a85c08a1d0821541c7d94432287406122 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Mon, 6 Apr 2026 20:57:08 +0200 Subject: [PATCH 01/47] Added semantic diff cleanup --- src/diffy.erl | 270 ++++++++++++++++++++++++++++++++++++++++++- test/diffy_tests.erl | 16 +-- 2 files changed, 272 insertions(+), 14 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index fc75a87..16cebb8 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -581,7 +581,8 @@ levenshtein([{equal, _Data}|T], Insertions, Deletions, Levenshtein) -> % -spec cleanup_merge(diffs()) -> diffs(). cleanup_merge(Diffs) -> - cleanup_merge(Diffs, []). + Diffs1 = cleanup_merge(Diffs, []), + canonicalize_edits(Diffs1, []). %% Done cleanup_merge([], Acc) -> @@ -620,16 +621,273 @@ cleanup_merge([{equal, E1}=H|T], [{Op, I}, {equal, E2}|AccTail]=Acc) when Op =:= cleanup_merge([H|T], Acc) -> cleanup_merge(T, [H|Acc]). +canonicalize_edits([{insert, I}, {delete, D} | T], Acc) -> + canonicalize_edits(T, [{insert, I}, {delete, D} | Acc]); +canonicalize_edits([H | T], Acc) -> + canonicalize_edits(T, [H | Acc]); +canonicalize_edits([], Acc) -> + lists:reverse(Acc). + % @doc Do semantic cleanup of diffs % -spec cleanup_semantic(diffs()) -> diffs(). cleanup_semantic(Diffs) -> - cleanup_semantic(Diffs, []). + Diffs1 = cleanup_semantic_breakpoints(Diffs), + Diffs2 = cleanup_merge(Diffs1), + Diffs3 = cleanup_semantic_lossless(Diffs2), + cleanup_semantic_overlaps(Diffs3). + +cleanup_semantic_breakpoints(Diffs) -> + case find_breakpoint(Diffs, [], 0, 0, 0, 0, undefined) of + {found, NewDiffs} -> cleanup_semantic_breakpoints(NewDiffs); + not_found -> Diffs + end. -cleanup_semantic([], Acc) -> - lists:reverse(Acc); -cleanup_semantic([H|T], Acc) -> - cleanup_semantic(T, [H|Acc]). 
+find_breakpoint([], _Acc, _LI1, _LD1, _LI2, _LD2, _LE) -> + not_found; +find_breakpoint([{equal, Data} | T], Acc, _LI1, _LD1, LI2, LD2, _LE) -> + find_breakpoint(T, [{equal, Data} | Acc], LI2, LD2, 0, 0, Data); +find_breakpoint([{insert, Data} | T], Acc, LI1, LD1, LI2, LD2, LE) -> + NewLI2 = LI2 + text_size(Data), + case is_breakpoint(LE, LI1, LD1, NewLI2, LD2) of + true -> {found, apply_breakpoint(LE, Acc, [{insert, Data} | T])}; + false -> find_breakpoint(T, [{insert, Data} | Acc], LI1, LD1, NewLI2, LD2, LE) + end; +find_breakpoint([{delete, Data} | T], Acc, LI1, LD1, LI2, LD2, LE) -> + NewLD2 = LD2 + text_size(Data), + case is_breakpoint(LE, LI1, LD1, LI2, NewLD2) of + true -> {found, apply_breakpoint(LE, Acc, [{delete, Data} | T])}; + false -> find_breakpoint(T, [{delete, Data} | Acc], LI1, LD1, LI2, NewLD2, LE) + end. + +is_breakpoint(undefined, _, _, _, _) -> false; +is_breakpoint(LE, LI1, LD1, LI2, LD2) -> + LEN = text_size(LE), + LEN =< max(LI1, LD1) andalso LEN =< max(LI2, LD2). + +apply_breakpoint(LE, Acc, T) -> + replace_equality(LE, Acc, T). + +replace_equality(LE, [{equal, LE} | T_Acc], T) -> + lists:reverse(T_Acc) ++ [{delete, LE}, {insert, LE} | T]; +replace_equality(LE, [H | T_Acc], T) -> + replace_equality(LE, T_Acc, [H | T]). + +cleanup_semantic_lossless(Diffs) -> + cleanup_semantic_lossless(Diffs, []). + +cleanup_semantic_lossless([{equal, E1}, {Op, Edit}, {equal, E2} | T], Acc) when ?IS_INS_OR_DEL(Op) -> + {NewE1, NewEdit, NewE2} = slide_edit(E1, Edit, E2), + case NewE1 of + <<>> -> + cleanup_semantic_lossless(lists:reverse(Acc, [{Op, NewEdit}, {equal, NewE2} | T]), []); + _ -> + case NewE2 of + <<>> -> + cleanup_semantic_lossless(lists:reverse(Acc, [{equal, NewE1}, {Op, NewEdit} | T]), []); + _ -> + cleanup_semantic_lossless([{Op, NewEdit}, {equal, NewE2} | T], [{equal, NewE1} | Acc]) + end + end; +cleanup_semantic_lossless([H | T], Acc) -> + cleanup_semantic_lossless(T, [H | Acc]); +cleanup_semantic_lossless([], Acc) -> + lists:reverse(Acc). 
+ +slide_edit(E1, Edit, E2) -> + Suffix = common_suffix(E1, Edit), + {E1_1, Edit_1, E2_1} = case Suffix of + <<>> -> {E1, Edit, E2}; + _ -> + SLen = size(Suffix), + { binary:part(E1, 0, size(E1) - SLen), + <>, + <> } + end, + find_best_slide(E1_1, Edit_1, E2_1). + +find_best_slide(E1, Edit, E2) -> + Score = cleanup_semantic_score(E1, Edit) + cleanup_semantic_score(Edit, E2), + find_best_slide(E1, Edit, E2, Score, E1, Edit, E2). + +find_best_slide(E1, Edit, E2, BestScore, BestE1, BestEdit, BestE2) -> + case can_slide_right(Edit, E2) of + {true, Char, RestEdit, RestE2} -> + NewE1 = <>, + NewEdit = <>, + NewE2 = RestE2, + NewScore = cleanup_semantic_score(NewE1, NewEdit) + cleanup_semantic_score(NewEdit, NewE2), + if + NewScore >= BestScore -> + find_best_slide(NewE1, NewEdit, NewE2, NewScore, NewE1, NewEdit, NewE2); + true -> + find_best_slide(NewE1, NewEdit, NewE2, BestScore, BestE1, BestEdit, BestE2) + end; + false -> + {BestE1, BestEdit, BestE2} + end. + +can_slide_right(<>, <>) -> + {true, <>, RestEdit, RestE2}; +can_slide_right(_, _) -> + false. + +cleanup_semantic_score(<<>>, _) -> 6; +cleanup_semantic_score(_, <<>>) -> 6; +cleanup_semantic_score(One, Two) -> + Char1 = last_char(One), + Char2 = first_char(Two), + NonAlphaNumeric1 = is_non_alphanumeric(Char1), + NonAlphaNumeric2 = is_non_alphanumeric(Char2), + Whitespace1 = NonAlphaNumeric1 andalso is_whitespace(Char1), + Whitespace2 = NonAlphaNumeric2 andalso is_whitespace(Char2), + LineBreak1 = Whitespace1 andalso is_linebreak(Char1), + LineBreak2 = Whitespace2 andalso is_linebreak(Char2), + BlankLine1 = LineBreak1 andalso is_blankline_end(One), + BlankLine2 = LineBreak2 andalso is_blankline_start(Two), + if + BlankLine1 orelse BlankLine2 -> 5; + LineBreak1 orelse LineBreak2 -> 4; + NonAlphaNumeric1 andalso (not Whitespace1) andalso Whitespace2 -> 3; + Whitespace1 orelse Whitespace2 -> 2; + NonAlphaNumeric1 orelse NonAlphaNumeric2 -> 1; + true -> 0 + end. 
+ +cleanup_semantic_overlaps(Diffs) -> + cleanup_semantic_overlaps(Diffs, []). + +cleanup_semantic_overlaps([{delete, Del}, {insert, Ins} | T], Acc) -> + Overlap1 = common_overlap(Del, Ins), + Overlap2 = common_overlap(Ins, Del), + if + Overlap1 >= Overlap2 -> + TDel = text_size(Del), + TIns = text_size(Ins), + case Overlap1 >= TDel / 2 orelse Overlap1 >= TIns / 2 of + true -> + Common = substring_start(Ins, Overlap1), + NewDel = substring_start(Del, TDel - Overlap1), + NewIns = skip_chars(Ins, Overlap1), + cleanup_semantic_overlaps([{insert, NewIns} | T], [{equal, Common}, {delete, NewDel} | Acc]); + false -> + cleanup_semantic_overlaps([{insert, Ins} | T], [{delete, Del} | Acc]) + end; + true -> + TDel = text_size(Del), + TIns = text_size(Ins), + case Overlap2 >= TDel / 2 orelse Overlap2 >= TIns / 2 of + true -> + Common = substring_start(Del, Overlap2), + NewIns = substring_start(Ins, TIns - Overlap2), + NewDel = skip_chars(Del, Overlap2), + cleanup_semantic_overlaps([{delete, NewDel} | T], [{equal, Common}, {insert, NewIns} | Acc]); + false -> + cleanup_semantic_overlaps([{insert, Ins} | T], [{delete, Del} | Acc]) + end + end; +cleanup_semantic_overlaps([H | T], Acc) -> + cleanup_semantic_overlaps(T, [H | Acc]); +cleanup_semantic_overlaps([], Acc) -> + lists:reverse(Acc). + +%% Helper functions for semantic cleanup + +common_overlap(<<>>, _) -> 0; +common_overlap(_, <<>>) -> 0; +common_overlap(Text1, Text2) -> + T1Len = text_size(Text1), + T2Len = text_size(Text2), + {T1, T2} = if + T1Len > T2Len -> {substring_end(Text1, T2Len), Text2}; + T1Len < T2Len -> {Text1, substring_start(Text2, T1Len)}; + true -> {Text1, Text2} + end, + TMin = min(T1Len, T2Len), + if + T1 =:= T2 -> TMin; + true -> common_overlap_loop(T1, T2, TMin, 0, 1) + end. 
+ +common_overlap_loop(T1, T2, TMin, Best, Length) when Length =< TMin -> + Pattern = substring_end(T1, Length), + case binary:match(T2, Pattern) of + nomatch -> Best; + {FoundByteOffset, _} -> + FoundCharCount = text_size(binary:part(T2, 0, FoundByteOffset)), + NewLength = Length + FoundCharCount, + case NewLength > TMin of + true -> Best; + false -> + case FoundCharCount =:= 0 orelse substring_end(T1, NewLength) =:= substring_start(T2, NewLength) of + true -> + common_overlap_loop(T1, T2, TMin, NewLength, NewLength + 1); + false -> + common_overlap_loop(T1, T2, TMin, Best, NewLength + 1) + end + end + end; +common_overlap_loop(_T1, _T2, _TMin, Best, _Length) -> + Best. + +first_char(<>) -> C; +first_char(_) -> undefined. + +last_char(Bin) -> + last_char(Bin, undefined). +last_char(<>, _Last) -> last_char(Rest, C); +last_char(<<>>, Last) -> Last. + +substring_start(Bin, Len) -> + substring_start(Bin, Len, <<>>). +substring_start(_, 0, Acc) -> Acc; +substring_start(<>, Len, Acc) -> + substring_start(Rest, Len - 1, <>); +substring_start(<<>>, _, Acc) -> Acc. + +substring_end(Bin, Len) -> + TotalLen = text_size(Bin), + if + TotalLen =< Len -> Bin; + true -> skip_chars(Bin, TotalLen - Len) + end. + +skip_chars(Bin, 0) -> Bin; +skip_chars(<<_/utf8, Rest/binary>>, N) -> skip_chars(Rest, N - 1); +skip_chars(<<>>, _) -> <<>>. + +is_non_alphanumeric(undefined) -> true; +is_non_alphanumeric(C) -> + not ((C >= $a andalso C =< $z) orelse + (C >= $A andalso C =< $Z) orelse + (C >= $0 andalso C =< $9)). + +is_whitespace(undefined) -> false; +is_whitespace(C) -> + case C of + $\s -> true; + $\t -> true; + $\n -> true; + $\r -> true; + $\f -> true; + $\v -> true; + _ -> false + end. + +is_linebreak(C) -> + C =:= $\n orelse C =:= $\r. + +is_blankline_end(Bin) -> + case re:run(Bin, <<"\n\r?\n$">> ) of + {match, _} -> true; + nomatch -> false + end. + +is_blankline_start(Bin) -> + case re:run(Bin, <<"^\r?\n\r?\n">> ) of + {match, _} -> true; + nomatch -> false + end. 
% @doc Do efficiency cleanup of diffs. % diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl index 42065a4..4fb2bcc 100644 --- a/test/diffy_tests.erl +++ b/test/diffy_tests.erl @@ -225,14 +225,14 @@ cleanup_semantic_test() -> ?assertEqual([{delete, <<"abc">>}, {insert, <<"ABC">>}, {equal, <<"1234">>}, {delete, <<"wxyz">>}], cleanup_semantic([{delete, <<"abc">>}, {insert, <<"ABC">>}, {equal, <<"1234">>}, {delete, <<"wxyz">>}])), - % % Simple elimination. - % ?assertEqual([{delete, <<"abc">>}, {insert, <<"b">>}], - % cleanup_semantic([{delete, <<"a">>}, {equal, <<"b">>}, {delete, <<"c">>}])), + % Simple elimination. + ?assertEqual([{delete, <<"abc">>}, {insert, <<"b">>}], + cleanup_semantic([{delete, <<"a">>}, {equal, <<"b">>}, {delete, <<"c">>}])), - % % Multiple eliminations. - % ?assertEqual([{delete, <<"AB_AB">>}, {insert, <<"1A2_1A2">>}], - % cleanup_semantic([{insert, <<"1">>}, {equal, <<"A">>}, {delete, <<"B">>}, {insert, <<"2">>}, - % {equal, <<"_">>}, {insert, <<"1">>}, {equal, <<"A">>}, {delete, <<"B">>}, {insert, <<"2">>}])), + % Multiple eliminations. + ?assertEqual([{delete, <<"AB_AB">>}, {insert, <<"1A2_1A2">>}], + cleanup_semantic([{insert, <<"1">>}, {equal, <<"A">>}, {delete, <<"B">>}, {insert, <<"2">>}, + {equal, <<"_">>}, {insert, <<"1">>}, {equal, <<"A">>}, {delete, <<"B">>}, {insert, <<"2">>}])), ok. 
@@ -261,7 +261,7 @@ cleanup_efficiency_test() -> cleanup_efficiency([{delete, <<"ab">>}, {insert, <<"12">>}, {equal, <<"xyz">>}, {delete, <<"cd">>}, {insert, <<"34">>}])), % Three-edit elimination - ?assertEqual([{insert, <<"12x34">>}, {delete, <<"xcd">>}], + ?assertEqual([{delete, <<"xcd">>}, {insert, <<"12x34">>}], cleanup_efficiency([{insert, <<"12">>}, {equal, <<"x">>}, {delete, <<"cd">>}, {insert, <<"34">>}])), % Backpass elimination From 8b504d5f26efd7760a51667b5a7497066c5efa46 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Tue, 7 Apr 2026 10:34:12 +0200 Subject: [PATCH 02/47] Upgraded deps, and use proper generator to make utf8 binaries --- rebar.config | 4 ++-- rebar.lock | 23 +++++++++++++++++------ src/diffy.erl | 31 ++++--------------------------- test/diffy_tests.erl | 27 ++++++++++++++++++++++----- 4 files changed, 45 insertions(+), 40 deletions(-) diff --git a/rebar.config b/rebar.config index adb9366..304974c 100644 --- a/rebar.config +++ b/rebar.config @@ -1,13 +1,13 @@ {erl_opts, [debug_info, warn_unused, warn_shadow_vars]}. {deps, [ - {zotonic_stdlib, "1.2.3"} + {zotonic_stdlib, "1.27.0"} ]}. {profiles, [ {test, [ {deps, [ - {proper, "1.2.0"} + {proper, "1.5.0"} ]}, {xref_checks, [ diff --git a/rebar.lock b/rebar.lock index 05bb477..98901fd 100644 --- a/rebar.lock +++ b/rebar.lock @@ -1,11 +1,22 @@ {"1.2.0", -[{<<"proper">>,{pkg,<<"proper">>,<<"1.2.0">>},0}, - {<<"zotonic_stdlib">>,{pkg,<<"zotonic_stdlib">>,<<"1.2.3">>},0}]}. +[{<<"cowlib">>,{pkg,<<"cowlib">>,<<"2.16.0">>},1}, + {<<"qdate_localtime">>,{pkg,<<"qdate_localtime">>,<<"1.2.2">>},1}, + {<<"ssl_verify_fun">>,{pkg,<<"ssl_verify_fun">>,<<"1.1.7">>},2}, + {<<"tls_certificate_check">>, + {pkg,<<"tls_certificate_check">>,<<"1.31.0">>}, + 1}, + {<<"zotonic_stdlib">>,{pkg,<<"zotonic_stdlib">>,<<"1.27.0">>},0}]}. 
[ {pkg_hash,[ - {<<"proper">>, <<"1466492385959412A02871505434E72E92765958C60DBA144B43863554B505A4">>}, - {<<"zotonic_stdlib">>, <<"4A33B60C82379169C9934CCD1FC9E512CA16B922E131AD6B6D26E562F66DF9CC">>}]}, + {<<"cowlib">>, <<"54592074EBBBB92EE4746C8A8846E5605052F29309D3A873468D76CDF932076F">>}, + {<<"qdate_localtime">>, <<"43E1B20102F50A8B2A2BE7042C2F6BE989AD96CA2CC319DB5DF56E122E8873F6">>}, + {<<"ssl_verify_fun">>, <<"354C321CF377240C7B8716899E182CE4890C5938111A1296ADD3EC74CF1715DF">>}, + {<<"tls_certificate_check">>, <<"9A910B54D8CB96CC810CABF4C0129F21360F82022B20180849F1442A25CCBB04">>}, + {<<"zotonic_stdlib">>, <<"36D6F7A1004DEE169A61ADB57FDE8175F39F59634B5FFFD4AA0C1D0985D2A74E">>}]}, {pkg_hash_ext,[ - {<<"proper">>, <<"CBC3766C08337806741343D330BF4BCB826155D2141BE8514C4B02858AA19FD3">>}, - {<<"zotonic_stdlib">>, <<"4712DD7A0C0C600AFEDAFDA738D40FEBF10CFC2485E62D109361FCC190F7381A">>}]} + {<<"cowlib">>, <<"7F478D80D66B747344F0EA7708C187645CFCC08B11AA424632F78E25BF05DB51">>}, + {<<"qdate_localtime">>, <<"A38D5F1C5AE14B22F471E442B262AECCAFB915B664C7C364443DC73179C50FDA">>}, + {<<"ssl_verify_fun">>, <<"FE4C190E8F37401D30167C8C405EDA19469F34577987C76DDE613E838BBC67F8">>}, + {<<"tls_certificate_check">>, <<"9D2B41B128D5507BD8AD93E1A998E06D0AB2F9A772AF343F4C00BF76C6BE1532">>}, + {<<"zotonic_stdlib">>, <<"B9555F50717F2F8FBD3D4156CE7F4E2DF380441D942DE54789466940929B08C3">>}]} ]. diff --git a/src/diffy.erl b/src/diffy.erl index 16cebb8..df712b4 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -53,7 +53,7 @@ -type for_fun() :: fun((integer(), term()) -> {continue, term()} | {break, term()}). --export_type([diffs/0]). +-export_type([diff_op/0, diff/0, diffs/0]). -define(PATCH_MARGIN, 4). -define(PATCH_MAX_PATCH_LEN, 32). @@ -388,7 +388,7 @@ diff_bisect(A, B) when is_binary(A) andalso is_binary(B) -> compute_diff_bisect1(A, B, M, N) -> %% TODO, add deadline... 
- MaxD = int_ceil((M + N) / 2), + MaxD = ceil((M + N) / 2), VOffset = MaxD, VLength = 2 * MaxD, @@ -1138,14 +1138,7 @@ common_suffix(Text1, Text2) -> % @doc Count the number of characters in a utf8 binary. text_size(Text) when is_binary(Text) -> - text_size(Text, 0). - -text_size(<<>>, Count) -> - Count; -text_size(<<_C/utf8, Rest/binary>>, Count) -> - text_size(Rest, Count+1); -text_size(_, _) -> - error(badarg). + string:length(Text). %% %% Array utilities @@ -1153,12 +1146,7 @@ text_size(_, _) -> % @doc Create an array from a utf8 binary. array_from_binary(Bin) when is_binary(Bin) -> - array_from_binary(Bin, 0, array:new()). - -array_from_binary(<<>>, _N, Array) -> - array:fix(Array); -array_from_binary(<>, N, Array) -> - array_from_binary(Rest, N+1, array:set(N, C, Array)). + array:from_list(unicode:characters_to_list(Bin, utf8)). % @doc Create a binary from an array containing unicode characters. binary_from_array(Start, End, Array) -> @@ -1246,17 +1234,6 @@ repair_head(Bin) -> %% Illegal sequence, can't repair it. {<<>>, Bin}. - -%% This function can go away when we support OTP 20 and up. -%% -int_ceil(Number) -> - T = trunc(Number), - case (Number - T) of - Neg when Neg < 0 -> T; - Pos when Pos > 0 -> T + 1; - _ -> T - end. - %% %% Tests %% diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl index 4fb2bcc..aff3b24 100644 --- a/test/diffy_tests.erl +++ b/test/diffy_tests.erl @@ -28,19 +28,19 @@ %% prop_cleanup_merge() -> - ?FORALL(Diffs, diffy:diffs(), + ?FORALL(Diffs, list({diff_op(), proper_unicode:utf8()}), begin SourceText = diffy:source_text(Diffs), DestinationText = diffy:destination_text(Diffs), CleanDiffs = cleanup_merge(Diffs), - SourceText == diffy:source_text(CleanDiffs) andalso - DestinationText == diffy:destination_text(CleanDiffs) + SourceText == diffy:source_text(CleanDiffs) + andalso DestinationText == diffy:destination_text(CleanDiffs) end). 
prop_cleanup_efficiency() -> - ?FORALL(Diffs, diffy:diffs(), + ?FORALL(Diffs, list({diff_op(), proper_unicode:utf8()}), begin SourceText = diffy:source_text(Diffs), DestinationText = diffy:destination_text(Diffs), @@ -51,6 +51,16 @@ prop_cleanup_efficiency() -> DestinationText == diffy:destination_text(EfficientDiffs) end). +prop_cleanup_semantic() -> + ?FORALL(Diffs, list({diff_op(), proper_unicode:utf8()}), + begin + SourceText = diffy:source_text(Diffs), + DestinationText = diffy:destination_text(Diffs), + EfficientDiffs = cleanup_semantic(Diffs), + SourceText =:= diffy:source_text(EfficientDiffs) andalso + DestinationText =:= diffy:destination_text(EfficientDiffs) + end). + html_like() -> proper_types:resize(200, list(frequency([{70, range($a, $z)}, % letters @@ -240,6 +250,10 @@ cleanup_efficiency_prop_test() -> ?assertEqual(true, proper:quickcheck(prop_cleanup_efficiency(), [{numtests, 500}, {to_file, user}])), ok. +cleanup_semantic_prop_test() -> + ?assertEqual(true, proper:quickcheck(prop_cleanup_semantic(), [{numtests, 500}, {to_file, user}])), + ok. + random_diffs_prop_test() -> ?assertEqual(true, proper:quickcheck(prop_make_diff(), [{numtests, 500}, {to_file, user}])), ok. @@ -279,7 +293,7 @@ text_size_test() -> ?assertEqual(4, diffy:text_size(<<1046/utf8, 1011/utf8, 1022/utf8, 127/utf8>>)), %% Bad utf-8 input results in a badarg. - ?assertError(badarg, diffy:text_size(<<149,157,112,8>>)), + ?assertError({badarg, _}, diffy:text_size(<<149,157,112,8>>)), ok. @@ -325,6 +339,9 @@ diff_test() -> %% Helpers %% +diff_op() -> + oneof([insert, delete, equal]). + pretty_html(Diffs) -> iolist_to_binary(diffy:pretty_html(Diffs)). 
From 4a3afb30e2ed3bd153902861358d3184d0a09263 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Tue, 7 Apr 2026 10:37:55 +0200 Subject: [PATCH 03/47] Increased the number of tests proper does --- test/diffy_tests.erl | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl index aff3b24..bc06050 100644 --- a/test/diffy_tests.erl +++ b/test/diffy_tests.erl @@ -32,11 +32,10 @@ prop_cleanup_merge() -> begin SourceText = diffy:source_text(Diffs), DestinationText = diffy:destination_text(Diffs), - CleanDiffs = cleanup_merge(Diffs), - SourceText == diffy:source_text(CleanDiffs) - andalso DestinationText == diffy:destination_text(CleanDiffs) + SourceText =:= diffy:source_text(CleanDiffs) + andalso DestinationText =:= diffy:destination_text(CleanDiffs) end). prop_cleanup_efficiency() -> @@ -44,11 +43,10 @@ prop_cleanup_efficiency() -> begin SourceText = diffy:source_text(Diffs), DestinationText = diffy:destination_text(Diffs), - EfficientDiffs = cleanup_efficiency(Diffs), - SourceText == diffy:source_text(EfficientDiffs) andalso - DestinationText == diffy:destination_text(EfficientDiffs) + SourceText =:= diffy:source_text(EfficientDiffs) + andalso DestinationText =:= diffy:destination_text(EfficientDiffs) end). prop_cleanup_semantic() -> @@ -57,8 +55,9 @@ prop_cleanup_semantic() -> SourceText = diffy:source_text(Diffs), DestinationText = diffy:destination_text(Diffs), EfficientDiffs = cleanup_semantic(Diffs), - SourceText =:= diffy:source_text(EfficientDiffs) andalso - DestinationText =:= diffy:destination_text(EfficientDiffs) + + SourceText =:= diffy:source_text(EfficientDiffs) + andalso DestinationText =:= diffy:destination_text(EfficientDiffs) end). html_like() -> @@ -247,19 +246,19 @@ cleanup_semantic_test() -> ok. 
cleanup_efficiency_prop_test() -> - ?assertEqual(true, proper:quickcheck(prop_cleanup_efficiency(), [{numtests, 500}, {to_file, user}])), + ?assertEqual(true, proper:quickcheck(prop_cleanup_efficiency(), [{numtests, 800}, {to_file, user}])), ok. cleanup_semantic_prop_test() -> - ?assertEqual(true, proper:quickcheck(prop_cleanup_semantic(), [{numtests, 500}, {to_file, user}])), + ?assertEqual(true, proper:quickcheck(prop_cleanup_semantic(), [{numtests, 800}, {to_file, user}])), ok. random_diffs_prop_test() -> - ?assertEqual(true, proper:quickcheck(prop_make_diff(), [{numtests, 500}, {to_file, user}])), + ?assertEqual(true, proper:quickcheck(prop_make_diff(), [{numtests, 800}, {to_file, user}])), ok. random_inner_diff_prop_test() -> - ?assertEqual(true, proper:quickcheck(prop_inner_diff(), [{numtests, 500}, {to_file, user}])), + ?assertEqual(true, proper:quickcheck(prop_inner_diff(), [{numtests, 800}, {to_file, user}])), ok. cleanup_efficiency_test() -> From faed45a6904a5a94f0114a72602fbd2fde46124c Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Tue, 7 Apr 2026 10:59:31 +0200 Subject: [PATCH 04/47] Fix a utf8 matching problem which caused cleanup_semantic to result in different destination text output --- src/diffy.erl | 71 +++++++++++++++++++++++++------------------- test/diffy_tests.erl | 13 ++++++-- 2 files changed, 51 insertions(+), 33 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index df712b4..c889369 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -762,27 +762,31 @@ cleanup_semantic_overlaps([{delete, Del}, {insert, Ins} | T], Acc) -> Overlap2 = common_overlap(Ins, Del), if Overlap1 >= Overlap2 -> - TDel = text_size(Del), - TIns = text_size(Ins), - case Overlap1 >= TDel / 2 orelse Overlap1 >= TIns / 2 of - true -> - Common = substring_start(Ins, Overlap1), - NewDel = substring_start(Del, TDel - Overlap1), - NewIns = skip_chars(Ins, Overlap1), + TDel = size(Del), + TIns = size(Ins), + Overlap1BytesDel = overlap_to_bytes_end(Del, Overlap1), 
+ Overlap1BytesIns = overlap_to_bytes_start(Ins, Overlap1), + if + Overlap1BytesDel >= TDel / 2 orelse Overlap1BytesIns >= TIns / 2 -> + Common = binary:part(Ins, 0, Overlap1BytesIns), + NewDel = binary:part(Del, 0, TDel - Overlap1BytesDel), + NewIns = binary:part(Ins, Overlap1BytesIns, TIns - Overlap1BytesIns), cleanup_semantic_overlaps([{insert, NewIns} | T], [{equal, Common}, {delete, NewDel} | Acc]); - false -> + true -> cleanup_semantic_overlaps([{insert, Ins} | T], [{delete, Del} | Acc]) end; true -> - TDel = text_size(Del), - TIns = text_size(Ins), - case Overlap2 >= TDel / 2 orelse Overlap2 >= TIns / 2 of - true -> - Common = substring_start(Del, Overlap2), - NewIns = substring_start(Ins, TIns - Overlap2), - NewDel = skip_chars(Del, Overlap2), + TDel = size(Del), + TIns = size(Ins), + Overlap2BytesIns = overlap_to_bytes_end(Ins, Overlap2), + Overlap2BytesDel = overlap_to_bytes_start(Del, Overlap2), + if + Overlap2BytesIns >= TIns / 2 orelse Overlap2BytesDel >= TDel / 2 -> + Common = binary:part(Ins, TIns - Overlap2BytesIns, Overlap2BytesIns), + NewIns = binary:part(Ins, 0, TIns - Overlap2BytesIns), + NewDel = binary:part(Del, Overlap2BytesDel, TDel - Overlap2BytesDel), cleanup_semantic_overlaps([{delete, NewDel} | T], [{equal, Common}, {insert, NewIns} | Acc]); - false -> + true -> cleanup_semantic_overlaps([{insert, Ins} | T], [{delete, Del} | Acc]) end end; @@ -793,17 +797,28 @@ cleanup_semantic_overlaps([], Acc) -> %% Helper functions for semantic cleanup +overlap_to_bytes_start(_Bin, 0) -> 0; +overlap_to_bytes_start(<>, N) -> + size(<>) + overlap_to_bytes_start(Rest, N - 1). + +overlap_to_bytes_end(Bin, N) -> + Skip = text_size(Bin) - N, + skip_n_chars(Bin, Skip). + +skip_n_chars(Rest, 0) -> size(Rest); +skip_n_chars(<<_/utf8, Rest/binary>>, N) -> + skip_n_chars(Rest, N - 1). 
+ common_overlap(<<>>, _) -> 0; common_overlap(_, <<>>) -> 0; common_overlap(Text1, Text2) -> T1Len = text_size(Text1), T2Len = text_size(Text2), - {T1, T2} = if - T1Len > T2Len -> {substring_end(Text1, T2Len), Text2}; - T1Len < T2Len -> {Text1, substring_start(Text2, T1Len)}; - true -> {Text1, Text2} + {T1, T2, TMin} = if + T1Len > T2Len -> {substring_end(Text1, T2Len), Text2, T2Len}; + T1Len < T2Len -> {Text1, substring_start(Text2, T1Len), T1Len}; + true -> {Text1, Text2, T1Len} end, - TMin = min(T1Len, T2Len), if T1 =:= T2 -> TMin; true -> common_overlap_loop(T1, T2, TMin, 0, 1) @@ -839,22 +854,18 @@ last_char(<>, _Last) -> last_char(Rest, C); last_char(<<>>, Last) -> Last. substring_start(Bin, Len) -> - substring_start(Bin, Len, <<>>). -substring_start(_, 0, Acc) -> Acc; -substring_start(<>, Len, Acc) -> - substring_start(Rest, Len - 1, <>); -substring_start(<<>>, _, Acc) -> Acc. + binary:part(Bin, 0, overlap_to_bytes_start(Bin, Len)). substring_end(Bin, Len) -> TotalLen = text_size(Bin), if TotalLen =< Len -> Bin; - true -> skip_chars(Bin, TotalLen - Len) + true -> + SkipChars = TotalLen - Len, + SkipBytes = overlap_to_bytes_start(Bin, SkipChars), + binary:part(Bin, SkipBytes, size(Bin) - SkipBytes) end. -skip_chars(Bin, 0) -> Bin; -skip_chars(<<_/utf8, Rest/binary>>, N) -> skip_chars(Rest, N - 1); -skip_chars(<<>>, _) -> <<>>. is_non_alphanumeric(undefined) -> true; is_non_alphanumeric(C) -> diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl index bc06050..81c8f71 100644 --- a/test/diffy_tests.erl +++ b/test/diffy_tests.erl @@ -239,12 +239,19 @@ cleanup_semantic_test() -> cleanup_semantic([{delete, <<"a">>}, {equal, <<"b">>}, {delete, <<"c">>}])), % Multiple eliminations. 
- ?assertEqual([{delete, <<"AB_AB">>}, {insert, <<"1A2_1A2">>}], - cleanup_semantic([{insert, <<"1">>}, {equal, <<"A">>}, {delete, <<"B">>}, {insert, <<"2">>}, + ?assertEqual([{delete, <<"AB_AB">>}, {insert, <<"1A2_1A2">>}], + cleanup_semantic([{insert, <<"1">>}, {equal, <<"A">>}, {delete, <<"B">>}, {insert, <<"2">>}, {equal, <<"_">>}, {insert, <<"1">>}, {equal, <<"A">>}, {delete, <<"B">>}, {insert, <<"2">>}])), - ok. + % Regression test for UTF-8 data loss in cleanup_semantic_overlaps + % Ins1 = <<0,32,204,128,0,0>> (size 6, text_size 5) + % Ins2 = <<0,0,0,0,0,0,0,0>> (size 8, text_size 8) + % Total Dest size 14, text_size 13 + Diffs = [{delete,<<0,0,0,0,0,0,0,0>>},{insert,<<0,32,204,128,0,0>>},{insert,<<0,0,0,0,0,0,0,0>>}], + Cleaned = cleanup_semantic(Diffs), + ?assertEqual(diffy:destination_text(Diffs), diffy:destination_text(Cleaned)), + ok. cleanup_efficiency_prop_test() -> ?assertEqual(true, proper:quickcheck(prop_cleanup_efficiency(), [{numtests, 800}, {to_file, user}])), ok. From 222bfadb1a28b83ff6a3489e7a3420b176a63620 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Tue, 7 Apr 2026 11:24:45 +0200 Subject: [PATCH 05/47] Robustly handle grapheme counting problems --- src/diffy.erl | 16 +++++++--------- test/diffy_tests.erl | 12 +++++++----- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index c889369..e5c54c2 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -797,17 +797,15 @@ cleanup_semantic_overlaps([], Acc) -> %% Helper functions for semantic cleanup -overlap_to_bytes_start(_Bin, 0) -> 0; -overlap_to_bytes_start(<>, N) -> - size(<>) + overlap_to_bytes_start(Rest, N - 1). +overlap_to_bytes_start(Bin, N) -> + Prefix = string:slice(Bin, 0, N), + size(Prefix). overlap_to_bytes_end(Bin, N) -> - Skip = text_size(Bin) - N, - skip_n_chars(Bin, Skip). - -skip_n_chars(Rest, 0) -> size(Rest); -skip_n_chars(<<_/utf8, Rest/binary>>, N) -> - skip_n_chars(Rest, N - 1). 
+ TotalLen = text_size(Bin), + Skip = TotalLen - N, + Rest = string:slice(Bin, Skip), + size(Rest). common_overlap(<<>>, _) -> 0; common_overlap(_, <<>>) -> 0; diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl index 81c8f71..c3ebe45 100644 --- a/test/diffy_tests.erl +++ b/test/diffy_tests.erl @@ -23,6 +23,8 @@ -include_lib("proper/include/proper.hrl"). -include_lib("eunit/include/eunit.hrl"). +-define(NUM_TESTS, 800). + %% %% Properties %% @@ -219,7 +221,7 @@ cleanup_merge_test() -> ok. cleanup_merge_prop_test() -> - ?assertEqual(true, proper:quickcheck(prop_cleanup_merge(), [{numtests, 500}, {to_file, user}])), + ?assertEqual(true, proper:quickcheck(prop_cleanup_merge(), [{numtests, ?NUM_TESTS}, {to_file, user}])), ok. cleanup_semantic_test() -> @@ -253,19 +255,19 @@ cleanup_semantic_test() -> ok. cleanup_efficiency_prop_test() -> - ?assertEqual(true, proper:quickcheck(prop_cleanup_efficiency(), [{numtests, 800}, {to_file, user}])), + ?assertEqual(true, proper:quickcheck(prop_cleanup_efficiency(), [{numtests, ?NUM_TESTS}, {to_file, user}])), ok. cleanup_semantic_prop_test() -> - ?assertEqual(true, proper:quickcheck(prop_cleanup_semantic(), [{numtests, 800}, {to_file, user}])), + ?assertEqual(true, proper:quickcheck(prop_cleanup_semantic(), [{numtests, ?NUM_TESTS}, {to_file, user}])), ok. random_diffs_prop_test() -> - ?assertEqual(true, proper:quickcheck(prop_make_diff(), [{numtests, 800}, {to_file, user}])), + ?assertEqual(true, proper:quickcheck(prop_make_diff(), [{numtests, ?NUM_TESTS}, {to_file, user}])), ok. random_inner_diff_prop_test() -> - ?assertEqual(true, proper:quickcheck(prop_inner_diff(), [{numtests, 800}, {to_file, user}])), + ?assertEqual(true, proper:quickcheck(prop_inner_diff(), [{numtests, ?NUM_TESTS}, {to_file, user}])), ok. 
cleanup_efficiency_test() -> From 7c0ec5d57765c961320ffa4c349c8c952edd424c Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Tue, 7 Apr 2026 12:05:15 +0200 Subject: [PATCH 06/47] Robustly handle grapheme counting problems --- src/diffy.erl | 26 +++++++++++++++++--------- test/diffy_tests.erl | 4 ++-- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index e5c54c2..81787a0 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -1,9 +1,9 @@ %% @author Maas-Maarten Zeeman -%% @copyright 2014-2019 Maas-Maarten Zeeman +%% @copyright 2014-2026 Maas-Maarten Zeeman %% %% @doc Diffy, an erlang diff match and patch implementation %% -%% Copyright 2014-2019 Maas-Maarten Zeeman +%% Copyright 2014-2026 Maas-Maarten Zeeman %% %% Licensed under the Apache License, Version 2.0 (the "License"); %% you may not use this file except in compliance with the License. @@ -799,13 +799,13 @@ cleanup_semantic_overlaps([], Acc) -> overlap_to_bytes_start(Bin, N) -> Prefix = string:slice(Bin, 0, N), - size(Prefix). + string:length(Prefix). overlap_to_bytes_end(Bin, N) -> TotalLen = text_size(Bin), Skip = TotalLen - N, Rest = string:slice(Bin, Skip), - size(Rest). + string:length(Rest). common_overlap(<<>>, _) -> 0; common_overlap(_, <<>>) -> 0; @@ -829,10 +829,10 @@ common_overlap_loop(T1, T2, TMin, Best, Length) when Length =< TMin -> {FoundByteOffset, _} -> FoundCharCount = text_size(binary:part(T2, 0, FoundByteOffset)), NewLength = Length + FoundCharCount, - case NewLength > TMin of - true -> Best; - false -> - case FoundCharCount =:= 0 orelse substring_end(T1, NewLength) =:= substring_start(T2, NewLength) of + if + NewLength > TMin -> Best; + true -> + case substring_end(T1, NewLength) =:= substring_start(T2, NewLength) of true -> common_overlap_loop(T1, T2, TMin, NewLength, NewLength + 1); false -> @@ -1147,7 +1147,15 @@ common_suffix(Text1, Text2) -> % @doc Count the number of characters in a utf8 binary. 
text_size(Text) when is_binary(Text) -> - string:length(Text). + % string:length(Text). + text_size(Text, 0). + +text_size(<<>>, Count) -> + Count; +text_size(<<_C/utf8, Rest/binary>>, Count) -> + text_size(Rest, Count+1); +text_size(_, _) -> + error(badarg). %% %% Array utilities diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl index c3ebe45..3e6aac2 100644 --- a/test/diffy_tests.erl +++ b/test/diffy_tests.erl @@ -1,9 +1,9 @@ %% @author Maas-Maarten Zeeman -%% @copyright 2014 Maas-Maarten Zeeman +%% @copyright 2014-2026 Maas-Maarten Zeeman %% %% @doc Diffy, an erlang diff match and patch implementation %% -%% Copyright 2014 Maas-Maarten Zeeman +%% Copyright 2014-2026 Maas-Maarten Zeeman %% %% Licensed under the Apache License, Version 2.0 (the "License"); %% you may not use this file except in compliance with the License. From 4907ec12e4e2fb2a33e904abd65edfda454fcc59 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Tue, 7 Apr 2026 13:29:08 +0200 Subject: [PATCH 07/47] Decode line diffs with a tuple instead of an array --- src/diffy.erl | 85 ++++++++++++++++++++++++++------------------ test/diffy_tests.erl | 2 +- 2 files changed, 52 insertions(+), 35 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index 81787a0..8688f42 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -284,7 +284,7 @@ diff_linemode(Text1, Text2) -> Diffs = diff(CharText1, CharText2, false), %% Transform the diffs back to lines. - Diffs1 = chars_to_lines(Diffs, Lines), + Diffs1 = decode_lines(Diffs, Lines), Cleaned = cleanup_merge(Diffs1), cleanup_line_diff(Cleaned, <<>>, <<>>, [], []). @@ -351,16 +351,15 @@ insert_line(Line, Lines, Dict, NextChar) -> {NextChar, NextChar+1, [Line|Lines], dict:store(Line, NextChar, Dict)} end. -%% -chars_to_lines(Diffs, Lines) when is_list(Lines) -> - A = array:from_list(Lines), - chars_to_lines(Diffs, A, []). +decode_lines(Diffs, Lines) when is_list(Lines) -> + LinesTuple = list_to_tuple(Lines), + decode_lines(Diffs, LinesTuple, []). 
-chars_to_lines([], _A, Acc) -> +decode_lines([], _LinesTuple, Acc) -> lists:reverse(Acc); -chars_to_lines([{Op, Data}|Rest], LineArray, Acc) -> - Data1 = << <<(array:get(C, LineArray))/binary>> || <> <= Data >>, - chars_to_lines(Rest, LineArray, [{Op, Data1}|Acc]). +decode_lines([{Op, Data} | Rest], LinesTuple, Acc) -> + Data1 = << <<(element(C + 1, LinesTuple))/binary>> || <> <= Data >>, + decode_lines(Rest, LinesTuple, [{Op, Data1} | Acc]). % Find the 'middle snake' of a diff, split the problem in two @@ -797,15 +796,39 @@ cleanup_semantic_overlaps([], Acc) -> %% Helper functions for semantic cleanup +%% @doc Convert N codepoints from the start of Bin to a byte offset. +%% This is consistent with text_size/1 which counts codepoints (not grapheme clusters). overlap_to_bytes_start(Bin, N) -> - Prefix = string:slice(Bin, 0, N), - string:length(Prefix). + codepoints_to_bytes(Bin, N, 0). + +codepoints_to_bytes(_Bin, 0, Acc) -> + Acc; +codepoints_to_bytes(<>, N, Acc) -> + codepoints_to_bytes(Rest, N - 1, Acc + byte_size(<>)); +codepoints_to_bytes(<<_C, Rest/binary>>, N, Acc) -> + %% Invalid utf-8 byte, count as 1 + codepoints_to_bytes(Rest, N - 1, Acc + 1); +codepoints_to_bytes(<<>>, _N, Acc) -> + Acc. +%% @doc Convert N codepoints from the END of Bin to a byte count of that suffix. overlap_to_bytes_end(Bin, N) -> + SkipChars = text_size(Bin) - N, + SkipBytes = codepoints_to_bytes(Bin, SkipChars, 0), + byte_size(Bin) - SkipBytes. + +substring_start(Bin, Len) -> + binary:part(Bin, 0, overlap_to_bytes_start(Bin, Len)). + +substring_end(Bin, Len) -> TotalLen = text_size(Bin), - Skip = TotalLen - N, - Rest = string:slice(Bin, Skip), - string:length(Rest). + case TotalLen =< Len of + true -> Bin; + false -> + SkipChars = TotalLen - Len, + SkipBytes = codepoints_to_bytes(Bin, SkipChars, 0), + binary:part(Bin, SkipBytes, byte_size(Bin) - SkipBytes) + end. 
common_overlap(<<>>, _) -> 0; common_overlap(_, <<>>) -> 0; @@ -817,9 +840,9 @@ common_overlap(Text1, Text2) -> T1Len < T2Len -> {Text1, substring_start(Text2, T1Len), T1Len}; true -> {Text1, Text2, T1Len} end, - if - T1 =:= T2 -> TMin; - true -> common_overlap_loop(T1, T2, TMin, 0, 1) + case T1 =:= T2 of + true -> TMin; + false -> common_overlap_loop(T1, T2, TMin, 0, 1) end. common_overlap_loop(T1, T2, TMin, Best, Length) when Length =< TMin -> @@ -846,25 +869,20 @@ common_overlap_loop(_T1, _T2, _TMin, Best, _Length) -> first_char(<>) -> C; first_char(_) -> undefined. -last_char(Bin) -> - last_char(Bin, undefined). -last_char(<>, _Last) -> last_char(Rest, C); -last_char(<<>>, Last) -> Last. - -substring_start(Bin, Len) -> - binary:part(Bin, 0, overlap_to_bytes_start(Bin, Len)). +last_char(<<>>) -> undefined; +last_char(Bin) when is_binary(Bin) -> + last_char(Bin, byte_size(Bin) - 1). -substring_end(Bin, Len) -> - TotalLen = text_size(Bin), - if - TotalLen =< Len -> Bin; - true -> - SkipChars = TotalLen - Len, - SkipBytes = overlap_to_bytes_start(Bin, SkipChars), - binary:part(Bin, SkipBytes, size(Bin) - SkipBytes) +last_char(Bin, Pos) -> + case binary:at(Bin, Pos) band 16#C0 of + 16#80 -> + % continuation byte, keep scanning back + last_char(Bin, Pos - 1); + _ -> + <<_:Pos/binary, C/utf8, _/binary>> = Bin, + C end. - is_non_alphanumeric(undefined) -> true; is_non_alphanumeric(C) -> not ((C >= $a andalso C =< $z) orelse @@ -1147,7 +1165,6 @@ common_suffix(Text1, Text2) -> % @doc Count the number of characters in a utf8 binary. text_size(Text) when is_binary(Text) -> - % string:length(Text). text_size(Text, 0). text_size(<<>>, Count) -> diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl index 3e6aac2..27454ea 100644 --- a/test/diffy_tests.erl +++ b/test/diffy_tests.erl @@ -301,7 +301,7 @@ text_size_test() -> ?assertEqual(4, diffy:text_size(<<1046/utf8, 1011/utf8, 1022/utf8, 127/utf8>>)), %% Bad utf-8 input results in a badarg. 
- ?assertError({badarg, _}, diffy:text_size(<<149,157,112,8>>)), + ?assertError(badarg, diffy:text_size(<<149,157,112,8>>)), ok. From 37d50fa14537acdb22a131df66dae8c515283221 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Tue, 7 Apr 2026 19:28:42 +0200 Subject: [PATCH 08/47] Remove array routines for finding the middle snake --- src/diffy.erl | 100 +++++++++++++++---------------------------- test/diffy_tests.erl | 2 +- 2 files changed, 35 insertions(+), 67 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index 8688f42..5c96c6d 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -375,13 +375,15 @@ decode_lines([{Op, Data} | Rest], LinesTuple, Acc) -> %% Array of diff tuples. %% """ diff_bisect(A, B) when is_binary(A) andalso is_binary(B) -> - ArrA = array_from_binary(A), - ArrB = array_from_binary(B), - try compute_diff_bisect1(ArrA, ArrB, array:size(ArrA), array:size(ArrB)) of - no_overlap -> [{delete, A}, {insert, B}] + A32 = unicode:characters_to_binary(A, utf8, utf32), + B32 = unicode:characters_to_binary(B, utf8, utf32), + M = byte_size(A32) div 4, + N = byte_size(B32) div 4, + try compute_diff_bisect1(A32, B32, M, N) of + no_overlap -> [{delete, A}, {insert, B}] catch - throw:{overlap, A1, B1, X, Y} -> - diff_bisect_split(A1, B1, X, Y) + throw:{overlap, X, Y} -> + diff_bisect_split(A, B, A32, B32, X, Y) end. compute_diff_bisect1(A, B, M, N) -> @@ -441,7 +443,7 @@ compute_diff_bisect1(A, B, M, N) -> if X1_1 >= X2 -> % Overlap detected - throw({overlap, A, B, X1_1, Y1_1}); + throw({overlap, X1_1, Y1_1}); true -> {continue, S2_1} end; @@ -491,7 +493,7 @@ compute_diff_bisect1(A, B, M, N) -> % Mirror x2 onto top-left coordinate system. X1 >= M - X2_1 -> % Overlap detected - throw({overlap, A, B, X1, Y1}); + throw({overlap, X1, Y1}); true -> {continue, S4_1} end; @@ -507,12 +509,12 @@ compute_diff_bisect1(A, B, M, N) -> no_overlap. % @doc Split A and B and process the parts. 
-diff_bisect_split(A, B, X, Y) -> - A1 = binary_from_array(0, X, A), - A2 = binary_from_array(0, Y, B), +diff_bisect_split(A, B, A32, B32, X, Y) -> + A1 = utf32_prefix_to_utf8(A32, X), + A2 = utf32_prefix_to_utf8(B32, Y), - B1 = binary_from_array(X, array:size(A), A), - B2 = binary_from_array(Y, array:size(B), B), + B1 = binary:part(A, byte_size(A1), byte_size(A) - byte_size(A1)), + B2 = binary:part(B, byte_size(A2), byte_size(B) - byte_size(A2)), Diffs = diff(A1, A2, false), DiffsB = diff(B1, B2, false), @@ -540,26 +542,11 @@ pretty_html([{Op, Data}|T], Acc) -> % @doc Compute the source text from a list of diffs. source_text(Diffs) -> - source_text(Diffs, <<>>). - -source_text([], Acc) -> - Acc; -source_text([{insert, _Data}|T], Acc) -> - source_text(T, Acc); -source_text([{_Op, Data}|T], Acc) -> - source_text(T, <>). - + iolist_to_binary([Data || {Op, Data} <- Diffs, Op =/= insert]). % @doc Compute the destination text from a list of diffs. destination_text(Diffs) -> - destination_text(Diffs, <<>>). - -destination_text([], Acc) -> - Acc; -destination_text([{delete, _Data}|T], Acc) -> - destination_text(T, Acc); -destination_text([{_Op, Data}|T], Acc) -> - destination_text(T, <>). + iolist_to_binary([Data || {Op, Data} <- Diffs, Op =/= delete]). % @doc Compute the Levenshtein distance, the number of inserted, deleted or substituted characters. levenshtein(Diffs) -> @@ -1091,26 +1078,22 @@ is_suffix(A, B) -> size(A) =:= binary:longest_common_suffix([A, B]). % -match_front(X1, Y1, A, M, B, N) when X1 < M andalso Y1 < N -> - case array:get(X1, A) =:= array:get(Y1, B) of - true -> - match_front(X1+1, Y1+1, A, M, B, N); - false -> - {X1, Y1} - end; +match_front(X1, Y1, A32, M, B32, N) when X1 < M andalso Y1 < N -> + APart = binary:part(A32, X1 * 4, (M - X1) * 4), + BPart = binary:part(B32, Y1 * 4, (N - Y1) * 4), + Steps = binary:longest_common_prefix([APart, BPart]) div 4, + {X1 + Steps, Y1 + Steps}; match_front(X1, Y1, _, _, _, _) -> {X1, Y1}. 
% -match_reverse(X1, Y1, A, M, B, N) when X1 < M andalso Y1 < N -> - case array:get(M-X1-1, A) =:= array:get(N-Y1-1, B) of - true -> - match_reverse(X1+1, Y1+1, A, M, B, N); - false -> - {X1, Y1} - end; -match_reverse(X1, Y1, _, _, _, _) -> - {X1, Y1}. +match_reverse(X2, Y2, A32, M, B32, N) when X2 < M andalso Y2 < N -> + APart = binary:part(A32, 0, (M - X2) * 4), + BPart = binary:part(B32, 0, (N - Y2) * 4), + Steps = binary:longest_common_suffix([APart, BPart]) div 4, + {X2 + Steps, Y2 + Steps}; +match_reverse(X2, Y2, _, _, _, _) -> + {X2, Y2}. %% Implementation of the for statement @@ -1175,22 +1158,13 @@ text_size(_, _) -> error(badarg). %% -%% Array utilities +%% UTF-32 utilities %% -% @doc Create an array from a utf8 binary. -array_from_binary(Bin) when is_binary(Bin) -> - array:from_list(unicode:characters_to_list(Bin, utf8)). - -% @doc Create a binary from an array containing unicode characters. -binary_from_array(Start, End, Array) -> - binary_from_array(Start, End, Array, <<>>). - -binary_from_array(N, End, Array, Acc) when N < End -> - C = array:get(N, Array), - binary_from_array(N+1, End, Array, <>); -binary_from_array(_, _, _, Acc) -> - Acc. +% @doc Convert the first N codepoints of a UTF-32BE binary to a UTF-8 binary. +utf32_prefix_to_utf8(Utf32, CodepointCount) -> + Prefix32 = binary:part(Utf32, 0, CodepointCount * 4), + unicode:characters_to_binary(Prefix32, utf32, utf8). %% @doc Checks the trailing bytes for utf8 prefix bytes. repair_tail(<<>>) -> @@ -1313,12 +1287,6 @@ for_test() -> ?assertEqual(0, for(0, 10, fun(I, _N) -> {break, I} end, undefined)), ok. -array_test() -> - ?assertEqual(20, array:size(array_from_binary(<<"de apen eten bananen">>))), - ?assertEqual(<<"broodje aap">>, binary_from_array(0, 11, array_from_binary(<<"broodje aap">>))), - ?assertEqual(<<"aa">>, binary_from_array(0, 2, array_from_binary(<<"aap">>))), - ?assertEqual(<<"ap">>, binary_from_array(1, 3, array_from_binary(<<"aap">>))), - ok. 
diff_utf8_test() -> ?assertEqual([{equal, <<208,174, 208,189, 208,184, 208,186, 208,190, 208,180>>}], diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl index 27454ea..a0b8849 100644 --- a/test/diffy_tests.erl +++ b/test/diffy_tests.erl @@ -23,7 +23,7 @@ -include_lib("proper/include/proper.hrl"). -include_lib("eunit/include/eunit.hrl"). --define(NUM_TESTS, 800). +-define(NUM_TESTS, 500). %% %% Properties From b822b7b751f92d7704326cec66532e46bb490624 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Tue, 7 Apr 2026 19:35:56 +0200 Subject: [PATCH 09/47] Update otp_versions for testing --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ca85721..0fed54b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -18,14 +18,14 @@ jobs: strategy: matrix: - otp_version: [22,23,24] + otp_version: [25,26,27] os: [ubuntu-latest] container: image: erlang:${{ matrix.otp_version }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Compile run: make - name: Test From 6fc9c9be41a27d480d57acf3211b971cd5249046 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Tue, 7 Apr 2026 19:37:24 +0200 Subject: [PATCH 10/47] Update otp_versions for testing --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0fed54b..7adb427 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -18,7 +18,7 @@ jobs: strategy: matrix: - otp_version: [25,26,27] + otp_version: [26,27,28] os: [ubuntu-latest] container: From 336a95a7c3bb6819b4e34003f3ae48a2f91c6858 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Tue, 7 Apr 2026 19:57:36 +0200 Subject: [PATCH 11/47] Supress dialyzer warning --- src/diffy.app.src | 2 +- src/diffy.erl | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git 
a/src/diffy.app.src b/src/diffy.app.src index ec9688e..4e9d5b5 100644 --- a/src/diffy.app.src +++ b/src/diffy.app.src @@ -2,7 +2,7 @@ {description, "Diff, match patch implementation"}, {vsn, "git"}, {registered, []}, - {applications, [kernel, stdlib]}, + {applications, [kernel, stdlib, zotonic_stdlib]}, {env, []}, {maintainers, ["Maas-Maarten Zeeman", "Zotonic Team"]}, {licenses, ["Apache 2.0"]}, diff --git a/src/diffy.erl b/src/diffy.erl index 5c96c6d..46234d8 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -79,6 +79,8 @@ length2 = 0 }). +-dialyzer({no_match, for/5}). + % @doc Compute the difference between two binary texts % -spec diff(unicode:unicode_binary(), unicode:unicode_binary()) -> diffs(). @@ -1105,10 +1107,8 @@ for(From, To, _Step, _Fun, State) when From >= To -> State; for(From, To, Step, Fun, State) -> case Fun(From, State) of - {continue, S1} -> - for(From + Step, To, Step, Fun, S1); - {break, S1} -> - S1 + {continue, S1} -> for(From + Step, To, Step, Fun, S1); + {break, S1} -> S1 end. split_pre_and_suffix(Text1, Text2) -> From 1c2c8e8c38358d9af89ccb01923a424094d590f9 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Tue, 7 Apr 2026 20:04:46 +0200 Subject: [PATCH 12/47] Extra plt apss for dialyzer --- rebar.config | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/rebar.config b/rebar.config index 304974c..6425ad6 100644 --- a/rebar.config +++ b/rebar.config @@ -26,3 +26,8 @@ ]} ]} ]}. + +{dialyzer, [ + {plt_extra_apps, [eunit]} +]}. 
+ From 82122f10d647635cc7bd3034945adc0fdc8430fb Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Tue, 7 Apr 2026 20:21:47 +0200 Subject: [PATCH 13/47] More dialyzer fixes in test --- rebar.config | 8 ++++---- test/diffy_tests.erl | 8 ++++++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/rebar.config b/rebar.config index 6425ad6..08a0aed 100644 --- a/rebar.config +++ b/rebar.config @@ -22,12 +22,12 @@ {dialyzer, [ {warnings, [ no_return + ]}, + {plt_extra_apps, [ + eunit, + proper ]} ]} ]} ]}. -{dialyzer, [ - {plt_extra_apps, [eunit]} -]}. - diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl index a0b8849..d016669 100644 --- a/test/diffy_tests.erl +++ b/test/diffy_tests.erl @@ -23,6 +23,14 @@ -include_lib("proper/include/proper.hrl"). -include_lib("eunit/include/eunit.hrl"). +-dialyzer({no_opaque, [ + cleanup_merge_prop_test/0, + cleanup_efficiency_prop_test/0, + cleanup_semantic_prop_test/0, + random_inner_diff_prop_test/0, + random_diffs_prop_test/0 +]}). + -define(NUM_TESTS, 500). 
%% From 9e05a94aee8877a76a63a5a82bb3e54da25920df Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Tue, 7 Apr 2026 20:39:16 +0200 Subject: [PATCH 14/47] More strict dialyzer and xref settings --- rebar.config | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/rebar.config b/rebar.config index 08a0aed..bbf548a 100644 --- a/rebar.config +++ b/rebar.config @@ -12,8 +12,10 @@ {xref_checks, [ undefined_function_calls, + undefined_functions, locals_not_used, - deprecated_function_calls + deprecated_function_calls, + deprecated_functions ]}, {xref_ignores, [ @@ -21,7 +23,12 @@ {dialyzer, [ {warnings, [ - no_return + no_return, + extra_return, + missing_return, + specdiffs, + overspecs, + underspecs ]}, {plt_extra_apps, [ eunit, From 0af6dbbb1c736cae9ac3592293f56becad4eb4cb Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Tue, 7 Apr 2026 22:06:51 +0200 Subject: [PATCH 15/47] More strict dialyzer and xref settings --- rebar.config | 2 -- 1 file changed, 2 deletions(-) diff --git a/rebar.config b/rebar.config index bbf548a..03ca5c6 100644 --- a/rebar.config +++ b/rebar.config @@ -26,8 +26,6 @@ no_return, extra_return, missing_return, - specdiffs, - overspecs, underspecs ]}, {plt_extra_apps, [ From 51f623157b38101a5efa79ef21ab8bffb3e8c080 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Wed, 8 Apr 2026 09:25:07 +0200 Subject: [PATCH 16/47] Added make doc and introduce ex_doc --- Makefile | 13 +++++++++---- rebar.config | 10 ++++++++++ src/diffy.erl | 1 + src/diffy_simple_patch.erl | 3 ++- src/diffy_term.erl | 3 ++- 5 files changed, 24 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 82eaca1..b3e31ac 100644 --- a/Makefile +++ b/Makefile @@ -28,13 +28,18 @@ xref: $(REBAR) dialyzer: $(REBAR) $(REBAR) as test dialyzer -clean: $(REBAR) +clean: $(REBAR) clean_doc $(REBAR) clean -distclean: - rm -rf _build - rm $(REBAR) +clean_doc: + @rm -rf doc +distclean: clean_doc + @rm -rf _build + @rm $(REBAR) + +doc: 
$(REBAR) + $(REBAR) ex_doc --output doc --formatter html # dializer diff --git a/rebar.config b/rebar.config index 03ca5c6..b8dd080 100644 --- a/rebar.config +++ b/rebar.config @@ -1,8 +1,18 @@ {erl_opts, [debug_info, warn_unused, warn_shadow_vars]}. + {deps, [ {zotonic_stdlib, "1.27.0"} ]}. +{project_plugins, [rebar3_ex_doc]}. + +{hex, [{doc, ex_doc}]}. + +{ex_doc, [ + {source_url, <<"https://github.com/zotonic/diffy">>}, + {extras, [<<"README.md">>, <<"LICENSE">>]}, + {main, <<"readme">>} +]}. {profiles, [ {test, [ diff --git a/src/diffy.erl b/src/diffy.erl index 46234d8..1de37b2 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -2,6 +2,7 @@ %% @copyright 2014-2026 Maas-Maarten Zeeman %% %% @doc Diffy, an erlang diff match and patch implementation +%% @end %% %% Copyright 2014-2026 Maas-Maarten Zeeman %% diff --git a/src/diffy_simple_patch.erl b/src/diffy_simple_patch.erl index 66a6ae8..29b5753 100644 --- a/src/diffy_simple_patch.erl +++ b/src/diffy_simple_patch.erl @@ -2,8 +2,9 @@ %% @copyright 2014 Maas-Maarten Zeeman %% %% @doc Diffy, an erlang diff match and patch implementation +%% @end %% -%% Copyright 2014 Maas-Maarten Zeeman +%% Copyright 2014-2026 Maas-Maarten Zeeman %% %% Licensed under the Apache License, Version 2.0 (the "License"); %% you may not use this file except in compliance with the License. diff --git a/src/diffy_term.erl b/src/diffy_term.erl index ed25387..2639bf6 100644 --- a/src/diffy_term.erl +++ b/src/diffy_term.erl @@ -3,8 +3,9 @@ %% %% @doc Diffy, an erlang diff match and patch implementation %% Adapted from diffy.erl for simple diff on a list of Erlang terms +%% @end %% -%% Copyright 2014-2015 Maas-Maarten Zeeman, Marc Worrell +%% Copyright 2014-2026 Maas-Maarten Zeeman, Marc Worrell %% %% Licensed under the Apache License, Version 2.0 (the "License"); %% you may not use this file except in compliance with the License. 
From 6e2a5dff7dd58e5f217aa061a6c3a2dad0a4568e Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Wed, 8 Apr 2026 12:26:07 +0200 Subject: [PATCH 17/47] Remove dep zotonic_stdlib and implement html escaping --- rebar.config | 4 +--- rebar.lock | 23 +---------------------- src/diffy.app.src | 2 +- src/diffy.erl | 24 +++++++++++++++++------- test/diffy_tests.erl | 5 +++++ 5 files changed, 25 insertions(+), 33 deletions(-) diff --git a/rebar.config b/rebar.config index b8dd080..ce7ffd3 100644 --- a/rebar.config +++ b/rebar.config @@ -1,8 +1,6 @@ {erl_opts, [debug_info, warn_unused, warn_shadow_vars]}. -{deps, [ - {zotonic_stdlib, "1.27.0"} -]}. +{deps, [ ]}. {project_plugins, [rebar3_ex_doc]}. diff --git a/rebar.lock b/rebar.lock index 98901fd..57afcca 100644 --- a/rebar.lock +++ b/rebar.lock @@ -1,22 +1 @@ -{"1.2.0", -[{<<"cowlib">>,{pkg,<<"cowlib">>,<<"2.16.0">>},1}, - {<<"qdate_localtime">>,{pkg,<<"qdate_localtime">>,<<"1.2.2">>},1}, - {<<"ssl_verify_fun">>,{pkg,<<"ssl_verify_fun">>,<<"1.1.7">>},2}, - {<<"tls_certificate_check">>, - {pkg,<<"tls_certificate_check">>,<<"1.31.0">>}, - 1}, - {<<"zotonic_stdlib">>,{pkg,<<"zotonic_stdlib">>,<<"1.27.0">>},0}]}. 
-[ -{pkg_hash,[ - {<<"cowlib">>, <<"54592074EBBBB92EE4746C8A8846E5605052F29309D3A873468D76CDF932076F">>}, - {<<"qdate_localtime">>, <<"43E1B20102F50A8B2A2BE7042C2F6BE989AD96CA2CC319DB5DF56E122E8873F6">>}, - {<<"ssl_verify_fun">>, <<"354C321CF377240C7B8716899E182CE4890C5938111A1296ADD3EC74CF1715DF">>}, - {<<"tls_certificate_check">>, <<"9A910B54D8CB96CC810CABF4C0129F21360F82022B20180849F1442A25CCBB04">>}, - {<<"zotonic_stdlib">>, <<"36D6F7A1004DEE169A61ADB57FDE8175F39F59634B5FFFD4AA0C1D0985D2A74E">>}]}, -{pkg_hash_ext,[ - {<<"cowlib">>, <<"7F478D80D66B747344F0EA7708C187645CFCC08B11AA424632F78E25BF05DB51">>}, - {<<"qdate_localtime">>, <<"A38D5F1C5AE14B22F471E442B262AECCAFB915B664C7C364443DC73179C50FDA">>}, - {<<"ssl_verify_fun">>, <<"FE4C190E8F37401D30167C8C405EDA19469F34577987C76DDE613E838BBC67F8">>}, - {<<"tls_certificate_check">>, <<"9D2B41B128D5507BD8AD93E1A998E06D0AB2F9A772AF343F4C00BF76C6BE1532">>}, - {<<"zotonic_stdlib">>, <<"B9555F50717F2F8FBD3D4156CE7F4E2DF380441D942DE54789466940929B08C3">>}]} -]. +[]. diff --git a/src/diffy.app.src b/src/diffy.app.src index 4e9d5b5..ec9688e 100644 --- a/src/diffy.app.src +++ b/src/diffy.app.src @@ -2,7 +2,7 @@ {description, "Diff, match patch implementation"}, {vsn, "git"}, {registered, []}, - {applications, [kernel, stdlib, zotonic_stdlib]}, + {applications, [kernel, stdlib]}, {env, []}, {maintainers, ["Maas-Maarten Zeeman", "Zotonic Team"]}, {licenses, ["Apache 2.0"]}, diff --git a/src/diffy.erl b/src/diffy.erl index 1de37b2..37b9dcf 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -525,23 +525,33 @@ diff_bisect_split(A, B, A32, B32, X, Y) -> Diffs ++ DiffsB. % @doc Convert the diffs into a pretty html report --spec pretty_html(diffs()) -> iolist(). pretty_html(Diffs) -> pretty_html(Diffs, []). 
pretty_html([], Acc) -> lists:reverse(Acc); -pretty_html([{Op, Data}|T], Acc) -> - Text = z_html:escape(Data), +pretty_html([{Op, Data} | T], Acc) -> + Safe = html_escape(Data), HTML = case Op of insert -> - [<<"">>, Text, <<"">>]; + [<<"">>, Safe, <<"">>]; delete -> - [<<"">>, Text, <<"">>]; + [<<"">>, Safe, <<"">>]; equal -> - [<<"">>, Text, <<"">>] + [<<"">>, Safe, <<"">>] end, - pretty_html(T, [HTML|Acc]). + pretty_html(T, [HTML | Acc]). + +html_escape(B) when is_binary(B) -> + binary:replace(B, + [<<"&">>, <<"<">>, <<">">>, <<"\"">>, <<"'">>], + fun (<<"&">>) -> <<"&">>; + (<<"<">>) -> <<"<">>; + (<<">">>) -> <<">">>; + (<<"\"">>) -> <<""">>; + (<<"'">>) -> <<"'">> + end, + [global]). % @doc Compute the source text from a list of diffs. source_text(Diffs) -> diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl index d016669..694db66 100644 --- a/test/diffy_tests.erl +++ b/test/diffy_tests.erl @@ -142,8 +142,13 @@ pretty_html_test() -> ?assertEqual(<<"test">>, pretty_html([{equal, <<"test">>}])), ?assertEqual(<<"footest">>, pretty_html([{delete, <<"foo">>}, {equal, <<"test">>}])), + ?assertEqual(<<"footest">>, pretty_html([{insert, <<"foo">>}, {equal, <<"test">>}])), + + %% escaping. + ?assertEqual(<<"<span>foo</span>& < > " '">>, + pretty_html([{insert, <<"foo">>}, {equal, <<"& < > \" '">>}])), ok. source_text_test() -> From 53d29fb73206743a7abee46188f6fb812402c624 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Wed, 8 Apr 2026 14:03:15 +0200 Subject: [PATCH 18/47] binary:replace with replacement function is only available from otp 27 --- src/diffy.erl | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/diffy.erl b/src/diffy.erl index 37b9dcf..dab2972 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -542,6 +542,7 @@ pretty_html([{Op, Data} | T], Acc) -> end, pretty_html(T, [HTML | Acc]). +-if(?OTP_RELEASE >= 27). 
html_escape(B) when is_binary(B) -> binary:replace(B, [<<"&">>, <<"<">>, <<">">>, <<"\"">>, <<"'">>], @@ -552,7 +553,23 @@ html_escape(B) when is_binary(B) -> (<<"'">>) -> <<"'">> end, [global]). +-else. +html_escape(B) when is_binary(B) -> + lists:foldl(fun({From, To}, Acc) -> + binary:replace(Acc, From, To, [global]) + end, + B, + [ + {<<"&">>, <<"&">>}, + {<<"<">>, <<"<">>}, + {<<">">>, <<">">>}, + {<<"\"">>, <<""">>}, + {<<"'">>, <<"'">>} + ]). +-endif. + +% Above function can be replaced with this when OTP 27 is the lowest supported % @doc Compute the source text from a list of diffs. source_text(Diffs) -> iolist_to_binary([Data || {Op, Data} <- Diffs, Op =/= insert]). From 8d05c22579c62d809847111d55495fb6d98f0ad6 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Thu, 9 Apr 2026 22:39:45 +0200 Subject: [PATCH 19/47] Changed the internal functions to use utf32 binaries instead of trying to work with utf8 and repair the result --- src/diffy.erl | 877 ++++++++++++++++++++----------------------- test/diffy_tests.erl | 99 ++++- 2 files changed, 503 insertions(+), 473 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index dab2972..57d04ce 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -23,6 +23,7 @@ -export([ diff/2, + diff/3, diff_bisect/2, diff_linemode/2, @@ -52,16 +53,19 @@ -type diff() :: {diff_op(), unicode:unicode_binary()}. -type diffs() :: list(diff()). +-type diff_option() :: + semantic | + efficiency | + {efficiency, EditCost :: pos_integer()} | + no_linemode. + -type for_fun() :: fun((integer(), term()) -> {continue, term()} | {break, term()}). --export_type([diff_op/0, diff/0, diffs/0]). +-export_type([diff_op/0, diff/0, diffs/0, diff_option/0]). -define(PATCH_MARGIN, 4). --define(PATCH_MAX_PATCH_LEN, 32). - --define(MATCH_MAXBITS, 31). - -define(IS_INS_OR_DEL(Op), (Op =:= insert orelse Op =:= delete)). +-define(PHASH2_RANGE, (1 bsl 32)). -record(bisect_state, { k1start = 0, k1end = 0, @@ -82,34 +86,72 @@ -dialyzer({no_match, for/5}). 
-% @doc Compute the difference between two binary texts -% +% @doc Compute the difference between two binary texts. -spec diff(unicode:unicode_binary(), unicode:unicode_binary()) -> diffs(). diff(Text1, Text2) -> - diff(Text1, Text2, true). + diff(Text1, Text2, []). + +% @doc Compute the difference between two binary texts with options. +% +% Options: +% semantic - run cleanup_semantic/1 on the result +% efficiency - run cleanup_efficiency/1 on the result (default edit cost 4) +% {efficiency, Cost} - run cleanup_efficiency/2 with a custom edit cost +% no_linemode - disable the linemode optimization for large texts +% +% Cleanups are always applied in the correct order: semantic first, then efficiency. +-spec diff(unicode:unicode_binary(), unicode:unicode_binary(), [diff_option()]) -> diffs(). +diff(Text1, Text2, Options) when is_list(Options) -> + CheckLines = not lists:member(no_linemode, Options), + T1 = to_utf32(Text1), + T2 = to_utf32(Text2), + Diffs32 = diff32(T1, T2, CheckLines), + Diffs1 = case lists:member(semantic, Options) of + true -> cleanup_semantic32(Diffs32); + false -> Diffs32 + end, + Diffs2 = case efficiency_opt(Options) of + none -> Diffs1; + default -> cleanup_efficiency32(Diffs1); + {custom, Cost} -> cleanup_efficiency32(Diffs1, Cost) + end, + %% Single conversion at the exit boundary. + [{Op, to_utf8(D)} || {Op, D} <- Diffs2]. + +%% Extract the efficiency option, preferring {efficiency, Cost} over plain efficiency. +efficiency_opt(Options) -> + case lists:keyfind(efficiency, 1, Options) of + {efficiency, Cost} -> {custom, Cost}; + false -> + case lists:member(efficiency, Options) of + true -> default; + false -> none + end + end. -diff(<<>>, <<>>, _CheckLines) -> +%% Internal diff working entirely in UTF-32 binaries. 
+diff32(<<>>, <<>>, _CheckLines) -> []; -diff(Text1, Text2, _CheckLines) when Text1 =:= Text2 -> +diff32(Text1, Text2, _CheckLines) when Text1 =:= Text2 -> [{equal, Text1}]; -diff(Text1, Text2, CheckLines) -> +diff32(Text1, Text2, CheckLines) -> {Prefix, MText1, MText2, Suffix} = split_pre_and_suffix(Text1, Text2), Diffs = compute_diff(MText1, MText2, CheckLines), Diffs1 = case Suffix of - <<>> -> Diffs; - _ -> Diffs ++ [{equal, Suffix}] - end, + <<>> -> Diffs; + _ -> Diffs ++ [{equal, Suffix}] + end, - Diffs2 = case Prefix of - <<>> -> Diffs1; - _ -> [{equal, Prefix} | Diffs1] - end, + Diffs2 = case Prefix of + <<>> -> Diffs1; + _ -> [{equal, Prefix} | Diffs1] + end, - cleanup_merge(Diffs2). + cleanup_merge32(Diffs2). -%% This assumes Text1 and Text2 don't have a common prefix +%% This assumes Text1 and Text2 don't have a common prefix. Operates on UTF-32. compute_diff(<<>>, NewText, _CheckLines) -> [{insert, NewText}]; compute_diff(OldText, <<>>, _CheckLines) -> @@ -118,22 +160,23 @@ compute_diff(OldText, NewText, CheckLines) -> OldStNew = size(OldText) < size(NewText), {ShortText, LongText} = case OldStNew of - true -> {OldText, NewText}; - false -> {NewText, OldText} - end, + true -> {OldText, NewText}; + false -> {NewText, OldText} + end, case binary:match(LongText, ShortText) of {Start, Length} -> <> = LongText, Op = diff_op(OldStNew), - [{Op, Pre}, {equal, ShortText}, {Op, Suf}]; + [{Op, Pre}, {equal, ShortText}, {Op, Suf}]; nomatch -> - case single_char(ShortText) of + %% In UTF-32, a single codepoint is exactly 4 bytes. + case size(ShortText) =:= 4 of true -> [{delete, OldText}, {insert, NewText}]; false -> try_half_match(OldText, NewText, CheckLines) - end + end end. diff_op(true) -> insert; @@ -143,14 +186,15 @@ diff_op(false) -> delete. 
try_half_match(OldText, NewText, CheckLines) -> case half_match(OldText, NewText) of {half_match, A1, A2, B1, B2, Common} -> - Diffs1 = diff(A1, B1, CheckLines), - Diffs2 = diff(A2, B2, CheckLines), + Diffs1 = diff32(A1, B1, CheckLines), + Diffs2 = diff32(A2, B2, CheckLines), Diffs1 ++ [{equal, Common} | Diffs2]; undefined -> compute_diff1(OldText, NewText, CheckLines) end. %% Check if we can do a half-match diff, returns undefined if it is not advantageous. +%% Operates on UTF-32 binaries — size comparisons are in bytes (4 bytes per codepoint). half_match(A, B) -> AGtB = size(A) > size(B), {Short, Long} = case AGtB of @@ -158,28 +202,28 @@ half_match(A, B) -> false -> {A, B} end, - case text_smaller_than(Long, 4) orelse size(Short) * 2 < size(Long) of + %% text_smaller_than(Long, 4) becomes size(Long) < 4*4 in UTF-32. + case size(Long) < 16 orelse size(Short) * 2 < size(Long) of true -> %% No point in looking. undefined; false -> - %% Note: this could split through a utf8 byte sequence. Hm1 = half_match_i(Long, Short, (size(Long) + 3) div 4), Hm2 = half_match_i(Long, Short, (size(Long) + 1) div 2), %% Select the longest half-match. Hm = case {Hm1, Hm2} of - {undefined, undefined} -> - undefined; - {undefined, _} -> - Hm2; - {_, undefined} -> - Hm1; - {{half_match, _, _, _, _, C1}, {half_match, _, _, _, _, C2}} when size(C1) > size(C2) -> - Hm1; - {_, _} -> - Hm2 - end, + {undefined, undefined} -> + undefined; + {undefined, _} -> + Hm2; + {_, undefined} -> + Hm1; + {{half_match, _, _, _, _, C1}, {half_match, _, _, _, _, C2}} when size(C1) > size(C2) -> + Hm1; + {_, _} -> + Hm2 + end, %% Swap values if A was smaller than B case Hm of @@ -193,19 +237,14 @@ half_match(A, B) -> end end. - % Find the best common overlap at location I. 
half_match_i(Long, Short, I) -> {NewI, Seed} = seed(Long, I), case Seed of - <<>> -> - undefined; - _ -> - best_common(Long, Short, Seed, NewI, 0, - undefined, undefined, undefined, undefined, <<>>) + <<>> -> undefined; + _ -> best_common(Long, Short, Seed, NewI, 0, <<>>, <<>>, <<>>, <<>>, <<>>) end. - %% Find the best common overlap inside two texts. best_common(Long, Short, Seed, SeedLoc, Start, BestLongA, BestLongB, BestShortA, BestShortB, BestCommon) -> @@ -251,45 +290,49 @@ best_common(Long, Short, Seed, SeedLoc, Start, end end. -%% @doc Return the position of the next character. -next_char(Bin, Pos) -> - <<_:Pos/binary, C/utf8, _Rest/binary>> = Bin, - %% The next char is at binary position... - Pos + size(<>). +%% @doc Return the byte position of the next codepoint in a UTF-32 binary. +next_char(_Bin, Pos) -> + Pos + 4. -%% +%% +%% In UTF-32 every codepoint is exactly 4 bytes, so any 4-byte-aligned slice +%% is a valid codepoint boundary — no repair_head/repair_tail needed. seed(Long, Start) -> SeedSize = size(Long) div 4, - %% Note, need to split on utf8 character boundary here. - <<_Pre:Start/binary, Seed:SeedSize/binary, _Post/binary>> = Long, - - %% Utf-8 repair the seed's head and tail. - {Pre, Seed1} = repair_head(Seed), - {Seed2, _} = repair_tail(Seed1), + %% Align Start to a 4-byte (codepoint) boundary. + AlignedStart = (Start div 4) * 4, + <<_Pre:AlignedStart/binary, Seed:SeedSize/binary, _Post/binary>> = Long, - %% return the start position of the seed and the seed itself. - {Start - size(Pre), Seed2}. + {AlignedStart, Seed}. 
%% Line diff compute_diff1(Text1, Text2, true) -> - diff_linemode(Text1, Text2); -compute_diff1(Text1, Text2, false) when size(Text1) > 100 orelse size(Text2) > 100 -> - diff_linemode(Text1, Text2); + diff_linemode32(Text1, Text2); +compute_diff1(Text1, Text2, false) when size(Text1) > 400 orelse size(Text2) > 400 -> + %% 100 UTF-8 bytes ≈ 400 UTF-32 bytes (conservative upper bound) + diff_linemode32(Text1, Text2); compute_diff1(Text1, Text2, false) -> - diff_bisect(Text1, Text2). + diff_bisect32(Text1, Text2). -%% Compute diff in linemode +%% Public entry: accepts UTF-8, converts at boundary. diff_linemode(Text1, Text2) -> + T1 = to_utf32(Text1), + T2 = to_utf32(Text2), + Diffs32 = diff_linemode32(T1, T2), + [{Op, to_utf8(D)} || {Op, D} <- Diffs32]. + +%% Internal: operates entirely on UTF-32 binaries. +diff_linemode32(Text1, Text2) -> {CharText1, CharText2, Lines} = lines_to_chars(Text1, Text2), - Diffs = diff(CharText1, CharText2, false), + Diffs = diff32(CharText1, CharText2, false), %% Transform the diffs back to lines. Diffs1 = decode_lines(Diffs, Lines), - Cleaned = cleanup_merge(Diffs1), + Cleaned = cleanup_merge32(Diffs1), cleanup_line_diff(Cleaned, <<>>, <<>>, [], []). @@ -313,45 +356,51 @@ cleanup_line_diff([{equal, _}=E|Rest], DeleteData, InsertData, TmpAcc, Acc) %% Found leading insert and delete data, diff the texts and replace the operations. cleanup_line_diff([{equal, _}=E|Rest], DeleteData, InsertData, _TmpAcc, Acc) -> - %% rediff the delete and insert data. - Diffs = diff(DeleteData, InsertData, false), + %% Data is already UTF-32 — pass directly to diff32. + Diffs = diff32(DeleteData, InsertData, false), Acc1 = lists:reverse(Diffs) ++ Acc, cleanup_line_diff(Rest, <<>>, <<>>, [], [E|Acc1]). -%% Diff lines +%% Diff lines. +%% Text1 and Text2 are UTF-32 binaries. Lines are stored as UTF-32 binaries. +%% CharText1/CharText2 are UTF-32 binaries where each 4-byte word is a line index. 
lines_to_chars(Text1, Text2) -> - {CharText1, NextChar, Lines1, Dict1} = lines_to_chars(Text1, 0, <<>>, 0, [], dict:new()), - {CharText2, _, Lines2, _Dict2} = lines_to_chars(Text2, 0, <<>>, NextChar, Lines1, Dict1), + Utf8Text1 = to_utf8(Text1), + Utf8Text2 = to_utf8(Text2), + {CharText1, NextChar, Lines1, Map1} = lines_to_chars(Utf8Text1, 0, <<>>, 0, [], #{}), + {CharText2, _, Lines2, _Map2} = lines_to_chars(Utf8Text2, 0, <<>>, NextChar, Lines1, Map1), {CharText1, CharText2, lists:reverse(Lines2)}. -% Transform each unique line into a single char -lines_to_chars(Text, Idx, CharText, NextChar, Lines, D) when Idx >= size(Text) -> - {CharText, NextChar, Lines, D}; -lines_to_chars(Text, Idx, CharText, NextChar, Lines, D) -> - case binary:match(Text, <<"\n">>, [{scope, {Idx, size(Text)-Idx}}]) of +%% Transform each unique line into a 4-byte index; store line content as UTF-32. +lines_to_chars(Text, Idx, CharText, NextChar, Lines, Map) when Idx >= byte_size(Text) -> + {CharText, NextChar, Lines, Map}; +lines_to_chars(Text, Idx, CharText, NextChar, Lines, Map) -> + case binary:match(Text, <<"\n">>, [{scope, {Idx, byte_size(Text)-Idx}}]) of nomatch -> <<_:Idx/binary, Line/binary>> = Text, - {Char, NextChar1, Lines1, D1} = insert_line(Line, Lines, D, NextChar), - CharText1 = <>, - {CharText1, NextChar1, Lines1, D1}; + {Char, NextChar1, Lines1, Map1} = insert_line(to_utf32(Line), Lines, Map, NextChar), + CharText1 = <>, + {CharText1, NextChar1, Lines1, Map1}; {Start, _} -> LineLength = Start - Idx + 1, <<_:Idx/binary, Line:LineLength/binary, _/binary>> = Text, - - {Char, NextChar1, Lines1, D1} = insert_line(Line, Lines, D, NextChar), - CharText1 = <>, - - lines_to_chars(Text, Idx + LineLength, CharText1, NextChar1, Lines1, D1) + {Char, NextChar1, Lines1, Map1} = insert_line(to_utf32(Line), Lines, Map, NextChar), + CharText1 = <>, + lines_to_chars(Text, Idx + LineLength, CharText1, NextChar1, Lines1, Map1) end. 
-insert_line(Line, Lines, Dict, NextChar) -> - case dict:find(Line, Dict) of - {ok, Char} -> - {Char, NextChar, Lines, Dict}; - error -> - {NextChar, NextChar+1, [Line|Lines], dict:store(Line, NextChar, Dict)} + +insert_line(Line, Lines, Map, NextChar) -> + Hash = erlang:phash2(Line, ?PHASH2_RANGE), + case Map of + %% Hash hit — verify the stored line matches to guard against collisions. + #{Hash := {Char, Line}} -> + {Char, NextChar, Lines, Map}; + %% Hash miss or collision with a different line — assign a new index. + _ -> + {NextChar, NextChar + 1, [Line | Lines], Map#{Hash => {NextChar, Line}}} end. decode_lines(Diffs, Lines) when is_list(Lines) -> @@ -361,7 +410,8 @@ decode_lines(Diffs, Lines) when is_list(Lines) -> decode_lines([], _LinesTuple, Acc) -> lists:reverse(Acc); decode_lines([{Op, Data} | Rest], LinesTuple, Acc) -> - Data1 = << <<(element(C + 1, LinesTuple))/binary>> || <> <= Data >>, + %% Each index is a 32-bit word; lines are already UTF-32 — just concatenate. + Data1 = << <<(element(C + 1, LinesTuple))/binary>> || <> <= Data >>, decode_lines(Rest, LinesTuple, [{Op, Data1} | Acc]). @@ -377,16 +427,20 @@ decode_lines([{Op, Data} | Rest], LinesTuple, Acc) -> %% Returns: %% Array of diff tuples. %% """ +%% Public entry point — converts UTF-8 inputs to UTF-32, runs bisect, converts back. diff_bisect(A, B) when is_binary(A) andalso is_binary(B) -> - A32 = unicode:characters_to_binary(A, utf8, utf32), - B32 = unicode:characters_to_binary(B, utf8, utf32), - M = byte_size(A32) div 4, - N = byte_size(B32) div 4, - try compute_diff_bisect1(A32, B32, M, N) of + Diffs32 = diff_bisect32(to_utf32(A), to_utf32(B)), + [{Op, to_utf8(D)} || {Op, D} <- Diffs32]. + +%% Internal bisect working entirely on UTF-32 binaries. 
+diff_bisect32(A, B) -> + M = byte_size(A) div 4, + N = byte_size(B) div 4, + try compute_diff_bisect1(A, B, M, N) of no_overlap -> [{delete, A}, {insert, B}] catch throw:{overlap, X, Y} -> - diff_bisect_split(A, B, A32, B32, X, Y) + diff_bisect_split(A, B, X, Y) end. compute_diff_bisect1(A, B, M, N) -> @@ -414,11 +468,13 @@ compute_diff_bisect1(A, B, M, N) -> S3 = for(-D + S1#bisect_state.k1start, D + 1 - S1#bisect_state.k1end, 2, fun(K1, S2) -> K1Offset = VOffset + K1, - X1 = case K1 =:= -D orelse (K1 =/= D andalso - (array:get(K1Offset-1, S2#bisect_state.v1) < array:get(K1Offset+1, S2#bisect_state.v1))) of - true -> array:get(K1Offset + 1, S2#bisect_state.v1); - false -> array:get(K1Offset - 1, S2#bisect_state.v1) + 1 - end, + X1 = case K1 =:= -D + orelse (K1 =/= D + andalso (array:get(K1Offset-1, S2#bisect_state.v1) < array:get(K1Offset+1, S2#bisect_state.v1))) + of + true -> array:get(K1Offset + 1, S2#bisect_state.v1); + false -> array:get(K1Offset - 1, S2#bisect_state.v1) + 1 + end, Y1 = X1 - K1, {X1_1, Y1_1} = match_front(X1, Y1, A, M, B, N), @@ -460,13 +516,13 @@ compute_diff_bisect1(A, B, M, N) -> %% Walk the reverse path one step. (verdacht hetzelfde als het ding hierboven...) S5 = for(-D + S3#bisect_state.k2start, D + 1 - S3#bisect_state.k2end, 2, fun(K2, S4) -> K2Offset = VOffset + K2, - X2 = case K2 =:= -D orelse (K2 =/= D andalso - array:get(K2Offset-1, S4#bisect_state.v2) < array:get(K2Offset+1, S4#bisect_state.v2)) of - true -> - array:get(K2Offset + 1, S4#bisect_state.v2); - false -> - array:get(K2Offset - 1, S4#bisect_state.v2) + 1 - end, + X2 = case K2 =:= -D + orelse (K2 =/= D + andalso array:get(K2Offset-1, S4#bisect_state.v2) < array:get(K2Offset+1, S4#bisect_state.v2)) + of + true -> array:get(K2Offset + 1, S4#bisect_state.v2); + false -> array:get(K2Offset - 1, S4#bisect_state.v2) + 1 + end, Y2 = X2 - K2, @@ -511,18 +567,14 @@ compute_diff_bisect1(A, B, M, N) -> no_overlap. -% @doc Split A and B and process the parts. 
-diff_bisect_split(A, B, A32, B32, X, Y) -> - A1 = utf32_prefix_to_utf8(A32, X), - A2 = utf32_prefix_to_utf8(B32, Y), - - B1 = binary:part(A, byte_size(A1), byte_size(A) - byte_size(A1)), - B2 = binary:part(B, byte_size(A2), byte_size(B) - byte_size(A2)), +% @doc Split A and B at the overlap point and recursively diff each half. +diff_bisect_split(A, B, X, Y) -> + A1 = binary:part(A, 0, X * 4), + A2 = binary:part(B, 0, Y * 4), + B1 = binary:part(A, X * 4, byte_size(A) - X * 4), + B2 = binary:part(B, Y * 4, byte_size(B) - Y * 4), - Diffs = diff(A1, A2, false), - DiffsB = diff(B1, B2, false), - - Diffs ++ DiffsB. + diff32(A1, A2, false) ++ diff32(B1, B2, false). % @doc Convert the diffs into a pretty html report pretty_html(Diffs) -> @@ -597,45 +649,50 @@ levenshtein([{equal, _Data}|T], Insertions, Deletions, Levenshtein) -> % -spec cleanup_merge(diffs()) -> diffs(). cleanup_merge(Diffs) -> - Diffs1 = cleanup_merge(Diffs, []), + Diffs32 = [{Op, to_utf32(D)} || {Op, D} <- Diffs], + [{Op, to_utf8(D)} || {Op, D} <- cleanup_merge32(Diffs32)]. + +%% Internal cleanup_merge operating on UTF-32 diffs. +cleanup_merge32(Diffs) -> + Diffs1 = cleanup_merge32(Diffs, []), canonicalize_edits(Diffs1, []). %% Done -cleanup_merge([], Acc) -> +cleanup_merge32([], Acc) -> lists:reverse(Acc); %% Remove operations without data. 
-cleanup_merge([{_Op, <<>>}|T], Acc) -> - cleanup_merge(T, Acc); +cleanup_merge32([{_Op, <<>>}|T], Acc) -> + cleanup_merge32(T, Acc); %% Merge data from equal operations -cleanup_merge([{Op2, Data2}|T], [{Op1, Data1}|Acc]) when Op1 =:= Op2 -> - cleanup_merge(T, [{Op1, <>}|Acc]); +cleanup_merge32([{Op2, Data2}|T], [{Op1, Data1}|Acc]) when Op1 =:= Op2 -> + cleanup_merge32(T, [{Op1, <>}|Acc]); %% Cleanup edits before equal operation -cleanup_merge([{Op1, Data1}|T], [{Op2, _}=I, {Op3, Data3}|Acc]) when Op1 =/= Op2 andalso Op1 =:= Op3 andalso Op2 =/= equal andalso Op3 =/= equal -> - cleanup_merge(T, [I, {Op3, <>}|Acc]); +cleanup_merge32([{Op1, Data1}|T], [{Op2, _}=I, {Op3, Data3}|Acc]) when Op1 =/= Op2 andalso Op1 =:= Op3 andalso Op2 =/= equal andalso Op3 =/= equal -> + cleanup_merge32(T, [I, {Op3, <>}|Acc]); %% Check if Op1Data and Op2Data have common prefixes. -cleanup_merge([{equal, E1}|T], [{Op1, Op1Data}, {Op2, Op2Data}, {equal, E2}|Acc]) when Op1 =/= Op2 andalso Op1 =/= equal andalso Op2 =/= equal -> +cleanup_merge32([{equal, E1}|T], [{Op1, Op1Data}, {Op2, Op2Data}, {equal, E2}|Acc]) when Op1 =/= Op2 andalso Op1 =/= equal andalso Op2 =/= equal -> {Prefix, Op1DataD, Op2DataD, Suffix} = split_pre_and_suffix(Op1Data, Op2Data), - cleanup_merge(T, [{equal, <>}, + cleanup_merge32(T, [{equal, <>}, {Op1, Op1DataD}, {Op2, Op2DataD}, {equal, <>}|Acc]); %% Check for slide left and slide right edits -cleanup_merge([{equal, E1}=H|T], [{Op, I}, {equal, E2}|AccTail]=Acc) when Op =:= insert orelse Op =:= delete -> +cleanup_merge32([{equal, E1}=H|T], [{Op, I}, {equal, E2}|AccTail]=Acc) when Op =:= insert orelse Op =:= delete -> case is_suffix(E2, I) of false -> case is_prefix(E1, I) of false -> - cleanup_merge(T, [H|Acc]); + cleanup_merge32(T, [H|Acc]); true -> P = size(E1), <<_:P/binary, Post/binary>> = I, - cleanup_merge([{equal, <>}, {Op, <>}|T], AccTail) + cleanup_merge32([{equal, <>}, {Op, <>}|T], AccTail) end; true -> R = size(I) - size(E2), - <> = I, - cleanup_merge([{Op, 
<>}, {equal, <>}|T], AccTail) + <> = I, + cleanup_merge32([{Op, <>}, {equal, <>}|T], AccTail) end; -cleanup_merge([H|T], Acc) -> - cleanup_merge(T, [H|Acc]). +cleanup_merge32([H|T], Acc) -> + cleanup_merge32(T, [H|Acc]). canonicalize_edits([{insert, I}, {delete, D} | T], Acc) -> canonicalize_edits(T, [{insert, I}, {delete, D} | Acc]); @@ -648,8 +705,13 @@ canonicalize_edits([], Acc) -> % -spec cleanup_semantic(diffs()) -> diffs(). cleanup_semantic(Diffs) -> + Diffs32 = [{Op, to_utf32(D)} || {Op, D} <- Diffs], + [{Op, to_utf8(D)} || {Op, D} <- cleanup_semantic32(Diffs32)]. + +%% Internal semantic cleanup operating on UTF-32 diffs. +cleanup_semantic32(Diffs) -> Diffs1 = cleanup_semantic_breakpoints(Diffs), - Diffs2 = cleanup_merge(Diffs1), + Diffs2 = cleanup_merge32(Diffs1), Diffs3 = cleanup_semantic_lossless(Diffs2), cleanup_semantic_overlaps(Diffs3). @@ -664,13 +726,13 @@ find_breakpoint([], _Acc, _LI1, _LD1, _LI2, _LD2, _LE) -> find_breakpoint([{equal, Data} | T], Acc, _LI1, _LD1, LI2, LD2, _LE) -> find_breakpoint(T, [{equal, Data} | Acc], LI2, LD2, 0, 0, Data); find_breakpoint([{insert, Data} | T], Acc, LI1, LD1, LI2, LD2, LE) -> - NewLI2 = LI2 + text_size(Data), + NewLI2 = LI2 + text_size32(Data), case is_breakpoint(LE, LI1, LD1, NewLI2, LD2) of true -> {found, apply_breakpoint(LE, Acc, [{insert, Data} | T])}; false -> find_breakpoint(T, [{insert, Data} | Acc], LI1, LD1, NewLI2, LD2, LE) end; find_breakpoint([{delete, Data} | T], Acc, LI1, LD1, LI2, LD2, LE) -> - NewLD2 = LD2 + text_size(Data), + NewLD2 = LD2 + text_size32(Data), case is_breakpoint(LE, LI1, LD1, LI2, NewLD2) of true -> {found, apply_breakpoint(LE, Acc, [{delete, Data} | T])}; false -> find_breakpoint(T, [{delete, Data} | Acc], LI1, LD1, LI2, NewLD2, LE) @@ -678,7 +740,7 @@ find_breakpoint([{delete, Data} | T], Acc, LI1, LD1, LI2, LD2, LE) -> is_breakpoint(undefined, _, _, _, _) -> false; is_breakpoint(LE, LI1, LD1, LI2, LD2) -> - LEN = text_size(LE), + LEN = text_size32(LE), LEN =< max(LI1, LD1) 
andalso LEN =< max(LI2, LD2). apply_breakpoint(LE, Acc, T) -> @@ -743,8 +805,9 @@ find_best_slide(E1, Edit, E2, BestScore, BestE1, BestEdit, BestE2) -> {BestE1, BestEdit, BestE2} end. -can_slide_right(<>, <>) -> - {true, <>, RestEdit, RestE2}; +%% In UTF-32 each codepoint is exactly 4 bytes — no pattern matching on variable-width needed. +can_slide_right(<>, <>) -> + {true, <>, RestEdit, RestE2}; can_slide_right(_, _) -> false. @@ -776,31 +839,25 @@ cleanup_semantic_overlaps(Diffs) -> cleanup_semantic_overlaps([{delete, Del}, {insert, Ins} | T], Acc) -> Overlap1 = common_overlap(Del, Ins), Overlap2 = common_overlap(Ins, Del), + TDel = text_size32(Del), + TIns = text_size32(Ins), if Overlap1 >= Overlap2 -> - TDel = size(Del), - TIns = size(Ins), - Overlap1BytesDel = overlap_to_bytes_end(Del, Overlap1), - Overlap1BytesIns = overlap_to_bytes_start(Ins, Overlap1), if - Overlap1BytesDel >= TDel / 2 orelse Overlap1BytesIns >= TIns / 2 -> - Common = binary:part(Ins, 0, Overlap1BytesIns), - NewDel = binary:part(Del, 0, TDel - Overlap1BytesDel), - NewIns = binary:part(Ins, Overlap1BytesIns, TIns - Overlap1BytesIns), + Overlap1 * 2 >= TDel orelse Overlap1 * 2 >= TIns -> + Common = binary:part(Ins, 0, Overlap1 * 4), + NewDel = binary:part(Del, 0, (TDel - Overlap1) * 4), + NewIns = binary:part(Ins, Overlap1 * 4, (TIns - Overlap1) * 4), cleanup_semantic_overlaps([{insert, NewIns} | T], [{equal, Common}, {delete, NewDel} | Acc]); true -> cleanup_semantic_overlaps([{insert, Ins} | T], [{delete, Del} | Acc]) end; true -> - TDel = size(Del), - TIns = size(Ins), - Overlap2BytesIns = overlap_to_bytes_end(Ins, Overlap2), - Overlap2BytesDel = overlap_to_bytes_start(Del, Overlap2), if - Overlap2BytesIns >= TIns / 2 orelse Overlap2BytesDel >= TDel / 2 -> - Common = binary:part(Ins, TIns - Overlap2BytesIns, Overlap2BytesIns), - NewIns = binary:part(Ins, 0, TIns - Overlap2BytesIns), - NewDel = binary:part(Del, Overlap2BytesDel, TDel - Overlap2BytesDel), + Overlap2 * 2 >= TIns orelse 
Overlap2 * 2 >= TDel -> + Common = binary:part(Ins, (TIns - Overlap2) * 4, Overlap2 * 4), + NewIns = binary:part(Ins, 0, (TIns - Overlap2) * 4), + NewDel = binary:part(Del, Overlap2 * 4, (TDel - Overlap2) * 4), cleanup_semantic_overlaps([{delete, NewDel} | T], [{equal, Common}, {insert, NewIns} | Acc]); true -> cleanup_semantic_overlaps([{insert, Ins} | T], [{delete, Del} | Acc]) @@ -811,47 +868,26 @@ cleanup_semantic_overlaps([H | T], Acc) -> cleanup_semantic_overlaps([], Acc) -> lists:reverse(Acc). -%% Helper functions for semantic cleanup - -%% @doc Convert N codepoints from the start of Bin to a byte offset. -%% This is consistent with text_size/1 which counts codepoints (not grapheme clusters). -overlap_to_bytes_start(Bin, N) -> - codepoints_to_bytes(Bin, N, 0). - -codepoints_to_bytes(_Bin, 0, Acc) -> - Acc; -codepoints_to_bytes(<>, N, Acc) -> - codepoints_to_bytes(Rest, N - 1, Acc + byte_size(<>)); -codepoints_to_bytes(<<_C, Rest/binary>>, N, Acc) -> - %% Invalid utf-8 byte, count as 1 - codepoints_to_bytes(Rest, N - 1, Acc + 1); -codepoints_to_bytes(<<>>, _N, Acc) -> - Acc. - -%% @doc Convert N codepoints from the END of Bin to a byte count of that suffix. -overlap_to_bytes_end(Bin, N) -> - SkipChars = text_size(Bin) - N, - SkipBytes = codepoints_to_bytes(Bin, SkipChars, 0), - byte_size(Bin) - SkipBytes. +%% In UTF-32 every codepoint is exactly 4 bytes, so all byte/codepoint conversions +%% are simple multiplications and binary:part calls. +%% @doc Return the first Len codepoints of Bin as a binary. substring_start(Bin, Len) -> - binary:part(Bin, 0, overlap_to_bytes_start(Bin, Len)). + binary:part(Bin, 0, Len * 4). +%% @doc Return the last Len codepoints of Bin as a binary. 
substring_end(Bin, Len) -> - TotalLen = text_size(Bin), + TotalLen = text_size32(Bin), case TotalLen =< Len of true -> Bin; - false -> - SkipChars = TotalLen - Len, - SkipBytes = codepoints_to_bytes(Bin, SkipChars, 0), - binary:part(Bin, SkipBytes, byte_size(Bin) - SkipBytes) + false -> binary:part(Bin, (TotalLen - Len) * 4, Len * 4) end. common_overlap(<<>>, _) -> 0; common_overlap(_, <<>>) -> 0; common_overlap(Text1, Text2) -> - T1Len = text_size(Text1), - T2Len = text_size(Text2), + T1Len = text_size32(Text1), + T2Len = text_size32(Text2), {T1, T2, TMin} = if T1Len > T2Len -> {substring_end(Text1, T2Len), Text2, T2Len}; T1Len < T2Len -> {Text1, substring_start(Text2, T1Len), T1Len}; @@ -867,7 +903,8 @@ common_overlap_loop(T1, T2, TMin, Best, Length) when Length =< TMin -> case binary:match(T2, Pattern) of nomatch -> Best; {FoundByteOffset, _} -> - FoundCharCount = text_size(binary:part(T2, 0, FoundByteOffset)), + %% In UTF-32, byte offset maps directly to codepoint count. + FoundCharCount = FoundByteOffset div 4, NewLength = Length + FoundCharCount, if NewLength > TMin -> Best; @@ -883,22 +920,15 @@ common_overlap_loop(T1, T2, TMin, Best, Length) when Length =< TMin -> common_overlap_loop(_T1, _T2, _TMin, Best, _Length) -> Best. -first_char(<>) -> C; +%% In UTF-32 the first and last codepoints are always at fixed byte offsets. +first_char(<>) -> C; first_char(_) -> undefined. last_char(<<>>) -> undefined; -last_char(Bin) when is_binary(Bin) -> - last_char(Bin, byte_size(Bin) - 1). - -last_char(Bin, Pos) -> - case binary:at(Bin, Pos) band 16#C0 of - 16#80 -> - % continuation byte, keep scanning back - last_char(Bin, Pos - 1); - _ -> - <<_:Pos/binary, C/utf8, _/binary>> = Bin, - C - end. +last_char(Bin) -> + Size = byte_size(Bin), + <<_:(Size-4)/binary, C:32>> = Bin, + C. is_non_alphanumeric(undefined) -> true; is_non_alphanumeric(C) -> @@ -921,17 +951,25 @@ is_whitespace(C) -> is_linebreak(C) -> C =:= $\n orelse C =:= $\r. 
-is_blankline_end(Bin) -> - case re:run(Bin, <<"\n\r?\n$">> ) of - {match, _} -> true; - nomatch -> false - end. +%% In UTF-32 each codepoint is 4 bytes, so newline patterns are fixed-width. +is_blankline_end(Bin) when byte_size(Bin) >= 8 -> + Size = byte_size(Bin), + case Bin of + <<_:(Size-8)/binary, $\n:32, $\n:32>> -> true; + <<_:(Size-12)/binary, $\n:32, $\r:32, $\n:32>> -> true; + _ -> false + end; +is_blankline_end(_) -> false. -is_blankline_start(Bin) -> - case re:run(Bin, <<"^\r?\n\r?\n">> ) of - {match, _} -> true; - nomatch -> false - end. +is_blankline_start(Bin) when byte_size(Bin) >= 8 -> + case Bin of + <<$\n:32, $\n:32, _/binary>> -> true; + <<$\n:32, $\r:32, $\n:32, _/binary>> -> true; + <<$\r:32, $\n:32, $\n:32, _/binary>> -> true; + <<$\r:32, $\n:32, $\r:32, $\n:32, _/binary>> -> true; + _ -> false + end; +is_blankline_start(_) -> false. % @doc Do efficiency cleanup of diffs. % @@ -939,60 +977,53 @@ is_blankline_start(Bin) -> cleanup_efficiency(Diffs) -> cleanup_efficiency(Diffs, 4). +-spec cleanup_efficiency(diffs(), pos_integer()) -> diffs(). cleanup_efficiency(Diffs, EditCost) -> - cleanup_efficiency(Diffs, false, EditCost, []). + Diffs32 = [{Op, to_utf32(D)} || {Op, D} <- Diffs], + [{Op, to_utf8(D)} || {Op, D} <- cleanup_efficiency32(Diffs32, EditCost)]. + +%% Internal efficiency cleanup operating on UTF-32 diffs. +cleanup_efficiency32(Diffs) -> + cleanup_efficiency32(Diffs, 4). + +cleanup_efficiency32(Diffs, EditCost) -> + cleanup_efficiency32(Diffs, false, EditCost, []). %% Done. -cleanup_efficiency([], Changed, _EditCost, Acc) -> +cleanup_efficiency32([], Changed, _EditCost, Acc) -> Diffs = lists:reverse(Acc), case Changed of false -> Diffs; - true -> cleanup_merge(Diffs) + true -> cleanup_merge32(Diffs) end; %% Any equality which is surrounded on both sides by an insertion and deletion need less then %% EditCost characters for it to be advantageous to split. 
-cleanup_efficiency([{O1, _}=A, {equal, XY}=E, {O2, _}=B | T], Changed, EditCost, Acc) when +cleanup_efficiency32([{O1, _}=A, {equal, XY}=E, {O2, _}=B | T], Changed, EditCost, Acc) when O1 =/= O2 andalso ?IS_INS_OR_DEL(O1) andalso ?IS_INS_OR_DEL(O2) -> case text_smaller_than(XY, EditCost) of true -> - %% Split Del = {delete, XY}, Ins = {insert, XY}, - - cleanup_efficiency([Ins, B | T], true, EditCost, [Del, A | Acc]); + cleanup_efficiency32([Ins, B | T], true, EditCost, [Del, A | Acc]); false -> - %% Equal is big enough, move A and equal out of the way. - cleanup_efficiency([B | T], Changed, EditCost, [E, A |Acc]) + cleanup_efficiency32([B | T], Changed, EditCost, [E, A | Acc]) end; %% Any equality which is surrounded on one side by an existing insertion and deletion and on the -%% other side by an exisiting insertion or deletion needs by less than half C characters long for it -%% to be advantagous to split. -cleanup_efficiency([{O1, _}=A, {O2, _}=B, {equal, X}=E, {O3, _}=C | T], Changed, EditCost, Acc) when +%% other side by an existing insertion or deletion needs less than half C characters long for it +%% to be advantageous to split. +cleanup_efficiency32([{O1, _}=A, {O2, _}=B, {equal, X}=E, {O3, _}=C | T], Changed, EditCost, Acc) when O1 =/= O2 andalso ?IS_INS_OR_DEL(O1) andalso ?IS_INS_OR_DEL(O2) andalso ?IS_INS_OR_DEL(O3) -> case text_smaller_than(X, EditCost div 2 + 1) of true -> - %% Split Del = {delete, X}, Ins = {insert, X}, - cleanup_efficiency([Ins, C | T], true, EditCost, [Del, B, A | Acc]); + cleanup_efficiency32([Ins, C | T], true, EditCost, [Del, B, A | Acc]); false -> - %% Equal is big enough, move delete and equal out of the way. - cleanup_efficiency([B, E, C | T], Changed, EditCost, [A |Acc]) + cleanup_efficiency32([B, E, C | T], Changed, EditCost, [A | Acc]) end; -cleanup_efficiency([H|T], Changed, EditCost, Acc) -> - cleanup_efficiency(T, Changed, EditCost, [H|Acc]). 
- +cleanup_efficiency32([H | T], Changed, EditCost, Acc) -> + cleanup_efficiency32(T, Changed, EditCost, [H | Acc]). -% @doc Return true iff the text is smaller than specified -text_smaller_than(_, 0) -> - false; -text_smaller_than(<<>>, _Size) -> - true; -text_smaller_than(<<_C/utf8, Rest/binary>>, Size) when Size > 0 -> - text_smaller_than(Rest, Size-1); -text_smaller_than(<<_C, Rest/binary>>, Size) when Size > 0 -> - %% Illegal utf-8 string, just count this as a single character and continue - text_smaller_than(Rest, Size-1). % @doc create a patch from a list of diffs make_patch(Diffs) when is_list(Diffs) -> @@ -1012,10 +1043,8 @@ make_patch(Diffs, SourceText) when is_list(Diffs) andalso is_binary(SourceText) make_patch([], _PrePatchText, _PostPatchText, _Count1, _Count2, [Patch|Rest]=Patches) -> case Patch#patch.diffs of - [] -> - lists:reverse(Rest); - _ -> - lists:reverse(Patches) + [] -> lists:reverse(Rest); + _ -> lists:reverse(Patches) end; make_patch([{insert, Data}=D|T], PrePatchText, PostPatchText, Count1, Count2, [Patch|Rest]) -> @@ -1090,11 +1119,6 @@ unique_match(Pattern, Text) -> %% Helpers %% -% @doc Return true iff binary is a single character. -single_char(<<>>) -> false; -single_char(<<_C/utf8>>) -> true; -single_char(Bin) when is_binary(Bin) -> false. - % @doc Return true iff A is a prefix of B is_prefix(A, B) when size(A) > size(B) -> false; @@ -1155,120 +1179,45 @@ split_pre_and_suffix(Text1, Text2) -> {Prefix, MiddleText1, MiddleText2, Suffix}. -% @doc Return the common prefix of Text1 and Text2. (utf8 aware) +% @doc Return the common prefix of Text1 and Text2. Works on UTF-32 — always codepoint-aligned. common_prefix(Text1, Text2) -> Length = binary:longest_common_prefix([Text1, Text2]), - Prefix = binary:part(Text1, 0, Length), - - %% Utf-8 repair the tail of the prefix. It could contain a half utf-8 char. - {Prefix1, _} = repair_tail(Prefix), - Prefix1. + %% Round down to 4-byte boundary (should already be aligned for valid UTF-32). 
+ binary:part(Text1, 0, (Length div 4) * 4). -% @doc Return the common prefix of Text1 and Text2 (utf8 aware) +% @doc Return the common suffix of Text1 and Text2. Works on UTF-32 — always codepoint-aligned. common_suffix(Text1, Text2) -> Length = binary:longest_common_suffix([Text1, Text2]), - Suffix = binary:part(Text1, size(Text1), -Length), - - %% Utf-8 repair the head of the suffix. Could contain a half utf8 char - {_, Suffix1} = repair_head(Suffix), - Suffix1. + binary:part(Text1, byte_size(Text1), -((Length div 4) * 4)). -% @doc Count the number of characters in a utf8 binary. +% @doc Count the number of codepoints in a UTF-8 binary. +% @deprecated Use text_size32/1 internally. This public function may be removed in a future version. +-spec text_size(unicode:unicode_binary()) -> non_neg_integer(). text_size(Text) when is_binary(Text) -> - text_size(Text, 0). + string:length(Text). -text_size(<<>>, Count) -> - Count; -text_size(<<_C/utf8, Rest/binary>>, Count) -> - text_size(Rest, Count+1); -text_size(_, _) -> - error(badarg). +% @doc Count the number of codepoints in a UTF-32 binary. O(1). +text_size32(Text) when is_binary(Text) -> + byte_size(Text) div 4. + +% @doc Return true iff Text has fewer than Size codepoints. O(1) for UTF-32. +text_smaller_than(_, 0) -> + false; +text_smaller_than(Text, Size) -> + byte_size(Text) < Size * 4. %% -%% UTF-32 utilities +%% UTF-32 boundary helpers %% -% @doc Convert the first N codepoints of a UTF-32BE binary to a UTF-8 binary. -utf32_prefix_to_utf8(Utf32, CodepointCount) -> - Prefix32 = binary:part(Utf32, 0, CodepointCount * 4), - unicode:characters_to_binary(Prefix32, utf32, utf8). - -%% @doc Checks the trailing bytes for utf8 prefix bytes. 
-repair_tail(<<>>) -> - {<<>>, <<>>}; -%% Checks -repair_tail(Bin) -> - Size = size(Bin), - Size1 = Size-1, Size2 = Size-2, Size3 = Size-3, Size4 = Size-4, - case Bin of - %% Valid 1 -byte - <<_:Size1/binary, 2#0:1, _A:7>> -> - {Bin, <<>>}; - - %% Invalid 1-byte - <> -> - {Pre, <<2#110:3, A:5>>}; - <> -> - {Pre, <<2#1110:4, A:4>>}; - <> -> - {Pre, <<2#11110:5, A:3>>}; - - %% Valid 2-byte ending - <<_:Size2/binary, 2#110:3, _A:5, 2#10:2, _B:6>> -> - {Bin, <<>>}; - - %% Invalid 2-byte ending - <> -> - {Pre, <<2#1110:4, A:4, 2#10:2, B:6>>}; - <> -> - {Pre, <<2#11110:5, A:3, 2#10:2, B:6>>}; - - %% Valid 3-byte ending - <<_:Size3/binary, 2#1110:4, _A:4, 2#10:2, _B:6, 2#10:2, _C:6>> -> - {Bin, <<>>}; - - %% Invalid 3-byte ending - <> -> - {Pre, <<2#11110:5, A:3, 2#10:2, B:6, 2#10:2, C:6>>}; - - %% Valid 4-byte ending - <<_:Size4/binary, 2#11110:5, _A:3, 2#10:2, _B:6, 2#10:2, _C:6, 2#10:2, _D:6>> -> - {Bin, <<>>}; - - %% Illegal utf-8 sequence. - _ -> - %% Can't repair it, just return - {Bin, <<>>} - end. +% @doc Convert a UTF-8 binary to UTF-32, crashing on invalid input. +to_utf32(Bin) -> + <<_/binary>> = unicode:characters_to_binary(Bin, utf8, utf32). -% @doc Checks the beginning of a binary and strips of partial utf-8 encoded bytes. 
-repair_head(<<>>) -> - {<<>>, <<>>}; -% valid 1-byte beginning -repair_head(<<2#0:1, _A:7, _Rest/binary>>=Bin) -> - {<<>>, Bin}; -% valid 4-byte beginning -repair_head(<<2#11110:5, _A:3, 2#10:2, _B:6, 2#10:2, _C:6, 2#10:2, _D:6, _Rest/binary>>=Bin) -> - {<<>>, Bin}; -% valid 3-byte beginning -repair_head(<<2#1110:4, _A:4, 2#10:2, _B:6, 2#10:2, _C:6, _Rest/binary>>=Bin) -> - {<<>>, Bin}; -% invalid 3-byte beginning -repair_head(<<2#10:2, A:6, 2#10:2, B:6, 2#10:2, C:6, Rest/binary>>) -> - {<<2#10:2, A:6, 2#10:2, B:6, 2#10:2, C:6>>, Rest}; -% valid 2-byte beginning -repair_head(<<2#110:3, _A:5, 2#10:2, _B:6, _Rest/binary>>=Bin) -> - {<<>>, Bin}; -% invalid 2-byte beginnings -repair_head(<<2#10:2, A:6, 2#10:2, B:6, Rest/binary>>) -> - {<<2#10:2, A:6, 2#10:2, B:6>>, Rest}; -% invalid 1-byte beginning -repair_head(<<2#10:2, A:6, Rest/binary>>) -> - {<<2#10:2, A:6>>, Rest}; -repair_head(Bin) -> - %% Illegal sequence, can't repair it. - {<<>>, Bin}. +% @doc Convert a UTF-32 binary to UTF-8, crashing on invalid input. +to_utf8(Bin) -> + <<_/binary>> = unicode:characters_to_binary(Bin, utf32, utf8). %% %% Tests @@ -1278,44 +1227,11 @@ repair_head(Bin) -> -include_lib("eunit/include/eunit.hrl"). -repair_tail_test() -> - ?assertEqual({<<>>, <<>>}, repair_tail(<<>>)), - ?assertEqual({<<"aap">>, <<>>}, repair_tail(<<"aap">>)), - ?assertEqual({<<200/utf8>>, <<>>}, repair_tail(<<200/utf8>>)), - ?assertEqual({<<600/utf8>>, <<>>}, repair_tail(<<600/utf8>>)), - ?assertEqual({<<1000/utf8>>, <<>>}, repair_tail(<<1000/utf8>>)), - - ?assertEqual({<<"aap">>, <<200>>}, repair_tail(<<"aap", 200>>)), - - ?assertEqual({<<"test">>, <<240, 159, 159>>}, repair_tail(<<116,101,115,116,240,159,159>>)), - - ok. 
- -repair_head_test() -> - ?assertEqual({<<>>, <<>>}, repair_head(<<>>)), - ?assertEqual({<<>>, <<"a">>}, repair_head(<<"a">>)), - ?assertEqual({<<>>, <<"aap">>}, repair_head(<<"aap">>)), - ?assertEqual({<<>>, <<200/utf8>>}, repair_head(<<200/utf8>>)), - ?assertEqual({<<>>, <<600/utf8>>}, repair_head(<<600/utf8>>)), - ?assertEqual({<<>>, <<1000/utf8>>}, repair_head(<<1000/utf8>>)), - - %% - ?assertEqual({<<2#10:2, 10:6>>, <<"aap">>}, - repair_head(<<2#10:2, 10:6, "aap">>)), - ?assertEqual({<<2#10:2, 60:6, 2#10:2, 10:6>>, <<"aap">>}, - repair_head(<<2#10:2, 60:6, 2#10:2, 10:6, "aap">>)), - ?assertEqual({<<2#10:2, 60:6, 2#10:2, 10:6, 2#10:2, 13:6>>, <<"aap">>}, - repair_head(<<2#10:2, 60:6, 2#10:2, 10:6, 2#10:2, 13:6, "aap">>)), - - ok. - - for_test() -> ?assertEqual(9, for(0, 10, fun(I, _N) -> {continue, I} end, undefined)), ?assertEqual(0, for(0, 10, fun(I, _N) -> {break, I} end, undefined)), ok. - diff_utf8_test() -> ?assertEqual([{equal, <<208,174, 208,189, 208,184, 208,186, 208,190, 208,180>>}], diff(<<208,174,208,189,208,184,208,186,208,190,208,180>>, @@ -1342,10 +1258,6 @@ diff_bisect_test() -> {equal,<<" a banana">>}], diff_bisect(<<"fruit flies like a banana">>, <<"fruit flies eat a banana">>)), - - %?assertEqual([{delete,<<"cat">>}, - % {insert,<<"map">>}], diff_bisect(<<"cat">>, <<"map">>)), - ?assertEqual([{delete,<<"c">>}, {insert,<<"m">>}, {equal,<<"a">>}, @@ -1361,84 +1273,87 @@ diff_bisect_test() -> ?assertEqual([{equal, <<"text">>}], diff_bisect(<<"text">>, <<"text">>)), - ok. +%% half_match operates on UTF-32 internally; wrap inputs/outputs for testing. +half_match_utf8(A, B) -> + case half_match(to_utf32(A), to_utf32(B)) of + undefined -> undefined; + {half_match, A1, A2, B1, B2, C} -> + {half_match, to_utf8(A1), to_utf8(A2), to_utf8(B1), to_utf8(B2), to_utf8(C)} + end. 
+ half_match_test() -> - ?assertEqual(undefined, half_match(<<"1234567890">>, <<"abcdef">>)), - ?assertEqual(undefined, half_match(<<"12345">>, <<"23">>)), + ?assertEqual(undefined, half_match_utf8(<<"1234567890">>, <<"abcdef">>)), + ?assertEqual(undefined, half_match_utf8(<<"12345">>, <<"23">>)), %% Single Match ?assertEqual({half_match, <<"12">>, <<"90">>, <<"a">>, <<"z">>, <<"345678">>}, - half_match(<<"1234567890">>, <<"a345678z">>)), + half_match_utf8(<<"1234567890">>, <<"a345678z">>)), ?assertEqual({half_match, <<"a">>, <<"z">>, <<"12">>, <<"90">>, <<"345678">>}, - half_match(<<"a345678z">>, <<"1234567890">>)), + half_match_utf8(<<"a345678z">>, <<"1234567890">>)), ?assertEqual({half_match, <<"abc">>, <<"z">>, <<"1234">>, <<"0">>, <<"56789">>}, - half_match(<<"abc56789z">>, <<"1234567890">>)), + half_match_utf8(<<"abc56789z">>, <<"1234567890">>)), ?assertEqual({half_match, <<"a">>, <<"xyz">>, <<"1">>, <<"7890">>, <<"23456">>}, - half_match(<<"a23456xyz">>, <<"1234567890">>)), + half_match_utf8(<<"a23456xyz">>, <<"1234567890">>)), %% Multiple Matches ?assertEqual({half_match, <<"12123">>, <<"123121">>, <<"a">>, <<"z">>, <<"1234123451234">>}, - half_match(<<"121231234123451234123121">>, <<"a1234123451234z">>)), + half_match_utf8(<<"121231234123451234123121">>, <<"a1234123451234z">>)), ?assertEqual({half_match, <<"">>, <<"-=-=-=-=-=">>, <<"x">>, <<"">>, <<"x-=-=-=-=-=-=-=">>}, - half_match(<<"x-=-=-=-=-=-=-=-=-=-=-=-=">>, <<"xx-=-=-=-=-=-=-=">>)), + half_match_utf8(<<"x-=-=-=-=-=-=-=-=-=-=-=-=">>, <<"xx-=-=-=-=-=-=-=">>)), ?assertEqual({half_match, <<"-=-=-=-=-=">>, <<"">>, <<"">>, <<"y">>, <<"-=-=-=-=-=-=-=y">>}, - half_match(<<"-=-=-=-=-=-=-=-=-=-=-=-=y">>, <<"-=-=-=-=-=-=-=yy">>)), + half_match_utf8(<<"-=-=-=-=-=-=-=-=-=-=-=-=y">>, <<"-=-=-=-=-=-=-=yy">>)), - % Non-optimal halfmatch. 
- % Optimal diff would be -q+x=H-i+e=lloHe+Hu=llo-Hew+y not -qHillo+x=HelloHe-w+Hulloy ?assertEqual({half_match, <<"qHillo">>, <<"w">>, <<"x">>, <<"Hulloy">>, <<"HelloHe">>}, - half_match(<<"qHilloHelloHew">>, <<"xHelloHeHulloy">>)), + half_match_utf8(<<"qHilloHelloHew">>, <<"xHelloHeHulloy">>)), ok. - +%% common_prefix/suffix operate on UTF-32; wrap for testing. common_prefix_test() -> - ?assertEqual(<<>>, common_prefix(<<"Text">>, <<"Next">>)), - ?assertEqual(<<"T">>, common_prefix(<<"Text">>, <<"Tax">>)), - ?assertEqual(<<"text">>, common_prefix(<<"text">>, <<"text">>)), - - ?assertEqual(<<"test🟡"/utf8>>, common_prefix(<<"test🟡123"/utf8>>, <<"test🟡456"/utf8>>)), + Prefix = fun(A, B) -> to_utf8(common_prefix(to_utf32(A), to_utf32(B))) end, - ?assertEqual(<<"test">>, common_prefix(<<"test🟢123"/utf8>>, <<"test🟡123"/utf8>>)), - ?assertEqual(<<"test">>, common_prefix(<<"test🟡123"/utf8>>, <<"test🟢123"/utf8>>)), - - ?assertEqual(<<"test">>, common_prefix(<<"test🟡123"/utf8>>, <<"test🔵123"/utf8>>)), - ?assertEqual(<<"test">>, common_prefix(<<"test🔵123"/utf8>>, <<"test🟡123"/utf8>>)), - - ?assertEqual(<<"test">>, common_prefix(<<"test🟡123"/utf8>>, <<"test⚫️123"/utf8>>)), - ?assertEqual(<<"test">>, common_prefix(<<"test⚫️123"/utf8>>, <<"test🟡123"/utf8>>)), + ?assertEqual(<<>>, Prefix(<<"Text">>, <<"Next">>)), + ?assertEqual(<<"T">>, Prefix(<<"Text">>, <<"Tax">>)), + ?assertEqual(<<"text">>, Prefix(<<"text">>, <<"text">>)), + ?assertEqual(<<"test🟡"/utf8>>, Prefix(<<"test🟡123"/utf8>>, <<"test🟡456"/utf8>>)), + ?assertEqual(<<"test">>, Prefix(<<"test🟢123"/utf8>>, <<"test🟡123"/utf8>>)), + ?assertEqual(<<"test">>, Prefix(<<"test🟡123"/utf8>>, <<"test🟢123"/utf8>>)), + ?assertEqual(<<"test">>, Prefix(<<"test🟡123"/utf8>>, <<"test🔵123"/utf8>>)), + ?assertEqual(<<"test">>, Prefix(<<"test🔵123"/utf8>>, <<"test🟡123"/utf8>>)), + ?assertEqual(<<"test">>, Prefix(<<"test🟡123"/utf8>>, <<"test⚫️123"/utf8>>)), + ?assertEqual(<<"test">>, Prefix(<<"test⚫️123"/utf8>>, <<"test🟡123"/utf8>>)), ok. 
- common_suffix_test() -> - ?assertEqual(<<"ext">>, common_suffix(<<"Text">>, <<"Next">>)), - ?assertEqual(<<>>, common_suffix(<<"Text">>, <<"Tax">>)), - ?assertEqual(<<"text">>, common_suffix(<<"text">>, <<"text">>)), + Suffix = fun(A, B) -> to_utf8(common_suffix(to_utf32(A), to_utf32(B))) end, + + ?assertEqual(<<"ext">>, Suffix(<<"Text">>, <<"Next">>)), + ?assertEqual(<<>>, Suffix(<<"Text">>, <<"Tax">>)), + ?assertEqual(<<"text">>, Suffix(<<"text">>, <<"text">>)), ok. +%% split_pre_and_suffix operates on UTF-32; wrap for testing. split_pre_and_suffix_test() -> - ?assertEqual({<<>>, <<>>, <<>>, <<>>}, split_pre_and_suffix(<<>>, <<>>)), - - ?assertEqual({<<>>, <<"a">>, <<"b">>, <<>>}, split_pre_and_suffix(<<"a">>, <<"b">>)), - - ?assertEqual({<<"a">>, <<"b">>, <<"c">>, <<"d">>}, - split_pre_and_suffix(<<"abd">>, <<"acd">>)), - ?assertEqual({<<"aa">>, <<"bb">>, <<"cc">>, <<"dd">>}, - split_pre_and_suffix(<<"aabbdd">>, <<"aaccdd">>)), - ?assertEqual({<<"aa">>, <<"bb">>, <<"c">>, <<"dd">>}, - split_pre_and_suffix(<<"aabbdd">>, <<"aacdd">>)), + Split = fun(A, B) -> + {P, M1, M2, S} = split_pre_and_suffix(to_utf32(A), to_utf32(B)), + {to_utf8(P), to_utf8(M1), to_utf8(M2), to_utf8(S)} + end, + ?assertEqual({<<>>, <<>>, <<>>, <<>>}, Split(<<>>, <<>>)), + ?assertEqual({<<>>, <<"a">>, <<"b">>, <<>>}, Split(<<"a">>, <<"b">>)), + ?assertEqual({<<"a">>, <<"b">>, <<"c">>, <<"d">>}, Split(<<"abd">>, <<"acd">>)), + ?assertEqual({<<"aa">>, <<"bb">>, <<"cc">>, <<"dd">>}, Split(<<"aabbdd">>, <<"aaccdd">>)), + ?assertEqual({<<"aa">>, <<"bb">>, <<"c">>, <<"dd">>}, Split(<<"aabbdd">>, <<"aacdd">>)), ?assertEqual({<<"cat ">>, <<>>, <<"mouse dog ">>, <<>>}, - split_pre_and_suffix(<<"cat ">>, <<"cat mouse dog ">>)), - - ok. + Split(<<"cat ">>, <<"cat mouse dog ">>)), + ok. unique_match_test() -> ?assertEqual(true, unique_match(<<"a">>, <<"abc">>)), @@ -1447,54 +1362,74 @@ unique_match_test() -> ?assertEqual(false, unique_match(<<"ab">>, <<"abab">>)), ok. 
- text_smaller_than_test() -> - ?assertEqual(true, text_smaller_than(<<>>, 5)), - ?assertEqual(true, text_smaller_than(<<>>, 1)), - - ?assertEqual(false, text_smaller_than(<<>>, 0)), - - ?assertEqual(false, text_smaller_than(<<"abc">>, 0)), - ?assertEqual(false, text_smaller_than(<<"abc">>, 1)), - ?assertEqual(true, text_smaller_than(<<"abc">>, 4)), - - %% Test if we count characters. - Utf8Binary = <<1046/utf8, 1011/utf8, 1022/utf8, 127/utf8>>, - ?assertEqual(true, size(Utf8Binary) > 5), % binary is larger due to utf8 encoding - ?assertEqual(true, text_smaller_than(Utf8Binary, 5)), - ?assertEqual(false, text_smaller_than(Utf8Binary, 4)), - - %% Test illegal utf8 sequence, the chars are counted as normal chars - ?assertEqual(false, text_smaller_than(<<149,157,112,8>>, 4)), + %% text_smaller_than now works on UTF-32 binaries. + ?assertEqual(true, text_smaller_than(to_utf32(<<>>), 5)), + ?assertEqual(true, text_smaller_than(to_utf32(<<>>), 1)), + ?assertEqual(false, text_smaller_than(to_utf32(<<>>), 0)), + ?assertEqual(false, text_smaller_than(to_utf32(<<"abc">>), 0)), + ?assertEqual(false, text_smaller_than(to_utf32(<<"abc">>), 1)), + ?assertEqual(true, text_smaller_than(to_utf32(<<"abc">>), 4)), + + %% Multi-byte UTF-8 characters each become exactly 4 bytes in UTF-32. + Utf32 = to_utf32(<<1046/utf8, 1011/utf8, 1022/utf8, 127/utf8>>), + ?assertEqual(true, text_smaller_than(Utf32, 5)), + ?assertEqual(false, text_smaller_than(Utf32, 4)), ok. lines_to_chars_test() -> - ?assertEqual({<<>>, <<>>, []}, lines_to_chars(<<>>, <<>>)), - - %% Simple text - ?assertEqual({<<0, 1>>, <<0, 2>>, [<<"hello\n">>, <<"world\n">>, <<"maas\n">>]}, - lines_to_chars(<<"hello\n\world\n">>, <<"hello\nmaas\n">>)), - - %% No newline at the end. - ?assertEqual({<<0, 1>>, <<0, 2>>, [<<"hello\n">>, <<"world\n">>, <<"maas">>]}, - lines_to_chars(<<"hello\n\world\n">>, <<"hello\nmaas">>)), - - %% No newline at the end. 
- ?assertEqual({<<0, 1>>, <<0, 2>>, [<<"hello\n">>, <<"world\n">>, <<"maas">>]}, - lines_to_chars(<<"hello\n\world\n">>, <<"hello\nmaas">>)), - - %% With empty lines - ?assertEqual({<<0, 1, 2>>, <<0, 1, 3>>, [<<"hello\n">>, <<"\n">>, <<"world\n">>, <<"maas">>]}, - lines_to_chars(<<"hello\n\nworld\n">>, <<"hello\n\nmaas">>)), + %% lines_to_chars takes UTF-32 input, returns UTF-32 index sequences and UTF-32 lines. + {C1, C2, Lines} = lines_to_chars(to_utf32(<<>>), to_utf32(<<>>)), + ?assertEqual(<<>>, C1), + ?assertEqual(<<>>, C2), + ?assertEqual([], Lines), + + {C3, C4, Lines2} = lines_to_chars(to_utf32(<<"hello\nworld\n">>), to_utf32(<<"hello\nmaas\n">>)), + %% Lines are stored as UTF-32 binaries. + ?assertEqual([to_utf32(<<"hello\n">>), to_utf32(<<"world\n">>), to_utf32(<<"maas\n">>)], Lines2), + ?assertEqual(<<0:32, 1:32>>, C3), + ?assertEqual(<<0:32, 2:32>>, C4), ok. - diff_linemode_test() -> ?assertEqual([{equal, <<"hello\n">>}, {delete, <<"world\n">>}, {insert, <<"maas\n">>}], diff_linemode(<<"hello\nworld\n">>, <<"hello\nmaas\n">>)), ok. +diff_options_test() -> + A = <<"cat">>, + B = <<"map">>, + + %% No options — same as diff/2. + ?assertEqual(diff(A, B), diff(A, B, [])), + + %% no_linemode: result is structurally equivalent (same source/dest text). + NoLinemode = diff(A, B, [no_linemode]), + ?assertEqual(source_text(diff(A, B)), source_text(NoLinemode)), + ?assertEqual(destination_text(diff(A, B)), destination_text(NoLinemode)), + + %% semantic option applies cleanup_semantic to the raw diff. + ?assertEqual(cleanup_semantic(diff(A, B)), diff(A, B, [semantic])), + + %% efficiency option applies cleanup_efficiency to the raw diff. + ?assertEqual(cleanup_efficiency(diff(A, B)), diff(A, B, [efficiency])), + + %% {efficiency, Cost} applies cleanup_efficiency/2 with the given cost. + ?assertEqual(cleanup_efficiency(diff(A, B), 2), diff(A, B, [{efficiency, 2}])), + + %% Both: semantic first, then efficiency. 
+ ?assertEqual( + cleanup_efficiency(cleanup_semantic(diff(A, B))), + diff(A, B, [semantic, efficiency])), + + %% Order of options in list does not affect cleanup order. + ?assertEqual( + diff(A, B, [semantic, efficiency]), + diff(A, B, [efficiency, semantic])), + + ok. + -endif. diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl index 694db66..06f9e86 100644 --- a/test/diffy_tests.erl +++ b/test/diffy_tests.erl @@ -77,7 +77,7 @@ html_like() -> {2, utf8(4)}, % Some small portions of unicode chars. {2, range($0, $9)}, % numbers {2, $\s}, % whitespace - {4, $\n}, % linebreaks + {4, $\n}, % linebreaks {2, oneof([$., $-, $!, $?, $,])} % punctuation ]))). @@ -314,7 +314,7 @@ text_size_test() -> ?assertEqual(4, diffy:text_size(<<1046/utf8, 1011/utf8, 1022/utf8, 127/utf8>>)), %% Bad utf-8 input results in a badarg. - ?assertError(badarg, diffy:text_size(<<149,157,112,8>>)), + ?assertError({badarg, _}, diffy:text_size(<<149,157,112,8>>)), ok. @@ -355,6 +355,101 @@ diff_test() -> <<"cat mouse dog ">>)), ok. + +diff_linemode_corners_test() -> + %% Empty inputs. + ?assertEqual([], diffy:diff_linemode(<<>>, <<>>)), + ?assertEqual([{insert, <<"hello\n">>}], diffy:diff_linemode(<<>>, <<"hello\n">>)), + ?assertEqual([{delete, <<"hello\n">>}], diffy:diff_linemode(<<"hello\n">>, <<>>)), + + %% Identical input — single equal op. + ?assertEqual([{equal, <<"hello\nworld\n">>}], + diffy:diff_linemode(<<"hello\nworld\n">>, <<"hello\nworld\n">>)), + + %% No newline at end of file — last line treated as its own token. + ?assertEqual( + [{equal, <<"hello\n">>}, {delete, <<"world">>}, {insert, <<"maas">>}], + diffy:diff_linemode(<<"hello\nworld">>, <<"hello\nmaas">>)), + + %% Blank lines — exercise is_blankline_start/end and the \n\n pattern. + %% The rediff within cleanup_line_diff splits b\n vs c\n at character level. 
+ ?assertEqual( + [{equal, <<"a\n\n">>}, {delete, <<"b">>}, {insert, <<"c">>}, {equal, <<"\nd\n">>}], + diffy:diff_linemode(<<"a\n\nb\nd\n">>, <<"a\n\nc\nd\n">>)), + + %% \r\n line endings — exercises the \r\n\r\n blankline pattern. + ?assertEqual( + [{equal, <<"hello\r\n">>}, {delete, <<"world\r\n">>}, {insert, <<"maas\r\n">>}], + diffy:diff_linemode(<<"hello\r\nworld\r\n">>, <<"hello\r\nmaas\r\n">>)), + + %% Repeated lines — the same line appearing multiple times should reuse the same index. + ?assertEqual( + [{equal, <<"a\nb\na\n">>}, {insert, <<"b\n">>}], + diffy:diff_linemode(<<"a\nb\na\n">>, <<"a\nb\na\nb\n">>)), + + %% Large enough to trigger linemode via compute_diff1 size threshold. + %% Build two texts that differ only in one line buried in > 100 chars of context. + Prefix = binary:copy(<<"padding line\n">>, 10), + Suffix = binary:copy(<<"trailing line\n">>, 10), + Text1 = <>, + Text2 = <>, + Diffs = diffy:diff(Text1, Text2), + %% Source and destination text must be preserved exactly. + ?assertEqual(Text1, diffy:source_text(Diffs)), + ?assertEqual(Text2, diffy:destination_text(Diffs)), + %% Must contain at least one delete and one insert — the changed line. + ?assert(lists:any(fun({delete, _}) -> true; (_) -> false end, Diffs)), + ?assert(lists:any(fun({insert, _}) -> true; (_) -> false end, Diffs)), + + %% Multi-byte UTF-8 lines — verify encoding survives the linemode round-trip. + ?assertEqual( + [{equal, <<"héllo\n"/utf8>>}, {delete, <<"wörld\n"/utf8>>}, {insert, <<"wörlt\n"/utf8>>}], + diffy:diff_linemode(<<"héllo\nwörld\n"/utf8>>, <<"héllo\nwörlt\n"/utf8>>)), + + %% cleanup_line_diff rediff path — two changed lines adjacent to an equal trigger + %% the rediff of accumulated delete+insert data. + T1 = <<"aaa\nbbb\nccc\n">>, + T2 = <<"aab\nbbc\nccc\n">>, + RediffDiffs = diffy:diff_linemode(T1, T2), + ?assertEqual(T1, diffy:source_text(RediffDiffs)), + ?assertEqual(T2, diffy:destination_text(RediffDiffs)), + + ok. 
+ +diff_options_test() -> + A = <<"one two x four five">>, + B = <<"one TWO x FOUR five">>, + + %% No options — same as diff/2. + ?assertEqual(diffy:diff(A, B), diffy:diff(A, B, [])), + + %% no_linemode: result is structurally equivalent (same source/dest text). + NoLinemode = diffy:diff(A, B, [no_linemode]), + ?assertEqual(diffy:source_text(diffy:diff(A, B)), diffy:source_text(NoLinemode)), + ?assertEqual(diffy:destination_text(diffy:diff(A, B)), diffy:destination_text(NoLinemode)), + + %% semantic option applies cleanup_semantic to the raw diff. + ?assertEqual(diffy:cleanup_semantic(diffy:diff(A, B)), diffy:diff(A, B, [semantic])), + + %% efficiency option applies cleanup_efficiency to the raw diff. + ?assertEqual(diffy:cleanup_efficiency(diffy:diff(A, B)), diffy:diff(A, B, [efficiency])), + + %% {efficiency, Cost} applies cleanup_efficiency/2 with the given cost. + ?assertEqual(diffy:cleanup_efficiency(diffy:diff(A, B), 2), diffy:diff(A, B, [{efficiency, 2}])), + + %% Both: semantic first, then efficiency. + ?assertEqual( + diffy:cleanup_efficiency(diffy:cleanup_semantic(diffy:diff(A, B))), + diffy:diff(A, B, [semantic, efficiency])), + + %% Order of options in list does not affect cleanup order. + ?assertEqual( + diffy:diff(A, B, [semantic, efficiency]), + diffy:diff(A, B, [efficiency, semantic])), + + ok. + + %% %% Helpers From 99e22aa885981e5a24e4abe382ac0707565698e3 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Fri, 10 Apr 2026 09:03:38 +0200 Subject: [PATCH 20/47] Reuse long and short size --- src/diffy.erl | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index 57d04ce..312feab 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -196,20 +196,23 @@ try_half_match(OldText, NewText, CheckLines) -> %% Check if we can do a half-match diff, returns undefined if it is not advantageous. %% Operates on UTF-32 binaries — size comparisons are in bytes (4 bytes per codepoint). 
half_match(A, B) -> - AGtB = size(A) > size(B), - {Short, Long} = case AGtB of - true -> {B, A}; - false -> {A, B} - end, + AgtB = size(A) > size(B), + {Short, Long} = case AgtB of + true -> {B, A}; + false -> {A, B} + end, + + LongSize = size(Long), + ShortSize = size(Short), %% text_smaller_than(Long, 4) becomes size(Long) < 4*4 in UTF-32. - case size(Long) < 16 orelse size(Short) * 2 < size(Long) of + case LongSize < 16 orelse ShortSize * 2 < LongSize of true -> %% No point in looking. undefined; false -> - Hm1 = half_match_i(Long, Short, (size(Long) + 3) div 4), - Hm2 = half_match_i(Long, Short, (size(Long) + 1) div 2), + Hm1 = half_match_i(Long, Short, (LongSize + 3) div 4), + Hm2 = half_match_i(Long, Short, (LongSize + 1) div 2), %% Select the longest half-match. Hm = case {Hm1, Hm2} of @@ -229,7 +232,7 @@ half_match(A, B) -> case Hm of undefined -> undefined; {half_match, T1A, T1B, T2A, T2B, MidCommon} -> - case AGtB of + case AgtB of true -> Hm; false -> {half_match, T2A, T2B, T1A, T1B, MidCommon} From 4d1df305164c422bfcaaf29781d903ae4d2362e3 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Fri, 10 Apr 2026 09:45:15 +0200 Subject: [PATCH 21/47] Using phash2 for keys in the index map can lead to collisions which can be better handled by the low level map implementation --- src/diffy.erl | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index 312feab..70d1afc 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -396,14 +396,11 @@ lines_to_chars(Text, Idx, CharText, NextChar, Lines, Map) -> insert_line(Line, Lines, Map, NextChar) -> - Hash = erlang:phash2(Line, ?PHASH2_RANGE), case Map of - %% Hash hit — verify the stored line matches to guard against collisions. - #{Hash := {Char, Line}} -> + #{Line := Char} -> {Char, NextChar, Lines, Map}; - %% Hash miss or collision with a different line — assign a new index. 
_ -> - {NextChar, NextChar + 1, [Line | Lines], Map#{Hash => {NextChar, Line}}} + {NextChar, NextChar + 1, [Line | Lines], Map#{Line => NextChar}} end. decode_lines(Diffs, Lines) when is_list(Lines) -> From a311a71a1039d379e29729000326ce097f5d1c00 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Fri, 10 Apr 2026 14:49:02 +0200 Subject: [PATCH 22/47] Fix utf32 size boundary problem: --- src/diffy.erl | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index 70d1afc..b952cf2 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -211,8 +211,11 @@ half_match(A, B) -> %% No point in looking. undefined; false -> - Hm1 = half_match_i(Long, Short, (LongSize + 3) div 4), - Hm2 = half_match_i(Long, Short, (LongSize + 1) div 2), + %% Seed positions are quarter-way and half-way through Long, + %% expressed as byte offsets (codepoints * 4). + LongLen = LongSize div 4, %% codepoint count + Hm1 = half_match_i(Long, Short, ((LongLen + 3) div 4) * 4), + Hm2 = half_match_i(Long, Short, ((LongLen + 1) div 2) * 4), %% Select the longest half-match. Hm = case {Hm1, Hm2} of @@ -298,16 +301,12 @@ next_char(_Bin, Pos) -> Pos + 4. %% -%% In UTF-32 every codepoint is exactly 4 bytes, so any 4-byte-aligned slice -%% is a valid codepoint boundary — no repair_head/repair_tail needed. +%% In UTF-32 every codepoint is exactly 4 bytes. Start is always a 4-byte-aligned +%% byte offset, so no alignment step is needed. seed(Long, Start) -> SeedSize = size(Long) div 4, - - %% Align Start to a 4-byte (codepoint) boundary. - AlignedStart = (Start div 4) * 4, - <<_Pre:AlignedStart/binary, Seed:SeedSize/binary, _Post/binary>> = Long, - - {AlignedStart, Seed}. + <<_Pre:Start/binary, Seed:SeedSize/binary, _Post/binary>> = Long, + {Start, Seed}. %% Line diff @@ -1285,12 +1284,10 @@ half_match_utf8(A, B) -> end. 
half_match_test() -> - ?assertEqual(undefined, half_match_utf8(<<"1234567890">>, <<"abcdef">>)), - ?assertEqual(undefined, half_match_utf8(<<"12345">>, <<"23">>)), + ?assertEqual(undefined, half_match_utf8(<<"1234567890">>, <<"abcdef">>)), ?assertEqual(undefined, half_match_utf8(<<"12345">>, <<"23">>)), %% Single Match - ?assertEqual({half_match, <<"12">>, <<"90">>, <<"a">>, <<"z">>, <<"345678">>}, - half_match_utf8(<<"1234567890">>, <<"a345678z">>)), + ?assertEqual({half_match, <<"12">>, <<"90">>, <<"a">>, <<"z">>, <<"345678">>}, half_match_utf8(<<"1234567890">>, <<"a345678z">>)), ?assertEqual({half_match, <<"a">>, <<"z">>, <<"12">>, <<"90">>, <<"345678">>}, half_match_utf8(<<"a345678z">>, <<"1234567890">>)), ?assertEqual({half_match, <<"abc">>, <<"z">>, <<"1234">>, <<"0">>, <<"56789">>}, @@ -1311,6 +1308,24 @@ half_match_test() -> ?assertEqual({half_match, <<"qHillo">>, <<"w">>, <<"x">>, <<"Hulloy">>, <<"HelloHe">>}, half_match_utf8(<<"qHilloHelloHew">>, <<"xHelloHeHulloy">>)), + ?assertEqual({half_match, <<"qHillo"/utf8>>, <<"w"/utf8>>, <<"x"/utf8>>, <<"eHull💯y"/utf8>>, <<"🐶🐱🐭🐹🐰H❤️"/utf8>>}, + half_match_utf8(<<"qHillo🐶🐱🐭🐹🐰H❤️w"/utf8>>, <<"x🐶🐱🐭🐹🐰H❤️eHull💯y"/utf8>>)), + + %% Unicode: é is 2 UTF-8 bytes but 1 codepoint (4 UTF-32 bytes). + %% With the old bug, size(Long) div 4 gave the wrong seed position + %% because byte_size in UTF-32 ≠ codepoint_count for multi-byte UTF-8 chars. + %% Long = éééééééééé (10 chars), Short = a + éééééééé + z (10 chars). + %% half_match should find the 8-char common section of é's. + E = <<233/utf8>>, + ULong = binary:copy(E, 10), + UShort = <<"a", (binary:copy(E, 8))/binary, "z">>, + UDiff = diff(ULong, UShort), + ?assertEqual(ULong, source_text(UDiff)), + ?assertEqual(UShort, destination_text(UDiff)), + %% The 8-char run of é must appear as a single equal op. + Equal8 = binary:copy(E, 8), + ?assert(lists:member({equal, Equal8}, UDiff)), + ok. %% common_prefix/suffix operate on UTF-32; wrap for testing. 
From 9b09fb8988fe2b53860dcf7a7dde6976d62beb33 Mon Sep 17 00:00:00 2001 From: MM Zeeman Date: Fri, 10 Apr 2026 15:49:37 +0200 Subject: [PATCH 23/47] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/diffy.erl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/diffy.erl b/src/diffy.erl index b952cf2..282f3cd 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -304,7 +304,9 @@ next_char(_Bin, Pos) -> %% In UTF-32 every codepoint is exactly 4 bytes. Start is always a 4-byte-aligned %% byte offset, so no alignment step is needed. seed(Long, Start) -> - SeedSize = size(Long) div 4, + TotalCodepoints = size(Long) div 4, + SeedCodepoints = TotalCodepoints div 4, + SeedSize = SeedCodepoints * 4, <<_Pre:Start/binary, Seed:SeedSize/binary, _Post/binary>> = Long, {Start, Seed}. From e2815378130327f77ba41359d30a62cc741000c9 Mon Sep 17 00:00:00 2001 From: MM Zeeman Date: Fri, 10 Apr 2026 15:52:38 +0200 Subject: [PATCH 24/47] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/diffy.erl | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index 282f3cd..af32fe6 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -164,7 +164,7 @@ compute_diff(OldText, NewText, CheckLines) -> false -> {NewText, OldText} end, - case binary:match(LongText, ShortText) of + case aligned_utf32_match(LongText, ShortText, 0) of {Start, Length} -> <> = LongText, Op = diff_op(OldStNew), @@ -255,7 +255,7 @@ half_match_i(Long, Short, I) -> best_common(Long, Short, Seed, SeedLoc, Start, BestLongA, BestLongB, BestShortA, BestShortB, BestCommon) -> %% Check if we can find a match for Seed2 inside the shorttext. 
- case binary:match(Short, Seed, [{scope, {Start, size(Short)-Start}}]) of + case aligned_utf32_match(Short, Seed, Start) of nomatch -> case size(BestCommon) * 2 >= size(Long) of false -> @@ -296,6 +296,29 @@ best_common(Long, Short, Seed, SeedLoc, Start, end end. +%% @doc Round a byte offset up to the next UTF-32 codepoint boundary. +align_utf32_offset(Offset) when Offset rem 4 =:= 0 -> + Offset; +align_utf32_offset(Offset) -> + Offset + (4 - (Offset rem 4)). + +%% @doc Find a match whose start offset is aligned to a UTF-32 codepoint boundary. +aligned_utf32_match(Bin, Pattern, Start) -> + AlignedStart = align_utf32_offset(Start), + case AlignedStart >= size(Bin) of + true -> + nomatch; + false -> + case binary:match(Bin, Pattern, [{scope, {AlignedStart, size(Bin) - AlignedStart}}]) of + nomatch -> + nomatch; + {MatchStart, Length} when MatchStart rem 4 =:= 0 -> + {MatchStart, Length}; + {MatchStart, _Length} -> + aligned_utf32_match(Bin, Pattern, MatchStart + 1) + end + end. + %% @doc Return the byte position of the next codepoint in a UTF-32 binary. next_char(_Bin, Pos) -> Pos + 4. From 0f049ea102b1f2087ddd86bdb62c4d470f241890 Mon Sep 17 00:00:00 2001 From: MM Zeeman Date: Fri, 10 Apr 2026 15:53:18 +0200 Subject: [PATCH 25/47] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/diffy.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffy.erl b/src/diffy.erl index af32fe6..c32b2f0 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -1219,7 +1219,7 @@ common_suffix(Text1, Text2) -> % @deprecated Use text_size32/1 internally. This public function may be removed in a future version. -spec text_size(unicode:unicode_binary()) -> non_neg_integer(). text_size(Text) when is_binary(Text) -> - string:length(Text). + byte_size(to_utf32(Text)) div 4. % @doc Count the number of codepoints in a UTF-32 binary. O(1). 
text_size32(Text) when is_binary(Text) -> From 7fc3b2328418c2016efbc4986b0594bf54c87e1f Mon Sep 17 00:00:00 2001 From: MM Zeeman Date: Fri, 10 Apr 2026 15:56:57 +0200 Subject: [PATCH 26/47] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/diffy.erl | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index c32b2f0..f34b6fd 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -1237,11 +1237,25 @@ text_smaller_than(Text, Size) -> % @doc Convert a UTF-8 binary to UTF-32, crashing on invalid input. to_utf32(Bin) -> - <<_/binary>> = unicode:characters_to_binary(Bin, utf8, utf32). + case unicode:characters_to_binary(Bin, utf8, utf32) of + Out when is_binary(Out) -> + Out; + {error, _, _} -> + error(badarg); + {incomplete, _, _} -> + error(badarg) + end. % @doc Convert a UTF-32 binary to UTF-8, crashing on invalid input. to_utf8(Bin) -> - <<_/binary>> = unicode:characters_to_binary(Bin, utf32, utf8). + case unicode:characters_to_binary(Bin, utf32, utf8) of + Out when is_binary(Out) -> + Out; + {error, _, _} -> + error(badarg); + {incomplete, _, _} -> + error(badarg) + end. %% %% Tests From 001b75c9c796fe7e331701fb61a6316968a9347f Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Fri, 10 Apr 2026 15:57:54 +0200 Subject: [PATCH 27/47] Fix text_size test --- test/diffy_tests.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl index 06f9e86..a642f2c 100644 --- a/test/diffy_tests.erl +++ b/test/diffy_tests.erl @@ -314,7 +314,7 @@ text_size_test() -> ?assertEqual(4, diffy:text_size(<<1046/utf8, 1011/utf8, 1022/utf8, 127/utf8>>)), %% Bad utf-8 input results in a badarg. - ?assertError({badarg, _}, diffy:text_size(<<149,157,112,8>>)), + ?assertError(badarg, diffy:text_size(<<149,157,112,8>>)), ok. 
From b1033019881b91401e5c1119598486444889e4e3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 10 Apr 2026 14:02:14 +0000 Subject: [PATCH 28/47] Initial plan From 356304437c061ce2c394a9c6e3029bbef4fddcfd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 10 Apr 2026 14:04:59 +0000 Subject: [PATCH 29/47] Add seed_test/0 EUnit tests for UTF-32 alignment invariants Agent-Logs-Url: https://github.com/zotonic/diffy/sessions/2c2ca16d-02fc-4d4c-b55a-570d0b9ac07e Co-authored-by: mmzeeman <1024972+mmzeeman@users.noreply.github.com> --- src/diffy.erl | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/src/diffy.erl b/src/diffy.erl index f34b6fd..441584d 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -1486,4 +1486,65 @@ diff_options_test() -> ok. +seed_test() -> + %% 1. Empty binary: no codepoints, seed is empty. + ?assertEqual({0, <<>>}, seed(<<>>, 0)), + + %% 2. Binary shorter than 4 codepoints (3 codepoints): 3 div 4 = 0, seed is empty. + Short3 = to_utf32(<<"abc">>), + ?assertEqual({0, <<>>}, seed(Short3, 0)), + + %% 3. Exactly 4 codepoints, Start=0: seed is 1 codepoint (the first one). + Exact4 = to_utf32(<<"abcd">>), + ?assertEqual({0, to_utf32(<<"a">>)}, seed(Exact4, 0)), + + %% 4. 8 codepoints, Start=0: seed is 2 codepoints starting at offset 0. + Long8 = to_utf32(<<"12345678">>), + ?assertEqual({0, to_utf32(<<"12">>)}, seed(Long8, 0)), + + %% 5. 16 codepoints, Start=8 (byte offset = 2 codepoints in): + %% seed is 4 codepoints; returned Start equals 8 and seed bytes are the correct slice. + Long16 = to_utf32(<<"abcdefghijklmnop">>), + {S5, Seed5} = seed(Long16, 8), + ?assertEqual(8, S5), + ?assertEqual(to_utf32(<<"cdef">>), Seed5), + + %% 6. ASCII text round-trip: "1234567890" (10 chars), seed at quarter-way offset. 
+ Ascii10 = to_utf32(<<"1234567890">>), + %% TotalCodepoints=10, SeedCodepoints=2; Start=0 (quarter-way = 0 for simplicity). + {_, SeedAscii} = seed(Ascii10, 0), + ?assertEqual(<<"12">>, to_utf8(SeedAscii)), + + %% 7. Multi-byte codepoint alignment: 10 Greek letters (2 UTF-8 bytes each, 4 UTF-32 bytes each). + Greek10 = to_utf32(<<"αβγδεζηθικ"/utf8>>), + {Start7, Seed7} = seed(Greek10, 0), + %% Returned Start is 0. + ?assertEqual(0, Start7), + %% Seed is 4-byte-aligned. + ?assertEqual(0, byte_size(Seed7) rem 4), + %% Seed length = (10 div 4) * 4 = 8 bytes = 2 codepoints. + ?assertEqual((10 div 4) * 4, byte_size(Seed7)), + %% Seed decodes back to the first 2 Greek letters. + ?assertEqual(<<"αβ"/utf8>>, to_utf8(Seed7)), + + %% 8. Emoji (4-byte UTF-8 codepoints): 10 emoji, seed is first 2. + Emoji10 = to_utf32(<<"🐶🐱🐭🐹🐰🐨🐯🦁🐮🐷"/utf8>>), + {_, SeedEmoji} = seed(Emoji10, 0), + %% Seed length = (10 div 4) * 4 = 8 bytes = 2 codepoints. + ?assertEqual((10 div 4) * 4, byte_size(SeedEmoji)), + %% Seed decodes back to the first 2 emoji. + ?assertEqual(<<"🐶🐱"/utf8>>, to_utf8(SeedEmoji)), + + %% 9. Seed start offset preserved: non-zero Start is returned unchanged. + Long12 = to_utf32(<<"abcdefghijkl">>), + {Start9, _} = seed(Long12, 8), + ?assertEqual(8, Start9), + + %% 10. Seed is a contiguous slice of Long: binary:part(Long, Start, byte_size(Seed)) =:= Seed. + Long20 = to_utf32(<<"abcdefghijklmnopqrst">>), + {Start10, Seed10} = seed(Long20, 8), + ?assertEqual(Seed10, binary:part(Long20, Start10, byte_size(Seed10))), + + ok. + -endif. 
From 83179221c27db91fca9bbb164548eed7605990ba Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 10 Apr 2026 14:05:36 +0000 Subject: [PATCH 30/47] Fix misleading comments in seed_test/0 Agent-Logs-Url: https://github.com/zotonic/diffy/sessions/2c2ca16d-02fc-4d4c-b55a-570d0b9ac07e Co-authored-by: mmzeeman <1024972+mmzeeman@users.noreply.github.com> --- src/diffy.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index 441584d..33817fe 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -1502,7 +1502,7 @@ seed_test() -> Long8 = to_utf32(<<"12345678">>), ?assertEqual({0, to_utf32(<<"12">>)}, seed(Long8, 0)), - %% 5. 16 codepoints, Start=8 (byte offset = 2 codepoints in): + %% 5. 16 codepoints, Start=8 (8 bytes = 2 codepoints * 4 bytes/codepoint): %% seed is 4 codepoints; returned Start equals 8 and seed bytes are the correct slice. Long16 = to_utf32(<<"abcdefghijklmnop">>), {S5, Seed5} = seed(Long16, 8), @@ -1511,7 +1511,7 @@ seed_test() -> %% 6. ASCII text round-trip: "1234567890" (10 chars), seed at quarter-way offset. Ascii10 = to_utf32(<<"1234567890">>), - %% TotalCodepoints=10, SeedCodepoints=2; Start=0 (quarter-way = 0 for simplicity). + %% TotalCodepoints=10, SeedCodepoints=2; Start=0 to keep the offset 4-byte-aligned. {_, SeedAscii} = seed(Ascii10, 0), ?assertEqual(<<"12">>, to_utf8(SeedAscii)), From 3fbdd22c4658dfb58c9961a899ae6a956492a90f Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Fri, 10 Apr 2026 22:08:03 +0200 Subject: [PATCH 31/47] Fix the aligned_utf32_match function and added tests --- src/diffy.erl | 54 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index 33817fe..b088681 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -65,7 +65,7 @@ -define(PATCH_MARGIN, 4). -define(IS_INS_OR_DEL(Op), (Op =:= insert orelse Op =:= delete)). 
--define(PHASH2_RANGE, (1 bsl 32)). +-define(IS_UTF32_ALIGNED(Offset), (Offset rem 4 =:= 0)). -record(bisect_state, { k1start = 0, k1end = 0, @@ -296,26 +296,24 @@ best_common(Long, Short, Seed, SeedLoc, Start, end end. -%% @doc Round a byte offset up to the next UTF-32 codepoint boundary. -align_utf32_offset(Offset) when Offset rem 4 =:= 0 -> - Offset; -align_utf32_offset(Offset) -> - Offset + (4 - (Offset rem 4)). - %% @doc Find a match whose start offset is aligned to a UTF-32 codepoint boundary. -aligned_utf32_match(Bin, Pattern, Start) -> - AlignedStart = align_utf32_offset(Start), - case AlignedStart >= size(Bin) of +aligned_utf32_match(Bin, Pattern, Start) + when ?IS_UTF32_ALIGNED(Start) andalso Start >= 0 -> + case Start + size(Pattern) > size(Bin) of true -> nomatch; false -> - case binary:match(Bin, Pattern, [{scope, {AlignedStart, size(Bin) - AlignedStart}}]) of + case binary:match(Bin, Pattern, [{scope, {Start, size(Bin) - Start}}]) of nomatch -> nomatch; - {MatchStart, Length} when MatchStart rem 4 =:= 0 -> + {MatchStart, Length} when ?IS_UTF32_ALIGNED(MatchStart) -> + %% Match found, and it is correctly aligned. {MatchStart, Length}; {MatchStart, _Length} -> - aligned_utf32_match(Bin, Pattern, MatchStart + 1) + %% Misaligned hit. binary:match found the first byte-level match, + %% so there is no aligned match before MatchStart. Skip directly + %% to the next aligned boundary after MatchStart. + aligned_utf32_match(Bin, Pattern, MatchStart + (4 - MatchStart rem 4)) end end. @@ -1547,4 +1545,34 @@ seed_test() -> ok. 
+aligned_utf32_match_test() -> + ?assertEqual(nomatch, aligned_utf32_match(<<>>, <<0,0,0,0>>, 0)), + ?assertEqual(nomatch, aligned_utf32_match(<<>>, <<0,0,0,0>>, 4)), + + ?assertError(function_clause, aligned_utf32_match(<<>>, <<0,0,0,0>>, 3)), + ?assertError(function_clause, aligned_utf32_match(<<>>, <<0,0,0,0>>, -4)), + + ?assertEqual({0, 4}, aligned_utf32_match(<<1,2,3,4>>, <<1,2,3,4>>, 0)), + ?assertEqual({4, 4}, aligned_utf32_match(<<0,0,0,0, 1,2,3,4>>, <<1,2,3,4>>, 0)), + + %% These will binary match, but the match is not on a utf32 boundary + ?assertEqual(nomatch, aligned_utf32_match(<<0,0,1,2, 3,4,5,6>>, <<1,2,3,4>>, 0)), + ?assertEqual({8,4}, aligned_utf32_match(<<0,0,1,2, 3,4,5,6, 1,2,3,4>>, <<1,2,3,4>>, 0)), + ?assertEqual({8,4}, aligned_utf32_match(<<0,0,1,2, 3,4,5,6, 1,2,3,4>>, <<1,2,3,4>>, 4)), + ?assertEqual(nomatch, aligned_utf32_match(<<0,0,1,2, 3,4,5,1, 2,3,4,0>>, <<1,2,3,4>>, 4)), + + %% Some longer matches + ?assertEqual({40, 20}, aligned_utf32_match(to_utf32(<<"the quick brown fox jumps over the lazy dog"/utf8>>), + to_utf32(<<"brown"/utf8>>), 0)), + ?assertEqual(nomatch, aligned_utf32_match(to_utf32(<<"the quick brown fox jumps over the lazy dog"/utf8>>), + to_utf32(<<"blue"/utf8>>), 0)), + + %% All emoticon matches emoticons + ?assertEqual(nomatch, aligned_utf32_match(to_utf32(<<"😔😟😕🙁☹️😣😖😫😩🥺🥶"/utf8>>), + to_utf32(<<"💩"/utf8>>), 0)), + ?assertEqual({16,12}, aligned_utf32_match(to_utf32(<<"😔😟😕🙁☹️💩😣😖😫😩🥺🥶"/utf8>>), + to_utf32(<<"☹️💩"/utf8>>), 0)), + + ok. + -endif. 
From e19afc27fa552b84d0a7c084381a370ca33074a7 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Fri, 10 Apr 2026 23:08:29 +0200 Subject: [PATCH 32/47] Fix an issue where the overlap could match outside utf32 char boundaries --- src/diffy.erl | 33 ++++++++++++++++++++++++--------- test/diffy_tests.erl | 1 + 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index b088681..2734ce6 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -911,10 +911,13 @@ common_overlap(Text1, Text2) -> T1Len = text_size32(Text1), T2Len = text_size32(Text2), {T1, T2, TMin} = if - T1Len > T2Len -> {substring_end(Text1, T2Len), Text2, T2Len}; - T1Len < T2Len -> {Text1, substring_start(Text2, T1Len), T1Len}; - true -> {Text1, Text2, T1Len} - end, + T1Len > T2Len -> + {substring_end(Text1, T2Len), Text2, T2Len}; + T1Len < T2Len -> + {Text1, substring_start(Text2, T1Len), T1Len}; + true -> + {Text1, Text2, T1Len} + end, case T1 =:= T2 of true -> TMin; false -> common_overlap_loop(T1, T2, TMin, 0, 1) @@ -922,18 +925,18 @@ common_overlap(Text1, Text2) -> common_overlap_loop(T1, T2, TMin, Best, Length) when Length =< TMin -> Pattern = substring_end(T1, Length), - case binary:match(T2, Pattern) of + case aligned_utf32_match(T2, Pattern, 0) of nomatch -> Best; {FoundByteOffset, _} -> %% In UTF-32, byte offset maps directly to codepoint count. FoundCharCount = FoundByteOffset div 4, NewLength = Length + FoundCharCount, - if - NewLength > TMin -> Best; - true -> + case NewLength > TMin of + true -> Best; + false -> case substring_end(T1, NewLength) =:= substring_start(T2, NewLength) of true -> - common_overlap_loop(T1, T2, TMin, NewLength, NewLength + 1); + common_overlap_loop(T1, T2, TMin, NewLength, NewLength + 1); false -> common_overlap_loop(T1, T2, TMin, Best, NewLength + 1) end @@ -1575,4 +1578,16 @@ aligned_utf32_match_test() -> ok. 
+common_overlap_loop_test() -> + Abc = to_utf32(<<"abc">>), + Cde = to_utf32(<<"cde">>), + ?assertEqual(1, common_overlap_loop(Abc, Cde, size(Cde), 0, 1)), + + Abcdef = to_utf32(<<"abcdef">>), + Efde = to_utf32(<<"efde">>), + ?assertEqual(2, common_overlap_loop(Abcdef, Efde, size(Cde), 0, 1)), + + ok. + + -endif. diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl index a642f2c..0e1980e 100644 --- a/test/diffy_tests.erl +++ b/test/diffy_tests.erl @@ -267,6 +267,7 @@ cleanup_semantic_test() -> ?assertEqual(diffy:destination_text(Diffs), diffy:destination_text(Cleaned)), ok. + cleanup_efficiency_prop_test() -> ?assertEqual(true, proper:quickcheck(prop_cleanup_efficiency(), [{numtests, ?NUM_TESTS}, {to_file, user}])), ok. From 953afd4d4a7284971749e23859e5855f54ac091a Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Sat, 11 Apr 2026 00:13:32 +0200 Subject: [PATCH 33/47] Fix an issue where the overlap could match outside utf32 char boundaries --- src/diffy.erl | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index 2734ce6..6498ac2 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -926,7 +926,8 @@ common_overlap(Text1, Text2) -> common_overlap_loop(T1, T2, TMin, Best, Length) when Length =< TMin -> Pattern = substring_end(T1, Length), case aligned_utf32_match(T2, Pattern, 0) of - nomatch -> Best; + nomatch -> + Best; {FoundByteOffset, _} -> %% In UTF-32, byte offset maps directly to codepoint count. 
FoundCharCount = FoundByteOffset div 4, @@ -936,7 +937,7 @@ common_overlap_loop(T1, T2, TMin, Best, Length) when Length =< TMin -> false -> case substring_end(T1, NewLength) =:= substring_start(T2, NewLength) of true -> - common_overlap_loop(T1, T2, TMin, NewLength, NewLength + 1); + common_overlap_loop(T1, T2, TMin, NewLength, NewLength + 1); false -> common_overlap_loop(T1, T2, TMin, Best, NewLength + 1) end @@ -1553,7 +1554,6 @@ aligned_utf32_match_test() -> ?assertEqual(nomatch, aligned_utf32_match(<<>>, <<0,0,0,0>>, 4)), ?assertError(function_clause, aligned_utf32_match(<<>>, <<0,0,0,0>>, 3)), - ?assertError(function_clause, aligned_utf32_match(<<>>, <<0,0,0,0>>, -4)), ?assertEqual({0, 4}, aligned_utf32_match(<<1,2,3,4>>, <<1,2,3,4>>, 0)), ?assertEqual({4, 4}, aligned_utf32_match(<<0,0,0,0, 1,2,3,4>>, <<1,2,3,4>>, 0)), @@ -1578,6 +1578,15 @@ aligned_utf32_match_test() -> ok. +common_overlap_test() -> + A = to_utf32(<<"Fire at Will">>), + B = to_utf32(<<"William Riker is number one">>), + + ?assertEqual(4, common_overlap(A, B)), + + ok. + + common_overlap_loop_test() -> Abc = to_utf32(<<"abc">>), Cde = to_utf32(<<"cde">>), From d760a29682aa33871158fb8693ffb77be9c4e59a Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Sat, 11 Apr 2026 15:25:07 +0200 Subject: [PATCH 34/47] Removed unique match function, not relevant for the api and confusing --- src/diffy.erl | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index 6498ac2..68f03a8 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -45,8 +45,7 @@ text_size/1, - split_pre_and_suffix/2, - unique_match/2 + split_pre_and_suffix/2 ]). -type diff_op() :: delete | equal | insert. @@ -1123,24 +1122,6 @@ make_patch([{equal, Data}|T], PrePatchText, PostPatchText, Count1, Count2, [Patc make_patch(T, PrePatchText, PostPatchText, Count1+Size, Count2+Size, [P|Rest]). - -% @doc Returns true iff Pattern is a unique match inside Text. 
-unique_match(Pattern, Text) -> - TextSize = size(Text), - case binary:match(Text, Pattern) of - nomatch -> - error(nomatch); - {Start, Length} when Start + 1 + Length < TextSize -> - %% We have a match, and we can search.. - case binary:match(Text, Pattern, [{scope, {Start+1, TextSize-Start-1}}]) of - nomatch -> true; - {_, _} -> false - end; - {_, _} -> - true - end. - - %% %% Helpers %% @@ -1411,13 +1392,6 @@ split_pre_and_suffix_test() -> Split(<<"cat ">>, <<"cat mouse dog ">>)), ok. -unique_match_test() -> - ?assertEqual(true, unique_match(<<"a">>, <<"abc">>)), - ?assertEqual(true, unique_match(<<"b">>, <<"abc">>)), - ?assertEqual(true, unique_match(<<"c">>, <<"abc">>)), - ?assertEqual(false, unique_match(<<"ab">>, <<"abab">>)), - ok. - text_smaller_than_test() -> %% text_smaller_than now works on UTF-32 binaries. ?assertEqual(true, text_smaller_than(to_utf32(<<>>), 5)), From b462e21969b5244b5800a4a6fe8bf0209f9e9ac3 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Sat, 11 Apr 2026 21:47:22 +0200 Subject: [PATCH 35/47] Use aligned utf32 match to do line_diff. Also removed unneeded conversion from and to utf8 --- src/diffy.erl | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index 68f03a8..07756b9 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -390,32 +390,28 @@ cleanup_line_diff([{equal, _}=E|Rest], DeleteData, InsertData, _TmpAcc, Acc) -> %% Text1 and Text2 are UTF-32 binaries. Lines are stored as UTF-32 binaries. %% CharText1/CharText2 are UTF-32 binaries where each 4-byte word is a line index. 
lines_to_chars(Text1, Text2) -> - Utf8Text1 = to_utf8(Text1), - Utf8Text2 = to_utf8(Text2), - {CharText1, NextChar, Lines1, Map1} = lines_to_chars(Utf8Text1, 0, <<>>, 0, [], #{}), - {CharText2, _, Lines2, _Map2} = lines_to_chars(Utf8Text2, 0, <<>>, NextChar, Lines1, Map1), - + {CharText1, NextChar, Lines1, Map1} = lines_to_chars(Text1, 0, <<>>, 0, [], #{}), + {CharText2, _, Lines2, _Map2} = lines_to_chars(Text2, 0, <<>>, NextChar, Lines1, Map1), {CharText1, CharText2, lists:reverse(Lines2)}. %% Transform each unique line into a 4-byte index; store line content as UTF-32. lines_to_chars(Text, Idx, CharText, NextChar, Lines, Map) when Idx >= byte_size(Text) -> {CharText, NextChar, Lines, Map}; -lines_to_chars(Text, Idx, CharText, NextChar, Lines, Map) -> - case binary:match(Text, <<"\n">>, [{scope, {Idx, byte_size(Text)-Idx}}]) of +lines_to_chars(Text, Idx, CharText, NextChar, Lines, Map) when ?IS_UTF32_ALIGNED(Idx) -> + case aligned_utf32_match(Text, <<$\n:32>>, Idx) of nomatch -> <<_:Idx/binary, Line/binary>> = Text, - {Char, NextChar1, Lines1, Map1} = insert_line(to_utf32(Line), Lines, Map, NextChar), + {Char, NextChar1, Lines1, Map1} = insert_line(Line, Lines, Map, NextChar), CharText1 = <>, {CharText1, NextChar1, Lines1, Map1}; {Start, _} -> - LineLength = Start - Idx + 1, + LineLength = Start - Idx + 4, <<_:Idx/binary, Line:LineLength/binary, _/binary>> = Text, - {Char, NextChar1, Lines1, Map1} = insert_line(to_utf32(Line), Lines, Map, NextChar), + {Char, NextChar1, Lines1, Map1} = insert_line(Line, Lines, Map, NextChar), CharText1 = <>, lines_to_chars(Text, Idx + LineLength, CharText1, NextChar1, Lines1, Map1) end. - insert_line(Line, Lines, Map, NextChar) -> case Map of #{Line := Char} -> @@ -1555,12 +1551,9 @@ aligned_utf32_match_test() -> common_overlap_test() -> A = to_utf32(<<"Fire at Will">>), B = to_utf32(<<"William Riker is number one">>), - ?assertEqual(4, common_overlap(A, B)), - ok. 
- common_overlap_loop_test() -> Abc = to_utf32(<<"abc">>), Cde = to_utf32(<<"cde">>), @@ -1572,5 +1565,4 @@ common_overlap_loop_test() -> ok. - -endif. From 23ce7dc9b2ede03f656506a319f772f40dea916b Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Sat, 11 Apr 2026 21:59:30 +0200 Subject: [PATCH 36/47] Replaced some functions with macros --- src/diffy.erl | 36 +++++++++--------------------------- 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index 07756b9..f5e513a 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -65,6 +65,9 @@ -define(PATCH_MARGIN, 4). -define(IS_INS_OR_DEL(Op), (Op =:= insert orelse Op =:= delete)). -define(IS_UTF32_ALIGNED(Offset), (Offset rem 4 =:= 0)). +-define(IS_WS(C), (C =:= $\s orelse C =:= $\t orelse C =:= $\n orelse C =:= $\r orelse C =:= $\f orelse C =:= $\v)). +-define(IS_LB(C), (C =:= $\n orelse C =:= $\r)). +-define(IS_ALPHA(C), ((C >= $a andalso C =< $z) orelse (C >= $A andalso C =< $Z) orelse (C >= $0 andalso C =< $9))). 
-record(bisect_state, { k1start = 0, k1end = 0, @@ -833,12 +836,12 @@ cleanup_semantic_score(_, <<>>) -> 6; cleanup_semantic_score(One, Two) -> Char1 = last_char(One), Char2 = first_char(Two), - NonAlphaNumeric1 = is_non_alphanumeric(Char1), - NonAlphaNumeric2 = is_non_alphanumeric(Char2), - Whitespace1 = NonAlphaNumeric1 andalso is_whitespace(Char1), - Whitespace2 = NonAlphaNumeric2 andalso is_whitespace(Char2), - LineBreak1 = Whitespace1 andalso is_linebreak(Char1), - LineBreak2 = Whitespace2 andalso is_linebreak(Char2), + NonAlphaNumeric1 = not ?IS_ALPHA(Char1), + NonAlphaNumeric2 = not ?IS_ALPHA(Char2), + Whitespace1 = NonAlphaNumeric1 andalso ?IS_WS(Char1), + Whitespace2 = NonAlphaNumeric2 andalso ?IS_WS(Char2), + LineBreak1 = Whitespace1 andalso ?IS_LB(Char1), + LineBreak2 = Whitespace2 andalso ?IS_LB(Char2), BlankLine1 = LineBreak1 andalso is_blankline_end(One), BlankLine2 = LineBreak2 andalso is_blankline_start(Two), if @@ -951,27 +954,6 @@ last_char(Bin) -> <<_:(Size-4)/binary, C:32>> = Bin, C. -is_non_alphanumeric(undefined) -> true; -is_non_alphanumeric(C) -> - not ((C >= $a andalso C =< $z) orelse - (C >= $A andalso C =< $Z) orelse - (C >= $0 andalso C =< $9)). - -is_whitespace(undefined) -> false; -is_whitespace(C) -> - case C of - $\s -> true; - $\t -> true; - $\n -> true; - $\r -> true; - $\f -> true; - $\v -> true; - _ -> false - end. - -is_linebreak(C) -> - C =:= $\n orelse C =:= $\r. - %% In UTF-32 each codepoint is 4 bytes, so newline patterns are fixed-width. 
is_blankline_end(Bin) when byte_size(Bin) >= 8 -> Size = byte_size(Bin), From cff9bcd6a2f9164090cc826358578897c61bd0b2 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Sat, 11 Apr 2026 22:14:37 +0200 Subject: [PATCH 37/47] Some reformatting --- src/diffy.erl | 92 ++++++++++++++++++++++++--------------------------- 1 file changed, 43 insertions(+), 49 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index f5e513a..d017107 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -109,14 +109,14 @@ diff(Text1, Text2, Options) when is_list(Options) -> T2 = to_utf32(Text2), Diffs32 = diff32(T1, T2, CheckLines), Diffs1 = case lists:member(semantic, Options) of - true -> cleanup_semantic32(Diffs32); - false -> Diffs32 - end, + true -> cleanup_semantic32(Diffs32); + false -> Diffs32 + end, Diffs2 = case efficiency_opt(Options) of - none -> Diffs1; - default -> cleanup_efficiency32(Diffs1); - {custom, Cost} -> cleanup_efficiency32(Diffs1, Cost) - end, + none -> Diffs1; + default -> cleanup_efficiency32(Diffs1); + {custom, Cost} -> cleanup_efficiency32(Diffs1, Cost) + end, %% Single conversion at the exit boundary. [{Op, to_utf8(D)} || {Op, D} <- Diffs2]. @@ -519,12 +519,10 @@ compute_diff_bisect1(A, B, M, N) -> true -> % Mirror x2 onto top-left coordinate system. X2 = M - V2AtOffset, - if - X1_1 >= X2 -> - % Overlap detected - throw({overlap, X1_1, Y1_1}); - true -> - {continue, S2_1} + case X1_1 >= X2 of + % Overlap detected + true -> throw({overlap, X1_1, Y1_1}); + false -> {continue, S2_1} end; false -> {continue, S2_1} end @@ -568,13 +566,11 @@ compute_diff_bisect1(A, B, M, N) -> true -> X1 = V1AtOffset, Y1 = VOffset + X1 - K1Offset, - if - % Mirror x2 onto top-left coordinate system. - X1 >= M - X2_1 -> - % Overlap detected - throw({overlap, X1, Y1}); - true -> - {continue, S4_1} + % Mirror x2 onto top-left coordinate system. 
+ case X1 >= M - X2_1 of + % Overlap detected + true -> throw({overlap, X1, Y1}); + false -> {continue, S4_1} end; false -> {continue, S4_1} end @@ -795,13 +791,13 @@ cleanup_semantic_lossless([], Acc) -> slide_edit(E1, Edit, E2) -> Suffix = common_suffix(E1, Edit), {E1_1, Edit_1, E2_1} = case Suffix of - <<>> -> {E1, Edit, E2}; - _ -> - SLen = size(Suffix), - { binary:part(E1, 0, size(E1) - SLen), - <>, - <> } - end, + <<>> -> {E1, Edit, E2}; + _ -> + SLen = size(Suffix), + { binary:part(E1, 0, size(E1) - SLen), + <>, + <> } + end, find_best_slide(E1_1, Edit_1, E2_1). find_best_slide(E1, Edit, E2) -> @@ -815,11 +811,9 @@ find_best_slide(E1, Edit, E2, BestScore, BestE1, BestEdit, BestE2) -> NewEdit = <>, NewE2 = RestE2, NewScore = cleanup_semantic_score(NewE1, NewEdit) + cleanup_semantic_score(NewEdit, NewE2), - if - NewScore >= BestScore -> - find_best_slide(NewE1, NewEdit, NewE2, NewScore, NewE1, NewEdit, NewE2); - true -> - find_best_slide(NewE1, NewEdit, NewE2, BestScore, BestE1, BestEdit, BestE2) + case NewScore >= BestScore of + true -> find_best_slide(NewE1, NewEdit, NewE2, NewScore, NewE1, NewEdit, NewE2); + false -> find_best_slide(NewE1, NewEdit, NewE2, BestScore, BestE1, BestEdit, BestE2) end; false -> {BestE1, BestEdit, BestE2} @@ -861,25 +855,25 @@ cleanup_semantic_overlaps([{delete, Del}, {insert, Ins} | T], Acc) -> Overlap2 = common_overlap(Ins, Del), TDel = text_size32(Del), TIns = text_size32(Ins), - if - Overlap1 >= Overlap2 -> - if - Overlap1 * 2 >= TDel orelse Overlap1 * 2 >= TIns -> + case Overlap1 >= Overlap2 of + true -> + case Overlap1 * 2 >= TDel orelse Overlap1 * 2 >= TIns of + true -> Common = binary:part(Ins, 0, Overlap1 * 4), NewDel = binary:part(Del, 0, (TDel - Overlap1) * 4), NewIns = binary:part(Ins, Overlap1 * 4, (TIns - Overlap1) * 4), cleanup_semantic_overlaps([{insert, NewIns} | T], [{equal, Common}, {delete, NewDel} | Acc]); - true -> + false -> cleanup_semantic_overlaps([{insert, Ins} | T], [{delete, Del} | Acc]) end; - true -> 
- if - Overlap2 * 2 >= TIns orelse Overlap2 * 2 >= TDel -> + false -> + case Overlap2 * 2 >= TIns orelse Overlap2 * 2 >= TDel of + true -> Common = binary:part(Ins, (TIns - Overlap2) * 4, Overlap2 * 4), NewIns = binary:part(Ins, 0, (TIns - Overlap2) * 4), NewDel = binary:part(Del, Overlap2 * 4, (TDel - Overlap2) * 4), cleanup_semantic_overlaps([{delete, NewDel} | T], [{equal, Common}, {insert, NewIns} | Acc]); - true -> + false -> cleanup_semantic_overlaps([{insert, Ins} | T], [{delete, Del} | Acc]) end end; @@ -958,7 +952,7 @@ last_char(Bin) -> is_blankline_end(Bin) when byte_size(Bin) >= 8 -> Size = byte_size(Bin), case Bin of - <<_:(Size-8)/binary, $\n:32, $\n:32>> -> true; + <<_:(Size-8)/binary, $\n:32, $\n:32>> -> true; <<_:(Size-12)/binary, $\n:32, $\r:32, $\n:32>> -> true; _ -> false end; @@ -966,9 +960,9 @@ is_blankline_end(_) -> false. is_blankline_start(Bin) when byte_size(Bin) >= 8 -> case Bin of - <<$\n:32, $\n:32, _/binary>> -> true; - <<$\n:32, $\r:32, $\n:32, _/binary>> -> true; - <<$\r:32, $\n:32, $\n:32, _/binary>> -> true; + <<$\n:32, $\n:32, _/binary>> -> true; + <<$\n:32, $\r:32, $\n:32, _/binary>> -> true; + <<$\r:32, $\n:32, $\n:32, _/binary>> -> true; <<$\r:32, $\n:32, $\r:32, $\n:32, _/binary>> -> true; _ -> false end; @@ -1001,8 +995,8 @@ cleanup_efficiency32([], Changed, _EditCost, Acc) -> end; %% Any equality which is surrounded on both sides by an insertion and deletion need less then %% EditCost characters for it to be advantageous to split. 
-cleanup_efficiency32([{O1, _}=A, {equal, XY}=E, {O2, _}=B | T], Changed, EditCost, Acc) when - O1 =/= O2 andalso ?IS_INS_OR_DEL(O1) andalso ?IS_INS_OR_DEL(O2) -> +cleanup_efficiency32([{O1, _}=A, {equal, XY}=E, {O2, _}=B | T], Changed, EditCost, Acc) + when O1 =/= O2 andalso ?IS_INS_OR_DEL(O1) andalso ?IS_INS_OR_DEL(O2) -> case text_smaller_than(XY, EditCost) of true -> Del = {delete, XY}, @@ -1014,8 +1008,8 @@ cleanup_efficiency32([{O1, _}=A, {equal, XY}=E, {O2, _}=B | T], Changed, EditCos %% Any equality which is surrounded on one side by an existing insertion and deletion and on the %% other side by an existing insertion or deletion needs less than half C characters long for it %% to be advantageous to split. -cleanup_efficiency32([{O1, _}=A, {O2, _}=B, {equal, X}=E, {O3, _}=C | T], Changed, EditCost, Acc) when - O1 =/= O2 andalso ?IS_INS_OR_DEL(O1) andalso ?IS_INS_OR_DEL(O2) andalso ?IS_INS_OR_DEL(O3) -> +cleanup_efficiency32([{O1, _}=A, {O2, _}=B, {equal, X}=E, {O3, _}=C | T], Changed, EditCost, Acc) + when O1 =/= O2 andalso ?IS_INS_OR_DEL(O1) andalso ?IS_INS_OR_DEL(O2) andalso ?IS_INS_OR_DEL(O3) -> case text_smaller_than(X, EditCost div 2 + 1) of true -> Del = {delete, X}, From 15360775d8ff76e1d4df72b57f807257c840aa60 Mon Sep 17 00:00:00 2001 From: MM Zeeman Date: Sun, 12 Apr 2026 10:51:28 +0200 Subject: [PATCH 38/47] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b3e31ac..716ad98 100644 --- a/Makefile +++ b/Makefile @@ -36,7 +36,7 @@ clean_doc: distclean: clean_doc @rm -rf _build - @rm $(REBAR) + @rm -f $(REBAR) doc: $(REBAR) $(REBAR) ex_doc --output doc --formatter html From 554a227d586a2573ec5d1fb33555965590a12343 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Sun, 12 Apr 2026 10:56:44 +0200 Subject: [PATCH 39/47] Fix size parameter in common_overlap_loop test 
--- src/diffy.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index d017107..be93a86 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -1533,11 +1533,11 @@ common_overlap_test() -> common_overlap_loop_test() -> Abc = to_utf32(<<"abc">>), Cde = to_utf32(<<"cde">>), - ?assertEqual(1, common_overlap_loop(Abc, Cde, size(Cde), 0, 1)), + ?assertEqual(1, common_overlap_loop(Abc, Cde, text_size(Cde), 0, 1)), Abcdef = to_utf32(<<"abcdef">>), Efde = to_utf32(<<"efde">>), - ?assertEqual(2, common_overlap_loop(Abcdef, Efde, size(Cde), 0, 1)), + ?assertEqual(2, common_overlap_loop(Abcdef, Efde, text_size(Efde), 0, 1)), ok. From 023124c450ad8679ee2ff7efaf105a2151bd26f9 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Sun, 12 Apr 2026 12:06:40 +0200 Subject: [PATCH 40/47] Minor refactoring --- src/diffy.erl | 42 +++++++++++++++--------------------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index be93a86..3aee7ea 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -62,6 +62,7 @@ -export_type([diff_op/0, diff/0, diffs/0, diff_option/0]). +-define(DEFAULT_EDIT_COST, 4). -define(PATCH_MARGIN, 4). -define(IS_INS_OR_DEL(Op), (Op =:= insert orelse Op =:= delete)). -define(IS_UTF32_ALIGNED(Offset), (Offset rem 4 =:= 0)). @@ -104,38 +105,27 @@ diff(Text1, Text2) -> % Cleanups are always applied in the correct order: semantic first, then efficiency. -spec diff(unicode:unicode_binary(), unicode:unicode_binary(), [diff_option()]) -> diffs(). 
diff(Text1, Text2, Options) when is_list(Options) -> - CheckLines = not lists:member(no_linemode, Options), T1 = to_utf32(Text1), T2 = to_utf32(Text2), + CheckLines = not proplists:get_value(no_linemode, Options, false), Diffs32 = diff32(T1, T2, CheckLines), - Diffs1 = case lists:member(semantic, Options) of + Diffs1 = case proplists:get_value(semantic, Options) of true -> cleanup_semantic32(Diffs32); - false -> Diffs32 + _ -> Diffs32 end, - Diffs2 = case efficiency_opt(Options) of - none -> Diffs1; - default -> cleanup_efficiency32(Diffs1); - {custom, Cost} -> cleanup_efficiency32(Diffs1, Cost) + Diffs2 = case proplists:get_value(efficiency, Options) of + true -> cleanup_efficiency32(Diffs1); + Cost when is_integer(Cost) andalso Cost > 0 -> cleanup_efficiency32(Diffs1, Cost); + _ -> Diffs1 end, %% Single conversion at the exit boundary. [{Op, to_utf8(D)} || {Op, D} <- Diffs2]. -%% Extract the efficiency option, preferring {efficiency, Cost} over plain efficiency. -efficiency_opt(Options) -> - case lists:keyfind(efficiency, 1, Options) of - {efficiency, Cost} -> {custom, Cost}; - false -> - case lists:member(efficiency, Options) of - true -> default; - false -> none - end - end. - %% Internal diff working entirely in UTF-32 binaries. -diff32(<<>>, <<>>, _CheckLines) -> - []; -diff32(Text1, Text2, _CheckLines) when Text1 =:= Text2 -> - [{equal, Text1}]; +diff32(<<>>, <<>>, _CheckLines) -> []; +diff32(<<>>, Text2, _CheckLines) -> [{insert, Text2}]; +diff32(Text1, <<>>, _CheckLines) -> [{delete, Text1}]; +diff32(Text1, Text2, _CheckLines) when Text1 =:= Text2 -> [{equal, Text1}]; diff32(Text1, Text2, CheckLines) -> {Prefix, MText1, MText2, Suffix} = split_pre_and_suffix(Text1, Text2), @@ -154,10 +144,8 @@ diff32(Text1, Text2, CheckLines) -> cleanup_merge32(Diffs2). %% This assumes Text1 and Text2 don't have a common prefix. Operates on UTF-32. 
-compute_diff(<<>>, NewText, _CheckLines) -> - [{insert, NewText}]; -compute_diff(OldText, <<>>, _CheckLines) -> - [{delete, OldText}]; +compute_diff(<<>>, NewText, _CheckLines) -> [{insert, NewText}]; +compute_diff(OldText, <<>>, _CheckLines) -> [{delete, OldText}]; compute_diff(OldText, NewText, CheckLines) -> OldStNew = size(OldText) < size(NewText), @@ -981,7 +969,7 @@ cleanup_efficiency(Diffs, EditCost) -> %% Internal efficiency cleanup operating on UTF-32 diffs. cleanup_efficiency32(Diffs) -> - cleanup_efficiency32(Diffs, 4). + cleanup_efficiency32(Diffs, ?DEFAULT_EDIT_COST). cleanup_efficiency32(Diffs, EditCost) -> cleanup_efficiency32(Diffs, false, EditCost, []). From 5c7759481bf2dac53e2afb319285ec97a76bcd46 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Sun, 12 Apr 2026 20:35:01 +0200 Subject: [PATCH 41/47] Fix a problem where cleanup_merge did not collapse all possible outcomes --- src/diffy.erl | 32 +++++++++++++++----------------- test/diffy_tests.erl | 25 +++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index 3aee7ea..b47f868 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -658,8 +658,7 @@ cleanup_merge(Diffs) -> %% Internal cleanup_merge operating on UTF-32 diffs. cleanup_merge32(Diffs) -> - Diffs1 = cleanup_merge32(Diffs, []), - canonicalize_edits(Diffs1, []). + cleanup_merge32(Diffs, []). %% Done cleanup_merge32([], Acc) -> @@ -667,19 +666,25 @@ cleanup_merge32([], Acc) -> %% Remove operations without data. cleanup_merge32([{_Op, <<>>}|T], Acc) -> cleanup_merge32(T, Acc); -%% Merge data from equal operations +%% Ensure delete/insert ordering: if insert is on top and a delete arrives, sink the insert. +cleanup_merge32([{delete, _}=D|T], [{insert, _}=I|Acc]) -> + cleanup_merge32([D, I|T], Acc); +%% Merge data from equal operations. 
cleanup_merge32([{Op2, Data2}|T], [{Op1, Data1}|Acc]) when Op1 =:= Op2 -> cleanup_merge32(T, [{Op1, <>}|Acc]); -%% Cleanup edits before equal operation -cleanup_merge32([{Op1, Data1}|T], [{Op2, _}=I, {Op3, Data3}|Acc]) when Op1 =/= Op2 andalso Op1 =:= Op3 andalso Op2 =/= equal andalso Op3 =/= equal -> - cleanup_merge32(T, [I, {Op3, <>}|Acc]); -%% Check if Op1Data and Op2Data have common prefixes. -cleanup_merge32([{equal, E1}|T], [{Op1, Op1Data}, {Op2, Op2Data}, {equal, E2}|Acc]) when Op1 =/= Op2 andalso Op1 =/= equal andalso Op2 =/= equal -> +%% Cleanup edits before equal operation — re-queue merged op for further processing. +cleanup_merge32([{Op1, Data1}|T], [{Op2, _}=I, {Op3, Data3}|Acc]) + when Op1 =/= Op2 andalso Op1 =:= Op3 andalso Op2 =/= equal andalso Op3 =/= equal -> + cleanup_merge32([I, {Op3, <>} | T], Acc); +%% Factor out common prefixes and suffixes from adjacent insert/delete pairs. +cleanup_merge32([{equal, E1}|T], [{Op1, Op1Data}, {Op2, Op2Data}, {equal, E2}|Acc]) + when Op1 =/= Op2 andalso Op1 =/= equal andalso Op2 =/= equal -> {Prefix, Op1DataD, Op2DataD, Suffix} = split_pre_and_suffix(Op1Data, Op2Data), cleanup_merge32(T, [{equal, <>}, {Op1, Op1DataD}, {Op2, Op2DataD}, {equal, <>}|Acc]); -%% Check for slide left and slide right edits -cleanup_merge32([{equal, E1}=H|T], [{Op, I}, {equal, E2}|AccTail]=Acc) when Op =:= insert orelse Op =:= delete -> +%% Slide edits left and right. +cleanup_merge32([{equal, E1}=H|T], [{Op, I}, {equal, E2}|AccTail]=Acc) + when Op =:= insert orelse Op =:= delete -> case is_suffix(E2, I) of false -> case is_prefix(E1, I) of @@ -698,13 +703,6 @@ cleanup_merge32([{equal, E1}=H|T], [{Op, I}, {equal, E2}|AccTail]=Acc) when Op = cleanup_merge32([H|T], Acc) -> cleanup_merge32(T, [H|Acc]). 
-canonicalize_edits([{insert, I}, {delete, D} | T], Acc) -> - canonicalize_edits(T, [{insert, I}, {delete, D} | Acc]); -canonicalize_edits([H | T], Acc) -> - canonicalize_edits(T, [H | Acc]); -canonicalize_edits([], Acc) -> - lists:reverse(Acc). - % @doc Do semantic cleanup of diffs % -spec cleanup_semantic(diffs()) -> diffs(). diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl index 0e1980e..293ca46 100644 --- a/test/diffy_tests.erl +++ b/test/diffy_tests.erl @@ -48,6 +48,13 @@ prop_cleanup_merge() -> andalso DestinationText =:= diffy:destination_text(CleanDiffs) end). +prop_cleanup_merge_idempotent() -> + ?FORALL(Diffs, list({diff_op(), proper_unicode:utf8()}), + begin + Cleaned = cleanup_merge(Diffs), + Cleaned =:= cleanup_merge(Cleaned) + end). + prop_cleanup_efficiency() -> ?FORALL(Diffs, list({diff_op(), proper_unicode:utf8()}), begin @@ -233,6 +240,24 @@ cleanup_merge_test() -> ok. +%% delete/insert/delete — the two deletes merge, then insert must be re-checked +%% against the equal below it, which should then slide +requeue_i_test() -> + ?assertEqual([{delete, <<"aXa">>}, {insert, <<"b">>}], + cleanup_merge([{delete, <<"a">>}, {insert, <<"b">>}, {delete, <<"Xa">>}])). + +%% Three consecutive deletes separated by inserts collapse correctly +triple_delete_test() -> + ?assertEqual([{delete, <<"abc">>}, {insert, <<"xyz">>}], + cleanup_merge([{delete, <<"a">>}, {insert, <<"x">>}, + {delete, <<"b">>}, {insert, <<"y">>}, + {delete, <<"c">>}, {insert, <<"z">>}])). + +%% After sliding, the two equals on either side should merge into one +slide_merge_test() -> + ?assertEqual([{insert, <<"aX">>}, {equal, <<"abc">>}], + cleanup_merge([{equal, <<"a">>}, {insert, <<"Xa">>}, {equal, <<"bc">>}])). + cleanup_merge_prop_test() -> ?assertEqual(true, proper:quickcheck(prop_cleanup_merge(), [{numtests, ?NUM_TESTS}, {to_file, user}])), ok. 
From 0cb1e7ff2350f391e164601f88137bb989cd56b1 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Sun, 12 Apr 2026 21:24:48 +0200 Subject: [PATCH 42/47] Don't fall through to default when efficiency cost is mis-configured --- src/diffy.erl | 4 ++-- test/diffy_tests.erl | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index b47f868..c4af0a0 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -114,9 +114,9 @@ diff(Text1, Text2, Options) when is_list(Options) -> _ -> Diffs32 end, Diffs2 = case proplists:get_value(efficiency, Options) of + NoEfficiency when NoEfficiency =:= undefined orelse NoEfficiency =:= false -> Diffs1; true -> cleanup_efficiency32(Diffs1); - Cost when is_integer(Cost) andalso Cost > 0 -> cleanup_efficiency32(Diffs1, Cost); - _ -> Diffs1 + Cost when is_integer(Cost) andalso Cost > 0 -> cleanup_efficiency32(Diffs1, Cost) end, %% Single conversion at the exit boundary. [{Op, to_utf8(D)} || {Op, D} <- Diffs2]. diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl index 293ca46..4f7370e 100644 --- a/test/diffy_tests.erl +++ b/test/diffy_tests.erl @@ -51,6 +51,7 @@ prop_cleanup_merge() -> prop_cleanup_merge_idempotent() -> ?FORALL(Diffs, list({diff_op(), proper_unicode:utf8()}), begin + % Cleaning the diffs again shoul not result in more changes Cleaned = cleanup_merge(Diffs), Cleaned =:= cleanup_merge(Cleaned) end). From 5c8bb4cd80348264ef02c0f11c2e81fae6b20cb7 Mon Sep 17 00:00:00 2001 From: Maas-Maarten Zeeman Date: Mon, 13 Apr 2026 20:34:29 +0200 Subject: [PATCH 43/47] Don't attempt to do linemode diffs for terms --- src/diffy.erl | 1 - src/diffy_term.erl | 10 +++------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index c4af0a0..273d636 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -17,7 +17,6 @@ %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
%% See the License for the specific language governing permissions and %% limitations under the License. -%% Erlang diff-match-patch implementation -module(diffy). diff --git a/src/diffy_term.erl b/src/diffy_term.erl index 2639bf6..23c23b6 100644 --- a/src/diffy_term.erl +++ b/src/diffy_term.erl @@ -26,11 +26,10 @@ diff/2 ]). --type diff_op() :: delete | equal | insert. --type diff() :: {diff_op(), term()}. +-type diff() :: {diffy:diff_op(), term()}. -type diffs() :: list(diff()). --export_type([ diffs/0 ]). +-export_type([ diff/0, diffs/0 ]). -spec diff(list(), list()) -> diffs(). diff(A, A) -> @@ -42,7 +41,7 @@ diff([], B) -> diff(A, B) when is_list(A), is_list(B) -> {Dict0, N} = term_dict(A, dict:new(), 0), {Dict, _N} = term_dict(B, Dict0, N), - Diff = diffy:diff(map_terms(A, Dict), map_terms(B, Dict)), + Diff = diffy:diff(map_terms(A, Dict), map_terms(B, Dict), [no_linemode]), unmap_diff(Diff, Dict). term_dict([], D, N) -> @@ -69,7 +68,6 @@ unmap_diff_1({Op, B}, RDict) -> {Op, [ dict:fetch(C, RDict) || C <- Cs ]}. - -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). @@ -89,7 +87,5 @@ diffy_term_test() -> diffy_term:diff([a,b,c,d,e], [a,e,b,c,d])), ok. - -endif. 
- From 9000ce3fa4c01ba46a2692308cd6e4373d1ce4cb Mon Sep 17 00:00:00 2001 From: MM Zeeman Date: Mon, 13 Apr 2026 21:08:00 +0200 Subject: [PATCH 44/47] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- test/diffy_tests.erl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl index 4f7370e..b2ecd4f 100644 --- a/test/diffy_tests.erl +++ b/test/diffy_tests.erl @@ -72,10 +72,10 @@ prop_cleanup_semantic() -> begin SourceText = diffy:source_text(Diffs), DestinationText = diffy:destination_text(Diffs), - EfficientDiffs = cleanup_semantic(Diffs), + SemanticDiffs = cleanup_semantic(Diffs), - SourceText =:= diffy:source_text(EfficientDiffs) - andalso DestinationText =:= diffy:destination_text(EfficientDiffs) + SourceText =:= diffy:source_text(SemanticDiffs) + andalso DestinationText =:= diffy:destination_text(SemanticDiffs) end). html_like() -> From 762fc294503193ceaf5f6fc4e8ef5050819d07ab Mon Sep 17 00:00:00 2001 From: MM Zeeman Date: Mon, 13 Apr 2026 21:08:36 +0200 Subject: [PATCH 45/47] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- test/diffy_tests.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl index b2ecd4f..c7c9781 100644 --- a/test/diffy_tests.erl +++ b/test/diffy_tests.erl @@ -51,7 +51,7 @@ prop_cleanup_merge() -> prop_cleanup_merge_idempotent() -> ?FORALL(Diffs, list({diff_op(), proper_unicode:utf8()}), begin - % Cleaning the diffs again shoul not result in more changes + % Cleaning the diffs again should not result in more changes Cleaned = cleanup_merge(Diffs), Cleaned =:= cleanup_merge(Cleaned) end). 
From 2350f64369afd35c637dd056f212a747c2b664ed Mon Sep 17 00:00:00 2001 From: MM Zeeman Date: Mon, 13 Apr 2026 21:09:20 +0200 Subject: [PATCH 46/47] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/diffy.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffy.erl b/src/diffy.erl index 273d636..8de8864 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -957,7 +957,7 @@ is_blankline_start(_) -> false. % -spec cleanup_efficiency(diffs()) -> diffs(). cleanup_efficiency(Diffs) -> - cleanup_efficiency(Diffs, 4). + cleanup_efficiency(Diffs, ?DEFAULT_EDIT_COST). -spec cleanup_efficiency(diffs(), pos_integer()) -> diffs(). cleanup_efficiency(Diffs, EditCost) -> From 4385ab8448535ea878052729c14df5d60dd3db22 Mon Sep 17 00:00:00 2001 From: MM Zeeman Date: Mon, 13 Apr 2026 21:09:52 +0200 Subject: [PATCH 47/47] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/diffy.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffy.erl b/src/diffy.erl index 8de8864..d9b7f5d 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -1518,11 +1518,11 @@ common_overlap_test() -> common_overlap_loop_test() -> Abc = to_utf32(<<"abc">>), Cde = to_utf32(<<"cde">>), - ?assertEqual(1, common_overlap_loop(Abc, Cde, text_size(Cde), 0, 1)), + ?assertEqual(1, common_overlap_loop(Abc, Cde, text_size32(Cde), 0, 1)), Abcdef = to_utf32(<<"abcdef">>), Efde = to_utf32(<<"efde">>), - ?assertEqual(2, common_overlap_loop(Abcdef, Efde, text_size(Efde), 0, 1)), + ?assertEqual(2, common_overlap_loop(Abcdef, Efde, text_size32(Efde), 0, 1)), ok.