diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ca85721..7adb427 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -18,14 +18,14 @@ jobs: strategy: matrix: - otp_version: [22,23,24] + otp_version: [26,27,28] os: [ubuntu-latest] container: image: erlang:${{ matrix.otp_version }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Compile run: make - name: Test diff --git a/Makefile b/Makefile index 82eaca1..716ad98 100644 --- a/Makefile +++ b/Makefile @@ -28,13 +28,18 @@ xref: $(REBAR) dialyzer: $(REBAR) $(REBAR) as test dialyzer -clean: $(REBAR) +clean: $(REBAR) clean_doc $(REBAR) clean -distclean: - rm -rf _build - rm $(REBAR) +clean_doc: + @rm -rf doc +distclean: clean_doc + @rm -rf _build + @rm -f $(REBAR) + +doc: $(REBAR) + $(REBAR) ex_doc --output doc --formatter html # dializer diff --git a/rebar.config b/rebar.config index adb9366..ce7ffd3 100644 --- a/rebar.config +++ b/rebar.config @@ -1,19 +1,29 @@ {erl_opts, [debug_info, warn_unused, warn_shadow_vars]}. -{deps, [ - {zotonic_stdlib, "1.2.3"} -]}. +{deps, [ ]}. + +{project_plugins, [rebar3_ex_doc]}. + +{hex, [{doc, ex_doc}]}. + +{ex_doc, [ + {source_url, <<"https://github.com/zotonic/diffy">>}, + {extras, [<<"README.md">>, <<"LICENSE">>]}, + {main, <<"readme">>} +]}. {profiles, [ {test, [ {deps, [ - {proper, "1.2.0"} + {proper, "1.5.0"} ]}, {xref_checks, [ undefined_function_calls, + undefined_functions, locals_not_used, - deprecated_function_calls + deprecated_function_calls, + deprecated_functions ]}, {xref_ignores, [ @@ -21,8 +31,16 @@ {dialyzer, [ {warnings, [ - no_return + no_return, + extra_return, + missing_return, + underspecs + ]}, + {plt_extra_apps, [ + eunit, + proper ]} ]} ]} ]}. + diff --git a/rebar.lock b/rebar.lock index 05bb477..57afcca 100644 --- a/rebar.lock +++ b/rebar.lock @@ -1,11 +1 @@ -{"1.2.0", -[{<<"proper">>,{pkg,<<"proper">>,<<"1.2.0">>},0}, - {<<"zotonic_stdlib">>,{pkg,<<"zotonic_stdlib">>,<<"1.2.3">>},0}]}. -[ -{pkg_hash,[ - {<<"proper">>, <<"1466492385959412A02871505434E72E92765958C60DBA144B43863554B505A4">>}, - {<<"zotonic_stdlib">>, <<"4A33B60C82379169C9934CCD1FC9E512CA16B922E131AD6B6D26E562F66DF9CC">>}]}, -{pkg_hash_ext,[ - {<<"proper">>, <<"CBC3766C08337806741343D330BF4BCB826155D2141BE8514C4B02858AA19FD3">>}, - {<<"zotonic_stdlib">>, <<"4712DD7A0C0C600AFEDAFDA738D40FEBF10CFC2485E62D109361FCC190F7381A">>}]} -]. +[]. diff --git a/src/diffy.erl b/src/diffy.erl index fc75a87..d9b7f5d 100644 --- a/src/diffy.erl +++ b/src/diffy.erl @@ -1,9 +1,10 @@ %% @author Maas-Maarten Zeeman -%% @copyright 2014-2019 Maas-Maarten Zeeman +%% @copyright 2014-2026 Maas-Maarten Zeeman %% %% @doc Diffy, an erlang diff match and patch implementation +%% @end %% -%% Copyright 2014-2019 Maas-Maarten Zeeman +%% Copyright 2014-2026 Maas-Maarten Zeeman %% %% Licensed under the Apache License, Version 2.0 (the "License"); %% you may not use this file except in compliance with the License. @@ -16,12 +17,12 @@ %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. %% See the License for the specific language governing permissions and %% limitations under the License. -%% Erlang diff-match-patch implementation -module(diffy). -export([ diff/2, + diff/3, diff_bisect/2, diff_linemode/2, @@ -43,24 +44,30 @@ text_size/1, - split_pre_and_suffix/2, - unique_match/2 + split_pre_and_suffix/2 ]). -type diff_op() :: delete | equal | insert. -type diff() :: {diff_op(), unicode:unicode_binary()}. -type diffs() :: list(diff()). +-type diff_option() :: + semantic | + efficiency | + {efficiency, EditCost :: pos_integer()} | + no_linemode. + -type for_fun() :: fun((integer(), term()) -> {continue, term()} | {break, term()}). --export_type([diffs/0]). +-export_type([diff_op/0, diff/0, diffs/0, diff_option/0]). +-define(DEFAULT_EDIT_COST, 4). -define(PATCH_MARGIN, 4). --define(PATCH_MAX_PATCH_LEN, 32). - --define(MATCH_MAXBITS, 31). - -define(IS_INS_OR_DEL(Op), (Op =:= insert orelse Op =:= delete)). +-define(IS_UTF32_ALIGNED(Offset), (Offset rem 4 =:= 0)). +-define(IS_WS(C), (C =:= $\s orelse C =:= $\t orelse C =:= $\n orelse C =:= $\r orelse C =:= $\f orelse C =:= $\v)). +-define(IS_LB(C), (C =:= $\n orelse C =:= $\r)). +-define(IS_ALPHA(C), ((C >= $a andalso C =< $z) orelse (C >= $A andalso C =< $Z) orelse (C >= $0 andalso C =< $9))). -record(bisect_state, { k1start = 0, k1end = 0, @@ -79,58 +86,86 @@ length2 = 0 }). -% @doc Compute the difference between two binary texts -% +-dialyzer({no_match, for/5}). + +% @doc Compute the difference between two binary texts. -spec diff(unicode:unicode_binary(), unicode:unicode_binary()) -> diffs(). diff(Text1, Text2) -> - diff(Text1, Text2, true). + diff(Text1, Text2, []). -diff(<<>>, <<>>, _CheckLines) -> - []; -diff(Text1, Text2, _CheckLines) when Text1 =:= Text2 -> - [{equal, Text1}]; -diff(Text1, Text2, CheckLines) -> +% @doc Compute the difference between two binary texts with options. +% +% Options: +% semantic - run cleanup_semantic/1 on the result +% efficiency - run cleanup_efficiency/1 on the result (default edit cost 4) +% {efficiency, Cost} - run cleanup_efficiency/2 with a custom edit cost +% no_linemode - disable the linemode optimization for large texts +% +% Cleanups are always applied in the correct order: semantic first, then efficiency. +-spec diff(unicode:unicode_binary(), unicode:unicode_binary(), [diff_option()]) -> diffs(). +diff(Text1, Text2, Options) when is_list(Options) -> + T1 = to_utf32(Text1), + T2 = to_utf32(Text2), + CheckLines = not proplists:get_value(no_linemode, Options, false), + Diffs32 = diff32(T1, T2, CheckLines), + Diffs1 = case proplists:get_value(semantic, Options) of + true -> cleanup_semantic32(Diffs32); + _ -> Diffs32 + end, + Diffs2 = case proplists:get_value(efficiency, Options) of + NoEfficiency when NoEfficiency =:= undefined orelse NoEfficiency =:= false -> Diffs1; + true -> cleanup_efficiency32(Diffs1); + Cost when is_integer(Cost) andalso Cost > 0 -> cleanup_efficiency32(Diffs1, Cost) + end, + %% Single conversion at the exit boundary. + [{Op, to_utf8(D)} || {Op, D} <- Diffs2]. + +%% Internal diff working entirely in UTF-32 binaries. +diff32(<<>>, <<>>, _CheckLines) -> []; +diff32(<<>>, Text2, _CheckLines) -> [{insert, Text2}]; +diff32(Text1, <<>>, _CheckLines) -> [{delete, Text1}]; +diff32(Text1, Text2, _CheckLines) when Text1 =:= Text2 -> [{equal, Text1}]; +diff32(Text1, Text2, CheckLines) -> {Prefix, MText1, MText2, Suffix} = split_pre_and_suffix(Text1, Text2), Diffs = compute_diff(MText1, MText2, CheckLines), Diffs1 = case Suffix of - <<>> -> Diffs; - _ -> Diffs ++ [{equal, Suffix}] - end, + <<>> -> Diffs; + _ -> Diffs ++ [{equal, Suffix}] + end, - Diffs2 = case Prefix of - <<>> -> Diffs1; - _ -> [{equal, Prefix} | Diffs1] - end, + Diffs2 = case Prefix of + <<>> -> Diffs1; + _ -> [{equal, Prefix} | Diffs1] + end, - cleanup_merge(Diffs2). + cleanup_merge32(Diffs2). -%% This assumes Text1 and Text2 don't have a common prefix -compute_diff(<<>>, NewText, _CheckLines) -> - [{insert, NewText}]; -compute_diff(OldText, <<>>, _CheckLines) -> - [{delete, OldText}]; +%% This assumes Text1 and Text2 don't have a common prefix. Operates on UTF-32. +compute_diff(<<>>, NewText, _CheckLines) -> [{insert, NewText}]; +compute_diff(OldText, <<>>, _CheckLines) -> [{delete, OldText}]; compute_diff(OldText, NewText, CheckLines) -> OldStNew = size(OldText) < size(NewText), {ShortText, LongText} = case OldStNew of - true -> {OldText, NewText}; - false -> {NewText, OldText} - end, + true -> {OldText, NewText}; + false -> {NewText, OldText} + end, - case binary:match(LongText, ShortText) of + case aligned_utf32_match(LongText, ShortText, 0) of {Start, Length} -> <> = LongText, Op = diff_op(OldStNew), - [{Op, Pre}, {equal, ShortText}, {Op, Suf}]; + [{Op, Pre}, {equal, ShortText}, {Op, Suf}]; nomatch -> - case single_char(ShortText) of + %% In UTF-32, a single codepoint is exactly 4 bytes. + case size(ShortText) =:= 4 of true -> [{delete, OldText}, {insert, NewText}]; false -> try_half_match(OldText, NewText, CheckLines) - end + end end. diff_op(true) -> insert; @@ -140,49 +175,56 @@ diff_op(false) -> delete. try_half_match(OldText, NewText, CheckLines) -> case half_match(OldText, NewText) of {half_match, A1, A2, B1, B2, Common} -> - Diffs1 = diff(A1, B1, CheckLines), - Diffs2 = diff(A2, B2, CheckLines), + Diffs1 = diff32(A1, B1, CheckLines), + Diffs2 = diff32(A2, B2, CheckLines), Diffs1 ++ [{equal, Common} | Diffs2]; undefined -> compute_diff1(OldText, NewText, CheckLines) end. %% Check if we can do a half-match diff, returns undefined if it is not advantageous. +%% Operates on UTF-32 binaries — size comparisons are in bytes (4 bytes per codepoint). half_match(A, B) -> - AGtB = size(A) > size(B), - {Short, Long} = case AGtB of - true -> {B, A}; - false -> {A, B} - end, + AgtB = size(A) > size(B), + {Short, Long} = case AgtB of + true -> {B, A}; + false -> {A, B} + end, + + LongSize = size(Long), + ShortSize = size(Short), - case text_smaller_than(Long, 4) orelse size(Short) * 2 < size(Long) of + %% text_smaller_than(Long, 4) becomes size(Long) < 4*4 in UTF-32. + case LongSize < 16 orelse ShortSize * 2 < LongSize of true -> %% No point in looking. undefined; false -> - %% Note: this could split through a utf8 byte sequence. - Hm1 = half_match_i(Long, Short, (size(Long) + 3) div 4), - Hm2 = half_match_i(Long, Short, (size(Long) + 1) div 2), + %% Seed positions are quarter-way and half-way through Long, + %% expressed as byte offsets (codepoints * 4). + LongLen = LongSize div 4, %% codepoint count + Hm1 = half_match_i(Long, Short, ((LongLen + 3) div 4) * 4), + Hm2 = half_match_i(Long, Short, ((LongLen + 1) div 2) * 4), %% Select the longest half-match. Hm = case {Hm1, Hm2} of - {undefined, undefined} -> - undefined; - {undefined, _} -> - Hm2; - {_, undefined} -> - Hm1; - {{half_match, _, _, _, _, C1}, {half_match, _, _, _, _, C2}} when size(C1) > size(C2) -> - Hm1; - {_, _} -> - Hm2 - end, + {undefined, undefined} -> + undefined; + {undefined, _} -> + Hm2; + {_, undefined} -> + Hm1; + {{half_match, _, _, _, _, C1}, {half_match, _, _, _, _, C2}} when size(C1) > size(C2) -> + Hm1; + {_, _} -> + Hm2 + end, %% Swap values if A was smaller than B case Hm of undefined -> undefined; {half_match, T1A, T1B, T2A, T2B, MidCommon} -> - case AGtB of + case AgtB of true -> Hm; false -> {half_match, T2A, T2B, T1A, T1B, MidCommon} @@ -190,24 +232,19 @@ half_match(A, B) -> end end. - % Find the best common overlap at location I. half_match_i(Long, Short, I) -> {NewI, Seed} = seed(Long, I), case Seed of - <<>> -> - undefined; - _ -> - best_common(Long, Short, Seed, NewI, 0, - undefined, undefined, undefined, undefined, <<>>) + <<>> -> undefined; + _ -> best_common(Long, Short, Seed, NewI, 0, <<>>, <<>>, <<>>, <<>>, <<>>) end. - %% Find the best common overlap inside two texts. best_common(Long, Short, Seed, SeedLoc, Start, BestLongA, BestLongB, BestShortA, BestShortB, BestCommon) -> %% Check if we can find a match for Seed2 inside the shorttext. - case binary:match(Short, Seed, [{scope, {Start, size(Short)-Start}}]) of + case aligned_utf32_match(Short, Seed, Start) of nomatch -> case size(BestCommon) * 2 >= size(Long) of false -> @@ -248,45 +285,68 @@ best_common(Long, Short, Seed, SeedLoc, Start, end end. -%% @doc Return the position of the next character. -next_char(Bin, Pos) -> - <<_:Pos/binary, C/utf8, _Rest/binary>> = Bin, - %% The next char is at binary position... - Pos + size(<>). +%% @doc Find a match whose start offset is aligned to a UTF-32 codepoint boundary. +aligned_utf32_match(Bin, Pattern, Start) + when ?IS_UTF32_ALIGNED(Start) andalso Start >= 0 -> + case Start + size(Pattern) > size(Bin) of + true -> + nomatch; + false -> + case binary:match(Bin, Pattern, [{scope, {Start, size(Bin) - Start}}]) of + nomatch -> + nomatch; + {MatchStart, Length} when ?IS_UTF32_ALIGNED(MatchStart) -> + %% Match found, and it is correctly aligned. + {MatchStart, Length}; + {MatchStart, _Length} -> + %% Misaligned hit. binary:match found the first byte-level match, + %% so there is no aligned match before MatchStart. Skip directly + %% to the next aligned boundary after MatchStart. + aligned_utf32_match(Bin, Pattern, MatchStart + (4 - MatchStart rem 4)) + end + end. + +%% @doc Return the byte position of the next codepoint in a UTF-32 binary. +next_char(_Bin, Pos) -> + Pos + 4. -%% +%% +%% In UTF-32 every codepoint is exactly 4 bytes. Start is always a 4-byte-aligned +%% byte offset, so no alignment step is needed. seed(Long, Start) -> - SeedSize = size(Long) div 4, - - %% Note, need to split on utf8 character boundary here. + TotalCodepoints = size(Long) div 4, + SeedCodepoints = TotalCodepoints div 4, + SeedSize = SeedCodepoints * 4, <<_Pre:Start/binary, Seed:SeedSize/binary, _Post/binary>> = Long, - - %% Utf-8 repair the seed's head and tail. - {Pre, Seed1} = repair_head(Seed), - {Seed2, _} = repair_tail(Seed1), - - %% return the start position of the seed and the seed itself. - {Start - size(Pre), Seed2}. + {Start, Seed}. %% Line diff compute_diff1(Text1, Text2, true) -> - diff_linemode(Text1, Text2); -compute_diff1(Text1, Text2, false) when size(Text1) > 100 orelse size(Text2) > 100 -> - diff_linemode(Text1, Text2); + diff_linemode32(Text1, Text2); +compute_diff1(Text1, Text2, false) when size(Text1) > 400 orelse size(Text2) > 400 -> + %% 100 UTF-8 bytes ≈ 400 UTF-32 bytes (conservative upper bound) + diff_linemode32(Text1, Text2); compute_diff1(Text1, Text2, false) -> - diff_bisect(Text1, Text2). + diff_bisect32(Text1, Text2). -%% Compute diff in linemode +%% Public entry: accepts UTF-8, converts at boundary. diff_linemode(Text1, Text2) -> + T1 = to_utf32(Text1), + T2 = to_utf32(Text2), + Diffs32 = diff_linemode32(T1, T2), + [{Op, to_utf8(D)} || {Op, D} <- Diffs32]. + +%% Internal: operates entirely on UTF-32 binaries. +diff_linemode32(Text1, Text2) -> {CharText1, CharText2, Lines} = lines_to_chars(Text1, Text2), - Diffs = diff(CharText1, CharText2, false), + Diffs = diff32(CharText1, CharText2, false), %% Transform the diffs back to lines. - Diffs1 = chars_to_lines(Diffs, Lines), + Diffs1 = decode_lines(Diffs, Lines), - Cleaned = cleanup_merge(Diffs1), + Cleaned = cleanup_merge32(Diffs1), cleanup_line_diff(Cleaned, <<>>, <<>>, [], []). @@ -310,57 +370,56 @@ cleanup_line_diff([{equal, _}=E|Rest], DeleteData, InsertData, TmpAcc, Acc) %% Found leading insert and delete data, diff the texts and replace the operations. cleanup_line_diff([{equal, _}=E|Rest], DeleteData, InsertData, _TmpAcc, Acc) -> - %% rediff the delete and insert data. - Diffs = diff(DeleteData, InsertData, false), + %% Data is already UTF-32 — pass directly to diff32. + Diffs = diff32(DeleteData, InsertData, false), Acc1 = lists:reverse(Diffs) ++ Acc, cleanup_line_diff(Rest, <<>>, <<>>, [], [E|Acc1]). -%% Diff lines +%% Diff lines. +%% Text1 and Text2 are UTF-32 binaries. Lines are stored as UTF-32 binaries. +%% CharText1/CharText2 are UTF-32 binaries where each 4-byte word is a line index. lines_to_chars(Text1, Text2) -> - {CharText1, NextChar, Lines1, Dict1} = lines_to_chars(Text1, 0, <<>>, 0, [], dict:new()), - {CharText2, _, Lines2, _Dict2} = lines_to_chars(Text2, 0, <<>>, NextChar, Lines1, Dict1), - + {CharText1, NextChar, Lines1, Map1} = lines_to_chars(Text1, 0, <<>>, 0, [], #{}), + {CharText2, _, Lines2, _Map2} = lines_to_chars(Text2, 0, <<>>, NextChar, Lines1, Map1), {CharText1, CharText2, lists:reverse(Lines2)}. -% Transform each unique line into a single char -lines_to_chars(Text, Idx, CharText, NextChar, Lines, D) when Idx >= size(Text) -> - {CharText, NextChar, Lines, D}; -lines_to_chars(Text, Idx, CharText, NextChar, Lines, D) -> - case binary:match(Text, <<"\n">>, [{scope, {Idx, size(Text)-Idx}}]) of +%% Transform each unique line into a 4-byte index; store line content as UTF-32. +lines_to_chars(Text, Idx, CharText, NextChar, Lines, Map) when Idx >= byte_size(Text) -> + {CharText, NextChar, Lines, Map}; +lines_to_chars(Text, Idx, CharText, NextChar, Lines, Map) when ?IS_UTF32_ALIGNED(Idx) -> + case aligned_utf32_match(Text, <<$\n:32>>, Idx) of nomatch -> <<_:Idx/binary, Line/binary>> = Text, - {Char, NextChar1, Lines1, D1} = insert_line(Line, Lines, D, NextChar), - CharText1 = <>, - {CharText1, NextChar1, Lines1, D1}; + {Char, NextChar1, Lines1, Map1} = insert_line(Line, Lines, Map, NextChar), + CharText1 = <>, + {CharText1, NextChar1, Lines1, Map1}; {Start, _} -> - LineLength = Start - Idx + 1, + LineLength = Start - Idx + 4, <<_:Idx/binary, Line:LineLength/binary, _/binary>> = Text, - - {Char, NextChar1, Lines1, D1} = insert_line(Line, Lines, D, NextChar), - CharText1 = <>, - - lines_to_chars(Text, Idx + LineLength, CharText1, NextChar1, Lines1, D1) + {Char, NextChar1, Lines1, Map1} = insert_line(Line, Lines, Map, NextChar), + CharText1 = <>, + lines_to_chars(Text, Idx + LineLength, CharText1, NextChar1, Lines1, Map1) end. -insert_line(Line, Lines, Dict, NextChar) -> - case dict:find(Line, Dict) of - {ok, Char} -> - {Char, NextChar, Lines, Dict}; - error -> - {NextChar, NextChar+1, [Line|Lines], dict:store(Line, NextChar, Dict)} +insert_line(Line, Lines, Map, NextChar) -> + case Map of + #{Line := Char} -> + {Char, NextChar, Lines, Map}; + _ -> + {NextChar, NextChar + 1, [Line | Lines], Map#{Line => NextChar}} end. -%% -chars_to_lines(Diffs, Lines) when is_list(Lines) -> - A = array:from_list(Lines), - chars_to_lines(Diffs, A, []). +decode_lines(Diffs, Lines) when is_list(Lines) -> + LinesTuple = list_to_tuple(Lines), + decode_lines(Diffs, LinesTuple, []). -chars_to_lines([], _A, Acc) -> +decode_lines([], _LinesTuple, Acc) -> lists:reverse(Acc); -chars_to_lines([{Op, Data}|Rest], LineArray, Acc) -> - Data1 = << <<(array:get(C, LineArray))/binary>> || <> <= Data >>, - chars_to_lines(Rest, LineArray, [{Op, Data1}|Acc]). +decode_lines([{Op, Data} | Rest], LinesTuple, Acc) -> + %% Each index is a 32-bit word; lines are already UTF-32 — just concatenate. + Data1 = << <<(element(C + 1, LinesTuple))/binary>> || <> <= Data >>, + decode_lines(Rest, LinesTuple, [{Op, Data1} | Acc]). % Find the 'middle snake' of a diff, split the problem in two @@ -375,20 +434,26 @@ chars_to_lines([{Op, Data}|Rest], LineArray, Acc) -> %% Returns: %% Array of diff tuples. %% """ +%% Public entry point — converts UTF-8 inputs to UTF-32, runs bisect, converts back. diff_bisect(A, B) when is_binary(A) andalso is_binary(B) -> - ArrA = array_from_binary(A), - ArrB = array_from_binary(B), - try compute_diff_bisect1(ArrA, ArrB, array:size(ArrA), array:size(ArrB)) of - no_overlap -> [{delete, A}, {insert, B}] + Diffs32 = diff_bisect32(to_utf32(A), to_utf32(B)), + [{Op, to_utf8(D)} || {Op, D} <- Diffs32]. + +%% Internal bisect working entirely on UTF-32 binaries. +diff_bisect32(A, B) -> + M = byte_size(A) div 4, + N = byte_size(B) div 4, + try compute_diff_bisect1(A, B, M, N) of + no_overlap -> [{delete, A}, {insert, B}] catch - throw:{overlap, A1, B1, X, Y} -> - diff_bisect_split(A1, B1, X, Y) + throw:{overlap, X, Y} -> + diff_bisect_split(A, B, X, Y) end. compute_diff_bisect1(A, B, M, N) -> %% TODO, add deadline... - MaxD = int_ceil((M + N) / 2), + MaxD = ceil((M + N) / 2), VOffset = MaxD, VLength = 2 * MaxD, @@ -410,11 +475,13 @@ compute_diff_bisect1(A, B, M, N) -> S3 = for(-D + S1#bisect_state.k1start, D + 1 - S1#bisect_state.k1end, 2, fun(K1, S2) -> K1Offset = VOffset + K1, - X1 = case K1 =:= -D orelse (K1 =/= D andalso - (array:get(K1Offset-1, S2#bisect_state.v1) < array:get(K1Offset+1, S2#bisect_state.v1))) of - true -> array:get(K1Offset + 1, S2#bisect_state.v1); - false -> array:get(K1Offset - 1, S2#bisect_state.v1) + 1 - end, + X1 = case K1 =:= -D + orelse (K1 =/= D + andalso (array:get(K1Offset-1, S2#bisect_state.v1) < array:get(K1Offset+1, S2#bisect_state.v1))) + of + true -> array:get(K1Offset + 1, S2#bisect_state.v1); + false -> array:get(K1Offset - 1, S2#bisect_state.v1) + 1 + end, Y1 = X1 - K1, {X1_1, Y1_1} = match_front(X1, Y1, A, M, B, N), @@ -439,12 +506,10 @@ compute_diff_bisect1(A, B, M, N) -> true -> % Mirror x2 onto top-left coordinate system. X2 = M - V2AtOffset, - if - X1_1 >= X2 -> - % Overlap detected - throw({overlap, A, B, X1_1, Y1_1}); - true -> - {continue, S2_1} + case X1_1 >= X2 of + % Overlap detected + true -> throw({overlap, X1_1, Y1_1}); + false -> {continue, S2_1} end; false -> {continue, S2_1} end @@ -456,13 +521,13 @@ compute_diff_bisect1(A, B, M, N) -> %% Walk the reverse path one step. (verdacht hetzelfde als het ding hierboven...) S5 = for(-D + S3#bisect_state.k2start, D + 1 - S3#bisect_state.k2end, 2, fun(K2, S4) -> K2Offset = VOffset + K2, - X2 = case K2 =:= -D orelse (K2 =/= D andalso - array:get(K2Offset-1, S4#bisect_state.v2) < array:get(K2Offset+1, S4#bisect_state.v2)) of - true -> - array:get(K2Offset + 1, S4#bisect_state.v2); - false -> - array:get(K2Offset - 1, S4#bisect_state.v2) + 1 - end, + X2 = case K2 =:= -D + orelse (K2 =/= D + andalso array:get(K2Offset-1, S4#bisect_state.v2) < array:get(K2Offset+1, S4#bisect_state.v2)) + of + true -> array:get(K2Offset + 1, S4#bisect_state.v2); + false -> array:get(K2Offset - 1, S4#bisect_state.v2) + 1 + end, Y2 = X2 - K2, @@ -488,13 +553,11 @@ compute_diff_bisect1(A, B, M, N) -> true -> X1 = V1AtOffset, Y1 = VOffset + X1 - K1Offset, - if - % Mirror x2 onto top-left coordinate system. - X1 >= M - X2_1 -> - % Overlap detected - throw({overlap, A, B, X1, Y1}); - true -> - {continue, S4_1} + % Mirror x2 onto top-left coordinate system. + case X1 >= M - X2_1 of + % Overlap detected + true -> throw({overlap, X1, Y1}); + false -> {continue, S4_1} end; false -> {continue, S4_1} end @@ -507,60 +570,68 @@ compute_diff_bisect1(A, B, M, N) -> no_overlap. -% @doc Split A and B and process the parts. +% @doc Split A and B at the overlap point and recursively diff each half. diff_bisect_split(A, B, X, Y) -> - A1 = binary_from_array(0, X, A), - A2 = binary_from_array(0, Y, B), + A1 = binary:part(A, 0, X * 4), + A2 = binary:part(B, 0, Y * 4), + B1 = binary:part(A, X * 4, byte_size(A) - X * 4), + B2 = binary:part(B, Y * 4, byte_size(B) - Y * 4), - B1 = binary_from_array(X, array:size(A), A), - B2 = binary_from_array(Y, array:size(B), B), - - Diffs = diff(A1, A2, false), - DiffsB = diff(B1, B2, false), - - Diffs ++ DiffsB. + diff32(A1, A2, false) ++ diff32(B1, B2, false). % @doc Convert the diffs into a pretty html report --spec pretty_html(diffs()) -> iolist(). pretty_html(Diffs) -> pretty_html(Diffs, []). pretty_html([], Acc) -> lists:reverse(Acc); -pretty_html([{Op, Data}|T], Acc) -> - Text = z_html:escape(Data), +pretty_html([{Op, Data} | T], Acc) -> + Safe = html_escape(Data), HTML = case Op of insert -> - [<<"">>, Text, <<"">>]; + [<<"">>, Safe, <<"">>]; delete -> - [<<"">>, Text, <<"">>]; + [<<"">>, Safe, <<"">>]; equal -> - [<<"">>, Text, <<"">>] + [<<"">>, Safe, <<"">>] end, - pretty_html(T, [HTML|Acc]). + pretty_html(T, [HTML | Acc]). + +-if(?OTP_RELEASE >= 27). +html_escape(B) when is_binary(B) -> + binary:replace(B, + [<<"&">>, <<"<">>, <<">">>, <<"\"">>, <<"'">>], + fun (<<"&">>) -> <<"&">>; + (<<"<">>) -> <<"<">>; + (<<">">>) -> <<">">>; + (<<"\"">>) -> <<""">>; + (<<"'">>) -> <<"'">> + end, + [global]). +-else. +html_escape(B) when is_binary(B) -> + lists:foldl(fun({From, To}, Acc) -> + binary:replace(Acc, From, To, [global]) + end, + B, + [ + {<<"&">>, <<"&">>}, + {<<"<">>, <<"<">>}, + {<<">">>, <<">">>}, + {<<"\"">>, <<""">>}, + {<<"'">>, <<"'">>} + ]). +-endif. + +% Above function can be replaced with this when OTP 27 is the lowest supported % @doc Compute the source text from a list of diffs. source_text(Diffs) -> - source_text(Diffs, <<>>). - -source_text([], Acc) -> - Acc; -source_text([{insert, _Data}|T], Acc) -> - source_text(T, Acc); -source_text([{_Op, Data}|T], Acc) -> - source_text(T, <>). - + iolist_to_binary([Data || {Op, Data} <- Diffs, Op =/= insert]). % @doc Compute the destination text from a list of diffs. destination_text(Diffs) -> - destination_text(Diffs, <<>>). - -destination_text([], Acc) -> - Acc; -destination_text([{delete, _Data}|T], Acc) -> - destination_text(T, Acc); -destination_text([{_Op, Data}|T], Acc) -> - destination_text(T, <>). + iolist_to_binary([Data || {Op, Data} <- Diffs, Op =/= delete]). % @doc Compute the Levenshtein distance, the number of inserted, deleted or substituted characters. levenshtein(Diffs) -> @@ -581,117 +652,361 @@ levenshtein([{equal, _Data}|T], Insertions, Deletions, Levenshtein) -> % -spec cleanup_merge(diffs()) -> diffs(). cleanup_merge(Diffs) -> - cleanup_merge(Diffs, []). + Diffs32 = [{Op, to_utf32(D)} || {Op, D} <- Diffs], + [{Op, to_utf8(D)} || {Op, D} <- cleanup_merge32(Diffs32)]. + +%% Internal cleanup_merge operating on UTF-32 diffs. +cleanup_merge32(Diffs) -> + cleanup_merge32(Diffs, []). %% Done -cleanup_merge([], Acc) -> +cleanup_merge32([], Acc) -> lists:reverse(Acc); %% Remove operations without data. -cleanup_merge([{_Op, <<>>}|T], Acc) -> - cleanup_merge(T, Acc); -%% Merge data from equal operations -cleanup_merge([{Op2, Data2}|T], [{Op1, Data1}|Acc]) when Op1 =:= Op2 -> - cleanup_merge(T, [{Op1, <>}|Acc]); -%% Cleanup edits before equal operation -cleanup_merge([{Op1, Data1}|T], [{Op2, _}=I, {Op3, Data3}|Acc]) when Op1 =/= Op2 andalso Op1 =:= Op3 andalso Op2 =/= equal andalso Op3 =/= equal -> - cleanup_merge(T, [I, {Op3, <>}|Acc]); -%% Check if Op1Data and Op2Data have common prefixes. -cleanup_merge([{equal, E1}|T], [{Op1, Op1Data}, {Op2, Op2Data}, {equal, E2}|Acc]) when Op1 =/= Op2 andalso Op1 =/= equal andalso Op2 =/= equal -> +cleanup_merge32([{_Op, <<>>}|T], Acc) -> + cleanup_merge32(T, Acc); +%% Ensure delete/insert ordering: if insert is on top and a delete arrives, sink the insert. +cleanup_merge32([{delete, _}=D|T], [{insert, _}=I|Acc]) -> + cleanup_merge32([D, I|T], Acc); +%% Merge data from equal operations. +cleanup_merge32([{Op2, Data2}|T], [{Op1, Data1}|Acc]) when Op1 =:= Op2 -> + cleanup_merge32(T, [{Op1, <>}|Acc]); +%% Cleanup edits before equal operation — re-queue merged op for further processing. +cleanup_merge32([{Op1, Data1}|T], [{Op2, _}=I, {Op3, Data3}|Acc]) + when Op1 =/= Op2 andalso Op1 =:= Op3 andalso Op2 =/= equal andalso Op3 =/= equal -> + cleanup_merge32([I, {Op3, <>} | T], Acc); +%% Factor out common prefixes and suffixes from adjacent insert/delete pairs. +cleanup_merge32([{equal, E1}|T], [{Op1, Op1Data}, {Op2, Op2Data}, {equal, E2}|Acc]) + when Op1 =/= Op2 andalso Op1 =/= equal andalso Op2 =/= equal -> {Prefix, Op1DataD, Op2DataD, Suffix} = split_pre_and_suffix(Op1Data, Op2Data), - cleanup_merge(T, [{equal, <>}, + cleanup_merge32(T, [{equal, <>}, {Op1, Op1DataD}, {Op2, Op2DataD}, {equal, <>}|Acc]); -%% Check for slide left and slide right edits -cleanup_merge([{equal, E1}=H|T], [{Op, I}, {equal, E2}|AccTail]=Acc) when Op =:= insert orelse Op =:= delete -> +%% Slide edits left and right. +cleanup_merge32([{equal, E1}=H|T], [{Op, I}, {equal, E2}|AccTail]=Acc) + when Op =:= insert orelse Op =:= delete -> case is_suffix(E2, I) of false -> case is_prefix(E1, I) of false -> - cleanup_merge(T, [H|Acc]); + cleanup_merge32(T, [H|Acc]); true -> P = size(E1), <<_:P/binary, Post/binary>> = I, - cleanup_merge([{equal, <>}, {Op, <>}|T], AccTail) + cleanup_merge32([{equal, <>}, {Op, <>}|T], AccTail) end; true -> R = size(I) - size(E2), - <> = I, - cleanup_merge([{Op, <>}, {equal, <>}|T], AccTail) + <> = I, + cleanup_merge32([{Op, <>}, {equal, <>}|T], AccTail) end; -cleanup_merge([H|T], Acc) -> - cleanup_merge(T, [H|Acc]). +cleanup_merge32([H|T], Acc) -> + cleanup_merge32(T, [H|Acc]). % @doc Do semantic cleanup of diffs % -spec cleanup_semantic(diffs()) -> diffs(). cleanup_semantic(Diffs) -> - cleanup_semantic(Diffs, []). + Diffs32 = [{Op, to_utf32(D)} || {Op, D} <- Diffs], + [{Op, to_utf8(D)} || {Op, D} <- cleanup_semantic32(Diffs32)]. + +%% Internal semantic cleanup operating on UTF-32 diffs. +cleanup_semantic32(Diffs) -> + Diffs1 = cleanup_semantic_breakpoints(Diffs), + Diffs2 = cleanup_merge32(Diffs1), + Diffs3 = cleanup_semantic_lossless(Diffs2), + cleanup_semantic_overlaps(Diffs3). + +cleanup_semantic_breakpoints(Diffs) -> + case find_breakpoint(Diffs, [], 0, 0, 0, 0, undefined) of + {found, NewDiffs} -> cleanup_semantic_breakpoints(NewDiffs); + not_found -> Diffs + end. -cleanup_semantic([], Acc) -> - lists:reverse(Acc); -cleanup_semantic([H|T], Acc) -> - cleanup_semantic(T, [H|Acc]). +find_breakpoint([], _Acc, _LI1, _LD1, _LI2, _LD2, _LE) -> + not_found; +find_breakpoint([{equal, Data} | T], Acc, _LI1, _LD1, LI2, LD2, _LE) -> + find_breakpoint(T, [{equal, Data} | Acc], LI2, LD2, 0, 0, Data); +find_breakpoint([{insert, Data} | T], Acc, LI1, LD1, LI2, LD2, LE) -> + NewLI2 = LI2 + text_size32(Data), + case is_breakpoint(LE, LI1, LD1, NewLI2, LD2) of + true -> {found, apply_breakpoint(LE, Acc, [{insert, Data} | T])}; + false -> find_breakpoint(T, [{insert, Data} | Acc], LI1, LD1, NewLI2, LD2, LE) + end; +find_breakpoint([{delete, Data} | T], Acc, LI1, LD1, LI2, LD2, LE) -> + NewLD2 = LD2 + text_size32(Data), + case is_breakpoint(LE, LI1, LD1, LI2, NewLD2) of + true -> {found, apply_breakpoint(LE, Acc, [{delete, Data} | T])}; + false -> find_breakpoint(T, [{delete, Data} | Acc], LI1, LD1, LI2, NewLD2, LE) + end. + +is_breakpoint(undefined, _, _, _, _) -> false; +is_breakpoint(LE, LI1, LD1, LI2, LD2) -> + LEN = text_size32(LE), + LEN =< max(LI1, LD1) andalso LEN =< max(LI2, LD2). + +apply_breakpoint(LE, Acc, T) -> + replace_equality(LE, Acc, T). + +replace_equality(LE, [{equal, LE} | T_Acc], T) -> + lists:reverse(T_Acc) ++ [{delete, LE}, {insert, LE} | T]; +replace_equality(LE, [H | T_Acc], T) -> + replace_equality(LE, T_Acc, [H | T]). + +cleanup_semantic_lossless(Diffs) -> + cleanup_semantic_lossless(Diffs, []). + +cleanup_semantic_lossless([{equal, E1}, {Op, Edit}, {equal, E2} | T], Acc) when ?IS_INS_OR_DEL(Op) -> + {NewE1, NewEdit, NewE2} = slide_edit(E1, Edit, E2), + case NewE1 of + <<>> -> + cleanup_semantic_lossless(lists:reverse(Acc, [{Op, NewEdit}, {equal, NewE2} | T]), []); + _ -> + case NewE2 of + <<>> -> + cleanup_semantic_lossless(lists:reverse(Acc, [{equal, NewE1}, {Op, NewEdit} | T]), []); + _ -> + cleanup_semantic_lossless([{Op, NewEdit}, {equal, NewE2} | T], [{equal, NewE1} | Acc]) + end + end; +cleanup_semantic_lossless([H | T], Acc) -> + cleanup_semantic_lossless(T, [H | Acc]); +cleanup_semantic_lossless([], Acc) -> + lists:reverse(Acc). + +slide_edit(E1, Edit, E2) -> + Suffix = common_suffix(E1, Edit), + {E1_1, Edit_1, E2_1} = case Suffix of + <<>> -> {E1, Edit, E2}; + _ -> + SLen = size(Suffix), + { binary:part(E1, 0, size(E1) - SLen), + <>, + <> } + end, + find_best_slide(E1_1, Edit_1, E2_1). + +find_best_slide(E1, Edit, E2) -> + Score = cleanup_semantic_score(E1, Edit) + cleanup_semantic_score(Edit, E2), + find_best_slide(E1, Edit, E2, Score, E1, Edit, E2). + +find_best_slide(E1, Edit, E2, BestScore, BestE1, BestEdit, BestE2) -> + case can_slide_right(Edit, E2) of + {true, Char, RestEdit, RestE2} -> + NewE1 = <>, + NewEdit = <>, + NewE2 = RestE2, + NewScore = cleanup_semantic_score(NewE1, NewEdit) + cleanup_semantic_score(NewEdit, NewE2), + case NewScore >= BestScore of + true -> find_best_slide(NewE1, NewEdit, NewE2, NewScore, NewE1, NewEdit, NewE2); + false -> find_best_slide(NewE1, NewEdit, NewE2, BestScore, BestE1, BestEdit, BestE2) + end; + false -> + {BestE1, BestEdit, BestE2} + end. + +%% In UTF-32 each codepoint is exactly 4 bytes — no pattern matching on variable-width needed. +can_slide_right(<>, <>) -> + {true, <>, RestEdit, RestE2}; +can_slide_right(_, _) -> + false. + +cleanup_semantic_score(<<>>, _) -> 6; +cleanup_semantic_score(_, <<>>) -> 6; +cleanup_semantic_score(One, Two) -> + Char1 = last_char(One), + Char2 = first_char(Two), + NonAlphaNumeric1 = not ?IS_ALPHA(Char1), + NonAlphaNumeric2 = not ?IS_ALPHA(Char2), + Whitespace1 = NonAlphaNumeric1 andalso ?IS_WS(Char1), + Whitespace2 = NonAlphaNumeric2 andalso ?IS_WS(Char2), + LineBreak1 = Whitespace1 andalso ?IS_LB(Char1), + LineBreak2 = Whitespace2 andalso ?IS_LB(Char2), + BlankLine1 = LineBreak1 andalso is_blankline_end(One), + BlankLine2 = LineBreak2 andalso is_blankline_start(Two), + if + BlankLine1 orelse BlankLine2 -> 5; + LineBreak1 orelse LineBreak2 -> 4; + NonAlphaNumeric1 andalso (not Whitespace1) andalso Whitespace2 -> 3; + Whitespace1 orelse Whitespace2 -> 2; + NonAlphaNumeric1 orelse NonAlphaNumeric2 -> 1; + true -> 0 + end. + +cleanup_semantic_overlaps(Diffs) -> + cleanup_semantic_overlaps(Diffs, []). + +cleanup_semantic_overlaps([{delete, Del}, {insert, Ins} | T], Acc) -> + Overlap1 = common_overlap(Del, Ins), + Overlap2 = common_overlap(Ins, Del), + TDel = text_size32(Del), + TIns = text_size32(Ins), + case Overlap1 >= Overlap2 of + true -> + case Overlap1 * 2 >= TDel orelse Overlap1 * 2 >= TIns of + true -> + Common = binary:part(Ins, 0, Overlap1 * 4), + NewDel = binary:part(Del, 0, (TDel - Overlap1) * 4), + NewIns = binary:part(Ins, Overlap1 * 4, (TIns - Overlap1) * 4), + cleanup_semantic_overlaps([{insert, NewIns} | T], [{equal, Common}, {delete, NewDel} | Acc]); + false -> + cleanup_semantic_overlaps([{insert, Ins} | T], [{delete, Del} | Acc]) + end; + false -> + case Overlap2 * 2 >= TIns orelse Overlap2 * 2 >= TDel of + true -> + Common = binary:part(Ins, (TIns - Overlap2) * 4, Overlap2 * 4), + NewIns = binary:part(Ins, 0, (TIns - Overlap2) * 4), + NewDel = binary:part(Del, Overlap2 * 4, (TDel - Overlap2) * 4), + cleanup_semantic_overlaps([{delete, NewDel} | T], [{equal, Common}, {insert, NewIns} | Acc]); + false -> + cleanup_semantic_overlaps([{insert, Ins} | T], [{delete, Del} | Acc]) + end + end; +cleanup_semantic_overlaps([H | T], Acc) -> + cleanup_semantic_overlaps(T, [H | Acc]); +cleanup_semantic_overlaps([], Acc) -> + lists:reverse(Acc). + +%% In UTF-32 every codepoint is exactly 4 bytes, so all byte/codepoint conversions +%% are simple multiplications and binary:part calls. + +%% @doc Return the first Len codepoints of Bin as a binary. +substring_start(Bin, Len) -> + binary:part(Bin, 0, Len * 4). + +%% @doc Return the last Len codepoints of Bin as a binary. +substring_end(Bin, Len) -> + TotalLen = text_size32(Bin), + case TotalLen =< Len of + true -> Bin; + false -> binary:part(Bin, (TotalLen - Len) * 4, Len * 4) + end. + +common_overlap(<<>>, _) -> 0; +common_overlap(_, <<>>) -> 0; +common_overlap(Text1, Text2) -> + T1Len = text_size32(Text1), + T2Len = text_size32(Text2), + {T1, T2, TMin} = if + T1Len > T2Len -> + {substring_end(Text1, T2Len), Text2, T2Len}; + T1Len < T2Len -> + {Text1, substring_start(Text2, T1Len), T1Len}; + true -> + {Text1, Text2, T1Len} + end, + case T1 =:= T2 of + true -> TMin; + false -> common_overlap_loop(T1, T2, TMin, 0, 1) + end. + +common_overlap_loop(T1, T2, TMin, Best, Length) when Length =< TMin -> + Pattern = substring_end(T1, Length), + case aligned_utf32_match(T2, Pattern, 0) of + nomatch -> + Best; + {FoundByteOffset, _} -> + %% In UTF-32, byte offset maps directly to codepoint count. + FoundCharCount = FoundByteOffset div 4, + NewLength = Length + FoundCharCount, + case NewLength > TMin of + true -> Best; + false -> + case substring_end(T1, NewLength) =:= substring_start(T2, NewLength) of + true -> + common_overlap_loop(T1, T2, TMin, NewLength, NewLength + 1); + false -> + common_overlap_loop(T1, T2, TMin, Best, NewLength + 1) + end + end + end; +common_overlap_loop(_T1, _T2, _TMin, Best, _Length) -> + Best. + +%% In UTF-32 the first and last codepoints are always at fixed byte offsets. +first_char(<>) -> C; +first_char(_) -> undefined. + +last_char(<<>>) -> undefined; +last_char(Bin) -> + Size = byte_size(Bin), + <<_:(Size-4)/binary, C:32>> = Bin, + C. + +%% In UTF-32 each codepoint is 4 bytes, so newline patterns are fixed-width. +is_blankline_end(Bin) when byte_size(Bin) >= 8 -> + Size = byte_size(Bin), + case Bin of + <<_:(Size-8)/binary, $\n:32, $\n:32>> -> true; + <<_:(Size-12)/binary, $\n:32, $\r:32, $\n:32>> -> true; + _ -> false + end; +is_blankline_end(_) -> false. + +is_blankline_start(Bin) when byte_size(Bin) >= 8 -> + case Bin of + <<$\n:32, $\n:32, _/binary>> -> true; + <<$\n:32, $\r:32, $\n:32, _/binary>> -> true; + <<$\r:32, $\n:32, $\n:32, _/binary>> -> true; + <<$\r:32, $\n:32, $\r:32, $\n:32, _/binary>> -> true; + _ -> false + end; +is_blankline_start(_) -> false. % @doc Do efficiency cleanup of diffs. % -spec cleanup_efficiency(diffs()) -> diffs(). cleanup_efficiency(Diffs) -> - cleanup_efficiency(Diffs, 4). + cleanup_efficiency(Diffs, ?DEFAULT_EDIT_COST). +-spec cleanup_efficiency(diffs(), pos_integer()) -> diffs(). cleanup_efficiency(Diffs, EditCost) -> - cleanup_efficiency(Diffs, false, EditCost, []). + Diffs32 = [{Op, to_utf32(D)} || {Op, D} <- Diffs], + [{Op, to_utf8(D)} || {Op, D} <- cleanup_efficiency32(Diffs32, EditCost)]. + +%% Internal efficiency cleanup operating on UTF-32 diffs. +cleanup_efficiency32(Diffs) -> + cleanup_efficiency32(Diffs, ?DEFAULT_EDIT_COST). + +cleanup_efficiency32(Diffs, EditCost) -> + cleanup_efficiency32(Diffs, false, EditCost, []). %% Done. -cleanup_efficiency([], Changed, _EditCost, Acc) -> +cleanup_efficiency32([], Changed, _EditCost, Acc) -> Diffs = lists:reverse(Acc), case Changed of false -> Diffs; - true -> cleanup_merge(Diffs) + true -> cleanup_merge32(Diffs) end; %% Any equality which is surrounded on both sides by an insertion and deletion need less then %% EditCost characters for it to be advantageous to split. -cleanup_efficiency([{O1, _}=A, {equal, XY}=E, {O2, _}=B | T], Changed, EditCost, Acc) when - O1 =/= O2 andalso ?IS_INS_OR_DEL(O1) andalso ?IS_INS_OR_DEL(O2) -> +cleanup_efficiency32([{O1, _}=A, {equal, XY}=E, {O2, _}=B | T], Changed, EditCost, Acc) + when O1 =/= O2 andalso ?IS_INS_OR_DEL(O1) andalso ?IS_INS_OR_DEL(O2) -> case text_smaller_than(XY, EditCost) of true -> - %% Split Del = {delete, XY}, Ins = {insert, XY}, - - cleanup_efficiency([Ins, B | T], true, EditCost, [Del, A | Acc]); + cleanup_efficiency32([Ins, B | T], true, EditCost, [Del, A | Acc]); false -> - %% Equal is big enough, move A and equal out of the way. - cleanup_efficiency([B | T], Changed, EditCost, [E, A |Acc]) + cleanup_efficiency32([B | T], Changed, EditCost, [E, A | Acc]) end; %% Any equality which is surrounded on one side by an existing insertion and deletion and on the -%% other side by an exisiting insertion or deletion needs by less than half C characters long for it -%% to be advantagous to split. -cleanup_efficiency([{O1, _}=A, {O2, _}=B, {equal, X}=E, {O3, _}=C | T], Changed, EditCost, Acc) when - O1 =/= O2 andalso ?IS_INS_OR_DEL(O1) andalso ?IS_INS_OR_DEL(O2) andalso ?IS_INS_OR_DEL(O3) -> +%% other side by an existing insertion or deletion needs less than half C characters long for it +%% to be advantageous to split. +cleanup_efficiency32([{O1, _}=A, {O2, _}=B, {equal, X}=E, {O3, _}=C | T], Changed, EditCost, Acc) + when O1 =/= O2 andalso ?IS_INS_OR_DEL(O1) andalso ?IS_INS_OR_DEL(O2) andalso ?IS_INS_OR_DEL(O3) -> case text_smaller_than(X, EditCost div 2 + 1) of true -> - %% Split Del = {delete, X}, Ins = {insert, X}, - cleanup_efficiency([Ins, C | T], true, EditCost, [Del, B, A | Acc]); + cleanup_efficiency32([Ins, C | T], true, EditCost, [Del, B, A | Acc]); false -> - %% Equal is big enough, move delete and equal out of the way. - cleanup_efficiency([B, E, C | T], Changed, EditCost, [A |Acc]) + cleanup_efficiency32([B, E, C | T], Changed, EditCost, [A | Acc]) end; -cleanup_efficiency([H|T], Changed, EditCost, Acc) -> - cleanup_efficiency(T, Changed, EditCost, [H|Acc]). +cleanup_efficiency32([H | T], Changed, EditCost, Acc) -> + cleanup_efficiency32(T, Changed, EditCost, [H | Acc]). -% @doc Return true iff the text is smaller than specified -text_smaller_than(_, 0) -> - false; -text_smaller_than(<<>>, _Size) -> - true; -text_smaller_than(<<_C/utf8, Rest/binary>>, Size) when Size > 0 -> - text_smaller_than(Rest, Size-1); -text_smaller_than(<<_C, Rest/binary>>, Size) when Size > 0 -> - %% Illegal utf-8 string, just count this as a single character and continue - text_smaller_than(Rest, Size-1). - % @doc create a patch from a list of diffs make_patch(Diffs) when is_list(Diffs) -> %% Reconstruct the source-text from the diffs. @@ -710,10 +1025,8 @@ make_patch(Diffs, SourceText) when is_list(Diffs) andalso is_binary(SourceText) make_patch([], _PrePatchText, _PostPatchText, _Count1, _Count2, [Patch|Rest]=Patches) -> case Patch#patch.diffs of - [] -> - lists:reverse(Rest); - _ -> - lists:reverse(Patches) + [] -> lists:reverse(Rest); + _ -> lists:reverse(Patches) end; make_patch([{insert, Data}=D|T], PrePatchText, PostPatchText, Count1, Count2, [Patch|Rest]) -> @@ -766,33 +1079,10 @@ make_patch([{equal, Data}|T], PrePatchText, PostPatchText, Count1, Count2, [Patc make_patch(T, PrePatchText, PostPatchText, Count1+Size, Count2+Size, [P|Rest]). - -% @doc Returns true iff Pattern is a unique match inside Text. -unique_match(Pattern, Text) -> - TextSize = size(Text), - case binary:match(Text, Pattern) of - nomatch -> - error(nomatch); - {Start, Length} when Start + 1 + Length < TextSize -> - %% We have a match, and we can search.. - case binary:match(Text, Pattern, [{scope, {Start+1, TextSize-Start-1}}]) of - nomatch -> true; - {_, _} -> false - end; - {_, _} -> - true - end. - - %% %% Helpers %% -% @doc Return true iff binary is a single character. -single_char(<<>>) -> false; -single_char(<<_C/utf8>>) -> true; -single_char(Bin) when is_binary(Bin) -> false. - % @doc Return true iff A is a prefix of B is_prefix(A, B) when size(A) > size(B) -> false; @@ -806,26 +1096,22 @@ is_suffix(A, B) -> size(A) =:= binary:longest_common_suffix([A, B]). % -match_front(X1, Y1, A, M, B, N) when X1 < M andalso Y1 < N -> - case array:get(X1, A) =:= array:get(Y1, B) of - true -> - match_front(X1+1, Y1+1, A, M, B, N); - false -> - {X1, Y1} - end; +match_front(X1, Y1, A32, M, B32, N) when X1 < M andalso Y1 < N -> + APart = binary:part(A32, X1 * 4, (M - X1) * 4), + BPart = binary:part(B32, Y1 * 4, (N - Y1) * 4), + Steps = binary:longest_common_prefix([APart, BPart]) div 4, + {X1 + Steps, Y1 + Steps}; match_front(X1, Y1, _, _, _, _) -> {X1, Y1}. % -match_reverse(X1, Y1, A, M, B, N) when X1 < M andalso Y1 < N -> - case array:get(M-X1-1, A) =:= array:get(N-Y1-1, B) of - true -> - match_reverse(X1+1, Y1+1, A, M, B, N); - false -> - {X1, Y1} - end; -match_reverse(X1, Y1, _, _, _, _) -> - {X1, Y1}. +match_reverse(X2, Y2, A32, M, B32, N) when X2 < M andalso Y2 < N -> + APart = binary:part(A32, 0, (M - X2) * 4), + BPart = binary:part(B32, 0, (N - Y2) * 4), + Steps = binary:longest_common_suffix([APart, BPart]) div 4, + {X2 + Steps, Y2 + Steps}; +match_reverse(X2, Y2, _, _, _, _) -> + {X2, Y2}. %% Implementation of the for statement @@ -837,10 +1123,8 @@ for(From, To, _Step, _Fun, State) when From >= To -> State; for(From, To, Step, Fun, State) -> case Fun(From, State) of - {continue, S1} -> - for(From + Step, To, Step, Fun, S1); - {break, S1} -> - S1 + {continue, S1} -> for(From + Step, To, Step, Fun, S1); + {break, S1} -> S1 end. split_pre_and_suffix(Text1, Text2) -> @@ -859,144 +1143,58 @@ split_pre_and_suffix(Text1, Text2) -> {Prefix, MiddleText1, MiddleText2, Suffix}. -% @doc Return the common prefix of Text1 and Text2. (utf8 aware) +% @doc Return the common prefix of Text1 and Text2. Works on UTF-32 — always codepoint-aligned. common_prefix(Text1, Text2) -> Length = binary:longest_common_prefix([Text1, Text2]), - Prefix = binary:part(Text1, 0, Length), - - %% Utf-8 repair the tail of the prefix. It could contain a half utf-8 char. - {Prefix1, _} = repair_tail(Prefix), - Prefix1. + %% Round down to 4-byte boundary (should already be aligned for valid UTF-32). + binary:part(Text1, 0, (Length div 4) * 4). -% @doc Return the common prefix of Text1 and Text2 (utf8 aware) +% @doc Return the common suffix of Text1 and Text2. Works on UTF-32 — always codepoint-aligned. common_suffix(Text1, Text2) -> Length = binary:longest_common_suffix([Text1, Text2]), - Suffix = binary:part(Text1, size(Text1), -Length), + binary:part(Text1, byte_size(Text1), -((Length div 4) * 4)). - %% Utf-8 repair the head of the suffix. Could contain a half utf8 char - {_, Suffix1} = repair_head(Suffix), - Suffix1. - -% @doc Count the number of characters in a utf8 binary. +% @doc Count the number of codepoints in a UTF-8 binary. +% @deprecated Use text_size32/1 internally. This public function may be removed in a future version. +-spec text_size(unicode:unicode_binary()) -> non_neg_integer(). text_size(Text) when is_binary(Text) -> - text_size(Text, 0). + byte_size(to_utf32(Text)) div 4. + +% @doc Count the number of codepoints in a UTF-32 binary. O(1). +text_size32(Text) when is_binary(Text) -> + byte_size(Text) div 4. -text_size(<<>>, Count) -> - Count; -text_size(<<_C/utf8, Rest/binary>>, Count) -> - text_size(Rest, Count+1); -text_size(_, _) -> - error(badarg). +% @doc Return true iff Text has fewer than Size codepoints. O(1) for UTF-32. +text_smaller_than(_, 0) -> + false; +text_smaller_than(Text, Size) -> + byte_size(Text) < Size * 4. %% -%% Array utilities +%% UTF-32 boundary helpers %% -% @doc Create an array from a utf8 binary. -array_from_binary(Bin) when is_binary(Bin) -> - array_from_binary(Bin, 0, array:new()). - -array_from_binary(<<>>, _N, Array) -> - array:fix(Array); -array_from_binary(<>, N, Array) -> - array_from_binary(Rest, N+1, array:set(N, C, Array)). - -% @doc Create a binary from an array containing unicode characters. -binary_from_array(Start, End, Array) -> - binary_from_array(Start, End, Array, <<>>). - -binary_from_array(N, End, Array, Acc) when N < End -> - C = array:get(N, Array), - binary_from_array(N+1, End, Array, <>); -binary_from_array(_, _, _, Acc) -> - Acc. - -%% @doc Checks the trailing bytes for utf8 prefix bytes. -repair_tail(<<>>) -> - {<<>>, <<>>}; -%% Checks -repair_tail(Bin) -> - Size = size(Bin), - Size1 = Size-1, Size2 = Size-2, Size3 = Size-3, Size4 = Size-4, - case Bin of - %% Valid 1 -byte - <<_:Size1/binary, 2#0:1, _A:7>> -> - {Bin, <<>>}; - - %% Invalid 1-byte - <> -> - {Pre, <<2#110:3, A:5>>}; - <> -> - {Pre, <<2#1110:4, A:4>>}; - <> -> - {Pre, <<2#11110:5, A:3>>}; - - %% Valid 2-byte ending - <<_:Size2/binary, 2#110:3, _A:5, 2#10:2, _B:6>> -> - {Bin, <<>>}; - - %% Invalid 2-byte ending - <> -> - {Pre, <<2#1110:4, A:4, 2#10:2, B:6>>}; - <> -> - {Pre, <<2#11110:5, A:3, 2#10:2, B:6>>}; - - %% Valid 3-byte ending - <<_:Size3/binary, 2#1110:4, _A:4, 2#10:2, _B:6, 2#10:2, _C:6>> -> - {Bin, <<>>}; - - %% Invalid 3-byte ending - <> -> - {Pre, <<2#11110:5, A:3, 2#10:2, B:6, 2#10:2, C:6>>}; - - %% Valid 4-byte ending - <<_:Size4/binary, 2#11110:5, _A:3, 2#10:2, _B:6, 2#10:2, _C:6, 2#10:2, _D:6>> -> - {Bin, <<>>}; - - %% Illegal utf-8 sequence. - _ -> - %% Can't repair it, just return - {Bin, <<>>} +% @doc Convert a UTF-8 binary to UTF-32, crashing on invalid input. +to_utf32(Bin) -> + case unicode:characters_to_binary(Bin, utf8, utf32) of + Out when is_binary(Out) -> + Out; + {error, _, _} -> + error(badarg); + {incomplete, _, _} -> + error(badarg) end. -% @doc Checks the beginning of a binary and strips of partial utf-8 encoded bytes. -repair_head(<<>>) -> - {<<>>, <<>>}; -% valid 1-byte beginning -repair_head(<<2#0:1, _A:7, _Rest/binary>>=Bin) -> - {<<>>, Bin}; -% valid 4-byte beginning -repair_head(<<2#11110:5, _A:3, 2#10:2, _B:6, 2#10:2, _C:6, 2#10:2, _D:6, _Rest/binary>>=Bin) -> - {<<>>, Bin}; -% valid 3-byte beginning -repair_head(<<2#1110:4, _A:4, 2#10:2, _B:6, 2#10:2, _C:6, _Rest/binary>>=Bin) -> - {<<>>, Bin}; -% invalid 3-byte beginning -repair_head(<<2#10:2, A:6, 2#10:2, B:6, 2#10:2, C:6, Rest/binary>>) -> - {<<2#10:2, A:6, 2#10:2, B:6, 2#10:2, C:6>>, Rest}; -% valid 2-byte beginning -repair_head(<<2#110:3, _A:5, 2#10:2, _B:6, _Rest/binary>>=Bin) -> - {<<>>, Bin}; -% invalid 2-byte beginnings -repair_head(<<2#10:2, A:6, 2#10:2, B:6, Rest/binary>>) -> - {<<2#10:2, A:6, 2#10:2, B:6>>, Rest}; -% invalid 1-byte beginning -repair_head(<<2#10:2, A:6, Rest/binary>>) -> - {<<2#10:2, A:6>>, Rest}; -repair_head(Bin) -> - %% Illegal sequence, can't repair it. - {<<>>, Bin}. - - -%% This function can go away when we support OTP 20 and up. -%% -int_ceil(Number) -> - T = trunc(Number), - case (Number - T) of - Neg when Neg < 0 -> T; - Pos when Pos > 0 -> T + 1; - _ -> T +% @doc Convert a UTF-32 binary to UTF-8, crashing on invalid input. +to_utf8(Bin) -> + case unicode:characters_to_binary(Bin, utf32, utf8) of + Out when is_binary(Out) -> + Out; + {error, _, _} -> + error(badarg); + {incomplete, _, _} -> + error(badarg) end. %% @@ -1007,50 +1205,11 @@ int_ceil(Number) -> -include_lib("eunit/include/eunit.hrl"). -repair_tail_test() -> - ?assertEqual({<<>>, <<>>}, repair_tail(<<>>)), - ?assertEqual({<<"aap">>, <<>>}, repair_tail(<<"aap">>)), - ?assertEqual({<<200/utf8>>, <<>>}, repair_tail(<<200/utf8>>)), - ?assertEqual({<<600/utf8>>, <<>>}, repair_tail(<<600/utf8>>)), - ?assertEqual({<<1000/utf8>>, <<>>}, repair_tail(<<1000/utf8>>)), - - ?assertEqual({<<"aap">>, <<200>>}, repair_tail(<<"aap", 200>>)), - - ?assertEqual({<<"test">>, <<240, 159, 159>>}, repair_tail(<<116,101,115,116,240,159,159>>)), - - ok. - -repair_head_test() -> - ?assertEqual({<<>>, <<>>}, repair_head(<<>>)), - ?assertEqual({<<>>, <<"a">>}, repair_head(<<"a">>)), - ?assertEqual({<<>>, <<"aap">>}, repair_head(<<"aap">>)), - ?assertEqual({<<>>, <<200/utf8>>}, repair_head(<<200/utf8>>)), - ?assertEqual({<<>>, <<600/utf8>>}, repair_head(<<600/utf8>>)), - ?assertEqual({<<>>, <<1000/utf8>>}, repair_head(<<1000/utf8>>)), - - %% - ?assertEqual({<<2#10:2, 10:6>>, <<"aap">>}, - repair_head(<<2#10:2, 10:6, "aap">>)), - ?assertEqual({<<2#10:2, 60:6, 2#10:2, 10:6>>, <<"aap">>}, - repair_head(<<2#10:2, 60:6, 2#10:2, 10:6, "aap">>)), - ?assertEqual({<<2#10:2, 60:6, 2#10:2, 10:6, 2#10:2, 13:6>>, <<"aap">>}, - repair_head(<<2#10:2, 60:6, 2#10:2, 10:6, 2#10:2, 13:6, "aap">>)), - - ok. - - for_test() -> ?assertEqual(9, for(0, 10, fun(I, _N) -> {continue, I} end, undefined)), ?assertEqual(0, for(0, 10, fun(I, _N) -> {break, I} end, undefined)), ok. -array_test() -> - ?assertEqual(20, array:size(array_from_binary(<<"de apen eten bananen">>))), - ?assertEqual(<<"broodje aap">>, binary_from_array(0, 11, array_from_binary(<<"broodje aap">>))), - ?assertEqual(<<"aa">>, binary_from_array(0, 2, array_from_binary(<<"aap">>))), - ?assertEqual(<<"ap">>, binary_from_array(1, 3, array_from_binary(<<"aap">>))), - ok. - diff_utf8_test() -> ?assertEqual([{equal, <<208,174, 208,189, 208,184, 208,186, 208,190, 208,180>>}], diff(<<208,174,208,189,208,184,208,186,208,190,208,180>>, @@ -1077,10 +1236,6 @@ diff_bisect_test() -> {equal,<<" a banana">>}], diff_bisect(<<"fruit flies like a banana">>, <<"fruit flies eat a banana">>)), - - %?assertEqual([{delete,<<"cat">>}, - % {insert,<<"map">>}], diff_bisect(<<"cat">>, <<"map">>)), - ?assertEqual([{delete,<<"c">>}, {insert,<<"m">>}, {equal,<<"a">>}, @@ -1096,139 +1251,278 @@ diff_bisect_test() -> ?assertEqual([{equal, <<"text">>}], diff_bisect(<<"text">>, <<"text">>)), - ok. +%% half_match operates on UTF-32 internally; wrap inputs/outputs for testing. +half_match_utf8(A, B) -> + case half_match(to_utf32(A), to_utf32(B)) of + undefined -> undefined; + {half_match, A1, A2, B1, B2, C} -> + {half_match, to_utf8(A1), to_utf8(A2), to_utf8(B1), to_utf8(B2), to_utf8(C)} + end. + half_match_test() -> - ?assertEqual(undefined, half_match(<<"1234567890">>, <<"abcdef">>)), - ?assertEqual(undefined, half_match(<<"12345">>, <<"23">>)), + ?assertEqual(undefined, half_match_utf8(<<"1234567890">>, <<"abcdef">>)), ?assertEqual(undefined, half_match_utf8(<<"12345">>, <<"23">>)), %% Single Match - ?assertEqual({half_match, <<"12">>, <<"90">>, <<"a">>, <<"z">>, <<"345678">>}, - half_match(<<"1234567890">>, <<"a345678z">>)), + ?assertEqual({half_match, <<"12">>, <<"90">>, <<"a">>, <<"z">>, <<"345678">>}, half_match_utf8(<<"1234567890">>, <<"a345678z">>)), ?assertEqual({half_match, <<"a">>, <<"z">>, <<"12">>, <<"90">>, <<"345678">>}, - half_match(<<"a345678z">>, <<"1234567890">>)), + half_match_utf8(<<"a345678z">>, <<"1234567890">>)), ?assertEqual({half_match, <<"abc">>, <<"z">>, <<"1234">>, <<"0">>, <<"56789">>}, - half_match(<<"abc56789z">>, <<"1234567890">>)), + half_match_utf8(<<"abc56789z">>, <<"1234567890">>)), ?assertEqual({half_match, <<"a">>, <<"xyz">>, <<"1">>, <<"7890">>, <<"23456">>}, - half_match(<<"a23456xyz">>, <<"1234567890">>)), + half_match_utf8(<<"a23456xyz">>, <<"1234567890">>)), %% Multiple Matches ?assertEqual({half_match, <<"12123">>, <<"123121">>, <<"a">>, <<"z">>, <<"1234123451234">>}, - half_match(<<"121231234123451234123121">>, <<"a1234123451234z">>)), + half_match_utf8(<<"121231234123451234123121">>, <<"a1234123451234z">>)), ?assertEqual({half_match, <<"">>, <<"-=-=-=-=-=">>, <<"x">>, <<"">>, <<"x-=-=-=-=-=-=-=">>}, - half_match(<<"x-=-=-=-=-=-=-=-=-=-=-=-=">>, <<"xx-=-=-=-=-=-=-=">>)), + half_match_utf8(<<"x-=-=-=-=-=-=-=-=-=-=-=-=">>, <<"xx-=-=-=-=-=-=-=">>)), ?assertEqual({half_match, <<"-=-=-=-=-=">>, <<"">>, <<"">>, <<"y">>, <<"-=-=-=-=-=-=-=y">>}, - half_match(<<"-=-=-=-=-=-=-=-=-=-=-=-=y">>, <<"-=-=-=-=-=-=-=yy">>)), + half_match_utf8(<<"-=-=-=-=-=-=-=-=-=-=-=-=y">>, <<"-=-=-=-=-=-=-=yy">>)), - % Non-optimal halfmatch. - % Optimal diff would be -q+x=H-i+e=lloHe+Hu=llo-Hew+y not -qHillo+x=HelloHe-w+Hulloy ?assertEqual({half_match, <<"qHillo">>, <<"w">>, <<"x">>, <<"Hulloy">>, <<"HelloHe">>}, - half_match(<<"qHilloHelloHew">>, <<"xHelloHeHulloy">>)), + half_match_utf8(<<"qHilloHelloHew">>, <<"xHelloHeHulloy">>)), + + ?assertEqual({half_match, <<"qHillo"/utf8>>, <<"w"/utf8>>, <<"x"/utf8>>, <<"eHull💯y"/utf8>>, <<"🐶🐱🐭🐹🐰H❤️"/utf8>>}, + half_match_utf8(<<"qHillo🐶🐱🐭🐹🐰H❤️w"/utf8>>, <<"x🐶🐱🐭🐹🐰H❤️eHull💯y"/utf8>>)), + + %% Unicode: é is 2 UTF-8 bytes but 1 codepoint (4 UTF-32 bytes). + %% With the old bug, size(Long) div 4 gave the wrong seed position + %% because byte_size in UTF-32 ≠ codepoint_count for multi-byte UTF-8 chars. + %% Long = éééééééééé (10 chars), Short = a + éééééééé + z (10 chars). + %% half_match should find the 8-char common section of é's. + E = <<233/utf8>>, + ULong = binary:copy(E, 10), + UShort = <<"a", (binary:copy(E, 8))/binary, "z">>, + UDiff = diff(ULong, UShort), + ?assertEqual(ULong, source_text(UDiff)), + ?assertEqual(UShort, destination_text(UDiff)), + %% The 8-char run of é must appear as a single equal op. + Equal8 = binary:copy(E, 8), + ?assert(lists:member({equal, Equal8}, UDiff)), ok. - +%% common_prefix/suffix operate on UTF-32; wrap for testing. common_prefix_test() -> - ?assertEqual(<<>>, common_prefix(<<"Text">>, <<"Next">>)), - ?assertEqual(<<"T">>, common_prefix(<<"Text">>, <<"Tax">>)), - ?assertEqual(<<"text">>, common_prefix(<<"text">>, <<"text">>)), + Prefix = fun(A, B) -> to_utf8(common_prefix(to_utf32(A), to_utf32(B))) end, - ?assertEqual(<<"test🟡"/utf8>>, common_prefix(<<"test🟡123"/utf8>>, <<"test🟡456"/utf8>>)), + ?assertEqual(<<>>, Prefix(<<"Text">>, <<"Next">>)), + ?assertEqual(<<"T">>, Prefix(<<"Text">>, <<"Tax">>)), + ?assertEqual(<<"text">>, Prefix(<<"text">>, <<"text">>)), - ?assertEqual(<<"test">>, common_prefix(<<"test🟢123"/utf8>>, <<"test🟡123"/utf8>>)), - ?assertEqual(<<"test">>, common_prefix(<<"test🟡123"/utf8>>, <<"test🟢123"/utf8>>)), - - ?assertEqual(<<"test">>, common_prefix(<<"test🟡123"/utf8>>, <<"test🔵123"/utf8>>)), - ?assertEqual(<<"test">>, common_prefix(<<"test🔵123"/utf8>>, <<"test🟡123"/utf8>>)), + ?assertEqual(<<"test🟡"/utf8>>, Prefix(<<"test🟡123"/utf8>>, <<"test🟡456"/utf8>>)), + ?assertEqual(<<"test">>, Prefix(<<"test🟢123"/utf8>>, <<"test🟡123"/utf8>>)), + ?assertEqual(<<"test">>, Prefix(<<"test🟡123"/utf8>>, <<"test🟢123"/utf8>>)), + ?assertEqual(<<"test">>, Prefix(<<"test🟡123"/utf8>>, <<"test🔵123"/utf8>>)), + ?assertEqual(<<"test">>, Prefix(<<"test🔵123"/utf8>>, <<"test🟡123"/utf8>>)), + ?assertEqual(<<"test">>, Prefix(<<"test🟡123"/utf8>>, <<"test⚫️123"/utf8>>)), + ?assertEqual(<<"test">>, Prefix(<<"test⚫️123"/utf8>>, <<"test🟡123"/utf8>>)), - ?assertEqual(<<"test">>, common_prefix(<<"test🟡123"/utf8>>, <<"test⚫️123"/utf8>>)), - ?assertEqual(<<"test">>, common_prefix(<<"test⚫️123"/utf8>>, <<"test🟡123"/utf8>>)), + ok. +common_suffix_test() -> + Suffix = fun(A, B) -> to_utf8(common_suffix(to_utf32(A), to_utf32(B))) end, + ?assertEqual(<<"ext">>, Suffix(<<"Text">>, <<"Next">>)), + ?assertEqual(<<>>, Suffix(<<"Text">>, <<"Tax">>)), + ?assertEqual(<<"text">>, Suffix(<<"text">>, <<"text">>)), ok. +%% split_pre_and_suffix operates on UTF-32; wrap for testing. +split_pre_and_suffix_test() -> + Split = fun(A, B) -> + {P, M1, M2, S} = split_pre_and_suffix(to_utf32(A), to_utf32(B)), + {to_utf8(P), to_utf8(M1), to_utf8(M2), to_utf8(S)} + end, -common_suffix_test() -> - ?assertEqual(<<"ext">>, common_suffix(<<"Text">>, <<"Next">>)), - ?assertEqual(<<>>, common_suffix(<<"Text">>, <<"Tax">>)), - ?assertEqual(<<"text">>, common_suffix(<<"text">>, <<"text">>)), + ?assertEqual({<<>>, <<>>, <<>>, <<>>}, Split(<<>>, <<>>)), + ?assertEqual({<<>>, <<"a">>, <<"b">>, <<>>}, Split(<<"a">>, <<"b">>)), + ?assertEqual({<<"a">>, <<"b">>, <<"c">>, <<"d">>}, Split(<<"abd">>, <<"acd">>)), + ?assertEqual({<<"aa">>, <<"bb">>, <<"cc">>, <<"dd">>}, Split(<<"aabbdd">>, <<"aaccdd">>)), + ?assertEqual({<<"aa">>, <<"bb">>, <<"c">>, <<"dd">>}, Split(<<"aabbdd">>, <<"aacdd">>)), + ?assertEqual({<<"cat ">>, <<>>, <<"mouse dog ">>, <<>>}, + Split(<<"cat ">>, <<"cat mouse dog ">>)), ok. -split_pre_and_suffix_test() -> - ?assertEqual({<<>>, <<>>, <<>>, <<>>}, split_pre_and_suffix(<<>>, <<>>)), +text_smaller_than_test() -> + %% text_smaller_than now works on UTF-32 binaries. + ?assertEqual(true, text_smaller_than(to_utf32(<<>>), 5)), + ?assertEqual(true, text_smaller_than(to_utf32(<<>>), 1)), + ?assertEqual(false, text_smaller_than(to_utf32(<<>>), 0)), + ?assertEqual(false, text_smaller_than(to_utf32(<<"abc">>), 0)), + ?assertEqual(false, text_smaller_than(to_utf32(<<"abc">>), 1)), + ?assertEqual(true, text_smaller_than(to_utf32(<<"abc">>), 4)), + + %% Multi-byte UTF-8 characters each become exactly 4 bytes in UTF-32. + Utf32 = to_utf32(<<1046/utf8, 1011/utf8, 1022/utf8, 127/utf8>>), + ?assertEqual(true, text_smaller_than(Utf32, 5)), + ?assertEqual(false, text_smaller_than(Utf32, 4)), - ?assertEqual({<<>>, <<"a">>, <<"b">>, <<>>}, split_pre_and_suffix(<<"a">>, <<"b">>)), - - ?assertEqual({<<"a">>, <<"b">>, <<"c">>, <<"d">>}, - split_pre_and_suffix(<<"abd">>, <<"acd">>)), - ?assertEqual({<<"aa">>, <<"bb">>, <<"cc">>, <<"dd">>}, - split_pre_and_suffix(<<"aabbdd">>, <<"aaccdd">>)), - ?assertEqual({<<"aa">>, <<"bb">>, <<"c">>, <<"dd">>}, - split_pre_and_suffix(<<"aabbdd">>, <<"aacdd">>)), + ok. - ?assertEqual({<<"cat ">>, <<>>, <<"mouse dog ">>, <<>>}, - split_pre_and_suffix(<<"cat ">>, <<"cat mouse dog ">>)), +lines_to_chars_test() -> + %% lines_to_chars takes UTF-32 input, returns UTF-32 index sequences and UTF-32 lines. + {C1, C2, Lines} = lines_to_chars(to_utf32(<<>>), to_utf32(<<>>)), + ?assertEqual(<<>>, C1), + ?assertEqual(<<>>, C2), + ?assertEqual([], Lines), + + {C3, C4, Lines2} = lines_to_chars(to_utf32(<<"hello\nworld\n">>), to_utf32(<<"hello\nmaas\n">>)), + %% Lines are stored as UTF-32 binaries. + ?assertEqual([to_utf32(<<"hello\n">>), to_utf32(<<"world\n">>), to_utf32(<<"maas\n">>)], Lines2), + ?assertEqual(<<0:32, 1:32>>, C3), + ?assertEqual(<<0:32, 2:32>>, C4), - ok. + ok. + +diff_linemode_test() -> + ?assertEqual([{equal, <<"hello\n">>}, {delete, <<"world\n">>}, {insert, <<"maas\n">>}], + diff_linemode(<<"hello\nworld\n">>, <<"hello\nmaas\n">>)), -unique_match_test() -> - ?assertEqual(true, unique_match(<<"a">>, <<"abc">>)), - ?assertEqual(true, unique_match(<<"b">>, <<"abc">>)), - ?assertEqual(true, unique_match(<<"c">>, <<"abc">>)), - ?assertEqual(false, unique_match(<<"ab">>, <<"abab">>)), ok. +diff_options_test() -> + A = <<"cat">>, + B = <<"map">>, -text_smaller_than_test() -> - ?assertEqual(true, text_smaller_than(<<>>, 5)), - ?assertEqual(true, text_smaller_than(<<>>, 1)), + %% No options — same as diff/2. + ?assertEqual(diff(A, B), diff(A, B, [])), + + %% no_linemode: result is structurally equivalent (same source/dest text). + NoLinemode = diff(A, B, [no_linemode]), + ?assertEqual(source_text(diff(A, B)), source_text(NoLinemode)), + ?assertEqual(destination_text(diff(A, B)), destination_text(NoLinemode)), + + %% semantic option applies cleanup_semantic to the raw diff. + ?assertEqual(cleanup_semantic(diff(A, B)), diff(A, B, [semantic])), - ?assertEqual(false, text_smaller_than(<<>>, 0)), + %% efficiency option applies cleanup_efficiency to the raw diff. + ?assertEqual(cleanup_efficiency(diff(A, B)), diff(A, B, [efficiency])), - ?assertEqual(false, text_smaller_than(<<"abc">>, 0)), - ?assertEqual(false, text_smaller_than(<<"abc">>, 1)), - ?assertEqual(true, text_smaller_than(<<"abc">>, 4)), + %% {efficiency, Cost} applies cleanup_efficiency/2 with the given cost. + ?assertEqual(cleanup_efficiency(diff(A, B), 2), diff(A, B, [{efficiency, 2}])), - %% Test if we count characters. - Utf8Binary = <<1046/utf8, 1011/utf8, 1022/utf8, 127/utf8>>, - ?assertEqual(true, size(Utf8Binary) > 5), % binary is larger due to utf8 encoding - ?assertEqual(true, text_smaller_than(Utf8Binary, 5)), - ?assertEqual(false, text_smaller_than(Utf8Binary, 4)), + %% Both: semantic first, then efficiency. + ?assertEqual( + cleanup_efficiency(cleanup_semantic(diff(A, B))), + diff(A, B, [semantic, efficiency])), - %% Test illegal utf8 sequence, the chars are counted as normal chars - ?assertEqual(false, text_smaller_than(<<149,157,112,8>>, 4)), + %% Order of options in list does not affect cleanup order. + ?assertEqual( + diff(A, B, [semantic, efficiency]), + diff(A, B, [efficiency, semantic])), ok. -lines_to_chars_test() -> - ?assertEqual({<<>>, <<>>, []}, lines_to_chars(<<>>, <<>>)), - - %% Simple text - ?assertEqual({<<0, 1>>, <<0, 2>>, [<<"hello\n">>, <<"world\n">>, <<"maas\n">>]}, - lines_to_chars(<<"hello\n\world\n">>, <<"hello\nmaas\n">>)), - - %% No newline at the end. - ?assertEqual({<<0, 1>>, <<0, 2>>, [<<"hello\n">>, <<"world\n">>, <<"maas">>]}, - lines_to_chars(<<"hello\n\world\n">>, <<"hello\nmaas">>)), - - %% No newline at the end. - ?assertEqual({<<0, 1>>, <<0, 2>>, [<<"hello\n">>, <<"world\n">>, <<"maas">>]}, - lines_to_chars(<<"hello\n\world\n">>, <<"hello\nmaas">>)), - - %% With empty lines - ?assertEqual({<<0, 1, 2>>, <<0, 1, 3>>, [<<"hello\n">>, <<"\n">>, <<"world\n">>, <<"maas">>]}, - lines_to_chars(<<"hello\n\nworld\n">>, <<"hello\n\nmaas">>)), +seed_test() -> + %% 1. Empty binary: no codepoints, seed is empty. + ?assertEqual({0, <<>>}, seed(<<>>, 0)), + + %% 2. Binary shorter than 4 codepoints (3 codepoints): 3 div 4 = 0, seed is empty. + Short3 = to_utf32(<<"abc">>), + ?assertEqual({0, <<>>}, seed(Short3, 0)), + + %% 3. Exactly 4 codepoints, Start=0: seed is 1 codepoint (the first one). + Exact4 = to_utf32(<<"abcd">>), + ?assertEqual({0, to_utf32(<<"a">>)}, seed(Exact4, 0)), + + %% 4. 8 codepoints, Start=0: seed is 2 codepoints starting at offset 0. + Long8 = to_utf32(<<"12345678">>), + ?assertEqual({0, to_utf32(<<"12">>)}, seed(Long8, 0)), + + %% 5. 16 codepoints, Start=8 (8 bytes = 2 codepoints * 4 bytes/codepoint): + %% seed is 4 codepoints; returned Start equals 8 and seed bytes are the correct slice. + Long16 = to_utf32(<<"abcdefghijklmnop">>), + {S5, Seed5} = seed(Long16, 8), + ?assertEqual(8, S5), + ?assertEqual(to_utf32(<<"cdef">>), Seed5), + + %% 6. ASCII text round-trip: "1234567890" (10 chars), seed at quarter-way offset. + Ascii10 = to_utf32(<<"1234567890">>), + %% TotalCodepoints=10, SeedCodepoints=2; Start=0 to keep the offset 4-byte-aligned. + {_, SeedAscii} = seed(Ascii10, 0), + ?assertEqual(<<"12">>, to_utf8(SeedAscii)), + + %% 7. Multi-byte codepoint alignment: 10 Greek letters (2 UTF-8 bytes each, 4 UTF-32 bytes each). + Greek10 = to_utf32(<<"αβγδεζηθικ"/utf8>>), + {Start7, Seed7} = seed(Greek10, 0), + %% Returned Start is 0. + ?assertEqual(0, Start7), + %% Seed is 4-byte-aligned. + ?assertEqual(0, byte_size(Seed7) rem 4), + %% Seed length = (10 div 4) * 4 = 8 bytes = 2 codepoints. + ?assertEqual((10 div 4) * 4, byte_size(Seed7)), + %% Seed decodes back to the first 2 Greek letters. + ?assertEqual(<<"αβ"/utf8>>, to_utf8(Seed7)), + + %% 8. Emoji (4-byte UTF-8 codepoints): 10 emoji, seed is first 2. + Emoji10 = to_utf32(<<"🐶🐱🐭🐹🐰🐨🐯🦁🐮🐷"/utf8>>), + {_, SeedEmoji} = seed(Emoji10, 0), + %% Seed length = (10 div 4) * 4 = 8 bytes = 2 codepoints. + ?assertEqual((10 div 4) * 4, byte_size(SeedEmoji)), + %% Seed decodes back to the first 2 emoji. + ?assertEqual(<<"🐶🐱"/utf8>>, to_utf8(SeedEmoji)), + + %% 9. Seed start offset preserved: non-zero Start is returned unchanged. + Long12 = to_utf32(<<"abcdefghijkl">>), + {Start9, _} = seed(Long12, 8), + ?assertEqual(8, Start9), + + %% 10. Seed is a contiguous slice of Long: binary:part(Long, Start, byte_size(Seed)) =:= Seed. + Long20 = to_utf32(<<"abcdefghijklmnopqrst">>), + {Start10, Seed10} = seed(Long20, 8), + ?assertEqual(Seed10, binary:part(Long20, Start10, byte_size(Seed10))), ok. +aligned_utf32_match_test() -> + ?assertEqual(nomatch, aligned_utf32_match(<<>>, <<0,0,0,0>>, 0)), + ?assertEqual(nomatch, aligned_utf32_match(<<>>, <<0,0,0,0>>, 4)), -diff_linemode_test() -> - ?assertEqual([{equal, <<"hello\n">>}, {delete, <<"world\n">>}, {insert, <<"maas\n">>}], - diff_linemode(<<"hello\nworld\n">>, <<"hello\nmaas\n">>)), + ?assertError(function_clause, aligned_utf32_match(<<>>, <<0,0,0,0>>, 3)), + + ?assertEqual({0, 4}, aligned_utf32_match(<<1,2,3,4>>, <<1,2,3,4>>, 0)), + ?assertEqual({4, 4}, aligned_utf32_match(<<0,0,0,0, 1,2,3,4>>, <<1,2,3,4>>, 0)), + + %% These will binary match, but the match is not on a utf32 boundary + ?assertEqual(nomatch, aligned_utf32_match(<<0,0,1,2, 3,4,5,6>>, <<1,2,3,4>>, 0)), + ?assertEqual({8,4}, aligned_utf32_match(<<0,0,1,2, 3,4,5,6, 1,2,3,4>>, <<1,2,3,4>>, 0)), + ?assertEqual({8,4}, aligned_utf32_match(<<0,0,1,2, 3,4,5,6, 1,2,3,4>>, <<1,2,3,4>>, 4)), + ?assertEqual(nomatch, aligned_utf32_match(<<0,0,1,2, 3,4,5,1, 2,3,4,0>>, <<1,2,3,4>>, 4)), + + %% Some longer matches + ?assertEqual({40, 20}, aligned_utf32_match(to_utf32(<<"the quick brown fox jumps over the lazy dog"/utf8>>), + to_utf32(<<"brown"/utf8>>), 0)), + ?assertEqual(nomatch, aligned_utf32_match(to_utf32(<<"the quick brown fox jumps over the lazy dog"/utf8>>), + to_utf32(<<"blue"/utf8>>), 0)), + + %% All emoticon matches emoticons + ?assertEqual(nomatch, aligned_utf32_match(to_utf32(<<"😔😟😕🙁☹️😣😖😫😩🥺🥶"/utf8>>), + to_utf32(<<"💩"/utf8>>), 0)), + ?assertEqual({16,12}, aligned_utf32_match(to_utf32(<<"😔😟😕🙁☹️💩😣😖😫😩🥺🥶"/utf8>>), + to_utf32(<<"☹️💩"/utf8>>), 0)), + + ok. + +common_overlap_test() -> + A = to_utf32(<<"Fire at Will">>), + B = to_utf32(<<"William Riker is number one">>), + ?assertEqual(4, common_overlap(A, B)), + ok. + +common_overlap_loop_test() -> + Abc = to_utf32(<<"abc">>), + Cde = to_utf32(<<"cde">>), + ?assertEqual(1, common_overlap_loop(Abc, Cde, text_size32(Cde), 0, 1)), + + Abcdef = to_utf32(<<"abcdef">>), + Efde = to_utf32(<<"efde">>), + ?assertEqual(2, common_overlap_loop(Abcdef, Efde, text_size32(Efde), 0, 1)), ok. diff --git a/src/diffy_simple_patch.erl b/src/diffy_simple_patch.erl index 66a6ae8..29b5753 100644 --- a/src/diffy_simple_patch.erl +++ b/src/diffy_simple_patch.erl @@ -2,8 +2,9 @@ %% @copyright 2014 Maas-Maarten Zeeman %% %% @doc Diffy, an erlang diff match and patch implementation +%% @end %% -%% Copyright 2014 Maas-Maarten Zeeman +%% Copyright 2014-2026 Maas-Maarten Zeeman %% %% Licensed under the Apache License, Version 2.0 (the "License"); %% you may not use this file except in compliance with the License. diff --git a/src/diffy_term.erl b/src/diffy_term.erl index ed25387..23c23b6 100644 --- a/src/diffy_term.erl +++ b/src/diffy_term.erl @@ -3,8 +3,9 @@ %% %% @doc Diffy, an erlang diff match and patch implementation %% Adapted from diffy.erl for simple diff on a list of Erlang terms +%% @end %% -%% Copyright 2014-2015 Maas-Maarten Zeeman, Marc Worrell +%% Copyright 2014-2026 Maas-Maarten Zeeman, Marc Worrell %% %% Licensed under the Apache License, Version 2.0 (the "License"); %% you may not use this file except in compliance with the License. @@ -25,11 +26,10 @@ diff/2 ]). --type diff_op() :: delete | equal | insert. --type diff() :: {diff_op(), term()}. +-type diff() :: {diffy:diff_op(), term()}. -type diffs() :: list(diff()). --export_type([ diffs/0 ]). +-export_type([ diff/0, diffs/0 ]). -spec diff(list(), list()) -> diffs(). diff(A, A) -> @@ -41,7 +41,7 @@ diff([], B) -> diff(A, B) when is_list(A), is_list(B) -> {Dict0, N} = term_dict(A, dict:new(), 0), {Dict, _N} = term_dict(B, Dict0, N), - Diff = diffy:diff(map_terms(A, Dict), map_terms(B, Dict)), + Diff = diffy:diff(map_terms(A, Dict), map_terms(B, Dict), [no_linemode]), unmap_diff(Diff, Dict). term_dict([], D, N) -> @@ -68,7 +68,6 @@ unmap_diff_1({Op, B}, RDict) -> {Op, [ dict:fetch(C, RDict) || C <- Cs ]}. - -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). @@ -88,7 +87,5 @@ diffy_term_test() -> diffy_term:diff([a,b,c,d,e], [a,e,b,c,d])), ok. - -endif. - diff --git a/test/diffy_tests.erl b/test/diffy_tests.erl index 42065a4..c7c9781 100644 --- a/test/diffy_tests.erl +++ b/test/diffy_tests.erl @@ -1,9 +1,9 @@ %% @author Maas-Maarten Zeeman -%% @copyright 2014 Maas-Maarten Zeeman +%% @copyright 2014-2026 Maas-Maarten Zeeman %% %% @doc Diffy, an erlang diff match and patch implementation %% -%% Copyright 2014 Maas-Maarten Zeeman +%% Copyright 2014-2026 Maas-Maarten Zeeman %% %% Licensed under the Apache License, Version 2.0 (the "License"); %% you may not use this file except in compliance with the License. @@ -23,32 +23,59 @@ -include_lib("proper/include/proper.hrl"). -include_lib("eunit/include/eunit.hrl"). +-dialyzer({no_opaque, [ + cleanup_merge_prop_test/0, + cleanup_efficiency_prop_test/0, + cleanup_semantic_prop_test/0, + random_inner_diff_prop_test/0, + random_diffs_prop_test/0 +]}). + +-define(NUM_TESTS, 500). + %% %% Properties %% prop_cleanup_merge() -> - ?FORALL(Diffs, diffy:diffs(), + ?FORALL(Diffs, list({diff_op(), proper_unicode:utf8()}), begin SourceText = diffy:source_text(Diffs), DestinationText = diffy:destination_text(Diffs), - CleanDiffs = cleanup_merge(Diffs), - SourceText == diffy:source_text(CleanDiffs) andalso - DestinationText == diffy:destination_text(CleanDiffs) + SourceText =:= diffy:source_text(CleanDiffs) + andalso DestinationText =:= diffy:destination_text(CleanDiffs) + end). + +prop_cleanup_merge_idempotent() -> + ?FORALL(Diffs, list({diff_op(), proper_unicode:utf8()}), + begin + % Cleaning the diffs again should not result in more changes + Cleaned = cleanup_merge(Diffs), + Cleaned =:= cleanup_merge(Cleaned) end). prop_cleanup_efficiency() -> - ?FORALL(Diffs, diffy:diffs(), + ?FORALL(Diffs, list({diff_op(), proper_unicode:utf8()}), begin SourceText = diffy:source_text(Diffs), DestinationText = diffy:destination_text(Diffs), - EfficientDiffs = cleanup_efficiency(Diffs), - SourceText == diffy:source_text(EfficientDiffs) andalso - DestinationText == diffy:destination_text(EfficientDiffs) + SourceText =:= diffy:source_text(EfficientDiffs) + andalso DestinationText =:= diffy:destination_text(EfficientDiffs) + end). + +prop_cleanup_semantic() -> + ?FORALL(Diffs, list({diff_op(), proper_unicode:utf8()}), + begin + SourceText = diffy:source_text(Diffs), + DestinationText = diffy:destination_text(Diffs), + SemanticDiffs = cleanup_semantic(Diffs), + + SourceText =:= diffy:source_text(SemanticDiffs) + andalso DestinationText =:= diffy:destination_text(SemanticDiffs) end). html_like() -> @@ -58,7 +85,7 @@ html_like() -> {2, utf8(4)}, % Some small portions of unicode chars. {2, range($0, $9)}, % numbers {2, $\s}, % whitespace - {4, $\n}, % linebreaks + {4, $\n}, % linebreaks {2, oneof([$., $-, $!, $?, $,])} % punctuation ]))). @@ -123,8 +150,13 @@ pretty_html_test() -> ?assertEqual(<<"test">>, pretty_html([{equal, <<"test">>}])), ?assertEqual(<<"footest">>, pretty_html([{delete, <<"foo">>}, {equal, <<"test">>}])), + ?assertEqual(<<"footest">>, pretty_html([{insert, <<"foo">>}, {equal, <<"test">>}])), + + %% escaping. + ?assertEqual(<<"<span>foo</span>& < > " '">>, + pretty_html([{insert, <<"foo">>}, {equal, <<"& < > \" '">>}])), ok. source_text_test() -> @@ -209,8 +241,26 @@ cleanup_merge_test() -> ok. +%% delete/insert/delete — the two deletes merge, then insert must be re-checked +%% against the equal below it, which should then slide +requeue_i_test() -> + ?assertEqual([{delete, <<"aXa">>}, {insert, <<"b">>}], + cleanup_merge([{delete, <<"a">>}, {insert, <<"b">>}, {delete, <<"Xa">>}])). + +%% Three consecutive deletes separated by inserts collapse correctly +triple_delete_test() -> + ?assertEqual([{delete, <<"abc">>}, {insert, <<"xyz">>}], + cleanup_merge([{delete, <<"a">>}, {insert, <<"x">>}, + {delete, <<"b">>}, {insert, <<"y">>}, + {delete, <<"c">>}, {insert, <<"z">>}])). + +%% After sliding, the two equals on either side should merge into one +slide_merge_test() -> + ?assertEqual([{insert, <<"aX">>}, {equal, <<"abc">>}], + cleanup_merge([{equal, <<"a">>}, {insert, <<"Xa">>}, {equal, <<"bc">>}])). + cleanup_merge_prop_test() -> - ?assertEqual(true, proper:quickcheck(prop_cleanup_merge(), [{numtests, 500}, {to_file, user}])), + ?assertEqual(true, proper:quickcheck(prop_cleanup_merge(), [{numtests, ?NUM_TESTS}, {to_file, user}])), ok. cleanup_semantic_test() -> @@ -225,27 +275,39 @@ cleanup_semantic_test() -> ?assertEqual([{delete, <<"abc">>}, {insert, <<"ABC">>}, {equal, <<"1234">>}, {delete, <<"wxyz">>}], cleanup_semantic([{delete, <<"abc">>}, {insert, <<"ABC">>}, {equal, <<"1234">>}, {delete, <<"wxyz">>}])), - % % Simple elimination. - % ?assertEqual([{delete, <<"abc">>}, {insert, <<"b">>}], - % cleanup_semantic([{delete, <<"a">>}, {equal, <<"b">>}, {delete, <<"c">>}])), + % Simple elimination. + ?assertEqual([{delete, <<"abc">>}, {insert, <<"b">>}], + cleanup_semantic([{delete, <<"a">>}, {equal, <<"b">>}, {delete, <<"c">>}])), - % % Multiple eliminations. - % ?assertEqual([{delete, <<"AB_AB">>}, {insert, <<"1A2_1A2">>}], - % cleanup_semantic([{insert, <<"1">>}, {equal, <<"A">>}, {delete, <<"B">>}, {insert, <<"2">>}, - % {equal, <<"_">>}, {insert, <<"1">>}, {equal, <<"A">>}, {delete, <<"B">>}, {insert, <<"2">>}])), + % Multiple eliminations. + ?assertEqual([{delete, <<"AB_AB">>}, {insert, <<"1A2_1A2">>}], + cleanup_semantic([{insert, <<"1">>}, {equal, <<"A">>}, {delete, <<"B">>}, {insert, <<"2">>}, + {equal, <<"_">>}, {insert, <<"1">>}, {equal, <<"A">>}, {delete, <<"B">>}, {insert, <<"2">>}])), + + % Regression test for UTF-8 data loss in cleanup_semantic_overlaps + % Ins1 = <<0,32,204,128,0,0>> (size 6, text_size 5) + % Ins2 = <<0,0,0,0,0,0,0,0>> (size 8, text_size 8) + % Total Dest size 14, text_size 13 + Diffs = [{delete,<<0,0,0,0,0,0,0,0>>},{insert,<<0,32,204,128,0,0>>},{insert,<<0,0,0,0,0,0,0,0>>}], + Cleaned = cleanup_semantic(Diffs), + ?assertEqual(diffy:destination_text(Diffs), diffy:destination_text(Cleaned)), ok. cleanup_efficiency_prop_test() -> - ?assertEqual(true, proper:quickcheck(prop_cleanup_efficiency(), [{numtests, 500}, {to_file, user}])), + ?assertEqual(true, proper:quickcheck(prop_cleanup_efficiency(), [{numtests, ?NUM_TESTS}, {to_file, user}])), + ok. + +cleanup_semantic_prop_test() -> + ?assertEqual(true, proper:quickcheck(prop_cleanup_semantic(), [{numtests, ?NUM_TESTS}, {to_file, user}])), ok. random_diffs_prop_test() -> - ?assertEqual(true, proper:quickcheck(prop_make_diff(), [{numtests, 500}, {to_file, user}])), + ?assertEqual(true, proper:quickcheck(prop_make_diff(), [{numtests, ?NUM_TESTS}, {to_file, user}])), ok. random_inner_diff_prop_test() -> - ?assertEqual(true, proper:quickcheck(prop_inner_diff(), [{numtests, 500}, {to_file, user}])), + ?assertEqual(true, proper:quickcheck(prop_inner_diff(), [{numtests, ?NUM_TESTS}, {to_file, user}])), ok. cleanup_efficiency_test() -> @@ -261,7 +323,7 @@ cleanup_efficiency_test() -> cleanup_efficiency([{delete, <<"ab">>}, {insert, <<"12">>}, {equal, <<"xyz">>}, {delete, <<"cd">>}, {insert, <<"34">>}])), % Three-edit elimination - ?assertEqual([{insert, <<"12x34">>}, {delete, <<"xcd">>}], + ?assertEqual([{delete, <<"xcd">>}, {insert, <<"12x34">>}], cleanup_efficiency([{insert, <<"12">>}, {equal, <<"x">>}, {delete, <<"cd">>}, {insert, <<"34">>}])), % Backpass elimination @@ -320,11 +382,109 @@ diff_test() -> <<"cat mouse dog ">>)), ok. + +diff_linemode_corners_test() -> + %% Empty inputs. + ?assertEqual([], diffy:diff_linemode(<<>>, <<>>)), + ?assertEqual([{insert, <<"hello\n">>}], diffy:diff_linemode(<<>>, <<"hello\n">>)), + ?assertEqual([{delete, <<"hello\n">>}], diffy:diff_linemode(<<"hello\n">>, <<>>)), + + %% Identical input — single equal op. + ?assertEqual([{equal, <<"hello\nworld\n">>}], + diffy:diff_linemode(<<"hello\nworld\n">>, <<"hello\nworld\n">>)), + + %% No newline at end of file — last line treated as its own token. + ?assertEqual( + [{equal, <<"hello\n">>}, {delete, <<"world">>}, {insert, <<"maas">>}], + diffy:diff_linemode(<<"hello\nworld">>, <<"hello\nmaas">>)), + + %% Blank lines — exercise is_blankline_start/end and the \n\n pattern. + %% The rediff within cleanup_line_diff splits b\n vs c\n at character level. + ?assertEqual( + [{equal, <<"a\n\n">>}, {delete, <<"b">>}, {insert, <<"c">>}, {equal, <<"\nd\n">>}], + diffy:diff_linemode(<<"a\n\nb\nd\n">>, <<"a\n\nc\nd\n">>)), + + %% \r\n line endings — exercises the \r\n\r\n blankline pattern. + ?assertEqual( + [{equal, <<"hello\r\n">>}, {delete, <<"world\r\n">>}, {insert, <<"maas\r\n">>}], + diffy:diff_linemode(<<"hello\r\nworld\r\n">>, <<"hello\r\nmaas\r\n">>)), + + %% Repeated lines — the same line appearing multiple times should reuse the same index. + ?assertEqual( + [{equal, <<"a\nb\na\n">>}, {insert, <<"b\n">>}], + diffy:diff_linemode(<<"a\nb\na\n">>, <<"a\nb\na\nb\n">>)), + + %% Large enough to trigger linemode via compute_diff1 size threshold. + %% Build two texts that differ only in one line buried in > 100 chars of context. + Prefix = binary:copy(<<"padding line\n">>, 10), + Suffix = binary:copy(<<"trailing line\n">>, 10), + Text1 = <>, + Text2 = <>, + Diffs = diffy:diff(Text1, Text2), + %% Source and destination text must be preserved exactly. + ?assertEqual(Text1, diffy:source_text(Diffs)), + ?assertEqual(Text2, diffy:destination_text(Diffs)), + %% Must contain at least one delete and one insert — the changed line. + ?assert(lists:any(fun({delete, _}) -> true; (_) -> false end, Diffs)), + ?assert(lists:any(fun({insert, _}) -> true; (_) -> false end, Diffs)), + + %% Multi-byte UTF-8 lines — verify encoding survives the linemode round-trip. + ?assertEqual( + [{equal, <<"héllo\n"/utf8>>}, {delete, <<"wörld\n"/utf8>>}, {insert, <<"wörlt\n"/utf8>>}], + diffy:diff_linemode(<<"héllo\nwörld\n"/utf8>>, <<"héllo\nwörlt\n"/utf8>>)), + + %% cleanup_line_diff rediff path — two changed lines adjacent to an equal trigger + %% the rediff of accumulated delete+insert data. + T1 = <<"aaa\nbbb\nccc\n">>, + T2 = <<"aab\nbbc\nccc\n">>, + RediffDiffs = diffy:diff_linemode(T1, T2), + ?assertEqual(T1, diffy:source_text(RediffDiffs)), + ?assertEqual(T2, diffy:destination_text(RediffDiffs)), + + ok. + +diff_options_test() -> + A = <<"one two x four five">>, + B = <<"one TWO x FOUR five">>, + + %% No options — same as diff/2. + ?assertEqual(diffy:diff(A, B), diffy:diff(A, B, [])), + + %% no_linemode: result is structurally equivalent (same source/dest text). + NoLinemode = diffy:diff(A, B, [no_linemode]), + ?assertEqual(diffy:source_text(diffy:diff(A, B)), diffy:source_text(NoLinemode)), + ?assertEqual(diffy:destination_text(diffy:diff(A, B)), diffy:destination_text(NoLinemode)), + + %% semantic option applies cleanup_semantic to the raw diff. + ?assertEqual(diffy:cleanup_semantic(diffy:diff(A, B)), diffy:diff(A, B, [semantic])), + + %% efficiency option applies cleanup_efficiency to the raw diff. + ?assertEqual(diffy:cleanup_efficiency(diffy:diff(A, B)), diffy:diff(A, B, [efficiency])), + + %% {efficiency, Cost} applies cleanup_efficiency/2 with the given cost. + ?assertEqual(diffy:cleanup_efficiency(diffy:diff(A, B), 2), diffy:diff(A, B, [{efficiency, 2}])), + + %% Both: semantic first, then efficiency. + ?assertEqual( + diffy:cleanup_efficiency(diffy:cleanup_semantic(diffy:diff(A, B))), + diffy:diff(A, B, [semantic, efficiency])), + + %% Order of options in list does not affect cleanup order. + ?assertEqual( + diffy:diff(A, B, [semantic, efficiency]), + diffy:diff(A, B, [efficiency, semantic])), + + ok. + + %% %% Helpers %% +diff_op() -> + oneof([insert, delete, equal]). + pretty_html(Diffs) -> iolist_to_binary(diffy:pretty_html(Diffs)).