Skip to content

Commit 3822fbe

Browse files
patch from upstream: 'Python3: Stop breaking surrogate pairs in toDelta()'
1 parent 0083470 commit 3822fbe

File tree

2 files changed

+74
-7
lines changed

2 files changed

+74
-7
lines changed

diff_match_patch/diff_match_patch.py

Lines changed: 10 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1136,6 +1136,8 @@ def diff_prettyHtml(self, diffs):
11361136
"""
11371137
html = []
11381138
for op, data in diffs:
1139+
if 0 == len(data):
1140+
continue
11391141
text = (
11401142
data.replace("&", "&amp;")
11411143
.replace("<", "&lt;")
@@ -1225,9 +1227,9 @@ def diff_toDelta(self, diffs):
12251227
data = data.encode("utf-8")
12261228
text.append("+" + urllib.parse.quote(data, "!~*'();/?:@&=+$,# "))
12271229
elif op == self.DIFF_DELETE:
1228-
text.append("-%d" % len(data))
1230+
text.append("-%d" % (len(data.encode('utf-16-be')) // 2))
12291231
elif op == self.DIFF_EQUAL:
1230-
text.append("=%d" % len(data))
1232+
text.append("=%d" % (len(data.encode('utf-16-be')) // 2))
12311233
return "\t".join(text)
12321234

12331235
def diff_fromDelta(self, text1, delta):
@@ -1245,7 +1247,8 @@ def diff_fromDelta(self, text1, delta):
12451247
ValueError: If invalid input.
12461248
"""
12471249
diffs = []
1248-
pointer = 0 # Cursor in text1
1250+
as_utf16 = text1.encode('utf-16-be')
1251+
pointer = 0 # Cursor in as_utf16
12491252
tokens = delta.split("\t")
12501253
for token in tokens:
12511254
if token == "":
@@ -1264,8 +1267,8 @@ def diff_fromDelta(self, text1, delta):
12641267
raise ValueError("Invalid number in diff_fromDelta: " + param)
12651268
if n < 0:
12661269
raise ValueError("Negative number in diff_fromDelta: " + param)
1267-
text = text1[pointer : pointer + n]
1268-
pointer += n
1270+
text = as_utf16[pointer: pointer + n * 2].decode('utf-16-be')
1271+
pointer += n * 2
12691272
if token[0] == "=":
12701273
diffs.append((self.DIFF_EQUAL, text))
12711274
else:
@@ -1275,10 +1278,10 @@ def diff_fromDelta(self, text1, delta):
12751278
raise ValueError(
12761279
"Invalid diff operation in diff_fromDelta: " + token[0]
12771280
)
1278-
if pointer != len(text1):
1281+
if pointer != len(as_utf16):
12791282
raise ValueError(
12801283
"Delta length (%d) does not equal source text length (%d)."
1281-
% (pointer, len(text1))
1284+
% (pointer, len(as_utf16))
12821285
)
12831286
return diffs
12841287

diff_match_patch/tests/diff_match_patch_test.py

Lines changed: 64 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -833,6 +833,11 @@ def testDiffDelta(self):
833833
# Convert delta string into a diff.
834834
self.assertEqual(diffs, self.dmp.diff_fromDelta(text1, delta))
835835

836+
diffs = self.dmp.diff_main("\U0001F64B\U0001F64B", "\U0001F64B\U0001F64C\U0001F64B")
837+
delta = self.dmp.diff_toDelta(diffs)
838+
self.assertEqual("=2\t+%F0%9F%99%8C\t=2", delta)
839+
self.assertEqual(diffs, self.dmp.diff_fromDelta("\U0001F64B\U0001F64B", "=2\t+%F0%9F%99%8C\t=2"))
840+
836841
# Verify pool of unchanged characters.
837842
diffs = [
838843
(
@@ -849,6 +854,65 @@ def testDiffDelta(self):
849854
# Convert delta string into a diff.
850855
self.assertEqual(diffs, self.dmp.diff_fromDelta("", delta))
851856

857+
# Unicode: split surrogates
858+
self.assertEqual(
859+
self.dmp.diff_toDelta([
860+
(self.dmp.DIFF_INSERT, '\U0001F171'),
861+
(self.dmp.DIFF_EQUAL, '\U0001F170\U0001F171')
862+
]),
863+
self.dmp.diff_toDelta(self.dmp.diff_main(
864+
'\U0001F170\U0001F171',
865+
'\U0001F171\U0001F170\U0001F171'
866+
)),
867+
'Inserting similar surrogate pair at beginning'
868+
)
869+
self.assertEqual(
870+
self.dmp.diff_toDelta([
871+
(self.dmp.DIFF_EQUAL, '\U0001F170'),
872+
(self.dmp.DIFF_INSERT, '\U0001F172'),
873+
(self.dmp.DIFF_EQUAL, '\U0001F171')
874+
]),
875+
self.dmp.diff_toDelta(self.dmp.diff_main(
876+
'\U0001F170\U0001F171',
877+
'\U0001F170\U0001F172\U0001F171'
878+
)),
879+
'Inserting similar surrogate pair in the middle'
880+
)
881+
self.assertEqual(
882+
self.dmp.diff_toDelta([
883+
(self.dmp.DIFF_DELETE, '\U0001F171'),
884+
(self.dmp.DIFF_EQUAL, '\U0001F170\U0001F171')
885+
]),
886+
self.dmp.diff_toDelta(self.dmp.diff_main(
887+
'\U0001F171\U0001F170\U0001F171',
888+
'\U0001F170\U0001F171'
889+
)),
890+
'Deleting similar surogate pair at the beginning'
891+
)
892+
self.assertEqual(
893+
self.dmp.diff_toDelta([
894+
(self.dmp.DIFF_EQUAL, '\U0001F170'),
895+
(self.dmp.DIFF_DELETE, '\U0001F172'),
896+
(self.dmp.DIFF_EQUAL, '\U0001F171')
897+
]),
898+
self.dmp.diff_toDelta(self.dmp.diff_main(
899+
'\U0001F170\U0001F172\U0001F171',
900+
'\U0001F170\U0001F171'
901+
)),
902+
'Deleting similar surogate pair in the middle'
903+
)
904+
self.assertEqual(
905+
self.dmp.diff_toDelta([
906+
(self.dmp.DIFF_DELETE, '\U0001F170'),
907+
(self.dmp.DIFF_INSERT, '\U0001F171')
908+
]),
909+
self.dmp.diff_toDelta(self.dmp.diff_main(
910+
'\U0001F170',
911+
'\U0001F171'
912+
)),
913+
'Swap surrogate pair'
914+
)
915+
852916
# 160 kb string.
853917
a = "abcdefghij"
854918
for i in range(14):

0 commit comments

Comments
 (0)