Skip to content

Commit f413b87

Browse files
serhiy-storchaka, Vermeille
authored and committed
[3.14] gh-74902: Add Unicode Grapheme Cluster Break algorithm (GH-143076)
Add the unicodedata.iter_graphemes() function to iterate over grapheme clusters according to rules defined in Unicode Standard Annex #29. Add unicodedata.grapheme_cluster_break(), unicodedata.indic_conjunct_break() and unicodedata.extended_pictographic() functions to get the properties of the character which are related to the above algorithm. (cherry picked from commit bab1d7a) Co-authored-by: Serhiy Storchaka <storchaka@gmail.com> Co-authored-by: Guillaume "Vermeille" Sanchez <guillaume.v.sanchez@gmail.com>
1 parent 5df7652 commit f413b87

File tree

6 files changed

+4284
-3057
lines changed

6 files changed

+4284
-3057
lines changed

Lib/test/test_unicodedata.py

Lines changed: 272 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -651,6 +651,206 @@ def test_east_asian_width_unassigned(self):
651651
self.assertEqual(eaw(char), 'A')
652652
self.assertIs(self.db.name(char, None), None)
653653

654+
def test_grapheme_cluster_break(self):
    # Spot-check the Grapheme_Cluster_Break value reported for sample
    # characters, then check that invalid arguments are rejected.
    lookup = self.db._grapheme_cluster_break
    # (character, expected value) pairs, checked in the listed order.
    samples = (
        (' ', 'Other'),
        ('x', 'Other'),
        ('\U0010FFFF', 'Other'),
        ('\r', 'CR'),
        ('\n', 'LF'),
        ('\0', 'Control'),
        ('\t', 'Control'),
        ('\x1F', 'Control'),
        ('\x7F', 'Control'),
        ('\x9F', 'Control'),
        ('\U000E0001', 'Control'),
        ('\u0300', 'Extend'),
        ('\u200C', 'Extend'),
        ('\U000E01EF', 'Extend'),
        ('\u1159', 'L'),
        ('\u11F9', 'T'),
        ('\uD788', 'LV'),
        ('\uD7A3', 'LVT'),
        # New in 5.0.0
        ('\u05BA', 'Extend'),
        ('\u20EF', 'Extend'),
        # New in 5.1.0
        ('\u2064', 'Control'),
        ('\uAA4D', 'SpacingMark'),
        # New in 5.2.0
        ('\u0816', 'Extend'),
        ('\uA97C', 'L'),
        ('\uD7C6', 'V'),
        ('\uD7FB', 'T'),
        # New in 6.0.0
        ('\u093A', 'Extend'),
        ('\U00011002', 'SpacingMark'),
        # New in 6.1.0
        ('\U000E0FFF', 'Control'),
        ('\U00016F7E', 'SpacingMark'),
        # New in 6.2.0
        ('\U0001F1E6', 'Regional_Indicator'),
        ('\U0001F1FF', 'Regional_Indicator'),
        # New in 6.3.0
        ('\u180E', 'Control'),
        ('\u1A1B', 'Extend'),
        # New in 7.0.0
        ('\u0E33', 'SpacingMark'),
        ('\u0EB3', 'SpacingMark'),
        ('\U0001BCA3', 'Control'),
        ('\U0001E8D6', 'Extend'),
        ('\U0001163E', 'SpacingMark'),
        # New in 8.0.0
        ('\u08E3', 'Extend'),
        ('\U00011726', 'SpacingMark'),
        # New in 9.0.0
        ('\u0600', 'Prepend'),
        ('\U000E007F', 'Extend'),
        ('\U00011CB4', 'SpacingMark'),
        ('\u200D', 'ZWJ'),
        # New in 10.0.0
        ('\U00011D46', 'Prepend'),
        ('\U00011D47', 'Extend'),
        ('\U00011A97', 'SpacingMark'),
        # New in 11.0.0
        ('\U000110CD', 'Prepend'),
        ('\u07FD', 'Extend'),
        ('\U00011EF6', 'SpacingMark'),
        # New in 12.0.0
        ('\U00011A84', 'Prepend'),
        ('\U00013438', 'Control'),
        ('\U0001E2EF', 'Extend'),
        ('\U00016F87', 'SpacingMark'),
        # New in 13.0.0
        ('\U00011941', 'Prepend'),
        ('\U00016FE4', 'Extend'),
        ('\U00011942', 'SpacingMark'),
        # New in 14.0.0
        ('\u0891', 'Prepend'),
        ('\U0001E2AE', 'Extend'),
        # New in 15.0.0
        ('\U00011F02', 'Prepend'),
        ('\U0001343F', 'Control'),
        ('\U0001E4EF', 'Extend'),
        ('\U00011F3F', 'SpacingMark'),
        # New in 16.0.0
        ('\U000113D1', 'Prepend'),
        ('\U0001E5EF', 'Extend'),
        ('\U0001612C', 'SpacingMark'),
        ('\U00016D63', 'V'),
    )
    for char, expected in samples:
        self.assertEqual(lookup(char), expected)

    # No argument, a non-str argument, and strings whose length is not 1
    # must all raise TypeError.
    for bad_args in ((), (b'x',), (120,), ('',), ('xx',)):
        self.assertRaises(TypeError, lookup, *bad_args)
746+
747+
def test_indic_conjunct_break(self):
    # Spot-check the Indic_Conjunct_Break value reported for sample
    # characters, then check that invalid arguments are rejected.
    lookup = self.db._indic_conjunct_break
    # (character, expected value) pairs, checked in the listed order.
    samples = (
        (' ', 'None'),
        ('x', 'None'),
        ('\U0010FFFF', 'None'),
        # New in 15.1.0
        ('\u094D', 'Linker'),
        ('\u0D4D', 'Linker'),
        ('\u0915', 'Consonant'),
        ('\u0D3A', 'Consonant'),
        ('\u0300', 'Extend'),
        ('\U0001E94A', 'Extend'),
        # New in 16.0.0
        ('\u034F', 'Extend'),
        ('\U000E01EF', 'Extend'),
    )
    for char, expected in samples:
        self.assertEqual(lookup(char), expected)

    # No argument, a non-str argument, and strings whose length is not 1
    # must all raise TypeError.
    for bad_args in ((), (b'x',), (120,), ('',), ('xx',)):
        self.assertRaises(TypeError, lookup, *bad_args)
767+
768+
def test_extended_pictographic(self):
    # The function must return the bool singletons (checked with
    # assertIs), and must reject invalid arguments with TypeError.
    is_pictographic = self.db._extended_pictographic

    for char in (' ', 'x', '\U0010FFFF'):
        self.assertIs(is_pictographic(char), False)

    # New in 13.0.0
    for char in ('\xA9', '\u203C', '\U0001FAD6', '\U0001FFFD'):
        self.assertIs(is_pictographic(char), True)

    # No argument, a non-str argument, and strings whose length is not 1
    # must all raise TypeError.
    for bad_args in ((), (b'x',), (120,), ('',), ('xx',)):
        self.assertRaises(TypeError, is_pictographic, *bad_args)
783+
784+
def test_grapheme_break(self):
    # Exercise _iter_graphemes(): argument validation, the optional
    # start/end bounds, and one or more samples per UAX #29 rule.
    def clusters(*args):
        # Render every yielded segment as a plain str for comparison.
        return [str(segment) for segment in self.db._iter_graphemes(*args)]

    # Missing string, non-str string, and too many positional arguments.
    for bad_args in ((), (b'x',), ('x', 0, 0, 0)):
        self.assertRaises(TypeError, self.db._iter_graphemes, *bad_args)

    self.assertEqual(clusters(''), [])

    # (extra positional arguments, expected clusters) for 'abcd'.
    bounded = (
        ((), ['a', 'b', 'c', 'd']),
        ((1,), ['b', 'c', 'd']),
        ((1, 3), ['b', 'c']),
        ((-3,), ['b', 'c', 'd']),
        ((1, -1), ['b', 'c']),
        ((3, 1), []),
        ((5,), []),
        ((0, 5), ['a', 'b', 'c', 'd']),
        ((-5,), ['a', 'b', 'c', 'd']),
        ((0, -5), []),
    )
    for bounds, expected in bounded:
        self.assertEqual(clusters('abcd', *bounds), expected)

    # Each entry is the expected list of clusters; the input string is
    # the concatenation of that list.
    segmented = (
        # GB3
        ['\r\n'],
        # GB4
        ['\r', '\u0308'],
        ['\n', '\u0308'],
        ['\0', '\u0308'],
        # GB5
        ['\u06dd', '\r'],
        ['\u06dd', '\n'],
        ['\u06dd', '\0'],
        # GB6
        ['\u1100\u1160'],
        ['\u1100\uAC00'],
        ['\u1100\uAC01'],
        # GB7
        ['\uAC00\u1160'],
        ['\uAC00\u11A8'],
        ['\u1160\u1160'],
        ['\u1160\u11A8'],
        # GB8
        ['\uAC01\u11A8'],
        ['\u11A8\u11A8'],
        # GB9
        ['a\u0300'],
        ['a\u200D'],
        # GB9a
        ['\u0905\u0903'],
        # GB9b
        ['\u06dd\u0661'],
        # GB9c
        ['\u0915\u094d\u0924'],
        ['\u0915\u094D\u094D\u0924'],
        ['\u0915\u094D\u0924\u094D\u092F'],
        # GB11
        ['\U0001F9D1\U0001F3FE\u200D\u2764\uFE0F'
         '\u200D\U0001F48B\u200D\U0001F9D1\U0001F3FC'],
        # GB12
        ['\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3'],
        # GB13
        ['a', '\U0001F1FA\U0001F1E6', '\U0001F1FA\U0001F1F3'],
    )
    for expected in segmented:
        self.assertEqual(clusters(''.join(expected)), expected)
853+
654854

655855
class Unicode_3_2_0_FunctionsTest(UnicodeFunctionsTest):
656856
db = unicodedata.ucd_3_2_0
@@ -659,6 +859,11 @@ class Unicode_3_2_0_FunctionsTest(UnicodeFunctionsTest):
659859
if quicktest else
660860
'caf1a7f2f380f927461837f1901ef20683f98683')
661861

862+
test_grapheme_cluster_break = None
863+
test_indic_conjunct_break = None
864+
test_extended_pictographic = None
865+
test_grapheme_break = None
866+
662867

663868
class UnicodeMiscTest(unittest.TestCase):
664869
db = unicodedata
@@ -761,6 +966,17 @@ def test_linebreak_7643(self):
761966
self.assertEqual(len(lines), 1,
762967
r"%a should not be a linebreak" % c)
763968

969+
def test_segment_object(self):
    # 'a' followed by U+0300 is expected to form one cluster, so the
    # five-character string yields four segments.
    found = list(unicodedata._iter_graphemes('spa\u0300m'))
    self.assertEqual(len(found), 4, found)

    accented = found[2]
    self.assertEqual(accented.start, 2)
    self.assertEqual(accented.end, 4)
    self.assertEqual(str(accented), 'a\u0300')
    self.assertEqual(repr(accented), '<Segment 2:4>')

    # A segment supports neither iteration nor len().
    for operation in (iter, len):
        self.assertRaises(TypeError, operation, accented)
979+
764980

765981
class NormalizationTest(unittest.TestCase):
766982
@staticmethod
@@ -883,5 +1099,61 @@ class MyStr(str):
8831099
self.assertIs(type(normalize(form, MyStr(input_str))), str)
8841100

8851101

1102+
class GraphemeBreakTest(unittest.TestCase):
    # Runs unicodedata._iter_graphemes() against the GraphemeBreakTest.txt
    # data file downloaded for the Unicode version of the unicodedata module.

    @staticmethod
    def check_version(testfile):
        # Accept the file only if its first line mentions the Unicode
        # version of the unicodedata module.
        first_line = testfile.readline()
        return unicodedata.unidata_version in first_line

    @requires_resource('network')
    def test_grapheme_break(self):
        data_file = "auxiliary/GraphemeBreakTest.txt"
        data_url = f"https://www.unicode.org/Public/{unicodedata.unidata_version}/ucd/{data_file}"

        # Hit the exception early
        try:
            testdata = open_urlresource(data_url, encoding="utf-8",
                                        check=self.check_version)
        except PermissionError:
            self.skipTest(f"Permission error when downloading {data_url} "
                          f"into the test data directory")
        except (OSError, HTTPException) as exc:
            self.skipTest(f"Failed to download {data_url}: {exc}")
        else:
            with testdata:
                self.run_grapheme_break_tests(testdata)

    def _check_segments(self, segments, clusters, starts, ends, comment):
        # Compare the text and the start/end offsets of every segment.
        self.assertEqual([str(seg) for seg in segments], clusters, comment)
        self.assertEqual([seg.start for seg in segments], starts, comment)
        self.assertEqual([seg.end for seg in segments], ends, comment)

    def run_grapheme_break_tests(self, testdata):
        # Each data line is a sequence of hexadecimal code points
        # separated by '÷' (break) or '×' (no break); text after '#' is
        # a comment, used here as the assertion message.
        for raw in testdata:
            data, _, comment = raw.partition('#')
            data = data.strip()
            if not data:
                continue
            comment = comment.strip()

            clusters = []    # text of each expected cluster
            boundaries = []  # character offset of each '÷' marker
            offset = 0
            for token in data.replace('×', ' ').split():
                if token == '÷':
                    clusters.append('')
                    boundaries.append(offset)
                else:
                    clusters[-1] += chr(int(token, 16))
                    offset += 1
            # The final '÷' opens a cluster that must stay empty.
            trailing = clusters.pop()
            self.assertEqual(trailing, '', data)

            text = ''.join(clusters)
            with self.subTest(data):
                # Segment the whole string first ...
                self._check_segments(
                    list(unicodedata._iter_graphemes(text)),
                    clusters, boundaries[:-1], boundaries[1:], comment)
                # ... then restart at every interior boundary.
                for i in range(1, len(boundaries) - 1):
                    self._check_segments(
                        list(unicodedata._iter_graphemes(text, boundaries[i])),
                        clusters[i:], boundaries[i:-1], boundaries[i+1:],
                        comment)
1156+
1157+
8861158
if __name__ == "__main__":
8871159
unittest.main()

Misc/ACKS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1656,6 +1656,7 @@ Victor Salgado
16561656
Rich Salz
16571657
Kevin Samborn
16581658
Adrian Sampson
1659+
Guillaume Sanchez
16591660
Nevada Sanchez
16601661
James Sanders
16611662
Ilya Sandler

0 commit comments

Comments
 (0)