@@ -651,6 +651,206 @@ def test_east_asian_width_unassigned(self):
651651 self .assertEqual (eaw (char ), 'A' )
652652 self .assertIs (self .db .name (char , None ), None )
653653
654+ def test_grapheme_cluster_break (self ):
655+ gcb = self .db ._grapheme_cluster_break
656+ self .assertEqual (gcb (' ' ), 'Other' )
657+ self .assertEqual (gcb ('x' ), 'Other' )
658+ self .assertEqual (gcb ('\U0010FFFF ' ), 'Other' )
659+ self .assertEqual (gcb ('\r ' ), 'CR' )
660+ self .assertEqual (gcb ('\n ' ), 'LF' )
661+ self .assertEqual (gcb ('\0 ' ), 'Control' )
662+ self .assertEqual (gcb ('\t ' ), 'Control' )
663+ self .assertEqual (gcb ('\x1F ' ), 'Control' )
664+ self .assertEqual (gcb ('\x7F ' ), 'Control' )
665+ self .assertEqual (gcb ('\x9F ' ), 'Control' )
666+ self .assertEqual (gcb ('\U000E0001 ' ), 'Control' )
667+ self .assertEqual (gcb ('\u0300 ' ), 'Extend' )
668+ self .assertEqual (gcb ('\u200C ' ), 'Extend' )
669+ self .assertEqual (gcb ('\U000E01EF ' ), 'Extend' )
670+ self .assertEqual (gcb ('\u1159 ' ), 'L' )
671+ self .assertEqual (gcb ('\u11F9 ' ), 'T' )
672+ self .assertEqual (gcb ('\uD788 ' ), 'LV' )
673+ self .assertEqual (gcb ('\uD7A3 ' ), 'LVT' )
674+ # New in 5.0.0
675+ self .assertEqual (gcb ('\u05BA ' ), 'Extend' )
676+ self .assertEqual (gcb ('\u20EF ' ), 'Extend' )
677+ # New in 5.1.0
678+ self .assertEqual (gcb ('\u2064 ' ), 'Control' )
679+ self .assertEqual (gcb ('\uAA4D ' ), 'SpacingMark' )
680+ # New in 5.2.0
681+ self .assertEqual (gcb ('\u0816 ' ), 'Extend' )
682+ self .assertEqual (gcb ('\uA97C ' ), 'L' )
683+ self .assertEqual (gcb ('\uD7C6 ' ), 'V' )
684+ self .assertEqual (gcb ('\uD7FB ' ), 'T' )
685+ # New in 6.0.0
686+ self .assertEqual (gcb ('\u093A ' ), 'Extend' )
687+ self .assertEqual (gcb ('\U00011002 ' ), 'SpacingMark' )
688+ # New in 6.1.0
689+ self .assertEqual (gcb ('\U000E0FFF ' ), 'Control' )
690+ self .assertEqual (gcb ('\U00016F7E ' ), 'SpacingMark' )
691+ # New in 6.2.0
692+ self .assertEqual (gcb ('\U0001F1E6 ' ), 'Regional_Indicator' )
693+ self .assertEqual (gcb ('\U0001F1FF ' ), 'Regional_Indicator' )
694+ # New in 6.3.0
695+ self .assertEqual (gcb ('\u180E ' ), 'Control' )
696+ self .assertEqual (gcb ('\u1A1B ' ), 'Extend' )
697+ # New in 7.0.0
698+ self .assertEqual (gcb ('\u0E33 ' ), 'SpacingMark' )
699+ self .assertEqual (gcb ('\u0EB3 ' ), 'SpacingMark' )
700+ self .assertEqual (gcb ('\U0001BCA3 ' ), 'Control' )
701+ self .assertEqual (gcb ('\U0001E8D6 ' ), 'Extend' )
702+ self .assertEqual (gcb ('\U0001163E ' ), 'SpacingMark' )
703+ # New in 8.0.0
704+ self .assertEqual (gcb ('\u08E3 ' ), 'Extend' )
705+ self .assertEqual (gcb ('\U00011726 ' ), 'SpacingMark' )
706+ # New in 9.0.0
707+ self .assertEqual (gcb ('\u0600 ' ), 'Prepend' )
708+ self .assertEqual (gcb ('\U000E007F ' ), 'Extend' )
709+ self .assertEqual (gcb ('\U00011CB4 ' ), 'SpacingMark' )
710+ self .assertEqual (gcb ('\u200D ' ), 'ZWJ' )
711+ # New in 10.0.0
712+ self .assertEqual (gcb ('\U00011D46 ' ), 'Prepend' )
713+ self .assertEqual (gcb ('\U00011D47 ' ), 'Extend' )
714+ self .assertEqual (gcb ('\U00011A97 ' ), 'SpacingMark' )
715+ # New in 11.0.0
716+ self .assertEqual (gcb ('\U000110CD ' ), 'Prepend' )
717+ self .assertEqual (gcb ('\u07FD ' ), 'Extend' )
718+ self .assertEqual (gcb ('\U00011EF6 ' ), 'SpacingMark' )
719+ # New in 12.0.0
720+ self .assertEqual (gcb ('\U00011A84 ' ), 'Prepend' )
721+ self .assertEqual (gcb ('\U00013438 ' ), 'Control' )
722+ self .assertEqual (gcb ('\U0001E2EF ' ), 'Extend' )
723+ self .assertEqual (gcb ('\U00016F87 ' ), 'SpacingMark' )
724+ # New in 13.0.0
725+ self .assertEqual (gcb ('\U00011941 ' ), 'Prepend' )
726+ self .assertEqual (gcb ('\U00016FE4 ' ), 'Extend' )
727+ self .assertEqual (gcb ('\U00011942 ' ), 'SpacingMark' )
728+ # New in 14.0.0
729+ self .assertEqual (gcb ('\u0891 ' ), 'Prepend' )
730+ self .assertEqual (gcb ('\U0001E2AE ' ), 'Extend' )
731+ # New in 15.0.0
732+ self .assertEqual (gcb ('\U00011F02 ' ), 'Prepend' )
733+ self .assertEqual (gcb ('\U0001343F ' ), 'Control' )
734+ self .assertEqual (gcb ('\U0001E4EF ' ), 'Extend' )
735+ self .assertEqual (gcb ('\U00011F3F ' ), 'SpacingMark' )
736+ # New in 16.0.0
737+ self .assertEqual (gcb ('\U000113D1 ' ), 'Prepend' )
738+ self .assertEqual (gcb ('\U0001E5EF ' ), 'Extend' )
739+ self .assertEqual (gcb ('\U0001612C ' ), 'SpacingMark' )
740+ self .assertEqual (gcb ('\U00016D63 ' ), 'V' )
741+ self .assertRaises (TypeError , gcb )
742+ self .assertRaises (TypeError , gcb , b'x' )
743+ self .assertRaises (TypeError , gcb , 120 )
744+ self .assertRaises (TypeError , gcb , '' )
745+ self .assertRaises (TypeError , gcb , 'xx' )
746+
747+ def test_indic_conjunct_break (self ):
748+ incb = self .db ._indic_conjunct_break
749+ self .assertEqual (incb (' ' ), 'None' )
750+ self .assertEqual (incb ('x' ), 'None' )
751+ self .assertEqual (incb ('\U0010FFFF ' ), 'None' )
752+ # New in 15.1.0
753+ self .assertEqual (incb ('\u094D ' ), 'Linker' )
754+ self .assertEqual (incb ('\u0D4D ' ), 'Linker' )
755+ self .assertEqual (incb ('\u0915 ' ), 'Consonant' )
756+ self .assertEqual (incb ('\u0D3A ' ), 'Consonant' )
757+ self .assertEqual (incb ('\u0300 ' ), 'Extend' )
758+ self .assertEqual (incb ('\U0001E94A ' ), 'Extend' )
759+ # New in 16.0.0
760+ self .assertEqual (incb ('\u034F ' ), 'Extend' )
761+ self .assertEqual (incb ('\U000E01EF ' ), 'Extend' )
762+ self .assertRaises (TypeError , incb )
763+ self .assertRaises (TypeError , incb , b'x' )
764+ self .assertRaises (TypeError , incb , 120 )
765+ self .assertRaises (TypeError , incb , '' )
766+ self .assertRaises (TypeError , incb , 'xx' )
767+
768+ def test_extended_pictographic (self ):
769+ ext_pict = self .db ._extended_pictographic
770+ self .assertIs (ext_pict (' ' ), False )
771+ self .assertIs (ext_pict ('x' ), False )
772+ self .assertIs (ext_pict ('\U0010FFFF ' ), False )
773+ # New in 13.0.0
774+ self .assertIs (ext_pict ('\xA9 ' ), True )
775+ self .assertIs (ext_pict ('\u203C ' ), True )
776+ self .assertIs (ext_pict ('\U0001FAD6 ' ), True )
777+ self .assertIs (ext_pict ('\U0001FFFD ' ), True )
778+ self .assertRaises (TypeError , ext_pict )
779+ self .assertRaises (TypeError , ext_pict , b'x' )
780+ self .assertRaises (TypeError , ext_pict , 120 )
781+ self .assertRaises (TypeError , ext_pict , '' )
782+ self .assertRaises (TypeError , ext_pict , 'xx' )
783+
784+ def test_grapheme_break (self ):
785+ def graphemes (* args ):
786+ return list (map (str , self .db ._iter_graphemes (* args )))
787+
788+ self .assertRaises (TypeError , self .db ._iter_graphemes )
789+ self .assertRaises (TypeError , self .db ._iter_graphemes , b'x' )
790+ self .assertRaises (TypeError , self .db ._iter_graphemes , 'x' , 0 , 0 , 0 )
791+
792+ self .assertEqual (graphemes ('' ), [])
793+ self .assertEqual (graphemes ('abcd' ), ['a' , 'b' , 'c' , 'd' ])
794+ self .assertEqual (graphemes ('abcd' , 1 ), ['b' , 'c' , 'd' ])
795+ self .assertEqual (graphemes ('abcd' , 1 , 3 ), ['b' , 'c' ])
796+ self .assertEqual (graphemes ('abcd' , - 3 ), ['b' , 'c' , 'd' ])
797+ self .assertEqual (graphemes ('abcd' , 1 , - 1 ), ['b' , 'c' ])
798+ self .assertEqual (graphemes ('abcd' , 3 , 1 ), [])
799+ self .assertEqual (graphemes ('abcd' , 5 ), [])
800+ self .assertEqual (graphemes ('abcd' , 0 , 5 ), ['a' , 'b' , 'c' , 'd' ])
801+ self .assertEqual (graphemes ('abcd' , - 5 ), ['a' , 'b' , 'c' , 'd' ])
802+ self .assertEqual (graphemes ('abcd' , 0 , - 5 ), [])
803+ # GB3
804+ self .assertEqual (graphemes ('\r \n ' ), ['\r \n ' ])
805+ # GB4
806+ self .assertEqual (graphemes ('\r \u0308 ' ), ['\r ' , '\u0308 ' ])
807+ self .assertEqual (graphemes ('\n \u0308 ' ), ['\n ' , '\u0308 ' ])
808+ self .assertEqual (graphemes ('\0 \u0308 ' ), ['\0 ' , '\u0308 ' ])
809+ # GB5
810+ self .assertEqual (graphemes ('\u06dd \r ' ), ['\u06dd ' , '\r ' ])
811+ self .assertEqual (graphemes ('\u06dd \n ' ), ['\u06dd ' , '\n ' ])
812+ self .assertEqual (graphemes ('\u06dd \0 ' ), ['\u06dd ' , '\0 ' ])
813+ # GB6
814+ self .assertEqual (graphemes ('\u1100 \u1160 ' ), ['\u1100 \u1160 ' ])
815+ self .assertEqual (graphemes ('\u1100 \uAC00 ' ), ['\u1100 \uAC00 ' ])
816+ self .assertEqual (graphemes ('\u1100 \uAC01 ' ), ['\u1100 \uAC01 ' ])
817+ # GB7
818+ self .assertEqual (graphemes ('\uAC00 \u1160 ' ), ['\uAC00 \u1160 ' ])
819+ self .assertEqual (graphemes ('\uAC00 \u11A8 ' ), ['\uAC00 \u11A8 ' ])
820+ self .assertEqual (graphemes ('\u1160 \u1160 ' ), ['\u1160 \u1160 ' ])
821+ self .assertEqual (graphemes ('\u1160 \u11A8 ' ), ['\u1160 \u11A8 ' ])
822+ # GB8
823+ self .assertEqual (graphemes ('\uAC01 \u11A8 ' ), ['\uAC01 \u11A8 ' ])
824+ self .assertEqual (graphemes ('\u11A8 \u11A8 ' ), ['\u11A8 \u11A8 ' ])
825+ # GB9
826+ self .assertEqual (graphemes ('a\u0300 ' ), ['a\u0300 ' ])
827+ self .assertEqual (graphemes ('a\u200D ' ), ['a\u200D ' ])
828+ # GB9a
829+ self .assertEqual (graphemes ('\u0905 \u0903 ' ), ['\u0905 \u0903 ' ])
830+ # GB9b
831+ self .assertEqual (graphemes ('\u06dd \u0661 ' ), ['\u06dd \u0661 ' ])
832+ # GB9c
833+ self .assertEqual (graphemes ('\u0915 \u094d \u0924 ' ),
834+ ['\u0915 \u094d \u0924 ' ])
835+ self .assertEqual (graphemes ('\u0915 \u094D \u094D \u0924 ' ),
836+ ['\u0915 \u094D \u094D \u0924 ' ])
837+ self .assertEqual (graphemes ('\u0915 \u094D \u0924 \u094D \u092F ' ),
838+ ['\u0915 \u094D \u0924 \u094D \u092F ' ])
839+ # GB11
840+ self .assertEqual (graphemes (
841+ '\U0001F9D1 \U0001F3FE \u200D \u2764 \uFE0F '
842+ '\u200D \U0001F48B \u200D \U0001F9D1 \U0001F3FC ' ),
843+ ['\U0001F9D1 \U0001F3FE \u200D \u2764 \uFE0F '
844+ '\u200D \U0001F48B \u200D \U0001F9D1 \U0001F3FC ' ])
845+ # GB12
846+ self .assertEqual (graphemes (
847+ '\U0001F1FA \U0001F1E6 \U0001F1FA \U0001F1F3 ' ),
848+ ['\U0001F1FA \U0001F1E6 ' , '\U0001F1FA \U0001F1F3 ' ])
849+ # GB13
850+ self .assertEqual (graphemes (
851+ 'a\U0001F1FA \U0001F1E6 \U0001F1FA \U0001F1F3 ' ),
852+ ['a' , '\U0001F1FA \U0001F1E6 ' , '\U0001F1FA \U0001F1F3 ' ])
853+
654854
655855class Unicode_3_2_0_FunctionsTest (UnicodeFunctionsTest ):
656856 db = unicodedata .ucd_3_2_0
@@ -659,6 +859,11 @@ class Unicode_3_2_0_FunctionsTest(UnicodeFunctionsTest):
659859 if quicktest else
660860 'caf1a7f2f380f927461837f1901ef20683f98683' )
661861
# NOTE(review): presumably these None assignments mask the grapheme-related
# tests inherited from UnicodeFunctionsTest so they are not run against the
# 3.2.0 database -- confirm against the base class.
862+ test_grapheme_cluster_break = None
863+ test_indic_conjunct_break = None
864+ test_extended_pictographic = None
865+ test_grapheme_break = None
866+
662867
663868class UnicodeMiscTest (unittest .TestCase ):
664869 db = unicodedata
@@ -761,6 +966,17 @@ def test_linebreak_7643(self):
761966 self .assertEqual (len (lines ), 1 ,
762967 r"%a should not be a linebreak" % c )
763968
969+ def test_segment_object (self ):
970+ segments = list (unicodedata ._iter_graphemes ('spa\u0300 m' ))
971+ self .assertEqual (len (segments ), 4 , segments )
972+ segment = segments [2 ]
973+ self .assertEqual (segment .start , 2 )
974+ self .assertEqual (segment .end , 4 )
975+ self .assertEqual (str (segment ), 'a\u0300 ' )
976+ self .assertEqual (repr (segment ), '<Segment 2:4>' )
977+ self .assertRaises (TypeError , iter , segment )
978+ self .assertRaises (TypeError , len , segment )
979+
764980
765981class NormalizationTest (unittest .TestCase ):
766982 @staticmethod
@@ -883,5 +1099,61 @@ class MyStr(str):
8831099 self .assertIs (type (normalize (form , MyStr (input_str ))), str )
8841100
8851101
class GraphemeBreakTest(unittest.TestCase):
    """Check unicodedata._iter_graphemes against GraphemeBreakTest.txt.

    The data file is requested for ``unicodedata.unidata_version`` and each
    data line is compared with the segments produced for the same text.
    """

    @staticmethod
    def check_version(testfile):
        """Read one line from *testfile* and report whether it mentions
        ``unicodedata.unidata_version``."""
        hdr = testfile.readline()
        return unicodedata.unidata_version in hdr

    @requires_resource('network')
    def test_grapheme_break(self):
        TESTDATAFILE = "auxiliary/GraphemeBreakTest.txt"
        # No whitespace may end up inside the URL.
        TESTDATAURL = (f"https://www.unicode.org/Public/"
                       f"{unicodedata.unidata_version}/ucd/{TESTDATAFILE}")

        # Hit the exception early
        try:
            testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
                                        check=self.check_version)
        except PermissionError:
            self.skipTest(f"Permission error when downloading {TESTDATAURL} "
                          f"into the test data directory")
        except (OSError, HTTPException) as exc:
            self.skipTest(f"Failed to download {TESTDATAURL}: {exc}")

        with testdata:
            self.run_grapheme_break_tests(testdata)

    @staticmethod
    def _parse_break_line(line):
        """Split one data line into text chunks and break offsets.

        Fields are whitespace separated: '÷' starts a new chunk and records
        the current code point offset, '×' is ignored, and every other
        field is a hexadecimal code point appended to the current chunk.

        Returns ``(chunks, breaks)``.  A line that ends with '÷' leaves an
        empty string as the last chunk.
        """
        chunks = []
        breaks = []
        pos = 0
        for field in line.replace('×', ' ').split():
            if field == '÷':
                chunks.append('')
                breaks.append(pos)
            else:
                chunks[-1] += chr(int(field, 16))
                pos += 1
        return chunks, breaks

    def run_grapheme_break_tests(self, testdata):
        # *testdata* is iterated line by line.  Text after '#' is a comment
        # and is only used as the assertion message.
        for line in testdata:
            line, _, comment = line.partition('#')
            line = line.strip()
            if not line:
                continue
            comment = comment.strip()

            chunks, breaks = self._parse_break_line(line)
            # The line must end with a break marker, which leaves exactly
            # one empty trailing chunk behind.
            self.assertEqual(chunks.pop(), '', line)
            text = ''.join(chunks)
            with self.subTest(line):
                result = list(unicodedata._iter_graphemes(text))
                self.assertEqual(list(map(str, result)), chunks, comment)
                self.assertEqual([x.start for x in result],
                                 breaks[:-1], comment)
                self.assertEqual([x.end for x in result],
                                 breaks[1:], comment)
                # Starting at any interior break must yield the same tail.
                for i in range(1, len(breaks) - 1):
                    result = list(
                        unicodedata._iter_graphemes(text, breaks[i]))
                    self.assertEqual(list(map(str, result)),
                                     chunks[i:], comment)
                    self.assertEqual([x.start for x in result],
                                     breaks[i:-1], comment)
                    self.assertEqual([x.end for x in result],
                                     breaks[i + 1:], comment)
1156+
1157+
8861158if __name__ == "__main__" :
8871159 unittest .main ()
0 commit comments