@@ -616,6 +616,221 @@ def test_isxidcontinue(self):
616616 self .assertRaises (TypeError , self .db .isxidcontinue )
617617 self .assertRaises (TypeError , self .db .isxidcontinue , 'xx' )
618618
619+ def test_grapheme_cluster_break (self ):
620+ gcb = self .db .grapheme_cluster_break
621+ self .assertEqual (gcb (' ' ), 'Other' )
622+ self .assertEqual (gcb ('x' ), 'Other' )
623+ self .assertEqual (gcb ('\U0010FFFF ' ), 'Other' )
624+ self .assertEqual (gcb ('\r ' ), 'CR' )
625+ self .assertEqual (gcb ('\n ' ), 'LF' )
626+ self .assertEqual (gcb ('\0 ' ), 'Control' )
627+ self .assertEqual (gcb ('\t ' ), 'Control' )
628+ self .assertEqual (gcb ('\x1F ' ), 'Control' )
629+ self .assertEqual (gcb ('\x7F ' ), 'Control' )
630+ self .assertEqual (gcb ('\x9F ' ), 'Control' )
631+ self .assertEqual (gcb ('\U000E0001 ' ), 'Control' )
632+ self .assertEqual (gcb ('\u0300 ' ), 'Extend' )
633+ self .assertEqual (gcb ('\u200C ' ), 'Extend' )
634+ self .assertEqual (gcb ('\U000E01EF ' ), 'Extend' )
635+ self .assertEqual (gcb ('\u1159 ' ), 'L' )
636+ self .assertEqual (gcb ('\u11F9 ' ), 'T' )
637+ self .assertEqual (gcb ('\uD788 ' ), 'LV' )
638+ self .assertEqual (gcb ('\uD7A3 ' ), 'LVT' )
639+ # New in 5.0.0
640+ self .assertEqual (gcb ('\u05BA ' ), 'Extend' )
641+ self .assertEqual (gcb ('\u20EF ' ), 'Extend' )
642+ # New in 5.1.0
643+ self .assertEqual (gcb ('\u2064 ' ), 'Control' )
644+ self .assertEqual (gcb ('\uAA4D ' ), 'SpacingMark' )
645+ # New in 5.2.0
646+ self .assertEqual (gcb ('\u0816 ' ), 'Extend' )
647+ self .assertEqual (gcb ('\uA97C ' ), 'L' )
648+ self .assertEqual (gcb ('\uD7C6 ' ), 'V' )
649+ self .assertEqual (gcb ('\uD7FB ' ), 'T' )
650+ # New in 6.0.0
651+ self .assertEqual (gcb ('\u093A ' ), 'Extend' )
652+ self .assertEqual (gcb ('\U00011002 ' ), 'SpacingMark' )
653+ # New in 6.1.0
654+ self .assertEqual (gcb ('\U000E0FFF ' ), 'Control' )
655+ self .assertEqual (gcb ('\U00016F7E ' ), 'SpacingMark' )
656+ # New in 6.2.0
657+ self .assertEqual (gcb ('\U0001F1E6 ' ), 'Regional_Indicator' )
658+ self .assertEqual (gcb ('\U0001F1FF ' ), 'Regional_Indicator' )
659+ # New in 6.3.0
660+ self .assertEqual (gcb ('\u180E ' ), 'Control' )
661+ self .assertEqual (gcb ('\u1A1B ' ), 'Extend' )
662+ # New in 7.0.0
663+ self .assertEqual (gcb ('\u0E33 ' ), 'SpacingMark' )
664+ self .assertEqual (gcb ('\u0EB3 ' ), 'SpacingMark' )
665+ self .assertEqual (gcb ('\U0001BCA3 ' ), 'Control' )
666+ self .assertEqual (gcb ('\U0001E8D6 ' ), 'Extend' )
667+ self .assertEqual (gcb ('\U0001163E ' ), 'SpacingMark' )
668+ # New in 8.0.0
669+ self .assertEqual (gcb ('\u08E3 ' ), 'Extend' )
670+ self .assertEqual (gcb ('\U00011726 ' ), 'SpacingMark' )
671+ # New in 9.0.0
672+ self .assertEqual (gcb ('\u0600 ' ), 'Prepend' )
673+ self .assertEqual (gcb ('\U000E007F ' ), 'Extend' )
674+ self .assertEqual (gcb ('\U00011CB4 ' ), 'SpacingMark' )
675+ self .assertEqual (gcb ('\u200D ' ), 'ZWJ' )
676+ # New in 10.0.0
677+ self .assertEqual (gcb ('\U00011D46 ' ), 'Prepend' )
678+ self .assertEqual (gcb ('\U00011D47 ' ), 'Extend' )
679+ self .assertEqual (gcb ('\U00011A97 ' ), 'SpacingMark' )
680+ # New in 11.0.0
681+ self .assertEqual (gcb ('\U000110CD ' ), 'Prepend' )
682+ self .assertEqual (gcb ('\u07FD ' ), 'Extend' )
683+ self .assertEqual (gcb ('\U00011EF6 ' ), 'SpacingMark' )
684+ # New in 12.0.0
685+ self .assertEqual (gcb ('\U00011A84 ' ), 'Prepend' )
686+ self .assertEqual (gcb ('\U00013438 ' ), 'Control' )
687+ self .assertEqual (gcb ('\U0001E2EF ' ), 'Extend' )
688+ self .assertEqual (gcb ('\U00016F87 ' ), 'SpacingMark' )
689+ # New in 13.0.0
690+ self .assertEqual (gcb ('\U00011941 ' ), 'Prepend' )
691+ self .assertEqual (gcb ('\U00016FE4 ' ), 'Extend' )
692+ self .assertEqual (gcb ('\U00011942 ' ), 'SpacingMark' )
693+ # New in 14.0.0
694+ self .assertEqual (gcb ('\u0891 ' ), 'Prepend' )
695+ self .assertEqual (gcb ('\U0001E2AE ' ), 'Extend' )
696+ # New in 15.0.0
697+ self .assertEqual (gcb ('\U00011F02 ' ), 'Prepend' )
698+ self .assertEqual (gcb ('\U0001343F ' ), 'Control' )
699+ self .assertEqual (gcb ('\U0001E4EF ' ), 'Extend' )
700+ self .assertEqual (gcb ('\U00011F3F ' ), 'SpacingMark' )
701+ # New in 16.0.0
702+ self .assertEqual (gcb ('\U000113D1 ' ), 'Prepend' )
703+ self .assertEqual (gcb ('\U0001E5EF ' ), 'Extend' )
704+ self .assertEqual (gcb ('\U0001612C ' ), 'SpacingMark' )
705+ self .assertEqual (gcb ('\U00016D63 ' ), 'V' )
706+ # New in 17.0.0
707+ self .assertEqual (gcb ('\u1AEB ' ), 'Extend' )
708+ self .assertEqual (gcb ('\U00011B67 ' ), 'SpacingMark' )
709+
710+ self .assertRaises (TypeError , gcb )
711+ self .assertRaises (TypeError , gcb , b'x' )
712+ self .assertRaises (TypeError , gcb , 120 )
713+ self .assertRaises (TypeError , gcb , '' )
714+ self .assertRaises (TypeError , gcb , 'xx' )
715+
716+ def test_indic_conjunct_break (self ):
717+ incb = self .db .indic_conjunct_break
718+ self .assertEqual (incb (' ' ), 'None' )
719+ self .assertEqual (incb ('x' ), 'None' )
720+ self .assertEqual (incb ('\U0010FFFF ' ), 'None' )
721+ # New in 15.1.0
722+ self .assertEqual (incb ('\u094D ' ), 'Linker' )
723+ self .assertEqual (incb ('\u0D4D ' ), 'Linker' )
724+ self .assertEqual (incb ('\u0915 ' ), 'Consonant' )
725+ self .assertEqual (incb ('\u0D3A ' ), 'Consonant' )
726+ self .assertEqual (incb ('\u0300 ' ), 'Extend' )
727+ self .assertEqual (incb ('\U0001E94A ' ), 'Extend' )
728+ # New in 16.0.0
729+ self .assertEqual (incb ('\u034F ' ), 'Extend' )
730+ self .assertEqual (incb ('\U000E01EF ' ), 'Extend' )
731+ # New in 17.0.0
732+ self .assertEqual (incb ('\u1039 ' ), 'Linker' )
733+ self .assertEqual (incb ('\U00011F42 ' ), 'Linker' )
734+ self .assertEqual (incb ('\u1000 ' ), 'Consonant' )
735+ self .assertEqual (incb ('\U00011F33 ' ), 'Consonant' )
736+ self .assertEqual (incb ('\U0001E6F5 ' ), 'Extend' )
737+
738+ self .assertRaises (TypeError , incb )
739+ self .assertRaises (TypeError , incb , b'x' )
740+ self .assertRaises (TypeError , incb , 120 )
741+ self .assertRaises (TypeError , incb , '' )
742+ self .assertRaises (TypeError , incb , 'xx' )
743+
744+ def test_extended_pictographic (self ):
745+ ext_pict = self .db .extended_pictographic
746+ self .assertIs (ext_pict (' ' ), False )
747+ self .assertIs (ext_pict ('x' ), False )
748+ self .assertIs (ext_pict ('\U0010FFFF ' ), False )
749+ # New in 13.0.0
750+ self .assertIs (ext_pict ('\xA9 ' ), True )
751+ self .assertIs (ext_pict ('\u203C ' ), True )
752+ self .assertIs (ext_pict ('\U0001FAD6 ' ), True )
753+ self .assertIs (ext_pict ('\U0001FFFD ' ), True )
754+ # New in 17.0.0
755+ self .assertIs (ext_pict ('\u2388 ' ), False )
756+ self .assertIs (ext_pict ('\U0001FA6D ' ), False )
757+
758+ self .assertRaises (TypeError , ext_pict )
759+ self .assertRaises (TypeError , ext_pict , b'x' )
760+ self .assertRaises (TypeError , ext_pict , 120 )
761+ self .assertRaises (TypeError , ext_pict , '' )
762+ self .assertRaises (TypeError , ext_pict , 'xx' )
763+
764+ def test_grapheme_break (self ):
765+ def graphemes (* args ):
766+ return list (map (str , self .db .iter_graphemes (* args )))
767+
768+ self .assertRaises (TypeError , self .db .iter_graphemes )
769+ self .assertRaises (TypeError , self .db .iter_graphemes , b'x' )
770+ self .assertRaises (TypeError , self .db .iter_graphemes , 'x' , 0 , 0 , 0 )
771+
772+ self .assertEqual (graphemes ('' ), [])
773+ self .assertEqual (graphemes ('abcd' ), ['a' , 'b' , 'c' , 'd' ])
774+ self .assertEqual (graphemes ('abcd' , 1 ), ['b' , 'c' , 'd' ])
775+ self .assertEqual (graphemes ('abcd' , 1 , 3 ), ['b' , 'c' ])
776+ self .assertEqual (graphemes ('abcd' , - 3 ), ['b' , 'c' , 'd' ])
777+ self .assertEqual (graphemes ('abcd' , 1 , - 1 ), ['b' , 'c' ])
778+ self .assertEqual (graphemes ('abcd' , 3 , 1 ), [])
779+ self .assertEqual (graphemes ('abcd' , 5 ), [])
780+ self .assertEqual (graphemes ('abcd' , 0 , 5 ), ['a' , 'b' , 'c' , 'd' ])
781+ self .assertEqual (graphemes ('abcd' , - 5 ), ['a' , 'b' , 'c' , 'd' ])
782+ self .assertEqual (graphemes ('abcd' , 0 , - 5 ), [])
783+ # GB3
784+ self .assertEqual (graphemes ('\r \n ' ), ['\r \n ' ])
785+ # GB4
786+ self .assertEqual (graphemes ('\r \u0308 ' ), ['\r ' , '\u0308 ' ])
787+ self .assertEqual (graphemes ('\n \u0308 ' ), ['\n ' , '\u0308 ' ])
788+ self .assertEqual (graphemes ('\0 \u0308 ' ), ['\0 ' , '\u0308 ' ])
789+ # GB5
790+ self .assertEqual (graphemes ('\u06dd \r ' ), ['\u06dd ' , '\r ' ])
791+ self .assertEqual (graphemes ('\u06dd \n ' ), ['\u06dd ' , '\n ' ])
792+ self .assertEqual (graphemes ('\u06dd \0 ' ), ['\u06dd ' , '\0 ' ])
793+ # GB6
794+ self .assertEqual (graphemes ('\u1100 \u1160 ' ), ['\u1100 \u1160 ' ])
795+ self .assertEqual (graphemes ('\u1100 \uAC00 ' ), ['\u1100 \uAC00 ' ])
796+ self .assertEqual (graphemes ('\u1100 \uAC01 ' ), ['\u1100 \uAC01 ' ])
797+ # GB7
798+ self .assertEqual (graphemes ('\uAC00 \u1160 ' ), ['\uAC00 \u1160 ' ])
799+ self .assertEqual (graphemes ('\uAC00 \u11A8 ' ), ['\uAC00 \u11A8 ' ])
800+ self .assertEqual (graphemes ('\u1160 \u1160 ' ), ['\u1160 \u1160 ' ])
801+ self .assertEqual (graphemes ('\u1160 \u11A8 ' ), ['\u1160 \u11A8 ' ])
802+ # GB8
803+ self .assertEqual (graphemes ('\uAC01 \u11A8 ' ), ['\uAC01 \u11A8 ' ])
804+ self .assertEqual (graphemes ('\u11A8 \u11A8 ' ), ['\u11A8 \u11A8 ' ])
805+ # GB9
806+ self .assertEqual (graphemes ('a\u0300 ' ), ['a\u0300 ' ])
807+ self .assertEqual (graphemes ('a\u200D ' ), ['a\u200D ' ])
808+ # GB9a
809+ self .assertEqual (graphemes ('\u0905 \u0903 ' ), ['\u0905 \u0903 ' ])
810+ # GB9b
811+ self .assertEqual (graphemes ('\u06dd \u0661 ' ), ['\u06dd \u0661 ' ])
812+ # GB9c
813+ self .assertEqual (graphemes ('\u0915 \u094d \u0924 ' ),
814+ ['\u0915 \u094d \u0924 ' ])
815+ self .assertEqual (graphemes ('\u0915 \u094D \u094D \u0924 ' ),
816+ ['\u0915 \u094D \u094D \u0924 ' ])
817+ self .assertEqual (graphemes ('\u0915 \u094D \u0924 \u094D \u092F ' ),
818+ ['\u0915 \u094D \u0924 \u094D \u092F ' ])
819+ # GB11
820+ self .assertEqual (graphemes (
821+ '\U0001F9D1 \U0001F3FE \u200D \u2764 \uFE0F '
822+ '\u200D \U0001F48B \u200D \U0001F9D1 \U0001F3FC ' ),
823+ ['\U0001F9D1 \U0001F3FE \u200D \u2764 \uFE0F '
824+ '\u200D \U0001F48B \u200D \U0001F9D1 \U0001F3FC ' ])
825+ # GB12
826+ self .assertEqual (graphemes (
827+ '\U0001F1FA \U0001F1E6 \U0001F1FA \U0001F1F3 ' ),
828+ ['\U0001F1FA \U0001F1E6 ' , '\U0001F1FA \U0001F1F3 ' ])
829+ # GB13
830+ self .assertEqual (graphemes (
831+ 'a\U0001F1FA \U0001F1E6 \U0001F1FA \U0001F1F3 ' ),
832+ ['a' , '\U0001F1FA \U0001F1E6 ' , '\U0001F1FA \U0001F1F3 ' ])
833+
619834
620835class Unicode_3_2_0_FunctionsTest (UnicodeFunctionsTest ):
621836 db = unicodedata .ucd_3_2_0
@@ -624,6 +839,11 @@ class Unicode_3_2_0_FunctionsTest(UnicodeFunctionsTest):
624839 if quicktest else
625840 'f217b8688d7bdff31db4207e078a96702f091597' )
626841
842+ test_grapheme_cluster_break = None
843+ test_indic_conjunct_break = None
844+ test_extended_pictographic = None
845+ test_grapheme_break = None
846+
627847
628848class UnicodeMiscTest (unittest .TestCase ):
629849 db = unicodedata
@@ -726,6 +946,17 @@ def test_linebreak_7643(self):
726946 self .assertEqual (len (lines ), 1 ,
727947 r"%a should not be a linebreak" % c )
728948
def test_segment_object(self):
    # Check the objects yielded by unicodedata.iter_graphemes(): their
    # start/end offsets, str() and repr(), and that they support neither
    # iter() nor len().
    # 's', 'p', 'a' + combining grave accent, 'm' -> four clusters.
    segments = list(unicodedata.iter_graphemes('spa\u0300m'))
    self.assertEqual(len(segments), 4, segments)
    segment = segments[2]
    self.assertEqual(segment.start, 2)
    self.assertEqual(segment.end, 4)
    self.assertEqual(str(segment), 'a\u0300')
    self.assertEqual(repr(segment), '<Segment 2:4>')
    self.assertRaises(TypeError, iter, segment)
    self.assertRaises(TypeError, len, segment)
959+
729960
730961class NormalizationTest (unittest .TestCase ):
731962 @staticmethod
@@ -848,5 +1079,61 @@ class MyStr(str):
8481079 self .assertIs (type (normalize (form , MyStr (input_str ))), str )
8491080
8501081
class GraphemeBreakTest(unittest.TestCase):
    """Check unicodedata.iter_graphemes() against GraphemeBreakTest.txt."""

    @staticmethod
    def check_version(testfile):
        """Return True if the first line of *testfile* mentions the
        Unicode version reported by unicodedata.unidata_version."""
        hdr = testfile.readline()
        return unicodedata.unidata_version in hdr

    @requires_resource('network')
    def test_grapheme_break(self):
        # Fetch the test data for the running Unicode version and run it;
        # the test is skipped when the data cannot be obtained.
        TESTDATAFILE = "auxiliary/GraphemeBreakTest.txt"
        TESTDATAURL = ("https://www.unicode.org/Public/"
                       f"{unicodedata.unidata_version}/ucd/{TESTDATAFILE}")

        # Hit the exception early
        try:
            testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
                                        check=self.check_version)
        except PermissionError:
            self.skipTest(f"Permission error when downloading {TESTDATAURL} "
                          f"into the test data directory")
        except (OSError, HTTPException) as exc:
            self.skipTest(f"Failed to download {TESTDATAURL}: {exc}")

        with testdata:
            self.run_grapheme_break_tests(testdata)

    @staticmethod
    def _parse_test_line(line):
        """Split one data line into (chunks, breaks).

        Fields are whitespace separated: '÷' marks a break, '×' marks
        "no break" and is ignored, and every other field is a hexadecimal
        code point.  *chunks* holds the text following each '÷' (so a
        line ending in '÷' yields a trailing empty chunk) and *breaks*
        holds the code point offset of each '÷'.
        """
        chunks = []
        breaks = []
        pos = 0
        for field in line.replace('×', ' ').split():
            if field == '÷':
                chunks.append('')
                breaks.append(pos)
            else:
                chunks[-1] += chr(int(field, 16))
                pos += 1
        return chunks, breaks

    def run_grapheme_break_tests(self, testdata):
        """Run every non-empty data line of *testdata* as a subtest.

        Each line is segmented from the start of the string and then
        again from every interior break position; the clusters and their
        start/end offsets must match the line's break marks.
        """
        for line in testdata:
            line, _, comment = line.partition('#')
            line = line.strip()
            if not line:
                continue
            comment = comment.strip()

            chunks, breaks = self._parse_test_line(line)
            # The data line must end with a break mark.
            self.assertEqual(chunks.pop(), '', line)
            text = ''.join(chunks)
            with self.subTest(line):
                result = list(unicodedata.iter_graphemes(text))
                self.assertEqual(list(map(str, result)), chunks, comment)
                self.assertEqual([x.start for x in result], breaks[:-1], comment)
                self.assertEqual([x.end for x in result], breaks[1:], comment)
                # Restarting at an interior break must yield the same tail.
                for i in range(1, len(breaks) - 1):
                    result = list(unicodedata.iter_graphemes(text, breaks[i]))
                    self.assertEqual(list(map(str, result)), chunks[i:], comment)
                    self.assertEqual([x.start for x in result], breaks[i:-1], comment)
                    self.assertEqual([x.end for x in result], breaks[i + 1:], comment)
1137+
# Run the tests in this file when it is executed as a script.
if __name__ == "__main__":
    unittest.main()
0 commit comments