Skip to content

Commit e6d41eb

Browse files
authored
[mypyc] Add support for str.lower() and str.upper() (#20948)
Fixes mypyc/mypyc#1088. Follow-up to #19375 with full Unicode support: - ASCII fast path. - Shared `CPyStr_ChangeCase` helper, parameterized by function pointers. - **`_PyUnicode_ToLowerFull`/`_PyUnicode_ToUpperFull`** for the Unicode path; these handle 1-to-N expansion (e.g., `ß`→`SS`). This was a sticking point with the previous PR, which relied on `Py_UNICODE_TOLOWER`/`Py_UNICODE_TOUPPER`. - Temporary `len * 3` UCS-4 buffer for the Unicode path, because a single code point can expand to as many as three code points when its case is changed.
1 parent 5c78458 commit e6d41eb

9 files changed

Lines changed: 167 additions & 1 deletion

File tree

mypyc/doc/str_operations.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ Methods
4141
* ``s.isalnum()``
4242
* ``s.isdigit()``
4343
* ``s.isspace()``
44+
* ``s.lower()``
4445
* ``s.join(x: Iterable)``
4546
* ``s.lstrip()``
4647
* ``s.lstrip(chars: str)``
@@ -65,6 +66,7 @@ Methods
6566
* ``s.splitlines(keepends: bool)``
6667
* ``s1.startswith(s2: str)``
6768
* ``s1.startswith(t: tuple[str, ...])``
69+
* ``s.upper()``
6870
* ``s.strip()``
6971
* ``s.strip(chars: str)``
7072

mypyc/lib-rt/CPy.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -780,6 +780,8 @@ Py_ssize_t CPyStr_Count(PyObject *unicode, PyObject *substring, CPyTagged start)
780780
Py_ssize_t CPyStr_CountFull(PyObject *unicode, PyObject *substring, CPyTagged start, CPyTagged end);
781781
CPyTagged CPyStr_Ord(PyObject *obj);
782782
PyObject *CPyStr_Multiply(PyObject *str, CPyTagged count);
783+
PyObject *CPyStr_Lower(PyObject *str);
784+
PyObject *CPyStr_Upper(PyObject *str);
783785
bool CPyStr_IsSpace(PyObject *str);
784786
bool CPyStr_IsAlnum(PyObject *str);
785787
bool CPyStr_IsDigit(PyObject *str);

mypyc/lib-rt/static_data.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ intern_strings(void) {
5555
INTERN_STRING(endswith, "endswith");
5656
INTERN_STRING(get_type_hints, "get_type_hints");
5757
INTERN_STRING(keys, "keys");
58+
INTERN_STRING(lower, "lower");
5859
INTERN_STRING(items, "items");
5960
INTERN_STRING(join, "join");
6061
INTERN_STRING(register_, "register");
@@ -66,6 +67,7 @@ intern_strings(void) {
6667
INTERN_STRING(throw_, "throw");
6768
INTERN_STRING(translate, "translate");
6869
INTERN_STRING(update, "update");
70+
INTERN_STRING(upper, "upper");
6971
INTERN_STRING(values, "values");
7072
return 0;
7173
}

mypyc/lib-rt/static_data.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ typedef struct mypyc_interned_str_struct {
4747
PyObject *endswith;
4848
PyObject *get_type_hints;
4949
PyObject *keys;
50+
PyObject *lower;
5051
PyObject *items;
5152
PyObject *join;
5253
PyObject *register_;
@@ -58,6 +59,7 @@ typedef struct mypyc_interned_str_struct {
5859
PyObject *throw_;
5960
PyObject *translate;
6061
PyObject *update;
62+
PyObject *upper;
6163
PyObject *values;
6264
} mypyc_interned_str_struct;
6365

mypyc/lib-rt/str_ops.c

Lines changed: 75 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
// The _PyUnicode_CheckConsistency definition has been moved to the internal API
1111
// https://github.com/python/cpython/pull/106398
12-
#if defined(Py_DEBUG) && defined(CPY_3_13_FEATURES)
12+
#if defined(Py_DEBUG) && CPY_3_13_FEATURES
1313
#include "internal/pycore_unicodeobject.h"
1414
#endif
1515

@@ -678,6 +678,80 @@ bool CPyStr_IsAlnum(PyObject *str) {
678678
return true;
679679
}
680680

681+
// Lowercase a single byte via Py_TOLOWER. Exists as a real function so it can
// be handed to CPyStr_ChangeCase as its ascii_func callback.
static inline int CPy_ASCII_Lower(unsigned char c) {
    const int lowered = Py_TOLOWER(c);
    return lowered;
}
682+
// Uppercase a single byte via Py_TOUPPER. Exists as a real function so it can
// be handed to CPyStr_ChangeCase as its ascii_func callback.
static inline int CPy_ASCII_Upper(unsigned char c) {
    const int uppered = Py_TOUPPER(c);
    return uppered;
}
683+
684+
static inline PyObject *CPyStr_ChangeCase(PyObject *self,
685+
int (*ascii_func)(unsigned char),
686+
#if CPY_3_13_FEATURES
687+
PyObject *method_name
688+
#else
689+
int (*unicode_func)(Py_UCS4, Py_UCS4 *)
690+
#endif
691+
) {
692+
Py_ssize_t len = PyUnicode_GET_LENGTH(self);
693+
if (len == 0) {
694+
Py_INCREF(self);
695+
return self;
696+
}
697+
698+
// ASCII fast path: 1-to-1, no expansion possible
699+
if (PyUnicode_IS_ASCII(self)) {
700+
PyObject *res = PyUnicode_New(len, 127);
701+
if (res == NULL) return NULL;
702+
const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
703+
Py_UCS1 *res_data = PyUnicode_1BYTE_DATA(res);
704+
for (Py_ssize_t i = 0; i < len; i++) {
705+
res_data[i] = ascii_func(data[i]);
706+
}
707+
return res;
708+
}
709+
710+
#if CPY_3_13_FEATURES
711+
// On 3.13+, _PyUnicode_ToLowerFull/ToUpperFull are no longer exported,
712+
// so fall back to CPython's method implementation for non-ASCII strings.
713+
return PyObject_CallMethodNoArgs(self, method_name);
714+
#else
715+
// General Unicode: unicode_func handles 1-to-N expansion.
716+
// Worst case: each codepoint expands to 3 (per Unicode standard).
717+
// The tmp buffer is short-lived, and PyUnicode_FromKindAndData
718+
// compacts the result to the optimal string kind automatically.
719+
int kind = PyUnicode_KIND(self);
720+
const void *data = PyUnicode_DATA(self);
721+
Py_UCS4 *tmp = PyMem_Malloc(sizeof(Py_UCS4) * len * 3);
722+
if (tmp == NULL) return PyErr_NoMemory();
723+
724+
Py_UCS4 mapped[3];
725+
Py_ssize_t out_len = 0;
726+
for (Py_ssize_t i = 0; i < len; i++) {
727+
int n = unicode_func(PyUnicode_READ(kind, data, i), mapped);
728+
for (int j = 0; j < n; j++) {
729+
tmp[out_len++] = mapped[j];
730+
}
731+
}
732+
733+
PyObject *res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, tmp, out_len);
734+
PyMem_Free(tmp);
735+
return res;
736+
#endif
737+
}
738+
739+
// Runtime entry point for str.lower(): forwards to CPyStr_ChangeCase with the
// lowercase ASCII mapper and the build-appropriate non-ASCII strategy.
PyObject *CPyStr_Lower(PyObject *self) {
    PyObject *lowered;
#if CPY_3_13_FEATURES
    lowered = CPyStr_ChangeCase(self, CPy_ASCII_Lower, mypyc_interned_str.lower);
#else
    lowered = CPyStr_ChangeCase(self, CPy_ASCII_Lower, _PyUnicode_ToLowerFull);
#endif
    return lowered;
}
746+
747+
// Runtime entry point for str.upper(): forwards to CPyStr_ChangeCase with the
// uppercase ASCII mapper and the build-appropriate non-ASCII strategy.
PyObject *CPyStr_Upper(PyObject *self) {
    PyObject *uppered;
#if CPY_3_13_FEATURES
    uppered = CPyStr_ChangeCase(self, CPy_ASCII_Upper, mypyc_interned_str.upper);
#else
    uppered = CPyStr_ChangeCase(self, CPy_ASCII_Upper, _PyUnicode_ToUpperFull);
#endif
    return uppered;
}
754+
681755
bool CPyStr_IsDigit(PyObject *str) {
682756
Py_ssize_t len = PyUnicode_GET_LENGTH(str);
683757
if (len == 0) return false;

mypyc/primitives/str_ops.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,24 @@
397397
error_kind=ERR_NEG_INT,
398398
)
399399

400+
# str.lower() -- implemented by the C helper CPyStr_Lower.
method_op(
    name="lower",
    c_function_name="CPyStr_Lower",
    arg_types=[str_rprimitive],
    return_type=str_rprimitive,
    error_kind=ERR_MAGIC,
)
408+
409+
# str.upper() -- implemented by the C helper CPyStr_Upper.
method_op(
    name="upper",
    c_function_name="CPyStr_Upper",
    arg_types=[str_rprimitive],
    return_type=str_rprimitive,
    error_kind=ERR_MAGIC,
)
417+
400418
method_op(
401419
name="isspace",
402420
arg_types=[str_rprimitive],

mypyc/test-data/fixtures/ir.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ def lstrip(self, item: Optional[str] = None) -> str: pass
120120
def rstrip(self, item: Optional[str] = None) -> str: pass
121121
def join(self, x: Iterable[str]) -> str: pass
122122
def format(self, *args: Any, **kwargs: Any) -> str: ...
123+
def lower(self) -> str: ...
123124
def upper(self) -> str: ...
124125
def startswith(self, x: Union[str, Tuple[str, ...]], start: int=..., end: int=...) -> bool: ...
125126
def endswith(self, x: Union[str, Tuple[str, ...]], start: int=..., end: int=...) -> bool: ...

mypyc/test-data/irbuild-str.test

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -973,6 +973,26 @@ L0:
973973
r0 = CPyStr_Multiply(s, n)
974974
return r0
975975

976+
[case testStrLower]
977+
def do_lower(s: str) -> str:
978+
return s.lower()
979+
[out]
980+
def do_lower(s):
981+
s, r0 :: str
982+
L0:
983+
r0 = CPyStr_Lower(s)
984+
return r0
985+
986+
[case testStrUpper]
987+
def do_upper(s: str) -> str:
988+
return s.upper()
989+
[out]
990+
def do_upper(s):
991+
s, r0 :: str
992+
L0:
993+
r0 = CPyStr_Upper(s)
994+
return r0
995+
976996
[case testStrIsSpace]
977997
def is_space(x: str) -> bool:
978998
return x.isspace()

mypyc/test-data/run-strings.test

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1258,6 +1258,51 @@ FMT: Final = "{} {}"
12581258
def test_format() -> None:
12591259
assert FMT.format(400 + 20, "roll" + "up") == "420 rollup"
12601260

1261+
[case testLowerAndUpper]
1262+
from typing import Any
1263+
1264+
def test_lower_basic() -> None:
    """str.lower() on empty, lowercase, uppercase, mixed and digit-bearing ASCII."""
    cases = [
        ("", ""),
        ("hello", "hello"),
        ("HELLO", "hello"),
        ("Hello World", "hello world"),
        ("123", "123"),
        ("ABC123", "abc123"),
    ]
    for source, expected in cases:
        assert source.lower() == expected
1271+
1272+
def test_upper_basic() -> None:
    """str.upper() on empty, uppercase, lowercase, mixed and digit-bearing ASCII."""
    cases = [
        ("", ""),
        ("HELLO", "HELLO"),
        ("hello", "HELLO"),
        ("Hello World", "HELLO WORLD"),
        ("123", "123"),
        ("abc123", "ABC123"),
    ]
    for source, expected in cases:
        assert source.upper() == expected
1279+
1280+
def test_lower_unicode() -> None:
    """str.lower() on non-ASCII input, including characters that have no case."""
    expectations = {
        "\u00C9": "\u00E9",  # É -> é
        "\u0391\u0392": "\u03B1\u03B2",  # ΑΒ -> αβ
        "\u4E2D\u6587": "\u4E2D\u6587",  # CJK (no case)
        "\U0001F600": "\U0001F600",  # Emoji (no case)
    }
    for text, lowered in expectations.items():
        assert text.lower() == lowered
1285+
1286+
def test_upper_unicode() -> None:
    """str.upper() on non-ASCII input, including characters that have no case."""
    expectations = {
        "\u00E9": "\u00C9",  # é -> É
        "\u03B1\u03B2": "\u0391\u0392",  # αβ -> ΑΒ
        "\u4E2D\u6587": "\u4E2D\u6587",  # CJK (no case)
        "\U0001F600": "\U0001F600",  # Emoji (no case)
    }
    for text, uppered in expectations.items():
        assert text.upper() == uppered
1291+
1292+
def test_expansion() -> None:
    """Case mappings where one code point may map to several (1-to-N expansion)."""
    lower_cases = [
        ("\u0130", "\u0069\u0307"),  # İ -> i + combining dot above
        ("\uFB03", "\uFB03"),  # ffi ligature stays lowercase
    ]
    upper_cases = [
        ("\u00DF", "SS"),  # ß -> SS
        ("\uFB03", "FFI"),  # ffi ligature -> FFI
    ]
    for text, expected in lower_cases:
        assert text.lower() == expected
    for text, expected in upper_cases:
        assert text.upper() == expected
1298+
1299+
def test_comprehensive() -> None:
    """Compare lower()/upper() on a str-typed value against the same calls on
    an Any-typed alias, for every code point below 0x110000."""
    i = 0
    while i < 0x110000:
        c = chr(i)
        a: Any = c
        lowered = c.lower()
        assert lowered == a.lower(), f"lower mismatch at U+{i:04X}"
        uppered = c.upper()
        assert uppered == a.upper(), f"upper mismatch at U+{i:04X}"
        i += 1
1305+
12611306
[case testIsSpace]
12621307
from typing import Any
12631308

0 commit comments

Comments
 (0)