[mypyc] Add support for str.lower() and str.upper() (#20948)

VaggelisD · web-flow · commit e6d41eb6b6d4 · 2026-03-04T22:27:10.000Z
Fixes mypyc/mypyc#1088. Follow-up on #19375 with full Unicode support: - ASCII fast path - Shared `CPyStr_ChangeCase` helper, parameterized by function pointers - **`_PyUnicode_ToLowerFull`/`_PyUnicode_ToUpperFull`** for the Unicode path, which handle 1-to-N expansion (e.g., `ß`→`SS`); this was a sticking point with the previous PR, which relied on `Py_UNICODE_TOLOWER`/`TOUPPER`. - Temporary `len * 3` UCS-4 buffer for the Unicode path, because a single code point may expand to as many as 3 code points
diff --git a/mypyc/doc/str_operations.rst b/mypyc/doc/str_operations.rst
@@ -41,6 +41,7 @@ Methods
 * ``s.isalnum()``
 * ``s.isdigit()``
 * ``s.isspace()``
+* ``s.lower()``
 * ``s.join(x: Iterable)``
 * ``s.lstrip()``
 * ``s.lstrip(chars: str)``
@@ -65,6 +66,7 @@ Methods
 * ``s.splitlines(keepends: bool)``
 * ``s1.startswith(s2: str)``
 * ``s1.startswith(t: tuple[str, ...])``
+* ``s.upper()``
 * ``s.strip()``
 * ``s.strip(chars: str)``
 
diff --git a/mypyc/lib-rt/CPy.h b/mypyc/lib-rt/CPy.h
@@ -780,6 +780,8 @@ Py_ssize_t CPyStr_Count(PyObject *unicode, PyObject *substring, CPyTagged start)
 Py_ssize_t CPyStr_CountFull(PyObject *unicode, PyObject *substring, CPyTagged start, CPyTagged end);
 CPyTagged CPyStr_Ord(PyObject *obj);
 PyObject *CPyStr_Multiply(PyObject *str, CPyTagged count);
+PyObject *CPyStr_Lower(PyObject *str);
+PyObject *CPyStr_Upper(PyObject *str);
 bool CPyStr_IsSpace(PyObject *str);
 bool CPyStr_IsAlnum(PyObject *str);
 bool CPyStr_IsDigit(PyObject *str);
diff --git a/mypyc/lib-rt/static_data.c b/mypyc/lib-rt/static_data.c
@@ -55,6 +55,7 @@ intern_strings(void) {
     INTERN_STRING(endswith, "endswith");
     INTERN_STRING(get_type_hints, "get_type_hints");
     INTERN_STRING(keys, "keys");
+    INTERN_STRING(lower, "lower");
     INTERN_STRING(items, "items");
     INTERN_STRING(join, "join");
     INTERN_STRING(register_, "register");
@@ -66,6 +67,7 @@ intern_strings(void) {
     INTERN_STRING(throw_, "throw");
     INTERN_STRING(translate, "translate");
     INTERN_STRING(update, "update");
+    INTERN_STRING(upper, "upper");
     INTERN_STRING(values, "values");
     return 0;
 }
diff --git a/mypyc/lib-rt/static_data.h b/mypyc/lib-rt/static_data.h
@@ -47,6 +47,7 @@ typedef struct mypyc_interned_str_struct {
     PyObject *endswith;
     PyObject *get_type_hints;
     PyObject *keys;
+    PyObject *lower;
     PyObject *items;
     PyObject *join;
     PyObject *register_;
@@ -58,6 +59,7 @@ typedef struct mypyc_interned_str_struct {
     PyObject *throw_;
     PyObject *translate;
     PyObject *update;
+    PyObject *upper;
     PyObject *values;
 } mypyc_interned_str_struct;
 
diff --git a/mypyc/lib-rt/str_ops.c b/mypyc/lib-rt/str_ops.c
@@ -9,7 +9,7 @@
 
 // The _PyUnicode_CheckConsistency definition has been moved to the internal API
 // https://github.com/python/cpython/pull/106398
-#if defined(Py_DEBUG) && defined(CPY_3_13_FEATURES)
+#if defined(Py_DEBUG) && CPY_3_13_FEATURES
 #include "internal/pycore_unicodeobject.h"
 #endif
 
@@ -678,6 +678,80 @@ bool CPyStr_IsAlnum(PyObject *str) {
     return true;
 }
 
+// ASCII-only case mappers. They wrap Py_TOLOWER/Py_TOUPPER in functions whose
+// signature matches the ascii_func parameter of CPyStr_ChangeCase.
+static inline int CPy_ASCII_Lower(unsigned char c) { return Py_TOLOWER(c); }
+static inline int CPy_ASCII_Upper(unsigned char c) { return Py_TOUPPER(c); }
+
+#if !CPY_3_13_FEATURES
+// Return nonzero if the capital sigma (U+03A3) at index i is in the Unicode
+// Final_Sigma context: it is preceded by a cased character and is not followed
+// by one, where case-ignorable characters are skipped on both sides.
+// This mirrors the rule CPython applies in str.lower().
+static int CPyStr_IsFinalSigma(int kind, const void *data, Py_ssize_t len, Py_ssize_t i) {
+    Py_UCS4 c = 0;
+    Py_ssize_t j;
+    // Look backwards for the nearest character that is not case-ignorable.
+    for (j = i - 1; j >= 0; j--) {
+        c = PyUnicode_READ(kind, data, j);
+        if (!_PyUnicode_IsCaseIgnorable(c)) break;
+    }
+    if (j < 0 || !_PyUnicode_IsCased(c)) return 0;
+    // Look forwards: a following cased character means the sigma is not final.
+    for (j = i + 1; j < len; j++) {
+        c = PyUnicode_READ(kind, data, j);
+        if (!_PyUnicode_IsCaseIgnorable(c)) break;
+    }
+    return j == len || !_PyUnicode_IsCased(c);
+}
+#endif
+
+// Shared implementation of str.lower() and str.upper().
+//
+// Arguments:
+//   self         - the str to convert (borrowed reference)
+//   ascii_func   - 1-to-1 mapping applied to every character of an ASCII string
+//   method_name  - (3.13+ builds) name of the str method called for non-ASCII input
+//   unicode_func - (older builds) full case mapping for one code point; writes
+//                  the mapped code points (at most 3) and returns their count
+//
+// Returns a new reference to the converted string, or NULL with an exception
+// set on failure.
+static inline PyObject *CPyStr_ChangeCase(PyObject *self,
+                                    int (*ascii_func)(unsigned char),
+#if CPY_3_13_FEATURES
+                                    PyObject *method_name
+#else
+                                    int (*unicode_func)(Py_UCS4, Py_UCS4 *)
+#endif
+                                    ) {
+    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
+    // Only an exact str may be returned as-is; an empty instance of a str
+    // subclass must still produce a plain str, like str.lower()/str.upper().
+    if (len == 0 && PyUnicode_CheckExact(self)) {
+        Py_INCREF(self);
+        return self;
+    }
+
+    // ASCII fast path: 1-to-1, no expansion possible
+    if (PyUnicode_IS_ASCII(self)) {
+        PyObject *res = PyUnicode_New(len, 127);
+        if (res == NULL) return NULL;
+        const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
+        Py_UCS1 *res_data = PyUnicode_1BYTE_DATA(res);
+        for (Py_ssize_t i = 0; i < len; i++) {
+            res_data[i] = (Py_UCS1)ascii_func(data[i]);
+        }
+        return res;
+    }
+
+#if CPY_3_13_FEATURES
+    // On 3.13+, _PyUnicode_ToLowerFull/ToUpperFull are no longer exported,
+    // so fall back to CPython's method implementation for non-ASCII strings.
+    return PyObject_CallMethodNoArgs(self, method_name);
+#else
+    // General Unicode: unicode_func handles 1-to-N expansion.
+    // Worst case: each codepoint expands to 3 (per Unicode standard).
+    // The tmp buffer is short-lived, and PyUnicode_FromKindAndData
+    // compacts the result to the optimal string kind automatically.
+
+    // Reject lengths for which the byte size of the 3x buffer would overflow.
+    if ((size_t)len > (size_t)PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
+        PyErr_SetString(PyExc_OverflowError, "string is too long");
+        return NULL;
+    }
+    int kind = PyUnicode_KIND(self);
+    const void *data = PyUnicode_DATA(self);
+    Py_UCS4 *tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * (size_t)len);
+    if (tmp == NULL) return PyErr_NoMemory();
+
+    Py_UCS4 mapped[3];
+    Py_ssize_t out_len = 0;
+    for (Py_ssize_t i = 0; i < len; i++) {
+        Py_UCS4 c = PyUnicode_READ(kind, data, i);
+        int n = unicode_func(c, mapped);
+        // unicode_func is context-free, so a capital sigma that was mapped to
+        // U+03C3 (i.e. we are lowercasing) needs the context-sensitive
+        // Final_Sigma rule applied here: word-final sigma becomes U+03C2.
+        if (c == 0x3A3 && n == 1 && mapped[0] == 0x3C3
+                && CPyStr_IsFinalSigma(kind, data, len, i)) {
+            mapped[0] = 0x3C2;
+        }
+        for (int j = 0; j < n; j++) {
+            tmp[out_len++] = mapped[j];
+        }
+    }
+
+    PyObject *res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, tmp, out_len);
+    PyMem_Free(tmp);
+    return res;
+#endif
+}
+
+// C implementation behind the str.lower() primitive.
+// Delegates to CPyStr_ChangeCase with the ASCII lowercase mapper. The second
+// argument is what CPyStr_ChangeCase uses for non-ASCII strings: the interned
+// "lower" method name when CPY_3_13_FEATURES is true, otherwise
+// _PyUnicode_ToLowerFull.
+PyObject *CPyStr_Lower(PyObject *self) {
+#if CPY_3_13_FEATURES
+    return CPyStr_ChangeCase(self, CPy_ASCII_Lower, mypyc_interned_str.lower);
+#else
+    return CPyStr_ChangeCase(self, CPy_ASCII_Lower, _PyUnicode_ToLowerFull);
+#endif
+}
+
+// C implementation behind the str.upper() primitive.
+// Delegates to CPyStr_ChangeCase with the ASCII uppercase mapper. The second
+// argument is what CPyStr_ChangeCase uses for non-ASCII strings: the interned
+// "upper" method name when CPY_3_13_FEATURES is true, otherwise
+// _PyUnicode_ToUpperFull.
+PyObject *CPyStr_Upper(PyObject *self) {
+#if CPY_3_13_FEATURES
+    return CPyStr_ChangeCase(self, CPy_ASCII_Upper, mypyc_interned_str.upper);
+#else
+    return CPyStr_ChangeCase(self, CPy_ASCII_Upper, _PyUnicode_ToUpperFull);
+#endif
+}
+
 bool CPyStr_IsDigit(PyObject *str) {
     Py_ssize_t len = PyUnicode_GET_LENGTH(str);
     if (len == 0) return false;
diff --git a/mypyc/primitives/str_ops.py b/mypyc/primitives/str_ops.py
@@ -397,6 +397,24 @@
     error_kind=ERR_NEG_INT,
 )
 
+# str.lower()
+# The only argument is the receiver string; the call is lowered to the C
+# runtime function CPyStr_Lower and produces a str.
+method_op(
+    name="lower",
+    arg_types=[str_rprimitive],
+    return_type=str_rprimitive,
+    c_function_name="CPyStr_Lower",
+    error_kind=ERR_MAGIC,
+)
+
+# str.upper()
+# The only argument is the receiver string; the call is lowered to the C
+# runtime function CPyStr_Upper and produces a str.
+method_op(
+    name="upper",
+    arg_types=[str_rprimitive],
+    return_type=str_rprimitive,
+    c_function_name="CPyStr_Upper",
+    error_kind=ERR_MAGIC,
+)
+
 method_op(
     name="isspace",
     arg_types=[str_rprimitive],
diff --git a/mypyc/test-data/fixtures/ir.py b/mypyc/test-data/fixtures/ir.py
@@ -120,6 +120,7 @@ def lstrip(self, item: Optional[str] = None) -> str: pass
     def rstrip(self, item: Optional[str] = None) -> str: pass
     def join(self, x: Iterable[str]) -> str: pass
     def format(self, *args: Any, **kwargs: Any) -> str: ...
+    def lower(self) -> str: ...
     def upper(self) -> str: ...
     def startswith(self, x: Union[str, Tuple[str, ...]], start: int=..., end: int=...) -> bool: ...
     def endswith(self, x: Union[str, Tuple[str, ...]], start: int=..., end: int=...) -> bool: ...
diff --git a/mypyc/test-data/irbuild-str.test b/mypyc/test-data/irbuild-str.test
@@ -973,6 +973,26 @@ L0:
     r0 = CPyStr_Multiply(s, n)
     return r0
 
+[case testStrLower]
+def do_lower(s: str) -> str:
+    return s.lower()
+[out]
+def do_lower(s):
+    s, r0 :: str
+L0:
+    r0 = CPyStr_Lower(s)
+    return r0
+
+[case testStrUpper]
+def do_upper(s: str) -> str:
+    return s.upper()
+[out]
+def do_upper(s):
+    s, r0 :: str
+L0:
+    r0 = CPyStr_Upper(s)
+    return r0
+
 [case testStrIsSpace]
 def is_space(x: str) -> bool:
     return x.isspace()
diff --git a/mypyc/test-data/run-strings.test b/mypyc/test-data/run-strings.test
@@ -1258,6 +1258,51 @@ FMT: Final = "{} {}"
 def test_format() -> None:
     assert FMT.format(400 + 20, "roll" + "up") == "420 rollup"
 
+[case testLowerAndUpper]
+from typing import Any
+
+def test_lower_basic() -> None:
+    assert "".lower() == ""
+    assert "hello".lower() == "hello"
+    assert "HELLO".lower() == "hello"
+    assert "Hello World".lower() == "hello world"
+    assert "123".lower() == "123"
+    assert "ABC123".lower() == "abc123"
+
+def test_upper_basic() -> None:
+    assert "".upper() == ""
+    assert "HELLO".upper() == "HELLO"
+    assert "hello".upper() == "HELLO"
+    assert "Hello World".upper() == "HELLO WORLD"
+    assert "123".upper() == "123"
+    assert "abc123".upper() == "ABC123"
+
+def test_lower_unicode() -> None:
+    assert "\u00C9".lower() == "\u00E9"                # É -> é
+    assert "\u0391\u0392".lower() == "\u03B1\u03B2"    # ΑΒ -> αβ
+    assert "\u4E2D\u6587".lower() == "\u4E2D\u6587"    # CJK (no case)
+    assert "\U0001F600".lower() == "\U0001F600"         # Emoji (no case)
+
+def test_upper_unicode() -> None:
+    assert "\u00E9".upper() == "\u00C9"                # é -> É
+    assert "\u03B1\u03B2".upper() == "\u0391\u0392"    # αβ -> ΑΒ
+    assert "\u4E2D\u6587".upper() == "\u4E2D\u6587"    # CJK (no case)
+    assert "\U0001F600".upper() == "\U0001F600"         # Emoji (no case)
+
+def test_expansion() -> None:
+    # 1-to-N expansion cases
+    assert "\u0130".lower() == "\u0069\u0307"  # İ -> i + combining dot above
+    assert "\uFB03".lower() == "\uFB03"        # ffi ligature stays lowercase
+    assert "\u00DF".upper() == "SS"            # ß -> SS
+    assert "\uFB03".upper() == "FFI"           # ffi ligature -> FFI
+
+def test_comprehensive() -> None:
+    for i in range(0x110000):
+        c = chr(i)
+        a: Any = c
+        assert c.lower() == a.lower(), f"lower mismatch at U+{i:04X}"
+        assert c.upper() == a.upper(), f"upper mismatch at U+{i:04X}"
+
 [case testIsSpace]
 from typing import Any