From 3e0471b4ae098d7a13728b298296f43febf8041b Mon Sep 17 00:00:00 2001 From: PurHur Date: Wed, 20 May 2026 17:23:57 +0000 Subject: [PATCH] Fix mb_strlen UTF-8 character counting for VM and JIT (closes #158) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use host mb_strlen when available, VmString::utf8CharLength fallback, and __compiler_utf8_strlen for JIT/AOT. Add compliance PHPT for é and hello. Co-authored-by: Cursor --- docs/bootstrap-inventory.md | 18 +++-- docs/bootstrap-profile.json | 5 +- ext/standard/VmString.php | 25 ++++++ ext/types/JitMbStrlen.php | 38 +++++++++ ext/types/mb_strlen.php | 79 +++++++++++++++---- lib/AOT/runtime/superglobals_refresh.c | 34 ++++++++ lib/JIT/Builtin/Type.php | 8 ++ test/compliance/cases/stdlib/mb_strlen.phpt | 13 +++ .../cases/stdlib/mb_strlen_jit.phpt | 8 +- 9 files changed, 202 insertions(+), 26 deletions(-) create mode 100644 ext/types/JitMbStrlen.php create mode 100644 test/compliance/cases/stdlib/mb_strlen.phpt diff --git a/docs/bootstrap-inventory.md b/docs/bootstrap-inventory.md index d40d1cd6..12bb8175 100644 --- a/docs/bootstrap-inventory.md +++ b/docs/bootstrap-inventory.md
@@ -8,9 +8,9 @@ Regenerate: `php script/bootstrap-inventory.php` | Metric | Count | |--------|------:| -| PHP files on vm.php path | 251 | +| PHP files on vm.php path | 252 | | Source constructs flagged (blockers) | 10 | -| Source constructs flagged (warnings) | 656 | +| Source constructs flagged (warnings) | 657 | ## Compiler CFG gaps (`lib/Compiler.php`) @@ -182,6 +182,7 @@ These `LogicException` messages indicate CFG ops or expressions not yet lowered: | `ext/standard/web_bool.php` | 0 | 1 | | `ext/standard/web_int.php` | 0 | 1 | | `ext/standard/web_string.php` | 0 | 1 | +| `ext/types/JitMbStrlen.php` | 0 | 1 | | `ext/types/Module.php` | 0 | 13 | | `ext/types/is_type.php` | 0 | 1 | | `ext/types/mb_strlen.php` | 0 | 1 | @@ -591,9 +592,9 @@ These `LogicException` messages indicate CFG ops or expressions not yet lowered: ### `ext/standard/VmString.php` **Warnings** (review for bootstrap subset): -- new Exception (line 182) -- new Exception (line 190) -- 56 class method(s) — PHPCfg Op\Stmt\ClassMethod not lowered in Compiler +- new Exception (line 207) +- new Exception (line 215) +- 57 class method(s) — PHPCfg Op\Stmt\ClassMethod not lowered in Compiler - 1 closure(s) ### `ext/standard/abs.php` @@ -1185,6 +1186,11 @@ These `LogicException` messages indicate CFG ops or expressions not yet lowered: **Warnings** (review for bootstrap subset): - 2 class method(s) — PHPCfg Op\Stmt\ClassMethod not lowered in Compiler +### `ext/types/JitMbStrlen.php` + +**Warnings** (review for bootstrap subset): +- 1 class method(s) — PHPCfg Op\Stmt\ClassMethod not lowered in Compiler + ### `ext/types/Module.php` **Warnings** (review for bootstrap subset): @@ -1210,7 +1216,7 @@ These `LogicException` messages indicate CFG ops or expressions not yet lowered: ### `ext/types/mb_strlen.php` **Warnings** (review for bootstrap subset): -- 2 class method(s) — PHPCfg Op\Stmt\ClassMethod not lowered in Compiler +- 3 class method(s) — PHPCfg Op\Stmt\ClassMethod not lowered in Compiler ### 
`ext/types/strlen.php` diff --git a/docs/bootstrap-profile.json b/docs/bootstrap-profile.json index 97d35f28..39d66305 100644 --- a/docs/bootstrap-profile.json +++ b/docs/bootstrap-profile.json @@ -181,6 +181,7 @@ "ext/standard/web_bool.php", "ext/standard/web_int.php", "ext/standard/web_string.php", + "ext/types/JitMbStrlen.php", "ext/types/Module.php", "ext/types/is_type.php", "ext/types/mb_strlen.php", @@ -285,9 +286,9 @@ "test/bootstrap-aot/echo_hello.php" ], "totals": { - "inventory_files": 251, + "inventory_files": 252, "excluded": 2, - "eligible": 249, + "eligible": 250, "aot_lint_targets": 2 } } diff --git a/ext/standard/VmString.php b/ext/standard/VmString.php index 0861d74f..14a56a3e 100644 --- a/ext/standard/VmString.php +++ b/ext/standard/VmString.php @@ -22,6 +22,31 @@ public static function byteLength(string $string): int return $len; } + /** + * UTF-8 codepoint count for BMP web text (issue #158). Invalid bytes count as one character. 
+ */ + public static function utf8CharLength(string $string): int + { + $byteLen = self::byteLength($string); + $count = 0; + for ($i = 0; $i < $byteLen; ++$count) { + $byte = \ord($string[$i]); + if ($byte < 0x80) { + $i += 1; + } elseif (($byte & 0xE0) === 0xC0 && $i + 1 < $byteLen) { + $i += 2; + } elseif (($byte & 0xF0) === 0xE0 && $i + 2 < $byteLen) { + $i += 3; + } elseif (($byte & 0xF8) === 0xF0 && $i + 3 < $byteLen) { + $i += 4; + } else { + $i += 1; + } + } + + return $count; + } + public static function byteSlice(string $string, int $offset, ?int $length = null): string { $len = self::byteLength($string); diff --git a/ext/types/JitMbStrlen.php b/ext/types/JitMbStrlen.php new file mode 100644 index 00000000..a225550b --- /dev/null +++ b/ext/types/JitMbStrlen.php @@ -0,0 +1,38 @@ +type) { + throw new \LogicException('mb_strlen() only supports strings in this compiler build'); + } + + $literal = $arg->compileTimeString ?? null; + if (null !== $literal) { + return $context->constantFromInteger( + VmString::utf8CharLength($literal), + 'int64' + ); + } + + $strPtr = $context->helper->loadValue($arg); + + return $context->builder->call( + $context->lookupFunction('__compiler_utf8_strlen'), + $strPtr + ); + } +} diff --git a/ext/types/mb_strlen.php b/ext/types/mb_strlen.php index 79ae1996..8fdb3951 100644 --- a/ext/types/mb_strlen.php +++ b/ext/types/mb_strlen.php @@ -9,10 +9,11 @@ use PHPCompiler\Func\Internal; use PHPCompiler\JIT\Context; use PHPCompiler\JIT\Variable; +use PHPCompiler\VM\Variable as VMVariable; use PHPLLVM\Value; /** - * mb_strlen() — byte length of string (UTF-8 safe for ASCII subset in this compiler). + * mb_strlen() — UTF-8 character count for web forms (issue #158). 
*/ final class mb_strlen extends Internal { @@ -23,31 +24,79 @@ public function __construct() public function execute(Frame $frame): void { - if (1 !== \count($frame->calledArgs)) { - throw new \LogicException('mb_strlen() requires exactly one argument'); + $argc = \count($frame->calledArgs); + if ($argc < 1 || $argc > 2) { + throw new \LogicException('mb_strlen() requires one or two arguments'); } - $var = $frame->calledArgs[0]; - if (null !== $frame->returnVar) { - $frame->returnVar->int(VmString::byteLength($var->resolveIndirect()->toString())); + $strVar = $frame->calledArgs[0]->resolveIndirect(); + if (VMVariable::TYPE_STRING !== $strVar->type) { + throw new \LogicException('mb_strlen() only supports strings in this compiler build'); + } + if (null === $frame->returnVar) { + return; + } + $str = $strVar->toString(); + $encoding = 'UTF-8'; + if (2 === $argc) { + $encVar = $frame->calledArgs[1]->resolveIndirect(); + if (VMVariable::TYPE_STRING !== $encVar->type) { + throw new \LogicException('mb_strlen() encoding must be a string in this compiler build'); + } + $encoding = $encVar->toString(); } + $frame->returnVar->int(self::lengthForEncoding($str, $encoding)); } - public Context $context; - public function call(Context $context, Variable ...$args): Value { - $this->context = $context; - if (1 !== \count($args)) { - throw new \LogicException('mb_strlen() requires exactly one argument'); + $argc = \count($args); + if ($argc < 1 || $argc > 2) { + throw new \LogicException('mb_strlen() requires one or two arguments'); + } + if (1 === $argc) { + return JitMbStrlen::utf8Length($context, $args[0]); + } + if (Variable::TYPE_STRING !== $args[1]->type) { + throw new \LogicException('mb_strlen() encoding must be a string in this compiler build'); + } + $encoding = $args[1]->compileTimeString ?? 
null; + if ('UTF-8' === $encoding) { + return JitMbStrlen::utf8Length($context, $args[0]); + } + if (null !== $encoding && 'ASCII' !== $encoding && '8BIT' !== $encoding) { + throw new \LogicException( + 'mb_strlen() JIT only supports UTF-8, ASCII, or 8BIT encoding literals in this compiler build' + ); } - $argValue = $context->helper->loadValue($args[0]); if (Variable::TYPE_STRING !== $args[0]->type) { throw new \LogicException('mb_strlen() only supports strings in this compiler build'); } - $offset = $this->context->structFieldMap[$argValue->typeOf()->getElementType()->getName()]['length']; + $argValue = $context->helper->loadValue($args[0]); + $offset = $context->structFieldMap[$argValue->typeOf()->getElementType()->getName()]['length']; + + return $context->builder->load( + $context->builder->structGep($argValue, $offset) + ); + } + + private static function lengthForEncoding(string $str, string $encoding): int + { + if ('UTF-8' === $encoding) { + if (\function_exists('mb_strlen')) { + return (int) \mb_strlen($str, 'UTF-8'); + } + + return VmString::utf8CharLength($str); + } + if ('ASCII' === $encoding || '8BIT' === $encoding) { + return VmString::byteLength($str); + } + if (\function_exists('mb_strlen')) { + return (int) \mb_strlen($str, $encoding); + } - return $this->context->builder->load( - $this->context->builder->structGep($argValue, $offset) + throw new \LogicException( + 'mb_strlen() requires mbstring for encoding '.$encoding.' in this compiler build' ); } } diff --git a/lib/AOT/runtime/superglobals_refresh.c b/lib/AOT/runtime/superglobals_refresh.c index cadcaa45..4654fa5c 100644 --- a/lib/AOT/runtime/superglobals_refresh.c +++ b/lib/AOT/runtime/superglobals_refresh.c @@ -1114,6 +1114,40 @@ __string__ *__compiler_strip_tags(__string__ *input, __string__ *allowed) } } +/* + * UTF-8 character count for mb_strlen() JIT/AOT (issue #158). 
+ */ +long long __compiler_utf8_strlen(__string__ *input) +{ + const char *src; + size_t slen; + size_t i = 0; + long long count = 0; + + if (input == NULL) { + return 0; + } + src = nf_strdata(input); + slen = nf_strlen(input); + while (i < slen) { + unsigned char b = (unsigned char) src[i]; + if (b < 0x80) { + i += 1; + } else if ((b & 0xE0) == 0xC0 && i + 1 < slen) { + i += 2; + } else if ((b & 0xF0) == 0xE0 && i + 2 < slen) { + i += 3; + } else if ((b & 0xF8) == 0xF0 && i + 3 < slen) { + i += 4; + } else { + i += 1; + } + count++; + } + + return count; +} + /* * Zend parity for missing array string keys (issue #273). * Called from JIT __hashtable__readStringKeyValue when lookup returns NULL. diff --git a/lib/JIT/Builtin/Type.php b/lib/JIT/Builtin/Type.php index 0b3f2e7e..ffa9270b 100755 --- a/lib/JIT/Builtin/Type.php +++ b/lib/JIT/Builtin/Type.php @@ -53,6 +53,14 @@ public function register(): void { ); $fnStripTags = $this->context->module->addFunction('__compiler_strip_tags', $fntypeStripTags); $this->context->registerFunction('__compiler_strip_tags', $fnStripTags); + $i64 = $this->context->getTypeFromString('int64'); + $fntypeUtf8Strlen = $this->context->context->functionType( + $i64, + false, + $this->context->getTypeFromString('__string__*') + ); + $fnUtf8Strlen = $this->context->module->addFunction('__compiler_utf8_strlen', $fntypeUtf8Strlen); + $this->context->registerFunction('__compiler_utf8_strlen', $fnUtf8Strlen); HttpResponseCode::implement($this->context); $i8p = $this->context->getTypeFromString('int8*'); $i32 = $this->context->getTypeFromString('int32'); diff --git a/test/compliance/cases/stdlib/mb_strlen.phpt b/test/compliance/cases/stdlib/mb_strlen.phpt new file mode 100644 index 00000000..db37f852 --- /dev/null +++ b/test/compliance/cases/stdlib/mb_strlen.phpt @@ -0,0 +1,13 @@ +--TEST-- +stdlib mb_strlen() UTF-8 character count (VM) +--FILE-- +