From fa463064bda084c6d41530282ca52492de948cbc Mon Sep 17 00:00:00 2001 From: PurHur Date: Tue, 19 May 2026 18:50:10 +0000 Subject: [PATCH] Add strip_tags() stdlib with VM, JIT, and AOT paths. Implements HTML tag stripping with allow-lists, comment/PHP tag removal, LLVM compile-time folding, and a C runtime helper for dynamic strings. Closes #299. Co-authored-by: Cursor --- docs/bootstrap-inventory.md | 51 +++-- docs/capabilities.md | 6 +- ext/standard/JitStripTags.php | 78 +++++++ ext/standard/Module.php | 1 + ext/standard/VmString.php | 128 +++++++++++ ext/standard/strip_tags.php | 56 +++++ lib/AOT/runtime/superglobals_refresh.c | 213 ++++++++++++++++++ lib/JIT/Builtin/Type.php | 8 + test/compliance/cases/stdlib/strip_tags.phpt | 15 ++ .../cases/stdlib/strip_tags_jit.phpt | 13 ++ test/fixtures/aot/cases/strip_tags.phpt | 11 + 11 files changed, 562 insertions(+), 18 deletions(-) create mode 100644 ext/standard/JitStripTags.php create mode 100644 ext/standard/strip_tags.php create mode 100644 test/compliance/cases/stdlib/strip_tags.phpt create mode 100644 test/compliance/cases/stdlib/strip_tags_jit.phpt create mode 100644 test/fixtures/aot/cases/strip_tags.phpt diff --git a/docs/bootstrap-inventory.md b/docs/bootstrap-inventory.md index 658e75ca..e0d23d8c 100644 --- a/docs/bootstrap-inventory.md +++ b/docs/bootstrap-inventory.md @@ -8,9 +8,9 @@ Regenerate: `php script/bootstrap-inventory.php` | Metric | Count | |--------|------:| -| PHP files on vm.php path | 196 | +| PHP files on vm.php path | 199 | | Source constructs flagged (blockers) | 10 | -| Source constructs flagged (warnings) | 523 | +| Source constructs flagged (warnings) | 527 | ## Compiler CFG gaps (`lib/Compiler.php`) @@ -40,6 +40,7 @@ These `LogicException` messages indicate CFG ops or expressions not yet lowered: | `ext/standard/JitHtmlspecialchars.php` | 0 | 1 | | `ext/standard/JitImplode.php` | 0 | 1 | | `ext/standard/JitNumberFormat.php` | 0 | 1 | +| `ext/standard/JitParseUrl.php` | 0 | 1 | | `ext/standard/JitPath.php` | 0 | 1 | | `ext/standard/JitRealpath.php` | 0 | 1 | | `ext/standard/JitStrPad.php` | 0 | 1 | @@ -47,9 +48,10 @@ These `LogicException` messages indicate CFG ops or expressions not yet lowered: | `ext/standard/JitStrReplace.php` | 0 | 1 | | `ext/standard/JitStringConcat.php` | 0 | 1 | | `ext/standard/JitStringIndex.php` | 0 | 1 | +| `ext/standard/JitStripTags.php` | 0 | 1 | | `ext/standard/JitStrpos.php` | 0 | 1 | | `ext/standard/JitUrlencode.php` | 0 | 1 | -| `ext/standard/Module.php` | 0 | 94 | +| `ext/standard/Module.php` | 0 | 95 | | `ext/standard/VmFs.php` | 0 | 3 | | `ext/standard/VmNumberFormat.php` | 0 | 1 | | `ext/standard/VmString.php` | 0 | 2 | @@ -135,6 +137,7 @@ These `LogicException` messages indicate CFG ops or expressions not yet lowered: | `ext/standard/string_ltrim.php` | 0 | 1 | | `ext/standard/string_rtrim.php` | 0 | 1 | | `ext/standard/string_trim.php` | 0 | 1 | +| `ext/standard/strip_tags.php` | 0 | 1 | | `ext/standard/strncmp.php` | 0 | 1 | | `ext/standard/strpos.php` | 0 | 1 | | `ext/standard/strrev.php` | 0 | 1 | @@ -265,6 +268,11 @@ These `LogicException` messages indicate CFG ops or expressions not yet lowered: **Warnings** (review for bootstrap subset): - 4 class method(s) — PHPCfg Op\Stmt\ClassMethod not lowered in Compiler +### `ext/standard/JitParseUrl.php` + +**Warnings** (review for bootstrap subset): +- 3 class method(s) — PHPCfg Op\Stmt\ClassMethod not lowered in Compiler + ### `ext/standard/JitPath.php` **Warnings** (review for bootstrap subset): @@ -300,6 +308,11 @@ These `LogicException` messages indicate CFG ops or expressions not yet lowered: **Warnings** (review for bootstrap subset): - 5 class method(s) — PHPCfg Op\Stmt\ClassMethod not lowered in Compiler +### `ext/standard/JitStripTags.php` + +**Warnings** (review for bootstrap subset): +- 3 class method(s) — PHPCfg Op\Stmt\ClassMethod not lowered in Compiler + ### `ext/standard/JitStrpos.php` **Warnings** (review for bootstrap subset): @@ -394,18 +407,19 @@ These `LogicException` messages indicate CFG ops or expressions not yet lowered: - new str_pad (line 100) - new str_split (line 101) - new htmlspecialchars (line 102) -- new header_ (line 103) -- new http_response_code (line 104) -- new urlencode (line 105) -- new rawurlencode (line 106) -- new parse_url (line 107) -- new dirname (line 108) -- new basename (line 109) -- new realpath (line 110) -- new getenv_ (line 111) -- new putenv_ (line 112) -- new scandir (line 113) -- new glob_ (line 114) +- new strip_tags (line 103) +- new header_ (line 104) +- new http_response_code (line 105) +- new urlencode (line 106) +- new rawurlencode (line 107) +- new parse_url (line 108) +- new dirname (line 109) +- new basename (line 110) +- new realpath (line 111) +- new getenv_ (line 112) +- new putenv_ (line 113) +- new scandir (line 114) +- new glob_ (line 115) - 2 class method(s) — PHPCfg Op\Stmt\ClassMethod not lowered in Compiler ### `ext/standard/VmFs.php` @@ -423,7 +437,7 @@ These `LogicException` messages indicate CFG ops or expressions not yet lowered: ### `ext/standard/VmString.php` **Warnings** (review for bootstrap subset): -- 40 class method(s) — PHPCfg Op\Stmt\ClassMethod not lowered in Compiler +- 45 class method(s) — PHPCfg Op\Stmt\ClassMethod not lowered in Compiler - 1 closure(s) ### `ext/standard/abs.php` @@ -858,6 +872,11 @@ These `LogicException` messages indicate CFG ops or expressions not yet lowered: **Warnings** (review for bootstrap subset): - 5 class method(s) — PHPCfg Op\Stmt\ClassMethod not lowered in Compiler +### `ext/standard/strip_tags.php` + +**Warnings** (review for bootstrap subset): +- 2 class method(s) — PHPCfg Op\Stmt\ClassMethod not lowered in Compiler + ### `ext/standard/strncmp.php` **Warnings** (review for bootstrap subset): diff --git a/docs/capabilities.md b/docs/capabilities.md index f275e374..41a80b80 100644 --- a/docs/capabilities.md +++ b/docs/capabilities.md @@ -1,5 +1,6 @@ -Wrote /compiler/docs/capabilities.md (105 builtins). -t/capability-matrix.php`. Do not edit by hand. +# Builtin capability matrix + +Auto-generated by `script/capability-matrix.php`. Do not edit by hand. | Function | VM | JIT | AOT | Module | Notes | |----------|:--:|:---:|:---:|--------|-------| @@ -96,6 +97,7 @@ t/capability-matrix.php`. Do not edit by hand. | `str_split` | yes | no | no | standard | doc: VM only; not implemented for JIT in this compiler build | | `str_starts_with` | yes | yes | yes | standard | AOT PHPT | | `strcmp` | yes | yes | yes | standard | | +| `strip_tags` | yes | yes | yes | standard | JIT PHPT; AOT PHPT | | `strlen` | yes | yes | yes | types | JIT PHPT; AOT PHPT | | `strncmp` | yes | yes | yes | standard | | | `strpos` | yes | yes | yes | standard | JIT PHPT; AOT PHPT | diff --git a/ext/standard/JitStripTags.php b/ext/standard/JitStripTags.php new file mode 100644 index 00000000..574f3344 --- /dev/null +++ b/ext/standard/JitStripTags.php @@ -0,0 +1,78 @@ +compileTimeString ?? null; + if (null !== $inputLiteral) { + $allowedLiteral = null; + if (null !== $allowed) { + if (JITVariable::TYPE_STRING === $allowed->type) { + $allowedLiteral = $allowed->compileTimeString ?? ''; + } elseif (JITVariable::TYPE_VALUE !== $allowed->type) { + throw new \LogicException( + 'strip_tags() allowed_tags must be a string or null in this compiler build' + ); + } + } + + return $context->builder->load( + $context->constantStringFromString(VmString::stripTags($inputLiteral, $allowedLiteral)) + ); + } + + $inPtr = self::jitStringArg($context, $input); + $allowPtr = self::jitAllowedArg($context, $allowed); + + return $context->builder->call( + $context->lookupFunction('__compiler_strip_tags'), + $inPtr, + $allowPtr + ); + } + + private static function jitStringArg(Context $context, JITVariable $arg): Value + { + if (JITVariable::TYPE_STRING === $arg->type) { + return $context->helper->loadValue($arg); + } + if (JITVariable::TYPE_VALUE === $arg->type) { + return $context->builder->call( + $context->lookupFunction('__value__readString'), + $arg->value + ); + } + + throw new \LogicException('strip_tags() only supports strings in this compiler build'); + } + + private static function jitAllowedArg(Context $context, ?JITVariable $allowed): Value + { + if (null === $allowed) { + return $context->builder->load($context->constantStringFromString('')); + } + if (JITVariable::TYPE_STRING === $allowed->type) { + return $context->helper->loadValue($allowed); + } + if (JITVariable::TYPE_VALUE === $allowed->type) { + return $context->builder->call( + $context->lookupFunction('__value__readString'), + $allowed->value + ); + } + + throw new \LogicException('strip_tags() allowed_tags must be a string or null in this compiler build'); + } +} diff --git a/ext/standard/Module.php b/ext/standard/Module.php index 9e2e0e00..1014e5df 100755 --- a/ext/standard/Module.php +++ b/ext/standard/Module.php @@ -100,6 +100,7 @@ public function getFunctions(): array new str_pad(), new str_split(), new htmlspecialchars(), + new strip_tags(), new header_(), new http_response_code(), new urlencode(), diff --git a/ext/standard/VmString.php b/ext/standard/VmString.php index ebc9b692..0381ffd5 100644 --- a/ext/standard/VmString.php +++ b/ext/standard/VmString.php @@ -353,6 +353,134 @@ public static function htmlspecialchars( return $out; } + /** + * strip_tags() subset: removes HTML/PHP tags; optional allow-list like "

". + * HTML comments and PHP tags remove their inner content; other tags keep inner text. + */ + public static function stripTags(string $string, ?string $allowedTags = null): string + { + $allowed = null === $allowedTags || '' === $allowedTags + ? [] + : self::parseAllowedTags($allowedTags); + $out = ''; + $len = self::byteLength($string); + $i = 0; + while ($i < $len) { + $ch = $string[$i]; + if ('<' !== $ch) { + $out .= $ch; + ++$i; + continue; + } + if ($i + 3 < $len && '', $i + 4); + if (false !== $end) { + $i = $end + 3; + continue; + } + } + if ($i + 1 < $len && '', $i + 2); + if (false !== $end) { + $i = $end + 2; + continue; + } + } + $gt = self::findSubstring($string, '>', $i + 1); + if (false === $gt) { + $out .= $ch; + ++$i; + continue; + } + $tagContent = self::byteSlice($string, $i + 1, $gt - $i - 1); + $tagName = self::extractTagName($tagContent); + if (null !== $tagName && [] !== $allowed && self::isTagAllowed($tagName, $allowed)) { + $out .= self::byteSlice($string, $i, $gt - $i + 1); + } + $i = $gt + 1; + } + + return $out; + } + + /** + * @return list + */ + private static function parseAllowedTags(string $allowedTags): array + { + $tags = []; + $len = self::byteLength($allowedTags); + $i = 0; + while ($i < $len) { + if ('<' !== $allowedTags[$i]) { + ++$i; + continue; + } + $gt = self::findSubstring($allowedTags, '>', $i + 1); + if (false === $gt) { + break; + } + $name = self::extractTagName(self::byteSlice($allowedTags, $i + 1, $gt - $i - 1)); + if (null !== $name && '' !== $name) { + $tags[] = $name; + } + $i = $gt + 1; + } + + return $tags; + } + + private static function extractTagName(string $tagContent): ?string + { + $len = self::byteLength($tagContent); + $i = 0; + while ($i < $len && self::isTagWhitespace($tagContent[$i])) { + ++$i; + } + if ($i < $len && '/' === $tagContent[$i]) { + ++$i; + } + if ($i >= $len) { + return null; + } + $start = $i; + while ($i < $len) { + $ch = $tagContent[$i]; + if (self::isTagWhitespace($ch) || '>' === $ch || '/' === $ch) { + break; + } + if (!ctype_alpha($ch) && !ctype_digit($ch)) { + return null; + } + ++$i; + } + if ($start === $i) { + return null; + } + + return strtolower(self::byteSlice($tagContent, $start, $i - $start)); + } + + /** + * @param list $allowed + */ + private static function isTagAllowed(string $tagName, array $allowed): bool + { + $tagName = strtolower($tagName); + foreach ($allowed as $name) { + if ($tagName === $name) { + return true; + } + } + + return false; + } + + private static function isTagWhitespace(string $ch): bool + { + return str_contains(self::TRIM_DEFAULT, $ch); + } + /** * @return list */ diff --git a/ext/standard/strip_tags.php b/ext/standard/strip_tags.php new file mode 100644 index 00000000..787d1798 --- /dev/null +++ b/ext/standard/strip_tags.php @@ -0,0 +1,56 @@ +calledArgs); + if ($argc < 1 || $argc > 2) { + throw new \LogicException('strip_tags() requires one or two arguments in this compiler build'); + } + $v = $frame->calledArgs[0]->resolveIndirect(); + if (null === $frame->returnVar) { + return; + } + if (Variable::TYPE_STRING !== $v->type) { + throw new \LogicException('strip_tags() only supports strings in this compiler build'); + } + $allowed = null; + if (2 === $argc) { + $allowVar = $frame->calledArgs[1]->resolveIndirect(); + if (Variable::TYPE_NULL === $allowVar->type) { + $allowed = null; + } elseif (Variable::TYPE_STRING === $allowVar->type) { + $allowed = $allowVar->toString(); + } else { + throw new \LogicException('strip_tags() allowed_tags must be a string or null in this compiler build'); + } + } + $frame->returnVar->string(VmString::stripTags($v->toString(), $allowed)); + } + + public function call(Context $context, JITVariable ...$args): Value + { + $argc = \count($args); + if ($argc < 1 || $argc > 2) { + throw new \LogicException('strip_tags() requires one or two arguments in this compiler build'); + } + $allowed = 2 === $argc ? $args[1] : null; + + return JitStripTags::stripTags($context, $args[0], $allowed); + } +} diff --git a/lib/AOT/runtime/superglobals_refresh.c b/lib/AOT/runtime/superglobals_refresh.c index 286c7def..93f05a1a 100644 --- a/lib/AOT/runtime/superglobals_refresh.c +++ b/lib/AOT/runtime/superglobals_refresh.c @@ -756,3 +756,216 @@ __string__ *__compiler_number_format( return cstr_to_string(buf); } + +static int st_is_space(char ch) +{ + return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || ch == '\v' || ch == '\f'; +} + +static int st_is_tag_char(char ch) +{ + return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch >= '0' && ch <= '9'); +} + +static int st_find_substr(const char *hay, size_t hlen, const char *needle, size_t nlen, size_t from) +{ + size_t i; + + if (nlen == 0 || from + nlen > hlen) { + return -1; + } + for (i = from; i + nlen <= hlen; i++) { + if (memcmp(hay + i, needle, nlen) == 0) { + return (int) i; + } + } + + return -1; +} + +static void st_tolower_buf(char *buf, size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) { + if (buf[i] >= 'A' && buf[i] <= 'Z') { + buf[i] = (char) (buf[i] - 'A' + 'a'); + } + } +} + +static int st_extract_tag_name(const char *content, size_t clen, char *out, size_t out_cap) +{ + size_t i = 0; + size_t start; + + while (i < clen && st_is_space(content[i])) { + i++; + } + if (i < clen && content[i] == '/') { + i++; + } + if (i >= clen) { + return 0; + } + start = i; + while (i < clen) { + char ch = content[i]; + if (st_is_space(ch) || ch == '>' || ch == '/') { + break; + } + if (!st_is_tag_char(ch)) { + return 0; + } + i++; + } + if (start == i || i - start >= out_cap) { + return 0; + } + memcpy(out, content + start, i - start); + out[i - start] = '\0'; + st_tolower_buf(out, i - start); + + return 1; +} + +static int st_tag_allowed(const char *name, const char *allowed_tags[], int allowed_count) +{ + int i; + + for (i = 0; i < allowed_count; i++) { + if (strcmp(name, allowed_tags[i]) == 0) { + return 1; + } + } + + return 0; +} + +static int st_parse_allowed(const char *allowed, size_t alen, char tags[][32], int max_tags) +{ + int count = 0; + size_t i = 0; + + while (i < alen && count < max_tags) { + int gt; + char content[128]; + size_t clen; + + if (allowed[i] != '<') { + i++; + continue; + } + gt = st_find_substr(allowed, alen, ">", 1, i + 1); + if (gt < 0) { + break; + } + clen = (size_t) gt - i - 1; + if (clen >= sizeof(content)) { + clen = sizeof(content) - 1; + } + memcpy(content, allowed + i + 1, clen); + content[clen] = '\0'; + if (st_extract_tag_name(content, clen, tags[count], sizeof(tags[0]))) { + count++; + } + i = (size_t) gt + 1; + } + + return count; +} + +/** + * LLVM/AOT runtime: strip_tags() subset (mirrors VmString::stripTags). + */ +__string__ *__compiler_strip_tags(__string__ *input, __string__ *allowed) +{ + const char *src; + size_t slen; + const char *allow_src = ""; + size_t alen = 0; + char allowed_list[32][32]; + int allowed_count = 0; + char *out; + size_t out_cap; + size_t out_len = 0; + size_t i = 0; + + src = nf_strdata(input); + slen = nf_strlen(input); + if (allowed != NULL) { + allow_src = nf_strdata(allowed); + alen = nf_strlen(allowed); + if (alen > 0) { + allowed_count = st_parse_allowed(allow_src, alen, allowed_list, 32); + } + } + out_cap = slen + 1; + out = (char *) malloc(out_cap); + if (out == NULL) { + return cstr_to_string(""); + } + + while (i < slen) { + if (src[i] != '<') { + out[out_len++] = src[i++]; + continue; + } + if (i + 3 < slen && memcmp(src + i, "", 3, i + 4); + if (end >= 0) { + i = (size_t) end + 3; + continue; + } + } + if (i + 1 < slen && memcmp(src + i, "", 2, i + 2); + if (end >= 0) { + i = (size_t) end + 2; + continue; + } + } + { + int gt = st_find_substr(src, slen, ">", 1, i + 1); + char tag_name[32]; + char content[256]; + size_t clen; + + if (gt < 0) { + out[out_len++] = src[i++]; + continue; + } + clen = (size_t) gt - i - 1; + if (clen >= sizeof(content)) { + clen = sizeof(content) - 1; + } + memcpy(content, src + i + 1, clen); + content[clen] = '\0'; + if (st_extract_tag_name(content, clen, tag_name, sizeof(tag_name)) + && allowed_count > 0 && st_tag_allowed(tag_name, allowed_list, allowed_count)) { + size_t tag_len = (size_t) gt - i + 1; + if (out_len + tag_len >= out_cap) { + out_cap = out_cap * 2 + tag_len; + { + char *grown = (char *) realloc(out, out_cap); + if (grown == NULL) { + free(out); + return cstr_to_string(""); + } + out = grown; + } + } + memcpy(out + out_len, src + i, tag_len); + out_len += tag_len; + } + i = (size_t) gt + 1; + } + } + out[out_len] = '\0'; + { + __string__ *result = cstr_to_string(out); + free(out); + + return result; + } +} diff --git a/lib/JIT/Builtin/Type.php b/lib/JIT/Builtin/Type.php index da726c36..d24fca75 100755 --- a/lib/JIT/Builtin/Type.php +++ b/lib/JIT/Builtin/Type.php @@ -45,6 +45,14 @@ public function register(): void { ); $fnNumberFormat = $this->context->module->addFunction('__compiler_number_format', $fntypeNumberFormat); $this->context->registerFunction('__compiler_number_format', $fnNumberFormat); + $fntypeStripTags = $this->context->context->functionType( + $this->context->getTypeFromString('__string__*'), + false, + $this->context->getTypeFromString('__string__*'), + $this->context->getTypeFromString('__string__*') + ); + $fnStripTags = $this->context->module->addFunction('__compiler_strip_tags', $fntypeStripTags); + $this->context->registerFunction('__compiler_strip_tags', $fnStripTags); $i8p = $this->context->getTypeFromString('int8*'); $i32 = $this->context->getTypeFromString('int32'); $sizeT = $this->context->getTypeFromString('size_t'); diff --git a/test/compliance/cases/stdlib/strip_tags.phpt b/test/compliance/cases/stdlib/strip_tags.phpt new file mode 100644 index 00000000..1375d499 --- /dev/null +++ b/test/compliance/cases/stdlib/strip_tags.phpt @@ -0,0 +1,15 @@ +--TEST-- +stdlib strip_tags() +--FILE-- +alert(1)hello'), "\n"; +echo strip_tags('xy', ''), "\n"; +echo strip_tags('

a


b', '


'), "\n"; +echo strip_tags('ab'), "\n"; +echo strip_tags('not a tag < incomplete'), "\n"; +--EXPECT-- +alert(1)hello +xy +

a


b +ab +not a tag < incomplete diff --git a/test/compliance/cases/stdlib/strip_tags_jit.phpt b/test/compliance/cases/stdlib/strip_tags_jit.phpt new file mode 100644 index 00000000..9a43b1c9 --- /dev/null +++ b/test/compliance/cases/stdlib/strip_tags_jit.phpt @@ -0,0 +1,13 @@ +--TEST-- +stdlib strip_tags() JIT/AOT path +--FILE-- +alert(1)hello'), "\n"; +echo strip_tags('xy', ''), "\n"; +echo strip_tags('

a


b', '


'), "\n"; +echo strip_tags('ab'), "\n"; +--EXPECT-- +alert(1)hello +xy +

a


b +ab diff --git a/test/fixtures/aot/cases/strip_tags.phpt b/test/fixtures/aot/cases/strip_tags.phpt new file mode 100644 index 00000000..4e8c7477 --- /dev/null +++ b/test/fixtures/aot/cases/strip_tags.phpt @@ -0,0 +1,11 @@ +--TEST-- +AOT strip_tags() +--FILE-- +alert(1)hello'), "\n"; +echo strip_tags('xy', ''), "\n"; +echo strip_tags('ab'), "\n"; +--EXPECT-- +alert(1)hello +xy +ab