diff --git a/.gitignore b/.gitignore index e515560..419e4db 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ composer.lock .phpstan-cache coverage .idea +.claude diff --git a/composer.json b/composer.json index b86972e..3008241 100644 --- a/composer.json +++ b/composer.json @@ -45,7 +45,7 @@ } }, "scripts": { - "test": "pest", + "test": "pest --parallel", "benchmark": "phpbench run", "ecs": "ecs check --fix", "rector": "rector process", diff --git a/docs/json-repair/configuration.mdx b/docs/json-repair/configuration.mdx index c1fcacd..dc9260d 100644 --- a/docs/json-repair/configuration.mdx +++ b/docs/json-repair/configuration.mdx @@ -12,12 +12,12 @@ When `true` (default), non-ASCII characters are escaped. When `false`, Unicode i use function Cortex\JsonRepair\json_repair; // ensureAscii: true (default) - escapes Unicode -json_repair("{'test_中国人_ascii':'统一码'}", ensureAscii: true); -// {"test_中国人_ascii":"\u7edf\u4e00\u7801"} +json_repair("{'city':'上海'}", ensureAscii: true); +// {"city":"\u4e0a\u6d77"} // ensureAscii: false - preserves Unicode -json_repair("{'test_中国人_ascii':'统一码'}", ensureAscii: false); -// {"test_中国人_ascii":"统一码"} +json_repair("{'city':'上海'}", ensureAscii: false); +// {"city":"上海"} ``` ## omitEmptyValues diff --git a/docs/json-repair/installation.mdx b/docs/json-repair/installation.mdx index 02ee466..0155794 100644 --- a/docs/json-repair/installation.mdx +++ b/docs/json-repair/installation.mdx @@ -40,6 +40,7 @@ If you're contributing to the package or want to run tests: ``` + Runs tests, static analysis (PHPStan), coding standards (ECS), and type coverage: ```bash composer check ``` diff --git a/docs/json-repair/logging.mdx b/docs/json-repair/logging.mdx index c09f080..b720cbb 100644 --- a/docs/json-repair/logging.mdx +++ b/docs/json-repair/logging.mdx @@ -39,15 +39,22 @@ Valid JSON produces a single log entry: When repairs occur, you'll see messages such as: - `Starting JSON repair` -- `Adding missing closing quote for unclosed string` -- 
`Adding missing closing bracket/brace` +- `Extracted JSON from markdown code block` +- `Removed comments from JSON` +- `Removing single-line comment` +- `Removing multi-line comment` - `Converting single-quoted key to double quotes` -- `Normalizing boolean/null value` (with context: `from: 'True', to: 'true'`) +- `Converting smart/curly quote to standard double quote` +- `Found doubled quote delimiter pattern, normalizing key` - `Adding quotes around unquoted key` - `Found unquoted string value, adding quotes` -- `Inserting missing comma` - `Inserting missing colon after key` -- `Extracted JSON from markdown code block` +- `Inserting missing comma` +- `Removing trailing comma` +- `Normalizing boolean/null value` (with context: `from: 'True', to: 'true'`) +- `Adding missing closing quote for unclosed string` +- `Removing incomplete string value (omitIncompleteStrings enabled)` +- `Adding missing closing bracket/brace` - `Removing key with missing value (omitEmptyValues enabled)` Log entries include `position` and `context` with `>>>` markers showing where the repair occurred. diff --git a/docs/json-repair/quickstart.mdx b/docs/json-repair/quickstart.mdx index d87f85e..d762137 100644 --- a/docs/json-repair/quickstart.mdx +++ b/docs/json-repair/quickstart.mdx @@ -40,6 +40,23 @@ $data = $repairer->decode(); // ['key' => 'value'] ``` +## Error Handling + +`repair()` throws `JsonRepairException` if the input cannot be repaired into valid JSON: + +```php +use Cortex\JsonRepair\JsonRepairer; +use Cortex\JsonRepair\Exceptions\JsonRepairException; + +try { + $repaired = (new JsonRepairer($input))->repair(); +} catch (JsonRepairException $e) { + // input could not be repaired into valid JSON +} +``` + +The helper functions `json_repair()` and `json_repair_decode()` propagate the same exception. 
+ ## Next Steps - See [Repair Examples](/json-repair/repair-examples) for what gets fixed diff --git a/docs/json-repair/repair-examples.mdx b/docs/json-repair/repair-examples.mdx index 5776dfd..c4ddc4d 100644 --- a/docs/json-repair/repair-examples.mdx +++ b/docs/json-repair/repair-examples.mdx @@ -19,6 +19,16 @@ json_repair("{'key': 'string', \"key4\": unquoted}"); // {"key": "string", "key4": "unquoted"} ``` +Typographic (smart/curly) quotes are normalised to standard double quotes: + +```php +json_repair('{“key”: “value”}'); +// {"key": "value"} + +json_repair("{‘key’: ‘value’}"); +// {"key": "value"} +``` + Quotes inside string values are escaped: ```php @@ -26,6 +36,13 @@ json_repair('{"key": "v"alu"e"}'); // {"key": "v\"alu\"e"} ``` +Invalid escape sequences are double-escaped so the output stays valid: + +```php +json_repair('{"key": "foo\qbar"}'); +// {"key": "foo\\qbar"} +``` + ## Commas and Colons Trailing commas are removed; missing commas and colons are inserted. diff --git a/src/Concerns/InputSanitization.php b/src/Concerns/InputSanitization.php index cecdd95..a12b8cd 100644 --- a/src/Concerns/InputSanitization.php +++ b/src/Concerns/InputSanitization.php @@ -47,6 +47,11 @@ private function extractJsonFromMarkdown(string $input): string */ private function removeComments(string $input): string { + // Fast path: if no comment markers exist, return as-is without scanning + if (! 
str_contains($input, '/*')) { + return $input; + } + $length = strlen($input); $output = ''; $inString = false; diff --git a/src/Concerns/StateMachine.php b/src/Concerns/StateMachine.php index bcb231c..e25e530 100644 --- a/src/Concerns/StateMachine.php +++ b/src/Concerns/StateMachine.php @@ -84,6 +84,7 @@ private function handleStart(string $json, int $i): int private function handleObjectKey(string $json, int $i): int { $char = $json[$i]; + $length = strlen($json); if ($char === '}') { $this->removeTrailingComma(); @@ -97,7 +98,7 @@ private function handleObjectKey(string $json, int $i): int if ($char === '"' || $char === "'") { // Check for double-quote delimiter pattern like ""key"" (slanted delimiter style) // If we have ""X where X is alphanumeric, skip the double quotes and read as unquoted key - if ($i + 2 < strlen($json) && $json[$i + 1] === $char) { + if ($i + 2 < $length && $json[$i + 1] === $char) { $afterDoubleQuote = $json[$i + 2]; if (ctype_alnum($afterDoubleQuote) || $afterDoubleQuote === '_' || $afterDoubleQuote === ' ') { @@ -109,20 +110,16 @@ private function handleObjectKey(string $json, int $i): int $keyEnd = $keyStart; // Read until we hit the closing "" or single " or : or } - while ($keyEnd < strlen($json)) { + while ($keyEnd < $length) { $keyChar = $json[$keyEnd]; // Check for closing "" pattern - if (($keyChar === '"' || $keyChar === "'") && $keyEnd + 1 < strlen( - $json, - ) && $json[$keyEnd + 1] === $keyChar) { + if (($keyChar === '"' || $keyChar === "'") && $keyEnd + 1 < $length && $json[$keyEnd + 1] === $keyChar) { break; } // Also stop at single quote followed by colon (end of key) - if (($keyChar === '"' || $keyChar === "'") && $keyEnd + 1 < strlen( - $json, - ) && $json[$keyEnd + 1] === ':') { + if (($keyChar === '"' || $keyChar === "'") && $keyEnd + 1 < $length && $json[$keyEnd + 1] === ':') { break; } @@ -139,14 +136,12 @@ private function handleObjectKey(string $json, int $i): int $this->state = self::STATE_EXPECTING_COLON; // 
Skip past the closing "" if present - if ($keyEnd + 1 < strlen( - $json, - ) && ($json[$keyEnd] === '"' || $json[$keyEnd] === "'") && $json[$keyEnd + 1] === $json[$keyEnd]) { + if ($keyEnd + 1 < $length && ($json[$keyEnd] === '"' || $json[$keyEnd] === "'") && $json[$keyEnd + 1] === $json[$keyEnd]) { return $keyEnd + 2; } // Skip past single closing " if present (followed by :) - if ($keyEnd < strlen($json) && ($json[$keyEnd] === '"' || $json[$keyEnd] === "'")) { + if ($keyEnd < $length && ($json[$keyEnd] === '"' || $json[$keyEnd] === "'")) { return $keyEnd + 1; } @@ -184,17 +179,17 @@ private function handleObjectKey(string $json, int $i): int return $i + $smartQuoteLength; } - // Unquoted key + // Unquoted key — batch all key chars into one substr() append if (ctype_alnum($char) || $char === '_' || $char === '-') { $this->log('Adding quotes around unquoted key'); - // Track where the key starts $this->currentKeyStart = strlen($this->output); $this->output .= '"'; - while ($i < strlen($json) && (ctype_alnum($json[$i]) || $json[$i] === '_' || $json[$i] === '-')) { - $this->output .= $json[$i]; + $keyStart = $i; + while ($i < $length && (ctype_alnum($json[$i]) || $json[$i] === '_' || $json[$i] === '-')) { $i++; } + $this->output .= substr($json, $keyStart, $i - $keyStart); $this->output .= '"'; $this->state = self::STATE_EXPECTING_COLON; @@ -218,6 +213,7 @@ private function handleObjectKey(string $json, int $i): int private function handleExpectingColon(string $json, int $i): int { $char = $json[$i]; + $length = strlen($json); if ($char === ':') { $this->output .= ':'; @@ -225,7 +221,7 @@ private function handleExpectingColon(string $json, int $i): int // Preserve whitespace after colon $nextI = $i + 1; - while ($nextI < strlen($json) && $json[$nextI] === ' ') { + while ($nextI < $length && $json[$nextI] === ' ') { $this->output .= ' '; $nextI++; } @@ -259,6 +255,7 @@ private function handleExpectingColon(string $json, int $i): int private function 
handleObjectValue(string $json, int $i): int { $char = $json[$i]; + $length = strlen($json); $next = $this->tryOpenObjectOrArray($json, $i, true); @@ -270,9 +267,7 @@ private function handleObjectValue(string $json, int $i): int // Check for double quote at start of value (e.g., {"key": ""value"}) // Skip the first quote if it's immediately followed by another quote and then non-quote content // Check what comes after the second quote - if ($i + 1 < strlen($json) && $json[$i + 1] === $char && ($i + 2 < strlen( - $json, - ) && $json[$i + 2] !== $char && $json[$i + 2] !== '}' && $json[$i + 2] !== ',')) { + if ($i + 1 < $length && $json[$i + 1] === $char && ($i + 2 < $length && $json[$i + 2] !== $char && $json[$i + 2] !== '}' && $json[$i + 2] !== ',')) { // Pattern like ""value" - skip the empty quotes and use the value // Skip the first quote entirely return $i + 1; @@ -313,14 +308,15 @@ private function handleObjectValue(string $json, int $i): int } // Handle non-standard booleans/null (True/False/None) - $matchResult = preg_match('/^(true|false|null|True|False|None)\b/i', substr($json, $i), $matches); + $keywordMatch = $this->tryMatchKeyword($json, $i, $length); - if ($matchResult === 1) { - $normalized = $this->normalizeBoolean($matches[1]); + if ($keywordMatch !== null) { + [$normalized, $klen] = $keywordMatch; + $original = substr($json, $i, $klen); - if ($matches[1] !== $normalized) { + if ($original !== $normalized) { $this->log('Normalizing boolean/null value', [ - 'from' => $matches[1], + 'from' => $original, 'to' => $normalized, ]); } @@ -330,7 +326,7 @@ private function handleObjectValue(string $json, int $i): int // Reset key tracking after successfully completing a boolean/null value $this->currentKeyStart = -1; - return $i + strlen($matches[1]); + return $i + $klen; } // Handle numbers @@ -392,6 +388,7 @@ private function handleObjectValue(string $json, int $i): int private function handleArrayValue(string $json, int $i): int { $char = $json[$i]; + 
$length = strlen($json); if ($char === ']') { $this->removeTrailingComma(); @@ -419,13 +416,14 @@ private function handleArrayValue(string $json, int $i): int } // Handle non-standard booleans/null (True/False/None) - $matchResult = preg_match('/^(true|false|null|True|False|None)\b/i', substr($json, $i), $matches); + $keywordMatch = $this->tryMatchKeyword($json, $i, $length); - if ($matchResult === 1) { - $this->output .= $this->normalizeBoolean($matches[1]); + if ($keywordMatch !== null) { + [$normalized, $klen] = $keywordMatch; + $this->output .= $normalized; $this->state = self::STATE_EXPECTING_COMMA_OR_END; - return $i + strlen($matches[1]); + return $i + $klen; } // Handle numbers @@ -469,7 +467,8 @@ private function handleExpectingCommaOrEnd(string $json, int $i): int // Preserve whitespace after comma $nextI = $i + 1; - while ($nextI < strlen($json) && $json[$nextI] === ' ') { + $length = strlen($json); + while ($nextI < $length && $json[$nextI] === ' ') { $this->output .= ' '; $nextI++; } @@ -511,20 +510,28 @@ private function handleNumber(string $json, int $i): int $i++; } - // Handle integer part + // Handle integer part — batch all digits into one substr() append + $start = $i; while ($i < $length && ctype_digit($json[$i])) { - $this->output .= $json[$i]; $i++; } + if ($i > $start) { + $this->output .= substr($json, $start, $i - $start); + } + // Handle decimal point if ($i < $length && $json[$i] === '.') { $this->output .= '.'; $i++; + $start = $i; while ($i < $length && ctype_digit($json[$i])) { - $this->output .= $json[$i]; $i++; } + + if ($i > $start) { + $this->output .= substr($json, $start, $i - $start); + } } // Handle exponent @@ -538,18 +545,17 @@ private function handleNumber(string $json, int $i): int $i++; } - $hasExponentDigits = false; + // Batch exponent digits; track where they start to detect empty exponent + $digitStart = $i; while ($i < $length && ctype_digit($json[$i])) { - $this->output .= $json[$i]; $i++; - $hasExponentDigits = 
true; } - // If we started an exponent but don't have digits, remove the incomplete exponent - if (! $hasExponentDigits) { - // Remove the 'e' or 'E' and optional sign - $exponentLength = $i - $exponentStart; - $this->output = substr($this->output, 0, -$exponentLength); + if ($i > $digitStart) { + $this->output .= substr($json, $digitStart, $i - $digitStart); + } else { + // No digits after 'e'/'E' — remove the incomplete exponent (letter + optional sign) + $this->output = substr($this->output, 0, -($digitStart - $exponentStart)); } } @@ -752,4 +758,52 @@ private function escapeStringValue(string $value): string { return str_replace(['\\', '"'], ['\\\\', '\\"'], $value); } + + /** + * Boolean / null keyword literals for {@see tryMatchKeyword()} (substr_compare + word boundary). + * + * @return list<array{string, int, string}> Each tuple: keyword text, byte length, normalized JSON token. + */ + private static function keywordMatchSpecs(): array + { + return [ + ['true', 4, 'true'], + ['false', 5, 'false'], + ['null', 4, 'null'], + ['none', 4, 'null'], + ]; + } + + /** + * Try to match a boolean or null keyword at the given position without regex. + * + * Uses substr_compare to avoid creating a substring and bypasses the regex + * engine entirely. Checks a word boundary after the match. + * + * @param int $length Pre-computed strlen($json) + * + * @return array{string, int}|null [normalized_value, keyword_length] or null + */ + private function tryMatchKeyword(string $json, int $i, int $length): ?array + { + $c = $json[$i]; + + // Quick first-char gate before doing heavier work + if (! in_array($c, ['t', 'T', 'f', 'F', 'n', 'N'], true)) { + return null; + } + + foreach (self::keywordMatchSpecs() as [$keyword, $klen, $normalized]) { + if ($length - $i >= $klen && substr_compare($json, $keyword, $i, $klen, true) === 0) { + $afterPos = $i + $klen; + + // Word boundary: next char must not be alphanumeric or underscore + if ($afterPos >= $length || (! 
ctype_alnum($json[$afterPos]) && $json[$afterPos] !== '_')) { + return [$normalized, $klen]; + } + } + } + + return null; + } }