From f37b437caa55a4aa6a0e534f5e236e8b1d34a1b3 Mon Sep 17 00:00:00 2001 From: Matthew J Mucklo Date: Sun, 12 Apr 2026 22:02:33 -0700 Subject: [PATCH] v3.3: serialization, canonical form, local-part normalizer, docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All v3.3 roadmap items (ecosystem bridges deferred by user direction). Fully additive — no breaking changes for v3.2 callers. Serialization on value objects: - ParsedEmailAddress::toArray(): round-trips to the legacy parse() array shape, field order matching the parser output. Useful when mixing typed and array-based code. - ParsedEmailAddress::toJson(int $flags = 0): json_encode wrapper with JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES always set; additional flags (e.g. JSON_PRETTY_PRINT) passed through. ParseErrorCode is a BackedEnum so it serializes to its backing string value automatically. - ParseResult::toArray() and ParseResult::toJson(): same pattern; each address in the batch is serialized via ParsedEmailAddress::toArray(). Stringable: - ParsedEmailAddress now implements \Stringable. (string) $parsed returns simpleAddress for valid addresses, empty string otherwise. Lets a parsed address drop directly into string contexts (logging, templates). Canonical RFC 5322 display form: - ParsedEmailAddress::canonical(): returns the minimal-quoting canonical form per RFC 5322 §3.2.4 (local-part) and §3.2.5 (phrase). Drops unnecessary quotes that the input may have carried (e.g. '"John Doe" <john@example.com>' -> 'John Doe <john@example.com>') and adds quotes only where required (e.g. '"John Q. Public" <john@example.com>' stays quoted due to the non-atext period). Returns empty string for invalid addresses. - Helpers isAtextDotAtom() and isPhraseAtoms() inspect the content against the ABNF character classes. Local-part normalizer callback: - New ParseOptions property $localPartNormalizer (readonly ?\Closure) and withLocalPartNormalizer(?callable) fluent builder. 
Any callable is accepted and wrapped via Closure::fromCallable for uniform storage. - The callback fn(string $localPart, string $domain): string is invoked after local-part validation succeeds; its return value replaces local_part_parsed in the output (and the quoted display form is re-derived). originalAddress still preserves the verbatim input for audit/logging. - Typical uses: Gmail dot-insensitivity and +tag stripping, or any domain-specific canonicalization. The callback is gated behind the validation success check so it only sees addresses that conform to the configured ParseOptions rules. - cloneWith() handles localPartNormalizer specially: the $get() closure uses ?? which would treat explicit null as "fall back", so an explicit array_key_exists() check is used instead to support clearing. Docs: - README: switch the nine parse($email, false) call sites in examples to parseSingle($email); keep one under "Other Examples" to document the legacy array shape with a note pointing new code at the typed API. Add serialization examples in Basic Usage. Add localPartNormalizer to the rule-properties table. - ROADMAP: v3.3 section expanded with checkbox items; serialization, canonicalization, and normalizer items flipped to [x]. Ecosystem bridges listed but unchecked with a note marking them deferred. New "Quality and Infrastructure (ongoing)" section covers mutation testing, property-based tests, PHPStan level bump, Psalm, PhpBench, CONTRIBUTING.md and related cross-release work. - UPGRADE: new v3.2 -> v3.3 section covering the additions. - CHANGELOG: v3.3.0 entry with Added / Changed sections. Tests: 60 tests / 472 assertions (up from 42 / 445 in v3.2). 
New tests: - toArray round-trips to legacy parse() output exactly (assertSame) - toJson produces parseable JSON with ParseErrorCode as backing string - Stringable returns simpleAddress when valid, '' when invalid - canonical() for six forms: addr-spec only, with atext name, stripping unnecessary name quotes, keeping required name quotes, quoted local-part, invalid -> empty string - Local-part normalizer: Gmail-style rewrite, domain gating, not invoked on invalid inputs, null-clearing via withLocalPartNormalizer --- CHANGELOG.md | 15 ++++ README.md | 41 +++++---- ROADMAP.md | 54 ++++++++++-- UPGRADE.md | 30 +++++++ src/Parse.php | 18 ++++ src/ParseOptions.php | 28 ++++++ src/ParseResult.php | 31 +++++++ src/ParsedEmailAddress.php | 123 ++++++++++++++++++++++++++- tests/ParseTest.php | 170 +++++++++++++++++++++++++++++++++++++ 9 files changed, 489 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 511e0d2..89a13ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,21 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), ## [Unreleased] +## [3.3.0] + +Serialization and ergonomic polish for the typed value objects, a canonical display-form method, and an opt-in local-part normalizer callback. All additions are non-breaking for v3.2 callers. + +### Added +- `ParsedEmailAddress::toArray(): array` — round-trips to the legacy array shape produced by `Parse::parse()`. Useful when mixing typed and array-based code. +- `ParsedEmailAddress::toJson(int $flags = 0): string` — convenience wrapper over `json_encode` with `JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES`. `ParseErrorCode` serializes to its backing string value. +- `ParseResult::toArray()` and `ParseResult::toJson()` — same for the multi-address container; each entry is serialized via `ParsedEmailAddress::toArray()`. +- `ParsedEmailAddress implements \Stringable` — `(string) $parsed` returns the `simpleAddress` for valid addresses, empty string otherwise. 
Lets a parsed address drop directly into string contexts (logging, templates, etc.). +- `ParsedEmailAddress::canonical(): string` — canonical RFC 5322 display form with minimal quoting per §3.2.4 (local-part) and §3.2.5 (phrase). Drops unnecessary quotes that `$address` may preserve from the input, and adds quotes only where required. Returns empty string for invalid addresses. +- `ParseOptions::$localPartNormalizer` (readonly `?\Closure`) + `withLocalPartNormalizer(?callable)` fluent builder. The callback `fn(string $localPart, string $domain): string` is invoked after local-part validation succeeds; the returned string replaces `local_part_parsed` in the output. Typical uses: Gmail dot-insensitivity, `+tag` plus-addressing, or any domain-specific canonicalization. `originalAddress` still preserves the verbatim input. + +### Changed +- None — all additions; no behavior changes for existing callers. + ## [3.2.0] Streaming batch parsing, severity classification for validation errors, RFC 5322 §4.4 obs-route support, and broader CFWS tolerance around addr-spec boundaries. All additions are non-breaking for v3.1 callers. diff --git a/README.md b/README.md index a208a14..233487e 100644 --- a/README.md +++ b/README.md @@ -51,6 +51,13 @@ foreach (Parse::getInstance()->parseStream($csvRows) as $addr) { if ($addr->invalid) continue; // ... 
} + +// Serialization (v3.3+) +$parsed = Parse::getInstance()->parseSingle('"J Doe" '); +(string) $parsed; // "j@example.com" — Stringable returns simple_address +$parsed->canonical(); // 'J Doe ' — minimal RFC 5322 quoting +$parsed->toArray(); // legacy array shape, for mixed-API code +$parsed->toJson(); // JSON string ``` ### Advanced Usage with ParseOptions @@ -94,7 +101,7 @@ $parser = new Parse(null, $options); // RFC 6531 — Strict Internationalized (full UTF-8 + NFC normalization) $options = ParseOptions::rfc6531(); $parser = new Parse(null, $options); -$result = $parser->parse('müller@münchen.de', false); // Valid UTF-8 address +$result = $parser->parseSingle('müller@münchen.de'); // Valid UTF-8 address // RFC 5322 — Standard with obsolete syntax support (recommended) $options = ParseOptions::rfc5322(); @@ -133,10 +140,10 @@ $parser = new Parse(null, $options); $options = ParseOptions::rfc6531(); $parser = new Parse(null, $options); -$result = $parser->parse('José.García@españa.es', false); +$result = $parser->parseSingle('José.García@españa.es'); // Valid: UTF-8 characters allowed in rfc6531() preset -$result = $parser->parse('.user@example.com', false); +$result = $parser->parseSingle('.user@example.com'); // Invalid: Leading dot not allowed (dot-atom restrictions still apply) ``` @@ -173,6 +180,7 @@ $parser = new Parse(null, $options); | `validateDisplayNamePhrase` | `false` | Enforce RFC 5322 §3.2.5 phrase syntax on unquoted display names | | `strictIdna` | `false` | Apply full IDNA2008 conformance on U-label domains (RFC 5891/5892/5893) | | `allowObsRoute` | `false` | Accept RFC 5322 §4.4 obs-route source-routes like `<@host1,@host2:user@host3>` | +| `localPartNormalizer` | `null` | `?callable(string $local, string $domain): string` — domain-specific canonicalization hook (v3.3+); set via `withLocalPartNormalizer()` | | **Length & Output** | | | | `enforceLengthLimits` | `true` | Enforce RFC 5321 length limits (64/254/63) | | `includeDomainAscii` | 
`false` | Include punycode `domain_ascii` in output | @@ -244,9 +252,9 @@ The `domain_ascii` field is included in the output when `includeDomainAscii` is ```php $options = ParseOptions::rfc6531(); $parser = new Parse(null, $options); -$result = $parser->parse('user@bücher.de', false); -// $result['domain'] = 'bücher.de' -// $result['domain_ascii'] = 'xn--bcher-kva.de' +$result = $parser->parseSingle('user@bücher.de'); +// $result->domain === 'bücher.de' +// $result->domainAscii === 'xn--bcher-kva.de' ``` ### Comment Extraction @@ -257,20 +265,20 @@ RFC 5322 allows comments in email addresses using parentheses. The parser automa use Email\Parse; // Single comment -$result = Parse::getInstance()->parse('john@example.com (home address)', false); -// $result['comments'] = ['home address'] +$result = Parse::getInstance()->parseSingle('john@example.com (home address)'); +// $result->comments === ['home address'] // Multiple comments -$result = Parse::getInstance()->parse('test(comment1)(comment2)@example.com', false); -// $result['comments'] = ['comment1', 'comment2'] +$result = Parse::getInstance()->parseSingle('test(comment1)(comment2)@example.com'); +// $result->comments === ['comment1', 'comment2'] // Nested comments -$result = Parse::getInstance()->parse('test@example.com (comment with (nested) parens)', false); -// $result['comments'] = ['comment with (nested) parens'] +$result = Parse::getInstance()->parseSingle('test@example.com (comment with (nested) parens)'); +// $result->comments === ['comment with (nested) parens'] // No comments -$result = Parse::getInstance()->parse('test@example.com', false); -// $result['comments'] = [] +$result = Parse::getInstance()->parseSingle('test@example.com'); +// $result->comments === [] ``` Comments are stripped from the `address` field but preserved in `original_address`. 
@@ -304,7 +312,7 @@ $parser = new Parse(null, $options); // Use the rfc6531() preset for full internationalized email support $options = ParseOptions::rfc6531(); $parser = new Parse(null, $options); -$result = $parser->parse('müller@münchen.de', false); +$result = $parser->parseSingle('müller@münchen.de'); ``` #### Function Spec #### @@ -357,6 +365,9 @@ $result = $parser->parse('müller@münchen.de', false); Other Examples: --------------- + +The following examples use the legacy array-returning `parse()` method to document its full output shape. New code should prefer `parseSingle()` / `parseMultiple()` (see Basic Usage) for typed return values; both APIs expose the same underlying fields. + ```php $email = '"J Doe" '; $result = Email\Parse::getInstance()->parse($email, false); diff --git a/ROADMAP.md b/ROADMAP.md index 845a398..accefc9 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -57,6 +57,50 @@ Future plans by version. Items here are intent, not commitment — priority and - [x] `obs-domain-list` — the `*("," [CFWS] ["@" domain])` shape is consumed inside `STATE_OBS_ROUTE`. - [x] CFWS (comments / folding whitespace) improvements — look-ahead in the whitespace handler now absorbs CFWS at dot-atom boundaries (`local @domain`, `local@ domain`, `local @ domain`) and around angle-addr delimiters (`< local@domain >`, ``), including folded whitespace (LF + WSP). Comments in these positions were already supported in v3.0. +## v3.3 — Polish, Ergonomics — shipped + +Non-breaking follow-on to v3.2. + +**Serialization ergonomics:** +- [x] `ParsedEmailAddress::toArray(): array` — round-trips to the legacy array shape for callers mixing typed and array-based code. +- [x] `ParsedEmailAddress::toJson(int $flags = 0): string` — convenience wrapper over `json_encode` with `JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES`. +- [x] `implements \Stringable` on `ParsedEmailAddress` — returns `simpleAddress` for valid addresses; empty string otherwise. Drops directly into string contexts. 
+- [x] `ParseResult::toArray()` and `toJson()` counterparts. + +**Canonicalization (pulled forward from v4.0):** +- [x] `ParsedEmailAddress::canonical(): string` — minimal-quoting RFC 5322 display form per §3.2.4 (local-part) and §3.2.5 (phrase). +- [x] Optional local-part normalizer callback on `ParseOptions` for domain-specific rules (Gmail dot-insensitivity, `+tag` plus-addressing). Attached via `withLocalPartNormalizer(?callable)`. + +**Ecosystem bridges:** *(deferred — out of scope for v3.3 per user direction)* +- [ ] `mmucklo/email-parse-symfony` — Symfony `Constraint` + `ConstraintValidator` attribute. Wraps existing `ParseOptions` presets. +- [ ] `mmucklo/email-parse-laravel` — Laravel validation rule, service provider for DI. +- [ ] PSR-14 event dispatcher integration — emit a `ParsedAddressEvent` per result for observability. + +## Quality and Infrastructure (ongoing) + +Not tied to a specific release; picked up as time allows. + +**Testing depth:** +- [ ] Mutation testing with Infection. Surfaces tests whose assertions are too weak to catch small code mutations. Target ≥85% MSI (mutation score indicator). +- [ ] Property-based testing (Eris or Pest plugin): generate random valid addresses, assert `parseSingle(parseSingle($x)->simpleAddress)` round-trips; perturb bytes and assert error codes. +- [ ] Parse.php line coverage 86.69% → ≥95% — remaining gaps are obscure error branches and the "shouldn't ever get here" default case. +- [ ] CI matrix: add PHP 8.5 once released. + +**Static analysis:** +- [ ] PHPStan level 6 → 8 (or `max`) — tighter generics and inference on the state machine. Likely requires additional docblock array shapes. +- [ ] Add Psalm alongside PHPStan for cross-tool coverage; keep both green. + +**Performance:** +- [ ] PhpBench suite: parsing throughput for realistic inputs (single ASCII, multi-address batch, UTF-8, IDN, obs-route). Establishes a baseline before any optimization. 
+- [ ] Profile the state machine under mailing-list-sized inputs. Likely hot path: `mb_substr` in the main loop — investigate byte iteration for pure-ASCII inputs. + +**Community / documentation:** +- [ ] `CONTRIBUTING.md` with dev setup, CI expectations, and commit-style guidance. +- [ ] GitHub issue + pull-request templates. +- [ ] `CODE_OF_CONDUCT.md`. +- [ ] Examples directory or GitHub Pages cookbook (UTF-8 addresses, obs-route in practice, custom normalizers once they ship, Symfony/Laravel integration snippets). +- [ ] README cleanup — split the large reference tables into `docs/` sub-pages if the top-level README grows further. + ## v4.0 — Breaking Modernization **API cleanup:** @@ -65,8 +109,8 @@ Future plans by version. Items here are intent, not commitment — priority and - [ ] Deprecate or remove the `getInstance()` singleton (recommend explicit instantiation). - [ ] Constructor promotion on `ParseOptions` with named arguments. -**New capabilities:** -- [ ] Optional DNS/MX validation via callback interface (`DnsValidator`). -- [ ] Group syntax support (RFC 6854: `Group Name: addr1, addr2;`). -- [ ] `canonicalize(ParsedEmailAddress): string` — standard display form. -- [ ] Optional local-part normalizer callback for domain-specific rules (e.g. Gmail dot-insensitivity, plus-addressing). +**New capabilities (genuinely breaking or late-binding):** +- [ ] Optional DNS/MX validation via callback interface (`DnsValidator`). Breaking because the Parse constructor signature grows, and because synchronous DNS lookups change performance characteristics meaningfully. +- [ ] Group syntax support (RFC 6854: `Group Name: addr1, addr2;`). Breaking because it introduces a new output-container shape for grouped results. 
+ +*Note: `canonicalize()` and the local-part normalizer callback were moved to v3.3 as additive (non-breaking) features.* diff --git a/UPGRADE.md b/UPGRADE.md index 1c9b055..55f89c7 100644 --- a/UPGRADE.md +++ b/UPGRADE.md @@ -1,5 +1,35 @@ # Upgrade Guide +## v3.2 → v3.3 + +v3.3 is fully additive — no breaking changes, no behavior changes for existing callers. Everything listed here is opt-in. + +### Additions + +- **Serialization**: `ParsedEmailAddress::toArray()`, `ParsedEmailAddress::toJson()`, and the corresponding methods on `ParseResult`. Use these to round-trip typed objects back to the legacy array shape or emit JSON: + ```php + $result = $parser->parseSingle('user@example.com'); + $result->toArray(); // legacy array shape + $result->toJson(); // JSON string; ParseErrorCode serializes to its backing value + ``` +- **`implements \Stringable`** on `ParsedEmailAddress` — `(string) $parsed` returns `simpleAddress` for valid addresses, empty string otherwise. Lets a parsed address drop into string contexts (logging, templating, concatenation). +- **`ParsedEmailAddress::canonical()`** — minimal-quoting RFC 5322 display form. Drops unnecessary quotes that the `$address` field may preserve from the input; adds quotes only when §3.2.4 / §3.2.5 require them. +- **Local-part normalizer callback** — configure with `withLocalPartNormalizer(fn(string $local, string $domain): string)`. Invoked only after successful validation; the returned string replaces `local_part_parsed`. `originalAddress` still preserves the verbatim input. Example (Gmail): + ```php + $opts = ParseOptions::rfc5322()->withLocalPartNormalizer( + fn (string $local, string $domain): string => + $domain === 'gmail.com' + ? ($plus = strpos(str_replace('.', '', $local), '+')) === false + ? str_replace('.', '', $local) + : substr(str_replace('.', '', $local), 0, $plus) + : $local, + ); + ``` + +### Minimum Requirements (Unchanged) + +PHP `^8.1`, `ext-mbstring`, `ext-intl`. 
+ ## v3.1 → v3.2 v3.2 is fully additive — no breaking changes. Two behavior changes are worth noting for callers who depended on them: diff --git a/src/Parse.php b/src/Parse.php index 3d4f6da..0486968 100644 --- a/src/Parse.php +++ b/src/Parse.php @@ -1079,6 +1079,24 @@ private function addAddress( ? "\"{$emailAddress['local_part_parsed']}\"" : $emailAddress['local_part_parsed']; } + + // Optional caller-supplied local-part normalizer — invoked after structural + // validation so the callback only sees addresses that already conform to + // the configured ParseOptions rules. Typical uses: Gmail dot-insensitivity + // (`john.doe` → `johndoe`), plus-addressing (`user+tag` → `user`), or any + // domain-specific canonicalization. The returned string replaces + // local_part_parsed and the display form is re-derived; `original_address` + // still preserves the verbatim input. + if (!$emailAddress['invalid'] && $this->options->localPartNormalizer !== null) { + $normalizer = $this->options->localPartNormalizer; + $normalized = $normalizer($emailAddress['local_part_parsed'], $emailAddress['domain']); + if ($normalized !== $emailAddress['local_part_parsed']) { + $emailAddress['local_part_parsed'] = $normalized; + $localPart = $emailAddress['local_part_quoted'] + ? "\"{$emailAddress['local_part_parsed']}\"" + : $emailAddress['local_part_parsed']; + } + } } // FQDN check diff --git a/src/ParseOptions.php b/src/ParseOptions.php index e454852..eba2524 100644 --- a/src/ParseOptions.php +++ b/src/ParseOptions.php @@ -43,6 +43,7 @@ class ParseOptions * @param bool $validateDisplayNamePhrase Enforce RFC 5322 §3.2.5 phrase syntax for unquoted display names (atext + WSP only). * @param bool $strictIdna Apply full IDNA2008 conformance on U-label domains (CONTEXTJ/O, Bidi rule, STD3, nontransitional mapping). * @param bool $allowObsRoute Accept RFC 5322 §4.4 obs-route source-route prefix inside angle-addr (e.g. 
`<@host1,@host2:user@host3>`); the route is captured and the real addr-spec is used ("accept and discard" per spec). + * @param ?\Closure $localPartNormalizer Optional callback `fn(string $localPart, string $domain): string` invoked after local-part validation succeeds. The returned string replaces `local_part_parsed` in the output (and is re-quoted if needed). Typical uses: Gmail dot-insensitivity, `+tag` plus-addressing. */ public function __construct( array $bannedChars = [], @@ -66,6 +67,7 @@ public function __construct( public readonly bool $validateDisplayNamePhrase = false, public readonly bool $strictIdna = false, public readonly bool $allowObsRoute = false, + public readonly ?\Closure $localPartNormalizer = null, ) { foreach ($bannedChars as $char) { $this->bannedChars[$char] = true; @@ -304,6 +306,29 @@ public function withAllowObsRoute(bool $value): self return $this->cloneWith(['allowObsRoute' => $value]); } + /** + * Supply a local-part normalizer callback, or `null` to clear any current one. + * + * The callback is invoked after local-part validation succeeds with + * `fn(string $localPart, string $domain): string`. Its return value + * replaces `local_part_parsed` in the output — typical uses are Gmail + * dot-insensitivity (`john.doe` → `johndoe`) and plus-addressing + * (`user+tag` → `user`), typically gated on the domain. + * + * $opts = ParseOptions::rfc5322()->withLocalPartNormalizer( + * fn(string $local, string $domain): string => + * $domain === 'gmail.com' + * ? strtolower(strstr(str_replace('.', '', $local), '+', true) ?: str_replace('.', '', $local)) + * : $local, + * ); + */ + public function withLocalPartNormalizer(?callable $normalizer): self + { + return $this->cloneWith([ + 'localPartNormalizer' => $normalizer === null ? null : \Closure::fromCallable($normalizer), + ]); + } + /** * Build a new ParseOptions preserving every current value except those * listed in $overrides. 
@@ -336,6 +361,9 @@ private function cloneWith(array $overrides): self validateDisplayNamePhrase: $get('validateDisplayNamePhrase', $this->validateDisplayNamePhrase), strictIdna: $get('strictIdna', $this->strictIdna), allowObsRoute: $get('allowObsRoute', $this->allowObsRoute), + localPartNormalizer: array_key_exists('localPartNormalizer', $overrides) + ? $overrides['localPartNormalizer'] + : $this->localPartNormalizer, ); } diff --git a/src/ParseResult.php b/src/ParseResult.php index bafa226..593b949 100644 --- a/src/ParseResult.php +++ b/src/ParseResult.php @@ -38,4 +38,35 @@ public static function fromArray(array $arr): self ), ); } + + /** + * Round-trip to the array shape produced by {@see Parse::parse()} in + * multi-address mode. Each address is serialized via + * {@see ParsedEmailAddress::toArray()}. + * + * @return array{success: bool, reason: ?string, email_addresses: array>} + */ + public function toArray(): array + { + return [ + 'success' => $this->success, + 'reason' => $this->reason, + 'email_addresses' => array_map( + fn (ParsedEmailAddress $a) => $a->toArray(), + $this->emailAddresses, + ), + ]; + } + + /** + * JSON-encoded representation. Convenience wrapper over {@see toArray()}. + * + * @param int $flags Flags passed through to `json_encode` (e.g. `JSON_PRETTY_PRINT`). + */ + public function toJson(int $flags = 0): string + { + $encoded = json_encode($this->toArray(), $flags | JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES); + + return $encoded === false ? '{}' : $encoded; + } } diff --git a/src/ParsedEmailAddress.php b/src/ParsedEmailAddress.php index 3aa9239..30ed57d 100644 --- a/src/ParsedEmailAddress.php +++ b/src/ParsedEmailAddress.php @@ -9,7 +9,7 @@ * Every field is also present in the legacy array output of {@see Parse::parse()}; * callers preferring typed access with IDE autocomplete should use the new methods. 
*/ -final class ParsedEmailAddress +final class ParsedEmailAddress implements \Stringable { /** * @param string $address Canonical address, comments stripped (e.g. `"J Doe" `). @@ -93,4 +93,125 @@ public function invalidSeverity(): ?ValidationSeverity { return $this->invalidReasonCode?->severity(); } + + /** + * Round-trip to the legacy array shape produced by {@see Parse::parse()}. + * Field order matches the parser output so the result is compatible with + * code that consumes the array-based API. `invalidReasonCode` is emitted + * as a `ParseErrorCode` enum (or `null`); callers wanting the string form + * should access `$result['invalid_reason_code']?->value`. + * + * @return array + */ + public function toArray(): array + { + return [ + 'address' => $this->address, + 'simple_address' => $this->simpleAddress, + 'original_address' => $this->originalAddress, + 'name' => $this->name, + 'name_parsed' => $this->nameParsed, + 'local_part' => $this->localPart, + 'local_part_parsed' => $this->localPartParsed, + 'domain_part' => $this->domainPart, + 'domain' => $this->domain, + 'domain_ascii' => $this->domainAscii, + 'ip' => $this->ip, + 'invalid' => $this->invalid, + 'invalid_reason' => $this->invalidReason, + 'invalid_reason_code' => $this->invalidReasonCode, + 'comments' => $this->comments, + 'obs_route' => $this->obsRoute, + ]; + } + + /** + * JSON-encoded representation. Convenience wrapper over {@see toArray()}. + * `ParseErrorCode` serializes to its backing string value under the default + * enum-serialization rules. + * + * @param int $flags Flags passed through to `json_encode` (e.g. `JSON_PRETTY_PRINT`). + */ + public function toJson(int $flags = 0): string + { + $encoded = json_encode($this->toArray(), $flags | JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES); + + return $encoded === false ? '{}' : $encoded; + } + + /** + * Canonical RFC 5322 display form for the address. + * + * Rules: + * - Invalid addresses return the empty string. 
+ * - No display name: returns `local@domain` (or `local@[IP]`). + * - With display name: returns `Name ` or + * `"Display Name" ` when the name or local-part contains + * characters that require quoting per RFC 5322 §3.2.4 / §3.2.5. + * + * Minimal quoting is applied: quotes are only added when the content + * contains a character outside the atext set (for the local-part) or the + * atext + WSP set (for the display-name phrase). This differs from + * {@see $address} which preserves whichever form the parser observed in + * the input. + */ + public function canonical(): string + { + if ($this->invalid) { + return ''; + } + + $local = self::isAtextDotAtom($this->localPartParsed) || $this->localPartParsed === '' + ? $this->localPartParsed + : '"' . addcslashes($this->localPartParsed, '"\\') . '"'; + + $addrSpec = $local . '@' . $this->domainPart; + + if ($this->nameParsed === '') { + return $addrSpec; + } + + $name = self::isPhraseAtoms($this->nameParsed) + ? $this->nameParsed + : '"' . addcslashes($this->nameParsed, '"\\') . '"'; + + return $name . ' <' . $addrSpec . '>'; + } + + /** + * {@inheritDoc} + * + * Stringable: implicitly convertible to the address's simple form + * (`local@domain-part`) for use in string contexts like logging and + * templating. Invalid addresses stringify to the empty string. + */ + public function __toString(): string + { + return $this->invalid ? '' : $this->simpleAddress; + } + + /** + * True when the string conforms to RFC 5322 §3.2.3 dot-atom-text + * (1*atext *("." 1*atext)) — i.e. can appear unquoted in an addr-spec. + */ + private static function isAtextDotAtom(string $s): bool + { + return (bool) preg_match( + "/^[A-Za-z0-9!#\$%&'*+\\-\\/=?^_`{|}~]+(?:\\.[A-Za-z0-9!#\$%&'*+\\-\\/=?^_`{|}~]+)*\$/", + $s, + ); + } + + /** + * True when the string is a sequence of RFC 5322 §3.2.5 phrase atoms — + * atext runs separated by single spaces — meaning no display-name quoting + * is required. 
+ */ + private static function isPhraseAtoms(string $s): bool + { + return (bool) preg_match( + "/^[A-Za-z0-9!#\$%&'*+\\-\\/=?^_`{|}~]+(?:[ \\t]+[A-Za-z0-9!#\$%&'*+\\-\\/=?^_`{|}~]+)*\$/", + $s, + ); + } } diff --git a/tests/ParseTest.php b/tests/ParseTest.php index a219adf..70194e2 100644 --- a/tests/ParseTest.php +++ b/tests/ParseTest.php @@ -669,4 +669,174 @@ public function testCfwsFoldingWhitespace(): void $this->assertSame('local', $result->localPart); $this->assertSame('domain.com', $result->domain); } + + public function testToArrayRoundTripsLegacyShape(): void + { + // Parse an address both ways; toArray() on the typed object must match + // the legacy parse() output exactly (same keys, same order, same types). + $parser = new Parse(); + $legacy = $parser->parse('"J Doe" (nickname)', false); + $typed = $parser->parseSingle('"J Doe" (nickname)'); + + $this->assertSame($legacy, $typed->toArray()); + } + + public function testToArrayPreservesErrorCode(): void + { + $typed = Parse::getInstance()->parseSingle('not-an-email'); + $arr = $typed->toArray(); + $this->assertTrue($arr['invalid']); + $this->assertInstanceOf(\Email\ParseErrorCode::class, $arr['invalid_reason_code']); + } + + public function testToJsonProducesParseableJson(): void + { + $typed = Parse::getInstance()->parseSingle('user@example.com'); + $decoded = json_decode($typed->toJson(), true); + $this->assertIsArray($decoded); + $this->assertSame('user', $decoded['local_part']); + $this->assertSame('example.com', $decoded['domain']); + } + + public function testToJsonSerializesErrorCodeAsString(): void + { + // ParseErrorCode is a BackedEnum; json_encode emits its backing value. 
+ $typed = Parse::getInstance()->parseSingle('<'); + $decoded = json_decode($typed->toJson(), true); + $this->assertSame('multiple_opening_angle', $decoded['invalid_reason_code']); + } + + public function testStringableReturnsSimpleAddressWhenValid(): void + { + $typed = Parse::getInstance()->parseSingle('"J Doe" '); + $this->assertSame('john@example.com', (string) $typed); + } + + public function testStringableReturnsEmptyStringWhenInvalid(): void + { + $typed = Parse::getInstance()->parseSingle('not-an-email'); + $this->assertSame('', (string) $typed); + } + + public function testCanonicalAddrSpecWithoutName(): void + { + $typed = Parse::getInstance()->parseSingle('john@example.com'); + $this->assertSame('john@example.com', $typed->canonical()); + } + + public function testCanonicalAddrSpecWithSimpleName(): void + { + // Atext-only name needs no quotes. + $typed = Parse::getInstance()->parseSingle('John Doe '); + $this->assertSame('John Doe ', $typed->canonical()); + } + + public function testCanonicalStripsUnnecessaryNameQuotes(): void + { + // Input had quotes; canonical form drops them because the name is + // pure atext+WSP and quoting is not required per RFC 5322 §3.2.5. + $typed = Parse::getInstance()->parseSingle('"John Doe" '); + $this->assertSame('John Doe ', $typed->canonical()); + } + + public function testCanonicalKeepsRequiredNameQuotes(): void + { + // Period in display name requires quoting (it's not atext). + $typed = Parse::getInstance()->parseSingle('"John Q. Public" '); + $this->assertSame('"John Q. Public" ', $typed->canonical()); + } + + public function testCanonicalQuotesLocalPartWhenRequired(): void + { + // Local-part with a space must be quoted per RFC 5322 §3.2.4. 
+ $typed = Parse::getInstance()->parseSingle('"with space"@example.com'); + $this->assertSame('"with space"@example.com', $typed->canonical()); + } + + public function testCanonicalReturnsEmptyForInvalidAddress(): void + { + $typed = Parse::getInstance()->parseSingle('not-an-email'); + $this->assertSame('', $typed->canonical()); + } + + public function testParseResultToArrayRoundTripsLegacyShape(): void + { + $parser = new Parse(); + $legacy = $parser->parse('a@a.com, b@b.com', true); + $typed = $parser->parseMultiple('a@a.com, b@b.com'); + + $this->assertSame($legacy, $typed->toArray()); + } + + public function testParseResultToJsonProducesParseableJson(): void + { + $typed = Parse::getInstance()->parseMultiple('a@a.com, b@b.com'); + $decoded = json_decode($typed->toJson(), true); + $this->assertTrue($decoded['success']); + $this->assertCount(2, $decoded['email_addresses']); + $this->assertSame('a', $decoded['email_addresses'][0]['local_part']); + } + + public function testLocalPartNormalizerRewritesLocalPart(): void + { + // Gmail-style: strip dots and +tags from the local-part for gmail.com. + $gmailNormalizer = function (string $local, string $domain): string { + if ($domain !== 'gmail.com') { + return $local; + } + $local = str_replace('.', '', $local); + $plus = strpos($local, '+'); + + return $plus === false ? $local : substr($local, 0, $plus); + }; + + $opts = ParseOptions::rfc5322()->withLocalPartNormalizer($gmailNormalizer); + $result = (new Parse(null, $opts))->parseSingle('john.doe+spam@gmail.com'); + + $this->assertFalse($result->invalid); + $this->assertSame('johndoe', $result->localPartParsed); + $this->assertSame('johndoe@gmail.com', $result->simpleAddress); + // original_address retains the verbatim input for audit. + $this->assertSame('john.doe+spam@gmail.com', $result->originalAddress); + } + + public function testLocalPartNormalizerSkipsOtherDomains(): void + { + // The normalizer is gmail-specific; other domains pass through. 
+ $normalizer = fn (string $local, string $domain) => $domain === 'gmail.com' + ? str_replace('.', '', $local) + : $local; + + $opts = ParseOptions::rfc5322()->withLocalPartNormalizer($normalizer); + $result = (new Parse(null, $opts))->parseSingle('j.doe@example.com'); + + $this->assertSame('j.doe', $result->localPartParsed); + } + + public function testLocalPartNormalizerNotInvokedOnInvalidAddress(): void + { + // Invalid address short-circuits validateLocalPart; the normalizer + // must not run on unvalidated input. + $invocations = 0; + $normalizer = function (string $local, string $domain) use (&$invocations): string { + ++$invocations; + + return $local; + }; + + $opts = ParseOptions::rfc5322()->withLocalPartNormalizer($normalizer); + (new Parse(null, $opts))->parseSingle('not-an-email'); + + $this->assertSame(0, $invocations); + } + + public function testLocalPartNormalizerCanBeClearedByPassingNull(): void + { + $normalizer = fn (string $l) => strtolower($l); + $a = ParseOptions::rfc5322()->withLocalPartNormalizer($normalizer); + $b = $a->withLocalPartNormalizer(null); + + $this->assertNotNull($a->localPartNormalizer); + $this->assertNull($b->localPartNormalizer); + } }