Skip to content

Commit 741e712

Browse files
authored
refactor: simplify encoding mode detection logic (#201)
* Simplify encoding mode detection logic
  Refactor to simplify checks for numeric and alphanumeric content.
* Fix empty string encoding regression: use Mode::BYTE()
  Fixes a regression introduced in the previous refactor where an empty string ('') was incorrectly encoded as Mode::ALPHANUMERIC() instead of Mode::BYTE(). While Mode::ALPHANUMERIC() is technically valid for an empty string, Mode::BYTE() is a more appropriate choice—it reflects general-purpose encoding and avoids the limitations of the alphanumeric character set. This change also improves clarity and helps prevent similar regressions in the future. Similarly, it ensures correct behavior for Encoder::chooseMode('', 'SHIFT-JIS').
* Add unit test for Encoder::chooseMode with empty string and Shift-JIS encoding
* Minor fixes and clarifications
  getAlphanumericCode():
  - rename parameter from $code to $byte for clarity and consistency with docblock
  - simplify using null coalescing (also reduces lookups)
  isOnlyDoubleByteKanji():
  - update docblock to note empty string returns true (which is important to consider in the caller)
* Extract alphanumeric check into a dedicated method
* Optimize isOnlyAlphanumeric() using strspn-based technique
  Implemented a technique that takes advantage of strspn() to optimize isOnlyAlphanumeric(). This approach is much, much faster (hundreds or even thousands of times) and highly scalable, appearing to run in close to O(1).
* Use class const for alphanumeric characters
1 parent 50e742e commit 741e712

2 files changed

Lines changed: 27 additions & 25 deletions

File tree

src/Encoder/Encoder.php

Lines changed: 23 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@ final class Encoder
2525
/** @deprecated use DEFAULT_BYTE_MODE_ENCODING */
2626
public const DEFAULT_BYTE_MODE_ECODING = self::DEFAULT_BYTE_MODE_ENCODING;
2727

28+
/**
29+
* Allowed characters for the Alphanumeric Mode.
30+
*/
31+
private const ALPHANUMERIC_CHARS = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ $%*+-./:';
32+
2833
/**
2934
* The original table is defined in the table 5 of JISX0510:2004 (p.19).
3035
*/
@@ -148,44 +153,30 @@ public static function encode(
148153
/**
149154
* Gets the alphanumeric code for a byte.
150155
*/
151-
private static function getAlphanumericCode(int $code) : int
156+
private static function getAlphanumericCode(int $byte) : int
152157
{
153-
if (isset(self::ALPHANUMERIC_TABLE[$code])) {
154-
return self::ALPHANUMERIC_TABLE[$code];
155-
}
156-
157-
return -1;
158+
return self::ALPHANUMERIC_TABLE[$byte] ?? -1;
158159
}
159160

160161
/**
161162
* Chooses the best mode for a given content.
162163
*/
163164
private static function chooseMode(string $content, ?string $encoding = null) : Mode
164165
{
166+
if ('' === $content) {
167+
return Mode::BYTE();
168+
}
169+
165170
if (null !== $encoding && 0 === strcasecmp($encoding, 'SHIFT-JIS')) {
166171
return self::isOnlyDoubleByteKanji($content) ? Mode::KANJI() : Mode::BYTE();
167172
}
168173

169-
$hasNumeric = false;
170-
$hasAlphanumeric = false;
171-
$contentLength = strlen($content);
172-
173-
for ($i = 0; $i < $contentLength; ++$i) {
174-
$char = $content[$i];
175-
176-
if (ctype_digit($char)) {
177-
$hasNumeric = true;
178-
} elseif (-1 !== self::getAlphanumericCode(ord($char))) {
179-
$hasAlphanumeric = true;
180-
} else {
181-
return Mode::BYTE();
182-
}
174+
if (ctype_digit($content)) {
175+
return Mode::NUMERIC();
183176
}
184177

185-
if ($hasAlphanumeric) {
178+
if (self::isOnlyAlphanumeric($content)) {
186179
return Mode::ALPHANUMERIC();
187-
} elseif ($hasNumeric) {
188-
return Mode::NUMERIC();
189180
}
190181

191182
return Mode::BYTE();
@@ -205,7 +196,7 @@ private static function calculateMaskPenalty(ByteMatrix $matrix) : int
205196
}
206197

207198
/**
208-
* Checks if content only consists of double-byte kanji characters.
199+
* Checks if content only consists of double-byte kanji characters (or is empty).
209200
*/
210201
private static function isOnlyDoubleByteKanji(string $content) : bool
211202
{
@@ -232,6 +223,14 @@ private static function isOnlyDoubleByteKanji(string $content) : bool
232223
return true;
233224
}
234225

226+
/**
227+
* Checks if content only consists of alphanumeric characters (or is empty).
228+
*/
229+
private static function isOnlyAlphanumeric(string $content) : bool
230+
{
231+
return strlen($content) === strspn($content, self::ALPHANUMERIC_CHARS);
232+
}
233+
235234
/**
236235
* Chooses the best mask pattern for a matrix.
237236
*/

test/Encoder/EncoderTest.php

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,10 @@ public function testGetAlphanumericCode() : void
6363

6464
public function testChooseMode() : void
6565
{
66+
// Empty string
67+
$this->assertSame(Mode::BYTE(), $this->methods['chooseMode']->invoke(null, ''));
68+
$this->assertSame(Mode::BYTE(), $this->methods['chooseMode']->invoke(null, '', 'SHIFT-JIS'));
69+
6670
// Numeric mode
6771
$this->assertSame(Mode::NUMERIC(), $this->methods['chooseMode']->invoke(null, '0'));
6872
$this->assertSame(Mode::NUMERIC(), $this->methods['chooseMode']->invoke(null, '0123456789'));
@@ -77,7 +81,6 @@ public function testChooseMode() : void
7781
// 8-bit byte mode
7882
$this->assertSame(Mode::BYTE(), $this->methods['chooseMode']->invoke(null, 'a'));
7983
$this->assertSame(Mode::BYTE(), $this->methods['chooseMode']->invoke(null, '#'));
80-
$this->assertSame(Mode::BYTE(), $this->methods['chooseMode']->invoke(null, ''));
8184

8285
// AIUE in Hiragana in SHIFT-JIS
8386
$this->assertSame(Mode::BYTE(), $this->methods['chooseMode']->invoke(null, "\x8\xa\x8\xa\x8\xa\x8\xa6"));

0 commit comments

Comments
 (0)