From 379a11bed73a7e1cc43e82bf490aff5d2db12ce6 Mon Sep 17 00:00:00 2001 From: Ugochukwu Mmaduekwe Date: Sun, 26 Apr 2026 15:34:20 +0100 Subject: [PATCH] improve SIMD detection support --- HashLib.Tests/Delphi.Tests/HashLib.Tests.dpr | 3 +- .../FreePascal.Tests/HashLib.Tests.lpi | 6 +- HashLib.Tests/FreePascal.Tests/HashLib.lpr | 3 +- .../FreePascal.Tests/HashLibConsole.lpi | 6 +- .../FreePascal.Tests/HashLibConsole.lpr | 3 +- HashLib.Tests/src/SimdSelectSlotTests.pas | 253 ++++++++++++++++++ HashLib/src/Checksum/HlpAdler32Dispatch.pas | 4 +- HashLib/src/Checksum/HlpCRCDispatch.pas | 12 +- HashLib/src/Crypto/HlpBlake2BDispatch.pas | 8 +- HashLib/src/Crypto/HlpBlake2SDispatch.pas | 8 +- HashLib/src/Crypto/HlpBlake3Dispatch.pas | 8 +- HashLib/src/Crypto/HlpSHA1Dispatch.pas | 4 +- HashLib/src/Crypto/HlpSHA2_256Dispatch.pas | 4 +- HashLib/src/Crypto/HlpSHA2_512Dispatch.pas | 4 +- HashLib/src/Crypto/HlpSHA3Dispatch.pas | 2 +- HashLib/src/Hash64/HlpXXHash3Dispatch.pas | 8 +- HashLib/src/Include/HashLib.inc | 19 +- HashLib/src/KDF/HlpArgon2Dispatch.pas | 8 +- HashLib/src/KDF/HlpScryptDispatch.pas | 8 +- HashLib/src/Utils/HlpArmSimdFeatures.pas | 45 ++++ HashLib/src/Utils/HlpSimdLevels.pas | 2 +- HashLib/src/Utils/HlpX86SimdFeatures.pas | 186 +++++++++++-- 22 files changed, 525 insertions(+), 79 deletions(-) create mode 100644 HashLib.Tests/src/SimdSelectSlotTests.pas diff --git a/HashLib.Tests/Delphi.Tests/HashLib.Tests.dpr b/HashLib.Tests/Delphi.Tests/HashLib.Tests.dpr index c104b6f..6204095 100644 --- a/HashLib.Tests/Delphi.Tests/HashLib.Tests.dpr +++ b/HashLib.Tests/Delphi.Tests/HashLib.Tests.dpr @@ -162,7 +162,8 @@ uses PBKDF2_HMACTests in '..\src\PBKDF2_HMACTests.pas', PBKDF_Argon2Tests in '..\src\PBKDF_Argon2Tests.pas', PBKDF_ScryptTests in '..\src\PBKDF_ScryptTests.pas', - CRCTests in '..\src\CRCTests.pas'; + CRCTests in '..\src\CRCTests.pas', + SimdSelectSlotTests in '..\src\SimdSelectSlotTests.pas'; begin diff --git a/HashLib.Tests/FreePascal.Tests/HashLib.Tests.lpi b/HashLib.Tests/FreePascal.Tests/HashLib.Tests.lpi index f7de0fc..8b69156 100644 --- a/HashLib.Tests/FreePascal.Tests/HashLib.Tests.lpi +++ b/HashLib.Tests/FreePascal.Tests/HashLib.Tests.lpi @@ -38,7 +38,7 @@ - + @@ -96,6 +96,10 @@ + + + + diff --git a/HashLib.Tests/FreePascal.Tests/HashLib.lpr b/HashLib.Tests/FreePascal.Tests/HashLib.lpr index 8a1b5fd..727f5ca 100644 --- a/HashLib.Tests/FreePascal.Tests/HashLib.lpr +++ b/HashLib.Tests/FreePascal.Tests/HashLib.lpr @@ -18,7 +18,8 @@ PBKDF2_HMACTests, PBKDF_Argon2Tests, PBKDF_ScryptTests, - CRCTests; + CRCTests, + SimdSelectSlotTests; {$R *.res} diff --git a/HashLib.Tests/FreePascal.Tests/HashLibConsole.lpi b/HashLib.Tests/FreePascal.Tests/HashLibConsole.lpi index 9458e47..ce1b682 100644 --- a/HashLib.Tests/FreePascal.Tests/HashLibConsole.lpi +++ b/HashLib.Tests/FreePascal.Tests/HashLibConsole.lpi @@ -62,7 +62,7 @@ - + @@ -120,6 +120,10 @@ + + + + diff --git a/HashLib.Tests/FreePascal.Tests/HashLibConsole.lpr b/HashLib.Tests/FreePascal.Tests/HashLibConsole.lpr index 9877669..42f4f4c 100644 --- a/HashLib.Tests/FreePascal.Tests/HashLibConsole.lpr +++ b/HashLib.Tests/FreePascal.Tests/HashLibConsole.lpr @@ -17,7 +17,8 @@ PBKDF2_HMACTests, PBKDF_Argon2Tests, PBKDF_ScryptTests, - CRCTests; + CRCTests, + SimdSelectSlotTests; type diff --git a/HashLib.Tests/src/SimdSelectSlotTests.pas b/HashLib.Tests/src/SimdSelectSlotTests.pas new file mode 100644 index 0000000..a536bcd --- /dev/null +++ b/HashLib.Tests/src/SimdSelectSlotTests.pas @@ -0,0 +1,253 @@ +unit SimdSelectSlotTests; + +interface + +uses + SysUtils, +{$IFDEF FPC} + fpcunit, + testregistry, +{$ELSE} + TestFramework, +{$ENDIF FPC} + HlpSimdLevels, + HlpX86SimdFeatures, + HlpArmSimdFeatures; + +type + + THashLibTestCase = class abstract(TTestCase) + + end; + +type + + // Exercises the pure overload of TX86SimdFeatures.SelectSlot, which + // takes the active level as a parameter and is therefore fully + // deterministic and host-CPU-independent. + TTestX86SelectSlot = class(THashLibTestCase) + published + procedure TestExactMatch; + procedure TestStepDownOnUnsupportedTier; + procedure TestAllDeclaredTiersAboveActive; + procedure TestEmptyTiers; + procedure TestTierOrderIndependence; + procedure TestScalarHost; + procedure TestScalarTierAlwaysReachable; + end; + +type + + // Symmetric coverage for the ARM surface, since the same SelectSlot + // shape lives on TArmSimdFeatures. + TTestArmSelectSlot = class(THashLibTestCase) + published + procedure TestExactMatch; + procedure TestStepDownOnUnsupportedTier; + procedure TestAllDeclaredTiersAboveActive; + procedure TestEmptyTiers; + procedure TestTierOrderIndependence; + procedure TestScalarHost; + end; + +implementation + +function X86LevelName(ALevel: TX86SimdLevel): string; +begin + case ALevel of + TX86SimdLevel.Scalar: Result := 'Scalar'; + TX86SimdLevel.SSE2: Result := 'SSE2'; + TX86SimdLevel.SSE3: Result := 'SSE3'; + TX86SimdLevel.SSSE3: Result := 'SSSE3'; + TX86SimdLevel.SSE41: Result := 'SSE41'; + TX86SimdLevel.SSE42: Result := 'SSE42'; + TX86SimdLevel.AVX2: Result := 'AVX2'; + else + Result := 'Unknown'; + end; +end; + +function ArmLevelName(ALevel: TArmSimdLevel): string; +begin + case ALevel of + TArmSimdLevel.Scalar: Result := 'Scalar'; + TArmSimdLevel.NEON: Result := 'NEON'; + TArmSimdLevel.SVE: Result := 'SVE'; + TArmSimdLevel.SVE2: Result := 'SVE2'; + else + Result := 'Unknown'; + end; +end; + +{ TTestX86SelectSlot } + +procedure TTestX86SelectSlot.TestExactMatch; +var + LResult: TX86SimdLevel; +begin + // Active host advertises AVX2; AVX2 is declared, so it should win. + LResult := TX86SimdFeatures.SelectSlot(TX86SimdLevel.AVX2, + [TX86SimdLevel.AVX2, TX86SimdLevel.SSE2]); + CheckTrue(LResult = TX86SimdLevel.AVX2, + Format('Expected AVX2 but got %s.', [X86LevelName(LResult)])); +end; + +procedure TTestX86SelectSlot.TestStepDownOnUnsupportedTier; +var + LResult: TX86SimdLevel; +begin + // Host advertises SSE41 (probed but not declared by the algorithm). + // The algorithm declares only AVX2 and SSE2; SelectSlot must step + // down to SSE2 (the central future-proofing claim). + LResult := TX86SimdFeatures.SelectSlot(TX86SimdLevel.SSE41, + [TX86SimdLevel.AVX2, TX86SimdLevel.SSE2]); + CheckTrue(LResult = TX86SimdLevel.SSE2, + Format('Expected SSE2 but got %s.', [X86LevelName(LResult)])); +end; + +procedure TTestX86SelectSlot.TestAllDeclaredTiersAboveActive; +var + LResult: TX86SimdLevel; +begin + // Host caps at SSE2; algorithm declares only AVX2. No tier matches, + // so SelectSlot must fall back to Scalar. + LResult := TX86SimdFeatures.SelectSlot(TX86SimdLevel.SSE2, + [TX86SimdLevel.AVX2]); + CheckTrue(LResult = TX86SimdLevel.Scalar, + Format('Expected Scalar but got %s.', [X86LevelName(LResult)])); +end; + +procedure TTestX86SelectSlot.TestEmptyTiers; +var + LResult: TX86SimdLevel; + LEmpty: array of TX86SimdLevel; +begin + // An algorithm with no SIMD impls passes an empty tier array. + // SelectSlot must fall back to Scalar regardless of host capability. + System.SetLength(LEmpty, 0); + LResult := TX86SimdFeatures.SelectSlot(TX86SimdLevel.AVX2, LEmpty); + CheckTrue(LResult = TX86SimdLevel.Scalar, + Format('Expected Scalar but got %s.', [X86LevelName(LResult)])); +end; + +procedure TTestX86SelectSlot.TestTierOrderIndependence; +var + LDescending, LAscending: TX86SimdLevel; +begin + // SelectSlot reasons over the set of declared tiers, not their order. + // Both orderings must yield the same result for the same host level. + LDescending := TX86SimdFeatures.SelectSlot(TX86SimdLevel.SSE42, + [TX86SimdLevel.AVX2, TX86SimdLevel.SSE2]); + LAscending := TX86SimdFeatures.SelectSlot(TX86SimdLevel.SSE42, + [TX86SimdLevel.SSE2, TX86SimdLevel.AVX2]); + CheckTrue(LDescending = LAscending, + Format('Order-dependent result: descending=%s ascending=%s.', + [X86LevelName(LDescending), X86LevelName(LAscending)])); + CheckTrue(LDescending = TX86SimdLevel.SSE2, + Format('Expected SSE2 but got %s.', [X86LevelName(LDescending)])); +end; + +procedure TTestX86SelectSlot.TestScalarHost; +var + LResult: TX86SimdLevel; +begin + // A host that probed Scalar must never select any SIMD tier. + LResult := TX86SimdFeatures.SelectSlot(TX86SimdLevel.Scalar, + [TX86SimdLevel.AVX2, TX86SimdLevel.SSE2]); + CheckTrue(LResult = TX86SimdLevel.Scalar, + Format('Expected Scalar but got %s.', [X86LevelName(LResult)])); +end; + +procedure TTestX86SelectSlot.TestScalarTierAlwaysReachable; +var + LResult: TX86SimdLevel; +begin + // If the algorithm explicitly declares Scalar as a tier, that is + // always reachable - even on a Scalar host. + LResult := TX86SimdFeatures.SelectSlot(TX86SimdLevel.Scalar, + [TX86SimdLevel.Scalar]); + CheckTrue(LResult = TX86SimdLevel.Scalar, + Format('Expected Scalar but got %s.', [X86LevelName(LResult)])); +end; + +{ TTestArmSelectSlot } + +procedure TTestArmSelectSlot.TestExactMatch; +var + LResult: TArmSimdLevel; +begin + LResult := TArmSimdFeatures.SelectSlot(TArmSimdLevel.SVE2, + [TArmSimdLevel.SVE2, TArmSimdLevel.NEON]); + CheckTrue(LResult = TArmSimdLevel.SVE2, + Format('Expected SVE2 but got %s.', [ArmLevelName(LResult)])); +end; + +procedure TTestArmSelectSlot.TestStepDownOnUnsupportedTier; +var + LResult: TArmSimdLevel; +begin + // Host advertises SVE; algorithm offers SVE2 + NEON only. + // Must step down to NEON (highest reachable declared tier). + LResult := TArmSimdFeatures.SelectSlot(TArmSimdLevel.SVE, + [TArmSimdLevel.SVE2, TArmSimdLevel.NEON]); + CheckTrue(LResult = TArmSimdLevel.NEON, + Format('Expected NEON but got %s.', [ArmLevelName(LResult)])); +end; + +procedure TTestArmSelectSlot.TestAllDeclaredTiersAboveActive; +var + LResult: TArmSimdLevel; +begin + LResult := TArmSimdFeatures.SelectSlot(TArmSimdLevel.NEON, + [TArmSimdLevel.SVE2]); + CheckTrue(LResult = TArmSimdLevel.Scalar, + Format('Expected Scalar but got %s.', [ArmLevelName(LResult)])); +end; + +procedure TTestArmSelectSlot.TestEmptyTiers; +var + LResult: TArmSimdLevel; + LEmpty: array of TArmSimdLevel; +begin + System.SetLength(LEmpty, 0); + LResult := TArmSimdFeatures.SelectSlot(TArmSimdLevel.SVE2, LEmpty); + CheckTrue(LResult = TArmSimdLevel.Scalar, + Format('Expected Scalar but got %s.', [ArmLevelName(LResult)])); +end; + +procedure TTestArmSelectSlot.TestTierOrderIndependence; +var + LDescending, LAscending: TArmSimdLevel; +begin + LDescending := TArmSimdFeatures.SelectSlot(TArmSimdLevel.SVE, + [TArmSimdLevel.SVE2, TArmSimdLevel.NEON]); + LAscending := TArmSimdFeatures.SelectSlot(TArmSimdLevel.SVE, + [TArmSimdLevel.NEON, TArmSimdLevel.SVE2]); + CheckTrue(LDescending = LAscending, + Format('Order-dependent result: descending=%s ascending=%s.', + [ArmLevelName(LDescending), ArmLevelName(LAscending)])); + CheckTrue(LDescending = TArmSimdLevel.NEON, + Format('Expected NEON but got %s.', [ArmLevelName(LDescending)])); +end; + +procedure TTestArmSelectSlot.TestScalarHost; +var + LResult: TArmSimdLevel; +begin + LResult := TArmSimdFeatures.SelectSlot(TArmSimdLevel.Scalar, + [TArmSimdLevel.SVE2, TArmSimdLevel.NEON]); + CheckTrue(LResult = TArmSimdLevel.Scalar, + Format('Expected Scalar but got %s.', [ArmLevelName(LResult)])); +end; + +initialization + +{$IFDEF FPC} + RegisterTest(TTestX86SelectSlot); + RegisterTest(TTestArmSelectSlot); +{$ELSE} + RegisterTest(TTestX86SelectSlot.Suite); + RegisterTest(TTestArmSelectSlot.Suite); +{$ENDIF FPC} + +end. diff --git a/HashLib/src/Checksum/HlpAdler32Dispatch.pas b/HashLib/src/Checksum/HlpAdler32Dispatch.pas index 532082d..ddcc300 100644 --- a/HashLib/src/Checksum/HlpAdler32Dispatch.pas +++ b/HashLib/src/Checksum/HlpAdler32Dispatch.pas @@ -189,7 +189,7 @@ procedure InitDispatch(); begin Adler32_Update := @Adler32_Update_Scalar; {$IFDEF HASHLIB_I386_ASM} - case TCpuFeatures.X86.GetActiveSimdLevel() of + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSSE3, TX86SimdLevel.SSE2]) of TX86SimdLevel.SSSE3: begin Adler32_Update := @Adler32_Update_Ssse3; @@ -201,7 +201,7 @@ procedure InitDispatch(); end; {$ENDIF} {$IFDEF HASHLIB_X86_64_ASM} - case TCpuFeatures.X86.GetActiveSimdLevel() of + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2, TX86SimdLevel.SSSE3, TX86SimdLevel.SSE2]) of TX86SimdLevel.AVX2: begin Adler32_Update := @Adler32_Update_Avx2; diff --git a/HashLib/src/Checksum/HlpCRCDispatch.pas b/HashLib/src/Checksum/HlpCRCDispatch.pas index 93d31ac..898ccd3 100644 --- a/HashLib/src/Checksum/HlpCRCDispatch.pas +++ b/HashLib/src/Checksum/HlpCRCDispatch.pas @@ -514,18 +514,10 @@ procedure InitDispatch(); {$ENDIF HASHLIB_X86_64_ASM} {$IFDEF HASHLIB_X86_SIMD} - {$IFDEF HASHLIB_I386_ASM} - case TCpuFeatures.X86.GetActiveSimdLevel() of - TX86SimdLevel.SSSE3, TX86SimdLevel.SSE2: + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSE2]) of + TX86SimdLevel.SSE2: BindSse2CrcFold; end; - {$ENDIF HASHLIB_I386_ASM} - {$IFDEF HASHLIB_X86_64_ASM} - case TCpuFeatures.X86.GetActiveSimdLevel() of - TX86SimdLevel.AVX2, TX86SimdLevel.SSSE3, TX86SimdLevel.SSE2: - BindSse2CrcFold; - end; - {$ENDIF HASHLIB_X86_64_ASM} {$ENDIF HASHLIB_X86_SIMD} end; diff --git a/HashLib/src/Crypto/HlpBlake2BDispatch.pas b/HashLib/src/Crypto/HlpBlake2BDispatch.pas index d453b2c..79627aa 100644 --- a/HashLib/src/Crypto/HlpBlake2BDispatch.pas +++ b/HashLib/src/Crypto/HlpBlake2BDispatch.pas @@ -132,20 +132,20 @@ procedure InitDispatch(); begin Blake2B_Compress := @Blake2B_Compress_Scalar; {$IFDEF HASHLIB_I386_ASM} - case TCpuFeatures.X86.GetActiveSimdLevel() of - TX86SimdLevel.SSE2, TX86SimdLevel.SSSE3: + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSE2]) of + TX86SimdLevel.SSE2: begin Blake2B_Compress := @Blake2B_Compress_Sse2; end; end; {$ENDIF} {$IFDEF HASHLIB_X86_64_ASM} - case TCpuFeatures.X86.GetActiveSimdLevel() of + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2, TX86SimdLevel.SSE2]) of TX86SimdLevel.AVX2: begin Blake2B_Compress := @Blake2B_Compress_Avx2; end; - TX86SimdLevel.SSE2, TX86SimdLevel.SSSE3: + TX86SimdLevel.SSE2: begin Blake2B_Compress := @Blake2B_Compress_Sse2; end; diff --git a/HashLib/src/Crypto/HlpBlake2SDispatch.pas b/HashLib/src/Crypto/HlpBlake2SDispatch.pas index 469b785..3b7d075 100644 --- a/HashLib/src/Crypto/HlpBlake2SDispatch.pas +++ b/HashLib/src/Crypto/HlpBlake2SDispatch.pas @@ -130,20 +130,20 @@ procedure InitDispatch(); begin Blake2S_Compress := @Blake2S_Compress_Scalar; {$IFDEF HASHLIB_I386_ASM} - case TCpuFeatures.X86.GetActiveSimdLevel() of - TX86SimdLevel.SSE2, TX86SimdLevel.SSSE3: + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSE2]) of + TX86SimdLevel.SSE2: begin Blake2S_Compress := @Blake2S_Compress_Sse2; end; end; {$ENDIF} {$IFDEF HASHLIB_X86_64_ASM} - case TCpuFeatures.X86.GetActiveSimdLevel() of + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2, TX86SimdLevel.SSE2]) of TX86SimdLevel.AVX2: begin Blake2S_Compress := @Blake2S_Compress_Avx2; end; - TX86SimdLevel.SSE2, TX86SimdLevel.SSSE3: + TX86SimdLevel.SSE2: begin Blake2S_Compress := @Blake2S_Compress_Sse2; end; diff --git a/HashLib/src/Crypto/HlpBlake3Dispatch.pas b/HashLib/src/Crypto/HlpBlake3Dispatch.pas index 9bd9316..9f9c732 100644 --- a/HashLib/src/Crypto/HlpBlake3Dispatch.pas +++ b/HashLib/src/Crypto/HlpBlake3Dispatch.pas @@ -713,8 +713,8 @@ procedure InitDispatch(); Blake3_HashMany := @Blake3_HashMany_Scalar; Blake3_ParallelDegree := 1; {$IFDEF HASHLIB_I386_ASM} - case TCpuFeatures.X86.GetActiveSimdLevel() of - TX86SimdLevel.SSE2, TX86SimdLevel.SSSE3: + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSE2]) of + TX86SimdLevel.SSE2: begin Blake3_Compress := @Blake3_Compress_Sse2; Blake3_HashMany := @Blake3_HashMany_Sse2; @@ -723,14 +723,14 @@ procedure InitDispatch(); end; {$ENDIF} {$IFDEF HASHLIB_X86_64_ASM} - case TCpuFeatures.X86.GetActiveSimdLevel() of + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2, TX86SimdLevel.SSE2]) of TX86SimdLevel.AVX2: begin Blake3_Compress := @Blake3_Compress_Avx2; Blake3_HashMany := @Blake3_HashMany_Avx2; Blake3_ParallelDegree := 8; end; - TX86SimdLevel.SSE2, TX86SimdLevel.SSSE3: + TX86SimdLevel.SSE2: begin Blake3_Compress := @Blake3_Compress_Sse2; Blake3_HashMany := @Blake3_HashMany_Sse2; diff --git a/HashLib/src/Crypto/HlpSHA1Dispatch.pas b/HashLib/src/Crypto/HlpSHA1Dispatch.pas index 05d9e7f..4555c42 100644 --- a/HashLib/src/Crypto/HlpSHA1Dispatch.pas +++ b/HashLib/src/Crypto/HlpSHA1Dispatch.pas @@ -176,7 +176,7 @@ procedure InitDispatch(); begin SHA1_Compress := @SHA1_Compress_Scalar; {$IFDEF HASHLIB_I386_ASM} - case TCpuFeatures.X86.GetActiveSimdLevel() of + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSSE3, TX86SimdLevel.SSE2]) of TX86SimdLevel.SSSE3: begin SHA1_Compress := @SHA1_Compress_Ssse3_Wrap; @@ -193,7 +193,7 @@ procedure InitDispatch(); SHA1_Compress := @SHA1_Compress_ShaNi_Wrap; Exit; end; - case TCpuFeatures.X86.GetActiveSimdLevel() of + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2, TX86SimdLevel.SSSE3, TX86SimdLevel.SSE2]) of TX86SimdLevel.AVX2: begin SHA1_Compress := @SHA1_Compress_Avx2_Wrap; diff --git a/HashLib/src/Crypto/HlpSHA2_256Dispatch.pas b/HashLib/src/Crypto/HlpSHA2_256Dispatch.pas index d49654a..f36fe0a 100644 --- a/HashLib/src/Crypto/HlpSHA2_256Dispatch.pas +++ b/HashLib/src/Crypto/HlpSHA2_256Dispatch.pas @@ -186,7 +186,7 @@ procedure InitDispatch(); begin SHA256_Compress := @SHA256_Compress_Scalar; {$IFDEF HASHLIB_I386_ASM} - case TCpuFeatures.X86.GetActiveSimdLevel() of + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSSE3, TX86SimdLevel.SSE2]) of TX86SimdLevel.SSSE3: begin SHA256_Compress := @SHA256_Compress_Ssse3_Wrap; @@ -203,7 +203,7 @@ procedure InitDispatch(); SHA256_Compress := @SHA256_Compress_ShaNi_Wrap; Exit; end; - case TCpuFeatures.X86.GetActiveSimdLevel() of + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2, TX86SimdLevel.SSSE3, TX86SimdLevel.SSE2]) of TX86SimdLevel.AVX2: begin SHA256_Compress := @SHA256_Compress_Avx2_Wrap; diff --git a/HashLib/src/Crypto/HlpSHA2_512Dispatch.pas b/HashLib/src/Crypto/HlpSHA2_512Dispatch.pas index fc5044e..6483d3a 100644 --- a/HashLib/src/Crypto/HlpSHA2_512Dispatch.pas +++ b/HashLib/src/Crypto/HlpSHA2_512Dispatch.pas @@ -195,7 +195,7 @@ procedure InitDispatch(); begin SHA512_Compress := @SHA512_Compress_Scalar; {$IFDEF HASHLIB_I386_ASM} - case TCpuFeatures.X86.GetActiveSimdLevel() of + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSSE3, TX86SimdLevel.SSE2]) of TX86SimdLevel.SSSE3: begin SHA512_Compress := @SHA512_Compress_Ssse3_Wrap; @@ -207,7 +207,7 @@ procedure InitDispatch(); end; {$ENDIF} {$IFDEF HASHLIB_X86_64_ASM} - case TCpuFeatures.X86.GetActiveSimdLevel() of + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2, TX86SimdLevel.SSSE3, TX86SimdLevel.SSE2]) of TX86SimdLevel.AVX2: begin SHA512_Compress := @SHA512_Compress_Avx2_Wrap; diff --git a/HashLib/src/Crypto/HlpSHA3Dispatch.pas b/HashLib/src/Crypto/HlpSHA3Dispatch.pas index 4ba7657..d8a79b2 100644 --- a/HashLib/src/Crypto/HlpSHA3Dispatch.pas +++ b/HashLib/src/Crypto/HlpSHA3Dispatch.pas @@ -497,7 +497,7 @@ procedure InitDispatch(); KeccakF1600_Permute := @KeccakF1600_Scalar; KeccakF1600_Absorb := @KeccakF1600_Absorb_Scalar; {$IFDEF HASHLIB_X86_64_ASM} - case TCpuFeatures.X86.GetActiveSimdLevel() of + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2]) of TX86SimdLevel.AVX2: begin KeccakF1600_Permute := @KeccakF1600_Avx2_Wrap; diff --git a/HashLib/src/Hash64/HlpXXHash3Dispatch.pas b/HashLib/src/Hash64/HlpXXHash3Dispatch.pas index 79f8ace..30fd7a3 100644 --- a/HashLib/src/Hash64/HlpXXHash3Dispatch.pas +++ b/HashLib/src/Hash64/HlpXXHash3Dispatch.pas @@ -214,8 +214,8 @@ procedure InitDispatch(); XXH3_ScrambleAcc := @XXH3_ScrambleAcc_Scalar; XXH3_InitSecret := @XXH3_InitSecret_Scalar; {$IFDEF HASHLIB_I386_ASM} - case TCpuFeatures.X86.GetActiveSimdLevel() of - TX86SimdLevel.SSE2, TX86SimdLevel.SSSE3: + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSE2]) of + TX86SimdLevel.SSE2: begin XXH3_Accumulate512 := @XXH3_Accumulate512_Sse2; XXH3_Accumulate := @XXH3_Accumulate_Sse2; @@ -225,7 +225,7 @@ procedure InitDispatch(); end; {$ENDIF} {$IFDEF HASHLIB_X86_64_ASM} - case TCpuFeatures.X86.GetActiveSimdLevel() of + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2, TX86SimdLevel.SSE2]) of TX86SimdLevel.AVX2: begin XXH3_Accumulate512 := @XXH3_Accumulate512_Avx2; @@ -233,7 +233,7 @@ procedure InitDispatch(); XXH3_ScrambleAcc := @XXH3_ScrambleAcc_Avx2; XXH3_InitSecret := @XXH3_InitSecret_Avx2; end; - TX86SimdLevel.SSE2, TX86SimdLevel.SSSE3: + TX86SimdLevel.SSE2: begin XXH3_Accumulate512 := @XXH3_Accumulate512_Sse2; XXH3_Accumulate := @XXH3_Accumulate_Sse2; diff --git a/HashLib/src/Include/HashLib.inc b/HashLib/src/Include/HashLib.inc index 4383872..6c47f0b 100644 --- a/HashLib/src/Include/HashLib.inc +++ b/HashLib/src/Include/HashLib.inc @@ -132,20 +132,25 @@ {$IFEND} {$IFDEF HASHLIB_X86_SIMD} -// Uncomment ONE to force a specific X86 SIMD dispatch level: +// Uncomment at most ONE to force a specific X86 SIMD dispatch level: // {$DEFINE HASHLIB_FORCE_SSE2} +// {$DEFINE HASHLIB_FORCE_SSE3} // {$DEFINE HASHLIB_FORCE_SSSE3} - -{$IF (DEFINED(HASHLIB_FORCE_SCALAR) AND DEFINED(HASHLIB_FORCE_SSE2)) - OR (DEFINED(HASHLIB_FORCE_SCALAR) AND DEFINED(HASHLIB_FORCE_SSSE3)) - OR (DEFINED(HASHLIB_FORCE_SSE2) AND DEFINED(HASHLIB_FORCE_SSSE3))} - {$MESSAGE ERROR 'Only one HASHLIB_FORCE_* define may be enabled at a time.'} +// {$DEFINE HASHLIB_FORCE_SSE41} +// {$DEFINE HASHLIB_FORCE_SSE42} + +{$IF (DEFINED(HASHLIB_FORCE_SCALAR) AND (DEFINED(HASHLIB_FORCE_SSE2) OR DEFINED(HASHLIB_FORCE_SSE3) OR DEFINED(HASHLIB_FORCE_SSSE3) OR DEFINED(HASHLIB_FORCE_SSE41) OR DEFINED(HASHLIB_FORCE_SSE42))) + OR (DEFINED(HASHLIB_FORCE_SSE2) AND (DEFINED(HASHLIB_FORCE_SSE3) OR DEFINED(HASHLIB_FORCE_SSSE3) OR DEFINED(HASHLIB_FORCE_SSE41) OR DEFINED(HASHLIB_FORCE_SSE42))) + OR (DEFINED(HASHLIB_FORCE_SSE3) AND (DEFINED(HASHLIB_FORCE_SSSE3) OR DEFINED(HASHLIB_FORCE_SSE41) OR DEFINED(HASHLIB_FORCE_SSE42))) + OR (DEFINED(HASHLIB_FORCE_SSSE3) AND (DEFINED(HASHLIB_FORCE_SSE41) OR DEFINED(HASHLIB_FORCE_SSE42))) + OR (DEFINED(HASHLIB_FORCE_SSE41) AND DEFINED(HASHLIB_FORCE_SSE42))} + {$MESSAGE ERROR 'Only one HASHLIB_FORCE_* (x86 level) define may be enabled at a time.'} {$IFEND} {$ENDIF} {$IFDEF HASHLIB_ARM_SIMD} -// Uncomment ONE to force a specific Arm SIMD dispatch level: +// Uncomment at most ONE to force a specific Arm SIMD dispatch level: // {$DEFINE HASHLIB_FORCE_NEON} // {$DEFINE HASHLIB_FORCE_SVE} diff --git a/HashLib/src/KDF/HlpArgon2Dispatch.pas b/HashLib/src/KDF/HlpArgon2Dispatch.pas index 5722c1b..1e92c70 100644 --- a/HashLib/src/KDF/HlpArgon2Dispatch.pas +++ b/HashLib/src/KDF/HlpArgon2Dispatch.pas @@ -137,20 +137,20 @@ procedure InitDispatch(); begin Argon2_FillBlock := @Argon2_FillBlock_Scalar; {$IFDEF HASHLIB_I386_ASM} - case TCpuFeatures.X86.GetActiveSimdLevel() of - TX86SimdLevel.SSE2, TX86SimdLevel.SSSE3: + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSE2]) of + TX86SimdLevel.SSE2: begin Argon2_FillBlock := @Argon2_FillBlock_Sse2; end; end; {$ENDIF} {$IFDEF HASHLIB_X86_64_ASM} - case TCpuFeatures.X86.GetActiveSimdLevel() of + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2, TX86SimdLevel.SSE2]) of TX86SimdLevel.AVX2: begin Argon2_FillBlock := @Argon2_FillBlock_Avx2; end; - TX86SimdLevel.SSE2, TX86SimdLevel.SSSE3: + TX86SimdLevel.SSE2: begin Argon2_FillBlock := @Argon2_FillBlock_Sse2; end; diff --git a/HashLib/src/KDF/HlpScryptDispatch.pas b/HashLib/src/KDF/HlpScryptDispatch.pas index 8ee1ce1..0a3f2f5 100644 --- a/HashLib/src/KDF/HlpScryptDispatch.pas +++ b/HashLib/src/KDF/HlpScryptDispatch.pas @@ -202,20 +202,20 @@ procedure InitDispatch(); begin Scrypt_SalsaXor := @Scrypt_SalsaXor_Scalar; {$IFDEF HASHLIB_I386_ASM} - case TCpuFeatures.X86.GetActiveSimdLevel() of - TX86SimdLevel.SSE2, TX86SimdLevel.SSSE3: + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.SSE2]) of + TX86SimdLevel.SSE2: begin Scrypt_SalsaXor := @Scrypt_SalsaXor_Sse2; end; end; {$ENDIF} {$IFDEF HASHLIB_X86_64_ASM} - case TCpuFeatures.X86.GetActiveSimdLevel() of + case TCpuFeatures.X86.SelectSlot([TX86SimdLevel.AVX2, TX86SimdLevel.SSE2]) of TX86SimdLevel.AVX2: begin Scrypt_SalsaXor := @Scrypt_SalsaXor_Avx2; end; - TX86SimdLevel.SSE2, TX86SimdLevel.SSSE3: + TX86SimdLevel.SSE2: begin Scrypt_SalsaXor := @Scrypt_SalsaXor_Sse2; end; diff --git a/HashLib/src/Utils/HlpArmSimdFeatures.pas b/HashLib/src/Utils/HlpArmSimdFeatures.pas index 6f8f3d2..feaf592 100644 --- a/HashLib/src/Utils/HlpArmSimdFeatures.pas +++ b/HashLib/src/Utils/HlpArmSimdFeatures.pas @@ -55,6 +55,18 @@ TArmSimdFeatures = class sealed class function HasSHA3(): Boolean; static; class function HasCRC32(): Boolean; static; class function HasPMULL(): Boolean; static; + + // Picks the highest declared tier in ATiers that is <= the cached + // FActiveSimdLevel. Falls back to TArmSimdLevel.Scalar when no tier + // matches or ATiers is empty. Dispatch units use this overload. + class function SelectSlot(const ATiers: array of TArmSimdLevel) + : TArmSimdLevel; overload; static; + + // Pure overload: reasons over any caller-supplied active level. + // Used by tests to deterministically exercise fallback semantics + // without depending on the host CPU. + class function SelectSlot(AActiveLevel: TArmSimdLevel; + const ATiers: array of TArmSimdLevel): TArmSimdLevel; overload; static; end; implementation @@ -455,6 +467,39 @@ class function TArmSimdFeatures.HasPMULL(): Boolean; Result := FHasPMULL; end; +class function TArmSimdFeatures.SelectSlot(const ATiers + : array of TArmSimdLevel): TArmSimdLevel; +begin + Result := SelectSlot(FActiveSimdLevel, ATiers); +end; + +class function TArmSimdFeatures.SelectSlot(AActiveLevel: TArmSimdLevel; + const ATiers: array of TArmSimdLevel): TArmSimdLevel; +var + I: Integer; + LTier, LBest: TArmSimdLevel; + LFound: Boolean; +begin + // Walk all declared tiers, keep the highest one that is <= AActiveLevel. + // Order of ATiers is irrelevant. Empty ATiers or no matching tier yields + // TArmSimdLevel.Scalar so dispatch units cleanly fall through to scalar. + LBest := TArmSimdLevel.Scalar; + LFound := False; + for I := 0 to System.Length(ATiers) - 1 do + begin + LTier := ATiers[I]; + if (LTier <= AActiveLevel) and ((not LFound) or (LTier > LBest)) then + begin + LBest := LTier; + LFound := True; + end; + end; + if LFound then + Result := LBest + else + Result := TArmSimdLevel.Scalar; +end; + initialization TArmSimdFeatures.ProbeHardwareAndCache(); TArmSimdFeatures.ApplyBuildOverrides(); diff --git a/HashLib/src/Utils/HlpSimdLevels.pas b/HashLib/src/Utils/HlpSimdLevels.pas index 678e838..bb9b9bf 100644 --- a/HashLib/src/Utils/HlpSimdLevels.pas +++ b/HashLib/src/Utils/HlpSimdLevels.pas @@ -5,7 +5,7 @@ interface type - TX86SimdLevel = (Scalar, SSE2, SSSE3, AVX2); + TX86SimdLevel = (Scalar, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX2); TArmSimdLevel = (Scalar, NEON, SVE, SVE2); implementation diff --git a/HashLib/src/Utils/HlpX86SimdFeatures.pas b/HashLib/src/Utils/HlpX86SimdFeatures.pas index cbfcbce..fe72655 100644 --- a/HashLib/src/Utils/HlpX86SimdFeatures.pas +++ b/HashLib/src/Utils/HlpX86SimdFeatures.pas @@ -25,7 +25,10 @@ TCpuIdResult = record strict private class function CPUHasSSE2(): Boolean; static; + class function CPUHasSSE3(): Boolean; static; class function CPUHasSSSE3(): Boolean; static; + class function CPUHasSSE41(): Boolean; static; + class function CPUHasSSE42(): Boolean; static; class function CPUHasAVX2(): Boolean; static; class function CPUHasSHANI(): Boolean; static; class function CPUHasPCLMULQDQ(): Boolean; static; @@ -39,12 +42,27 @@ TCpuIdResult = record public class function GetActiveSimdLevel(): TX86SimdLevel; static; class function HasSSE2(): Boolean; static; + class function HasSSE3(): Boolean; static; class function HasSSSE3(): Boolean; static; + class function HasSSE41(): Boolean; static; + class function HasSSE42(): Boolean; static; class function HasAVX2(): Boolean; static; class function HasSHANI(): Boolean; static; class function HasPCLMULQDQ(): Boolean; static; class function HasVPCLMULQDQ(): Boolean; static; class function HasAESNI(): Boolean; static; + + // Picks the highest declared tier in ATiers that is <= the cached + // FActiveSimdLevel. Falls back to TX86SimdLevel.Scalar when no tier + // matches or ATiers is empty. Dispatch units use this overload. + class function SelectSlot(const ATiers: array of TX86SimdLevel) + : TX86SimdLevel; overload; static; + + // Pure overload: reasons over any caller-supplied active level. + // Used by tests to deterministically exercise fallback semantics + // without depending on the host CPU. + class function SelectSlot(AActiveLevel: TX86SimdLevel; + const ATiers: array of TX86SimdLevel): TX86SimdLevel; overload; static; end; implementation @@ -77,6 +95,21 @@ class function TX86SimdFeatures.CPUHasSSE2(): Boolean; {$ENDIF} end; +class function TX86SimdFeatures.CPUHasSSE3(): Boolean; +{$IFDEF HASHLIB_X86_SIMD} +var + LCpuId: TCpuIdResult; +{$ENDIF} +begin +{$IFDEF HASHLIB_X86_SIMD} + CpuIdQuery(1, 0, @LCpuId); + // SSE3: ECX bit 0 + Result := (LCpuId.RegECX and (1 shl 0)) <> 0; +{$ELSE} + Result := False; +{$ENDIF} +end; + class function TX86SimdFeatures.CPUHasSSSE3(): Boolean; {$IFDEF HASHLIB_X86_SIMD} var @@ -92,6 +125,36 @@ class function TX86SimdFeatures.CPUHasSSSE3(): Boolean; {$ENDIF} end; +class function TX86SimdFeatures.CPUHasSSE41(): Boolean; +{$IFDEF HASHLIB_X86_SIMD} +var + LCpuId: TCpuIdResult; +{$ENDIF} +begin +{$IFDEF HASHLIB_X86_SIMD} + CpuIdQuery(1, 0, @LCpuId); + // SSE4.1: ECX bit 19 + Result := (LCpuId.RegECX and (1 shl 19)) <> 0; +{$ELSE} + Result := False; +{$ENDIF} +end; + +class function TX86SimdFeatures.CPUHasSSE42(): Boolean; +{$IFDEF HASHLIB_X86_SIMD} +var + LCpuId: TCpuIdResult; +{$ENDIF} +begin +{$IFDEF HASHLIB_X86_SIMD} + CpuIdQuery(1, 0, @LCpuId); + // SSE4.2: ECX bit 20 + Result := (LCpuId.RegECX and (1 shl 20)) <> 0; +{$ELSE} + Result := False; +{$ENDIF} +end; + class function TX86SimdFeatures.CPUHasAVX2(): Boolean; {$IFDEF HASHLIB_X86_SIMD} var @@ -181,30 +244,38 @@ class function TX86SimdFeatures.CPUHasAESNI(): Boolean; end; class procedure TX86SimdFeatures.ProbeHardwareAndCache(); +var + LHasSSE2, LHasSSE3, LHasSSSE3, LHasSSE41, LHasSSE42, LHasAVX2: Boolean; begin - FActiveSimdLevel := TX86SimdLevel.Scalar; - FHasSHANI := False; - FHasPCLMULQDQ := False; - FHasVPCLMULQDQ := False; - FHasAESNI := False; - - if CPUHasSSE2() then - begin - FActiveSimdLevel := TX86SimdLevel.SSE2; - FHasPCLMULQDQ := CPUHasPCLMULQDQ(); - if CPUHasSSSE3() then - begin - FActiveSimdLevel := TX86SimdLevel.SSSE3; - if CPUHasAVX2() then - begin - FActiveSimdLevel := TX86SimdLevel.AVX2; - FHasVPCLMULQDQ := CPUHasVPCLMULQDQ(); - end; - end; - end; - - FHasSHANI := CPUHasSHANI(); - FHasAESNI := CPUHasAESNI(); + // Probe once, reason later + LHasSSE2 := CPUHasSSE2(); + LHasSSE3 := CPUHasSSE3() and LHasSSE2; + LHasSSSE3 := CPUHasSSSE3() and LHasSSE3; // enforce invariant defensively + LHasSSE41 := CPUHasSSE41() and LHasSSSE3; + LHasSSE42 := CPUHasSSE42() and LHasSSE41; + LHasAVX2 := CPUHasAVX2() and LHasSSE42; // AVX2 implies full SSE lineage + + // Pick the highest tier the CPU can sustain + if LHasAVX2 then + FActiveSimdLevel := TX86SimdLevel.AVX2 + else if LHasSSE42 then + FActiveSimdLevel := TX86SimdLevel.SSE42 + else if LHasSSE41 then + FActiveSimdLevel := TX86SimdLevel.SSE41 + else if LHasSSSE3 then + FActiveSimdLevel := TX86SimdLevel.SSSE3 + else if LHasSSE3 then + FActiveSimdLevel := TX86SimdLevel.SSE3 + else if LHasSSE2 then + FActiveSimdLevel := TX86SimdLevel.SSE2 + else + FActiveSimdLevel := TX86SimdLevel.Scalar; + + // Independent feature bits - not tied to the SIMD tier ladder + FHasAESNI := CPUHasAESNI(); + FHasSHANI := CPUHasSHANI(); + FHasPCLMULQDQ := CPUHasPCLMULQDQ(); + FHasVPCLMULQDQ := CPUHasVPCLMULQDQ() and LHasAVX2; // VPCLMULQDQ needs AVX/AVX2 lanes end; class procedure TX86SimdFeatures.ApplyBuildOverrides(); @@ -222,6 +293,13 @@ class procedure TX86SimdFeatures.ApplyBuildOverrides(); FHasPCLMULQDQ := False; FHasVPCLMULQDQ := False; FHasAESNI := False; +{$ELSEIF DEFINED(HASHLIB_FORCE_SSE3)} + if FActiveSimdLevel > TX86SimdLevel.SSE3 then + FActiveSimdLevel := TX86SimdLevel.SSE3; + FHasSHANI := False; + FHasPCLMULQDQ := False; + FHasVPCLMULQDQ := False; + FHasAESNI := False; {$ELSEIF DEFINED(HASHLIB_FORCE_SSSE3)} if FActiveSimdLevel > TX86SimdLevel.SSSE3 then FActiveSimdLevel := TX86SimdLevel.SSSE3; @@ -229,6 +307,20 @@ class procedure TX86SimdFeatures.ApplyBuildOverrides(); FHasPCLMULQDQ := False; FHasVPCLMULQDQ := False; FHasAESNI := False; +{$ELSEIF DEFINED(HASHLIB_FORCE_SSE41)} + if FActiveSimdLevel > TX86SimdLevel.SSE41 then + FActiveSimdLevel := TX86SimdLevel.SSE41; + FHasSHANI := False; + FHasPCLMULQDQ := False; + FHasVPCLMULQDQ := False; + FHasAESNI := False; +{$ELSEIF DEFINED(HASHLIB_FORCE_SSE42)} + if FActiveSimdLevel > TX86SimdLevel.SSE42 then + FActiveSimdLevel := TX86SimdLevel.SSE42; + FHasSHANI := False; + FHasPCLMULQDQ := False; + FHasVPCLMULQDQ := False; + FHasAESNI := False; {$IFEND} end; @@ -242,11 +334,26 @@ class function TX86SimdFeatures.HasSSE2(): Boolean; Result := FActiveSimdLevel >= TX86SimdLevel.SSE2; end; +class function TX86SimdFeatures.HasSSE3(): Boolean; +begin + Result := FActiveSimdLevel >= TX86SimdLevel.SSE3; +end; + class function TX86SimdFeatures.HasSSSE3(): Boolean; begin Result := FActiveSimdLevel >= TX86SimdLevel.SSSE3; end; +class function TX86SimdFeatures.HasSSE41(): Boolean; +begin + Result := FActiveSimdLevel >= TX86SimdLevel.SSE41; +end; + +class function TX86SimdFeatures.HasSSE42(): Boolean; +begin + Result := FActiveSimdLevel >= TX86SimdLevel.SSE42; +end; + class function TX86SimdFeatures.HasAVX2(): Boolean; begin Result := FActiveSimdLevel >= TX86SimdLevel.AVX2; @@ -272,6 +379,39 @@ class function TX86SimdFeatures.HasAESNI(): Boolean; Result := FHasAESNI; end; +class function TX86SimdFeatures.SelectSlot(const ATiers + : array of TX86SimdLevel): TX86SimdLevel; +begin + Result := SelectSlot(FActiveSimdLevel, ATiers); +end; + +class function TX86SimdFeatures.SelectSlot(AActiveLevel: TX86SimdLevel; + const ATiers: array of TX86SimdLevel): TX86SimdLevel; +var + I: Integer; + LTier, LBest: TX86SimdLevel; + LFound: Boolean; +begin + // Walk all declared tiers, keep the highest one that is <= AActiveLevel. + // Order of ATiers is irrelevant. Empty ATiers or no matching tier yields + // TX86SimdLevel.Scalar so dispatch units cleanly fall through to scalar. + LBest := TX86SimdLevel.Scalar; + LFound := False; + for I := 0 to System.Length(ATiers) - 1 do + begin + LTier := ATiers[I]; + if (LTier <= AActiveLevel) and ((not LFound) or (LTier > LBest)) then + begin + LBest := LTier; + LFound := True; + end; + end; + if LFound then + Result := LBest + else + Result := TX86SimdLevel.Scalar; +end; + initialization TX86SimdFeatures.ProbeHardwareAndCache(); TX86SimdFeatures.ApplyBuildOverrides();