+add AVX2 optimizations of function SynetNormalizeLayerForward16bV2.

ermig1979 · ermig1979 · commit ed8a32c60902 · 2026-03-19T12:56:36.000+03:00
diff --git a/docs/2026.html b/docs/2026.html
@@ -46,7 +46,7 @@ <h5>New features</h5>
  <li>Parameter 'activation' to function SimdSynetInnerProduct16bInit.</li>
  <li>Parameter 'params' to function SimdSynetInnerProduct16bSetParams.</li>
  <li>Base implementation of class SynetGatherElements.</li>
- <li>Base implementation, SSE4.1 optimizations of function SynetNormalizeLayerForward16bV2.</li>
+ <li>Base implementation, SSE4.1, AVX2 optimizations of function SynetNormalizeLayerForward16bV2.</li>
 </ul>
 <h5>Improving</h5>
 <ul>
diff --git a/prj/vs2022/Avx2.vcxproj b/prj/vs2022/Avx2.vcxproj
@@ -129,7 +129,8 @@
     <ClCompile Include="..\..\src\Simd\SimdAvx2SynetMergedConvolution8iDepthwise.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdAvx2SynetMergedConvolution8iInput.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdAvx2SynetMergedConvolution8iOutput.cpp" />
-    <ClCompile Include="..\..\src\Simd\SimdAvx2SynetNormalize.cpp" />
+    <ClCompile Include="..\..\src\Simd\SimdAvx2SynetNormalize16b.cpp" />
+    <ClCompile Include="..\..\src\Simd\SimdAvx2SynetNormalize32f.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdAvx2SynetPermute.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdAvx2SynetPooling.cpp" />
     <ClCompile Include="..\..\src\Simd\SimdAvx2SynetQuantizedActivation.cpp" />
diff --git a/prj/vs2022/Avx2.vcxproj.filters b/prj/vs2022/Avx2.vcxproj.filters
@@ -394,9 +394,6 @@
     <ClCompile Include="..\..\src\Simd\SimdAvx2SynetGridSample2d32fBlZ.cpp">
       <Filter>Avx2\Synet\Other</Filter>
     </ClCompile>
-    <ClCompile Include="..\..\src\Simd\SimdAvx2SynetNormalize.cpp">
-      <Filter>Avx2\Synet\Other</Filter>
-    </ClCompile>
     <ClCompile Include="..\..\src\Simd\SimdAvx2SynetPermute.cpp">
       <Filter>Avx2\Synet\Other</Filter>
     </ClCompile>
@@ -469,6 +466,12 @@
     <ClCompile Include="..\..\src\Simd\SimdAvx2SynetQuantizedConvolutionNhwcDepthwiseV2.cpp">
       <Filter>Avx2\Synet\Quantized</Filter>
     </ClCompile>
+    <ClCompile Include="..\..\src\Simd\SimdAvx2SynetNormalize16b.cpp">
+      <Filter>Avx2\Synet\Normalize</Filter>
+    </ClCompile>
+    <ClCompile Include="..\..\src\Simd\SimdAvx2SynetNormalize32f.cpp">
+      <Filter>Avx2\Synet\Normalize</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <Filter Include="Avx2">
@@ -531,6 +534,9 @@
     <Filter Include="Avx2\Transform">
       <UniqueIdentifier>{03870f81-3415-432d-8124-db5502f137e8}</UniqueIdentifier>
     </Filter>
+    <Filter Include="Avx2\Synet\Normalize">
+      <UniqueIdentifier>{73ef30bd-9802-44ff-a136-fe77b4e92c11}</UniqueIdentifier>
+    </Filter>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\..\src\Simd\SimdAvx2.h">
diff --git a/src/Simd/SimdAvx2.h b/src/Simd/SimdAvx2.h
@@ -528,6 +528,9 @@ namespace Simd
         void SynetNormalizeLayerForwardV4(const float* src, size_t batch, size_t channels, size_t spatial,
             const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, float* dst);
 
+        void SynetNormalizeLayerForward16bV2(const uint16_t* src, size_t batch, size_t channels, size_t spatial,
+            const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, uint16_t* dst);
+
         void SynetPoolingAverage(const float* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX,
             size_t strideY, size_t strideX, size_t padY, size_t padX, float* dst, size_t dstH, size_t dstW, SimdBool excludePad, SimdTensorFormatType format);
 
diff --git a/src/Simd/SimdAvx2SynetNormalize16b.cpp b/src/Simd/SimdAvx2SynetNormalize16b.cpp
@@ -0,0 +1,96 @@
+/*
+* Simd Library (http://ermig1979.github.io/Simd).
+*
+* Copyright (c) 2011-2024 Yermalayeu Ihar.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a copy
+* of this software and associated documentation files (the "Software"), to deal
+* in the Software without restriction, including without limitation the rights
+* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+* copies of the Software, and to permit persons to whom the Software is
+* furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be included in
+* all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+#include "Simd/SimdSynet.h"
+#include "Simd/SimdArray.h"
+#include "Simd/SimdMath.h"
+#include "Simd/SimdExtract.h"
+#include "Simd/SimdAvx2.h"
+
+namespace Simd
+{
+#if defined(SIMD_AVX2_ENABLE) && defined(SIMD_SYNET_ENABLE)   
+    namespace Avx2
+    {
+        // Per-position normalization across channels for BF16 data in NHWC order (AVX2 path).
+        // For every one of the batch * spatial positions the routine:
+        //   1) widens 'channels' BF16 inputs to FP32 in 'buf';
+        //   2) subtracts their mean;
+        //   3) multiplies by 1 / sqrt(mean of squared deviations + eps);
+        //   4) applies the per-channel 'scale' and 'shift' with a fused multiply-add;
+        //   5) narrows the result back to BF16 into 'dst'.
+        // Parameters:
+        //   src, dst     - input / output; both advance by 'channels' elements per position.
+        //   scale, shift - FP32 coefficients, indexed by channel.
+        //   eps          - added under the square root.
+        //   buf          - FP32 scratch for 'channels' values; if NULL, a temporary array is allocated here.
+        void NormalizeNhwc16bV2(const uint16_t* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, float eps, float * buf, uint16_t* dst)
+        {
+            float k = 1.0f / float(channels);
+            // channelsF is 'channels' rounded down to a multiple of F; vector loops stop there
+            // and the scalar tails below finish the remaining channels.
+            size_t channelsF = AlignLo(channels, F), c;
+            Array32f _buf;
+            if (buf == NULL)
+            {
+                _buf.Resize(channels);
+                buf = _buf.data;
+            }
+            for (size_t b = 0; b < batch; ++b)
+            {
+                for (size_t s = 0; s < spatial; ++s)
+                {
+                    BFloat16ToFloat32(src, channels, buf);
+
+                    // Pass 1: sum of the channel values, then center them in place.
+                    __m256 _sum = _mm256_setzero_ps();
+                    for (c = 0; c < channelsF; c += F)
+                        _sum = _mm256_add_ps(_mm256_loadu_ps(buf + c), _sum);
+                    float sum = ExtractSum(_sum);
+                    for (; c < channels; ++c)
+                        sum += buf[c];
+                    __m256 mean = _mm256_set1_ps(sum * k);
+                    for (c = 0; c < channelsF; c += F)
+                        _mm256_storeu_ps(buf + c, _mm256_sub_ps(_mm256_loadu_ps(buf + c), mean));
+                    // Scalar tail: the low 128-bit half of the broadcast register supplies lane 0.
+                    for (; c < channels; ++c)
+                        _mm_store_ss(buf + c, _mm_sub_ss(_mm_load_ss(buf + c), _mm256_castps256_ps128(mean)));
+
+                    // Pass 2: sum of squares of the centered values.
+                    __m256 _sqsum = _mm256_setzero_ps();
+                    for (c = 0; c < channelsF; c += F)
+                    {
+                        // Named _val so it does not shadow the Array32f '_buf' declared above.
+                        __m256 _val = _mm256_loadu_ps(buf + c);
+                        _sqsum = _mm256_fmadd_ps(_val, _val, _sqsum);
+                    }
+                    float sqsum = ExtractSum(_sqsum);
+                    for (; c < channels; ++c)
+                        sqsum += Simd::Square(buf[c]);
+                    // Pass 3: buf[c] = (buf[c] * norm) * scale[c] + shift[c].
+                    __m256 norm = _mm256_set1_ps(1.0f / ::sqrt(sqsum * k + eps));
+                    for (c = 0; c < channelsF; c += F)
+                        _mm256_storeu_ps(buf + c, _mm256_fmadd_ps(_mm256_mul_ps(_mm256_loadu_ps(buf + c), norm), _mm256_loadu_ps(scale + c), _mm256_loadu_ps(shift + c)));
+                    for (; c < channels; ++c)
+                        _mm_store_ss(buf + c, _mm_fmadd_ss(_mm_mul_ss(_mm_load_ss(buf + c), _mm256_castps256_ps128(norm)), _mm_load_ss(scale + c), _mm_load_ss(shift + c)));
+
+                    Float32ToBFloat16(buf, channels, dst);
+
+                    dst += channels;
+                    src += channels;
+                }
+            }
+        }
+
+        // AVX2 entry point for SynetNormalizeLayerForward16bV2.
+        // Only SimdTensorFormatNhwc is served; any other format hits assert(0) and no
+        // normalization is performed.
+        void SynetNormalizeLayerForward16bV2(const uint16_t* src, size_t batch, size_t channels, size_t spatial,
+            const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, uint16_t* dst)
+        {
+            if (format != SimdTensorFormatNhwc)
+            {
+                assert(0);
+                return;
+            }
+            // eps arrives by pointer in this signature; the NHWC kernel takes it by value.
+            NormalizeNhwc16bV2(src, batch, channels, spatial, scale, shift, *eps, buf, dst);
+        }
+    }
+#endif
+}
diff --git a/src/Simd/SimdAvx2SynetNormalize32f.cpp b/src/Simd/SimdAvx2SynetNormalize32f.cpp
diff --git a/src/Simd/SimdLib.cpp b/src/Simd/SimdLib.cpp
@@ -6104,7 +6104,7 @@ SIMD_API void SimdSynetNormalizeLayerForward16bV2(const uint16_t* src, size_t ba
 #if defined(SIMD_SYNET_ENABLE)
     typedef void(*SimdSynetNormalizeLayerForward16bV2Ptr) (const uint16_t* src, size_t batch, size_t channels, size_t spatial,
         const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, uint16_t* dst);
-    const static SimdSynetNormalizeLayerForward16bV2Ptr simdSynetNormalizeLayerForward16bV2 = SIMD_FUNC1(SynetNormalizeLayerForward16bV2, SIMD_SSE41_FUNC);//, SIMD_AVX512BW_FUNC, SIMD_AVX2_FUNC , SIMD_NEON_FUNC);
+    const static SimdSynetNormalizeLayerForward16bV2Ptr simdSynetNormalizeLayerForward16bV2 = SIMD_FUNC2(SynetNormalizeLayerForward16bV2, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC);//, SIMD_AVX512BW_FUNC , SIMD_NEON_FUNC);
 
     simdSynetNormalizeLayerForward16bV2(src, batch, channels, spatial, scale, shift, eps, format, buf, dst);
 #else
diff --git a/src/Simd/SimdSse41SynetNormalize16b.cpp b/src/Simd/SimdSse41SynetNormalize16b.cpp
@@ -36,13 +36,13 @@ namespace Simd
         void NormalizeNhwc16bV2(const uint16_t* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, float eps, float* buf, uint16_t* dst)
         {
             float k = 1.0f / float(channels);
+            size_t channelsF = AlignLo(channels, F), c;
             Array32f _buf;
             if (buf == NULL)
             {
                 _buf.Resize(channels);
                 buf = _buf.data;
             }
-            size_t channelsF = AlignLo(channels, F), c;
             for (size_t b = 0; b < batch; ++b)
             {
                 for (size_t s = 0; s < spatial; ++s)
diff --git a/src/Test/TestSynetNormalize16b.cpp b/src/Test/TestSynetNormalize16b.cpp
@@ -109,6 +109,7 @@ namespace Test
 
         for (int f = 0; f < 1; f++)
         {
+            result = result && SynetNormalizeLayerForward16bV2AutoTest(1, 512, 196, formats[f], 1, f1, f2);
             result = result && SynetNormalizeLayerForward16bV2AutoTest(1, C, W, formats[f], 1, f1, f2);
             result = result && SynetNormalizeLayerForward16bV2AutoTest(8, C, W, formats[f], 1, f1, f2);
             result = result && SynetNormalizeLayerForward16bV2AutoTest(7, C - O, W + O, formats[f], 0, f1, f2);
@@ -129,6 +130,11 @@ namespace Test
             result = result && SynetNormalizeLayerForward16bV2AutoTest(FUNC_SNLF16B2(Simd::Sse41::SynetNormalizeLayerForward16bV2), FUNC_SNLF16B2(SimdSynetNormalizeLayerForward16bV2));
 #endif 
 
+#ifdef SIMD_AVX2_ENABLE
+        if (Simd::Avx2::Enable && TestAvx2(options))
+            result = result && SynetNormalizeLayerForward16bV2AutoTest(FUNC_SNLF16B2(Simd::Avx2::SynetNormalizeLayerForward16bV2), FUNC_SNLF16B2(SimdSynetNormalizeLayerForward16bV2));
+#endif
+
         return result;
     }
 

Original file line number	Diff line number	Diff line change
`@@ -36,13 +36,13 @@ namespace Simd`
`36`	`36`	`void NormalizeNhwc16bV2(const uint16_t* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, float eps, float* buf, uint16_t* dst)`
`37`	`37`	`{`
`38`	`38`	`float k = 1.0f / float(channels);`
	`39`	`+ size_t channelsF = AlignLo(channels, F), c;`
`39`	`40`	`Array32f _buf;`
`40`	`41`	`if (buf == NULL)`
`41`	`42`	`{`
`42`	`43`	`_buf.Resize(channels);`
`43`	`44`	`buf = _buf.data;`
`44`	`45`	`}`
`45`		`- size_t channelsF = AlignLo(channels, F), c;`
`46`	`46`	`for (size_t b = 0; b < batch; ++b)`
`47`	`47`	`{`
`48`	`48`	`for (size_t s = 0; s < spatial; ++s)`