Skip to content

Commit ed8a32c

Browse files
committed
+add AVX2 optimizations of function SynetNormalizeLayerForward16bV2.
1 parent 50feecc commit ed8a32c

9 files changed

Lines changed: 119 additions & 7 deletions

docs/2026.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ <h5>New features</h5>
4646
<li>Parameter 'activation' to function SimdSynetInnerProduct16bInit.</li>
4747
<li>Parameter 'params' to function SimdSynetInnerProduct16bSetParams.</li>
4848
<li>Base implementation of class SynetGatherElements.</li>
49-
<li>Base implementation, SSE4.1 optimizations of function SynetNormalizeLayerForward16bV2.</li>
49+
<li>Base implementation, SSE4.1, AVX2 optimizations of function SynetNormalizeLayerForward16bV2.</li>
5050
</ul>
5151
<h5>Improving</h5>
5252
<ul>

prj/vs2022/Avx2.vcxproj

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,8 @@
129129
<ClCompile Include="..\..\src\Simd\SimdAvx2SynetMergedConvolution8iDepthwise.cpp" />
130130
<ClCompile Include="..\..\src\Simd\SimdAvx2SynetMergedConvolution8iInput.cpp" />
131131
<ClCompile Include="..\..\src\Simd\SimdAvx2SynetMergedConvolution8iOutput.cpp" />
132-
<ClCompile Include="..\..\src\Simd\SimdAvx2SynetNormalize.cpp" />
132+
<ClCompile Include="..\..\src\Simd\SimdAvx2SynetNormalize16b.cpp" />
133+
<ClCompile Include="..\..\src\Simd\SimdAvx2SynetNormalize32f.cpp" />
133134
<ClCompile Include="..\..\src\Simd\SimdAvx2SynetPermute.cpp" />
134135
<ClCompile Include="..\..\src\Simd\SimdAvx2SynetPooling.cpp" />
135136
<ClCompile Include="..\..\src\Simd\SimdAvx2SynetQuantizedActivation.cpp" />

prj/vs2022/Avx2.vcxproj.filters

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -394,9 +394,6 @@
394394
<ClCompile Include="..\..\src\Simd\SimdAvx2SynetGridSample2d32fBlZ.cpp">
395395
<Filter>Avx2\Synet\Other</Filter>
396396
</ClCompile>
397-
<ClCompile Include="..\..\src\Simd\SimdAvx2SynetNormalize.cpp">
398-
<Filter>Avx2\Synet\Other</Filter>
399-
</ClCompile>
400397
<ClCompile Include="..\..\src\Simd\SimdAvx2SynetPermute.cpp">
401398
<Filter>Avx2\Synet\Other</Filter>
402399
</ClCompile>
@@ -469,6 +466,12 @@
469466
<ClCompile Include="..\..\src\Simd\SimdAvx2SynetQuantizedConvolutionNhwcDepthwiseV2.cpp">
470467
<Filter>Avx2\Synet\Quantized</Filter>
471468
</ClCompile>
469+
<ClCompile Include="..\..\src\Simd\SimdAvx2SynetNormalize16b.cpp">
470+
<Filter>Avx2\Synet\Normalize</Filter>
471+
</ClCompile>
472+
<ClCompile Include="..\..\src\Simd\SimdAvx2SynetNormalize32f.cpp">
473+
<Filter>Avx2\Synet\Normalize</Filter>
474+
</ClCompile>
472475
</ItemGroup>
473476
<ItemGroup>
474477
<Filter Include="Avx2">
@@ -531,6 +534,9 @@
531534
<Filter Include="Avx2\Transform">
532535
<UniqueIdentifier>{03870f81-3415-432d-8124-db5502f137e8}</UniqueIdentifier>
533536
</Filter>
537+
<Filter Include="Avx2\Synet\Normalize">
538+
<UniqueIdentifier>{73ef30bd-9802-44ff-a136-fe77b4e92c11}</UniqueIdentifier>
539+
</Filter>
534540
</ItemGroup>
535541
<ItemGroup>
536542
<ClInclude Include="..\..\src\Simd\SimdAvx2.h">

src/Simd/SimdAvx2.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -528,6 +528,9 @@ namespace Simd
528528
void SynetNormalizeLayerForwardV4(const float* src, size_t batch, size_t channels, size_t spatial,
529529
const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, float* dst);
530530

531+
void SynetNormalizeLayerForward16bV2(const uint16_t* src, size_t batch, size_t channels, size_t spatial,
532+
const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, uint16_t* dst);
533+
531534
void SynetPoolingAverage(const float* src, size_t srcC, size_t srcH, size_t srcW, size_t kernelY, size_t kernelX,
532535
size_t strideY, size_t strideX, size_t padY, size_t padX, float* dst, size_t dstH, size_t dstW, SimdBool excludePad, SimdTensorFormatType format);
533536

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
/*
2+
* Simd Library (http://ermig1979.github.io/Simd).
3+
*
4+
* Copyright (c) 2011-2024 Yermalayeu Ihar.
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in
14+
* all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22+
* SOFTWARE.
23+
*/
24+
#include "Simd/SimdSynet.h"
25+
#include "Simd/SimdArray.h"
26+
#include "Simd/SimdMath.h"
27+
#include "Simd/SimdExtract.h"
28+
#include "Simd/SimdAvx2.h"
29+
30+
namespace Simd
31+
{
32+
#if defined(SIMD_AVX2_ENABLE) && defined(SIMD_SYNET_ENABLE)
33+
namespace Avx2
34+
{
35+
void NormalizeNhwc16bV2(const uint16_t* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, float eps, float * buf, uint16_t* dst)
36+
{
37+
float k = 1.0f / float(channels);
38+
size_t channelsF = AlignLo(channels, F), c;
39+
Array32f _buf;
40+
if (buf == NULL)
41+
{
42+
_buf.Resize(channels);
43+
buf = _buf.data;
44+
}
45+
for (size_t b = 0; b < batch; ++b)
46+
{
47+
for (size_t s = 0; s < spatial; ++s)
48+
{
49+
BFloat16ToFloat32(src, channels, buf);
50+
51+
__m256 _sum = _mm256_setzero_ps();
52+
for (c = 0; c < channelsF; c += F)
53+
_sum = _mm256_add_ps(_mm256_loadu_ps(buf + c), _sum);
54+
float sum = ExtractSum(_sum);
55+
for (; c < channels; ++c)
56+
sum += buf[c];
57+
__m256 mean = _mm256_set1_ps(sum * k);
58+
for (c = 0; c < channelsF; c += F)
59+
_mm256_storeu_ps(buf + c, _mm256_sub_ps(_mm256_loadu_ps(buf + c), mean));
60+
for (; c < channels; ++c)
61+
_mm_store_ss(buf + c, _mm_sub_ss(_mm_load_ss(buf + c), _mm256_castps256_ps128(mean)));
62+
63+
__m256 _sqsum = _mm256_setzero_ps();
64+
for (c = 0; c < channelsF; c += F)
65+
{
66+
__m256 _buf = _mm256_loadu_ps(buf + c);
67+
_sqsum = _mm256_fmadd_ps(_buf, _buf, _sqsum);
68+
}
69+
float sqsum = ExtractSum(_sqsum);
70+
for (; c < channels; ++c)
71+
sqsum += Simd::Square(buf[c]);
72+
__m256 norm = _mm256_set1_ps(1.0f / ::sqrt(sqsum * k + eps));
73+
for (c = 0; c < channelsF; c += F)
74+
_mm256_storeu_ps(buf + c, _mm256_fmadd_ps(_mm256_mul_ps(_mm256_loadu_ps(buf + c), norm), _mm256_loadu_ps(scale + c), _mm256_loadu_ps(shift + c)));
75+
for (; c < channels; ++c)
76+
_mm_store_ss(buf + c, _mm_fmadd_ss(_mm_mul_ss(_mm_load_ss(buf + c), _mm256_castps256_ps128(norm)), _mm_load_ss(scale + c), _mm_load_ss(shift + c)));
77+
78+
Float32ToBFloat16(buf, channels, dst);
79+
80+
dst += channels;
81+
src += channels;
82+
}
83+
}
84+
}
85+
86+
void SynetNormalizeLayerForward16bV2(const uint16_t* src, size_t batch, size_t channels, size_t spatial,
87+
const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, uint16_t* dst)
88+
{
89+
if (format == SimdTensorFormatNhwc)
90+
NormalizeNhwc16bV2(src, batch, channels, spatial, scale, shift, *eps, buf, dst);
91+
else
92+
assert(0);
93+
}
94+
}
95+
#endif
96+
}

src/Simd/SimdLib.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6104,7 +6104,7 @@ SIMD_API void SimdSynetNormalizeLayerForward16bV2(const uint16_t* src, size_t ba
61046104
#if defined(SIMD_SYNET_ENABLE)
61056105
typedef void(*SimdSynetNormalizeLayerForward16bV2Ptr) (const uint16_t* src, size_t batch, size_t channels, size_t spatial,
61066106
const float* scale, const float* shift, const float* eps, SimdTensorFormatType format, float* buf, uint16_t* dst);
6107-
const static SimdSynetNormalizeLayerForward16bV2Ptr simdSynetNormalizeLayerForward16bV2 = SIMD_FUNC1(SynetNormalizeLayerForward16bV2, SIMD_SSE41_FUNC);//, SIMD_AVX512BW_FUNC, SIMD_AVX2_FUNC , SIMD_NEON_FUNC);
6107+
const static SimdSynetNormalizeLayerForward16bV2Ptr simdSynetNormalizeLayerForward16bV2 = SIMD_FUNC2(SynetNormalizeLayerForward16bV2, SIMD_AVX2_FUNC, SIMD_SSE41_FUNC);//, SIMD_AVX512BW_FUNC , SIMD_NEON_FUNC);
61086108

61096109
simdSynetNormalizeLayerForward16bV2(src, batch, channels, spatial, scale, shift, eps, format, buf, dst);
61106110
#else

src/Simd/SimdSse41SynetNormalize16b.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,13 @@ namespace Simd
3636
void NormalizeNhwc16bV2(const uint16_t* src, size_t batch, size_t channels, size_t spatial, const float* scale, const float* shift, float eps, float* buf, uint16_t* dst)
3737
{
3838
float k = 1.0f / float(channels);
39+
size_t channelsF = AlignLo(channels, F), c;
3940
Array32f _buf;
4041
if (buf == NULL)
4142
{
4243
_buf.Resize(channels);
4344
buf = _buf.data;
4445
}
45-
size_t channelsF = AlignLo(channels, F), c;
4646
for (size_t b = 0; b < batch; ++b)
4747
{
4848
for (size_t s = 0; s < spatial; ++s)

src/Test/TestSynetNormalize16b.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ namespace Test
109109

110110
for (int f = 0; f < 1; f++)
111111
{
112+
result = result && SynetNormalizeLayerForward16bV2AutoTest(1, 512, 196, formats[f], 1, f1, f2);
112113
result = result && SynetNormalizeLayerForward16bV2AutoTest(1, C, W, formats[f], 1, f1, f2);
113114
result = result && SynetNormalizeLayerForward16bV2AutoTest(8, C, W, formats[f], 1, f1, f2);
114115
result = result && SynetNormalizeLayerForward16bV2AutoTest(7, C - O, W + O, formats[f], 0, f1, f2);
@@ -129,6 +130,11 @@ namespace Test
129130
result = result && SynetNormalizeLayerForward16bV2AutoTest(FUNC_SNLF16B2(Simd::Sse41::SynetNormalizeLayerForward16bV2), FUNC_SNLF16B2(SimdSynetNormalizeLayerForward16bV2));
130131
#endif
131132

133+
#ifdef SIMD_AVX2_ENABLE
134+
if (Simd::Avx2::Enable && TestAvx2(options))
135+
result = result && SynetNormalizeLayerForward16bV2AutoTest(FUNC_SNLF16B2(Simd::Avx2::SynetNormalizeLayerForward16bV2), FUNC_SNLF16B2(SimdSynetNormalizeLayerForward16bV2));
136+
#endif
137+
132138
return result;
133139
}
134140

0 commit comments

Comments
 (0)