Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions intrin-nolut.c
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#ifdef __AVX__
//#define _GNU_SOURCE
#include <emmintrin.h>
#include <immintrin.h> // vzeroupper
Expand Down Expand Up @@ -240,3 +241,5 @@ void SYSV_ABI rs_process_nolut_intrin(void* dstvoid, const void* srcvoid, size_t
* VPSLLVW doesn't exist until AVX512BW. AVX2 only has D and Q sizes.
* On Haswell, those take 3 uops anyway (lat=2, recip tput=2). useless without fast vshift
*/

#endif
4 changes: 2 additions & 2 deletions intrin-pinsrw.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ void SYSV_ABI rs_process_pinsrw_intrin(void* dstvoid, const void* srcvoid, size_
const uint64_t *src = srcvoid;
__m128i *dst = dstvoid;

const typeof(LH) L = LH;
const typeof(LH) H = LH + 256;
const uint32_t* L = LH;
const uint32_t* H = LH + 256;

// _mm256_zeroupper();
for (size_t i = 0; i < size/sizeof(*dst) ; i+=1) {
Expand Down
29 changes: 24 additions & 5 deletions main.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@
* compile with:
* x86_64-w64-mingw32-gcc to make a.exe
*
* gcc -DIACA_MARKS_OFF -o rs-asmbench -g -Wall -march=native -funroll-loops -O3 -std=gnu11 main.c process-purec.c intrin-nolut.c reedsolomon-x86_64-mmx.s reedsolomon-x86_64-mmx-orig.s asm-avx2-vgatherdd.s intrin-pinsrw.c asm-pinsrw*.s
* gcc -DIACA_MARKS_OFF -o rs-asmbench -g -Wall -march=native -funroll-loops -O3 -std=gnu99 main.c process-purec.c intrin-nolut.c reedsolomon-x86_64-mmx.s reedsolomon-x86_64-mmx-orig.s asm-avx2-vgatherdd.s intrin-pinsrw.c asm-pinsrw*.s xordep*.[cs]
* (or, for older compilers)
* gcc -DIACA_MARKS_OFF -o rs-asmbench -g -Wall -march=native -funroll-loops -O3 -std=gnu99 main.c process-purec.c intrin-nolut.c reedsolomon-x86_64-mmx.s reedsolomon-x86_64-mmx-orig.s intrin-pinsrw.c asm-pinsrw*.s xordep*.[cs]
*
* some ASM files have IACA marks in them, but the illegal-instruction code is only illegal for 32bit code.
*
* run with:
Expand Down Expand Up @@ -54,6 +57,13 @@
#define HAVE_AVX2 0
#endif

#ifdef __AVX__
#define VZEROUPPER if(HAVE_AVX2) _mm256_zeroupper();
#else
#define VZEROUPPER
#endif


static __inline__ uint64_t rdtsc() {
uint32_t low, high;
/* __asm__ __volatile__ (
Expand All @@ -75,8 +85,10 @@ void SYSV_ABI rs_process_pinsrw64(void* dst, const void* src, size_t size, const
void SYSV_ABI rs_process_pinsrw128(void* dst, const void* src, size_t size, const uint32_t* LH);
void SYSV_ABI rs_process_pinsrw_nodep(void* dst, const void* src, size_t size, const uint32_t* LH);
void SYSV_ABI rs_process_uoptest(void* dst, const void* src, size_t size, const uint32_t* LH);
void SYSV_ABI rs_process_xordep_mul32767(void* dst, const void* src, size_t size, const uint32_t* LH);
// rs_process_pinsrw_intrin
void SYSV_ABI rs_dummy(void* dst, const void* src, size_t size, const uint32_t* LH) { }
void SYSV_ABI rs_memcpy(void* dst, const void* src, size_t size, const uint32_t* LH) { memcpy(dst, src, size) ;}

#ifdef PERF_ONE
#define ONE_ALGO_ONLY
Expand All @@ -102,14 +114,14 @@ static uint64_t time_rs(rs_procfunc_t *fn, void* dst, const void* src, size_t si
{
uint64_t starttime, stoptime;

_mm256_zeroupper();
VZEROUPPER
starttime = rdtsc();
const int maxiter = ITERS;
for (int c=0 ; c<maxiter ; c++) {
fn(dst, src, size, LH);
}
stoptime = rdtsc();
_mm256_zeroupper();
VZEROUPPER
return stoptime - starttime;
}

Expand All @@ -128,7 +140,7 @@ int main (int argc, char *argv[])
for (int i=0; i<512; i++) {
lhTable[i+64/sizeof(*lhTable)] = i;
}
typeof (*lhTable) *LH = lhTable + 64/sizeof(*lhTable);
LH_TABLE_T *LH = lhTable + 64/sizeof(*lhTable);
// LH has zeroes before it, and starts at the beginning of a cache line.
// It is all in one page (including padding)

Expand Down Expand Up @@ -177,14 +189,16 @@ int main (int argc, char *argv[])
time_rs_print ("pinsrw128 ", rs_process_pinsrw128, dstbuf, srcbuf, size, LH);
time_rs_print ("orig MMX-unpck", rs_process_x86_64_mmx_orig, dstbuf, srcbuf, size, LH);
time_rs_print ("dummy ", rs_dummy, dstbuf, srcbuf, size, LH);
time_rs_print ("memcpy ", rs_memcpy, dstbuf, srcbuf, size, LH);
time_rs_print ("MMX w/ 64b rdx", rs_process_x86_64_mmx, dstbuf, srcbuf, size, LH);
time_rs_print ("pinsrw-intrin ", rs_process_pinsrw_intrin, dstbuf, srcbuf, size, LH);
// time_rs_print ("pinsrw-unpipe ", rs_process_pinsrw_unpipelined, dstbuf, srcbuf, size, LH);
time_rs_print ("Pure C ", rs_process_purec, dstbuf, srcbuf, size, LH);
time_rs_print ("xord mul32767 ", rs_process_xordep_mul32767, dstbuf, srcbuf, size, LH);
puts ("----------------");
#endif
for (int i=0 ; i<3 ; i++) {
_mm256_zeroupper();
VZEROUPPER
#ifndef ONE_ALGO_ONLY
time_rs_print ("orig MMX-unpck", rs_process_x86_64_mmx_orig, dstbuf, srcbuf, size, LH);
// time_rs_print ("MMX w/ 64b rdx", rs_process_x86_64_mmx, dstbuf, srcbuf, size, LH);
Expand All @@ -195,15 +209,20 @@ int main (int argc, char *argv[])
time_rs_print ("pinsrw-intrin ", rs_process_pinsrw_intrin, dstbuf, srcbuf, size, LH);
time_rs_print ("Pure C ", rs_process_purec, dstbuf, srcbuf, size, LH);
// time_rs_print ("uoptest ", rs_process_uoptest, dstbuf, srcbuf, size, LH);
#ifdef __AVX__
time_rs_print ("nolut AVX ", rs_process_nolut_intrin, dstbuf, srcbuf, size, LH);
#endif
#else
time_rs_print ("pinsrw128 ", rs_process_pinsrw128, dstbuf, srcbuf, size, LH);
#endif
time_rs_print ("xord mul32767 ", rs_process_xordep_mul32767, dstbuf, srcbuf, size, LH);
// fflush(stdout);
#ifdef __AVX__
if (HAVE_AVX2) {
time_rs_print ("AVX2 vgather ", rs_process_vgather_align32, dstbuf, srcbuf, size, LH);
time_rs_print ("AVX2 vgather ", rs_process_vgather_align32, dstbuf, srcbuf, size, LH);
}
#endif
}

puts ("----------------");
Expand Down
8 changes: 4 additions & 4 deletions process-purec.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ void SYSV_ABI rs_process_purec_64b(void* dstvoid, const void* srcvoid, size_t si

// GCC is silly and keeps L and H in separate regs, instead of using an addressing mode with a displacement
// or even worse, generates add $256, %index_reg and then uses a load with no displacement
typeof(LH) L = LH;
typeof(LH) H = LH+256;
const uint32_t* L = LH;
const uint32_t* H = LH+256;

size &= ~0x07; // multiple of 8
const uint64_t *src64 = srcvoid;
Expand Down Expand Up @@ -82,8 +82,8 @@ void SYSV_ABI rs_process_purec_64b(void* dstvoid, const void* srcvoid, size_t si
/**************** uint32_t version ****************/
void SYSV_ABI rs_process_purec_32b(void* dstvoid, const void* srcvoid, size_t size, const uint32_t* LH)
{
typeof(LH) L = LH;
typeof(LH) H = LH+256;
const uint32_t* L = LH;
const uint32_t* H = LH+256;

size &= ~0x07; // multiple of 8
const uint32_t *src32 = srcvoid;
Expand Down
227 changes: 227 additions & 0 deletions xordep_mul32767.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
# disassembled from XOR algo's JIT for multiplying by 32767
# slightly modified for readability etc
# algorithm doesn't use LH tables, so that arg is ignored
# NOTE: size must be a multiple of 256

.text
.intel_syntax noprefix
.globl rs_process_xordep_mul32767
rs_process_xordep_mul32767:
# rs_process_xordep_mul32767(void* dst (%rdi), const void* src (%rsi), size_t size (%rdx), const u16* LH (%rcx));

# save xmm6-15 for Windows' sake, not necessary otherwise
push rbp
mov rbp,rsp
mov rax,rsp
and rax,0xF
sub rbp,rax
movaps xmmword ptr [rbp-0x10],xmm6
movaps xmmword ptr [rbp-0x20],xmm7
movaps xmmword ptr [rbp-0x30],xmm8
movaps xmmword ptr [rbp-0x40],xmm9
movaps xmmword ptr [rbp-0x50],xmm10
movaps xmmword ptr [rbp-0x60],xmm11
movaps xmmword ptr [rbp-0x70],xmm12
movaps xmmword ptr [rbp-0x80],xmm13
movaps xmmword ptr [rbp-0x90],xmm14
movaps xmmword ptr [rbp-0xA0],xmm15

mov rax,rsi #src
lea rcx,[rdi+rdx] #dest-end
mov rdx,rdi #dest

.align 16
.loop:
# pre-load inputs 3-15 into registers (can't fit all 16 inputs)
movaps xmm3,xmmword ptr [rax+0x30]
movaps xmm4,xmmword ptr [rax+0x40]
movaps xmm5,xmmword ptr [rax+0x50]
movaps xmm6,xmmword ptr [rax+0x60]
movaps xmm7,xmmword ptr [rax+0x70]
movaps xmm8,xmmword ptr [rax+0x80]
movaps xmm9,xmmword ptr [rax+0x90]
movaps xmm10,xmmword ptr [rax+0xA0]
movaps xmm11,xmmword ptr [rax+0xB0]
movaps xmm12,xmmword ptr [rax+0xC0]
movaps xmm13,xmmword ptr [rax+0xD0]
movaps xmm14,xmmword ptr [rax+0xE0]
movaps xmm15,xmmword ptr [rax+0xF0]

# process 256 bytes; algorithm does it in 32 byte 'sub-blocks'
movaps xmm0,xmmword ptr [rdx]
movdqa xmm1,xmmword ptr [rdx+0x10]
movaps xmm2,xmmword ptr [rax]
xorps xmm2,xmmword ptr [rax+0x10]
pxor xmm1,xmmword ptr [rax+0x20]
xorps xmm0,xmm3
xorps xmm2,xmm4
xorps xmm2,xmm5
xorps xmm2,xmm6
pxor xmm1,xmm7
xorps xmm0,xmm11
xorps xmm2,xmm12
xorps xmm2,xmm13
xorps xmm2,xmm14
pxor xmm1,xmm15
xorps xmm0,xmm2
pxor xmm1,xmm2
movaps xmmword ptr [rdx],xmm0
movdqa xmmword ptr [rdx+0x10],xmm1

movaps xmm0,xmmword ptr [rdx+0x20]
movdqa xmm1,xmmword ptr [rdx+0x30]
movaps xmm2,xmmword ptr [rax]
xorps xmm2,xmmword ptr [rax+0x10]
xorps xmm2,xmmword ptr [rax+0x20]
xorps xmm2,xmm3
pxor xmm1,xmm4
xorps xmm0,xmm5
xorps xmm2,xmm6
xorps xmm2,xmm7
xorps xmm2,xmm8
pxor xmm1,xmm9
xorps xmm0,xmm13
xorps xmm2,xmm14
xorps xmm2,xmm15
xorps xmm0,xmm2
pxor xmm1,xmm2
movaps xmmword ptr [rdx+0x20],xmm0
movdqa xmmword ptr [rdx+0x30],xmm1

movaps xmm0,xmmword ptr [rdx+0x40]
movdqa xmm1,xmmword ptr [rdx+0x50]
pxor xmm1,xmmword ptr [rax]
xorps xmm0,xmmword ptr [rax+0x20]
pxor xmm1,xmm3
xorps xmm0,xmm6
movaps xmm2,xmm7
xorps xmm2,xmm8
xorps xmm2,xmm9
xorps xmm2,xmm10
xorps xmm2,xmm11
xorps xmm2,xmm12
xorps xmm2,xmm13
xorps xmm2,xmm14
xorps xmm2,xmm15
xorps xmm0,xmm2
pxor xmm1,xmm2
movaps xmmword ptr [rdx+0x40],xmm0
movdqa xmmword ptr [rdx+0x50],xmm1

movaps xmm0,xmmword ptr [rdx+0x60]
movdqa xmm1,xmmword ptr [rdx+0x70]
pxor xmm1,xmmword ptr [rax]
xorps xmm0,xmmword ptr [rax+0x10]
pxor xmm1,xmmword ptr [rax+0x20]
xorps xmm0,xmm4
pxor xmm1,xmm5
xorps xmm0,xmm8
movaps xmm2,xmm9
xorps xmm2,xmm10
xorps xmm2,xmm11
xorps xmm2,xmm12
xorps xmm2,xmm13
xorps xmm2,xmm14
xorps xmm2,xmm15
xorps xmm0,xmm2
pxor xmm1,xmm2
movaps xmmword ptr [rdx+0x60],xmm0
movdqa xmmword ptr [rdx+0x70],xmm1

movaps xmm0,xmmword ptr [rdx+0x80]
movdqa xmm1,xmmword ptr [rdx+0x90]
xorps xmm0,xmmword ptr [rax]
movaps xmm2,xmmword ptr [rax+0x10]
pxor xmm1,xmmword ptr [rax+0x20]
xorps xmm0,xmm3
pxor xmm1,xmm4
xorps xmm0,xmm6
pxor xmm1,xmm7
xorps xmm0,xmm10
xorps xmm2,xmm11
xorps xmm2,xmm12
xorps xmm2,xmm13
xorps xmm2,xmm14
xorps xmm2,xmm15
xorps xmm0,xmm2
pxor xmm1,xmm2
movaps xmmword ptr [rdx+0x80],xmm0
movdqa xmmword ptr [rdx+0x90],xmm1

movaps xmm0,xmmword ptr [rdx+0xA0]
movdqa xmm1,xmmword ptr [rdx+0xB0]
xorps xmm0,xmmword ptr [rax]
pxor xmm1,xmmword ptr [rax+0x10]
xorps xmm0,xmmword ptr [rax+0x20]
movaps xmm2,xmm3
pxor xmm1,xmm4
xorps xmm0,xmm5
pxor xmm1,xmm6
xorps xmm0,xmm8
pxor xmm1,xmm9
xorps xmm0,xmm12
xorps xmm2,xmm13
xorps xmm2,xmm14
xorps xmm2,xmm15
xorps xmm0,xmm2
pxor xmm1,xmm2
movaps xmmword ptr [rdx+0xA0],xmm0
movdqa xmmword ptr [rdx+0xB0],xmm1

movaps xmm0,xmmword ptr [rdx+0xC0]
movdqa xmm1,xmmword ptr [rdx+0xD0]
movaps xmm2,xmmword ptr [rax]
xorps xmm0,xmmword ptr [rax+0x20]
xorps xmm2,xmm4
xorps xmm0,xmm5
xorps xmm0,xmm7
pxor xmm1,xmm8
xorps xmm0,xmm10
pxor xmm1,xmm12
pxor xmm1,xmm13
xorps xmm2,xmm14
xorps xmm2,xmm15
xorps xmm0,xmm2
pxor xmm1,xmm2
movaps xmmword ptr [rdx+0xC0],xmm0
movdqa xmmword ptr [rdx+0xD0],xmm1

movaps xmm0,xmmword ptr [rdx+0xE0]
movdqa xmm1,xmmword ptr [rdx+0xF0]
movaps xmm2,xmmword ptr [rax]
xorps xmm0,xmmword ptr [rax+0x10]
pxor xmm1,xmmword ptr [rax+0x20]
pxor xmm1,xmm3
pxor xmm1,xmm4
xorps xmm2,xmm5
xorps xmm0,xmm9
pxor xmm1,xmm10
pxor xmm1,xmm11
pxor xmm1,xmm12
xorps xmm2,xmm13
xorps xmm0,xmm14
xorps xmm2,xmm15
xorps xmm0,xmm2
pxor xmm1,xmm2
movaps xmmword ptr [rdx+0xE0],xmm0
movdqa xmmword ptr [rdx+0xF0],xmm1
#end of main processing

add rax,0x100
add rdx,0x100
cmp rdx,rcx
jl .loop

# restore xmm6-15
movaps xmm6,xmmword ptr [rbp-0x10]
movaps xmm7,xmmword ptr [rbp-0x20]
movaps xmm8,xmmword ptr [rbp-0x30]
movaps xmm9,xmmword ptr [rbp-0x40]
movaps xmm10,xmmword ptr [rbp-0x50]
movaps xmm11,xmmword ptr [rbp-0x60]
movaps xmm12,xmmword ptr [rbp-0x70]
movaps xmm13,xmmword ptr [rbp-0x80]
movaps xmm14,xmmword ptr [rbp-0x90]
movaps xmm15,xmmword ptr [rbp-0xA0]
pop rbp
ret