From 0e7eef976512f617180a89070c3cda368f9bea89 Mon Sep 17 00:00:00 2001 From: Sean Bryant Date: Tue, 11 Jun 2013 22:12:04 -0700 Subject: [PATCH 1/5] SSE dot product anyone? --- src/vector.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/vector.c b/src/vector.c index df08cb0..48a4a78 100644 --- a/src/vector.c +++ b/src/vector.c @@ -1,4 +1,5 @@ #include "vector.h" +#include <smmintrin.h> float3 *clone_f3(float3 f) { float3 *clone = malloc(sizeof(float3)); @@ -28,9 +29,17 @@ float3 *f3_scale(float3 *v, float c) { } float f3_dot(float3 v1, float3 v2) { - return (v1[0] * v2[0] + - v1[1] * v2[1] + - v1[2] * v2[2]); + float result; + float arr[4] = { v1[0], v1[1], v1[2], 0.0 }; + float arr2[4] = { v2[0], v2[1], v2[2], 0.0 }; + + __m128 a = _mm_loadu_ps(arr); + __m128 b = _mm_loadu_ps(arr2); + const int mask = 0xff; + __m128 sse_result = _mm_dp_ps(a, b, mask); + + _mm_store_ss(&result, sse_result); + return result; } float3 *f3_cross(float3 *result, float3 v1, float3 v2) { From 1331ecf9882998da521366e10ba5272e9564fafd Mon Sep 17 00:00:00 2001 From: Sean Bryant Date: Tue, 11 Jun 2013 22:24:13 -0700 Subject: [PATCH 2/5] This can be an aligned load! --- src/vector.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vector.c b/src/vector.c index 48a4a78..6c6e78e 100644 --- a/src/vector.c +++ b/src/vector.c @@ -33,8 +33,8 @@ float f3_dot(float3 v1, float3 v2) { float arr[4] = { v1[0], v1[1], v1[2], 0.0 }; float arr2[4] = { v2[0], v2[1], v2[2], 0.0 }; - __m128 a = _mm_loadu_ps(arr); - __m128 b = _mm_loadu_ps(arr2); + __m128 a = _mm_load_ps(arr); + __m128 b = _mm_load_ps(arr2); const int mask = 0xff; __m128 sse_result = _mm_dp_ps(a, b, mask); From ed51e494687030447ac2235aaac1775871f681e6 Mon Sep 17 00:00:00 2001 From: Sean Bryant Date: Tue, 11 Jun 2013 22:29:38 -0700 Subject: [PATCH 3/5] Conditionally compile SSE f3. 
--- src/vector.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/vector.c b/src/vector.c index 6c6e78e..d524a5e 100644 --- a/src/vector.c +++ b/src/vector.c @@ -1,5 +1,7 @@ #include "vector.h" +#if defined __SSEE_4_1__ #include <smmintrin.h> +#endif float3 *clone_f3(float3 f) { float3 *clone = malloc(sizeof(float3)); @@ -28,6 +30,7 @@ float3 *f3_scale(float3 *v, float c) { return v; } +#if defined __SSE__4_1 float f3_dot(float3 v1, float3 v2) { float result; float arr[4] = { v1[0], v1[1], v1[2], 0.0 }; @@ -41,6 +44,13 @@ float f3_dot(float3 v1, float3 v2) { _mm_store_ss(&result, sse_result); return result; } +#else +float f3_dot(float3 v1, float3 v2) { + return (v1[0] * v2[0] + + v1[1] * v2[1] + + v1[2] * v2[2]); +} +#endif float3 *f3_cross(float3 *result, float3 v1, float3 v2) { (*result)[0] = v1[1]*v2[2] - v1[2]*v2[1]; From c7e94f0d46db743488d94dad0334a8a575ea3271 Mon Sep 17 00:00:00 2001 From: Sean Bryant Date: Tue, 11 Jun 2013 22:39:15 -0700 Subject: [PATCH 4/5] Just worry about the function body. 
--- src/vector.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/vector.c b/src/vector.c index d524a5e..ea0fdec 100644 --- a/src/vector.c +++ b/src/vector.c @@ -30,8 +30,9 @@ float3 *f3_scale(float3 *v, float c) { return v; } -#if defined __SSE__4_1 + float f3_dot(float3 v1, float3 v2) { +#if defined __SSE__4_1 float result; float arr[4] = { v1[0], v1[1], v1[2], 0.0 }; float arr2[4] = { v2[0], v2[1], v2[2], 0.0 }; @@ -43,14 +44,13 @@ float f3_dot(float3 v1, float3 v2) { _mm_store_ss(&result, sse_result); return result; -} #else -float f3_dot(float3 v1, float3 v2) { return (v1[0] * v2[0] + v1[1] * v2[1] + v1[2] * v2[2]); -} #endif +} + float3 *f3_cross(float3 *result, float3 v1, float3 v2) { (*result)[0] = v1[1]*v2[2] - v1[2]*v2[1]; From 0d6b32a3f3e27722613847dfa2131ec50ffc9f65 Mon Sep 17 00:00:00 2001 From: Sean Bryant Date: Tue, 11 Jun 2013 22:43:54 -0700 Subject: [PATCH 5/5] it's actually __SSE4_1__ --- src/vector.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/vector.c b/src/vector.c index ea0fdec..05a0fe5 100644 --- a/src/vector.c +++ b/src/vector.c @@ -1,5 +1,5 @@ #include "vector.h" -#if defined __SSEE_4_1__ +#if defined __SSE4_1__ #include <smmintrin.h> #endif @@ -32,7 +32,7 @@ float3 *f3_scale(float3 *v, float c) { float f3_dot(float3 v1, float3 v2) { -#if defined __SSE__4_1 +#if defined __SSE4_1__ float result; float arr[4] = { v1[0], v1[1], v1[2], 0.0 }; float arr2[4] = { v2[0], v2[1], v2[2], 0.0 };