simd.h

1 /* Copyright Jukka Jyl�nki
2
3    Licensed under the Apache License, Version 2.0 (the "License");
4    you may not use this file except in compliance with the License.
5    You may obtain a copy of the License at
6
7        http://www.apache.org/licenses/LICENSE-2.0
8
9    Unless required by applicable law or agreed to in writing, software
10    distributed under the License is distributed on an "AS IS" BASIS,
11    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12    See the License for the specific language governing permissions and
13    limitations under the License. */
14
15 /** @file simd.h
16         @author Jukka Jyl�nki
17         @brief Generic abstraction layer over different SIMD instruction sets. */
18 #pragma once
19
20 #include "../MathBuildConfig.h"
21 #include "MathNamespace.h"
22 #include "../MathGeoLibFwd.h"
23 #include <stdint.h>
24 #include <cstddef>
25 #include "Reinterpret.h"
26 #ifdef MATH_SSE41
27 #include <smmintrin.h>
28 #endif
29
30 #ifdef MATH_SIMD // If SSE is not enabled, this whole file will not be included.
31
32 MATH_BEGIN_NAMESPACE
33
34 #ifdef MATH_SSE
35
36 #define simd4f __m128
37 #define simd4i __m128i
38
39 #define add_ps _mm_add_ps
40 #define sub_ps _mm_sub_ps
41 #define mul_ps _mm_mul_ps
42 #define div_ps _mm_div_ps
43 #define set1_ps _mm_set1_ps
44 /// Sets the vector in order (w, z, y, x).
45 #define set_ps _mm_set_ps
46 static const simd4f simd4fSignBit = set1_ps(-0.f); // -0.f = 1 << 31
47 #define abs_ps(x) _mm_andnot_ps(simd4fSignBit, (x))
48 #define zero_ps() _mm_setzero_ps()
49 #define min_ps _mm_min_ps
50 #define max_ps _mm_max_ps
51 #define s4f_to_s4i(s4f) _mm_castps_si128((s4f))
52 #define s4i_to_s4f(s4i) _mm_castsi128_ps((s4i))
53 #define and_ps _mm_and_ps
54 #define andnot_ps _mm_andnot_ps
55 #define or_ps _mm_or_ps
56 #define xor_ps _mm_xor_ps
57 #define storeu_ps _mm_storeu_ps
58 #define store_ps _mm_store_ps
59 #define loadu_ps _mm_loadu_ps
60 #define load_ps _mm_load_ps
61 #define load1_ps _mm_load1_ps
62 #define stream_ps _mm_stream_ps
63
64 #if defined(MATH_SSE2) && !defined(MATH_AVX) // We can use the pshufd instruction, which was introduced in SSE2 32-bit integer ops.
65 /// Swizzles/permutes a single SSE register into another SSE register. Requires SSE2.
66 #define shuffle1_ps(reg, shuffle) _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128((reg)), (shuffle)))
67 #else // We only have SSE 1, so must use the slightly worse shufps instruction, which always destroys the input operand - or we have AVX where we can use this operation without destroying input
68 #define shuffle1_ps(reg, shuffle) _mm_shuffle_ps((simd4f)(reg), (simd4f)(reg), (shuffle))
69 #endif
70
71 #define xxxx_ps(x) shuffle1_ps((x), _MM_SHUFFLE(0,0,0,0))
72 #define yyyy_ps(x) shuffle1_ps((x), _MM_SHUFFLE(1,1,1,1))
73 #define zzzz_ps(x) shuffle1_ps((x), _MM_SHUFFLE(2,2,2,2))
74 #define wwww_ps(x) shuffle1_ps((x), _MM_SHUFFLE(3,3,3,3))
75
76 #ifdef MATH_SSE41
77 #define allzero_ps(x) _mm_testz_si128(_mm_castps_si128((x)), _mm_castps_si128((x)))
78 #elif defined(MATH_SSE)
79 // Given a input vector of either 0xFFFFFFFF or 0, returns a nonzero integer of all lanes were zero.
80 // Warning: this SSE1 version is more like "all finite without nans" instead of "allzero", because
81 // it does not detect finite non-zero floats. Call only for inputs that are either all 0xFFFFFFFF or 0.
82 int FORCE_INLINE allzero_ps(simd4f x)
83 {
84         simd4f y = yyyy_ps(x);
85         x = or_ps(x, y);
86         y = _mm_movehl_ps(y, x);
87         x = or_ps(x, y);
88         return _mm_ucomige_ss(x, x);
89 }
90 #endif
91
92 static inline __m128 load_vec3(const float *ptr, float w)
93 {
94         __m128 low = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)ptr); // [_ _ y x]
95         __m128 high = _mm_load_ss(ptr + 2); // [_ _ _ z]
96         high = _mm_unpacklo_ps(high, _mm_set_ss(w)); // [_ _ w z]
97         return _mm_movelh_ps(low, high);
98 }
99
100 static inline void store_vec3(float *ptr, simd4f v)
101 {
102         _mm_storel_pi((__m64*)ptr, v);
103         v = _mm_movehl_ps(v, v);
104         _mm_store_ss(ptr+2, v);
105 }
106
107 // Note: Unlike SSE1 _mm_rcp_ps, which has a measured relative error of 3e-4f, the
108 //       rcp_ps function is more precise, and has a measured relative error of 7e-6f.
109 static inline __m128 rcp_ps(__m128 x)
110 {
111         simd4f e = _mm_rcp_ps(x);
112         // Do one iteration of Newton-Rhapson: e_n = 2*e - x*e^2
113         // Note: This is not as precise as computing the exact 1.0f / x, but
114         // not because of too few N-R iterations, but because of single-precision
115         // FP errors being accumulated during the iterations. Even with having,
116         // say, 10 iterations, this function cannot get the exact result 1.0f / x
117         // but carries a small (order of 7e-6f) relative error.
118         return sub_ps(add_ps(e, e), mul_ps(x, mul_ps(e,e)));
119 }
120
121 // Note: Unlike SSE1 _mm_rsqrt_ps, which has a measured relative error of 1e-4f, the
122 //       rsqrt_ps function is more precise, and has a measured relative error of 1e-8f.
123 static inline __m128 rsqrt_ps(__m128 x)
124 {
125         // _mm_rsqrt_ps is not precise, so do one iteration of Newton-Rhapson to correct: e_n = e + 0.5 * (e - x * e^3)
126         simd4f e = _mm_rsqrt_ps(x);
127         simd4f e3 = mul_ps(mul_ps(e,e),e);
128         simd4f half = set1_ps(0.5f);
129         return add_ps(e, mul_ps(half, sub_ps(e, mul_ps(x, e3))));
130 }
131
132 #define sqrt_ps _mm_sqrt_ps
133 #define cmpeq_ps _mm_cmpeq_ps
134 #define cmpge_ps _mm_cmpge_ps
135 #define cmpgt_ps _mm_cmpgt_ps
136 #define cmple_ps _mm_cmple_ps
137 #define cmplt_ps _mm_cmplt_ps
138 #define negate3_ps(x) xor_ps(x, sseSignMask3)
139
140 /// Returns the lowest element of the given sse register as a float.
141 /// @note When compiling with /arch:SSE or newer, it is expected that this function is a no-op "cast", since
142 /// the resulting float is represented in an XMM register as well.
143 #define s4f_x(s4f) _mm_cvtss_f32((s4f))
144 // Given a 4-channel single-precision simd4f variable [w z y x], returns the second channel 'y' as a float.
145 #define s4f_y(s4f) _mm_cvtss_f32(shuffle1_ps((s4f), _MM_SHUFFLE(1,1,1,1)))
146 // Given a 4-channel single-precision simd4f variable [w z y x], returns the third channel 'z' as a float.
147 // @note The intrinsic _mm_movehl_ps() is theoretically better than _mm_unpackhi_ps() or a shuffle to extract the z channel because it has throughput latency of 0.33 compared to 1.0 that the
148 //       other have. However using _mm_movehl_ps() is tricky since in a blind use, VS2013 compiler easily emits a redundant movaps when targeting pre-AVX, so it might not be worth it.
149 #define s4f_z(s4f) _mm_cvtss_f32(_mm_unpackhi_ps((s4f), (s4f)))
150 // Given a 4-channel single-precision simd4f variable [w z y x], returns the fourth channel 'w' as a float.
151 #define s4f_w(s4f) _mm_cvtss_f32(shuffle1_ps((s4f), _MM_SHUFFLE(3,3,3,3)))
152
153 #ifdef MATH_SSE2
154 #define set_ps_hex(w, z, y, x) _mm_castsi128_ps(_mm_set_epi32(w, z, y, x))
155 #define set1_ps_hex(x) _mm_castsi128_ps(_mm_set1_epi32(x))
156 #else
157 #define set_ps_hex(w, z, y, x) _mm_set_ps(ReinterpretAsFloat(w), ReinterpretAsFloat(z), ReinterpretAsFloat(y), ReinterpretAsFloat(x))
158 #define set1_ps_hex(x) _mm_set1_ps(ReinterpretAsFloat(x))
159 #endif
160
161 /// Returns the simd vector [_, _, _, f], that is, a SSE variable with the given float f in the lowest index.
162 /** The three higher indices should all be treated undefined.
163         @note When compiling with /arch:SSE or newer, it is expected that this function is a no-op "cast" if the given
164         float is already in a register, since it will lie in an XMM register already. Check the disassembly to confirm!
165         @note Never use this function if you need to generate a 4-vector [f,f,f,f]. Instead, use set1_ps(f), which
166                 generates a vmovss+vhufps and no redundant vxorps+vmovss! */
167 FORCE_INLINE simd4f setx_ps(float f)
168 {
169         // On VS2010+AVX generates vmovss+vxorps+vmovss
170         // return _mm_load_ss(&f);
171
172 #if _MSC_VER < 1700 // == VS2012
173         // On VS2010+AVX generates vmovss+vshufps (to broadcast the single element to all channels). Best performance so far for VS2010.
174         // On VS2013 generates a vbroadcastss instruction.
175         return set1_ps(f);
176 #else
177         // On VS2010+AVX is the same as _mm_load_ss, i.e. vmovss+vxorps+vmovss
178         // On VS2013, this is the perfect thing - a single vmovss instruction!
179         return _mm_set_ss(f);
180 #endif
181
182         // On VS2010+AVX generates vmovss reg <- mem, vmovss alignedmem <- reg, vmovaps reg <- alignedmem, so is the worst!
183         // simd4f s;
184         // s.m128_f32[0] = f;
185         // return s;
186 }
187
188 /// Returns a direction vector (w == 0) with xyz all set to the same scalar value.
189 FORCE_INLINE simd4f dir_from_scalar_ps(float scalar)
190 {
191         return set_ps(0.f, scalar, scalar, scalar);
192 }
193
194 /// Returns a position vector (w == 1) with xyz all set to the same scalar value.
195 FORCE_INLINE simd4f pos_from_scalar_ps(float scalar)
196 {
197         return set_ps(1.f, scalar, scalar, scalar);
198 }
199
200 // Given four scalar SS FP registers, packs the four values into a single SP FP register.
201 //inline simd4f pack_4ss_to_ps(simd4f x, simd4f y, simd4f z, simd4f w) // VS2010 BUG! Can't use this signature!
202 FORCE_INLINE simd4f pack_4ss_to_ps(simd4f x, simd4f y, simd4f z, const simd4f &w)
203 {
204         simd4f xy = _mm_movelh_ps(x, y); // xy = [ _, y, _, x]
205         simd4f zw = _mm_movelh_ps(z, w); // zw = [ _, w, _, z]
206         return _mm_shuffle_ps(xy, zw, _MM_SHUFFLE(2, 0, 2, 0)); // ret = [w, z, y, x]
207 }
208
209 // Given m = [W z y x], and a float w, returns [w z y x]. That is, sets the highest channel of the given 4-channel vector.
210 static FORCE_INLINE __m128 setw_ps(__m128 m, float w)
211 {
212         __m128 hi = _mm_movehl_ps(m, m); // [W z W z]
213         hi = _mm_unpacklo_ps(hi, _mm_set_ss(w)); // [0 W w z]
214         return _mm_movelh_ps(m, hi); // [w z y x]
215 }
216
217 #ifdef MATH_SSE2
218 FORCE_INLINE simd4f modf_ps(simd4f x, simd4f mod)
219 {
220         // x % mod == x - floor(x/mod)*mod
221         // floor(x/mod) = integerpart(x/mod)
222         simd4f ints = _mm_div_ps(x, mod);
223 #ifdef MATH_SSE41 // _mm_round_ps is SSE4.1
224         simd4f integerpart = _mm_round_ps(ints, _MM_FROUND_TO_ZERO);
225 #else
226         simd4f integerpart = _mm_cvtepi32_ps(_mm_cvttps_epi32(ints));
227 #endif
228         return _mm_sub_ps(x, _mm_mul_ps(integerpart, mod));
229 }
230 #endif
231
232 #elif defined(MATH_NEON)
233
234 #include <arm_neon.h>
235
236 #define simd4f float32x4_t
237 #define simd4i int32x4_t
238
239 #define add_ps vaddq_f32
240 #define sub_ps vsubq_f32
241 #define mul_ps vmulq_f32
242 #define div_ps(a, b) ((a) / (b))
243 #define min_ps vminq_f32
244 #define max_ps vmaxq_f32
245 #define s4f_to_s4i(s4f) vreinterpretq_u32_f32((s4f))
246 #define s4i_to_s4f(s4i) vreinterpretq_f32_u32((s4i))
247 #define and_ps(x, y) s4i_to_s4f(vandq_u32(s4f_to_s4i(x), s4f_to_s4i(y)))
248 #define andnot_ps(x, y) s4i_to_s4f(vbicq_u32(s4f_to_s4i(x), s4f_to_s4i(y)))
249 #define or_ps(x, y) s4i_to_s4f(vorrq_u32(s4f_to_s4i(x), s4f_to_s4i(y)))
250 #define xor_ps(x, y) s4i_to_s4f(veorq_u32(s4f_to_s4i(x), s4f_to_s4i(y)))
251 #define ornot_ps(x, y) s4i_to_s4f(vornq_u32(s4f_to_s4i(x), s4f_to_s4i(y)))
252
253 #define s4f_x(vec) vgetq_lane_f32((vec), 0)
254 #define s4f_y(vec) vgetq_lane_f32((vec), 1)
255 #define s4f_z(vec) vgetq_lane_f32((vec), 0)
256 #define s4f_w(vec) vgetq_lane_f32((vec), 1)
257
258 #define set1_ps vdupq_n_f32
259 #define setx_ps vdupq_n_f32
260 #define abs_ps vabsq_f32
261 #define zero_ps() vdupq_n_f32(0.f)
262
263 #define storeu_ps vst1q_f32
264 #define store_ps vst1q_f32
265 #define loadu_ps vld1q_f32
266 #define load_ps vld1q_f32
267 #define load1_ps(ptr) vdupq_n_f32(*(float*)(ptr))
268 #define stream_ps vst1q_f32
269 static inline simd4f rcp_ps(simd4f x)
270 {
271         simd4f e = vrecpeq_f32(x);
272         e = vmulq_f32(e, vrecpsq_f32(x, e));
273         e = vmulq_f32(e, vrecpsq_f32(x, e));
274         return e;
275 }
276
277 static inline simd4f rsqrt_ps(simd4f x)
278 {
279         simd4f e = vrsqrteq_f32(x);
280         e = vmulq_f32(e, vrsqrtsq_f32(x, vmulq_f32(e, e)));
281         e = vmulq_f32(e, vrsqrtsq_f32(x, vmulq_f32(e, e)));
282         return e;
283 }
284
285 static inline simd4f sqrt_ps(simd4f x) { return mul_ps(x, rsqrt_ps(x)); }
286
287 #define cmpeq_ps(a, b) vreinterpretq_f32_u32(vceqq_u32(vreinterpretq_u32_f32((a)), vreinterpretq_u32_f32((b))))
288 #define cmpge_ps(a, b) vreinterpretq_f32_u32(vcgeq_u32(vreinterpretq_u32_f32((a)), vreinterpretq_u32_f32((b))))
289 #define cmpgt_ps(a, b) vreinterpretq_f32_u32(vcgtq_u32(vreinterpretq_u32_f32((a)), vreinterpretq_u32_f32((b))))
290 #define cmple_ps(a, b) vreinterpretq_f32_u32(vcleq_u32(vreinterpretq_u32_f32((a)), vreinterpretq_u32_f32((b))))
291 #define cmplt_ps(a, b) vreinterpretq_f32_u32(vcltq_u32(vreinterpretq_u32_f32((a)), vreinterpretq_u32_f32((b))))
292
293 // This might not be the most efficient form, and typically it is better to avoid this in NEON, and instead
294 // prefer the scattering/gathering loads and stores instead.
295 #define _MM_TRANSPOSE4_PS(a,b,c,d) do { \
296         float32x4x2_t m1 = vuzpq_f32((a), (c)); \
297         float32x4x2_t m2 = vuzpq_f32((b), (d)); \
298         float32x4x2_t m3 = vtrnq_f32(m1.val[0], m2.val[0]); \
299         float32x4x2_t m4 = vtrnq_f32(m1.val[1], m2.val[1]); \
300         (a) = m3.val[0]; \
301         (b) = m4.val[0]; \
302         (c) = m3.val[1]; \
303         (d) = m4.val[1]; } while(0)
304
305 #ifdef _MSC_VER
306 #define set_ps_const(w,z,y,x) {{ (u64)ReinterpretAsU32(x) | (((u64)ReinterpretAsU32(y)) << 32), (u64)ReinterpretAsU32(z) | (((u64)ReinterpretAsU32(w)) << 32) }}
307 #define set_ps_hex_const(w,z,y,x) {{ (u64)(x) | (((u64)(y)) << 32), (u64)(z) | (((u64)(w)) << 32) }}
308 #else
309 #define set_ps_const(w,z,y,x) { x, y, z, w }
310 #define set_ps_hex_const(w,z,y,x) { ReinterpretAsFloat(x), ReinterpretAsFloat(y), ReinterpretAsFloat(z), ReinterpretAsFloat(w) }
311 #endif
312
313 FORCE_INLINE simd4f set_ps(float w, float z, float y, float x)
314 {
315 //      const ALIGN16 float32_t d[4] = { x, y, z, w };
316 //      return vld1q_f32(d);
317         float32x4_t c = set_ps_const(w,z,y,x);
318         return c;
319 }
320 FORCE_INLINE simd4f set_ps_hex(u32 w, u32 z, u32 y, u32 x)
321 {
322 //      const ALIGN16 u32 d[4] = { x, y, z, w };
323 //      return vld1q_f32((const float*)d);
324         float32x4_t c = set_ps_hex_const(w,z,y,x);
325         return c;
326 }
327
328 FORCE_INLINE simd4f dir_from_scalar_ps(float scalar)
329 {
330         return vsetq_lane_f32(0.f, vdupq_n_f32(scalar), 3);
331 }
332
333 /// Returns a position vector (w == 1) with xyz all set to the same scalar value.
334 FORCE_INLINE simd4f pos_from_scalar_ps(float scalar)
335 {
336         return vsetq_lane_f32(1.f, vdupq_n_f32(scalar), 3);
337 }
338
339 #endif // ~MATH_NEON
340
341 // TODO: Which is better codegen - use simd4fZero constant everywhere, or explicitly refer to zero_ps() everywhere,
342 //       or does it matter?
343 const simd4f simd4fZero     = zero_ps();
344 const simd4f simd4fOne      = set1_ps(1.f);
345 const simd4f simd4fMinusOne = set1_ps(-1.f);
346 const simd4f simd4fEpsilon  = set1_ps(1e-4f);
347
348 // NOTE: This version was benchmarked to be minutely better than the sub_ps(zero_ps) version on a SSE 4.1 capable system in OBB::ClosestPoint(point):
349 // sub_ps&zero_ps: Best: 8.833 nsecs / 24 ticks, Avg: 9.044 nsecs, Worst: 9.217 nsecs
350 // xor_ps&signMask: Best: 8.833 nsecs / 23.768 ticks, Avg: 8.975 nsecs, Worst: 9.601 nsecs
351 // However the memory load is still worrying, so using the zero_ps still for now.
352 //#define negate_ps(x) _mm_xor_ps(x, sseSignMask)
353
354 #define negate_ps(x) sub_ps(zero_ps(), (x))
355
356 // If mask[i] == 0, then output index i from a, otherwise mask[i] must be 0xFFFFFFFF, and output index i from b.
357 FORCE_INLINE simd4f cmov_ps(simd4f a, simd4f b, simd4f mask)
358 {
359 #ifdef MATH_SSE41 // SSE 4.1 offers conditional copying between registers with the blendvps instruction.
360         return _mm_blendv_ps(a, b, mask);
361 #else // If not on SSE 4.1, use conditional masking.
362         b = and_ps(mask, b); // Where mask is 1, output b.
363         a = andnot_ps(mask, a); // Where mask is 0, output a.
364         return or_ps(a, b);
365 #endif
366 }
367
368 static const float andMaskOneF = ReinterpretAsFloat(0xFFFFFFFFU);
369 /// A SSE mask register with x = y = z = 0xFFFFFFFF and w = 0x0.
370 static const simd4f sseMaskXYZ = set_ps(0.f, andMaskOneF, andMaskOneF, andMaskOneF);
371 static const simd4f sseSignMask3 = set_ps(0.f, -0.f, -0.f, -0.f); // -0.f = 1 << 31
372 static const simd4f sseSignMask = set_ps(-0.f, -0.f, -0.f, -0.f); // -0.f = 1 << 31
373 #ifdef MATH_AVX
374 static const __m256 sseSignMask256 = _mm256_set1_ps(-0.f); // -0.f = 1 << 31
375 #endif
376 #ifdef MATH_AVX
377 #define abs_ps256(x) _mm256_andnot_ps(sseSignMask256, x)
378 #endif
379
380 MATH_END_NAMESPACE
381
382 #endif // ~MATH_SIMD