#pragma once

#include "../MathBuildConfig.h"
#include "MathNamespace.h"
#include "../MathGeoLibFwd.h"
#include <stdint.h>
#include <cstddef>
#include "Reinterpret.h"
#ifdef MATH_SSE41
#include <smmintrin.h>
#endif

#ifdef MATH_SIMD // If SSE is not enabled, this whole file will not be included.

MATH_BEGIN_NAMESPACE

#ifdef MATH_SSE

#define simd4f __m128
#define simd4i __m128i

#define add_ps _mm_add_ps
#define sub_ps _mm_sub_ps
#define mul_ps _mm_mul_ps
#define div_ps _mm_div_ps
#define set1_ps _mm_set1_ps
#define set_ps _mm_set_ps

static const simd4f simd4fSignBit = set1_ps(-0.f); // -0.f has only the sign bit (bit 31) set.
#define abs_ps(x) _mm_andnot_ps(simd4fSignBit, (x))
#define zero_ps() _mm_setzero_ps()
#define min_ps _mm_min_ps
#define max_ps _mm_max_ps
#define s4f_to_s4i(s4f) _mm_castps_si128((s4f))
#define s4i_to_s4f(s4i) _mm_castsi128_ps((s4i))
#define and_ps _mm_and_ps
#define andnot_ps _mm_andnot_ps
#define or_ps _mm_or_ps
#define xor_ps _mm_xor_ps
#define storeu_ps _mm_storeu_ps
#define store_ps _mm_store_ps
#define loadu_ps _mm_loadu_ps
#define load_ps _mm_load_ps
#define load1_ps _mm_load1_ps
#define stream_ps _mm_stream_ps

#if defined(MATH_SSE2) && !defined(MATH_AVX) // We can use the pshufd instruction, which was introduced with the SSE2 32-bit integer ops.
#define shuffle1_ps(reg, shuffle) _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128((reg)), (shuffle)))
#else // We only have SSE1, so must use the slightly worse shufps instruction, which always destroys the input operand - or we have AVX, where we can use this operation without destroying the input.
#define shuffle1_ps(reg, shuffle) _mm_shuffle_ps((simd4f)(reg), (simd4f)(reg), (shuffle))
#endif

#define xxxx_ps(x) shuffle1_ps((x), _MM_SHUFFLE(0,0,0,0))
#define yyyy_ps(x) shuffle1_ps((x), _MM_SHUFFLE(1,1,1,1))
#define zzzz_ps(x) shuffle1_ps((x), _MM_SHUFFLE(2,2,2,2))
#define wwww_ps(x) shuffle1_ps((x), _MM_SHUFFLE(3,3,3,3))

#ifdef MATH_SSE41
#define allzero_ps(x) _mm_testz_si128(_mm_castps_si128((x)), _mm_castps_si128((x)))
#elif defined(MATH_SSE)
// Assumes each channel of x is either all ones (0xFFFFFFFF, a NaN bit pattern) or all zeroes,
// and returns nonzero if every channel is zero. The channels are OR'ed together, and the
// comparison x >= x then distinguishes +0 (ordered, returns 1) from NaN (unordered, returns 0).
FORCE_INLINE int allzero_ps(simd4f x)
{
	simd4f y = yyyy_ps(x);
	x = or_ps(x, y);
	y = _mm_movehl_ps(y, x);
	x = or_ps(x, y);
	return _mm_ucomige_ss(x, x);
}
#endif

// Loads a vector of three floats from memory and sets the w channel of the result to the given scalar.
static inline __m128 load_vec3(const float *ptr, float w)
{
	__m128 low = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*)ptr); // [ 0, 0, y, x ]
	__m128 high = _mm_load_ss(ptr + 2);                             // [ 0, 0, 0, z ]
	high = _mm_unpacklo_ps(high, _mm_set_ss(w));                    // [ 0, 0, w, z ]
	return _mm_movelh_ps(low, high);                                // [ w, z, y, x ]
}

// Stores the x, y and z channels of the vector to memory, writing exactly 12 bytes.
static inline void store_vec3(float *ptr, simd4f v)
{
	_mm_storel_pi((__m64*)ptr, v);
	v = _mm_movehl_ps(v, v);
	_mm_store_ss(ptr + 2, v);
}

// Computes an approximation of 1/x, refined for extra precision (_mm_rcp_ps alone gives only
// about 12 bits of precision).
static inline __m128 rcp_ps(__m128 x)
{
	simd4f e = _mm_rcp_ps(x);
	// One Newton-Raphson iteration: e' = 2*e - x*e^2.
	return sub_ps(add_ps(e, e), mul_ps(x, mul_ps(e, e)));
}

// Computes an approximation of 1/sqrt(x), refined for extra precision (_mm_rsqrt_ps alone
// gives only about 12 bits of precision).
static inline __m128 rsqrt_ps(__m128 x)
{
	simd4f e = _mm_rsqrt_ps(x);
	// One Newton-Raphson iteration: e' = (e/2) * (3 - x*e^2) = e + (e - x*e^3) / 2.
	simd4f e3 = mul_ps(mul_ps(e, e), e);
	simd4f half = set1_ps(0.5f);
	return add_ps(e, mul_ps(half, sub_ps(e, mul_ps(x, e3))));
}
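// Illustrative sketch (not part of the original API): the refined reciprocal above can be
// used to approximate division when full IEEE-accurate division throughput is not needed.
// The name div_approx_ps is hypothetical.
static inline __m128 div_approx_ps(__m128 num, __m128 denom)
{
	return mul_ps(num, rcp_ps(denom)); // num * (1/denom), roughly 22-23 bits of precision after refinement.
}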
#define sqrt_ps _mm_sqrt_ps
#define cmpeq_ps _mm_cmpeq_ps
#define cmpge_ps _mm_cmpge_ps
#define cmpgt_ps _mm_cmpgt_ps
#define cmple_ps _mm_cmple_ps
#define cmplt_ps _mm_cmplt_ps
// Negates the x, y and z channels; the w channel is kept intact.
#define negate3_ps(x) xor_ps(x, sseSignMask3)

// Extracts a single channel of the vector to a scalar float.
#define s4f_x(s4f) _mm_cvtss_f32((s4f))
#define s4f_y(s4f) _mm_cvtss_f32(shuffle1_ps((s4f), _MM_SHUFFLE(1,1,1,1)))
#define s4f_z(s4f) _mm_cvtss_f32(_mm_unpackhi_ps((s4f), (s4f)))
#define s4f_w(s4f) _mm_cvtss_f32(shuffle1_ps((s4f), _MM_SHUFFLE(3,3,3,3)))

#ifdef MATH_SSE2
#define set_ps_hex(w, z, y, x) _mm_castsi128_ps(_mm_set_epi32(w, z, y, x))
#define set1_ps_hex(x) _mm_castsi128_ps(_mm_set1_epi32(x))
#else
#define set_ps_hex(w, z, y, x) _mm_set_ps(ReinterpretAsFloat(w), ReinterpretAsFloat(z), ReinterpretAsFloat(y), ReinterpretAsFloat(x))
#define set1_ps_hex(x) _mm_set1_ps(ReinterpretAsFloat(x))
#endif

// Returns a vector with the given float f in the lowest (x) channel. The contents of the
// three higher channels are undefined.
FORCE_INLINE simd4f setx_ps(float f)
{
#if defined(_MSC_VER) && _MSC_VER < 1700 // _MSC_VER 1700 == VS2012; older MSVC versions generate poor code for _mm_set_ss.
	// Broadcasting f to all channels is faster there, and valid since the upper channels are undefined anyway.
	return set1_ps(f);
#else
	return _mm_set_ss(f);
#endif
}

// Returns a direction vector (w == 0) with all three xyz channels set to the given scalar.
FORCE_INLINE simd4f dir_from_scalar_ps(float scalar)
{
	return set_ps(0.f, scalar, scalar, scalar);
}

// Returns a position vector (w == 1) with all three xyz channels set to the given scalar.
FORCE_INLINE simd4f pos_from_scalar_ps(float scalar)
{
	return set_ps(1.f, scalar, scalar, scalar);
}

// Given four scalars in the lowest channel of each input, packs them into a single vector (x, y, z, w).
FORCE_INLINE simd4f pack_4ss_to_ps(simd4f x, simd4f y, simd4f z, const simd4f &w)
{
	simd4f xy = _mm_movelh_ps(x, y);                        // [ _, y, _, x ]
	simd4f zw = _mm_movelh_ps(z, w);                        // [ _, w, _, z ]
	return _mm_shuffle_ps(xy, zw, _MM_SHUFFLE(2, 0, 2, 0)); // [ w, z, y, x ]
}

// Sets the w channel of the given vector to the given scalar, leaving x, y and z intact.
static FORCE_INLINE __m128 setw_ps(__m128 m, float w)
{
	__m128 hi = _mm_movehl_ps(m, m);         // [ w, z, w, z ]
	hi = _mm_unpacklo_ps(hi, _mm_set_ss(w)); // [ 0, w_old, w_new, z ]
	return _mm_movelh_ps(m, hi);             // [ w_new, z, y, x ]
}

#ifdef MATH_SSE2
// Computes the floating-point remainder of x/mod for each channel: x - trunc(x/mod)*mod.
FORCE_INLINE simd4f modf_ps(simd4f x, simd4f mod)
{
	simd4f ints = _mm_div_ps(x, mod);
#ifdef MATH_SSE41 // _mm_round_ps is SSE4.1
	simd4f integerpart = _mm_round_ps(ints, _MM_FROUND_TO_ZERO);
#else
	simd4f integerpart = _mm_cvtepi32_ps(_mm_cvttps_epi32(ints));
#endif
	return _mm_sub_ps(x, _mm_mul_ps(integerpart, mod));
}
#endif
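// Illustrative sketch (not part of the original API): modf_ps above composes with set1_ps to
// wrap a vector of non-negative angles into the range [0, 2pi). Note that modf_ps truncates
// toward zero, so negative inputs yield negative remainders. The name wrap_angles_ps is
// hypothetical.
#ifdef MATH_SSE2
FORCE_INLINE simd4f wrap_angles_ps(simd4f angles)
{
	return modf_ps(angles, set1_ps(6.283185307179586f));
}
#endif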
#elif defined(MATH_NEON)

#include <arm_neon.h>

#define simd4f float32x4_t
#define simd4i int32x4_t

#define add_ps vaddq_f32
#define sub_ps vsubq_f32
#define mul_ps vmulq_f32
#define div_ps(a, b) ((a) / (b))
#define min_ps vminq_f32
#define max_ps vmaxq_f32
#define s4f_to_s4i(s4f) vreinterpretq_u32_f32((s4f))
#define s4i_to_s4f(s4i) vreinterpretq_f32_u32((s4i))
#define and_ps(x, y) s4i_to_s4f(vandq_u32(s4f_to_s4i(x), s4f_to_s4i(y)))
// Note: vbicq_u32(a, b) computes a & ~b, so the arguments are swapped here to match the SSE
// semantics andnot_ps(x, y) == ~x & y (cmov_ps below relies on this).
#define andnot_ps(x, y) s4i_to_s4f(vbicq_u32(s4f_to_s4i(y), s4f_to_s4i(x)))
#define or_ps(x, y) s4i_to_s4f(vorrq_u32(s4f_to_s4i(x), s4f_to_s4i(y)))
#define xor_ps(x, y) s4i_to_s4f(veorq_u32(s4f_to_s4i(x), s4f_to_s4i(y)))
#define ornot_ps(x, y) s4i_to_s4f(vornq_u32(s4f_to_s4i(x), s4f_to_s4i(y)))

#define s4f_x(vec) vgetq_lane_f32((vec), 0)
#define s4f_y(vec) vgetq_lane_f32((vec), 1)
#define s4f_z(vec) vgetq_lane_f32((vec), 2)
#define s4f_w(vec) vgetq_lane_f32((vec), 3)

#define set1_ps vdupq_n_f32
#define setx_ps vdupq_n_f32
#define abs_ps vabsq_f32
#define zero_ps() vdupq_n_f32(0.f)

#define storeu_ps vst1q_f32
#define store_ps vst1q_f32
#define loadu_ps vld1q_f32
#define load_ps vld1q_f32
#define load1_ps(ptr) vdupq_n_f32(*(float*)(ptr))
#define stream_ps vst1q_f32

// Computes an approximation of 1/x, refined with two Newton-Raphson steps.
// vrecpsq_f32(x, e) computes (2 - x*e), so e * vrecpsq_f32(x, e) is one refinement step.
static inline simd4f rcp_ps(simd4f x)
{
	simd4f e = vrecpeq_f32(x);
	e = vmulq_f32(e, vrecpsq_f32(x, e));
	e = vmulq_f32(e, vrecpsq_f32(x, e));
	return e;
}

// Computes an approximation of 1/sqrt(x), refined with two Newton-Raphson steps.
// vrsqrtsq_f32(x, e*e) computes (3 - x*e*e)/2, so e * vrsqrtsq_f32(x, e*e) is one refinement step.
static inline simd4f rsqrt_ps(simd4f x)
{
	simd4f e = vrsqrteq_f32(x);
	e = vmulq_f32(e, vrsqrtsq_f32(x, vmulq_f32(e, e)));
	e = vmulq_f32(e, vrsqrtsq_f32(x, vmulq_f32(e, e)));
	return e;
}

// Note: computes sqrt(x) as x * (1/sqrt(x)), which produces NaN for x == 0.
static inline simd4f sqrt_ps(simd4f x) { return mul_ps(x, rsqrt_ps(x)); }

// Note: these must compare as floats, not as reinterpreted integer bit patterns, to match
// the SSE comparison semantics above.
#define cmpeq_ps(a, b) vreinterpretq_f32_u32(vceqq_f32((a), (b)))
#define cmpge_ps(a, b) vreinterpretq_f32_u32(vcgeq_f32((a), (b)))
#define cmpgt_ps(a, b) vreinterpretq_f32_u32(vcgtq_f32((a), (b)))
#define cmple_ps(a, b) vreinterpretq_f32_u32(vcleq_f32((a), (b)))
#define cmplt_ps(a, b) vreinterpretq_f32_u32(vcltq_f32((a), (b)))

// NEON counterpart of the SSE _MM_TRANSPOSE4_PS macro: transposes the 4x4 matrix held in
// the four given registers in place.
#define _MM_TRANSPOSE4_PS(a,b,c,d) do { \
	float32x4x2_t m1 = vuzpq_f32((a), (c)); \
	float32x4x2_t m2 = vuzpq_f32((b), (d)); \
	float32x4x2_t m3 = vtrnq_f32(m1.val[0], m2.val[0]); \
	float32x4x2_t m4 = vtrnq_f32(m1.val[1], m2.val[1]); \
	(a) = m3.val[0]; \
	(b) = m4.val[0]; \
	(c) = m3.val[1]; \
	(d) = m4.val[1]; } while(0)

#ifdef _MSC_VER
#define set_ps_const(w,z,y,x) {{ (u64)ReinterpretAsU32(x) | (((u64)ReinterpretAsU32(y)) << 32), (u64)ReinterpretAsU32(z) | (((u64)ReinterpretAsU32(w)) << 32) }}
#define set_ps_hex_const(w,z,y,x) {{ (u64)(x) | (((u64)(y)) << 32), (u64)(z) | (((u64)(w)) << 32) }}
#else
#define set_ps_const(w,z,y,x) { x, y, z, w }
#define set_ps_hex_const(w,z,y,x) { ReinterpretAsFloat(x), ReinterpretAsFloat(y), ReinterpretAsFloat(z), ReinterpretAsFloat(w) }
#endif

FORCE_INLINE simd4f set_ps(float w, float z, float y, float x)
{
	float32x4_t c = set_ps_const(w,z,y,x);
	return c;
}

FORCE_INLINE simd4f set_ps_hex(u32 w, u32 z, u32 y, u32 x)
{
	float32x4_t c = set_ps_hex_const(w,z,y,x);
	return c;
}

// Returns a direction vector (w == 0) with all three xyz channels set to the given scalar.
FORCE_INLINE simd4f dir_from_scalar_ps(float scalar)
{
	return vsetq_lane_f32(0.f, vdupq_n_f32(scalar), 3);
}

// Returns a position vector (w == 1) with all three xyz channels set to the given scalar.
FORCE_INLINE simd4f pos_from_scalar_ps(float scalar)
{
	return vsetq_lane_f32(1.f, vdupq_n_f32(scalar), 3);
}

#endif // ~MATH_NEON

const simd4f simd4fZero = zero_ps();
const simd4f simd4fOne = set1_ps(1.f);
const simd4f simd4fMinusOne = set1_ps(-1.f);
const simd4f simd4fEpsilon = set1_ps(1e-4f);

// Negates all four channels of the given vector.
#define negate_ps(x) sub_ps(zero_ps(), (x))

// Performs a per-channel conditional move: returns the channel of b where the mask is set
// (all ones), and the channel of a where the mask is clear (all zeroes).
FORCE_INLINE simd4f cmov_ps(simd4f a, simd4f b, simd4f mask)
{
#ifdef MATH_SSE41 // SSE 4.1 offers conditional copying between registers with the blendvps instruction.
	return _mm_blendv_ps(a, b, mask);
#else // If not on SSE 4.1, use conditional masking.
	b = and_ps(mask, b);    // Keep the channels of b where the mask is set.
	a = andnot_ps(mask, a); // Keep the channels of a where the mask is clear.
	return or_ps(a, b);
#endif
}

static const float andMaskOneF = ReinterpretAsFloat(0xFFFFFFFFU);

// A mask that keeps the x, y and z channels and zeroes out the w channel when AND'ed with a vector.
static const simd4f sseMaskXYZ = set_ps(0.f, andMaskOneF, andMaskOneF, andMaskOneF);
static const simd4f sseSignMask3 = set_ps(0.f, -0.f, -0.f, -0.f); // -0.f has only the sign bit set.
static const simd4f sseSignMask = set_ps(-0.f, -0.f, -0.f, -0.f); // -0.f has only the sign bit set.
#ifdef MATH_AVX
static const __m256 sseSignMask256 = _mm256_set1_ps(-0.f); // -0.f has only the sign bit set.
#define abs_ps256(x) _mm256_andnot_ps(sseSignMask256, x)
#endif

MATH_END_NAMESPACE

#endif // ~MATH_SIMD
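// Usage sketch (illustrative, not from the original header): the comparison macros produce
// per-channel all-ones/all-zeroes masks that compose with cmov_ps for branchless selects:
//
//   simd4f r = cmov_ps(a, b, cmpgt_ps(x, y)); // r[i] = (x[i] > y[i]) ? b[i] : a[i]
//   simd4f c = min_ps(max_ps(v, lo), hi);     // clamp each channel of v to [lo, hi]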