1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 #pragma once19 20 #include "../MathBuildConfig.h"21 22 #ifdef MATH_SSE23 24 #include "[MathTypes.h]"25 #include "[SSEMath.h]"26 27 28 [FORCE_INLINE] simd4f sum_xyz_ps(simd4f m)29 {30 #ifdef MATH_SSE3 // If we have SSE 3, we can use the haddps (horizontal add) instruction, _mm_hadd_ps intrinsic.31 m = and_ps(m, sseMaskXYZ); 32 m = _mm_hadd_ps(m, m); 33 m = _mm_hadd_ps(m, m); 34 return m; 35 #else // We only have SSE 1, and must individually shuffle.36 simd4f X = xxxx_ps(m);37 simd4f Y = yyyy_ps(m);38 simd4f Z = zzzz_ps(m);39 simd4f XYZ = add_ps(X, add_ps(Y, Z)); 40 return XYZ; 41 #endif42 }43 44 45 [FORCE_INLINE] simd4f sum_xyz_ps3(simd4f m)46 {47 simd4f yzx = shuffle1_ps(m, _MM_SHUFFLE(3,0,2,1)); 48 simd4f zxy = shuffle1_ps(m, _MM_SHUFFLE(3,1,0,2)); 49 simd4f XYZ = add_ps(m, add_ps(yzx, zxy)); 50 return XYZ; 51 }52 53 [FORCE_INLINE] float sum_xyz_float(simd4f m)54 {55 return s4f_x(sum_xyz_ps3(m));56 }57 58 59 [FORCE_INLINE] simd4f sum_xyzw_ps(simd4f m)60 {61 #ifdef MATH_SSE3 // If we have SSE 3, we can use the haddps (horizontal add) instruction, _mm_hadd_ps intrinsic.62 m = _mm_hadd_ps(m, m); 63 m = _mm_hadd_ps(m, m); 64 return m; 65 #else // We only have SSE 1, and must individually shuffle.66 simd4f v2 = shuffle1_ps(m, _MM_SHUFFLE(1,0,3,2)); 67 v2 = add_ps(v2, m); 68 simd4f v3 = shuffle1_ps(v2, _MM_SHUFFLE(0,3,2,1)); 69 return add_ps(v2, v3); 70 #endif71 }72 73 [FORCE_INLINE] float sum_xyzw_float(simd4f m)74 {75 return s4f_x(sum_xyzw_ps(m));76 }77 78 [FORCE_INLINE] simd4f mul_xyzw_ps(simd4f v)79 {80 simd4f v2 = shuffle1_ps(v, _MM_SHUFFLE(1, 0, 3, 2)); 81 v2 = mul_ps(v, v2); 82 simd4f v3 = shuffle1_ps(v2, _MM_SHUFFLE(2, 1, 0, 3)); 83 return mul_ps(v2, v3); 84 }85 86 [FORCE_INLINE] float mul_xyzw_float(simd4f m)87 {88 return s4f_x(mul_xyzw_ps(m));89 }90 91 92 [FORCE_INLINE] simd4f dot3_ps(simd4f a, simd4f b)93 {94 #ifdef MATH_SSE41 // If we have SSE 4.1, we can use the dpps (dot product) instruction, _mm_dp_ps intrinsic.95 return _mm_dp_ps(a, b, 0x7F); 96 #else // Otherwise, use SSE3 haddps or SSE1 with individual shuffling.97 return sum_xyz_ps(mul_ps(a, b));98 #endif99 }100 101 102 [FORCE_INLINE] simd4f dot3_ps3(simd4f a, simd4f b)103 {104 #ifdef MATH_SSE41 // If we have SSE 4.1, we can use the dpps (dot product) instruction, _mm_dp_ps intrinsic.105 return _mm_dp_ps(a, b, 0x7F); 106 #else // Otherwise, use SSE3 haddps or SSE1 with individual shuffling.107 return sum_xyz_ps3(mul_ps(a, b));108 #endif109 }110 111 [FORCE_INLINE] float dot3_float(simd4f a, simd4f b)112 {113 return s4f_x(dot3_ps3(a, b));114 }115 116 117 [FORCE_INLINE] simd4f dot4_ps(simd4f a, simd4f b)118 {119 #ifdef MATH_SSE41 // If we have SSE 4.1, we can use the dpps (dot product) instruction, _mm_dp_ps intrinsic.120 return _mm_dp_ps(a, b, 0xFF); 121 #else // Otherwise, use SSE3 haddps or SSE1 with individual shuffling.122 return sum_xyzw_ps(mul_ps(a, b));123 #endif124 }125 126 [FORCE_INLINE] float dot4_float(simd4f a, simd4f b)127 {128 return s4f_x(dot4_ps(a, b));129 }130 131 [FORCE_INLINE] simd4f cross_ps(simd4f a, simd4f b)132 {133 simd4f a_xzy = shuffle1_ps(a, _MM_SHUFFLE(3, 0, 2, 1)); 134 simd4f b_xzy = shuffle1_ps(b, _MM_SHUFFLE(3, 0, 2, 1)); 135 136 simd4f x_yxz = mul_ps(b_xzy, a); 137 simd4f y_yxz = mul_ps(a_xzy, b); 138 139 return shuffle1_ps(sub_ps(x_yxz, y_yxz), _MM_SHUFFLE(3, 0, 2, 1)); 140 }141 142 [FORCE_INLINE] void basis_ps(simd4f v, simd4f *outB, simd4f *outC)143 {144 simd4f a = abs_ps(v);145 simd4f a_min = min_ps(a, min_ps(yyyy_ps(a), _mm_movehl_ps(a, a))); 146 a_min = xxxx_ps(a_min); 147 a = cmple_ps(a, a_min); 148 149 simd4f q = and_ps(a, set_ps(0.f, 1.f, 1.f, 1.f));150 151 simd4f v_xzy = shuffle1_ps(v, _MM_SHUFFLE(3, 0, 2, 1));152 simd4f v_yxz = shuffle1_ps(v, _MM_SHUFFLE(3, 1, 0, 2));153 simd4f q_xzy = shuffle1_ps(q, _MM_SHUFFLE(3, 0, 2, 1));154 simd4f b_yxz = sub_ps(mul_ps(q_xzy, v), mul_ps(v_xzy, q));155 simd4f b = shuffle1_ps(b_yxz, _MM_SHUFFLE(3, 0, 2, 1));156 simd4f b_xzy = shuffle1_ps(b_yxz, _MM_SHUFFLE(3, 1, 0, 2));157 simd4f c = sub_ps(mul_ps(b_yxz, v_xzy), mul_ps(v_yxz, b_xzy));158 159 *outB = mul_ps(b, rsqrt_ps(dot4_ps(b, b)));160 *outC = mul_ps(c, rsqrt_ps(dot4_ps(c, c)));161 }162 163 simd4f vec3_length_ps(simd4f vec);164 simd4f vec3_length_ps3(simd4f vec);165 166 167 [FORCE_INLINE] simd4f vec4_safe_normalize3(simd4f vec, simd4f &outLength)168 {169 outLength = vec3_length_ps3(vec);170 simd4f isZero = _mm_cmplt_ps(outLength, simd4fEpsilon); 171 simd4f normalized = _mm_div_ps(vec, outLength); 172 normalized = cmov_ps(normalized, [float4::unitX].v, isZero); 173 return cmov_ps(vec, normalized, sseMaskXYZ); 174 }175 176 #endif Go back to previous page