#pragma once

#include "../MathBuildConfig.h"

#ifdef MATH_SIMD

#include "SSEMath.h"
#include "float4_neon.h"

MATH_BEGIN_NAMESPACE

#ifdef MATH_SSE

// Converts the quaternion q and the translation t into a row-major 3x4 affine matrix, written to m[0..2].
inline void quat_to_mat3x4(__m128 q, __m128 t, __m128 *m)
{
	// Sign masks: sseX0 flips the y, z and w lanes, sseX1 flips the x, z and w lanes.
	const u32 sign = 0x80000000UL;
	const __m128 sseX0 = set_ps_hex(sign, sign, sign, 0);
	const __m128 sseX1 = set_ps_hex(sign, sign, 0, sign);
	__m128 one = _mm_set_ps(0, 0, 0, 1); // (1, 0, 0, 0): the constant 1 in the x lane.

#if 0 // The original code converted a quaternion into a hybrid of rotation/translation (bug?)
	__m128 q2 = _mm_add_ps(q, q);
	__m128 yxxy = shuffle1_ps(q, _MM_SHUFFLE(1, 0, 0, 1));
	__m128 yyzz2 = shuffle1_ps(q2, _MM_SHUFFLE(2, 2, 1, 1));
	__m128 yy_xy_xz_yz_2 = _mm_mul_ps(yxxy, yyzz2);

	__m128 zwww = shuffle1_ps(q, _MM_SHUFFLE(3, 3, 3, 2));
	__m128 zzyx2 = shuffle1_ps(q2, _MM_SHUFFLE(0, 1, 2, 2));
	__m128 zz_wz_wy_wx_2 = _mm_mul_ps(zwww, zzyx2);

	__m128 xx2 = _mm_mul_ss(q, q2);

	__m128 one_m_xx2 = _mm_sub_ss(one, xx2);
	__m128 one_m_xx_yy_2 = _mm_sub_ss(one_m_xx2, yy_xy_xz_yz_2);
	__m128 one_m_xx_yy_2_0_tz_tw = _mm_shuffle_ps(one_m_xx_yy_2, t, _MM_SHUFFLE(3, 2, 1, 0));

	__m128 m_yy_xy_xz_yz_2 = _mm_xor_ps(yy_xy_xz_yz_2, sseX0);
	__m128 m_zz_wz_wy_wx_2 = _mm_xor_ps(zz_wz_wy_wx_2, sseX1);
	__m128 m_zz_one_wz_wy_wx_2 = _mm_add_ss(m_zz_wz_wy_wx_2, one);
	__m128 first_row = _mm_sub_ps(m_zz_one_wz_wy_wx_2, m_yy_xy_xz_yz_2);
	m[0] = first_row;
	_mm_store_ss((float*)m+3, t);

	__m128 s1 = _mm_move_ss(m_yy_xy_xz_yz_2, xx2);
	__m128 s2 = _mm_xor_ps(m_zz_one_wz_wy_wx_2, sseX0);
	__m128 s3 = _mm_sub_ps(s2, s1);
	__m128 t_yzwx = shuffle1_ps(t, _MM_SHUFFLE(0, 3, 2, 1));
	__m128 second_row = shuffle1_ps(s3, _MM_SHUFFLE(2, 3, 0, 1));
	m[1] = second_row;
	_mm_store_ss((float*)m+7, t_yzwx);

	__m128 t1 = _mm_movehl_ps(first_row, second_row);
	__m128 t2 = _mm_shuffle_ps(t1, one_m_xx_yy_2_0_tz_tw, _MM_SHUFFLE(2, 0, 3, 1));
	m[2] = t2;
#else
	__m128 q2 = _mm_add_ps(q, q);                                  // (2x, 2y, 2z, 2w)
	__m128 yxxy = shuffle1_ps(q, _MM_SHUFFLE(1, 0, 0, 1));         // (y, x, x, y)
	__m128 yyzz2 = shuffle1_ps(q2, _MM_SHUFFLE(2, 2, 1, 1));       // (2y, 2y, 2z, 2z)
	__m128 yy_xy_xz_yz_2 = _mm_mul_ps(yxxy, yyzz2);                // (2yy, 2xy, 2xz, 2yz)

	__m128 zwww = shuffle1_ps(q, _MM_SHUFFLE(3, 3, 3, 2));         // (z, w, w, w)
	__m128 zzyx2 = shuffle1_ps(q2, _MM_SHUFFLE(0, 1, 2, 2));       // (2z, 2z, 2y, 2x)
	__m128 zz_wz_wy_wx_2 = _mm_mul_ps(zwww, zzyx2);                // (2zz, 2wz, 2wy, 2wx)

	__m128 xx2 = _mm_mul_ss(q, q2);                                // (2xx, y, z, w)

	__m128 one_m_xx2 = _mm_sub_ss(one, xx2);                       // (1 - 2xx, 0, 0, 0)
	__m128 one_m_xx_yy_2 = _mm_sub_ss(one_m_xx2, yy_xy_xz_yz_2);   // (1 - 2xx - 2yy, 0, 0, 0)
	__m128 one_m_xx_yy_2_0_tz_tw = one_m_xx_yy_2;

	__m128 m_yy_xy_xz_yz_2 = _mm_xor_ps(yy_xy_xz_yz_2, sseX0);     // (2yy, -2xy, -2xz, -2yz)
	__m128 m_zz_wz_wy_wx_2 = _mm_xor_ps(zz_wz_wy_wx_2, sseX1);     // (-2zz, 2wz, -2wy, -2wx)
	__m128 m_zz_one_wz_wy_wx_2 = _mm_add_ss(m_zz_wz_wy_wx_2, one); // (1 - 2zz, 2wz, -2wy, -2wx)
	__m128 first_row = _mm_sub_ps(m_zz_one_wz_wy_wx_2, m_yy_xy_xz_yz_2); // First column of the rotation matrix in the xyz lanes.

	__m128 s1 = _mm_move_ss(m_yy_xy_xz_yz_2, xx2);
	__m128 s2 = _mm_xor_ps(m_zz_one_wz_wy_wx_2, sseX0);
	__m128 s3 = _mm_sub_ps(s2, s1);
	__m128 second_row = shuffle1_ps(s3, _MM_SHUFFLE(2, 3, 0, 1));  // Second column of the rotation matrix in the xyz lanes.

	__m128 t1 = _mm_movehl_ps(first_row, second_row);
	__m128 third_row = _mm_shuffle_ps(t1, one_m_xx_yy_2_0_tz_tw, _MM_SHUFFLE(2, 0, 3, 1)); // Third column of the rotation matrix in the xyz lanes.

	// Transpose the three columns together with the translation t into the three output rows.
	__m128 tmp0 = _mm_unpacklo_ps(first_row, second_row);
	__m128 tmp2 = _mm_unpacklo_ps(third_row, t);
	__m128 tmp1 = _mm_unpackhi_ps(first_row, second_row);
	__m128 tmp3 = _mm_unpackhi_ps(third_row, t);
	m[0] = _mm_movelh_ps(tmp0, tmp2);
	m[1] = _mm_movehl_ps(tmp2, tmp0);
	m[2] = _mm_movelh_ps(tmp1, tmp3);
#endif
}
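/* For reference, for a unit quaternion q = (x, y, z, w) and translation t, the routine above
   produces the standard row-major 3x4 affine matrix

       [ 1-2yy-2zz   2xy-2wz    2xz+2wy    t.x ]
       [ 2xy+2wz     1-2xx-2zz  2yz-2wx    t.y ]
       [ 2xz-2wy     2yz+2wx    1-2xx-2yy  t.z ]

   with m[0], m[1] and m[2] holding the three rows. quat_to_mat4x4 below appends the row (0, 0, 0, 1). */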
FORCE_INLINE void quat_to_mat4x4(__m128 q, __m128 t, __m128 *m)
{
	quat_to_mat3x4(q, t, m);
	m[3] = set_ps(1.f, 0.f, 0.f, 0.f); // The last row (0, 0, 0, 1).
}

// Rotates the vector vec by the quaternion quat: computes vec + 2 * quat.xyz x (quat.xyz x vec + quat.w * vec).
FORCE_INLINE simd4f quat_transform_vec4(simd4f quat, simd4f vec)
{
	__m128 W = wwww_ps(quat);

	// qxv = quat.xyz x vec (cross product via two shuffled multiplies).
	__m128 a_xzy = shuffle1_ps(quat, _MM_SHUFFLE(3, 0, 2, 1));
	__m128 b_yxz = shuffle1_ps(vec, _MM_SHUFFLE(3, 1, 0, 2));
	__m128 a_yxz = shuffle1_ps(quat, _MM_SHUFFLE(3, 1, 0, 2));
	__m128 b_xzy = shuffle1_ps(vec, _MM_SHUFFLE(3, 0, 2, 1));
	__m128 x = _mm_mul_ps(a_xzy, b_yxz);
	__m128 y = _mm_mul_ps(a_yxz, b_xzy);
	__m128 qxv = _mm_sub_ps(x, y);

	__m128 Wv = _mm_mul_ps(W, vec);
	__m128 s = _mm_add_ps(qxv, Wv);

	// s = quat.xyz x s.
	__m128 s_yxz = shuffle1_ps(s, _MM_SHUFFLE(3, 1, 0, 2));
	__m128 s_xzy = shuffle1_ps(s, _MM_SHUFFLE(3, 0, 2, 1));
	x = _mm_mul_ps(a_xzy, s_yxz);
	y = _mm_mul_ps(a_yxz, s_xzy);
	s = _mm_sub_ps(x, y);

	s = _mm_add_ps(s, s);
	s = _mm_add_ps(s, vec);
	return s;
}

#endif // ~MATH_SSE

#ifdef ANDROID
inline void quat_mul_quat_asm(const void *q1, const void *q2, void *out)
{
#ifdef _DEBUG
	assert(IS16ALIGNED(out));
	assert(IS16ALIGNED(q1));
	assert(IS16ALIGNED(q2));
#endif

	asm(
		"\t vld1.32 {d0, d1}, [%1]\n"  // Load q1 into q0.
		"\t vmov.i32 d12, #0\n"
		"\t vmov.i32 d13, #0x80000000\n"
		"\t vld1.32 {d8, d9}, [%2]\n"  // Load q2 into q4.
		"\t vdup.32 q1, d0[1]\n"
		"\t vdup.32 q2, d1[0]\n"
		"\t vshl.i64 d10, d13, #32\n"
		"\t vdup.32 q3, d1[1]\n"
		"\t vdup.32 q0, d0[0]\n"
		"\t vmov d11, d10\n"
		"\t vmov d15, d10\n"
		"\t vshr.u64 d14, d10, #32\n"

		"\t vmov d18, d9\n"
		"\t vmov d19, d8\n"

		"\t veor q0, q0, q5\n"
		"\t veor q1, q1, q6\n"
		"\t veor q2, q2, q7\n"

		"\t vrev64.32 q10, q9\n"

		"\t vmul.f32 q0, q0, q10\n"
		"\t vmul.f32 q11, q1, q9\n"
		"\t vrev64.32 q8, q4\n"
		"\t vmla.f32 q0, q2, q8\n"
		"\t vmla.f32 q11, q3, q4\n"

		"\t vadd.f32 q0, q0, q11\n"
		"\t vst1.32 {d0, d1}, [%0]\n"  // Store the product.
		:
		: [out]"r"(out), [quat1]"r"(q1), [quat2]"r"(q2)
		: "memory", "q11", "q10", "q9", "q8", "q7", "q6", "q5", "q4", "q3", "q2", "q1", "q0");
}
#endif
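/* quat_mul_quat below evaluates the Hamilton product q1*q2 per lane (x, y, z vector part, w scalar part):

       x = q1.w*q2.x + q1.x*q2.w + q1.y*q2.z - q1.z*q2.y
       y = q1.w*q2.y - q1.x*q2.z + q1.y*q2.w + q1.z*q2.x
       z = q1.w*q2.z + q1.x*q2.y - q1.y*q2.x + q1.z*q2.w
       w = q1.w*q2.w - q1.x*q2.x - q1.y*q2.y - q1.z*q2.z

   The sign masks signx/signy/signz fold the minus signs into broadcast copies of q1.x, q1.y and q1.z,
   so the product reduces to four multiplies (or multiply-accumulates) against shuffles of q2. */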
FORCE_INLINE simd4f quat_mul_quat(simd4f q1, simd4f q2)
{
#ifdef MATH_SSE
	// Sign masks that bake the minus signs of the product into the broadcast copies of q1.x, q1.y and q1.z.
	const __m128 signx = set_ps_hex(0x80000000u, 0, 0x80000000u, 0);
	const __m128 signy = shuffle1_ps(signx, _MM_SHUFFLE(3,3,0,0));
	const __m128 signz = shuffle1_ps(signx, _MM_SHUFFLE(3,0,0,3));

	__m128 X = _mm_xor_ps(signx, xxxx_ps(q1));
	__m128 Y = _mm_xor_ps(signy, yyyy_ps(q1));
	__m128 Z = _mm_xor_ps(signz, zzzz_ps(q1));
	__m128 W = wwww_ps(q1);

	__m128 r1 = shuffle1_ps(q2, _MM_SHUFFLE(0, 1, 2, 3)); // (w, z, y, x)
	__m128 r2 = shuffle1_ps(q2, _MM_SHUFFLE(1, 0, 3, 2)); // (z, w, x, y)
	__m128 r3 = shuffle1_ps(q2, _MM_SHUFFLE(2, 3, 0, 1)); // (y, x, w, z)

	return _mm_add_ps(_mm_add_ps(_mm_mul_ps(X, r1), _mm_mul_ps(Y, r2)),
	                  _mm_add_ps(_mm_mul_ps(Z, r3), _mm_mul_ps(W, q2)));
#elif defined(ANDROID)
	simd4f ret;
	quat_mul_quat_asm(&q1, &q2, &ret);
	return ret;
#else // NEON
	static const float32x4_t signx = set_ps_hex_const(0x80000000u, 0, 0x80000000u, 0);
	static const float32x4_t signy = set_ps_hex_const(0x80000000u, 0x80000000u, 0, 0);
	static const float32x4_t signz = set_ps_hex_const(0x80000000u, 0, 0, 0x80000000u);

	const float32_t *q1f = (const float32_t *)&q1;
	float32x4_t X = xor_ps(signx, vdupq_n_f32(q1f[0]));
	float32x4_t Y = xor_ps(signy, vdupq_n_f32(q1f[1]));
	float32x4_t Z = xor_ps(signz, vdupq_n_f32(q1f[2]));
	float32x4_t W = vdupq_n_f32(q1f[3]);

	float32x4_t r3 = vrev64q_f32(q2);                                   // (y, x, w, z)
	float32x4_t r1 = vcombine_f32(vget_high_f32(r3), vget_low_f32(r3)); // (w, z, y, x)
	float32x4_t r2 = vrev64q_f32(r1);                                   // (z, w, x, y)

	float32x4_t ret = mul_ps(X, r1);
	ret = vmlaq_f32(ret, Y, r2);
	ret = vmlaq_f32(ret, Z, r3);
	ret = vmlaq_f32(ret, W, q2);
	return ret;
#endif
}

#ifdef MATH_SSE
// Computes q1 * Conj(q2). For a normalized q2 this equals the quotient q1 / q2.
FORCE_INLINE simd4f quat_div_quat(simd4f q1, simd4f q2)
{
	const __m128 signx = set_ps_hex(0x80000000u, 0, 0x80000000u, 0);
	const __m128 signy = shuffle1_ps(signx, _MM_SHUFFLE(3,3,0,0));
	const __m128 signz = shuffle1_ps(signx, _MM_SHUFFLE(3,0,0,3));

	__m128 X = _mm_xor_ps(signx, xxxx_ps(q1));
	__m128 Y = _mm_xor_ps(signy, yyyy_ps(q1));
	__m128 Z = _mm_xor_ps(signz, zzzz_ps(q1));
	__m128 W = wwww_ps(q1);

	q2 = negate3_ps(q2); // Conjugate q2: negate its x, y and z lanes.
	__m128 r1 = shuffle1_ps(q2, _MM_SHUFFLE(0, 1, 2, 3)); // (w, z, y, x)
	__m128 r2 = shuffle1_ps(q2, _MM_SHUFFLE(1, 0, 3, 2)); // (z, w, x, y)
	__m128 r3 = shuffle1_ps(q2, _MM_SHUFFLE(2, 3, 0, 1)); // (y, x, w, z)

	return _mm_add_ps(_mm_add_ps(_mm_mul_ps(X, r1), _mm_mul_ps(Y, r2)),
	                  _mm_add_ps(_mm_mul_ps(Z, r3), _mm_mul_ps(W, q2)));
}
#endif

MATH_END_NAMESPACE

#endif // ~MATH_SIMD
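/* Usage sketch (illustrative only): a higher-level quaternion type that stores its data in a simd4f
   could forward its operators to these kernels, e.g.

       // Hypothetical wrapper; assumes 'q' is a 16-byte aligned simd4f member of Quat.
       Quat Quat::operator *(const Quat &rhs) const
       {
           return Quat(quat_mul_quat(q, rhs.q));
       }
*/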