#pragma once

#include "../MathBuildConfig.h"

#ifdef MATH_SSE

#include "SSEMath.h"
#include "float4_sse.h"

MATH_BEGIN_NAMESPACE

void quat_to_mat4x4(__m128 q, __m128 t, __m128 *m);

#ifdef MATH_SSE41 // If we have SSE 4.1, we can use the dpps (dot product) instruction, _mm_dp_ps intrinsic.
inline __m128 mat4x4_mul_sse41(const __m128 *matrix, __m128 vector)
{
	__m128 x = _mm_dp_ps(matrix[0], vector, 0xF0 | 0x0F); // 0xF0: multiply all four lanes, 0x0F: broadcast the sum to all four output lanes.
	__m128 y = _mm_dp_ps(matrix[1], vector, 0xF0 | 0x0F);
	__m128 z = _mm_dp_ps(matrix[2], vector, 0xF0 | 0x0F);
	__m128 w = _mm_dp_ps(matrix[3], vector, 0xF0 | 0x0F);

	__m128 xy = _mm_movelh_ps(x, y); // xy = [ y, y, x, x ]
	__m128 zw = _mm_movelh_ps(z, w); // zw = [ w, w, z, z ]

	return _mm_shuffle_ps(xy, zw, _MM_SHUFFLE(2, 0, 2, 0)); // ret = [ w, z, y, x ]
}
#endif

#ifdef MATH_SSE3 // If we have SSE3, we can repeatedly use haddps to accumulate the result.
inline __m128 mat4x4_mul_sse3(const __m128 *matrix, __m128 vector)
{
	__m128 x = _mm_mul_ps(matrix[0], vector);
	__m128 y = _mm_mul_ps(matrix[1], vector);
	__m128 z = _mm_mul_ps(matrix[2], vector);
	__m128 w = _mm_mul_ps(matrix[3], vector);
	__m128 tmp1 = _mm_hadd_ps(x, y); // = [ y3+y2, y1+y0, x3+x2, x1+x0 ]
	__m128 tmp2 = _mm_hadd_ps(z, w); // = [ w3+w2, w1+w0, z3+z2, z1+z0 ]

	return _mm_hadd_ps(tmp1, tmp2); // = [ w, z, y, x ]
}
#endif

inline __m128 mat4x4_mul_sse1(const __m128 *matrix, __m128 vector)
{
	__m128 x = _mm_mul_ps(matrix[0], vector);
	__m128 y = _mm_mul_ps(matrix[1], vector);
	__m128 z = _mm_mul_ps(matrix[2], vector);
	__m128 w = _mm_mul_ps(matrix[3], vector);
	_MM_TRANSPOSE4_PS(x, y, z, w); // Transpose the four products so that the sums can be formed with vertical adds.

	return _mm_add_ps(_mm_add_ps(x, y), _mm_add_ps(z, w));
}

inline __m128 colmajor_mat4x4_mul_sse1(const __m128 *matrix, __m128 vector)
{
	__m128 x = xxxx_ps(vector);
	__m128 y = yyyy_ps(vector);
	__m128 z = zzzz_ps(vector);
	__m128 w = wwww_ps(vector);
	x = _mm_mul_ps(x, matrix[0]);
	y = _mm_mul_ps(y, matrix[1]);
	z = _mm_mul_ps(z, matrix[2]);
	w = _mm_mul_ps(w, matrix[3]);

	return _mm_add_ps(_mm_add_ps(x, y), _mm_add_ps(z, w));
}

// Transforms a direction vector (w == 0), so the fourth column of the matrix can be skipped.
inline __m128 colmajor_mat4x4_muldir_sse1(const __m128 *matrix, __m128 vector)
{
	__m128 x = xxxx_ps(vector);
	__m128 y = yyyy_ps(vector);
	__m128 z = zzzz_ps(vector);
	x = _mm_mul_ps(x, matrix[0]);
	y = _mm_mul_ps(y, matrix[1]);
	z = _mm_mul_ps(z, matrix[2]);

	return _mm_add_ps(_mm_add_ps(x, y), z);
}

inline __m128 colmajor_mat4x4_mul_sse1_2(const __m128 *matrix, __m128 vector)
{
	__m128 x = xxxx_ps(vector);
	__m128 y = yyyy_ps(vector);
	__m128 z = zzzz_ps(vector);
	__m128 w = wwww_ps(vector);
	x = _mm_mul_ps(x, matrix[0]);
	y = _mm_mul_ps(y, matrix[1]);
	z = _mm_mul_ps(z, matrix[2]);
	w = _mm_mul_ps(w, matrix[3]);

	return _mm_add_ps(_mm_add_ps(x, y), _mm_add_ps(z, w));
}

inline __m128 mat4x4_mul_sse(const __m128 *matrix, __m128 vector)
{
	// Dispatch to the best variant available: the SSE3 haddps version when
	// possible, otherwise the SSE1 transpose version. (The SSE 4.1 dpps
	// variant above is not used by this dispatcher.)
#if defined(MATH_SSE3)
	return mat4x4_mul_sse3(matrix, vector);
#else
	return mat4x4_mul_sse1(matrix, vector);
#endif
}

inline __m128 mat3x4_mul_sse(const __m128 *matrix, __m128 vector)
{
	__m128 x = dot4_ps(matrix[0], vector);
	__m128 y = dot4_ps(matrix[1], vector);
	__m128 z = dot4_ps(matrix[2], vector);

	// Pass the w component of the input vector through unmodified.
	__m128 xy = _mm_movelh_ps(x, y); // xy = [ y, y, x, x ]
	__m128 zw = _mm_movehl_ps(vector, z); // zw = [ w, _, z, z ]
	return _mm_shuffle_ps(xy, zw, _MM_SHUFFLE(3, 1, 2, 0)); // ret = [ w, z, y, x ]
}
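// Usage sketch (not part of the original API; names are illustrative): how a
// caller would transform a homogeneous point with the dispatcher above. The
// matrix rows are assumed row-major and 16-byte aligned.
inline void example_mat4x4_transform_point(const __m128 *matrixRows, const float *point3, float *out4)
{
	// Build (x, y, z, 1); note that _mm_set_ps takes its arguments in w,z,y,x order.
	__m128 p = _mm_set_ps(1.f, point3[2], point3[1], point3[0]);
	_mm_storeu_ps(out4, mat4x4_mul_sse(matrixRows, p));
}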
inline float3 mat3x4_mul_vec(const __m128 *matrix, __m128 vector)
{
	__m128 x = dot4_ps(matrix[0], vector);
	__m128 y = dot4_ps(matrix[1], vector);
	__m128 z = dot4_ps(matrix[2], vector);

	return float3(s4f_x(x), s4f_x(y), s4f_x(z));
}

#define _mm_transpose_matrix_intel(row0, row1, row2, row3) \
	__m128 tmp0, tmp1, tmp2, tmp3; \
	tmp0 = _mm_unpacklo_ps(row0, row1); \
	tmp2 = _mm_unpacklo_ps(row2, row3); \
	tmp1 = _mm_unpackhi_ps(row0, row1); \
	tmp3 = _mm_unpackhi_ps(row2, row3); \
	row0 = _mm_movelh_ps(tmp0, tmp2); \
	row1 = _mm_movehl_ps(tmp2, tmp0); \
	row2 = _mm_movelh_ps(tmp1, tmp3); \
	row3 = _mm_movehl_ps(tmp3, tmp1);

FORCE_INLINE void mat4x4_mul_dpps(__m128 *out, const __m128 *m1, const __m128 *m2)
{
	// Transpose m2 first, so that each column of m2 sits in one register and
	// every output element becomes a single 4-wide dot product.
	__m128 low1 = _mm_movelh_ps(m2[0], m2[1]); // low1 = [ 11, 10, 01, 00 ]
	__m128 low2 = _mm_movelh_ps(m2[2], m2[3]); // low2 = [ 31, 30, 21, 20 ]
	__m128 hi1 = _mm_movehl_ps(m2[1], m2[0]); // hi1 = [ 13, 12, 03, 02 ]
	__m128 hi2 = _mm_movehl_ps(m2[3], m2[2]); // hi2 = [ 33, 32, 23, 22 ]

	__m128 row1 = _mm_shuffle_ps(low1, low2, _MM_SHUFFLE(2, 0, 2, 0)); // row1 = [ 30, 20, 10, 00 ]
	__m128 row2 = _mm_shuffle_ps(low1, low2, _MM_SHUFFLE(3, 1, 3, 1)); // row2 = [ 31, 21, 11, 01 ]
	__m128 row3 = _mm_shuffle_ps(hi1, hi2, _MM_SHUFFLE(2, 0, 2, 0)); // row3 = [ 32, 22, 12, 02 ]
	__m128 row4 = _mm_shuffle_ps(hi1, hi2, _MM_SHUFFLE(3, 1, 3, 1)); // row4 = [ 33, 23, 13, 03 ]

	__m128 _00 = dot4_ps(m1[0], row1);
	__m128 _01 = dot4_ps(m1[0], row2);
	__m128 _02 = dot4_ps(m1[0], row3);
	__m128 _03 = dot4_ps(m1[0], row4);
	out[0] = pack_4ss_to_ps(_00, _01, _02, _03);

	__m128 _10 = dot4_ps(m1[1], row1);
	__m128 _11 = dot4_ps(m1[1], row2);
	__m128 _12 = dot4_ps(m1[1], row3);
	__m128 _13 = dot4_ps(m1[1], row4);
	out[1] = pack_4ss_to_ps(_10, _11, _12, _13);

	__m128 _20 = dot4_ps(m1[2], row1);
	__m128 _21 = dot4_ps(m1[2], row2);
	__m128 _22 = dot4_ps(m1[2], row3);
	__m128 _23 = dot4_ps(m1[2], row4);
	out[2] = pack_4ss_to_ps(_20, _21, _22, _23);

	__m128 _30 = dot4_ps(m1[3], row1);
	__m128 _31 = dot4_ps(m1[3], row2);
	__m128 _32 = dot4_ps(m1[3], row3);
	__m128 _33 = dot4_ps(m1[3], row4);
	out[3] = pack_4ss_to_ps(_30, _31, _32, _33);
}

// Same as mat4x4_mul_dpps above, but transposes m2 with the
// _mm_transpose_matrix_intel macro instead of hand-scheduled shuffles.
FORCE_INLINE void mat4x4_mul_dpps_2(__m128 *out, const __m128 *m1, const __m128 *m2)
{
	__m128 row1 = m2[0];
	__m128 row2 = m2[1];
	__m128 row3 = m2[2];
	__m128 row4 = m2[3];
	_mm_transpose_matrix_intel(row1, row2, row3, row4);

	__m128 _00 = dot4_ps(m1[0], row1);
	__m128 _01 = dot4_ps(m1[0], row2);
	__m128 _02 = dot4_ps(m1[0], row3);
	__m128 _03 = dot4_ps(m1[0], row4);
	out[0] = pack_4ss_to_ps(_00, _01, _02, _03);

	__m128 _10 = dot4_ps(m1[1], row1);
	__m128 _11 = dot4_ps(m1[1], row2);
	__m128 _12 = dot4_ps(m1[1], row3);
	__m128 _13 = dot4_ps(m1[1], row4);
	out[1] = pack_4ss_to_ps(_10, _11, _12, _13);

	__m128 _20 = dot4_ps(m1[2], row1);
	__m128 _21 = dot4_ps(m1[2], row2);
	__m128 _22 = dot4_ps(m1[2], row3);
	__m128 _23 = dot4_ps(m1[2], row4);
	out[2] = pack_4ss_to_ps(_20, _21, _22, _23);

	__m128 _30 = dot4_ps(m1[3], row1);
	__m128 _31 = dot4_ps(m1[3], row2);
	__m128 _32 = dot4_ps(m1[3], row3);
	__m128 _33 = dot4_ps(m1[3], row4);
	out[3] = pack_4ss_to_ps(_30, _31, _32, _33);
}
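// Hedged reference implementation (an addition for illustration, not in the
// original file): a plain scalar 4x4 multiply using the same row-major,
// out = m1 * m2 convention as the dpps variants above. Useful as ground truth
// when unit-testing the SIMD paths.
inline void example_mat4x4_mul_scalar(float *out, const float *m1, const float *m2)
{
	for(int i = 0; i < 4; ++i)
		for(int j = 0; j < 4; ++j)
			out[4*i+j] = m1[4*i+0]*m2[0+j] + m1[4*i+1]*m2[4+j]
			           + m1[4*i+2]*m2[8+j] + m1[4*i+3]*m2[12+j];
}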
// Same as mat4x4_mul_dpps above, but packs each output row manually with
// movelh + shuffle instead of pack_4ss_to_ps.
FORCE_INLINE void mat4x4_mul_dpps_3(__m128 *out, const __m128 *m1, const __m128 *m2)
{
	// Transpose m2 so that each of its columns sits in one register.
	__m128 low1 = _mm_movelh_ps(m2[0], m2[1]); // low1 = [ 11, 10, 01, 00 ]
	__m128 low2 = _mm_movelh_ps(m2[2], m2[3]); // low2 = [ 31, 30, 21, 20 ]
	__m128 hi1 = _mm_movehl_ps(m2[1], m2[0]); // hi1 = [ 13, 12, 03, 02 ]
	__m128 hi2 = _mm_movehl_ps(m2[3], m2[2]); // hi2 = [ 33, 32, 23, 22 ]

	__m128 row1 = _mm_shuffle_ps(low1, low2, _MM_SHUFFLE(2, 0, 2, 0)); // row1 = [ 30, 20, 10, 00 ]
	__m128 row2 = _mm_shuffle_ps(low1, low2, _MM_SHUFFLE(3, 1, 3, 1)); // row2 = [ 31, 21, 11, 01 ]
	__m128 row3 = _mm_shuffle_ps(hi1, hi2, _MM_SHUFFLE(2, 0, 2, 0)); // row3 = [ 32, 22, 12, 02 ]
	__m128 row4 = _mm_shuffle_ps(hi1, hi2, _MM_SHUFFLE(3, 1, 3, 1)); // row4 = [ 33, 23, 13, 03 ]

	__m128 _00 = dot4_ps(m1[0], row1);
	__m128 _01 = dot4_ps(m1[0], row2);
	__m128 _02 = dot4_ps(m1[0], row3);
	__m128 _03 = dot4_ps(m1[0], row4);

	__m128 xy = _mm_movelh_ps(_00, _01);
	__m128 zw = _mm_movelh_ps(_02, _03);
	out[0] = _mm_shuffle_ps(xy, zw, _MM_SHUFFLE(2, 0, 2, 0));

	__m128 _10 = dot4_ps(m1[1], row1);
	__m128 _11 = dot4_ps(m1[1], row2);
	__m128 _12 = dot4_ps(m1[1], row3);
	__m128 _13 = dot4_ps(m1[1], row4);

	__m128 xy2 = _mm_movelh_ps(_10, _11);
	__m128 zw2 = _mm_movelh_ps(_12, _13);
	out[1] = _mm_shuffle_ps(xy2, zw2, _MM_SHUFFLE(2, 0, 2, 0));

	__m128 _20 = dot4_ps(m1[2], row1);
	__m128 _21 = dot4_ps(m1[2], row2);
	__m128 _22 = dot4_ps(m1[2], row3);
	__m128 _23 = dot4_ps(m1[2], row4);

	__m128 xy3 = _mm_movelh_ps(_20, _21);
	__m128 zw3 = _mm_movelh_ps(_22, _23);
	out[2] = _mm_shuffle_ps(xy3, zw3, _MM_SHUFFLE(2, 0, 2, 0));

	__m128 _30 = dot4_ps(m1[3], row1);
	__m128 _31 = dot4_ps(m1[3], row2);
	__m128 _32 = dot4_ps(m1[3], row3);
	__m128 _33 = dot4_ps(m1[3], row4);

	__m128 xy4 = _mm_movelh_ps(_30, _31);
	__m128 zw4 = _mm_movelh_ps(_32, _33);
	out[3] = _mm_shuffle_ps(xy4, zw4, _MM_SHUFFLE(2, 0, 2, 0));
}

// out = m1 * m2. Broadcasts each scalar of a row of m1 in turn and
// multiply-accumulates whole rows of m2, so m2 never needs to be transposed.
FORCE_INLINE void mat4x4_mul_sse(__m128 *out, const __m128 *m1, const __m128 *m2)
{
	__m128 s0 = xxxx_ps(m1[0]);
	__m128 s1 = yyyy_ps(m1[0]);
	__m128 s2 = zzzz_ps(m1[0]);
	__m128 s3 = wwww_ps(m1[0]);
	__m128 r0 = _mm_mul_ps(s0, m2[0]);
	__m128 r1 = _mm_mul_ps(s1, m2[1]);
	__m128 r2 = _mm_mul_ps(s2, m2[2]);
	__m128 r3 = _mm_mul_ps(s3, m2[3]);
	out[0] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));

	s0 = xxxx_ps(m1[1]);
	s1 = yyyy_ps(m1[1]);
	s2 = zzzz_ps(m1[1]);
	s3 = wwww_ps(m1[1]);
	r0 = _mm_mul_ps(s0, m2[0]);
	r1 = _mm_mul_ps(s1, m2[1]);
	r2 = _mm_mul_ps(s2, m2[2]);
	r3 = _mm_mul_ps(s3, m2[3]);
	out[1] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));

	s0 = xxxx_ps(m1[2]);
	s1 = yyyy_ps(m1[2]);
	s2 = zzzz_ps(m1[2]);
	s3 = wwww_ps(m1[2]);
	r0 = _mm_mul_ps(s0, m2[0]);
	r1 = _mm_mul_ps(s1, m2[1]);
	r2 = _mm_mul_ps(s2, m2[2]);
	r3 = _mm_mul_ps(s3, m2[3]);
	out[2] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));

	s0 = xxxx_ps(m1[3]);
	s1 = yyyy_ps(m1[3]);
	s2 = zzzz_ps(m1[3]);
	s3 = wwww_ps(m1[3]);
	r0 = _mm_mul_ps(s0, m2[0]);
	r1 = _mm_mul_ps(s1, m2[1]);
	r2 = _mm_mul_ps(s2, m2[2]);
	r3 = _mm_mul_ps(s3, m2[3]);
	out[3] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
}
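// Illustrative helper (assumed, not original API): adapting an arbitrary
// row-major float[16] to the four-__m128 register format these routines
// operate on. Unaligned loads are used so the source needs no special alignment.
inline void example_load_mat4x4_rows(__m128 *rows, const float *m)
{
	rows[0] = _mm_loadu_ps(m);
	rows[1] = _mm_loadu_ps(m + 4);
	rows[2] = _mm_loadu_ps(m + 8);
	rows[3] = _mm_loadu_ps(m + 12);
}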
// A variant of mat4x4_mul_sse above; the body is currently identical.
FORCE_INLINE void mat4x4_mul_sse_2(__m128 *out, const __m128 *m1, const __m128 *m2)
{
	__m128 s0 = xxxx_ps(m1[0]);
	__m128 s1 = yyyy_ps(m1[0]);
	__m128 s2 = zzzz_ps(m1[0]);
	__m128 s3 = wwww_ps(m1[0]);
	__m128 r0 = _mm_mul_ps(s0, m2[0]);
	__m128 r1 = _mm_mul_ps(s1, m2[1]);
	__m128 r2 = _mm_mul_ps(s2, m2[2]);
	__m128 r3 = _mm_mul_ps(s3, m2[3]);
	out[0] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));

	s0 = xxxx_ps(m1[1]);
	s1 = yyyy_ps(m1[1]);
	s2 = zzzz_ps(m1[1]);
	s3 = wwww_ps(m1[1]);
	r0 = _mm_mul_ps(s0, m2[0]);
	r1 = _mm_mul_ps(s1, m2[1]);
	r2 = _mm_mul_ps(s2, m2[2]);
	r3 = _mm_mul_ps(s3, m2[3]);
	out[1] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));

	s0 = xxxx_ps(m1[2]);
	s1 = yyyy_ps(m1[2]);
	s2 = zzzz_ps(m1[2]);
	s3 = wwww_ps(m1[2]);
	r0 = _mm_mul_ps(s0, m2[0]);
	r1 = _mm_mul_ps(s1, m2[1]);
	r2 = _mm_mul_ps(s2, m2[2]);
	r3 = _mm_mul_ps(s3, m2[3]);
	out[2] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));

	s0 = xxxx_ps(m1[3]);
	s1 = yyyy_ps(m1[3]);
	s2 = zzzz_ps(m1[3]);
	s3 = wwww_ps(m1[3]);
	r0 = _mm_mul_ps(s0, m2[0]);
	r1 = _mm_mul_ps(s1, m2[1]);
	r2 = _mm_mul_ps(s2, m2[2]);
	r3 = _mm_mul_ps(s3, m2[3]);
	out[3] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
}

// out = m1 * m2, where both operands are 3x4 affine matrices whose fourth
// row is implicitly (0,0,0,1). Only out[0]-out[2] are written.
inline void mat3x4_mul_sse(__m128 *out, const __m128 *m1, const __m128 *m2)
{
	const __m128 m2_3 = _mm_set_ps(1.f, 0.f, 0.f, 0.f); // The implicit fourth row of m2.

	__m128 s0 = xxxx_ps(m1[0]);
	__m128 s1 = yyyy_ps(m1[0]);
	__m128 s2 = zzzz_ps(m1[0]);
	__m128 s3 = wwww_ps(m1[0]);
	__m128 r0 = _mm_mul_ps(s0, m2[0]);
	__m128 r1 = _mm_mul_ps(s1, m2[1]);
	__m128 r2 = _mm_mul_ps(s2, m2[2]);
	__m128 r3 = _mm_mul_ps(s3, m2_3);
	out[0] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));

	s0 = xxxx_ps(m1[1]);
	s1 = yyyy_ps(m1[1]);
	s2 = zzzz_ps(m1[1]);
	s3 = wwww_ps(m1[1]);
	r0 = _mm_mul_ps(s0, m2[0]);
	r1 = _mm_mul_ps(s1, m2[1]);
	r2 = _mm_mul_ps(s2, m2[2]);
	r3 = _mm_mul_ps(s3, m2_3);
	out[1] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));

	s0 = xxxx_ps(m1[2]);
	s1 = yyyy_ps(m1[2]);
	s2 = zzzz_ps(m1[2]);
	s3 = wwww_ps(m1[2]);
	r0 = _mm_mul_ps(s0, m2[0]);
	r1 = _mm_mul_ps(s1, m2[1]);
	r2 = _mm_mul_ps(s2, m2[2]);
	r3 = _mm_mul_ps(s3, m2_3);
	out[2] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
}

// out = m1 * m2, where m1 is a 4x4 matrix and m2 a 3x4 matrix, treated as a
// 4x4 matrix with the last row (0,0,0,1).
inline void mat4x4_mul_mat3x4_sse(__m128 *out, const __m128 *m1, const __m128 *m2)
{
	const __m128 m2_3 = _mm_set_ps(1.f, 0.f, 0.f, 0.f); // The implicit fourth row of m2.

	__m128 s0 = xxxx_ps(m1[0]);
	__m128 s1 = yyyy_ps(m1[0]);
	__m128 s2 = zzzz_ps(m1[0]);
	__m128 s3 = wwww_ps(m1[0]);
	__m128 r0 = _mm_mul_ps(s0, m2[0]);
	__m128 r1 = _mm_mul_ps(s1, m2[1]);
	__m128 r2 = _mm_mul_ps(s2, m2[2]);
	__m128 r3 = _mm_mul_ps(s3, m2_3);
	out[0] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));

	s0 = xxxx_ps(m1[1]);
	s1 = yyyy_ps(m1[1]);
	s2 = zzzz_ps(m1[1]);
	s3 = wwww_ps(m1[1]);
	r0 = _mm_mul_ps(s0, m2[0]);
	r1 = _mm_mul_ps(s1, m2[1]);
	r2 = _mm_mul_ps(s2, m2[2]);
	r3 = _mm_mul_ps(s3, m2_3);
	out[1] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));

	s0 = xxxx_ps(m1[2]);
	s1 = yyyy_ps(m1[2]);
	s2 = zzzz_ps(m1[2]);
	s3 = wwww_ps(m1[2]);
	r0 = _mm_mul_ps(s0, m2[0]);
	r1 = _mm_mul_ps(s1, m2[1]);
	r2 = _mm_mul_ps(s2, m2[2]);
	r3 = _mm_mul_ps(s3, m2_3);
	out[2] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));

	s0 = xxxx_ps(m1[3]);
	s1 = yyyy_ps(m1[3]);
	s2 = zzzz_ps(m1[3]);
	s3 = wwww_ps(m1[3]);
	r0 = _mm_mul_ps(s0, m2[0]);
	r1 = _mm_mul_ps(s1, m2[1]);
	r2 = _mm_mul_ps(s2, m2[2]);
	r3 = _mm_mul_ps(s3, m2_3);
	out[3] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
}
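// Illustration (hypothetical helper, not original API): the 3x4 routines
// treat the unstored fourth row as (0,0,0,1). Materializing that row
// explicitly, as below, and then calling mat4x4_mul_sse gives the same result
// as mat4x4_mul_mat3x4_sse above.
inline void example_expand_mat3x4_to_mat4x4(__m128 *rows4, const __m128 *rows3)
{
	rows4[0] = rows3[0];
	rows4[1] = rows3[1];
	rows4[2] = rows3[2];
	rows4[3] = _mm_set_ps(1.f, 0.f, 0.f, 0.f); // The implicit row (0,0,0,1).
}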
// out = m1 * m2, where m1 is a 3x4 matrix treated as a 4x4 matrix with the
// last row (0,0,0,1), and m2 is a 4x4 matrix; hence out[3] = m2[3] directly.
inline void mat3x4_mul_mat4x4_sse(__m128 *out, const __m128 *m1, const __m128 *m2)
{
	__m128 s0 = xxxx_ps(m1[0]);
	__m128 s1 = yyyy_ps(m1[0]);
	__m128 s2 = zzzz_ps(m1[0]);
	__m128 s3 = wwww_ps(m1[0]);
	__m128 r0 = _mm_mul_ps(s0, m2[0]);
	__m128 r1 = _mm_mul_ps(s1, m2[1]);
	__m128 r2 = _mm_mul_ps(s2, m2[2]);
	__m128 r3 = _mm_mul_ps(s3, m2[3]);
	out[0] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));

	s0 = xxxx_ps(m1[1]);
	s1 = yyyy_ps(m1[1]);
	s2 = zzzz_ps(m1[1]);
	s3 = wwww_ps(m1[1]);
	r0 = _mm_mul_ps(s0, m2[0]);
	r1 = _mm_mul_ps(s1, m2[1]);
	r2 = _mm_mul_ps(s2, m2[2]);
	r3 = _mm_mul_ps(s3, m2[3]);
	out[1] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));

	s0 = xxxx_ps(m1[2]);
	s1 = yyyy_ps(m1[2]);
	s2 = zzzz_ps(m1[2]);
	s3 = wwww_ps(m1[2]);
	r0 = _mm_mul_ps(s0, m2[0]);
	r1 = _mm_mul_ps(s1, m2[1]);
	r2 = _mm_mul_ps(s2, m2[2]);
	r3 = _mm_mul_ps(s3, m2[3]);
	out[2] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));

	out[3] = m2[3];
}

// out = m1 * m2, where m1 is a 3x3 matrix given as 9 consecutive (possibly
// unaligned) floats in row-major order, and m2 is a 4x4 matrix.
inline void mat3x3_mul_mat4x4_sse(__m128 *out, const float *m1, const __m128 *m2)
{
	__m128 m1_0 = loadu_ps(m1); // Holds m1 elements 00, 01, 02 and 10.

	__m128 s0 = xxxx_ps(m1_0);
	__m128 s1 = yyyy_ps(m1_0);
	__m128 s2 = zzzz_ps(m1_0);
	__m128 r0 = _mm_mul_ps(s0, m2[0]);
	__m128 r1 = _mm_mul_ps(s1, m2[1]);
	__m128 r2 = _mm_mul_ps(s2, m2[2]);
	out[0] = _mm_add_ps(_mm_add_ps(r0, r1), r2);

	s0 = wwww_ps(m1_0);
	__m128 m1_1 = loadu_ps(m1+4); // Holds m1 elements 11, 12, 20 and 21.
	s1 = xxxx_ps(m1_1);
	s2 = yyyy_ps(m1_1);
	r0 = _mm_mul_ps(s0, m2[0]);
	r1 = _mm_mul_ps(s1, m2[1]);
	r2 = _mm_mul_ps(s2, m2[2]);
	out[1] = _mm_add_ps(_mm_add_ps(r0, r1), r2);

	s0 = zzzz_ps(m1_1);
	s1 = wwww_ps(m1_1);
	s2 = load1_ps(m1+8); // Broadcast the last element, 22.
	r0 = _mm_mul_ps(s0, m2[0]);
	r1 = _mm_mul_ps(s1, m2[1]);
	r2 = _mm_mul_ps(s2, m2[2]);
	out[2] = _mm_add_ps(_mm_add_ps(r0, r1), r2);

	out[3] = m2[3];
}

// out = m1 * m2, where m1 is a 4x4 matrix and m2 a 3x3 matrix given as 9
// consecutive floats in row-major order, padded to 4x4 with a (0,0,0,1) row
// and a zero translation column.
inline void mat4x4_mul_mat3x3_sse(__m128 *out, const __m128 *m1, const float *m2)
{
	__m128 m2_0 = load_vec3(m2, 0.f);
	__m128 m2_1 = load_vec3(m2+3, 0.f);
	__m128 m2_2 = load_vec3(m2+6, 0.f);
	const __m128 m2_3 = _mm_set_ps(1.f, 0.f, 0.f, 0.f);

	__m128 s0 = xxxx_ps(m1[0]);
	__m128 s1 = yyyy_ps(m1[0]);
	__m128 s2 = zzzz_ps(m1[0]);
	__m128 s3 = wwww_ps(m1[0]);
	__m128 r0 = _mm_mul_ps(s0, m2_0);
	__m128 r1 = _mm_mul_ps(s1, m2_1);
	__m128 r2 = _mm_mul_ps(s2, m2_2);
	__m128 r3 = _mm_mul_ps(s3, m2_3);
	out[0] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));

	s0 = xxxx_ps(m1[1]);
	s1 = yyyy_ps(m1[1]);
	s2 = zzzz_ps(m1[1]);
	s3 = wwww_ps(m1[1]);
	r0 = _mm_mul_ps(s0, m2_0);
	r1 = _mm_mul_ps(s1, m2_1);
	r2 = _mm_mul_ps(s2, m2_2);
	r3 = _mm_mul_ps(s3, m2_3);
	out[1] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));

	s0 = xxxx_ps(m1[2]);
	s1 = yyyy_ps(m1[2]);
	s2 = zzzz_ps(m1[2]);
	s3 = wwww_ps(m1[2]);
	r0 = _mm_mul_ps(s0, m2_0);
	r1 = _mm_mul_ps(s1, m2_1);
	r2 = _mm_mul_ps(s2, m2_2);
	r3 = _mm_mul_ps(s3, m2_3);
	out[2] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));

	s0 = xxxx_ps(m1[3]);
	s1 = yyyy_ps(m1[3]);
	s2 = zzzz_ps(m1[3]);
	s3 = wwww_ps(m1[3]);
	r0 = _mm_mul_ps(s0, m2_0);
	r1 = _mm_mul_ps(s1, m2_1);
	r2 = _mm_mul_ps(s2, m2_2);
	r3 = _mm_mul_ps(s3, m2_3);
	out[3] = _mm_add_ps(_mm_add_ps(r0, r1), _mm_add_ps(r2, r3));
}

#define MAT_COFACTOR(mat, i, j) \
	_mm_sub_ps(_mm_mul_ps(_mm_shuffle_ps(mat[2], mat[1], _MM_SHUFFLE(j,j,j,j)), \
	           shuffle1_ps(_mm_shuffle_ps(mat[3], mat[2], _MM_SHUFFLE(i,i,i,i)), _MM_SHUFFLE(2,0,0,0))), \
	           _mm_mul_ps(shuffle1_ps(_mm_shuffle_ps(mat[3], mat[2], _MM_SHUFFLE(j,j,j,j)), _MM_SHUFFLE(2,0,0,0)), \
	           _mm_shuffle_ps(mat[2], mat[1], _MM_SHUFFLE(i,i,i,i))))
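// Note (illustrative addition, not in the original file): mat4x4_inverse
// below divides by the determinant using _mm_rcp_ps, a fast approximation
// with roughly 12 bits of precision. If full float precision is required, a
// true divide is a drop-in alternative, at some cost in latency:
FORCE_INLINE __m128 example_precise_recip(__m128 v)
{
	return _mm_div_ps(_mm_set1_ps(1.f), v);
}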
// Inverts a 4x4 matrix via cofactor expansion and returns its determinant.
FORCE_INLINE float mat4x4_inverse(const __m128 *mat, __m128 *out)
{
	__m128 f1 = MAT_COFACTOR(mat, 3, 2);
	__m128 f2 = MAT_COFACTOR(mat, 3, 1);
	__m128 f3 = MAT_COFACTOR(mat, 2, 1);
	__m128 f4 = MAT_COFACTOR(mat, 3, 0);
	__m128 f5 = MAT_COFACTOR(mat, 2, 0);
	__m128 f6 = MAT_COFACTOR(mat, 1, 0);
	__m128 v1 = shuffle1_ps(_mm_shuffle_ps(mat[1], mat[0], _MM_SHUFFLE(0,0,0,0)), _MM_SHUFFLE(2,2,2,0));
	__m128 v2 = shuffle1_ps(_mm_shuffle_ps(mat[1], mat[0], _MM_SHUFFLE(1,1,1,1)), _MM_SHUFFLE(2,2,2,0));
	__m128 v3 = shuffle1_ps(_mm_shuffle_ps(mat[1], mat[0], _MM_SHUFFLE(2,2,2,2)), _MM_SHUFFLE(2,2,2,0));
	__m128 v4 = shuffle1_ps(_mm_shuffle_ps(mat[1], mat[0], _MM_SHUFFLE(3,3,3,3)), _MM_SHUFFLE(2,2,2,0));
	const __m128 s1 = _mm_set_ps(-0.0f,  0.0f, -0.0f,  0.0f); // Sign masks for the alternating cofactor signs.
	const __m128 s2 = _mm_set_ps( 0.0f, -0.0f,  0.0f, -0.0f);
	__m128 r1 = _mm_xor_ps(s1, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(v2, f1), _mm_mul_ps(v3, f2)), _mm_mul_ps(v4, f3)));
	__m128 r2 = _mm_xor_ps(s2, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(v1, f1), _mm_mul_ps(v3, f4)), _mm_mul_ps(v4, f5)));
	__m128 r3 = _mm_xor_ps(s1, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(v1, f2), _mm_mul_ps(v2, f4)), _mm_mul_ps(v4, f6)));
	__m128 r4 = _mm_xor_ps(s2, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(v1, f3), _mm_mul_ps(v2, f5)), _mm_mul_ps(v3, f6)));
	__m128 det = dot4_ps(mat[0], _mm_movelh_ps(_mm_unpacklo_ps(r1, r2), _mm_unpacklo_ps(r3, r4)));
	__m128 rcp = _mm_rcp_ps(det); // Note: an approximate reciprocal.
	out[0] = _mm_mul_ps(r1, rcp);
	out[1] = _mm_mul_ps(r2, rcp);
	out[2] = _mm_mul_ps(r3, rcp);
	out[3] = _mm_mul_ps(r4, rcp);
	return s4f_x(det);
}

#define MAT3x4_COFACTOR(mat, i, j) \
	_mm_sub_ps(_mm_mul_ps(_mm_shuffle_ps(mat[2], mat[1], _MM_SHUFFLE(j,j,j,j)), \
	           shuffle1_ps(_mm_shuffle_ps(mat_3, mat[2], _MM_SHUFFLE(i,i,i,i)), _MM_SHUFFLE(2,0,0,0))), \
	           _mm_mul_ps(shuffle1_ps(_mm_shuffle_ps(mat_3, mat[2], _MM_SHUFFLE(j,j,j,j)), _MM_SHUFFLE(2,0,0,0)), \
	           _mm_shuffle_ps(mat[2], mat[1], _MM_SHUFFLE(i,i,i,i))))
// Inverts a 3x4 affine matrix and returns its determinant. Only out[0]-out[2]
// are written.
FORCE_INLINE float mat3x4_inverse(const __m128 *mat, __m128 *out)
{
	// The fourth row of a 3x4 matrix is not stored; it is implicitly (0,0,0,1).
	__m128 mat_3 = set_ps(1.f, 0.f, 0.f, 0.f);
	__m128 f1 = MAT3x4_COFACTOR(mat, 3, 2);
	__m128 f2 = MAT3x4_COFACTOR(mat, 3, 1);
	__m128 f3 = MAT3x4_COFACTOR(mat, 2, 1);
	__m128 f4 = MAT3x4_COFACTOR(mat, 3, 0);
	__m128 f5 = MAT3x4_COFACTOR(mat, 2, 0);
	__m128 f6 = MAT3x4_COFACTOR(mat, 1, 0);
	__m128 v1 = shuffle1_ps(_mm_shuffle_ps(mat[1], mat[0], _MM_SHUFFLE(0,0,0,0)), _MM_SHUFFLE(2,2,2,0));
	__m128 v2 = shuffle1_ps(_mm_shuffle_ps(mat[1], mat[0], _MM_SHUFFLE(1,1,1,1)), _MM_SHUFFLE(2,2,2,0));
	__m128 v3 = shuffle1_ps(_mm_shuffle_ps(mat[1], mat[0], _MM_SHUFFLE(2,2,2,2)), _MM_SHUFFLE(2,2,2,0));
	__m128 v4 = shuffle1_ps(_mm_shuffle_ps(mat[1], mat[0], _MM_SHUFFLE(3,3,3,3)), _MM_SHUFFLE(2,2,2,0));
	const __m128 s1 = _mm_set_ps(-0.0f,  0.0f, -0.0f,  0.0f);
	const __m128 s2 = _mm_set_ps( 0.0f, -0.0f,  0.0f, -0.0f);
	__m128 r1 = _mm_xor_ps(s1, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(v2, f1), _mm_mul_ps(v3, f2)), _mm_mul_ps(v4, f3)));
	__m128 r2 = _mm_xor_ps(s2, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(v1, f1), _mm_mul_ps(v3, f4)), _mm_mul_ps(v4, f5)));
	__m128 r3 = _mm_xor_ps(s1, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(v1, f2), _mm_mul_ps(v2, f4)), _mm_mul_ps(v4, f6)));
	__m128 r4 = _mm_xor_ps(s2, _mm_add_ps(_mm_sub_ps(_mm_mul_ps(v1, f3), _mm_mul_ps(v2, f5)), _mm_mul_ps(v3, f6)));
	__m128 det = dot4_ps(mat[0], _mm_movelh_ps(_mm_unpacklo_ps(r1, r2), _mm_unpacklo_ps(r3, r4)));
	__m128 rcp = _mm_rcp_ps(det);
	out[0] = _mm_mul_ps(r1, rcp);
	out[1] = _mm_mul_ps(r2, rcp);
	out[2] = _mm_mul_ps(r3, rcp);
	return s4f_x(det);
}
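// Usage sketch (hypothetical; the epsilon is an arbitrary assumption): the
// inverse routines return the determinant, which a caller can test to detect
// a near-singular input before trusting the written-out inverse.
inline bool example_try_invert_mat3x4(const __m128 *m, __m128 *inv)
{
	float det = mat3x4_inverse(m, inv);
	return det > 1e-6f || det < -1e-6f;
}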
FORCE_INLINE void mat3x4_inverse_orthonormal(const __m128 *mat, __m128 *out)
{
	// For an orthonormal affine matrix M = [R | t], the inverse is
	// M^-1 = [R^T | -R^T*t]: transpose the 3x3 part, and rotate and negate
	// the translation. No cofactors or determinant are needed.

	// Compute -R^T*t by broadcasting each translation component and
	// accumulating the scaled rows.
	__m128 tx = wwww_ps(mat[0]);
	__m128 ty = wwww_ps(mat[1]);
	__m128 tz = wwww_ps(mat[2]);
	tx = _mm_mul_ps(tx, mat[0]);
	ty = _mm_mul_ps(ty, mat[1]);
	tz = _mm_mul_ps(tz, mat[2]);
	__m128 vec = negate_ps(_mm_add_ps(_mm_add_ps(tx, ty), tz));

	// Transpose the 3x3 part, with vec supplying the new translation column.
	__m128 tmp0 = _mm_unpacklo_ps(mat[0], mat[1]);
	__m128 tmp1 = _mm_unpackhi_ps(mat[0], mat[1]);
	__m128 tmp2 = _mm_unpacklo_ps(mat[2], vec);
	__m128 tmp3 = _mm_unpackhi_ps(mat[2], vec);

	out[0] = _mm_movelh_ps(tmp0, tmp2);
	out[1] = _mm_movehl_ps(tmp2, tmp0);
	out[2] = _mm_movelh_ps(tmp1, tmp3);
}

FORCE_INLINE void mat3x4_inverse_orthogonal_uniformscale(const __m128 *mat, __m128 *out)
{
	// Assumes the rows of the 3x3 part are orthogonal with a uniform scale s:
	// the inverse of the 3x3 part is then R^T / s^2. The reciprocal of s^2 is
	// read off the squared length of the first row.
	simd4f scale = rcp_ps(sum_xyz_ps(mul_ps(mat[0], mat[0])));

	// Scale the rows, and compute the new translation -(R^T/s^2)*t as in
	// mat3x4_inverse_orthonormal above.
	simd4f tx = wwww_ps(mat[0]);
	simd4f ty = wwww_ps(mat[1]);
	simd4f tz = wwww_ps(mat[2]);
	simd4f m0 = mul_ps(scale, mat[0]);
	simd4f m1 = mul_ps(scale, mat[1]);
	simd4f m2 = mul_ps(scale, mat[2]);
	tx = mul_ps(tx, m0);
	ty = mul_ps(ty, m1);
	tz = mul_ps(tz, m2);
	simd4f vec = negate_ps(add_ps(add_ps(tx, ty), tz));

	simd4f tmp0 = _mm_unpacklo_ps(m0, m1);
	simd4f tmp1 = _mm_unpackhi_ps(m0, m1);
	simd4f tmp2 = _mm_unpacklo_ps(m2, vec);
	simd4f tmp3 = _mm_unpackhi_ps(m2, vec);

	out[0] = _mm_movelh_ps(tmp0, tmp2);
	out[1] = _mm_movehl_ps(tmp2, tmp0);
	out[2] = _mm_movelh_ps(tmp1, tmp3);
}

FORCE_INLINE void mat3x4_inverse_colorthogonal(const __m128 *mat, __m128 *out)
{
	// Assumes the columns of the 3x3 part are orthogonal, but possibly of
	// different lengths: each lane of 'scale' holds the reciprocal squared
	// length of the corresponding column.
	simd4f scale = rcp_ps(add_ps(mul_ps(mat[0], mat[0]), add_ps(mul_ps(mat[1], mat[1]), mul_ps(mat[2], mat[2]))));

	simd4f tx = wwww_ps(mat[0]);
	simd4f ty = wwww_ps(mat[1]);
	simd4f tz = wwww_ps(mat[2]);
	simd4f m0 = mul_ps(scale, mat[0]);
	simd4f m1 = mul_ps(scale, mat[1]);
	simd4f m2 = mul_ps(scale, mat[2]);
	tx = mul_ps(tx, m0);
	ty = mul_ps(ty, m1);
	tz = mul_ps(tz, m2);
	simd4f vec = negate_ps(add_ps(add_ps(tx, ty), tz));

	simd4f tmp0 = _mm_unpacklo_ps(m0, m1);
	simd4f tmp1 = _mm_unpackhi_ps(m0, m1);
	simd4f tmp2 = _mm_unpacklo_ps(m2, vec);
	simd4f tmp3 = _mm_unpackhi_ps(m2, vec);

	out[0] = _mm_movelh_ps(tmp0, tmp2);
	out[1] = _mm_movehl_ps(tmp2, tmp0);
	out[2] = _mm_movelh_ps(tmp1, tmp3);
}

FORCE_INLINE __m128 NewtonRhapsonRecipStep(__m128 recip, __m128 estimate)
{
	// Refine an estimate e of 1/x with one Newton-Raphson step:
	// e' = e*(2 - x*e) = 2e - x*e^2.
	__m128 e2 = _mm_mul_ps(estimate, estimate);
	return _mm_sub_ps(_mm_add_ps(estimate, estimate), _mm_mul_ps(recip, e2));
}

FORCE_INLINE __m128 NewtonRhapsonRecip(__m128 recip)
{
	__m128 estimate = _mm_rcp_ps(recip);
	return NewtonRhapsonRecipStep(recip, estimate);
}

// Computes the determinant of a 4x4 matrix via Gaussian elimination: the
// first two columns are eliminated, after which the determinant is the
// product of the pivots and the 2x2 determinant of the lower-right block.
inline float mat4x4_determinant(const __m128 *row)
{
	__m128 s = xxxx_ps(NewtonRhapsonRecip(row[0]));
	// Normalize row0 so that its leading element is 1; the eliminated pivot
	// row[0].x is multiplied back into the result at the end.
	__m128 row0 = _mm_mul_ps(s, row[0]);
	s = xxxx_ps(row[1]);
	__m128 row1 = _mm_sub_ps(row[1], _mm_mul_ps(s, row0));
	s = xxxx_ps(row[2]);
	__m128 row2 = _mm_sub_ps(row[2], _mm_mul_ps(s, row0));
	s = xxxx_ps(row[3]);
	__m128 row3 = _mm_sub_ps(row[3], _mm_mul_ps(s, row0));

	// Eliminate the second column below the diagonal.
	s = yyyy_ps(NewtonRhapsonRecip(row1));
	__m128 row1_1 = _mm_mul_ps(s, row1);
	s = yyyy_ps(row2);
	__m128 row2_1 = _mm_sub_ps(row2, _mm_mul_ps(s, row1_1));
	s = yyyy_ps(row3);
	__m128 row3_1 = _mm_sub_ps(row3, _mm_mul_ps(s, row1_1));

	// det = row[0].x * row1.y * (row2_1.z*row3_1.w - row2_1.w*row3_1.z).
	__m128 r1 = shuffle1_ps(row2_1, _MM_SHUFFLE(2,3,1,0));
	__m128 r = _mm_mul_ps(r1, row3_1);
	__m128 a = wwww_ps(r);
	__m128 b = zzzz_ps(r);
	__m128 d1 = _mm_sub_ss(a, b);
	__m128 d2 = row[0];
	__m128 d3 = yyyy_ps(row1);
	__m128 d = _mm_mul_ss(d1, _mm_mul_ss(d2, d3));
	return s4f_x(d);
}
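// Hedged scalar reference (an illustration, not in the original file): the
// cofactor-expansion determinant of a row-major 3x3 block, matching what
// mat3x4_determinant below computes for the 3x3 part of a 3x4 matrix.
inline float example_det3x3_scalar(const float *m) // m: 9 floats, row-major.
{
	return m[0]*(m[4]*m[8] - m[5]*m[7])
	     - m[1]*(m[3]*m[8] - m[5]*m[6])
	     + m[2]*(m[3]*m[7] - m[4]*m[6]);
}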
// Computes the determinant of the 3x3 part of a 3x4 matrix, also via
// Gaussian elimination.
inline float mat3x4_determinant(const __m128 *row)
{
	__m128 s = xxxx_ps(NewtonRhapsonRecip(row[0]));
	// Normalize row0 so that its leading element is 1, then eliminate the
	// first column; the pivot row[0].x is multiplied back in at the end.
	__m128 row0 = _mm_mul_ps(s, row[0]);
	s = xxxx_ps(row[1]);
	__m128 row1 = _mm_sub_ps(row[1], _mm_mul_ps(s, row0));
	s = xxxx_ps(row[2]);
	__m128 row2 = _mm_sub_ps(row[2], _mm_mul_ps(s, row0));

	// det = row[0].x * (row1.y*row2.z - row1.z*row2.y).
	__m128 r1 = shuffle1_ps(row1, _MM_SHUFFLE(3,1,2,0));
	__m128 r = _mm_mul_ps(r1, row2);
	__m128 a = zzzz_ps(r);
	__m128 b = yyyy_ps(r);
	__m128 d1 = _mm_sub_ss(a, b);
	__m128 d2 = row[0];
	__m128 d = _mm_mul_ss(d1, d2);
	return s4f_x(d);
}

inline void mat3x4_transpose(const __m128 *src, __m128 *dst)
{
	// The fourth row of a 3x4 matrix is not stored, so substitute zeroes;
	// the result is the transposed 3x3 part with zeroed w lanes.
	__m128 src3 = _mm_setzero_ps();
	__m128 tmp0 = _mm_unpacklo_ps(src[0], src[1]);
	__m128 tmp2 = _mm_unpacklo_ps(src[2], src3);
	__m128 tmp1 = _mm_unpackhi_ps(src[0], src[1]);
	__m128 tmp3 = _mm_unpackhi_ps(src[2], src3);
	dst[0] = _mm_movelh_ps(tmp0, tmp2);
	dst[1] = _mm_movehl_ps(tmp2, tmp0);
	dst[2] = _mm_movelh_ps(tmp1, tmp3);
}

MATH_END_NAMESPACE

#endif // ~MATH_SSE