#pragma once

#ifdef MATH_SIMD

#include "SSEMath.h"
#include "float4_neon.h"
#include "float4x4_sse.h"

#if !defined(ANDROID)
// Multiplies mat * vec, where mat is a matrix in row-major format.
FORCE_INLINE simd4f mat4x4_mul_vec4(const simd4f *mat, simd4f vec)
{
#ifdef MATH_NEON
	// vld4q_f32 deinterleaves the rows, so m.val[i] holds column i of mat.
	float32x4x4_t m = vld4q_f32((const float32_t*)mat);
	simd4f ret = vmulq_lane_f32(m.val[0], vget_low_f32(vec), 0);
	ret = vmlaq_lane_f32(ret, m.val[1], vget_low_f32(vec), 1);
	ret = vmlaq_lane_f32(ret, m.val[2], vget_high_f32(vec), 0);
	return vmlaq_lane_f32(ret, m.val[3], vget_high_f32(vec), 1);
#elif defined(MATH_SSE3)
	return mat4x4_mul_sse3(mat, vec);
#else
	return mat4x4_mul_sse(mat, vec);
#endif
}
#endif

// Multiplies vec * mat, treating vec as a row vector and mat as a matrix in row-major format.
FORCE_INLINE simd4f vec4_mul_mat4x4(simd4f vec, const simd4f *mat)
{
#ifdef MATH_NEON
	simd4f ret = vmulq_lane_f32(mat[0], vget_low_f32(vec), 0);
	ret = vmlaq_lane_f32(ret, mat[1], vget_low_f32(vec), 1);
	ret = vmlaq_lane_f32(ret, mat[2], vget_high_f32(vec), 0);
	return vmlaq_lane_f32(ret, mat[3], vget_high_f32(vec), 1);
#else
	return colmajor_mat4x4_mul_sse1(mat, vec);
#endif
}

// Computes out = m1 * m2, where all matrices are in row-major format.
FORCE_INLINE void mat4x4_mul_mat4x4(simd4f *out, const simd4f *m1, const simd4f *m2)
{
#if defined(MATH_NEON)
	// Each output row is a linear combination of the rows of m2, weighted by the
	// elements of the corresponding row of m1.
	simd4f r1 = vmulq_lane_f32(m2[0], vget_low_f32(m1[0]), 0);
	simd4f r2 = vmulq_lane_f32(m2[0], vget_low_f32(m1[1]), 0);
	simd4f r3 = vmulq_lane_f32(m2[0], vget_low_f32(m1[2]), 0);
	simd4f r4 = vmulq_lane_f32(m2[0], vget_low_f32(m1[3]), 0);

	r1 = vmlaq_lane_f32(r1, m2[1], vget_low_f32(m1[0]), 1);
	r2 = vmlaq_lane_f32(r2, m2[1], vget_low_f32(m1[1]), 1);
	r3 = vmlaq_lane_f32(r3, m2[1], vget_low_f32(m1[2]), 1);
	r4 = vmlaq_lane_f32(r4, m2[1], vget_low_f32(m1[3]), 1);

	r1 = vmlaq_lane_f32(r1, m2[2], vget_high_f32(m1[0]), 0);
	r2 = vmlaq_lane_f32(r2, m2[2], vget_high_f32(m1[1]), 0);
	r3 = vmlaq_lane_f32(r3, m2[2], vget_high_f32(m1[2]), 0);
	r4 = vmlaq_lane_f32(r4, m2[2], vget_high_f32(m1[3]), 0);

	r1 = vmlaq_lane_f32(r1, m2[3], vget_high_f32(m1[0]), 1);
	r2 = vmlaq_lane_f32(r2, m2[3], vget_high_f32(m1[1]), 1);
	r3 = vmlaq_lane_f32(r3, m2[3], vget_high_f32(m1[2]), 1);
	r4 = vmlaq_lane_f32(r4, m2[3], vget_high_f32(m1[3]), 1);

	out[0] = r1;
	out[1] = r2;
	out[2] = r3;
	out[3] = r4;
#else
	mat4x4_mul_sse(out, m1, m2);
#endif
}

#ifdef ANDROID
// Computes out = m1 * m2 with NEON inline assembly (Android/ARM builds).
FORCE_INLINE void mat4x4_mul_mat4x4_asm(simd4f *out, const simd4f *m1, const simd4f *m2)
{
	asm(
		"\t vldmia %1, {q4-q7} \n"
		"\t vldmia %2, {q8-q11} \n"
		"\t vmul.f32 q0, q8, d8[0] \n"
		"\t vmul.f32 q1, q8, d10[0] \n"
		"\t vmul.f32 q2, q8, d12[0] \n"
		"\t vmul.f32 q3, q8, d14[0] \n"
		"\t vmla.f32 q0, q9, d8[1] \n"
		"\t vmla.f32 q1, q9, d10[1] \n"
		"\t vmla.f32 q2, q9, d12[1] \n"
		"\t vmla.f32 q3, q9, d14[1] \n"
		"\t vmla.f32 q0, q10, d9[0] \n"
		"\t vmla.f32 q1, q10, d11[0] \n"
		"\t vmla.f32 q2, q10, d13[0] \n"
		"\t vmla.f32 q3, q10, d15[0] \n"
		"\t vmla.f32 q0, q11, d9[1] \n"
		"\t vmla.f32 q1, q11, d11[1] \n"
		"\t vmla.f32 q2, q11, d13[1] \n"
		"\t vmla.f32 q3, q11, d15[1] \n"
		"\t vstmia %0, {q0-q3} \n"
		: /* no outputs */
		: "r"(out), "r"(m1), "r"(m2)
		: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
}
#endif

#if !defined(ANDROID)
// Transposes the 4x4 matrix mat into out.
FORCE_INLINE void mat4x4_transpose(simd4f *out, const simd4f *mat)
{
#ifdef MATH_NEON
	// vld4q_f32 deinterleaves the rows, which performs the transpose as part of the load.
	float32x4x4_t m = vld4q_f32((const float32_t*)mat);
	vst1q_f32((float32_t*)out, m.val[0]);
	vst1q_f32((float32_t*)out+4, m.val[1]);
	vst1q_f32((float32_t*)out+8, m.val[2]);
	vst1q_f32((float32_t*)out+12, m.val[3]);
#else
#ifdef MATH_AVX
	__m128 tmp0 = _mm_shuffle_ps(mat[0], mat[1], 0x44);
	__m128 tmp2 = _mm_shuffle_ps(mat[0], mat[1], 0xEE);
	__m128 tmp1 = _mm_shuffle_ps(mat[2], mat[3], 0x44);
	__m128 tmp3 = _mm_shuffle_ps(mat[2], mat[3], 0xEE);
	out[0] = _mm_shuffle_ps(tmp0, tmp1, 0x88);
	out[1] = _mm_shuffle_ps(tmp0, tmp1, 0xDD);
	out[2] = _mm_shuffle_ps(tmp2, tmp3, 0x88);
	out[3] = _mm_shuffle_ps(tmp2, tmp3, 0xDD);
#else
	__m128 tmp0 = _mm_unpacklo_ps(mat[0], mat[1]);
	__m128 tmp2 = _mm_unpacklo_ps(mat[2], mat[3]);
	__m128 tmp1 = _mm_unpackhi_ps(mat[0], mat[1]);
	__m128 tmp3 = _mm_unpackhi_ps(mat[2], mat[3]);
	out[0] = _mm_movelh_ps(tmp0, tmp2);
	out[1] = _mm_movehl_ps(tmp2, tmp0);
	out[2] = _mm_movelh_ps(tmp1, tmp3);
	out[3] = _mm_movehl_ps(tmp3, tmp1);
#endif
#endif
}
#endif

// Sets the 16 elements of mat, given in row-major order.
FORCE_INLINE void mat4x4_set(simd4f *mat, float _00, float _01, float _02, float _03,
                             float _10, float _11, float _12, float _13,
                             float _20, float _21, float _22, float _23,
                             float _30, float _31, float _32, float _33)
{
#ifdef MATH_AVX
	__m256 *mat2 = (__m256*)mat;
	mat2[0] = _mm256_set_ps(_13, _12, _11, _10, _03, _02, _01, _00);
	mat2[1] = _mm256_set_ps(_33, _32, _31, _30, _23, _22, _21, _20);
#else
	mat[0] = set_ps(_03, _02, _01, _00);
	mat[1] = set_ps(_13, _12, _11, _10);
	mat[2] = set_ps(_23, _22, _21, _20);
	mat[3] = set_ps(_33, _32, _31, _30);
#endif
}

// Multiplies each element of mat by the given scalar.
FORCE_INLINE void mat4x4_mul_float(simd4f *out, const simd4f *mat, float scalar)
{
#ifdef MATH_AVX
	__m256 s = _mm256_set1_ps(scalar);
	__m256 *o = (__m256*)out;
	__m256 *i = (__m256*)mat;
	o[0] = _mm256_mul_ps(i[0], s);
	o[1] = _mm256_mul_ps(i[1], s);
#else
	simd4f v = set1_ps(scalar);
	out[0] = mul_ps(mat[0], v);
	out[1] = mul_ps(mat[1], v);
	out[2] = mul_ps(mat[2], v);
	out[3] = mul_ps(mat[3], v);
#endif
}

// Divides each element of mat by the given scalar, implemented as a multiplication by the reciprocal.
FORCE_INLINE void mat4x4_div_float(simd4f *out, const simd4f *mat, float scalar)
{
#ifdef MATH_AVX
	__m256 *o = (__m256*)out;
	__m256 *i = (__m256*)mat;
	__m256 s = _mm256_set1_ps(scalar);
	__m256 one = _mm256_set1_ps(1.f);
	s = _mm256_div_ps(one, s);
	o[0] = _mm256_mul_ps(i[0], s);
	o[1] = _mm256_mul_ps(i[1], s);
#else
	simd4f s = set1_ps(scalar);
	simd4f one = set1_ps(1.f);
	s = div_ps(one, s);
	out[0] = mul_ps(mat[0], s);
	out[1] = mul_ps(mat[1], s);
	out[2] = mul_ps(mat[2], s);
	out[3] = mul_ps(mat[3], s);
#endif
}

// Adds the two matrices elementwise: out = m1 + m2.
FORCE_INLINE void mat4x4_add_mat4x4(simd4f *out, const simd4f *m1, const simd4f *m2)
{
#ifdef MATH_AVX
	__m256 *o = (__m256*)out;
	__m256 *i1 = (__m256*)m1;
	__m256 *i2 = (__m256*)m2;
	o[0] = _mm256_add_ps(i1[0], i2[0]);
	o[1] = _mm256_add_ps(i1[1], i2[1]);
#else
	out[0] = add_ps(m1[0], m2[0]);
	out[1] = add_ps(m1[1], m2[1]);
	out[2] = add_ps(m1[2], m2[2]);
	out[3] = add_ps(m1[3], m2[3]);
#endif
}

// Subtracts the two matrices elementwise: out = m1 - m2.
FORCE_INLINE void mat4x4_sub_mat4x4(simd4f *out, const simd4f *m1, const simd4f *m2)
{
#ifdef MATH_AVX
	__m256 *o = (__m256*)out;
	__m256 *i1 = (__m256*)m1;
	__m256 *i2 = (__m256*)m2;
	o[0] = _mm256_sub_ps(i1[0], i2[0]);
	o[1] = _mm256_sub_ps(i1[1], i2[1]);
#else
	out[0] = sub_ps(m1[0], m2[0]);
	out[1] = sub_ps(m1[1], m2[1]);
	out[2] = sub_ps(m1[2], m2[2]);
	out[3] = sub_ps(m1[3], m2[3]);
#endif
}
// Negates each element of mat.
FORCE_INLINE void mat4x4_negate(simd4f *out, const simd4f *mat)
{
#ifdef MATH_AVX
	__m256 zero = _mm256_setzero_ps();
	__m256 *o = (__m256*)out;
	__m256 *m = (__m256*)mat;
	o[0] = _mm256_sub_ps(zero, m[0]);
	o[1] = _mm256_sub_ps(zero, m[1]);
#else
	out[0] = negate_ps(mat[0]);
	out[1] = negate_ps(mat[1]);
	out[2] = negate_ps(mat[2]);
	out[3] = negate_ps(mat[3]);
#endif
}

#endif
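
/* Usage sketch (illustrative, not part of the library): a minimal example of how the
   helpers above compose. It assumes a MATH_SIMD build on a non-Android target (so that
   mat4x4_mul_vec4 is available) and that simd4f and set_ps come from the headers
   included at the top of this file. Matrices are stored as four simd4f rows in
   row-major order; mat4x4_set takes its 16 scalars in natural row-major reading order,
   while set_ps itself takes its four scalars highest-lane-first (w, z, y, x), matching
   _mm_set_ps.

	simd4f T[4], S[4], M[4];
	mat4x4_set(T, 1,0,0,2,   // translation by (2, 3, 4)
	              0,1,0,3,
	              0,0,1,4,
	              0,0,0,1);
	mat4x4_set(S, 5,0,0,0,   // uniform scale by 5
	              0,5,0,0,
	              0,0,5,0,
	              0,0,0,1);
	mat4x4_mul_mat4x4(M, T, S);            // M = T * S: scale first, then translate.
	simd4f p = set_ps(1.f, 0.f, 0.f, 1.f); // point (x, y, z, w) = (1, 0, 0, 1), given as (w, z, y, x)
	simd4f q = mat4x4_mul_vec4(M, p);      // q = M * p = (7, 3, 4, 1)
*/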