#pragma once

#ifdef MATH_SIMD

#ifdef MATH_NEON
#include <arm_neon.h>
#endif

#include "float4_sse.h"

#ifdef ANDROID
/// Adds the scalar f to each of the four channels of the float4 pointed to by
/// 'vec' and stores the result to 'out'. Hand-written ARM32 NEON assembly path.
FORCE_INLINE void vec4_add_float_asm(const void *vec, float f, void *out)
{
	asm(
		"\t vld1.32 {d0, d1}, [%1] \n" // q0 <- *vec (fixed: comma before the address operand was missing)
		"\t vdup.32 q1, %2 \n"         // q1 <- {f,f,f,f} (fixed: %2 is a core register, not a memory operand)
		"\t vadd.f32 q0, q0, q1 \n"    // q0 <- q0 + q1
		"\t vst1.32 {d0, d1}, [%0] \n" // *out <- q0
		:
		: "r"(out), "r"(vec), "r"(f)
		: "memory", "q0", "q1"); // "memory" clobber added: the asm writes through %0 behind the compiler's back.
}
#endif

/// Returns vec + {f,f,f,f}.
FORCE_INLINE simd4f vec4_add_float(simd4f vec, float f)
{
	return add_ps(vec, set1_ps(f));
}

/// Returns the component-wise sum vec + vec2.
FORCE_INLINE simd4f vec4_add_vec4(simd4f vec, simd4f vec2)
{
	return add_ps(vec, vec2);
}

/// Returns vec - {f,f,f,f}.
FORCE_INLINE simd4f vec4_sub_float(simd4f vec, float f)
{
	return sub_ps(vec, set1_ps(f));
}

/// Returns {f,f,f,f} - vec.
FORCE_INLINE simd4f float_sub_vec4(float f, simd4f vec)
{
	return sub_ps(set1_ps(f), vec);
}

/// Returns the component-wise difference vec - vec2.
FORCE_INLINE simd4f vec4_sub_vec4(simd4f vec, simd4f vec2)
{
	return sub_ps(vec, vec2);
}

/// Returns vec * {f,f,f,f}.
FORCE_INLINE simd4f vec4_mul_float(simd4f vec, float f)
{
	return mul_ps(vec, set1_ps(f));
}

/// Returns the component-wise product vec * vec2.
FORCE_INLINE simd4f vec4_mul_vec4(simd4f vec, simd4f vec2)
{
	return mul_ps(vec, vec2);
}

/// Returns vec / {f,f,f,f}. NEON has no divide instruction, so there the
/// quotient is computed as vec * rcp(f), where the reciprocal estimate is
/// sharpened with one Newton-Rhapson iteration.
FORCE_INLINE simd4f vec4_div_float(simd4f vec, float f)
{
#ifdef MATH_SSE
	return div_ps(vec, set1_ps(f));
#elif defined(MATH_NEON)
	simd4f v = set1_ps(f);
	simd4f rcp = vrecpeq_f32(v);
	rcp = mul_ps(vrecpsq_f32(v, rcp), rcp); // One Newton-Rhapson iteration.
	return mul_ps(vec, rcp);
#endif
}

/// Returns {f,f,f,f} / vec, component-wise. See vec4_div_float for the NEON
/// reciprocal refinement scheme.
FORCE_INLINE simd4f float_div_vec4(float f, simd4f vec)
{
#ifdef MATH_SSE
	return div_ps(set1_ps(f), vec);
#elif defined(MATH_NEON)
	simd4f rcp = vrecpeq_f32(vec);
	rcp = mul_ps(vrecpsq_f32(vec, rcp), rcp); // One Newton-Rhapson iteration.
	return mul_ps(set1_ps(f), rcp);
#endif
}

/// Returns the component-wise reciprocal 1/vec, computed from the hardware
/// estimate plus one Newton-Rhapson refinement step on both architectures.
FORCE_INLINE simd4f vec4_recip(simd4f vec)
{
#ifdef MATH_SSE
	simd4f e = _mm_rcp_ps(vec); // Initial estimate of 1/vec.
	// Newton-Rhapson: e' = 2e - vec*e^2 doubles the number of correct bits.
	return sub_ps(_mm_add_ps(e, e), mul_ps(vec, mul_ps(e, e)));
#elif defined(MATH_NEON)
	simd4f rcp = vrecpeq_f32(vec);
	rcp = mul_ps(vrecpsq_f32(vec, rcp), rcp); // One Newton-Rhapson iteration.
	return rcp;
#endif
}

#ifdef MATH_NEON
/// Formats the 16 bytes of the given vector as a hex dump, highest byte first
/// within each half. Debugging aid for the vtbl-based shuffles below.
inline std::string ToString(uint8x8x2_t vec)
{
	uint8_t *v = (uint8_t*)&vec;
	char str[256];
	// snprintf instead of sprintf: bounded write into the stack buffer.
	snprintf(str, sizeof(str), "[%02X, %02X, %02X, %02X, %02X, %02X, %02X, %02X | %02X, %02X, %02X, %02X, %02X, %02X, %02X, %02X]",
		(int)v[15], (int)v[14], (int)v[13], (int)v[12], (int)v[11], (int)v[10], (int)v[9], (int)v[8],
		(int)v[7], (int)v[6], (int)v[5], (int)v[4], (int)v[3], (int)v[2], (int)v[1], (int)v[0]);
	return str;
}
#endif

#if defined(MATH_AVX) || defined(MATH_NEON)
/// Returns { vec[i], vec[j], vec[k], vec[l] }. Precondition: i, j, k and l
/// must each be in the range [0, 3].
FORCE_INLINE simd4f vec4_permute(simd4f vec, int i, int j, int k, int l)
{
#ifdef MATH_AVX
	return _mm_permutevar_ps(vec, _mm_set_epi32(l, k, j, i));
#elif defined(MATH_NEON)
	// NEON has no single-instruction variable permute; emulate with a
	// byte-granularity table lookup (vtbl2): each lane index is expanded to
	// the four byte offsets of that lane.
	const uint8_t I = (uint8_t)i << 2;
	const uint8_t J = (uint8_t)j << 2;
	const uint8_t K = (uint8_t)k << 2;
	const uint8_t L = (uint8_t)l << 2;
	const ALIGN16 uint8_t indexData[16] = { I, I+1, I+2, I+3, J, J+1, J+2, J+3, K, K+1, K+2, K+3, L, L+1, L+2, L+3 };
	uint8x8x2_t indices;
	indices.val[0] = vld1_u8(indexData);
	indices.val[1] = vld1_u8(indexData+8);
	uint8x8x2_t src = *(uint8x8x2_t*)&vec;
	uint8x8x2_t dst;
	dst.val[0] = vtbl2_u8(src, indices.val[0]);
	dst.val[1] = vtbl2_u8(src, indices.val[1]);
	return *(simd4f*)&dst;
#endif
}
#endif

#ifdef MATH_NEON

/// Returns the scalar sum x+y+z+w of the four channels.
FORCE_INLINE float sum_xyzw_float(simd4f vec)
{
	float32x2_t r = vadd_f32(vget_high_f32(vec), vget_low_f32(vec)); // {x+z, y+w}
	return vget_lane_f32(vpadd_f32(r, r), 0); // (x+z)+(y+w)
}

/// Returns the scalar product x*y*z*w of the four channels.
FORCE_INLINE float mul_xyzw_float(simd4f vec)
{
	float32x2_t lo = vget_low_f32(vec);
	float32x2_t hi = vget_high_f32(vec);
	float32x2_t mul = vmul_f32(lo, hi); // {x*z, y*w}
	return vget_lane_f32(mul, 0) * vget_lane_f32(mul, 1);
}

/// Negates the x, y and z channels and leaves w intact, by XORing the sign
/// bits through an integer reinterpretation.
FORCE_INLINE simd4f negate3_ps(simd4f vec)
{
	static const ALIGN16 uint32_t indexData[4] = { 0x80000000UL, 0x80000000UL, 0x80000000UL, 0 };
	static const uint64x2_t mask = vld1q_u64((const uint64_t*)indexData);
	return vreinterpretq_f32_u64(veorq_u64(vreinterpretq_u64_f32(vec), mask));
}

/// Returns the scalar sum x+y+z, ignoring the w channel.
FORCE_INLINE float sum_xyz_float(simd4f vec)
{
	return sum_xyzw_float(vsetq_lane_f32(0.f, vec, 3)); // Zero out w before summing.
}

/// Returns the scalar 4D dot product of a and b.
FORCE_INLINE float dot4_float(simd4f a, simd4f b)
{
	simd4f mul = mul_ps(a, b);
	return sum_xyzw_float(mul);
}

/// Returns the scalar 3D dot product of a and b (w channels are ignored).
FORCE_INLINE float dot3_float(simd4f a, simd4f b)
{
	simd4f mul = mul_ps(a, b);
	return sum_xyz_float(mul);
}

/// Returns the 4D dot product of a and b broadcast to all four channels.
FORCE_INLINE simd4f dot4_ps(simd4f a, simd4f b)
{
	return set1_ps(dot4_float(a, b));
}

/// Returns the 3D dot product of a and b broadcast to all four channels.
FORCE_INLINE simd4f dot3_ps(simd4f a, simd4f b)
{
	return set1_ps(dot3_float(a, b));
}
#define dot3_ps3 dot3_ps

#endif // ~MATH_NEON

#ifdef ANDROID
/// Returns the squared length x*x+y*y+z*z+w*w of *vec as a scalar.
/// Hand-written ARM32 NEON assembly path.
FORCE_INLINE float vec4_length_sq_float_asm(const simd4f *vec)
{
	float len;
	asm(
		"\t vld1.32 {d0, d1}, [%1] \n" // q0 <- *vec
		"\t vmul.f32 q0, q0, q0 \n"    // q0 <- q0*q0 (per-channel squares)
		"\t vpadd.f32 d0, d0, d1 \n"   // d0 <- {x^2+y^2, z^2+w^2}
		"\t vadd.f32 %0, s0, s1 \n"    // len <- (x^2+y^2)+(z^2+w^2)
		: "=w"(len)
		: "r"(vec)
		: "q0");
	return len;
}

/// Computes the squared length of *vec and stores it broadcast to all four
/// channels of *out. Hand-written ARM32 NEON assembly path.
FORCE_INLINE void vec4_length_sq_ps_asm(const simd4f *vec, simd4f *out)
{
	asm(
		"\t vld1.32 {d0, d1}, [%1] \n" // q0 <- *vec
		"\t vmul.f32 q0, q0, q0 \n"    // Per-channel squares.
		"\t vpadd.f32 d0, d0, d1 \n"   // d0 <- {x^2+y^2, z^2+w^2}
		"\t vrev64.32 d1, d0 \n"       // d1 <- d0 with the pair swapped.
		"\t vadd.f32 d0, d0, d1 \n"    // Both lanes of d0 now hold the full sum.
		"\t vmov.f32 d1, d0 \n"        // Broadcast to the upper half too.
		"\t vst1.32 {d0, d1}, [%0] \n" // *out <- q0
		:
		: "r"(out), "r"(vec)
		: "memory", "q0");
}
#endif

/// Returns the squared 4D length of vec as a scalar.
FORCE_INLINE float vec4_length_sq_float(simd4f vec)
{
	return dot4_float(vec, vec);
}

/// Returns the squared 4D length of vec broadcast to all four channels.
FORCE_INLINE simd4f vec4_length_sq_ps(simd4f vec)
{
	return dot4_ps(vec, vec);
}

/// Returns the squared 3D length of vec (w ignored) as a scalar.
FORCE_INLINE float vec3_length_sq_float(simd4f vec)
{
	return dot3_float(vec, vec);
}

/// Returns the squared 3D length of vec (w ignored) broadcast to all channels.
FORCE_INLINE simd4f vec3_length_sq_ps(simd4f vec)
{
	return dot3_ps(vec, vec);
}

/// Returns the component-wise reciprocal square root 1/sqrt(vec), computed
/// from the hardware estimate plus one Newton-Rhapson refinement step.
FORCE_INLINE simd4f vec4_rsqrt(simd4f vec)
{
#ifdef MATH_SSE
	simd4f e = _mm_rsqrt_ps(vec); // Initial estimate of 1/sqrt(vec).
	simd4f e3 = mul_ps(mul_ps(e, e), e); // e^3
	// Newton-Rhapson: e' = e + 0.5*(e - vec*e^3).
	return _mm_add_ps(e, mul_ps(set1_ps(0.5f), sub_ps(e, mul_ps(vec, e3))));
#elif defined(MATH_NEON)
	float32x4_t r = vrsqrteq_f32(vec);
	return mul_ps(vrsqrtsq_f32(mul_ps(r, r), vec), r); // One Newton-Rhapson iteration.
#endif
}

/// Returns the component-wise square root of vec.
FORCE_INLINE simd4f vec4_sqrt(simd4f vec)
{
#ifdef MATH_SSE
	return _mm_sqrt_ps(vec);
#else
	// No hardware sqrt on NEON; use x*rsqrt(x).
	// NOTE(review): this path yields NaN for zero input channels (0 * inf) —
	// confirm callers never pass exact zeros, or clamp before calling.
	return mul_ps(vec, vec4_rsqrt(vec));
#endif
}

/// Returns the 4D length of vec as a scalar.
FORCE_INLINE float vec4_length_float(simd4f vec)
{
	return s4f_x(vec4_sqrt(dot4_ps(vec, vec)));
}

/// Returns the 4D length of vec broadcast to all four channels.
FORCE_INLINE simd4f vec4_length_ps(simd4f vec)
{
	return vec4_sqrt(dot4_ps(vec, vec));
}

/// Returns vec scaled to unit 4D length. Precondition: vec must be non-zero.
FORCE_INLINE simd4f vec4_normalize(simd4f vec)
{
	return mul_ps(vec, vec4_rsqrt(vec4_length_sq_ps(vec)));
}

/// Returns the 3D length of vec (w ignored) as a scalar.
FORCE_INLINE float vec3_length_float(simd4f vec)
{
	return s4f_x(vec4_sqrt(dot3_ps3(vec, vec)));
}

/// Returns the 3D length of vec (w ignored) broadcast to all four channels.
FORCE_INLINE simd4f vec3_length_ps(simd4f vec)
{
	return vec4_sqrt(dot3_ps(vec, vec));
}

/// Returns the 3D length of vec via the dot3_ps3 variant of the dot product.
FORCE_INLINE simd4f vec3_length_ps3(simd4f vec)
{
	return vec4_sqrt(dot3_ps3(vec, vec));
}

/// Returns vec scaled to unit 3D length. Precondition: the xyz part of vec
/// must be non-zero.
FORCE_INLINE simd4f vec3_normalize(simd4f vec)
{
	return mul_ps(vec, vec4_rsqrt(vec3_length_sq_ps(vec)));
}

/// Linearly interpolates between a and b with the per-channel weight t:
/// returns a + t*(b-a), i.e. a when t==0 and b when t==1.
FORCE_INLINE simd4f vec4_lerp(simd4f a, simd4f b, simd4f t)
{
	return add_ps(a, mul_ps(t, sub_ps(b, a)));
}

/// Linearly interpolates between a and b with the scalar weight t.
FORCE_INLINE simd4f vec4_lerp(simd4f a, simd4f b, float t)
{
	return vec4_lerp(a, b, set1_ps(t));
}

#endif // ~MATH_SIMD