// SSE2 implementations of log, exp, sin, cos and sincos, each operating on four packed floats.
// NOTE(review): the comment header that originally preceded this code (and any license text
// it carried) is missing from this copy - restore it from the upstream file before shipping.
// NOTE(review): the "cephes" constant names suggest the coefficients come from the Cephes
// math library - confirm against upstream.
#ifdef MATH_SSE2

#include "SSEMath.h"

// ---- Constants shared by all functions -------------------------------------------------

static const __m128 _ps_1 = _mm_set1_ps(1.f);
static const __m128 _ps_0p5 = _mm_set1_ps(0.5f);
// 0x00800000 is the bit pattern of the smallest positive normalized float.
static const __m128 _ps_min_norm_pos = set1_ps_hex(0x00800000);
// NOTE: despite its name, 0x7f800000 covers the eight exponent bits of a float.
static const __m128 _ps_mant_mask = set1_ps_hex(0x7f800000);
static const __m128 _ps_inv_mant_mask = set1_ps_hex(~0x7f800000);
static const __m128 _ps_sign_mask = set1_ps_hex(0x80000000);
static const __m128 _ps_inv_sign_mask = set1_ps_hex(~0x80000000);

static const __m128i _pi32_1 = _mm_set1_epi32(1);
static const __m128i _pi32_inv1 = _mm_set1_epi32(~1);
static const __m128i _pi32_2 = _mm_set1_epi32(2);
static const __m128i _pi32_4 = _mm_set1_epi32(4);
static const __m128i _pi32_0x7f = _mm_set1_epi32(0x7f);

// ---- log_ps ----------------------------------------------------------------------------

static const __m128 _ps_cephes_SQRTHF = _mm_set1_ps(0.707106781186547524f); // sqrt(0.5)
static const __m128 _ps_cephes_log_p0 = _mm_set1_ps(7.0376836292E-2f);
static const __m128 _ps_cephes_log_p1 = _mm_set1_ps(-1.1514610310E-1f);
static const __m128 _ps_cephes_log_p2 = _mm_set1_ps(1.1676998740E-1f);
static const __m128 _ps_cephes_log_p3 = _mm_set1_ps(-1.2420140846E-1f);
static const __m128 _ps_cephes_log_p4 = _mm_set1_ps(+1.4249322787E-1f);
static const __m128 _ps_cephes_log_p5 = _mm_set1_ps(-1.6668057665E-1f);
static const __m128 _ps_cephes_log_p6 = _mm_set1_ps(+2.0000714765E-1f);
static const __m128 _ps_cephes_log_p7 = _mm_set1_ps(-2.4999993993E-1f);
static const __m128 _ps_cephes_log_p8 = _mm_set1_ps(+3.3333331174E-1f);
// q1 + q2 sums to approximately ln(2); the split lets e*ln(2) be added in two steps.
static const __m128 _ps_cephes_log_q1 = _mm_set1_ps(-2.12194440e-4f);
static const __m128 _ps_cephes_log_q2 = _mm_set1_ps(0.693359375f);

/// Computes the natural logarithm of each of the four floats in x.
/// Lanes where x <= 0 come back with every bit set, which is a NaN bit pattern.
FORCE_INLINE __m128 log_ps(__m128 x)
{
    __m128 one = _ps_1;

    // Remember which lanes are <= 0 so they can be forced to NaN at the very end.
    __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

    // Clamp to the smallest normalized positive float so the exponent extraction
    // below never sees zero, a negative value or a denormal.
    x = _mm_max_ps(x, _ps_min_norm_pos);

    // Pull the biased exponent field down into the low bits of each lane.
    __m128i emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);

    // Clear the exponent bits and substitute the exponent of 0.5,
    // which leaves the mantissa of x rescaled into [0.5, 1).
    x = _mm_and_ps(x, _ps_inv_mant_mask);
    x = _mm_or_ps(x, _ps_0p5);

    // Remove the exponent bias (0x7f), convert to float and add one.
    emm0 = _mm_sub_epi32(emm0, _pi32_0x7f);
    __m128 e = _mm_cvtepi32_ps(emm0);
    e = _mm_add_ps(e, one);

    // Branch-free form of:
    //   if (x < SQRTHF) { e -= 1; x = x + x - 1; } else { x = x - 1; }
    __m128 mask = _mm_cmplt_ps(x, _ps_cephes_SQRTHF);
    __m128 tmp = _mm_and_ps(x, mask);
    x = _mm_sub_ps(x, one);
    e = _mm_sub_ps(e, _mm_and_ps(one, mask));
    x = _mm_add_ps(x, tmp);

    __m128 z = _mm_mul_ps(x,x);

    // Degree-8 polynomial in x, evaluated with Horner's scheme.
    __m128 y = _ps_cephes_log_p0;
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _ps_cephes_log_p1);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _ps_cephes_log_p2);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _ps_cephes_log_p3);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _ps_cephes_log_p4);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _ps_cephes_log_p5);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _ps_cephes_log_p6);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _ps_cephes_log_p7);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _ps_cephes_log_p8);

    // Scale the polynomial by x^3.
    y = _mm_mul_ps(y, _mm_mul_ps(x, z));

    // Add the small part of e*ln(2), subtract x^2/2, then add x and the large part of e*ln(2).
    tmp = _mm_mul_ps(e, _ps_cephes_log_q1);
    y = _mm_add_ps(y, tmp);
    tmp = _mm_mul_ps(z, _ps_0p5);
    y = _mm_sub_ps(y, tmp);
    tmp = _mm_mul_ps(e, _ps_cephes_log_q2);
    x = _mm_add_ps(x, _mm_add_ps(y, tmp));

    // OR-ing with the all-ones mask turns the lanes that had x <= 0 into NaN.
    x = _mm_or_ps(x, invalid_mask);
    return x;
}

// ---- exp_ps ----------------------------------------------------------------------------

static const __m128 _ps_exp_hi = _mm_set1_ps(88.3762626647949f);
static const __m128 _ps_exp_lo = _mm_set1_ps(-88.3762626647949f);

static const __m128 _ps_cephes_LOG2EF = _mm_set1_ps(1.44269504088896341f); // log2(e)
// C1 + C2 sums to approximately ln(2); the split keeps the range reduction accurate.
static const __m128 _ps_cephes_exp_C1 = _mm_set1_ps(0.693359375f);
static const __m128 _ps_cephes_exp_C2 = _mm_set1_ps(-2.12194440e-4f);

static const __m128 _ps_cephes_exp_p0 = _mm_set1_ps(1.9875691500E-4f);
static const __m128 _ps_cephes_exp_p1 = _mm_set1_ps(1.3981999507E-3f);
static const __m128 _ps_cephes_exp_p2 = _mm_set1_ps(8.3334519073E-3f);
static const __m128 _ps_cephes_exp_p3 = _mm_set1_ps(4.1665795894E-2f);
static const __m128 _ps_cephes_exp_p4 = _mm_set1_ps(1.6666665459E-1f);
static const __m128 _ps_cephes_exp_p5 = _mm_set1_ps(5.0000001201E-1f);

/// Computes e^x for each of the four floats in x.
/// The input is first clamped to [_ps_exp_lo, _ps_exp_hi].
FORCE_INLINE __m128 exp_ps(__m128 x)
{
    __m128 one = _ps_1;

    x = _mm_min_ps(x, _ps_exp_hi);
    x = _mm_max_ps(x, _ps_exp_lo);

    // fx = x * log2(e) + 0.5; its floor (computed below) is the power-of-two exponent n.
    __m128 fx = _mm_mul_ps(x, _ps_cephes_LOG2EF);
    fx = _mm_add_ps(fx, _ps_0p5);

    // floor(fx) without SSE4.1: truncate towards zero, then subtract one in the
    // lanes where truncation rounded up (i.e. negative non-integers).
    __m128i emm0 = _mm_cvttps_epi32(fx);
    __m128 tmp = _mm_cvtepi32_ps(emm0);

    __m128 mask = _mm_cmpgt_ps(tmp, fx);
    mask = _mm_and_ps(mask, one);
    fx = _mm_sub_ps(tmp, mask);

    // Range reduction: x -= n * ln(2), with ln(2) applied as C1 + C2.
    tmp = _mm_mul_ps(fx, _ps_cephes_exp_C1);
    __m128 z = _mm_mul_ps(fx, _ps_cephes_exp_C2);
    x = _mm_sub_ps(x, _mm_add_ps(tmp, z));

    z = _mm_mul_ps(x,x);

    // Degree-5 polynomial in the reduced x (Horner), then y = poly * x^2 + x + 1.
    __m128 y = _ps_cephes_exp_p0;
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _ps_cephes_exp_p1);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _ps_cephes_exp_p2);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _ps_cephes_exp_p3);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _ps_cephes_exp_p4);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _ps_cephes_exp_p5);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, _mm_add_ps(x, one));

    // Build 2^n directly as a float bit pattern: (n + 127) placed in the exponent field.
    emm0 = _mm_cvttps_epi32(fx);
    emm0 = _mm_add_epi32(emm0, _pi32_0x7f);
    emm0 = _mm_slli_epi32(emm0, 23);
    __m128 pow2n = _mm_castsi128_ps(emm0);

    y = _mm_mul_ps(y, pow2n);
    return y;
}

// ---- sin_ps / cos_ps / sincos_ps -------------------------------------------------------

// -DP1 - DP2 - DP3 sums to approximately pi/4, split into three parts of decreasing
// magnitude so that the argument reduction loses as little precision as possible.
static const __m128 _ps_minus_cephes_DP1 = _mm_set1_ps(-0.78515625f);
static const __m128 _ps_minus_cephes_DP2 = _mm_set1_ps(-2.4187564849853515625e-4f);
static const __m128 _ps_minus_cephes_DP3 = _mm_set1_ps(-3.77489497744594108e-8f);
static const __m128 _ps_sincof_p0 = _mm_set1_ps(-1.9515295891E-4f);
static const __m128 _ps_sincof_p1 = _mm_set1_ps( 8.3321608736E-3f);
static const __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611E-1f);
static const __m128 _ps_coscof_p0 = _mm_set1_ps( 2.443315711809948E-005f);
static const __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765E-003f);
static const __m128 _ps_coscof_p2 = _mm_set1_ps( 4.166664568298827E-002f);
static const __m128 _ps_cephes_FOPI = _mm_set1_ps(1.27323954473516f); // 4 / pi

/// Computes the sine of each of the four floats in x (radians).
/// No range wrapping is performed here (the modf_ps call below is compiled out),
/// so callers with unbounded inputs are expected to reduce them first.
FORCE_INLINE __m128 sin_ps(__m128 x)
{
#if 0 // Currently expect user to round manually if he knows input can be unbounded.
#ifdef MATH_SSE41 // _mm_round_ps is SSE4.1

    x = modf_ps(x, _mm_set1_ps(2.f*pi));
#endif
#endif

    // Split x into its sign bit and its absolute value.
    __m128 sign_bit = _mm_and_ps(x, _ps_sign_mask);
    x = _mm_xor_ps(x, sign_bit);

    // Scale by 4/pi.
    __m128 y = _mm_mul_ps(x, _ps_cephes_FOPI);

    // j = (int)y, then j = (j + 1) & ~1 rounds j up to the next even integer.
    __m128i emm2 = _mm_cvttps_epi32(y);
    emm2 = _mm_add_epi32(emm2, _pi32_1);
    emm2 = _mm_and_si128(emm2, _pi32_inv1);
    y = _mm_cvtepi32_ps(emm2);

    // Bit 2 of j decides whether the result sign flips; shifting left by 29
    // moves that bit into the float sign-bit position (bit 31).
    __m128i emm0 = _mm_and_si128(emm2, _pi32_4);
    emm0 = _mm_slli_epi32(emm0, 29);

    // Polynomial selector: all ones where (j & 2) == 0, all zeros otherwise.
    emm2 = _mm_and_si128(emm2, _pi32_2);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

    __m128 swap_sign_bit = _mm_castsi128_ps(emm0);
    __m128 poly_mask = _mm_castsi128_ps(emm2);
    sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

    // Argument reduction: x = x - j * (DP1 + DP2 + DP3), applied in three pieces.
    __m128 xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1);
    __m128 xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2);
    __m128 xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3);
    x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3));

    // First polynomial (cosine coefficients): 1 - z/2 + z^2 * P(z), with z = x^2.
    y = _ps_coscof_p0;
    __m128 z = _mm_mul_ps(x,x);

    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, _ps_coscof_p1);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, _ps_coscof_p2);
    y = _mm_mul_ps(y, _mm_mul_ps(z, z));
    __m128 tmp = _mm_mul_ps(z, _ps_0p5);
    y = _mm_sub_ps(y, tmp);
    y = _mm_add_ps(y, _ps_1);

    // Second polynomial (sine coefficients): x + x * z * Q(z).
    __m128 y2 = _ps_sincof_p0;
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, _ps_sincof_p1);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, _ps_sincof_p2);
    y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x));
    y2 = _mm_add_ps(y2, x);

    // Per lane, keep the sine polynomial where poly_mask is set and the cosine
    // polynomial elsewhere; the two masked values are disjoint, so adding merges them.
    xmm3 = poly_mask;
    y2 = _mm_and_ps(xmm3, y2);
    y = _mm_andnot_ps(xmm3, y);
    y = _mm_add_ps(y,y2);

    // Apply the final sign.
    y = _mm_xor_ps(y, sign_bit);
    return y;
}

/// Computes the cosine of each of the four floats in x (radians).
/// No range wrapping is performed here (the modf_ps call below is compiled out),
/// so callers with unbounded inputs are expected to reduce them first.
FORCE_INLINE __m128 cos_ps(__m128 x)
{

#if 0 // Currently expect user to round manually if he knows input can be unbounded.
#ifdef MATH_SSE41 // _mm_round_ps is SSE4.1

    x = modf_ps(x, _mm_set1_ps(2.f*pi));
#endif
#endif

    // Take the absolute value by clearing the sign bit.
    x = _mm_and_ps(x, _ps_inv_sign_mask);

    // Scale by 4/pi.
    __m128 y = _mm_mul_ps(x, _ps_cephes_FOPI);

    // j = (int)y, then j = (j + 1) & ~1 rounds j up to the next even integer.
    __m128i emm2 = _mm_cvttps_epi32(y);
    emm2 = _mm_add_epi32(emm2, _pi32_1);
    emm2 = _mm_and_si128(emm2, _pi32_inv1);
    y = _mm_cvtepi32_ps(emm2);

    // The sign and polynomial selection for cosine use j - 2.
    emm2 = _mm_sub_epi32(emm2, _pi32_2);

    // Sign bit is set where bit 2 of (j - 2) is CLEAR; the shift moves it to bit 31.
    __m128i emm0 = _mm_andnot_si128(emm2, _pi32_4);
    emm0 = _mm_slli_epi32(emm0, 29);

    // Polynomial selector: all ones where ((j - 2) & 2) == 0, all zeros otherwise.
    emm2 = _mm_and_si128(emm2, _pi32_2);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

    __m128 sign_bit = _mm_castsi128_ps(emm0);
    __m128 poly_mask = _mm_castsi128_ps(emm2);

    // Argument reduction: x = x - j * (DP1 + DP2 + DP3), applied in three pieces.
    __m128 xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1);
    __m128 xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2);
    __m128 xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3);
    x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3));

    // First polynomial (cosine coefficients): 1 - z/2 + z^2 * P(z), with z = x^2.
    y = _ps_coscof_p0;
    __m128 z = _mm_mul_ps(x,x);

    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, _ps_coscof_p1);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, _ps_coscof_p2);
    y = _mm_mul_ps(y, _mm_mul_ps(z, z));
    __m128 tmp = _mm_mul_ps(z, _ps_0p5);
    y = _mm_sub_ps(y, tmp);
    y = _mm_add_ps(y, _ps_1);

    // Second polynomial (sine coefficients): x + x * z * Q(z).
    __m128 y2 = _ps_sincof_p0;
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, _ps_sincof_p1);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, _ps_sincof_p2);
    y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x));
    y2 = _mm_add_ps(y2, x);

    // Per lane, keep the sine polynomial where poly_mask is set and the cosine
    // polynomial elsewhere; the two masked values are disjoint, so adding merges them.
    xmm3 = poly_mask;
    y2 = _mm_and_ps(xmm3, y2);
    y = _mm_andnot_ps(xmm3, y);
    y = _mm_add_ps(y,y2);

    // Apply the final sign.
    y = _mm_xor_ps(y, sign_bit);

    return y;
}

/// Computes both the sine and the cosine of each of the four floats in x (radians),
/// sharing the argument reduction and the two polynomial evaluations.
/// @param x Input angles.
/// @param s [out] Receives the four sines.
/// @param c [out] Receives the four cosines.
/// No range wrapping is performed here (the modf_ps call below is compiled out).
FORCE_INLINE void sincos_ps(__m128 x, __m128 *s, __m128 *c)
{
#if 0
#ifdef MATH_SSE41 // _mm_round_ps is SSE4.1

    x = modf_ps(x, _mm_set1_ps(2.f*3.141592654f));
#endif
#endif

    // Split x into its sign bit (needed for the sine only) and its absolute value.
    __m128 sign_bit_sin = _mm_and_ps(x, _ps_sign_mask);
    x = _mm_xor_ps(x, sign_bit_sin);

    // Scale by 4/pi.
    __m128 y = _mm_mul_ps(x, _ps_cephes_FOPI);

    // j = (int)y, then j = (j + 1) & ~1 rounds j up to the next even integer.
    __m128i emm2 = _mm_cvttps_epi32(y);
    emm2 = _mm_add_epi32(emm2, _pi32_1);
    emm2 = _mm_and_si128(emm2, _pi32_inv1);
    y = _mm_cvtepi32_ps(emm2);

    // Keep a copy of j for the cosine sign computed further down.
    __m128i emm4 = emm2;

    // Sine sign flip: bit 2 of j moved into the float sign-bit position (bit 31).
    __m128i emm0 = _mm_and_si128(emm2, _pi32_4);
    emm0 = _mm_slli_epi32(emm0, 29);
    __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);

    // Polynomial selector: all ones where (j & 2) == 0, all zeros otherwise.
    emm2 = _mm_and_si128(emm2, _pi32_2);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    __m128 poly_mask = _mm_castsi128_ps(emm2);

    // Argument reduction: x = x - j * (DP1 + DP2 + DP3), applied in three pieces.
    __m128 xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1);
    __m128 xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2);
    __m128 xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3);
    x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3));

    // Cosine sign: set where bit 2 of (j - 2) is CLEAR, moved to bit 31.
    emm4 = _mm_sub_epi32(emm4, _pi32_2);
    emm4 = _mm_andnot_si128(emm4, _pi32_4);
    emm4 = _mm_slli_epi32(emm4, 29);
    __m128 sign_bit_cos = _mm_castsi128_ps(emm4);

    sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

    // First polynomial (cosine coefficients): 1 - z/2 + z^2 * P(z), with z = x^2.
    __m128 z = _mm_mul_ps(x,x);
    y = _ps_coscof_p0;

    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, _ps_coscof_p1);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, _ps_coscof_p2);
    y = _mm_mul_ps(y, _mm_mul_ps(z, z));
    __m128 tmp = _mm_mul_ps(z, _ps_0p5);
    y = _mm_sub_ps(y, tmp);
    y = _mm_add_ps(y, _ps_1);

    // Second polynomial (sine coefficients): x + x * z * Q(z).
    __m128 y2 = _ps_sincof_p0;
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, _ps_sincof_p1);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, _ps_sincof_p2);
    y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x));
    y2 = _mm_add_ps(y2, x);

    // Route each polynomial to the right output per lane: the sine output takes
    // y2 where poly_mask is set and y elsewhere; the cosine output takes whatever
    // the sine output did not (obtained by subtracting the selected parts away).
    xmm3 = poly_mask;
    __m128 ysin2 = _mm_and_ps(xmm3, y2);
    __m128 ysin1 = _mm_andnot_ps(xmm3, y);
    y2 = _mm_sub_ps(y2,ysin2);
    y = _mm_sub_ps(y, ysin1);

    xmm1 = _mm_add_ps(ysin1,ysin2);
    xmm2 = _mm_add_ps(y,y2);

    // Apply the final signs and store the results.
    *s = _mm_xor_ps(xmm1, sign_bit_sin);
    *c = _mm_xor_ps(xmm2, sign_bit_cos);
}

#endif