sse_mathfun.h

1 // This file taken from http://gruntthepeon.free.fr/ssemath/
2 // Modifications by Jukka Jyl�nki:
3 // - Removed MMX support
4 // - Added modf to for better precision outside [0, 2pi] range.
5 // - Simplified constants usage syntax.
6
7 /* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
8
9    Inspired by Intel Approximate Math library, and based on the
10    corresponding algorithms of the cephes math library
11
12    The default is to use the SSE1 version. If you define USE_SSE2 the
13    the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
14    not expect any significant performance improvement with SSE2.
15 */
16
17 /* Copyright (C) 2007  Julien Pommier
18
19   This software is provided 'as-is', without any express or implied
20   warranty.  In no event will the authors be held liable for any damages
21   arising from the use of this software.
22
23   Permission is granted to anyone to use this software for any purpose,
24   including commercial applications, and to alter it and redistribute it
25   freely, subject to the following restrictions:
26
27   1. The origin of this software must not be misrepresented; you must not
28      claim that you wrote the original software. If you use this software
29      in a product, an acknowledgment in the product documentation would be
30      appreciated but is not required.
31   2. Altered source versions must be plainly marked as such, and must not be
32      misrepresented as being the original software.
33   3. This notice may not be removed or altered from any source distribution.
34
35   (this is the zlib license)
36 */
37
38 #ifdef MATH_SSE2
39
40 #include "SSEMath.h"
41
42 static const __m128 _ps_1 = _mm_set1_ps(1.f);
43 static const __m128 _ps_0p5 = _mm_set1_ps(0.5f);
44 static const __m128 _ps_min_norm_pos = set1_ps_hex(0x00800000);
45 static const __m128 _ps_mant_mask = set1_ps_hex(0x7f800000);
46 static const __m128 _ps_inv_mant_mask = set1_ps_hex(~0x7f800000);
47 static const __m128 _ps_sign_mask = set1_ps_hex(0x80000000);
48 static const __m128 _ps_inv_sign_mask = set1_ps_hex(~0x80000000);
49
50 static const __m128i _pi32_1 = _mm_set1_epi32(1);
51 static const __m128i _pi32_inv1 = _mm_set1_epi32(~1);
52 static const __m128i _pi32_2 = _mm_set1_epi32(2);
53 static const __m128i _pi32_4 = _mm_set1_epi32(4);
54 static const __m128i _pi32_0x7f = _mm_set1_epi32(0x7f);
55
56 static const __m128 _ps_cephes_SQRTHF = _mm_set1_ps(0.707106781186547524f);
57 static const __m128 _ps_cephes_log_p0 = _mm_set1_ps(7.0376836292E-2f);
58 static const __m128 _ps_cephes_log_p1 = _mm_set1_ps(-1.1514610310E-1f);
59 static const __m128 _ps_cephes_log_p2 = _mm_set1_ps(1.1676998740E-1f);
60 static const __m128 _ps_cephes_log_p3 = _mm_set1_ps(-1.2420140846E-1f);
61 static const __m128 _ps_cephes_log_p4 = _mm_set1_ps(+1.4249322787E-1f);
62 static const __m128 _ps_cephes_log_p5 = _mm_set1_ps(-1.6668057665E-1f);
63 static const __m128 _ps_cephes_log_p6 = _mm_set1_ps(+2.0000714765E-1f);
64 static const __m128 _ps_cephes_log_p7 = _mm_set1_ps(-2.4999993993E-1f);
65 static const __m128 _ps_cephes_log_p8 = _mm_set1_ps(+3.3333331174E-1f);
66 static const __m128 _ps_cephes_log_q1 = _mm_set1_ps(-2.12194440e-4f);
67 static const __m128 _ps_cephes_log_q2 = _mm_set1_ps(0.693359375f);
68
69 /* natural logarithm computed for 4 simultaneous float
70    return NaN for x <= 0
71 */
72 FORCE_INLINE __m128 log_ps(__m128 x) {
73   __m128 one = _ps_1;
74   __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());
75   x = _mm_max_ps(x, _ps_min_norm_pos);  /* cut off denormalized stuff */
76   __m128i emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
77   /* keep only the fractional part */
78   x = _mm_and_ps(x, _ps_inv_mant_mask);
79   x = _mm_or_ps(x, _ps_0p5);
80   emm0 = _mm_sub_epi32(emm0, _pi32_0x7f);
81   __m128 e = _mm_cvtepi32_ps(emm0);
82   e = _mm_add_ps(e, one);
83   /* part2:
84      if( x < SQRTHF ) {
85        e -= 1;
86        x = x + x - 1.0;
87      } else { x = x - 1.0; }
88   */
89   __m128 mask = _mm_cmplt_ps(x, _ps_cephes_SQRTHF);
90   __m128 tmp = _mm_and_ps(x, mask);
91   x = _mm_sub_ps(x, one);
92   e = _mm_sub_ps(e, _mm_and_ps(one, mask));
93   x = _mm_add_ps(x, tmp);
94   __m128 z = _mm_mul_ps(x,x);
95   __m128 y = _ps_cephes_log_p0;
96   y = _mm_mul_ps(y, x);
97   y = _mm_add_ps(y, _ps_cephes_log_p1);
98   y = _mm_mul_ps(y, x);
99   y = _mm_add_ps(y, _ps_cephes_log_p2);
100   y = _mm_mul_ps(y, x);
101   y = _mm_add_ps(y, _ps_cephes_log_p3);
102   y = _mm_mul_ps(y, x);
103   y = _mm_add_ps(y, _ps_cephes_log_p4);
104   y = _mm_mul_ps(y, x);
105   y = _mm_add_ps(y, _ps_cephes_log_p5);
106   y = _mm_mul_ps(y, x);
107   y = _mm_add_ps(y, _ps_cephes_log_p6);
108   y = _mm_mul_ps(y, x);
109   y = _mm_add_ps(y, _ps_cephes_log_p7);
110   y = _mm_mul_ps(y, x);
111   y = _mm_add_ps(y, _ps_cephes_log_p8);
112   y = _mm_mul_ps(y, _mm_mul_ps(x, z));
113   tmp = _mm_mul_ps(e, _ps_cephes_log_q1);
114   y = _mm_add_ps(y, tmp);
115   tmp = _mm_mul_ps(z, _ps_0p5);
116   y = _mm_sub_ps(y, tmp);
117   tmp = _mm_mul_ps(e, _ps_cephes_log_q2);
118   x = _mm_add_ps(x, _mm_add_ps(y, tmp));
119   x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
120   return x;
121 }
122
123 static const __m128 _ps_exp_hi = _mm_set1_ps(88.3762626647949f);
124 static const __m128 _ps_exp_lo = _mm_set1_ps(-88.3762626647949f);
125
126 static const __m128 _ps_cephes_LOG2EF = _mm_set1_ps(1.44269504088896341f);
127 static const __m128 _ps_cephes_exp_C1 = _mm_set1_ps(0.693359375f);
128 static const __m128 _ps_cephes_exp_C2 = _mm_set1_ps(-2.12194440e-4f);
129
130 static const __m128 _ps_cephes_exp_p0 = _mm_set1_ps(1.9875691500E-4f);
131 static const __m128 _ps_cephes_exp_p1 = _mm_set1_ps(1.3981999507E-3f);
132 static const __m128 _ps_cephes_exp_p2 = _mm_set1_ps(8.3334519073E-3f);
133 static const __m128 _ps_cephes_exp_p3 = _mm_set1_ps(4.1665795894E-2f);
134 static const __m128 _ps_cephes_exp_p4 = _mm_set1_ps(1.6666665459E-1f);
135 static const __m128 _ps_cephes_exp_p5 = _mm_set1_ps(5.0000001201E-1f);
136
137 FORCE_INLINE __m128 exp_ps(__m128 x) {
138   __m128 one = _ps_1;
139
140   x = _mm_min_ps(x, _ps_exp_hi);
141   x = _mm_max_ps(x, _ps_exp_lo);
142
143   /* express exp(x) as exp(g + n*log(2)) */
144   __m128 fx = _mm_mul_ps(x, _ps_cephes_LOG2EF);
145   fx = _mm_add_ps(fx, _ps_0p5);
146
147   /* how to perform a floorf with SSE: just below */
148   __m128i emm0 = _mm_cvttps_epi32(fx);
149   __m128 tmp  = _mm_cvtepi32_ps(emm0);
150   /* if greater, substract 1 */
151   __m128 mask = _mm_cmpgt_ps(tmp, fx);
152   mask = _mm_and_ps(mask, one);
153   fx = _mm_sub_ps(tmp, mask);
154   tmp = _mm_mul_ps(fx, _ps_cephes_exp_C1);
155   __m128 z = _mm_mul_ps(fx, _ps_cephes_exp_C2);
156   x = _mm_sub_ps(x, _mm_add_ps(tmp, z));
157   z = _mm_mul_ps(x,x);
158   __m128 y = _ps_cephes_exp_p0;
159   y = _mm_mul_ps(y, x);
160   y = _mm_add_ps(y, _ps_cephes_exp_p1);
161   y = _mm_mul_ps(y, x);
162   y = _mm_add_ps(y, _ps_cephes_exp_p2);
163   y = _mm_mul_ps(y, x);
164   y = _mm_add_ps(y, _ps_cephes_exp_p3);
165   y = _mm_mul_ps(y, x);
166   y = _mm_add_ps(y, _ps_cephes_exp_p4);
167   y = _mm_mul_ps(y, x);
168   y = _mm_add_ps(y, _ps_cephes_exp_p5);
169   y = _mm_mul_ps(y, z);
170   y = _mm_add_ps(y, _mm_add_ps(x, one));
171
172   /* build 2^n */
173   emm0 = _mm_cvttps_epi32(fx);
174   emm0 = _mm_add_epi32(emm0, _pi32_0x7f);
175   emm0 = _mm_slli_epi32(emm0, 23);
176   __m128 pow2n = _mm_castsi128_ps(emm0);
177   y = _mm_mul_ps(y, pow2n);
178   return y;
179 }
180
181 static const __m128 _ps_minus_cephes_DP1 = _mm_set1_ps(-0.78515625f);
182 static const __m128 _ps_minus_cephes_DP2 = _mm_set1_ps(-2.4187564849853515625e-4f);
183 static const __m128 _ps_minus_cephes_DP3 = _mm_set1_ps(-3.77489497744594108e-8f);
184 static const __m128 _ps_sincof_p0 = _mm_set1_ps(-1.9515295891E-4f);
185 static const __m128 _ps_sincof_p1 = _mm_set1_ps( 8.3321608736E-3f);
186 static const __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611E-1f);
187 static const __m128 _ps_coscof_p0 = _mm_set1_ps( 2.443315711809948E-005f);
188 static const __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765E-003f);
189 static const __m128 _ps_coscof_p2 = _mm_set1_ps( 4.166664568298827E-002f);
190 static const __m128 _ps_cephes_FOPI = _mm_set1_ps(1.27323954473516f); // 4 / M_PI
191
192 /* evaluation of 4 sines at onces, using only SSE1+MMX intrinsics so
193    it runs also on old athlons XPs and the pentium III of your grand
194    mother.
195
196    The code is the exact rewriting of the cephes sinf function.
197    Precision is excellent as long as x < 8192 (I did not bother to
198    take into account the special handling they have for greater values
199    -- it does not return garbage for arguments over 8192, though, but
200    the extra precision is missing).
201
202    Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
203    surprising but correct result.
204
205    Performance is also surprisingly good, 1.33 times faster than the
206    macos vsinf SSE2 function, and 1.5 times faster than the
207    __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
208    too bad for an SSE1 function (with no special tuning) !
209    However the latter libraries probably have a much better handling of NaN,
210    Inf, denormalized and other special arguments..
211
212    On my core 1 duo, the execution of this function takes approximately 95 cycles.
213
214    From what I have observed on the experiments with Intel AMath lib, switching to an
215    SSE2 version would improve the perf by only 10%.
216
217    Since it is based on SSE intrinsics, it has to be compiled at -O2 to
218    deliver full speed.
219 */
220 FORCE_INLINE __m128 sin_ps(__m128 x) { // any x
221 #if 0 // Currently expect user to round manually if he knows input can be unbounded.
222 #ifdef MATH_SSE41 // _mm_round_ps is SSE4.1
223   // XXX Added in MathGeoLib: Take a modulo of the input in 2pi to try to enhance the precision with large input values.
224   x = modf_ps(x, _mm_set1_ps(2.f*pi));
225 #endif
226 #endif
227
228   /* extract the sign bit (upper one) */
229   __m128 sign_bit = _mm_and_ps(x, _ps_sign_mask);
230   /* take the absolute value */
231   x = _mm_xor_ps(x, sign_bit);
232
233   /* scale by 4/Pi */
234   __m128 y = _mm_mul_ps(x, _ps_cephes_FOPI);
235
236   /* store the integer part of y in mm0 */
237   __m128i emm2 = _mm_cvttps_epi32(y);
238   /* j=(j+1) & (~1) (see the cephes sources) */
239   emm2 = _mm_add_epi32(emm2, _pi32_1);
240   emm2 = _mm_and_si128(emm2, _pi32_inv1);
241   y = _mm_cvtepi32_ps(emm2);
242
243   /* get the swap sign flag */
244   __m128i emm0 = _mm_and_si128(emm2, _pi32_4);
245   emm0 = _mm_slli_epi32(emm0, 29);
246   /* get the polynom selection mask
247      there is one polynom for 0 <= x <= Pi/4
248      and another one for Pi/4<x<=Pi/2
249
250      Both branches will be computed.
251   */
252   emm2 = _mm_and_si128(emm2, _pi32_2);
253   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
254
255   __m128 swap_sign_bit = _mm_castsi128_ps(emm0);
256   __m128 poly_mask = _mm_castsi128_ps(emm2);
257   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
258
259   /* The magic pass: "Extended precision modular arithmetic"
260      x = ((x - y * DP1) - y * DP2) - y * DP3; */
261   __m128 xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1);
262   __m128 xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2);
263   __m128 xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3);
264   x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3));
265
266   /* Evaluate the first polynom  (0 <= x <= Pi/4) */
267   y = _ps_coscof_p0;
268   __m128 z = _mm_mul_ps(x,x);
269
270   y = _mm_mul_ps(y, z);
271   y = _mm_add_ps(y, _ps_coscof_p1);
272   y = _mm_mul_ps(y, z);
273   y = _mm_add_ps(y, _ps_coscof_p2);
274   y = _mm_mul_ps(y, _mm_mul_ps(z, z));
275   __m128 tmp = _mm_mul_ps(z, _ps_0p5);
276   y = _mm_sub_ps(y, tmp);
277   y = _mm_add_ps(y, _ps_1);
278
279   /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
280
281   __m128 y2 = _ps_sincof_p0;
282   y2 = _mm_mul_ps(y2, z);
283   y2 = _mm_add_ps(y2, _ps_sincof_p1);
284   y2 = _mm_mul_ps(y2, z);
285   y2 = _mm_add_ps(y2, _ps_sincof_p2);
286   y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x));
287   y2 = _mm_add_ps(y2, x);
288
289   /* select the correct result from the two polynoms */
290   xmm3 = poly_mask;
291   y2 = _mm_and_ps(xmm3, y2); //, xmm3);
292   y = _mm_andnot_ps(xmm3, y);
293   y = _mm_add_ps(y,y2);
294   /* update the sign */
295   y = _mm_xor_ps(y, sign_bit);
296   return y;
297 }
298
299 /* almost the same as sin_ps */
300 FORCE_INLINE __m128 cos_ps(__m128 x) { // any x
301
302 #if 0 // Currently expect user to round manually if he knows input can be unbounded.
303 #ifdef MATH_SSE41 // _mm_round_ps is SSE4.1
304   // XXX Added in MathGeoLib: Take a modulo of the input in 2pi to try to enhance the precision with large input values.
305   x = modf_ps(x, _mm_set1_ps(2.f*pi));
306 #endif
307 #endif
308
309   /* take the absolute value */
310   x = _mm_and_ps(x, _ps_inv_sign_mask);
311   /* scale by 4/Pi */
312   __m128 y = _mm_mul_ps(x, _ps_cephes_FOPI);
313
314   /* store the integer part of y in mm0 */
315   __m128i emm2 = _mm_cvttps_epi32(y);
316   /* j=(j+1) & (~1) (see the cephes sources) */
317   emm2 = _mm_add_epi32(emm2, _pi32_1);
318   emm2 = _mm_and_si128(emm2, _pi32_inv1);
319   y = _mm_cvtepi32_ps(emm2);
320
321   emm2 = _mm_sub_epi32(emm2, _pi32_2);
322
323   /* get the swap sign flag */
324   __m128i emm0 = _mm_andnot_si128(emm2, _pi32_4);
325   emm0 = _mm_slli_epi32(emm0, 29);
326   /* get the polynom selection mask */
327   emm2 = _mm_and_si128(emm2, _pi32_2);
328   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
329
330   __m128 sign_bit = _mm_castsi128_ps(emm0);
331   __m128 poly_mask = _mm_castsi128_ps(emm2);
332   /* The magic pass: "Extended precision modular arithmetic"
333      x = ((x - y * DP1) - y * DP2) - y * DP3; */
334   __m128 xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1);
335   __m128 xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2);
336   __m128 xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3);
337   x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3));
338
339   /* Evaluate the first polynom  (0 <= x <= Pi/4) */
340   y = _ps_coscof_p0;
341   __m128 z = _mm_mul_ps(x,x);
342
343   y = _mm_mul_ps(y, z);
344   y = _mm_add_ps(y, _ps_coscof_p1);
345   y = _mm_mul_ps(y, z);
346   y = _mm_add_ps(y, _ps_coscof_p2);
347   y = _mm_mul_ps(y, _mm_mul_ps(z, z));
348   __m128 tmp = _mm_mul_ps(z, _ps_0p5);
349   y = _mm_sub_ps(y, tmp);
350   y = _mm_add_ps(y, _ps_1);
351
352   /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
353
354   __m128 y2 = _ps_sincof_p0;
355   y2 = _mm_mul_ps(y2, z);
356   y2 = _mm_add_ps(y2, _ps_sincof_p1);
357   y2 = _mm_mul_ps(y2, z);
358   y2 = _mm_add_ps(y2, _ps_sincof_p2);
359   y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x));
360   y2 = _mm_add_ps(y2, x);
361
362   /* select the correct result from the two polynoms */
363   xmm3 = poly_mask;
364   y2 = _mm_and_ps(xmm3, y2); //, xmm3);
365   y = _mm_andnot_ps(xmm3, y);
366   y = _mm_add_ps(y,y2);
367   /* update the sign */
368   y = _mm_xor_ps(y, sign_bit);
369
370   return y;
371 }
372
373 /* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
374    it is almost as fast, and gives you a free cosine with your sine */
375 FORCE_INLINE void sincos_ps(__m128 x, __m128 *s, __m128 *c) {
376 #if 0
377 #ifdef MATH_SSE41 // _mm_round_ps is SSE4.1
378   // XXX Added in MathGeoLib: Take a modulo of the input in 2pi to try to enhance the precision with large input values.
379   x = modf_ps(x, _mm_set1_ps(2.f*3.141592654f));
380 #endif
381 #endif
382
383   /* extract the sign bit (upper one) */
384   __m128 sign_bit_sin = _mm_and_ps(x, _ps_sign_mask);
385   /* take the absolute value */
386   x = _mm_xor_ps(x, sign_bit_sin);
387
388   /* scale by 4/Pi */
389   __m128 y = _mm_mul_ps(x, _ps_cephes_FOPI);
390
391   /* store the integer part of y in emm2 */
392   __m128i emm2 = _mm_cvttps_epi32(y);
393
394   /* j=(j+1) & (~1) (see the cephes sources) */
395   emm2 = _mm_add_epi32(emm2, _pi32_1);
396   emm2 = _mm_and_si128(emm2, _pi32_inv1);
397   y = _mm_cvtepi32_ps(emm2);
398
399   __m128i emm4 = emm2;
400
401   /* get the swap sign flag for the sine */
402   __m128i emm0 = _mm_and_si128(emm2, _pi32_4);
403   emm0 = _mm_slli_epi32(emm0, 29);
404   __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
405
406   /* get the polynom selection mask for the sine*/
407   emm2 = _mm_and_si128(emm2, _pi32_2);
408   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
409   __m128 poly_mask = _mm_castsi128_ps(emm2);
410   /* The magic pass: "Extended precision modular arithmetic"
411      x = ((x - y * DP1) - y * DP2) - y * DP3; */
412   __m128 xmm1 = _mm_mul_ps(y, _ps_minus_cephes_DP1);
413   __m128 xmm2 = _mm_mul_ps(y, _ps_minus_cephes_DP2);
414   __m128 xmm3 = _mm_mul_ps(y, _ps_minus_cephes_DP3);
415   x = _mm_add_ps(_mm_add_ps(x, xmm1), _mm_add_ps(xmm2, xmm3));
416
417   emm4 = _mm_sub_epi32(emm4, _pi32_2);
418   emm4 = _mm_andnot_si128(emm4, _pi32_4);
419   emm4 = _mm_slli_epi32(emm4, 29);
420   __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
421
422   sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
423
424   /* Evaluate the first polynom  (0 <= x <= Pi/4) */
425   __m128 z = _mm_mul_ps(x,x);
426   y = _ps_coscof_p0;
427
428   y = _mm_mul_ps(y, z);
429   y = _mm_add_ps(y, _ps_coscof_p1);
430   y = _mm_mul_ps(y, z);
431   y = _mm_add_ps(y, _ps_coscof_p2);
432   y = _mm_mul_ps(y, _mm_mul_ps(z, z));
433   __m128 tmp = _mm_mul_ps(z, _ps_0p5);
434   y = _mm_sub_ps(y, tmp);
435   y = _mm_add_ps(y, _ps_1);
436
437   /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
438
439   __m128 y2 = _ps_sincof_p0;
440   y2 = _mm_mul_ps(y2, z);
441   y2 = _mm_add_ps(y2, _ps_sincof_p1);
442   y2 = _mm_mul_ps(y2, z);
443   y2 = _mm_add_ps(y2, _ps_sincof_p2);
444   y2 = _mm_mul_ps(y2, _mm_mul_ps(z, x));
445   y2 = _mm_add_ps(y2, x);
446
447   /* select the correct result from the two polynoms */
448   xmm3 = poly_mask;
449   __m128 ysin2 = _mm_and_ps(xmm3, y2);
450   __m128 ysin1 = _mm_andnot_ps(xmm3, y);
451   y2 = _mm_sub_ps(y2,ysin2);
452   y = _mm_sub_ps(y, ysin1);
453
454   xmm1 = _mm_add_ps(ysin1,ysin2);
455   xmm2 = _mm_add_ps(y,y2);
456
457   /* update the sign */
458   *s = _mm_xor_ps(xmm1, sign_bit_sin);
459   *c = _mm_xor_ps(xmm2, sign_bit_cos);
460 }
461
462 #endif