float4_sse.h

/* Copyright Jukka Jylänki
2
3    Licensed under the Apache License, Version 2.0 (the "License");
4    you may not use this file except in compliance with the License.
5    You may obtain a copy of the License at
6
7        http://www.apache.org/licenses/LICENSE-2.0
8
9    Unless required by applicable law or agreed to in writing, software
10    distributed under the License is distributed on an "AS IS" BASIS,
11    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12    See the License for the specific language governing permissions and
13    limitations under the License. */
14
15 /** @file float4_sse.h
	@author Jukka Jylänki
17         @brief SSE code for float4-related computations. */
18 #pragma once
19
20 #include "../MathBuildConfig.h"
21
22 #ifdef MATH_SSE
23
24 #include "MathTypes.h"
25 #include "SSEMath.h"
26
27 // Input: [w,z,y,x], Output: x+y+z in all four channels.
28 FORCE_INLINE simd4f sum_xyz_ps(simd4f m)
29 {
30 #ifdef MATH_SSE3 // If we have SSE 3, we can use the haddps (horizontal add) instruction, _mm_hadd_ps intrinsic.
31         m = and_ps(m, sseMaskXYZ); // Clear w to zero.
32         m = _mm_hadd_ps(m, m); // m = (x+y, z, x+y, z).
33         m = _mm_hadd_ps(m, m); // m = (x+y+z, x+y+z, x+y+z, x+y+z).
34         return m; // Each index of the output will contain the sum x+y+z.
35 #else // We only have SSE 1, and must individually shuffle.
36         simd4f X = xxxx_ps(m);
37         simd4f Y = yyyy_ps(m);
38         simd4f Z = zzzz_ps(m);
39         simd4f XYZ = add_ps(X, add_ps(Y, Z));
40         return XYZ; // Each index of the output will contain the sum x+y+z.
41 #endif
42 }
43
44 // Input: [w,z,y,x], Output: x+y+z in three lowest channels, w is undefined.
45 FORCE_INLINE simd4f sum_xyz_ps3(simd4f m)
46 {
47         simd4f yzx = shuffle1_ps(m, _MM_SHUFFLE(3,0,2,1)); // [_, x, z, y]
48         simd4f zxy = shuffle1_ps(m, _MM_SHUFFLE(3,1,0,2)); // [_, y, x, z]
49         simd4f XYZ = add_ps(m, add_ps(yzx, zxy)); // [_, x+y+z, x+y+z, x+y+z]
50         return XYZ; // The three lowest elements will contain the sum x+y+z. Highest element is undefined.
51 }
52
53 FORCE_INLINE float sum_xyz_float(simd4f m)
54 {
55         return s4f_x(sum_xyz_ps3(m));
56 }
57
58 /// The returned SP FP contains x+y+z+w in all channels of the vector.
59 FORCE_INLINE simd4f sum_xyzw_ps(simd4f m)
60 {
61 #ifdef MATH_SSE3 // If we have SSE 3, we can use the haddps (horizontal add) instruction, _mm_hadd_ps intrinsic.
62         m = _mm_hadd_ps(m, m); // m = (x+y, z+w, x+y, z+w).
63         m = _mm_hadd_ps(m, m); // m = (x+y+z+w, x+y+z+w, x+y+z+w, x+y+z+w).
64         return m; // Each index of the output will contain the sum x+y+z+w.
65 #else // We only have SSE 1, and must individually shuffle.
66         simd4f v2 = shuffle1_ps(m, _MM_SHUFFLE(1,0,3,2)); // = [y, x, w, z]
67         v2 = add_ps(v2, m); // = [w+y, z+x, y+w, x+z]
68         simd4f v3 = shuffle1_ps(v2, _MM_SHUFFLE(0,3,2,1)); // = [x+z, w+y, z+x, y+w]
69         return add_ps(v2, v3); // = [w+y+x+z, z+x+w+y, y+w+z+x, x+z+y+w]
70 #endif
71 }
72
73 FORCE_INLINE float sum_xyzw_float(simd4f m)
74 {
75         return s4f_x(sum_xyzw_ps(m));
76 }
77
78 FORCE_INLINE simd4f mul_xyzw_ps(simd4f v)
79 {
80         simd4f v2 = shuffle1_ps(v, _MM_SHUFFLE(1, 0, 3, 2)); // v2 = [y, x, w, z]
81         v2 = mul_ps(v, v2); // v2 = [w*y, z*x, y*w, x*z]
82         simd4f v3 = shuffle1_ps(v2, _MM_SHUFFLE(2, 1, 0, 3)); // v3 = [z*x, y*w, x*z, w*y]
83         return mul_ps(v2, v3); // v3 = [w*y*z*x, z*x*y*w, y*w*x*z, x*z*w*y]
84 }
85
86 FORCE_INLINE float mul_xyzw_float(simd4f m)
87 {
88         return s4f_x(mul_xyzw_ps(m));
89 }
90
91 // Returns the dot-product of the x,y,z components in all channels of the output vector.
92 FORCE_INLINE simd4f dot3_ps(simd4f a, simd4f b)
93 {
94 #ifdef MATH_SSE41 // If we have SSE 4.1, we can use the dpps (dot product) instruction, _mm_dp_ps intrinsic.
95         return _mm_dp_ps(a, b, 0x7F); // Choose to multiply x, y and z (0x70 = 0111 0000), and store the output to all indices (0x0F == 0000 1111).
96 #else // Otherwise, use SSE3 haddps or SSE1 with individual shuffling.
97         return sum_xyz_ps(mul_ps(a, b));
98 #endif
99 }
100
101 // Returns the dot-product of the x,y,z components in all channels of the output vector.
102 FORCE_INLINE simd4f dot3_ps3(simd4f a, simd4f b)
103 {
104 #ifdef MATH_SSE41 // If we have SSE 4.1, we can use the dpps (dot product) instruction, _mm_dp_ps intrinsic.
105         return _mm_dp_ps(a, b, 0x7F); // Choose to multiply x, y and z (0x70 = 0111 0000), and store the output to all indices (0x0F == 0000 1111).
106 #else // Otherwise, use SSE3 haddps or SSE1 with individual shuffling.
107         return sum_xyz_ps3(mul_ps(a, b));
108 #endif
109 }
110
111 FORCE_INLINE float dot3_float(simd4f a, simd4f b)
112 {
113         return s4f_x(dot3_ps3(a, b));
114 }
115
116 /// The dot product is stored in each channel of the returned vector.
117 FORCE_INLINE simd4f dot4_ps(simd4f a, simd4f b)
118 {
119 #ifdef MATH_SSE41 // If we have SSE 4.1, we can use the dpps (dot product) instruction, _mm_dp_ps intrinsic.
120         return _mm_dp_ps(a, b, 0xFF); // Choose to multiply x, y, z and w (0xF0 = 1111 0000), and store the output to all indices (0x0F == 0000 1111).
121 #else // Otherwise, use SSE3 haddps or SSE1 with individual shuffling.
122         return sum_xyzw_ps(mul_ps(a, b));
123 #endif
124 }
125
126 FORCE_INLINE float dot4_float(simd4f a, simd4f b)
127 {
128         return s4f_x(dot4_ps(a, b));
129 }
130
131 FORCE_INLINE simd4f cross_ps(simd4f a, simd4f b)
132 {
133         simd4f a_xzy = shuffle1_ps(a, _MM_SHUFFLE(3, 0, 2, 1)); // = [a.w, a.x, a.z, a.y]
134         simd4f b_xzy = shuffle1_ps(b, _MM_SHUFFLE(3, 0, 2, 1)); // = [b.w, b.x, b.z, b.y]
135
136         simd4f x_yxz = mul_ps(b_xzy, a); // [a.w*b.w, a.z*b.x, a.y*b.z, a.x*b.y]
137         simd4f y_yxz = mul_ps(a_xzy, b); // [a.w*b.w, a.z*b.x, a.y*b.z, a.x*b.y]
138
139         return shuffle1_ps(sub_ps(x_yxz, y_yxz), _MM_SHUFFLE(3, 0, 2, 1)); // [0, a.x*b.y - a.y*b.x, a.z*b.x - a.x*b.z, a.y*b.z - a.z*b.y]
140 }
141
142 FORCE_INLINE void basis_ps(simd4f v, simd4f *outB, simd4f *outC)
143 {
144         simd4f a = abs_ps(v);
145         simd4f a_min = min_ps(a, min_ps(yyyy_ps(a), _mm_movehl_ps(a, a))); // Horizontal min of x,y,z
146         a_min = xxxx_ps(a_min); // Broadcast to all elements.
147         a = cmple_ps(a, a_min); // Mask 0xFFFFFFFF to channels that contain the min element.
148         // Choose from (1,0,0), (0,1,0), and (0,0,1) the one that's most perpendicular to this vector.
149         simd4f q = and_ps(a, set_ps(0.f, 1.f, 1.f, 1.f));
150
151         simd4f v_xzy = shuffle1_ps(v, _MM_SHUFFLE(3, 0, 2, 1));
152         simd4f v_yxz = shuffle1_ps(v, _MM_SHUFFLE(3, 1, 0, 2));
153         simd4f q_xzy = shuffle1_ps(q, _MM_SHUFFLE(3, 0, 2, 1));
154         simd4f b_yxz = sub_ps(mul_ps(q_xzy, v), mul_ps(v_xzy, q));
155         simd4f b = shuffle1_ps(b_yxz, _MM_SHUFFLE(3, 0, 2, 1));
156         simd4f b_xzy = shuffle1_ps(b_yxz, _MM_SHUFFLE(3, 1, 0, 2));
157         simd4f c = sub_ps(mul_ps(b_yxz, v_xzy), mul_ps(v_yxz, b_xzy));
158
159         *outB = mul_ps(b, rsqrt_ps(dot4_ps(b, b)));
160         *outC = mul_ps(c, rsqrt_ps(dot4_ps(c, c)));
161 }
162
163 simd4f vec3_length_ps(simd4f vec);
164 simd4f vec3_length_ps3(simd4f vec);
165
166 /// Returns a normalized copy of the given vector. Returns the length of the original vector in outLength.
167 FORCE_INLINE simd4f vec4_safe_normalize3(simd4f vec, simd4f &outLength)
168 {
169         outLength = vec3_length_ps3(vec);
170         simd4f isZero = _mm_cmplt_ps(outLength, simd4fEpsilon); // Was the length zero?
171         simd4f normalized = _mm_div_ps(vec, outLength); // Normalize.
172         normalized = cmov_ps(normalized, float4::unitX.v, isZero); // If length == 0, output the vector (1,0,0).
173         return cmov_ps(vec, normalized, sseMaskXYZ); // Return the original .w component to the vector (this function is supposed to preserve original .w).
174 }
175
176 #endif