1 /* Copyright Jukka Jyl�nki
2
3    Licensed under the Apache License, Version 2.0 (the "License");
4    you may not use this file except in compliance with the License.
5    You may obtain a copy of the License at
6
7        http://www.apache.org/licenses/LICENSE-2.0
8
9    Unless required by applicable law or agreed to in writing, software
10    distributed under the License is distributed on an "AS IS" BASIS,
11    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12    See the License for the specific language governing permissions and
13    limitations under the License. */
14
15 /** @file TriangleMesh_IntersectRay_SSE.inl
16         @author Jukka Jyl�nki
17         @brief SSE implementation of ray-mesh intersection routines. */
18 MATH_BEGIN_NAMESPACE
19
20 #if defined(MATH_GEN_SSE2) && !defined(MATH_GEN_TRIANGLEINDEX)
21 float TriangleMesh::IntersectRay_SSE2(const Ray &ray) const
22 #elif defined(MATH_GEN_SSE2) && defined(MATH_GEN_TRIANGLEINDEX) && !defined(MATH_GEN_UV)
23 float TriangleMesh::IntersectRay_TriangleIndex_SSE2(const Ray &ray, int &outTriangleIndex) const
24 #elif defined(MATH_GEN_SSE2) && defined(MATH_GEN_TRIANGLEINDEX) && defined(MATH_GEN_UV)
25 float TriangleMesh::IntersectRay_TriangleIndex_UV_SSE2(const Ray &ray, int &outTriangleIndex, float &outU, float &outV) const
26 #elif defined(MATH_GEN_SSE41) && !defined(MATH_GEN_TRIANGLEINDEX)
27 float TriangleMesh::IntersectRay_SSE41(const Ray &ray) const
28 #elif defined(MATH_GEN_SSE41) && defined(MATH_GEN_TRIANGLEINDEX) && !defined(MATH_GEN_UV)
29 float TriangleMesh::IntersectRay_TriangleIndex_SSE41(const Ray &ray, int &outTriangleIndex) const
30 #elif defined(MATH_GEN_SSE41) && defined(MATH_GEN_TRIANGLEINDEX) && defined(MATH_GEN_UV)
31 float TriangleMesh::IntersectRay_TriangleIndex_UV_SSE41(const Ray &ray, int &outTriangleIndex, float &outU, float &outV) const
32 #endif
33 {
34 //      std::cout << numTris << " tris: ";
35 //      TRACESTART(RayTriMeshIntersectSSE);
36
37         assert(sizeof(float3) == 3*sizeof(float));
38         assert(sizeof(Triangle) == 3*sizeof(float3));
39 #ifdef _DEBUG
40         assert(vertexDataLayout == 1); // Must be SoA4 structured!
41 #endif
42         
43         const float inf = FLOAT_INF;
44         __m128 nearestD = _mm_set1_ps(inf);
45 #ifdef MATH_GEN_UV
46         __m128 nearestU = _mm_set1_ps(inf);
47         __m128 nearestV = _mm_set1_ps(inf);
48 #endif
49 #ifdef MATH_GEN_TRIANGLEINDEX
50         __m128i nearestIndex = _mm_set1_epi32(-1);
51 #endif
52
53         const __m128 lX = _mm_load1_ps(&ray.pos.x);
54         const __m128 lY = _mm_load1_ps(&ray.pos.y);
55         const __m128 lZ = _mm_load1_ps(&ray.pos.z);
56
57         const __m128 dX = _mm_load1_ps(&ray.dir.x);
58         const __m128 dY = _mm_load1_ps(&ray.dir.y);
59         const __m128 dZ = _mm_load1_ps(&ray.dir.z);
60
61         const __m128 epsilon = _mm_set1_ps(1e-4f);
62         const __m128 zero = _mm_setzero_ps();
63         const __m128 one = _mm_set1_ps(1.f);
64
65     const __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31
66
67         assert(((uintptr_t)data & 0xF) == 0);
68
69         const float *tris = reinterpret_cast<const float*>(data);
70
71         for(int i = 0; i+4 <= numTriangles; i += 4)
72         {
73                 __m128 v0x = _mm_load_ps(tris);
74                 __m128 v0y = _mm_load_ps(tris+4);
75                 __m128 v0z = _mm_load_ps(tris+8);
76
77 #ifdef SOA_HAS_EDGES
78                 __m128 e1x = _mm_load_ps(tris+12);
79                 __m128 e1y = _mm_load_ps(tris+16);
80                 __m128 e1z = _mm_load_ps(tris+20);
81
82                 __m128 e2x = _mm_load_ps(tris+24);
83                 __m128 e2y = _mm_load_ps(tris+28);
84                 __m128 e2z = _mm_load_ps(tris+32);
85 #else
86                 __m128 v1x = _mm_load_ps(tris+12);
87                 __m128 v1y = _mm_load_ps(tris+16);
88                 __m128 v1z = _mm_load_ps(tris+20);
89
90                 __m128 v2x = _mm_load_ps(tris+24);
91                 __m128 v2y = _mm_load_ps(tris+28);
92                 __m128 v2z = _mm_load_ps(tris+32);
93
94                 // Edge vectors
95                 __m128 e1x = _mm_sub_ps(v1x, v0x);
96                 __m128 e1y = _mm_sub_ps(v1y, v0y);
97                 __m128 e1z = _mm_sub_ps(v1z, v0z);
98
99                 __m128 e2x = _mm_sub_ps(v2x, v0x);
100                 __m128 e2y = _mm_sub_ps(v2y, v0y);
101                 __m128 e2z = _mm_sub_ps(v2z, v0z);
102 #endif
103 //              _mm_prefetch((const char *)(tris+36), _MM_HINT_T0);
104
105                 // begin calculating determinant - also used to calculate U parameter
106                 __m128 px = _mm_sub_ps(_mm_mul_ps(dY, e2z), _mm_mul_ps(dZ, e2y));
107                 __m128 py = _mm_sub_ps(_mm_mul_ps(dZ, e2x), _mm_mul_ps(dX, e2z));
108                 __m128 pz = _mm_sub_ps(_mm_mul_ps(dX, e2y), _mm_mul_ps(dY, e2x));
109
110                 // If det < 0, intersecting backfacing tri, > 0, intersecting frontfacing tri, 0, parallel to plane.
111                 __m128 det = _mm_add_ps(_mm_add_ps(_mm_mul_ps(e1x, px), _mm_mul_ps(e1y, py)), _mm_mul_ps(e1z, pz));
112
113                 // If determinant is near zero, ray lies in plane of triangle.
114
115 //              if (fabs(det) <= epsilon)
116 //                      return FLOAT_INF;
117                 __m128 recipDet = _mm_rcp_ps(det);
118
119                 __m128 absdet = _mm_andnot_ps(sign_mask, det);
120                 __m128 out = _mm_cmple_ps(absdet, epsilon);
121
122                 // Calculate distance from v0 to ray origin
123                 __m128 tx = _mm_sub_ps(lX, v0x);
124                 __m128 ty = _mm_sub_ps(lY, v0y);
125                 __m128 tz = _mm_sub_ps(lZ, v0z);
126
127                 // Output barycentric u
128                 __m128 u = _mm_mul_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(tx, px), _mm_mul_ps(ty, py)), _mm_mul_ps(tz, pz)), recipDet);
129
130 //              if (u < 0.f || u > 1.f)
131 //                      return FLOAT_INF; // Barycentric U is outside the triangle - early out.
132                 __m128 out2 = _mm_cmplt_ps(u, zero);
133                 out = _mm_or_ps(out, out2);
134                 out2 = _mm_cmpgt_ps(u, one);
135                 out = _mm_or_ps(out, out2);
136
137                 // Prepare to test V parameter
138                 __m128 qx = _mm_sub_ps(_mm_mul_ps(ty, e1z), _mm_mul_ps(tz, e1y));
139                 __m128 qy = _mm_sub_ps(_mm_mul_ps(tz, e1x), _mm_mul_ps(tx, e1z));
140                 __m128 qz = _mm_sub_ps(_mm_mul_ps(tx, e1y), _mm_mul_ps(ty, e1x));
141
142                 // Output barycentric v
143                 __m128 v = _mm_mul_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(dX, qx), _mm_mul_ps(dY, qy)), _mm_mul_ps(dZ, qz)), recipDet);
144
145 //              if (v < 0.f || u + v > 1.f) // Barycentric V or the combination of U and V are outside the triangle - no intersection.
146 //                      return FLOAT_INF;
147                 out2 = _mm_cmplt_ps(v, zero);
148                 out = _mm_or_ps(out, out2);
149                 __m128 uv = _mm_add_ps(u, v);
150                 out2 = _mm_cmpgt_ps(uv, one);
151                 out = _mm_or_ps(out, out2);
152
153                 // Output signed distance from ray to triangle.
154                 __m128 t = _mm_mul_ps(_mm_add_ps(_mm_add_ps(_mm_mul_ps(e2x, qx), _mm_mul_ps(e2y, qy)), _mm_mul_ps(e2z, qz)), recipDet);
155
156                 // t < 0?
157                 out2 = _mm_cmplt_ps(t, zero);
158                 out = _mm_or_ps(out, out2);
159
160                 // Worse than previous result?
161                 out2 = _mm_cmpge_ps(t, nearestD);
162                 out = _mm_or_ps(out, out2);
163
164                 // The mask 'out' now contains 0xFF in all indices which are worse than previous, and
165                 // 0x00 in indices which are better.
166
167 #ifdef MATH_GEN_SSE41
168                 nearestD = _mm_blendv_ps(t, nearestD, out);
169 #else
170                 // If SSE 4.1 is not available:
171                 nearestD = _mm_and_ps(out, nearestD);
172                 t = _mm_andnot_ps(out, t);
173                 nearestD = _mm_or_ps(t, nearestD);
174 #endif
175
176 #ifdef MATH_GEN_UV
177
178 #ifdef MATH_GEN_SSE41
179                 nearestU = _mm_blendv_ps(u, nearestU, out); // 'blend' requires SSE4.1!
180                 nearestV = _mm_blendv_ps(v, nearestV, out); // 'blend' requires SSE4.1!
181 #else
182                 // If SSE 4.1 is not available:
183                 nearestU = _mm_and_ps(out, nearestU);
184                 nearestV = _mm_and_ps(out, nearestV);
185                 u = _mm_andnot_ps(out, u);
186                 v = _mm_andnot_ps(out, v);
187                 nearestU = _mm_or_ps(u, nearestU);
188                 nearestV = _mm_or_ps(v, nearestV);
189 #endif
190
191 #endif
192
193 #ifdef MATH_GEN_TRIANGLEINDEX
194                 __m128i hitIndex = _mm_set1_epi32(i);
195 #ifdef MATH_GEN_SSE41
196                 nearestIndex = _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(hitIndex), _mm_castsi128_ps(nearestIndex), out)); // 'blend' requires SSE4.1!
197 #else
198                 // If SSE 4.1 is not available:
199                 // Store the index of the triangle that was hit.
200                 nearestIndex = _mm_and_si128(_mm_castps_si128(out), nearestIndex);
201                 hitIndex = _mm_andnot_si128(_mm_castps_si128(out), hitIndex);
202                 nearestIndex = _mm_or_si128(hitIndex, nearestIndex);
203 #endif
204
205 #endif
206
207                 tris += 36;
208         }
209
210         float4 d = nearestD;
211 #ifdef MATH_GEN_UV
212         float4 u = nearestU;
213         float4 v = nearestV;
214 #endif
215 #ifdef MATH_GEN_TRIANGLEINDEX
216         u32 idx[4];
217         _mm_store_si128((__m128i*)idx, nearestIndex);
218 #endif
219         float smallestT = FLOAT_INF;
220         for(int i = 0; i < 4; ++i)
221                 if (d[i] < smallestT)
222                 {
223                         smallestT = d[i];
224 #ifdef MATH_GEN_TRIANGLEINDEX
225                         outTriangleIndex = idx[i]+i;
226 #endif
227 #ifdef MATH_GEN_UV
228                         outU = u[i];
229                         outV = v[i];
230 #endif
231                 }
232
233 //      TRACEEND(RayTriMeshIntersectSSE);
234
235 //      static double avgtimes = 0.f;
236 //      static double nAvgTimes = 0;
237 //      static double processedBytes;
238         
239 //      processedBytes += numTris * 3 * 4;
240
241 //      avgtimes += Clock::TicksToMillisecondsD(time_RayTriMeshIntersectSSE);
242 //      ++nAvgTimes;
243 //      std::cout << "Total avg (SSE): " << (avgtimes / nAvgTimes) << std::endl;
244 //      std::cout << "Hit distance (SSE): " << smallestT << ", index: " << hitTriangleIndex << ", UV: (" << u << ", " << v << ")" << std::endl;
245 //      std::cout << "(SSE) " << processedBytes / avgtimes * 1000.0 / 1024.0 / 1024.0 / 1024.0 << "GB/sec." << std::endl;
246
247         return smallestT;
248 }
249
250
251 //float TriangleMesh::IntersectRay_AVX(const Ray &ray) const
252 //float TriangleMesh::IntersectRay_TriangleIndex_AVX(const Ray &ray, int &outIndex) const
253 //float TriangleMesh::IntersectRay_TriangleIndex_UV_AVX(const Ray &ray, int &outIndex, float &outU, float &outV) const
254
255
256
257
258 #ifdef MATH_GEN_SSE2
259 #undef MATH_GEN_SSE2
260 #endif
261 #ifdef MATH_GEN_SSE41
262 #undef MATH_GEN_SSE41
263 #endif
264 #ifdef MATH_GEN_TRIANGLEINDEX
265 #undef MATH_GEN_TRIANGLEINDEX
266 #endif
267 #ifdef MATH_GEN_UV
268 #undef MATH_GEN_UV
269 #endif
270
271 MATH_END_NAMESPACE

Go back to previous page