TriangleMesh_IntersectRay

1 /* Copyright Jukka Jyl�nki
2
3    Licensed under the Apache License, Version 2.0 (the "License");
4    you may not use this file except in compliance with the License.
5    You may obtain a copy of the License at
6
7        http://www.apache.org/licenses/LICENSE-2.0
8
9    Unless required by applicable law or agreed to in writing, software
10    distributed under the License is distributed on an "AS IS" BASIS,
11    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12    See the License for the specific language governing permissions and
13    limitations under the License. */
14
15 /** @file TriangleMesh_IntersectRay_AVX.inl
16         @author Jukka Jyl�nki
17         @brief AVX implementation of ray-mesh intersection routines. */
18
19 #include "../Math/SSEMath.h"
20
21 MATH_BEGIN_NAMESPACE
22
23 #if !defined(MATH_GEN_TRIANGLEINDEX)
24 float TriangleMesh::IntersectRay_AVX(const Ray &ray) const
25 #elif defined(MATH_GEN_TRIANGLEINDEX) && !defined(MATH_GEN_UV)
26 float TriangleMesh::IntersectRay_TriangleIndex_AVX(const Ray &ray, int &outTriangleIndex) const
27 #elif defined(MATH_GEN_TRIANGLEINDEX) && defined(MATH_GEN_UV)
28 float TriangleMesh::IntersectRay_TriangleIndex_UV_AVX(const Ray &ray, int &outTriangleIndex, float &outU, float &outV) const
29 #endif
30 {
31 //      std::cout << numTris << " tris: ";
32 //      TRACESTART(RayTriMeshIntersectAVX);
33
34         assert(sizeof(float3) == 3*sizeof(float));
35         assert(sizeof(Triangle) == 3*sizeof(vec));
36 #ifdef _DEBUG
37         assert(vertexDataLayout == 2); // Must be SoA8 structured!
38 #endif
39
40 //      hitTriangleIndex = -1;
41 //      float3 pt;
42         const float inf = FLOAT_INF;
43         __m256 nearestD = _mm256_set1_ps(inf);
44 #ifdef MATH_GEN_UV
45         __m256 nearestU = _mm256_set1_ps(inf);
46         __m256 nearestV = _mm256_set1_ps(inf);
47 #endif
48 #ifdef MATH_GEN_TRIANGLEINDEX
49         __m256i nearestIndex = _mm256_set1_epi32(-1);
50 #endif
51
52         const __m256 lX = _mm256_broadcast_ss(&ray.pos.x);
53         const __m256 lY = _mm256_broadcast_ss(&ray.pos.y);
54         const __m256 lZ = _mm256_broadcast_ss(&ray.pos.z);
55
56         const __m256 dX = _mm256_broadcast_ss(&ray.dir.x);
57         const __m256 dY = _mm256_broadcast_ss(&ray.dir.y);
58         const __m256 dZ = _mm256_broadcast_ss(&ray.dir.z);
59
60         const __m256 epsilon = _mm256_set1_ps(1e-4f);
61         const __m256 zero = _mm256_setzero_ps();
62         const __m256 one = _mm256_set1_ps(1.f);
63
64         assert(((uintptr_t)data & 0x1F) == 0);
65
66         const float *tris = reinterpret_cast<const float*>(data);
67
68         for(int i = 0; i+8 <= numTriangles; i += 8)
69         {
70                 __m256 v0x = _mm256_load_ps(tris);
71                 __m256 v0y = _mm256_load_ps(tris+8);
72                 __m256 v0z = _mm256_load_ps(tris+16);
73
74 #ifdef SOA_HAS_EDGES
75                 // Edge vectors
76                 __m256 e1x = _mm256_load_ps(tris+24);
77                 __m256 e1y = _mm256_load_ps(tris+32);
78                 __m256 e1z = _mm256_load_ps(tris+40);
79
80                 __m256 e2x = _mm256_load_ps(tris+48);
81                 __m256 e2y = _mm256_load_ps(tris+56);
82                 __m256 e2z = _mm256_load_ps(tris+64);
83 #else
84                 __m256 v1x = _mm256_load_ps(tris+24);
85                 __m256 v1y = _mm256_load_ps(tris+32);
86                 __m256 v1z = _mm256_load_ps(tris+40);
87
88                 __m256 v2x = _mm256_load_ps(tris+48);
89                 __m256 v2y = _mm256_load_ps(tris+56);
90                 __m256 v2z = _mm256_load_ps(tris+64);
91
92                 // Edge vectors
93                 __m256 e1x = _mm256_sub_ps(v1x, v0x);
94                 __m256 e1y = _mm256_sub_ps(v1y, v0y);
95                 __m256 e1z = _mm256_sub_ps(v1z, v0z);
96
97                 __m256 e2x = _mm256_sub_ps(v2x, v0x);
98                 __m256 e2y = _mm256_sub_ps(v2y, v0y);
99                 __m256 e2z = _mm256_sub_ps(v2z, v0z);
100 #endif
101                 // begin calculating determinant - also used to calculate U parameter
102                 __m256 px = _mm256_sub_ps(_mm256_mul_ps(dY, e2z), _mm256_mul_ps(dZ, e2y));
103                 __m256 py = _mm256_sub_ps(_mm256_mul_ps(dZ, e2x), _mm256_mul_ps(dX, e2z));
104                 __m256 pz = _mm256_sub_ps(_mm256_mul_ps(dX, e2y), _mm256_mul_ps(dY, e2x));
105
106                 // If det < 0, intersecting backfacing tri, > 0, intersecting frontfacing tri, 0, parallel to plane.
107                 __m256 det = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(e1x, px), _mm256_mul_ps(e1y, py)), _mm256_mul_ps(e1z, pz));
108
109                 // If determinant is near zero, ray lies in plane of triangle.
110
111 //              if (fabs(det) <= epsilon)
112 //                      return FLOAT_INF;
113                 __m256 recipDet = _mm256_rcp_ps(det);
114
115                 __m256 absdet = abs_ps256(det);
116                 __m256 out = _mm256_cmp_ps(absdet, epsilon, _CMP_LT_OQ);
117
118                 // Calculate distance from v0 to ray origin
119                 __m256 tx = _mm256_sub_ps(lX, v0x);
120                 __m256 ty = _mm256_sub_ps(lY, v0y);
121                 __m256 tz = _mm256_sub_ps(lZ, v0z);
122
123                 // Output barycentric u
124                 __m256 u = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(tx, px), _mm256_mul_ps(ty, py)), _mm256_mul_ps(tz, pz)), recipDet);
125
126 //              if (u < 0.f || u > 1.f)
127 //                      return FLOAT_INF; // Barycentric U is outside the triangle - early out.
128                 __m256 out2 = _mm256_cmp_ps(u, zero, 1);
129                 out = _mm256_or_ps(out, out2);
130                 out2 = _mm256_cmp_ps(u, one, _CMP_GT_OQ);
131                 out = _mm256_or_ps(out, out2);
132
133                 // Prepare to test V parameter
134                 __m256 qx = _mm256_sub_ps(_mm256_mul_ps(ty, e1z), _mm256_mul_ps(tz, e1y));
135                 __m256 qy = _mm256_sub_ps(_mm256_mul_ps(tz, e1x), _mm256_mul_ps(tx, e1z));
136                 __m256 qz = _mm256_sub_ps(_mm256_mul_ps(tx, e1y), _mm256_mul_ps(ty, e1x));
137
138                 // Output barycentric v
139                 __m256 v = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(dX, qx), _mm256_mul_ps(dY, qy)), _mm256_mul_ps(dZ, qz)), recipDet);
140
141 //              if (v < 0.f || u + v > 1.f) // Barycentric V or the combination of U and V are outside the triangle - no intersection.
142 //                      return FLOAT_INF;
143                 out2 = _mm256_cmp_ps(v, zero, _CMP_LT_OQ);
144                 out = _mm256_or_ps(out, out2);
145                 __m256 uv = _mm256_add_ps(u, v);
146                 out2 = _mm256_cmp_ps(uv, one, _CMP_GT_OQ);
147                 out = _mm256_or_ps(out, out2);
148
149                 // Output signed distance from ray to triangle.
150                 __m256 t = _mm256_mul_ps(_mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(e2x, qx), _mm256_mul_ps(e2y, qy)), _mm256_mul_ps(e2z, qz)), recipDet);
151
152                 // t < 0?
153                 out2 = _mm256_cmp_ps(t, zero, _CMP_LT_OQ);
154                 out = _mm256_or_ps(out, out2);
155
156                 // Worse than previous result?
157                 out2 = _mm256_cmp_ps(t, nearestD, _CMP_GE_OQ);
158                 out = _mm256_or_ps(out, out2);
159
160                 // Store the index of the triangle that was hit.
161 #ifdef MATH_GEN_TRIANGLEINDEX
162                 __m256i hitIndex = _mm256_set1_epi32(i);
163                 nearestIndex = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(hitIndex), _mm256_castsi256_ps(nearestIndex), out)); // 'blend' requires SSE4.1!
164 #endif
165
166 #ifdef MATH_GEN_UV
167                 nearestU = _mm256_blendv_ps(u, nearestU, out);
168                 nearestV = _mm256_blendv_ps(v, nearestV, out);
169 #endif
170
171                 // The mask out now contains 0xFF in all indices which are worse than previous, and
172                 // 0x00 in indices which are better.
173                 nearestD = _mm256_blendv_ps(t, nearestD, out);
174
175                 tris += 72;
176         }
177
178         float ds[8];
179         _mm256_store_ps(ds, nearestD);
180 #ifdef MATH_GEN_UV
181         float su[8];
182         float sv[8];
183         _mm256_storeu_ps(su, nearestU);
184         _mm256_storeu_ps(sv, nearestV);
185 #endif
186
187 #ifdef MATH_GEN_TRIANGLEINDEX
188         u32 ds2[8];
189         _mm256_storeu_si256((__m256i*)ds2, nearestIndex);
190 #endif
191
192         float smallestT = FLOAT_INF;
193 //      float u = FLOAT_NAN, v = FLOAT_NAN;
194         for(int i = 0; i < 8; ++i)
195                 if (ds[i] < smallestT)
196                 {
197                         smallestT = ds[i];
198 #ifdef MATH_GEN_TRIANGLEINDEX
199                         outTriangleIndex = ds2[i]+i;
200 #endif
201 #ifdef MATH_GEN_UV
202                         outU = su[i];
203                         outV = sv[i];
204 #endif
205                 }
206
207 //      TRACEEND(RayTriMeshIntersectAVX);
208
209 //      static double avgtimes = 0.f;
210 //      static double nAvgTimes = 0;
211 //      static double processedBytes;
212
213 //      processedBytes += numTris * 3 * 4;
214
215 //      avgtimes += Clock::TicksToMillisecondsD(time_RayTriMeshIntersectAVX);
216 //      ++nAvgTimes;
217 //      std::cout << "Total avg (AVX): " << (avgtimes / nAvgTimes) << std::endl;
218 //      std::cout << "Hit distance (AVX): " << smallestT << ", index: " << hitTriangleIndex << ", UV: (" << u << ", " << v << ")" << std::endl;
219 //      std::cout << "(AVX) " << processedBytes / avgtimes * 1000.0 / 1024.0 / 1024.0 / 1024.0 << "GB/sec." << std::endl;
220
221         return smallestT;
222 }
223
224
225 #ifdef MATH_GEN_TRIANGLEINDEX
226 #undef MATH_GEN_TRIANGLEINDEX
227 #endif
228 #ifdef MATH_GEN_UV
229 #undef MATH_GEN_UV
230 #endif
231
232 MATH_END_NAMESPACE