float4x4_neon.h

1 /* Copyright Jukka Jylänki
2
3    Licensed under the Apache License, Version 2.0 (the "License");
4    you may not use this file except in compliance with the License.
5    You may obtain a copy of the License at
6
7        http://www.apache.org/licenses/LICENSE-2.0
8
9    Unless required by applicable law or agreed to in writing, software
10    distributed under the License is distributed on an "AS IS" BASIS,
11    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12    See the License for the specific language governing permissions and
13    limitations under the License. */
14
15 /** @file float4x4_neon.h
16         @author Jukka Jylänki
17         @brief SIMD (ARM NEON, with SSE/AVX fallbacks) code for float4x4-related computations. */
18
19 #pragma once
20
21 #ifdef MATH_SIMD
22
23 #include "SSEMath.h"
24 #include "float4_neon.h"
25 #include "float4x4_sse.h"
26
27 #if !defined(ANDROID) ///\bug Android GCC 4.6.6 gives internal compiler error!
28 // Multiplies mat * vec, where mat is a matrix in row-major format.
29 FORCE_INLINE simd4f mat4x4_mul_vec4(const simd4f *mat, simd4f vec)
30 {
31 #ifdef MATH_NEON
32         // Transpose matrix at load time to get in registers in column-major format.
33         float32x4x4_t m = vld4q_f32((const float32_t*)mat);
34         simd4f ret = vmulq_lane_f32(m.val[0], vget_low_f32(vec), 0);
35         ret = vmlaq_lane_f32(ret, m.val[1], vget_low_f32(vec), 1);
36         ret = vmlaq_lane_f32(ret, m.val[2], vget_high_f32(vec), 0);
37         return vmlaq_lane_f32(ret, m.val[3], vget_high_f32(vec), 1);
38 #elif defined(MATH_SSE3)
39         return mat4x4_mul_sse3(mat, vec);
40 #else
41         return mat4x4_mul_sse(mat, vec);
42 #endif
43 }
44 #endif
45
46 // Multiplies vec * mat, where mat is a matrix in row-major format.
47 FORCE_INLINE simd4f vec4_mul_mat4x4(simd4f vec, const simd4f *mat)
48 {
49 #ifdef MATH_NEON
50         simd4f ret = vmulq_lane_f32(mat[0], vget_low_f32(vec), 0);
51         ret = vmlaq_lane_f32(ret, mat[1], vget_low_f32(vec), 1);
52         ret = vmlaq_lane_f32(ret, mat[2], vget_high_f32(vec), 0);
53         return vmlaq_lane_f32(ret, mat[3], vget_high_f32(vec), 1);
54 #else
55         return colmajor_mat4x4_mul_sse1(mat, vec);
56 #endif
57 }
58
59 // Multiplies m1 * m2, where m1 and m2 are stored in row-major format.
60 FORCE_INLINE void mat4x4_mul_mat4x4(simd4f *out, const simd4f *m1, const simd4f *m2)
61 {
62 #if defined(MATH_NEON)
63         simd4f r1 = vmulq_lane_f32(m2[0], vget_low_f32(m1[0]), 0);
64         simd4f r2 = vmulq_lane_f32(m2[0], vget_low_f32(m1[1]), 0);
65         simd4f r3 = vmulq_lane_f32(m2[0], vget_low_f32(m1[2]), 0);
66         simd4f r4 = vmulq_lane_f32(m2[0], vget_low_f32(m1[3]), 0);
67
68         r1 = vmlaq_lane_f32(r1, m2[1], vget_low_f32(m1[0]), 1);
69         r2 = vmlaq_lane_f32(r2, m2[1], vget_low_f32(m1[1]), 1);
70         r3 = vmlaq_lane_f32(r3, m2[1], vget_low_f32(m1[2]), 1);
71         r4 = vmlaq_lane_f32(r4, m2[1], vget_low_f32(m1[3]), 1);
72
73         r1 = vmlaq_lane_f32(r1, m2[2], vget_high_f32(m1[0]), 0);
74         r2 = vmlaq_lane_f32(r2, m2[2], vget_high_f32(m1[1]), 0);
75         r3 = vmlaq_lane_f32(r3, m2[2], vget_high_f32(m1[2]), 0);
76         r4 = vmlaq_lane_f32(r4, m2[2], vget_high_f32(m1[3]), 0);
77
78         r1 = vmlaq_lane_f32(r1, m2[3], vget_high_f32(m1[0]), 1);
79         r2 = vmlaq_lane_f32(r2, m2[3], vget_high_f32(m1[1]), 1);
80         r3 = vmlaq_lane_f32(r3, m2[3], vget_high_f32(m1[2]), 1);
81         r4 = vmlaq_lane_f32(r4, m2[3], vget_high_f32(m1[3]), 1);
82
83         out[0] = r1;
84         out[1] = r2;
85         out[2] = r3;
86         out[3] = r4;
87 #else
88         mat4x4_mul_sse(out, m1, m2);
89 #endif
90 }
91
92 #ifdef ANDROID
93 FORCE_INLINE void mat4x4_mul_mat4x4_asm(simd4f *out, const simd4f *m1, const simd4f *m2)
94 {
95         asm(
96                 "\t vldmia %1, {q4-q7} \n"
97                 "\t vldmia %2, {q8-q11} \n"
98                 "\t vmul.f32 q0, q8, d8[0] \n"
99                 "\t vmul.f32 q1, q8, d10[0] \n"
100                 "\t vmul.f32 q2, q8, d12[0] \n"
101                 "\t vmul.f32 q3, q8, d14[0] \n"
102                 "\t vmla.f32 q0, q9, d8[1] \n"
103                 "\t vmla.f32 q1, q9, d10[1] \n"
104                 "\t vmla.f32 q2, q9, d12[1] \n"
105                 "\t vmla.f32 q3, q9, d14[1] \n"
106                 "\t vmla.f32 q0, q10, d9[0] \n"
107                 "\t vmla.f32 q1, q10, d11[0] \n"
108                 "\t vmla.f32 q2, q10, d13[0] \n"
109                 "\t vmla.f32 q3, q10, d15[0] \n"
110                 "\t vmla.f32 q0, q11, d9[1] \n"
111                 "\t vmla.f32 q1, q11, d11[1] \n"
112                 "\t vmla.f32 q2, q11, d13[1] \n"
113                 "\t vmla.f32 q3, q11, d15[1] \n"
114                 "\t vstmia %0, {q0-q3} \n"
115         : /* no outputs by value */
116         : "r"(out), "r"(m1), "r"(m2)
117         : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q11");
118 }
119 #endif
120
121 #if !defined(ANDROID) ///\bug Android GCC 4.6.6 gives internal compiler error!
122 FORCE_INLINE void mat4x4_transpose(simd4f *out, const simd4f *mat)
123 {
124 #ifdef MATH_NEON
125         float32x4x4_t m = vld4q_f32((const float32_t*)mat);
126         vst1q_f32((float32_t*)out, m.val[0]);
127         vst1q_f32((float32_t*)out+4, m.val[1]);
128         vst1q_f32((float32_t*)out+8, m.val[2]);
129         vst1q_f32((float32_t*)out+12, m.val[3]);
130 #else
131
132         // Work around Visual Studio AVX codegen issue and avoid movelh and movehl altogether,
133         // they seem to produce fishy results even when /GL is not enabled. Related: https://connect.microsoft.com/VisualStudio/feedback/details/814682/visual-studio-2013-x64-compiler-generates-faulty-code-with-gl-o2-arch-avx-flags-enabled
134 #ifdef MATH_AVX
135         __m128 tmp0 = _mm_shuffle_ps(mat[0], mat[1], 0x44);
136         __m128 tmp2 = _mm_shuffle_ps(mat[0], mat[1], 0xEE);
137         __m128 tmp1 = _mm_shuffle_ps(mat[2], mat[3], 0x44);
138         __m128 tmp3 = _mm_shuffle_ps(mat[2], mat[3], 0xEE);
139         out[0] = _mm_shuffle_ps(tmp0, tmp1, 0x88);
140         out[1] = _mm_shuffle_ps(tmp0, tmp1, 0xDD);
141         out[2] = _mm_shuffle_ps(tmp2, tmp3, 0x88);
142         out[3] = _mm_shuffle_ps(tmp2, tmp3, 0xDD);
143 #else
144         __m128 tmp0 = _mm_unpacklo_ps(mat[0], mat[1]);
145         __m128 tmp2 = _mm_unpacklo_ps(mat[2], mat[3]);
146         __m128 tmp1 = _mm_unpackhi_ps(mat[0], mat[1]);
147         __m128 tmp3 = _mm_unpackhi_ps(mat[2], mat[3]);
148         out[0] = _mm_movelh_ps(tmp0, tmp2);
149         out[1] = _mm_movehl_ps(tmp2, tmp0);
150         out[2] = _mm_movelh_ps(tmp1, tmp3);
151         out[3] = _mm_movehl_ps(tmp3, tmp1);
152 #endif
153
154 #endif
155 }
156 #endif
157
158 FORCE_INLINE void mat4x4_set(simd4f *mat, float _00, float _01, float _02, float _03,
159                                           float _10, float _11, float _12, float _13,
160                                           float _20, float _21, float _22, float _23,
161                                           float _30, float _31, float _32, float _33)
162 {
163 #ifdef MATH_AVX
164         __m256 *mat2 = (__m256*)mat;
165         mat2[0] = _mm256_set_ps(_13, _12, _11, _10, _03, _02, _01, _00);
166         mat2[1] = _mm256_set_ps(_33, _32, _31, _30, _23, _22, _21, _20);
167 #else
168         mat[0] = set_ps(_03, _02, _01, _00);
169         mat[1] = set_ps(_13, _12, _11, _10);
170         mat[2] = set_ps(_23, _22, _21, _20);
171         mat[3] = set_ps(_33, _32, _31, _30);
172 #endif
173 }
174
175 FORCE_INLINE void mat4x4_mul_float(simd4f *out, const simd4f *mat, float scalar)
176 {
177 #ifdef MATH_AVX
178         __m256 s = _mm256_set1_ps(scalar);
179         __m256 *o = (__m256*)out;
180         __m256 *i = (__m256*)mat;
181         o[0] = _mm256_mul_ps(i[0], s);
182         o[1] = _mm256_mul_ps(i[1], s);
183 #else
184         simd4f v = set1_ps(scalar);
185         out[0] = mul_ps(mat[0], v);
186         out[1] = mul_ps(mat[1], v);
187         out[2] = mul_ps(mat[2], v);
188         out[3] = mul_ps(mat[3], v);
189 #endif
190 }
191
192 FORCE_INLINE void mat4x4_div_float(simd4f *out, const simd4f *mat, float scalar)
193 {
194 #ifdef MATH_AVX
195         __m256 *o = (__m256*)out;
196         __m256 *i = (__m256*)mat;
197         __m256 s = _mm256_set1_ps(scalar);
198         __m256 one = _mm256_set1_ps(1.f);
199         s = _mm256_div_ps(one, s);
200         o[0] = _mm256_mul_ps(i[0], s);
201         o[1] = _mm256_mul_ps(i[1], s);
202 #else
203         simd4f s = set1_ps(scalar);
204         simd4f one = set1_ps(1.f);
205         s = div_ps(one, s);
206         out[0] = mul_ps(mat[0], s);
207         out[1] = mul_ps(mat[1], s);
208         out[2] = mul_ps(mat[2], s);
209         out[3] = mul_ps(mat[3], s);
210 #endif
211 }
212
213 FORCE_INLINE void mat4x4_add_mat4x4(simd4f *out, const simd4f *m1, const simd4f *m2)
214 {
215 #ifdef MATH_AVX
216         __m256 *o = (__m256*)out;
217         __m256 *i1 = (__m256*)m1;
218         __m256 *i2 = (__m256*)m2;
219         o[0] = _mm256_add_ps(i1[0], i2[0]);
220         o[1] = _mm256_add_ps(i1[1], i2[1]);
221 #else
222         out[0] = add_ps(m1[0], m2[0]);
223         out[1] = add_ps(m1[1], m2[1]);
224         out[2] = add_ps(m1[2], m2[2]);
225         out[3] = add_ps(m1[3], m2[3]);
226 #endif
227 }
228
229 FORCE_INLINE void mat4x4_sub_mat4x4(simd4f *out, const simd4f *m1, const simd4f *m2)
230 {
231 #ifdef MATH_AVX
232         __m256 *o = (__m256*)out;
233         __m256 *i1 = (__m256*)m1;
234         __m256 *i2 = (__m256*)m2;
235         o[0] = _mm256_sub_ps(i1[0], i2[0]);
236         o[1] = _mm256_sub_ps(i1[1], i2[1]);
237 #else
238         out[0] = sub_ps(m1[0], m2[0]);
239         out[1] = sub_ps(m1[1], m2[1]);
240         out[2] = sub_ps(m1[2], m2[2]);
241         out[3] = sub_ps(m1[3], m2[3]);
242 #endif
243 }
244
245 FORCE_INLINE void mat4x4_negate(simd4f *out, const simd4f *mat)
246 {
247 #ifdef MATH_AVX
248         __m256 zero = _mm256_setzero_ps();
249         __m256 *o = (__m256*)out;
250         __m256 *m = (__m256*)mat;
251         o[0] = _mm256_sub_ps(zero, m[0]);
252         o[1] = _mm256_sub_ps(zero, m[1]);
253 #else
254         out[0] = negate_ps(mat[0]);
255         out[1] = negate_ps(mat[1]);
256         out[2] = negate_ps(mat[2]);
257         out[3] = negate_ps(mat[3]);
258 #endif
259 }
260
261 #endif