// Copyright (C) 2016 Paul Lemire <[email protected]>
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only

#ifndef QT3DCORE_MATRIX4X4_SSE_P_H
#define QT3DCORE_MATRIX4X4_SSE_P_H

//
//  W A R N I N G
//  -------------
//
// This file is not part of the Qt3D API. It exists purely as an
// implementation detail. This header file may change from version to
// version without notice, or even be removed.
//
// We mean it.
//

#include <Qt3DCore/private/vector4d_p.h>
#include <Qt3DCore/private/vector3d_p.h>
#include <private/qsimd_p.h>
#include <QMatrix4x4>
#if defined(__AVX2__)
#include "matrix4x4_avx2_p.h"
#elif defined(__SSE2__)

QT_BEGIN_NAMESPACE

namespace Qt3DCore {

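// SSE2-backed 4x4 float matrix. The 16 floats live in four __m128 registers,
// one per column (column-major), which matches the layout returned by
// QMatrix4x4::constData() and keeps loads and stores cheap.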
class Matrix4x4_SSE
{
public:

    Q_ALWAYS_INLINE Matrix4x4_SSE() { setToIdentity(); }
    explicit Q_ALWAYS_INLINE Matrix4x4_SSE(Qt::Initialization) {}

    // QMatrix4x4::constData returns the data in column-major order
    explicit Q_ALWAYS_INLINE Matrix4x4_SSE(const QMatrix4x4 &mat)
    {
        // data may not be properly aligned, use unaligned loads
        const float *data = mat.constData();
        m_col1 = _mm_loadu_ps(data);
        m_col2 = _mm_loadu_ps(data + 4);
        m_col3 = _mm_loadu_ps(data + 8);
        m_col4 = _mm_loadu_ps(data + 12);
    }

    // Assumes data is 16-byte aligned (and in column-major order)
    explicit Q_ALWAYS_INLINE Matrix4x4_SSE(float *data)
    {
        m_col1 = _mm_load_ps(data);
        m_col2 = _mm_load_ps(data + 4);
        m_col3 = _mm_load_ps(data + 8);
        m_col4 = _mm_load_ps(data + 12);
    }

    // Arguments are in row-major order, but we store in column-major order
    explicit Q_ALWAYS_INLINE Matrix4x4_SSE(float m11, float m12, float m13, float m14,
                                           float m21, float m22, float m23, float m24,
                                           float m31, float m32, float m33, float m34,
                                           float m41, float m42, float m43, float m44)
    {
        m_col1 = _mm_set_ps(m41, m31, m21, m11);
        m_col2 = _mm_set_ps(m42, m32, m22, m12);
        m_col3 = _mm_set_ps(m43, m33, m23, m13);
        m_col4 = _mm_set_ps(m44, m34, m24, m14);
    }
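
    // Illustrative usage sketch only (names and values below are made up,
    // not part of the class):
    //
    //     QMatrix4x4 qmat;
    //     qmat.translate(1.0f, 2.0f, 3.0f);
    //     Matrix4x4_SSE a(qmat);            // load from an existing QMatrix4x4
    //     Matrix4x4_SSE b;                  // default-constructed: identity
    //     const Matrix4x4_SSE c = a * b;    // same result as qmat * QMatrix4x4()
    //     const QMatrix4x4 back = c.toQMatrix4x4();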

    Q_ALWAYS_INLINE void setToIdentity()
    {
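        // _mm_set_ss(1.0f) yields (1, 0, 0, 0): it writes the lowest lane and
        // zeroes the other three, which is exactly the first identity column.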
        m_col1 = _mm_set_ss(1.0f);
        m_col2 = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);
        m_col3 = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
        m_col4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE operator*(const Matrix4x4_SSE &other) const
    {
        Matrix4x4_SSE c(Qt::Uninitialized);

        const __m128 c1 = m_col1;
        const __m128 c2 = m_col2;
        const __m128 c3 = m_col3;
        const __m128 c4 = m_col4;

        // First result column: c11, c21, c31, c41.
        // Each result column is a linear combination of this matrix's columns,
        // weighted by the matching column of 'other' (n):
        //   c.col1 = n11 * col1 + n21 * col2 + n31 * col3 + n41 * col4
        // so that cij = mi1*n1j + mi2*n2j + mi3*n3j + mi4*n4j.
        __m128 tmp = _mm_mul_ps(_mm_set1_ps(other.m11()), c1);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m21()), c2), tmp);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m31()), c3), tmp);
        c.m_col1 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m41()), c4), tmp);

        // c12, c22, c32, c42
        tmp = _mm_mul_ps(_mm_set1_ps(other.m12()), c1);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m22()), c2), tmp);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m32()), c3), tmp);
        c.m_col2 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m42()), c4), tmp);

        // c13, c23, c33, c43
        tmp = _mm_mul_ps(_mm_set1_ps(other.m13()), c1);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m23()), c2), tmp);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m33()), c3), tmp);
        c.m_col3 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m43()), c4), tmp);

        // c14, c24, c34, c44
        tmp = _mm_mul_ps(_mm_set1_ps(other.m14()), c1);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m24()), c2), tmp);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m34()), c3), tmp);
        c.m_col4 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m44()), c4), tmp);

        return c;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE operator-(const Matrix4x4_SSE &other) const
    {
        Matrix4x4_SSE c(Qt::Uninitialized);

        c.m_col1 = _mm_sub_ps(m_col1, other.m_col1);
        c.m_col2 = _mm_sub_ps(m_col2, other.m_col2);
        c.m_col3 = _mm_sub_ps(m_col3, other.m_col3);
        c.m_col4 = _mm_sub_ps(m_col4, other.m_col4);

        return c;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE operator+(const Matrix4x4_SSE &other) const
    {
        Matrix4x4_SSE c(Qt::Uninitialized);

        c.m_col1 = _mm_add_ps(m_col1, other.m_col1);
        c.m_col2 = _mm_add_ps(m_col2, other.m_col2);
        c.m_col3 = _mm_add_ps(m_col3, other.m_col3);
        c.m_col4 = _mm_add_ps(m_col4, other.m_col4);

        return c;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE &operator*=(const Matrix4x4_SSE &other)
    {
        *this = *this * other;
        return *this;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE &operator-=(const Matrix4x4_SSE &other)
    {
        *this = *this - other;
        return *this;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE &operator+=(const Matrix4x4_SSE &other)
    {
        *this = *this + other;
        return *this;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE transposed() const
    {
        Matrix4x4_SSE c(Qt::Uninitialized);

        // ~113 instructions
        // 0b11011101 == 0xdd
        // 0b10001000 == 0x88
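        // 4x4 transpose done purely with shuffles: the first four pick the
        // even (0x88) / odd (0xdd) lanes out of each pair of columns, the last
        // four recombine those into the original rows, which become the
        // columns of the transpose.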
        const __m128 tmp1 = _mm_shuffle_ps(m_col1, m_col2, 0xdd);
        const __m128 tmp2 = _mm_shuffle_ps(m_col1, m_col2, 0x88);
        const __m128 tmp3 = _mm_shuffle_ps(m_col3, m_col4, 0xdd);
        const __m128 tmp4 = _mm_shuffle_ps(m_col3, m_col4, 0x88);
        c.m_col1 = _mm_shuffle_ps(tmp2, tmp4, 0x88);
        c.m_col2 = _mm_shuffle_ps(tmp1, tmp3, 0x88);
        c.m_col3 = _mm_shuffle_ps(tmp2, tmp4, 0xdd);
        c.m_col4 = _mm_shuffle_ps(tmp1, tmp3, 0xdd);

        return c;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE inverted() const
    {
        // TO DO: Optimize
        const QMatrix4x4 mat = toQMatrix4x4();
        return Matrix4x4_SSE(mat.inverted());
    }

    Q_ALWAYS_INLINE bool operator==(const Matrix4x4_SSE &other) const
    {
        // 0b1111 == 0xf
        return (_mm_movemask_ps(_mm_cmpeq_ps(m_col1, other.m_col1)) == 0xf &&
                _mm_movemask_ps(_mm_cmpeq_ps(m_col2, other.m_col2)) == 0xf &&
                _mm_movemask_ps(_mm_cmpeq_ps(m_col3, other.m_col3)) == 0xf &&
                _mm_movemask_ps(_mm_cmpeq_ps(m_col4, other.m_col4)) == 0xf);
    }

    Q_ALWAYS_INLINE bool operator!=(const Matrix4x4_SSE &other) const
    {
        return !(*this == other);
    }

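    // mIJ() returns the element at row I, column J. Since storage is
    // column-major, that is lane (I - 1) of m_colJ: lane 0 comes straight from
    // _mm_cvtss_f32, the other lanes are first broadcast to lane 0 with
    // _mm_shuffle_ps.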
    Q_ALWAYS_INLINE float m11() const { return _mm_cvtss_f32(m_col1); }
    Q_ALWAYS_INLINE float m12() const { return _mm_cvtss_f32(m_col2); }
    Q_ALWAYS_INLINE float m13() const { return _mm_cvtss_f32(m_col3); }
    Q_ALWAYS_INLINE float m14() const { return _mm_cvtss_f32(m_col4); }

    Q_ALWAYS_INLINE float m21() const
    {
        // 0b01010101 == 0x55
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0x55));
    }
    Q_ALWAYS_INLINE float m22() const
    {
        // 0b01010101 == 0x55
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0x55));
    }
    Q_ALWAYS_INLINE float m23() const
    {
        // 0b01010101 == 0x55
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0x55));
    }
    Q_ALWAYS_INLINE float m24() const
    {
        // 0b01010101 == 0x55
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0x55));
    }

    Q_ALWAYS_INLINE float m31() const
    {
        // 0b10101010 == 0xaa
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0xaa));
    }
    Q_ALWAYS_INLINE float m32() const
    {
        // 0b10101010 == 0xaa
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0xaa));
    }
    Q_ALWAYS_INLINE float m33() const
    {
        // 0b10101010 == 0xaa
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0xaa));
    }
    Q_ALWAYS_INLINE float m34() const
    {
        // 0b10101010 == 0xaa
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0xaa));
    }

    Q_ALWAYS_INLINE float m41() const
    {
        // 0b11111111 == 0xff
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0xff));
    }
    Q_ALWAYS_INLINE float m42() const
    {
        // 0b11111111 == 0xff
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0xff));
    }
    Q_ALWAYS_INLINE float m43() const
    {
        // 0b11111111 == 0xff
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0xff));
    }
    Q_ALWAYS_INLINE float m44() const
    {
        // 0b11111111 == 0xff
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0xff));
    }

    Q_ALWAYS_INLINE Vector4D row(int index) const
    {
        switch (index) {
        case 0:
            return Vector4D(m11(), m12(), m13(), m14());
        case 1:
            return Vector4D(m21(), m22(), m23(), m24());
        case 2:
            return Vector4D(m31(), m32(), m33(), m34());
        case 3:
            return Vector4D(m41(), m42(), m43(), m44());
        default:
            Q_UNREACHABLE_RETURN(Vector4D());
        }
    }

    Q_ALWAYS_INLINE Vector4D column(int index) const
    {
        Vector4D c(Qt::Uninitialized);
        switch (index) {
        case 0:
            c.m_xyzw = m_col1;
            break;
        case 1:
            c.m_xyzw = m_col2;
            break;
        case 2:
            c.m_xyzw = m_col3;
            break;
        case 3:
            c.m_xyzw = m_col4;
            break;
        default:
            Q_UNREACHABLE_RETURN(Vector4D());
        }
        return c;
    }

    Q_ALWAYS_INLINE float operator()(int row, int column) const {
        return this->row(row)[column];
    }

    Q_ALWAYS_INLINE QMatrix4x4 toQMatrix4x4() const { return QMatrix4x4(m11(), m12(), m13(), m14(),
                                                                        m21(), m22(), m23(), m24(),
                                                                        m31(), m32(), m33(), m34(),
                                                                        m41(), m42(), m43(), m44()); }

    Q_ALWAYS_INLINE Vector3D_SSE map(const Vector3D_SSE &point) const
    {
        return *this * point;
    }

    Q_ALWAYS_INLINE Vector4D_SSE map(const Vector4D_SSE &point) const
    {
        return *this * point;
    }

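    // Unlike map(), this applies only the upper-left 3x3 part of the matrix,
    // so translation (and any projective row) is ignored; appropriate for
    // transforming direction vectors rather than points.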
    Q_ALWAYS_INLINE Vector3D_SSE mapVector(const Vector3D_SSE &vector) const
    {
        const Vector3D_SSE row1(m11(), m12(), m13());
        const Vector3D_SSE row2(m21(), m22(), m23());
        const Vector3D_SSE row3(m31(), m32(), m33());

        return Vector3D(Vector3D_SSE::dotProduct(row1, vector),
                        Vector3D_SSE::dotProduct(row2, vector),
                        Vector3D_SSE::dotProduct(row3, vector));
    }

    friend Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_SSE &matrix);
    friend Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_SSE &matrix, const Vector4D &vector);

    friend Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_SSE &matrix);
    friend Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_SSE &matrix, const Vector3D &vector);

    friend Q_3DCORE_PRIVATE_EXPORT QDebug operator<<(QDebug dbg, const Matrix4x4_SSE &m);

private:
    // Internally we store the matrix as indicated below
    // Q_DECL_ALIGN(16) // aligned on a 16-byte boundary for SSE (column-major)
    // struct
    // {
    //     float m_m11, m_m21, m_m31, m_m41;
    //     float m_m12, m_m22, m_m32, m_m42;
    //     float m_m13, m_m23, m_m33, m_m43;
    //     float m_m14, m_m24, m_m34, m_m44;
    // };
    // struct
    // {
    //     float m[16];
    // };
    __m128 m_col1;
    __m128 m_col2;
    __m128 m_col3;
    __m128 m_col4;
};

Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_SSE &matrix)
{
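    // Row vector times matrix: the result's j-th component is the dot product
    // of 'vector' with the j-th column. The four per-column products are
    // reduced with shuffles and adds so all four dot products finish in
    // parallel, without a scalar horizontal add per component.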
    const __m128 vCol1 = _mm_mul_ps(matrix.m_col1, vector.m_xyzw);
    const __m128 vCol2 = _mm_mul_ps(matrix.m_col2, vector.m_xyzw);
    const __m128 vCol3 = _mm_mul_ps(matrix.m_col3, vector.m_xyzw);
    const __m128 vCol4 = _mm_mul_ps(matrix.m_col4, vector.m_xyzw);

    // 0b01000100 == 0x44
    // 0b11101110 == 0xee

    // vCol1.x, vCol1.y, vCol2.x, vCol2.y
    __m128 tmp1 = _mm_shuffle_ps(vCol1, vCol2, 0x44);
    // vCol1.z, vCol1.w, vCol2.z, vCol2.w
    __m128 tmp2 = _mm_shuffle_ps(vCol1, vCol2, 0xee);

    // vCol1.x + vCol1.z, vCol1.y + vCol1.w, vCol2.x + vCol2.z, vCol2.y + vCol2.w
    const __m128 tmpSum01 = _mm_add_ps(tmp1, tmp2);

    // vCol3.x, vCol3.y, vCol4.x, vCol4.y
    tmp1 = _mm_shuffle_ps(vCol3, vCol4, 0x44);
    // vCol3.z, vCol3.w, vCol4.z, vCol4.w
    tmp2 = _mm_shuffle_ps(vCol3, vCol4, 0xee);

    // vCol3.x + vCol3.z, vCol3.y + vCol3.w, vCol4.x + vCol4.z, vCol4.y + vCol4.w
    const __m128 tmpSum02 = _mm_add_ps(tmp1, tmp2);

    // 0b10001000 == 0x88
    // 0b11011101 == 0xdd

    // vCol1.x + vCol1.z, vCol2.x + vCol2.z, vCol3.x + vCol3.z, vCol4.x + vCol4.z
    tmp1 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0x88);
    // vCol1.y + vCol1.w, vCol2.y + vCol2.w, vCol3.y + vCol3.w, vCol4.y + vCol4.w
    tmp2 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0xdd);

    Vector4D v(Qt::Uninitialized);
    v.m_xyzw = _mm_add_ps(tmp1, tmp2);
    return v;
}

Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_SSE &matrix, const Vector4D &vector)
{
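    // M * v is evaluated as v * transpose(M): transposing turns the rows of M
    // into columns, so the row-vector path above computes the same dot products.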
    const Matrix4x4_SSE transposed = matrix.transposed();
    return vector * transposed;
}

Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_SSE &matrix)
{
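    // Treats the Vector3D as the point (x, y, z, 1), multiplies it as a row
    // vector and then divides by the resulting w component (perspective
    // divide), the same convention QMatrix4x4 uses when mapping a QVector3D.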
    const __m128 vec4 = _mm_set_ps(1.0f, vector.z(), vector.y(), vector.x());

    const __m128 vCol1 = _mm_mul_ps(matrix.m_col1, vec4);
    const __m128 vCol2 = _mm_mul_ps(matrix.m_col2, vec4);
    const __m128 vCol3 = _mm_mul_ps(matrix.m_col3, vec4);
    const __m128 vCol4 = _mm_mul_ps(matrix.m_col4, vec4);

    // 0b01000100 == 0x44
    // 0b11101110 == 0xee

    // vCol1.x, vCol1.y, vCol2.x, vCol2.y
    __m128 tmp1 = _mm_shuffle_ps(vCol1, vCol2, 0x44);
    // vCol1.z, vCol1.w, vCol2.z, vCol2.w
    __m128 tmp2 = _mm_shuffle_ps(vCol1, vCol2, 0xee);

    // vCol1.x + vCol1.z, vCol1.y + vCol1.w, vCol2.x + vCol2.z, vCol2.y + vCol2.w
    const __m128 tmpSum01 = _mm_add_ps(tmp1, tmp2);

    // vCol3.x, vCol3.y, vCol4.x, vCol4.y
    tmp1 = _mm_shuffle_ps(vCol3, vCol4, 0x44);
    // vCol3.z, vCol3.w, vCol4.z, vCol4.w
    tmp2 = _mm_shuffle_ps(vCol3, vCol4, 0xee);

    // vCol3.x + vCol3.z, vCol3.y + vCol3.w, vCol4.x + vCol4.z, vCol4.y + vCol4.w
    const __m128 tmpSum02 = _mm_add_ps(tmp1, tmp2);

    // 0b10001000 == 0x88
    // 0b11011101 == 0xdd

    // vCol1.x + vCol1.z, vCol2.x + vCol2.z, vCol3.x + vCol3.z, vCol4.x + vCol4.z
    tmp1 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0x88);
    // vCol1.y + vCol1.w, vCol2.y + vCol2.w, vCol3.y + vCol3.w, vCol4.y + vCol4.w
    tmp2 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0xdd);

    const __m128 result = _mm_add_ps(tmp1, tmp2);
    // 0b11111111 == 0xff
    const __m128 divisor = _mm_shuffle_ps(result, result, 0xff);
    Vector3D v(Qt::Uninitialized);
    v.m_xyzw = _mm_div_ps(result, divisor);
    return v;
}

Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_SSE &matrix, const Vector3D &vector)
{
    const Matrix4x4_SSE transposed = matrix.transposed();
    return vector * transposed;
}

} // Qt3DCore

Q_DECLARE_TYPEINFO(Qt3DCore::Matrix4x4_SSE, Q_PRIMITIVE_TYPE);

QT_END_NAMESPACE

Q_DECLARE_METATYPE(Qt3DCore::Matrix4x4_SSE)

#endif // __SSE2__

#endif // QT3DCORE_MATRIX4X4_SSE_P_H