// Copyright (C) 2016 Paul Lemire <[email protected]>
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only

#ifndef QT3DCORE_MATRIX4X4_SSE_P_H
#define QT3DCORE_MATRIX4X4_SSE_P_H

//
//  W A R N I N G
//  -------------
//
// This file is not part of the Qt3D API. It exists purely as an
// implementation detail. This header file may change from version to
// version without notice, or even be removed.
//
// We mean it.
//

#include <Qt3DCore/private/vector4d_p.h>
#include <Qt3DCore/private/vector3d_p.h>
#include <private/qsimd_p.h>
#include <QMatrix4x4>
#if defined(__AVX2__)
#include "matrix4x4_avx2_p.h"
#elif defined(__SSE2__)

QT_BEGIN_NAMESPACE

namespace Qt3DCore {

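// SSE2-backed 4x4 float matrix. The 16 floats live in four __m128 registers,
// one per column (column-major), which matches the layout returned by
// QMatrix4x4::constData() and keeps loads and stores cheap.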
class Matrix4x4_SSE
{
public:

    Q_ALWAYS_INLINE Matrix4x4_SSE() { setToIdentity(); }
    explicit Q_ALWAYS_INLINE Matrix4x4_SSE(Qt::Initialization) {}

    // QMatrix4x4::constData returns the data in column-major order
    explicit Q_ALWAYS_INLINE Matrix4x4_SSE(const QMatrix4x4 &mat)
    {
        // data may not be properly aligned, use unaligned loads
        const float *data = mat.constData();
        m_col1 = _mm_loadu_ps(data);
        m_col2 = _mm_loadu_ps(data + 4);
        m_col3 = _mm_loadu_ps(data + 8);
        m_col4 = _mm_loadu_ps(data + 12);
    }

    // Assumes data is 16-byte aligned (and in column-major order)
    explicit Q_ALWAYS_INLINE Matrix4x4_SSE(float *data)
    {
        m_col1 = _mm_load_ps(data);
        m_col2 = _mm_load_ps(data + 4);
        m_col3 = _mm_load_ps(data + 8);
        m_col4 = _mm_load_ps(data + 12);
    }

    // Arguments are in row-major order, but we store in column-major order
    explicit Q_ALWAYS_INLINE Matrix4x4_SSE(float m11, float m12, float m13, float m14,
                                           float m21, float m22, float m23, float m24,
                                           float m31, float m32, float m33, float m34,
                                           float m41, float m42, float m43, float m44)
    {
        m_col1 = _mm_set_ps(m41, m31, m21, m11);
        m_col2 = _mm_set_ps(m42, m32, m22, m12);
        m_col3 = _mm_set_ps(m43, m33, m23, m13);
        m_col4 = _mm_set_ps(m44, m34, m24, m14);
    }
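
    // Illustrative usage sketch only (names and values below are made up,
    // not part of the class):
    //
    //     QMatrix4x4 qmat;
    //     qmat.translate(1.0f, 2.0f, 3.0f);
    //     Matrix4x4_SSE a(qmat);            // load from an existing QMatrix4x4
    //     Matrix4x4_SSE b;                  // default-constructed: identity
    //     const Matrix4x4_SSE c = a * b;    // same result as qmat * QMatrix4x4()
    //     const QMatrix4x4 back = c.toQMatrix4x4();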

    Q_ALWAYS_INLINE void setToIdentity()
    {
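        // _mm_set_ss(1.0f) yields (1, 0, 0, 0): it writes the lowest lane and
        // zeroes the other three, which is exactly the first identity column.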
        m_col1 = _mm_set_ss(1.0f);
        m_col2 = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);
        m_col3 = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
        m_col4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE operator*(const Matrix4x4_SSE &other) const
    {
        Matrix4x4_SSE c(Qt::Uninitialized);

        const __m128 c1 = m_col1;
        const __m128 c2 = m_col2;
        const __m128 c3 = m_col3;
        const __m128 c4 = m_col4;

        // First result column: c11, c21, c31, c41.
        // Each result column is a linear combination of this matrix's columns,
        // weighted by the matching column of 'other' (n):
        //   c.col1 = n11 * col1 + n21 * col2 + n31 * col3 + n41 * col4
        // so that cij = mi1*n1j + mi2*n2j + mi3*n3j + mi4*n4j.
        __m128 tmp = _mm_mul_ps(_mm_set1_ps(other.m11()), c1);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m21()), c2), tmp);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m31()), c3), tmp);
        c.m_col1 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m41()), c4), tmp);

        // c12, c22, c32, c42
        tmp = _mm_mul_ps(_mm_set1_ps(other.m12()), c1);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m22()), c2), tmp);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m32()), c3), tmp);
        c.m_col2 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m42()), c4), tmp);

        // c13, c23, c33, c43
        tmp = _mm_mul_ps(_mm_set1_ps(other.m13()), c1);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m23()), c2), tmp);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m33()), c3), tmp);
        c.m_col3 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m43()), c4), tmp);

        // c14, c24, c34, c44
        tmp = _mm_mul_ps(_mm_set1_ps(other.m14()), c1);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m24()), c2), tmp);
        tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m34()), c3), tmp);
        c.m_col4 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m44()), c4), tmp);

        return c;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE operator-(const Matrix4x4_SSE &other) const
    {
        Matrix4x4_SSE c(Qt::Uninitialized);

        c.m_col1 = _mm_sub_ps(m_col1, other.m_col1);
        c.m_col2 = _mm_sub_ps(m_col2, other.m_col2);
        c.m_col3 = _mm_sub_ps(m_col3, other.m_col3);
        c.m_col4 = _mm_sub_ps(m_col4, other.m_col4);

        return c;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE operator+(const Matrix4x4_SSE &other) const
    {
        Matrix4x4_SSE c(Qt::Uninitialized);

        c.m_col1 = _mm_add_ps(m_col1, other.m_col1);
        c.m_col2 = _mm_add_ps(m_col2, other.m_col2);
        c.m_col3 = _mm_add_ps(m_col3, other.m_col3);
        c.m_col4 = _mm_add_ps(m_col4, other.m_col4);

        return c;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE &operator*=(const Matrix4x4_SSE &other)
    {
        *this = *this * other;
        return *this;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE &operator-=(const Matrix4x4_SSE &other)
    {
        *this = *this - other;
        return *this;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE &operator+=(const Matrix4x4_SSE &other)
    {
        *this = *this + other;
        return *this;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE transposed() const
    {
        Matrix4x4_SSE c(Qt::Uninitialized);

        // ~113 instructions
        // 0b11011101 == 0xdd
        // 0b10001000 == 0x88
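        // 4x4 transpose done purely with shuffles: the first four pick the
        // even (0x88) / odd (0xdd) lanes out of each pair of columns, the last
        // four recombine those into the original rows, which become the
        // columns of the transpose.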
        const __m128 tmp1 = _mm_shuffle_ps(m_col1, m_col2, 0xdd);
        const __m128 tmp2 = _mm_shuffle_ps(m_col1, m_col2, 0x88);
        const __m128 tmp3 = _mm_shuffle_ps(m_col3, m_col4, 0xdd);
        const __m128 tmp4 = _mm_shuffle_ps(m_col3, m_col4, 0x88);
        c.m_col1 = _mm_shuffle_ps(tmp2, tmp4, 0x88);
        c.m_col2 = _mm_shuffle_ps(tmp1, tmp3, 0x88);
        c.m_col3 = _mm_shuffle_ps(tmp2, tmp4, 0xdd);
        c.m_col4 = _mm_shuffle_ps(tmp1, tmp3, 0xdd);

        return c;
    }

    Q_ALWAYS_INLINE Matrix4x4_SSE inverted() const
    {
        // TO DO: Optimize
        const QMatrix4x4 mat = toQMatrix4x4();
        return Matrix4x4_SSE(mat.inverted());
    }

    Q_ALWAYS_INLINE bool operator==(const Matrix4x4_SSE &other) const
    {
        // 0b1111 == 0xf
        return (_mm_movemask_ps(_mm_cmpeq_ps(m_col1, other.m_col1)) == 0xf &&
                _mm_movemask_ps(_mm_cmpeq_ps(m_col2, other.m_col2)) == 0xf &&
                _mm_movemask_ps(_mm_cmpeq_ps(m_col3, other.m_col3)) == 0xf &&
                _mm_movemask_ps(_mm_cmpeq_ps(m_col4, other.m_col4)) == 0xf);
    }

    Q_ALWAYS_INLINE bool operator!=(const Matrix4x4_SSE &other) const
    {
        return !(*this == other);
    }

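    // mIJ() returns the element at row I, column J. Since storage is
    // column-major, that is lane (I - 1) of m_colJ: lane 0 comes straight from
    // _mm_cvtss_f32, the other lanes are first broadcast to lane 0 with
    // _mm_shuffle_ps.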
    Q_ALWAYS_INLINE float m11() const { return _mm_cvtss_f32(m_col1); }
    Q_ALWAYS_INLINE float m12() const { return _mm_cvtss_f32(m_col2); }
    Q_ALWAYS_INLINE float m13() const { return _mm_cvtss_f32(m_col3); }
    Q_ALWAYS_INLINE float m14() const { return _mm_cvtss_f32(m_col4); }

    Q_ALWAYS_INLINE float m21() const
    {
        // 0b01010101 == 0x55
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0x55));
    }
    Q_ALWAYS_INLINE float m22() const
    {
        // 0b01010101 == 0x55
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0x55));
    }
    Q_ALWAYS_INLINE float m23() const
    {
        // 0b01010101 == 0x55
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0x55));
    }
    Q_ALWAYS_INLINE float m24() const
    {
        // 0b01010101 == 0x55
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0x55));
    }

    Q_ALWAYS_INLINE float m31() const
    {
        // 0b10101010 == 0xaa
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0xaa));
    }
    Q_ALWAYS_INLINE float m32() const
    {
        // 0b10101010 == 0xaa
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0xaa));
    }
    Q_ALWAYS_INLINE float m33() const
    {
        // 0b10101010 == 0xaa
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0xaa));
    }
    Q_ALWAYS_INLINE float m34() const
    {
        // 0b10101010 == 0xaa
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0xaa));
    }

    Q_ALWAYS_INLINE float m41() const
    {
        // 0b11111111 == 0xff
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0xff));
    }
    Q_ALWAYS_INLINE float m42() const
    {
        // 0b11111111 == 0xff
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0xff));
    }
    Q_ALWAYS_INLINE float m43() const
    {
        // 0b11111111 == 0xff
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0xff));
    }
    Q_ALWAYS_INLINE float m44() const
    {
        // 0b11111111 == 0xff
        return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0xff));
    }

    Q_ALWAYS_INLINE Vector4D row(int index) const
    {
        switch (index) {
        case 0:
            return Vector4D(m11(), m12(), m13(), m14());
        case 1:
            return Vector4D(m21(), m22(), m23(), m24());
        case 2:
            return Vector4D(m31(), m32(), m33(), m34());
        case 3:
            return Vector4D(m41(), m42(), m43(), m44());
        default:
            Q_UNREACHABLE_RETURN(Vector4D());
        }
    }

    Q_ALWAYS_INLINE Vector4D column(int index) const
    {
        Vector4D c(Qt::Uninitialized);
        switch (index) {
        case 0:
            c.m_xyzw = m_col1;
            break;
        case 1:
            c.m_xyzw = m_col2;
            break;
        case 2:
            c.m_xyzw = m_col3;
            break;
        case 3:
            c.m_xyzw = m_col4;
            break;
        default:
            Q_UNREACHABLE_RETURN(Vector4D());
        }
        return c;
    }

    Q_ALWAYS_INLINE float operator()(int row, int column) const {
        return this->row(row)[column];
    }

    Q_ALWAYS_INLINE QMatrix4x4 toQMatrix4x4() const { return QMatrix4x4(m11(), m12(), m13(), m14(),
                                                                        m21(), m22(), m23(), m24(),
                                                                        m31(), m32(), m33(), m34(),
                                                                        m41(), m42(), m43(), m44()); }

    Q_ALWAYS_INLINE Vector3D_SSE map(const Vector3D_SSE &point) const
    {
        return *this * point;
    }

    Q_ALWAYS_INLINE Vector4D_SSE map(const Vector4D_SSE &point) const
    {
        return *this * point;
    }

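    // Unlike map(), this applies only the upper-left 3x3 part of the matrix,
    // so translation (and any projective row) is ignored; appropriate for
    // transforming direction vectors rather than points.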
    Q_ALWAYS_INLINE Vector3D_SSE mapVector(const Vector3D_SSE &vector) const
    {
        const Vector3D_SSE row1(m11(), m12(), m13());
        const Vector3D_SSE row2(m21(), m22(), m23());
        const Vector3D_SSE row3(m31(), m32(), m33());

        return Vector3D(Vector3D_SSE::dotProduct(row1, vector),
                        Vector3D_SSE::dotProduct(row2, vector),
                        Vector3D_SSE::dotProduct(row3, vector));
    }

    friend Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_SSE &matrix);
    friend Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_SSE &matrix, const Vector4D &vector);

    friend Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_SSE &matrix);
    friend Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_SSE &matrix, const Vector3D &vector);

    friend Q_3DCORE_PRIVATE_EXPORT QDebug operator<<(QDebug dbg, const Matrix4x4_SSE &m);

private:
    // Internally we store the matrix as indicated below
    // Q_DECL_ALIGN(16) // aligned on a 16-byte boundary for SSE (column-major)
    // struct
    // {
    //     float m_m11, m_m21, m_m31, m_m41;
    //     float m_m12, m_m22, m_m32, m_m42;
    //     float m_m13, m_m23, m_m33, m_m43;
    //     float m_m14, m_m24, m_m34, m_m44;
    // };
    // struct
    // {
    //     float m[16];
    // };
    __m128 m_col1;
    __m128 m_col2;
    __m128 m_col3;
    __m128 m_col4;
};

Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_SSE &matrix)
{
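    // Row vector times matrix: the result's j-th component is the dot product
    // of 'vector' with the j-th column. The four per-column products are
    // reduced with shuffles and adds so all four dot products finish in
    // parallel, without a scalar horizontal add per component.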
    const __m128 vCol1 = _mm_mul_ps(matrix.m_col1, vector.m_xyzw);
    const __m128 vCol2 = _mm_mul_ps(matrix.m_col2, vector.m_xyzw);
    const __m128 vCol3 = _mm_mul_ps(matrix.m_col3, vector.m_xyzw);
    const __m128 vCol4 = _mm_mul_ps(matrix.m_col4, vector.m_xyzw);

    // 0b01000100 == 0x44
    // 0b11101110 == 0xee

    // vCol1.x, vCol1.y, vCol2.x, vCol2.y
    __m128 tmp1 = _mm_shuffle_ps(vCol1, vCol2, 0x44);
    // vCol1.z, vCol1.w, vCol2.z, vCol2.w
    __m128 tmp2 = _mm_shuffle_ps(vCol1, vCol2, 0xee);

    // vCol1.x + vCol1.z, vCol1.y + vCol1.w, vCol2.x + vCol2.z, vCol2.y + vCol2.w
    const __m128 tmpSum01 = _mm_add_ps(tmp1, tmp2);

    // vCol3.x, vCol3.y, vCol4.x, vCol4.y
    tmp1 = _mm_shuffle_ps(vCol3, vCol4, 0x44);
    // vCol3.z, vCol3.w, vCol4.z, vCol4.w
    tmp2 = _mm_shuffle_ps(vCol3, vCol4, 0xee);

    // vCol3.x + vCol3.z, vCol3.y + vCol3.w, vCol4.x + vCol4.z, vCol4.y + vCol4.w
    const __m128 tmpSum02 = _mm_add_ps(tmp1, tmp2);

    // 0b10001000 == 0x88
    // 0b11011101 == 0xdd

    // vCol1.x + vCol1.z, vCol2.x + vCol2.z, vCol3.x + vCol3.z, vCol4.x + vCol4.z
    tmp1 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0x88);
    // vCol1.y + vCol1.w, vCol2.y + vCol2.w, vCol3.y + vCol3.w, vCol4.y + vCol4.w
    tmp2 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0xdd);

    Vector4D v(Qt::Uninitialized);
    v.m_xyzw = _mm_add_ps(tmp1, tmp2);
    return v;
}

Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_SSE &matrix, const Vector4D &vector)
{
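    // M * v is evaluated as v * transpose(M): transposing turns the rows of M
    // into columns, so the row-vector path above computes the same dot products.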
    const Matrix4x4_SSE transposed = matrix.transposed();
    return vector * transposed;
}

Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_SSE &matrix)
{
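    // Treats the Vector3D as the point (x, y, z, 1), multiplies it as a row
    // vector and then divides by the resulting w component (perspective
    // divide), the same convention QMatrix4x4 uses when mapping a QVector3D.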
    const __m128 vec4 = _mm_set_ps(1.0f, vector.z(), vector.y(), vector.x());

    const __m128 vCol1 = _mm_mul_ps(matrix.m_col1, vec4);
    const __m128 vCol2 = _mm_mul_ps(matrix.m_col2, vec4);
    const __m128 vCol3 = _mm_mul_ps(matrix.m_col3, vec4);
    const __m128 vCol4 = _mm_mul_ps(matrix.m_col4, vec4);

    // 0b01000100 == 0x44
    // 0b11101110 == 0xee

    // vCol1.x, vCol1.y, vCol2.x, vCol2.y
    __m128 tmp1 = _mm_shuffle_ps(vCol1, vCol2, 0x44);
    // vCol1.z, vCol1.w, vCol2.z, vCol2.w
    __m128 tmp2 = _mm_shuffle_ps(vCol1, vCol2, 0xee);

    // vCol1.x + vCol1.z, vCol1.y + vCol1.w, vCol2.x + vCol2.z, vCol2.y + vCol2.w
    const __m128 tmpSum01 = _mm_add_ps(tmp1, tmp2);

    // vCol3.x, vCol3.y, vCol4.x, vCol4.y
    tmp1 = _mm_shuffle_ps(vCol3, vCol4, 0x44);
    // vCol3.z, vCol3.w, vCol4.z, vCol4.w
    tmp2 = _mm_shuffle_ps(vCol3, vCol4, 0xee);

    // vCol3.x + vCol3.z, vCol3.y + vCol3.w, vCol4.x + vCol4.z, vCol4.y + vCol4.w
    const __m128 tmpSum02 = _mm_add_ps(tmp1, tmp2);

    // 0b10001000 == 0x88
    // 0b11011101 == 0xdd

    // vCol1.x + vCol1.z, vCol2.x + vCol2.z, vCol3.x + vCol3.z, vCol4.x + vCol4.z
    tmp1 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0x88);
    // vCol1.y + vCol1.w, vCol2.y + vCol2.w, vCol3.y + vCol3.w, vCol4.y + vCol4.w
    tmp2 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0xdd);

    const __m128 result = _mm_add_ps(tmp1, tmp2);
    // 0b11111111 == 0xff
    const __m128 divisor = _mm_shuffle_ps(result, result, 0xff);
    Vector3D v(Qt::Uninitialized);
    v.m_xyzw = _mm_div_ps(result, divisor);
    return v;
}

Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_SSE &matrix, const Vector3D &vector)
{
    const Matrix4x4_SSE transposed = matrix.transposed();
    return vector * transposed;
}

} // Qt3DCore

Q_DECLARE_TYPEINFO(Qt3DCore::Matrix4x4_SSE, Q_PRIMITIVE_TYPE);

QT_END_NAMESPACE

Q_DECLARE_METATYPE(Qt3DCore::Matrix4x4_SSE)

#endif // __SSE2__

#endif // QT3DCORE_MATRIX4X4_SSE_P_H