| 1 | // Copyright (C) 2021 The Qt Company Ltd. | 
|---|
| 2 | // Copyright (C) 2022 Intel Corporation. | 
|---|
| 3 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only | 
|---|
| 4 |  | 
|---|
| 5 | #ifndef QSIMD_P_H | 
|---|
| 6 | #define QSIMD_P_H | 
|---|
| 7 |  | 
|---|
| 8 | // | 
|---|
| 9 | //  W A R N I N G | 
|---|
| 10 | //  ------------- | 
|---|
| 11 | // | 
|---|
| 12 | // This file is not part of the Qt API.  It exists purely as an | 
|---|
| 13 | // implementation detail.  This header file may change from version to | 
|---|
| 14 | // version without notice, or even be removed. | 
|---|
| 15 | // | 
|---|
| 16 | // We mean it. | 
|---|
| 17 | // | 
|---|
| 18 |  | 
|---|
| 19 | #include <QtCore/private/qglobal_p.h> | 
|---|
| 20 | #include <QtCore/qsimd.h> | 
|---|
| 21 |  | 
|---|
| 22 | QT_WARNING_PUSH | 
|---|
| 23 | QT_WARNING_DISABLE_CLANG( "-Wundef") | 
|---|
| 24 | QT_WARNING_DISABLE_GCC( "-Wundef") | 
|---|
| 25 | QT_WARNING_DISABLE_INTEL(103) | 
|---|
| 26 |  | 
|---|
// Loop header for the scalar prologue that precedes a 16-byte-aligned SIMD
// loop. It advances `i` while `i` is below the smaller of `length` and the
// number of 4-byte steps separating `ptr` from the next 16-byte boundary
// (the address is shifted right by two before masking, so elements are
// presumably 4 bytes wide and `i` presumably starts at 0 -- confirm at the
// call site).
#define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \
    for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), \
                                     ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); \
         ++i)
| 29 |  | 
|---|
// Loop header for the scalar prologue that precedes a 32-byte-aligned SIMD
// loop. It advances `i` while `i` is below the smaller of `length` and the
// number of 4-byte steps separating `ptr` from the next 32-byte boundary
// (the address is shifted right by two before masking, so elements are
// presumably 4 bytes wide and `i` presumably starts at 0 -- confirm at the
// call site).
#define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \
    for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), \
                                     ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); \
         ++i)
| 32 |  | 
|---|
// Loop header for the scalar epilogue that follows a SIMD loop: runs the
// attached statement at most `max` times, stopping early once `i` reaches
// `length`. `i` is advanced by the loop, so it must be a modifiable lvalue.
// All arguments are parenthesized so that expressions with operators of lower
// precedence than `<` / `&&` (e.g. a ternary passed as `max`) keep their
// intended meaning.
#define SIMD_EPILOGUE(i, length, max) \
    for (int _i = 0; _i < (max) && (i) < (length); ++(i), ++_i)
| 35 |  | 
|---|
/*
 * Code can use the following constructs to determine compiler support & status:
 * - #ifdef __XXX__      (e.g: #ifdef __AVX__  or #ifdef __ARM_NEON__)
 *   If this test passes, then the compiler is already generating code for that
 *   given sub-architecture. The intrinsics for that sub-architecture are
 *   #included and can be used without restriction or runtime check.
 *
 * - #if QT_COMPILER_SUPPORTS(XXX)
 *   If this test passes, then the compiler is able to generate code for that
 *   given sub-architecture in another translation unit, given the right set of
 *   flags. Use of the intrinsics is not guaranteed. This is useful with
 *   runtime detection (see below).
 *
 * - #if QT_COMPILER_SUPPORTS_HERE(XXX)
 *   If this test passes, then the compiler is able to generate code for that
 *   given sub-architecture in this translation unit, even if it is not doing
 *   that now (it might be). Individual functions may be tagged with
 *   QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that
 *   sub-arch. Only inside such functions is the use of the intrinsics
 *   guaranteed to work. This is useful with runtime detection (see below).
 *
 * The distinction between QT_COMPILER_SUPPORTS and QT_COMPILER_SUPPORTS_HERE is
 * historical: GCC 4.8 needed the distinction.
 *
 * Runtime detection of a CPU sub-architecture can be done with the
 * qCpuHasFeature(XXX) function. There are two strategies for generating
 * optimized code like that:
 *
 * 1) place the optimized code in a different translation unit (C or assembly
 * sources) and pass the correct flags to the compiler to enable support. Those
 * sources must not include qglobal.h, which means they cannot include this
 * file either. The dispatcher function would look like this:
 *
 *      void foo()
 *      {
 *      #if QT_COMPILER_SUPPORTS(XXX)
 *          if (qCpuHasFeature(XXX)) {
 *              foo_optimized_xxx();
 *              return;
 *          }
 *      #endif
 *          foo_plain();
 *      }
 *
 * 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and
 * surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use
 * other Qt code. The dispatcher function would look like this:
 *
 *      void foo()
 *      {
 *      #if QT_COMPILER_SUPPORTS_HERE(XXX)
 *          if (qCpuHasFeature(XXX)) {
 *              foo_optimized_xxx();
 *              return;
 *          }
 *      #endif
 *          foo_plain();
 *      }
 */
| 95 |  | 
|---|
| 96 | #if defined(__MINGW64_VERSION_MAJOR) || defined(Q_CC_MSVC) | 
|---|
| 97 | #include <intrin.h> | 
|---|
| 98 | #endif | 
|---|
| 99 |  | 
|---|
// Expands to the value of QT_COMPILER_SUPPORTS_<x>. The trailing "- 0" keeps
// the expression well-formed (and equal to 0) when that macro is defined to
// nothing; inside an #if an undefined macro also evaluates to 0.
#define QT_COMPILER_SUPPORTS(x) (QT_COMPILER_SUPPORTS_##x - 0)
| 101 |  | 
|---|
// Per-architecture definitions of:
//   QT_COMPILER_SUPPORTS_HERE(x) - can code for sub-architecture x be generated
//                                  in this translation unit?
//   QT_FUNCTION_TARGET(x)        - function attribute selecting sub-architecture
//                                  x (expands to nothing where unsupported).
#if defined(Q_PROCESSOR_ARM)
// ARM: the __ARM_FEATURE_<x> macro, the __<x>__ macro or the
// QT_COMPILER_SUPPORTS_<x> flag all count.
#  define QT_COMPILER_SUPPORTS_HERE(x) \
        ((__ARM_FEATURE_##x) || (__##x##__) || QT_COMPILER_SUPPORTS(x))
#  ifdef Q_CC_GNU
// GCC requires attributes for a function
#    define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_##x)))
#  else
#    define QT_FUNCTION_TARGET(x)
#  endif
#elif defined(Q_PROCESSOR_MIPS)
// MIPS: only what the compiler is already targeting; no function targets.
#  define QT_COMPILER_SUPPORTS_HERE(x) (__##x##__)
#  define QT_FUNCTION_TARGET(x)
// Map the compiler's lower-case DSP macros onto the __XXX__ spelling used
// above (32-bit MIPS only).
#  if defined(Q_PROCESSOR_MIPS_32) && defined(__mips_dsp) && !defined(__MIPS_DSP__)
#    define __MIPS_DSP__
#  endif
#  if defined(Q_PROCESSOR_MIPS_32) && defined(__mips_dspr2) && !defined(__MIPS_DSPR2__)
#    define __MIPS_DSPR2__
#  endif
#elif defined(Q_PROCESSOR_X86)
#  if defined(Q_CC_CLANG) && defined(Q_CC_MSVC)
// Both compiler macros set: only the __<x>__ macro is consulted.
#    define QT_COMPILER_SUPPORTS_HERE(x) (__##x##__)
#  else
#    define QT_COMPILER_SUPPORTS_HERE(x) ((__##x##__) || QT_COMPILER_SUPPORTS(x))
#  endif
#  ifdef Q_CC_GNU
// GCC requires attributes for a function
#    define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_##x)))
#  else
#    define QT_FUNCTION_TARGET(x)
#  endif
#else
// Any other architecture: only what the compiler is already targeting.
#  define QT_COMPILER_SUPPORTS_HERE(x) (__##x##__)
#  define QT_FUNCTION_TARGET(x)
#endif
| 135 |  | 
|---|
#if defined(__SSE2__) && !defined(QT_COMPILER_SUPPORTS_SSE2) && !defined(QT_BOOTSTRAPPED)
// The compiler targets SSE2 but QT_COMPILER_SUPPORTS_SSE2 is not set:
// intrinsic support appears to be missing, so pretend none of these x86
// features exist.

// SSE family
#  undef __SSE__
#  undef __SSE2__
#  undef __SSE3__
#  undef __SSSE3__
#  undef __SSE4_1__
#  undef __SSE4_2__

// Stand-alone extensions
#  undef __AES__
#  undef __POPCNT__
#  undef __F16C__
#  undef __RDRND__
#  undef __BMI__
#  undef __BMI2__
#  undef __FMA__
#  undef __MOVBE__
#  undef __RDSEED__
#  undef __SHA__
#  undef __GFNI__
#  undef __VAES__

// AVX and AVX2
#  undef __AVX__
#  undef __AVX2__

// AVX-512 family
#  undef __AVX512F__
#  undef __AVX512ER__
#  undef __AVX512CD__
#  undef __AVX512PF__
#  undef __AVX512DQ__
#  undef __AVX512BW__
#  undef __AVX512VL__
#  undef __AVX512IFMA__
#  undef __AVX512VBMI__
#  undef __AVX512VBMI2__
#  undef __AVX512BITALG__
#  undef __AVX512VNNI__
#  undef __AVX512VPOPCNTDQ__
#endif
| 172 |  | 
|---|
| 173 | #ifdef Q_PROCESSOR_X86 | 
|---|
| 174 | /* -- x86 intrinsic support -- */ | 
|---|
| 175 |  | 
|---|
#  if defined(Q_OS_QNX) && defined(QT_COMPILER_SUPPORTS_RDSEED)
// The QNX compiler lacks the RDSEED intrinsic, so withdraw the claim of
// support.
#    undef QT_COMPILER_SUPPORTS_RDSEED
#  endif
#  if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2)
// MSVC doesn't define __SSE__ / __SSE2__, so do it ourselves whenever the
// target guarantees SSE2 (any x64 build, or 32-bit with /arch:SSE2 or
// higher). Both macros are needed: the rest of this header and its users test
// __SSE2__.
#    define __SSE__                         1
#    define __SSE2__                        1
#  endif
| 184 |  | 
|---|
#  if defined(Q_OS_WIN) && defined(Q_CC_GNU) && !defined(Q_CC_CLANG)
// GCC on 64-bit Windows does not properly support AVX; see
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49001
// As a workaround, the assembler macros below replace each aligned AVX move
// mnemonic with its unaligned counterpart, so the compiler's output always
// uses unaligned loads & stores.
asm(
    // vmovapd -> vmovupd
    ".macro vmovapd args:vararg\n"
    "    vmovupd \\args\n"
    ".endm\n"
    // vmovaps -> vmovups
    ".macro vmovaps args:vararg\n"
    "    vmovups \\args\n"
    ".endm\n"
    // vmovdqa -> vmovdqu
    ".macro vmovdqa args:vararg\n"
    "    vmovdqu \\args\n"
    ".endm\n"
    // vmovdqa32 -> vmovdqu32
    ".macro vmovdqa32 args:vararg\n"
    "    vmovdqu32 \\args\n"
    ".endm\n"
    // vmovdqa64 -> vmovdqu64
    ".macro vmovdqa64 args:vararg\n"
    "    vmovdqu64 \\args\n"
    ".endm\n"
);
#  endif
| 207 |  | 
|---|
| 208 | #  if defined(Q_CC_GNU) && !defined(Q_OS_WASM) | 
|---|
| 209 | // GCC 4.4 and Clang 2.8 added a few more intrinsics there | 
|---|
| 210 | #    include <x86intrin.h> | 
|---|
| 211 | #  endif | 
|---|
| 212 | #ifdef Q_OS_WASM | 
|---|
| 213 | #   include <immintrin.h> | 
|---|
| 214 | # endif | 
|---|
| 215 |  | 
|---|
| 216 | #  include <QtCore/private/qsimd_x86_p.h> | 
|---|
| 217 |  | 
|---|
// x86-64 sub-architecture version 3 (x86-64-v3)
//
// Intel's 4th-generation Core, codenamed "Haswell", introduced AVX2, BMI1,
// BMI2, FMA, LZCNT and MOVBE. That feature set was chosen as version 3 of the
// x86-64 ISA and is supported by GCC and Clang. On systems with the GNU libc,
// libraries built with these features can be installed in a
// "glibc-hwcaps/x86-64-v3" subdir. macOS's fat binaries support the
// equivalent "x86_64h" sub-architecture too.

#  ifdef __AVX2__
// Features present with -march=x86-64-v3 that are not architecturally implied
// by __AVX2__. Seven macros are summed and the total must be 7, i.e. all of
// them must be enabled together.
#    define ARCH_HASWELL_MACROS \
        (__AVX2__ + __BMI__ + __BMI2__ + __F16C__ + __FMA__ + __LZCNT__ + __POPCNT__)
#    if ARCH_HASWELL_MACROS != 7
#      error "Please enable all x86-64-v3 extensions; you probably want to use -march=haswell or -march=x86-64-v3 instead of -mavx2"
#    endif
// Outside of #if, a missing macro is an undeclared identifier, so the
// compiler's diagnostic names the feature that is absent.
static_assert(ARCH_HASWELL_MACROS, "Undeclared identifiers indicate which features are missing.");
#    define __haswell__ 1
#    undef ARCH_HASWELL_MACROS
#  endif
| 239 |  | 
|---|
// x86-64 sub-architecture version 4 (x86-64-v4)
//
// Similar to the above, x86-64-v4 matches the AVX512 variant of the Intel
// Core 6th generation (codename "Skylake"). AMD Zen4 is their first processor
// with AVX512 support and it includes all of these too. The GNU libc subdir
// for this is "glibc-hwcaps/x86-64-v4".
//
// Five macros are summed: a total of 0 means AVX512 is not in use, a total of
// 5 means the full set is enabled, and anything else is a configuration error.
#  define ARCH_SKX_MACROS \
        (__AVX512F__ + __AVX512BW__ + __AVX512CD__ + __AVX512DQ__ + __AVX512VL__)
#  if ARCH_SKX_MACROS != 0
#    if ARCH_SKX_MACROS != 5
#      error "Please enable all x86-64-v4 extensions; you probably want to use -march=skylake-avx512 or -march=x86-64-v4 instead of -mavx512f"
#    endif
// Outside of #if, a missing macro is an undeclared identifier, so the
// compiler's diagnostic names the feature that is absent.
static_assert(ARCH_SKX_MACROS, "Undeclared identifiers indicate which features are missing.");
#    define __skylake_avx512__ 1
#  endif
#  undef ARCH_SKX_MACROS
| 256 | #endif  /* Q_PROCESSOR_X86 */ | 
|---|
| 257 |  | 
|---|
| 258 | // NEON intrinsics | 
|---|
| 259 | // note: as of GCC 4.9, does not support function targets for ARM | 
|---|
| 260 | #if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64) | 
|---|
// Target string handed to QT_FUNCTION_TARGET(NEON); the spelling differs
// between Clang and other compilers.
#ifdef Q_CC_CLANG
#  define QT_FUNCTION_TARGET_STRING_NEON "neon"
#else
// unused: gcc doesn't support function targets on non-aarch64, and on Aarch64
// NEON is always available.
#  define QT_FUNCTION_TARGET_STRING_NEON "+neon"
#endif

// __ARM_NEON__ is not defined on AArch64, but we need it in our NEON
// detection.
#if !defined(__ARM_NEON__)
#  define __ARM_NEON__
#endif
| 270 |  | 
|---|
| 271 | #ifndef Q_PROCESSOR_ARM_64 // vaddv is only available on Aarch64 | 
|---|
| 272 | inline uint16_t vaddvq_u16(uint16x8_t v8) | 
|---|
| 273 | { | 
|---|
| 274 | const uint64x2_t v2 = vpaddlq_u32(vpaddlq_u16(v8)); | 
|---|
| 275 | const uint64x1_t v1 = vadd_u64(vget_low_u64(v2), vget_high_u64(v2)); | 
|---|
| 276 | return vget_lane_u16(vreinterpret_u16_u64(v1), 0); | 
|---|
| 277 | } | 
|---|
| 278 |  | 
|---|
| 279 | inline uint8_t vaddv_u8(uint8x8_t v8) | 
|---|
| 280 | { | 
|---|
| 281 | const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8))); | 
|---|
| 282 | return vget_lane_u8(vreinterpret_u8_u64(v1), 0); | 
|---|
| 283 | } | 
|---|
| 284 | #endif | 
|---|
| 285 |  | 
|---|
| 286 | // Missing NEON intrinsics, needed due different type definitions: | 
|---|
| 287 | inline uint16x8_t qvsetq_n_u16(uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4, | 
|---|
| 288 | uint16_t v5, uint16_t v6, uint16_t v7, uint16_t v8) { | 
|---|
| 289 | #if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG) | 
|---|
| 290 | using u64 = uint64_t; | 
|---|
| 291 | const uint16x8_t vmask = { | 
|---|
| 292 | v1 | (v2 << 16) | (u64(v3) << 32) | (u64(v4) << 48), | 
|---|
| 293 | v5 | (v6 << 16) | (u64(v7) << 32) | (u64(v8) << 48) | 
|---|
| 294 | }; | 
|---|
| 295 | #else | 
|---|
| 296 | const uint16x8_t vmask = { v1, v2, v3, v4, v5, v6, v7, v8 }; | 
|---|
| 297 | #endif | 
|---|
| 298 | return vmask; | 
|---|
| 299 | } | 
|---|
| 300 | inline uint8x8_t qvset_n_u8(uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, | 
|---|
| 301 | uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8) { | 
|---|
| 302 | #if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG) | 
|---|
| 303 | using u64 = uint64_t; | 
|---|
| 304 | const uint8x8_t vmask = { | 
|---|
| 305 | v1 | (v2 << 8) | (v3 << 16) | (v4 << 24) | | 
|---|
| 306 | (u64(v5) << 32) | (u64(v6) << 40) | (u64(v7) << 48) | (u64(v8) << 56) | 
|---|
| 307 | }; | 
|---|
| 308 | #else | 
|---|
| 309 | const uint8x8_t vmask = { v1, v2, v3, v4, v5, v6, v7, v8 }; | 
|---|
| 310 | #endif | 
|---|
| 311 | return vmask; | 
|---|
| 312 | } | 
|---|
| 313 | inline uint8x16_t qvsetq_n_u8(uint8_t v1,  uint8_t v2,  uint8_t v3,  uint8_t v4, | 
|---|
| 314 | uint8_t v5,  uint8_t v6,  uint8_t v7,  uint8_t v8, | 
|---|
| 315 | uint8_t v9,  uint8_t v10, uint8_t v11, uint8_t v12, | 
|---|
| 316 | uint8_t v13, uint8_t v14, uint8_t v15, uint8_t v16) { | 
|---|
| 317 | #if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG) | 
|---|
| 318 | using u64 = uint64_t; | 
|---|
| 319 | const uint8x16_t vmask = { | 
|---|
| 320 | v1 | (v2 << 8) | (v3 << 16) | (v4 << 24) | | 
|---|
| 321 | (u64(v5) << 32) | (u64(v6) << 40) | (u64(v7) << 48) | (u64(v8) << 56), | 
|---|
| 322 | v9 | (v10 << 8) | (v11 << 16) | (v12 << 24) | | 
|---|
| 323 | (u64(v13) << 32) | (u64(v14) << 40) | (u64(v15) << 48) | (u64(v16) << 56) | 
|---|
| 324 | }; | 
|---|
| 325 | #else | 
|---|
| 326 | const uint8x16_t vmask = { v1, v2,  v3,  v4,  v5,  v6,  v7,  v8, | 
|---|
| 327 | v9, v10, v11, v12, v13, v14, v15, v16}; | 
|---|
| 328 | #endif | 
|---|
| 329 | return vmask; | 
|---|
| 330 | } | 
|---|
| 331 | inline uint32x4_t qvsetq_n_u32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) | 
|---|
| 332 | { | 
|---|
| 333 | #if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG) | 
|---|
| 334 | return uint32x4_t{ (uint64_t(b) << 32) | a, (uint64_t(d) << 32) | c }; | 
|---|
| 335 | #else | 
|---|
| 336 | return uint32x4_t{ a, b, c, d }; | 
|---|
| 337 | #endif | 
|---|
| 338 | } | 
|---|
| 339 | #endif | 
|---|
| 340 |  | 
|---|
// When _M_ARM64 is defined and __ARM_ARCH is at least 800, define the
// feature macros for the Crypto and CRC32 extensions ourselves.
#if defined(_M_ARM64) && __ARM_ARCH >= 800
#  define __ARM_FEATURE_CRYPTO 1
#  define __ARM_FEATURE_CRC32 1
#endif
| 345 |  | 
|---|
// Target strings handed to QT_FUNCTION_TARGET(AES) and
// QT_FUNCTION_TARGET(CRC32) on ARM. The spelling depends on both the
// architecture (64- vs 32-bit) and the compiler; Clang is tested before
// Q_CC_GNU in each chain.
#if defined(Q_PROCESSOR_ARM_64)
#  if defined(Q_CC_CLANG)
#    define QT_FUNCTION_TARGET_STRING_AES        "crypto"
#    define QT_FUNCTION_TARGET_STRING_CRC32      "crc"
#  elif defined(Q_CC_GNU)
#    define QT_FUNCTION_TARGET_STRING_AES        "+crypto"
#    define QT_FUNCTION_TARGET_STRING_CRC32      "+crc"
#  elif defined(Q_CC_MSVC)
// Defined to nothing for MSVC.
#    define QT_FUNCTION_TARGET_STRING_AES
#    define QT_FUNCTION_TARGET_STRING_CRC32
#  endif
#elif defined(Q_PROCESSOR_ARM_32)
#  if defined(Q_CC_CLANG)
#    define QT_FUNCTION_TARGET_STRING_AES        "armv8-a,crypto"
#    define QT_FUNCTION_TARGET_STRING_CRC32      "armv8-a,crc"
#  elif defined(Q_CC_GNU)
#    define QT_FUNCTION_TARGET_STRING_AES        "arch=armv8-a+crypto"
#    define QT_FUNCTION_TARGET_STRING_CRC32      "arch=armv8-a+crc"
#  endif
#endif
| 366 |  | 
|---|
#ifndef Q_PROCESSOR_X86
// Feature bits for the non-x86 architectures; the names are what
// qCpuHasFeature(XXX) pastes after "CpuFeature".
enum CPUFeatures {
#if defined(Q_PROCESSOR_ARM)
    CpuFeatureNEON          = 2,
    CpuFeatureARM_NEON      = CpuFeatureNEON,
    CpuFeatureCRC32         = 4,
    CpuFeatureAES           = 8,
    CpuFeatureARM_CRYPTO    = CpuFeatureAES,
#elif defined(Q_PROCESSOR_MIPS)
    CpuFeatureDSP           = 2,
    CpuFeatureDSPR2         = 4,
#endif
};

// Features the compiler is already generating code for, assembled from its
// predefined macros.
static const uint64_t qCompilerCpuFeatures = 0
#ifdef __ARM_NEON__
        | CpuFeatureNEON
#endif
#if !(defined(Q_OS_LINUX) && defined(Q_PROCESSOR_ARM_64))
        // Yocto Project recipes enable Crypto extension for all ARMv8 configs,
        // even for targets without the Crypto extension. That's wrong, but as
        // the compiler never generates the code for them on their own, most
        // code never notices the problem. But we would. By not setting the
        // bits here, we force a runtime detection.
#  ifdef __ARM_FEATURE_CRC32
        | CpuFeatureCRC32
#  endif
#  ifdef __ARM_FEATURE_CRYPTO
        | CpuFeatureAES
#  endif
#endif // !(Q_OS_LINUX && Q_PROCESSOR_ARM_64)
#ifdef __mips_dsp
        | CpuFeatureDSP
#endif
#ifdef __mips_dspr2
        | CpuFeatureDSPR2
#endif
        ;
#endif // !Q_PROCESSOR_X86
| 406 |  | 
|---|
| 407 | #ifdef __cplusplus | 
|---|
| 408 | #  include <atomic> | 
|---|
| 409 | #  define Q_ATOMIC(T)   std::atomic<T> | 
|---|
| 410 | QT_BEGIN_NAMESPACE | 
|---|
| 411 | using std::atomic_load_explicit; | 
|---|
| 412 | static constexpr auto memory_order_relaxed = std::memory_order_relaxed; | 
|---|
| 413 | extern "C"{ | 
|---|
| 414 | #else | 
|---|
| 415 | #  include <stdatomic.h> | 
|---|
| 416 | #  define Q_ATOMIC(T)   _Atomic(T) | 
|---|
| 417 | #endif | 
|---|
| 418 |  | 
|---|
| 419 | #ifdef Q_PROCESSOR_X86 | 
|---|
| 420 | typedef uint64_t QCpuFeatureType; | 
|---|
| 421 | static const QCpuFeatureType qCompilerCpuFeatures = _compilerCpuFeatures; | 
|---|
| 422 | static const QCpuFeatureType CpuFeatureArchHaswell = cpu_haswell; | 
|---|
| 423 | static const QCpuFeatureType CpuFeatureArchSkylakeAvx512 = cpu_skylake_avx512; | 
|---|
| 424 | #else | 
|---|
| 425 | typedef unsigned QCpuFeatureType; | 
|---|
| 426 | #endif | 
|---|
| 427 | extern Q_CORE_EXPORT Q_ATOMIC(QCpuFeatureType) QT_MANGLE_NAMESPACE(qt_cpu_features)[1]; | 
|---|
| 428 | Q_CORE_EXPORT uint64_t QT_MANGLE_NAMESPACE(qDetectCpuFeatures)(); | 
|---|
| 429 |  | 
|---|
| 430 | static inline uint64_t qCpuFeatures() | 
|---|
| 431 | { | 
|---|
| 432 | #ifdef QT_BOOTSTRAPPED | 
|---|
| 433 | return qCompilerCpuFeatures;    // no detection | 
|---|
| 434 | #else | 
|---|
| 435 | quint64 features = atomic_load_explicit(QT_MANGLE_NAMESPACE(qt_cpu_features), m: memory_order_relaxed); | 
|---|
| 436 | if (!QT_SUPPORTS_INIT_PRIORITY) { | 
|---|
| 437 | if (Q_UNLIKELY(features == 0)) | 
|---|
| 438 | features = QT_MANGLE_NAMESPACE(qDetectCpuFeatures)(); | 
|---|
| 439 | } | 
|---|
| 440 | return features; | 
|---|
| 441 | #endif | 
|---|
| 442 | } | 
|---|
| 443 |  | 
|---|
// True when every bit of CpuFeature<feature> is set either in the
// compile-time feature set or in the runtime one. The compile-time test comes
// first, so qCpuFeatures() is not called at all when the compiler already
// targets the feature.
#define qCpuHasFeature(feature) \
    (((qCompilerCpuFeatures & CpuFeature##feature) == CpuFeature##feature) \
     || ((qCpuFeatures() & CpuFeature##feature) == CpuFeature##feature))
| 446 |  | 
|---|
| 447 | #ifdef __cplusplus | 
|---|
| 448 | } // extern "C" | 
|---|
| 449 |  | 
|---|
| 450 | #  if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND) && !defined(QT_BOOTSTRAPPED) | 
|---|
| 451 | Q_CORE_EXPORT qsizetype qRandomCpu(void *, qsizetype) noexcept; | 
|---|
| 452 |  | 
|---|
| 453 | static inline bool qHasHwrng() | 
|---|
| 454 | { | 
|---|
| 455 | return qCpuHasFeature(RDRND); | 
|---|
| 456 | } | 
|---|
| 457 | #  else | 
|---|
| 458 | static inline qsizetype qRandomCpu(void *, qsizetype) noexcept | 
|---|
| 459 | { | 
|---|
| 460 | return 0; | 
|---|
| 461 | } | 
|---|
| 462 | static inline bool qHasHwrng() | 
|---|
| 463 | { | 
|---|
| 464 | return false; | 
|---|
| 465 | } | 
|---|
| 466 | #  endif | 
|---|
| 467 |  | 
|---|
| 468 | QT_END_NAMESPACE | 
|---|
| 469 |  | 
|---|
| 470 | #endif // __cplusplus | 
|---|
| 471 |  | 
|---|
| 472 | QT_WARNING_POP | 
|---|
| 473 |  | 
|---|
| 474 | #endif // QSIMD_P_H | 
|---|
| 475 |  | 
|---|