Compare commits

...

4 Commits

Author          SHA1        Message                                 Date
Jarred Sumner   03d0b55d29  Merge branch 'main' into jarred/cpu-id  2025-04-14 19:23:39 -07:00
Jarred Sumner   8bc664ea87  Update CPUFeatures.cpp ("Try this")     2025-04-14 11:56:41 -07:00
Jarred Sumner   cddb016a2f  Update CPUFeatures.cpp                  2025-04-13 19:15:19 -07:00
Jarred Sumner   1b23c275b7  Make the simd detection work better     2025-04-13 19:12:13 -07:00


@@ -18,48 +18,140 @@ enum class AArch64CPUFeature : uint8_t {
};
#if CPU(X86_64)
#include <stdint.h>
#if OS(WINDOWS)
#include <intrin.h> // needed for the __cpuidex and _xgetbv intrinsics used below
#endif
namespace cpuid_bit {
// Can be found on Intel ISA Reference for CPUID
// This is copypasta from SIMDUTF, mostly.
// EAX = 0x01
constexpr uint32_t pclmulqdq = uint32_t(1)
<< 1; ///< @private bit 1 of ECX for EAX=0x1
constexpr uint32_t sse42 = uint32_t(1)
<< 20; ///< @private bit 20 of ECX for EAX=0x1
constexpr uint32_t popcnt = uint32_t(1) << 23; // POPCNT is bit 23 in ECX
constexpr uint32_t avx = uint32_t(1) << 28; // AVX is bit 28 in ECX
constexpr uint32_t osxsave = (uint32_t(1) << 26) | (uint32_t(1) << 27); ///< @private bits 26 (XSAVE) + 27 (OSXSAVE) of ECX for EAX=0x1
// EAX = 0x07 (Structured Extended Feature Flags), ECX = 0x00 (Sub-leaf)
// See: "Table 3-8. Information Returned by CPUID Instruction"
namespace ebx {
constexpr uint32_t bmi1 = uint32_t(1) << 3;
constexpr uint32_t avx2 = uint32_t(1) << 5;
constexpr uint32_t bmi2 = uint32_t(1) << 8;
constexpr uint32_t avx512f = uint32_t(1) << 16;
constexpr uint32_t avx512dq = uint32_t(1) << 17;
constexpr uint32_t avx512ifma = uint32_t(1) << 21;
constexpr uint32_t avx512cd = uint32_t(1) << 28;
constexpr uint32_t avx512bw = uint32_t(1) << 30;
constexpr uint32_t avx512vl = uint32_t(1) << 31;
} // namespace ebx
namespace ecx {
constexpr uint32_t avx512vbmi = uint32_t(1) << 1;
constexpr uint32_t avx512vbmi2 = uint32_t(1) << 6;
constexpr uint32_t avx512vnni = uint32_t(1) << 11;
constexpr uint32_t avx512bitalg = uint32_t(1) << 12;
constexpr uint32_t avx512vpopcnt = uint32_t(1) << 14;
} // namespace ecx
namespace edx {
constexpr uint32_t avx512vp2intersect = uint32_t(1) << 8;
}
namespace xcr0_bit {
constexpr uint64_t avx256_saved = uint64_t(1) << 2; ///< @private bit 2 = AVX
constexpr uint64_t avx512_saved = uint64_t(7) << 5; ///< @private bits 5,6,7 = opmask, ZMM_hi256, hi16_ZMM
} // namespace xcr0_bit
} // namespace cpuid_bit
static inline void cpuid(uint32_t* eax, uint32_t* ebx, uint32_t* ecx,
uint32_t* edx)
{
#if defined(_MSC_VER)
int cpu_info[4];
__cpuidex(cpu_info, *eax, *ecx);
*eax = cpu_info[0];
*ebx = cpu_info[1];
*ecx = cpu_info[2];
*edx = cpu_info[3];
#elif defined(HAVE_GCC_GET_CPUID) && defined(USE_GCC_GET_CPUID)
uint32_t level = *eax;
__get_cpuid(level, eax, ebx, ecx, edx);
#else
uint32_t a = *eax, b, c = *ecx, d;
asm volatile("cpuid\n\t" : "+a"(a), "=b"(b), "+c"(c), "=d"(d));
*eax = a;
*ebx = b;
*ecx = c;
*edx = d;
#endif
}
// Read the extended control register XCR0
static inline uint64_t xgetbv(uint32_t xcr)
{
#if defined(_MSC_VER)
return _xgetbv(xcr);
#else
uint32_t eax, edx;
asm volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr));
return ((uint64_t)edx << 32) | eax;
#endif
}
static uint8_t x86_cpu_features()
{
uint8_t features = 0;
// Use CPUID for robust CPU feature detection
uint32_t eax, ebx, ecx, edx;
// Check for SSE4.2 and POPCNT (CPUID leaf 1)
eax = 1;
ecx = 0;
/**
* Use cpuid because the Windows API for this is a big liar. Our CI machines on
* AWS report no AVX2 when they absolutely do support it.
*/
cpuid(&eax, &ebx, &ecx, &edx);
if (ecx & cpuid_bit::sse42)
features |= 1 << static_cast<uint8_t>(X86CPUFeature::sse42);
if (ecx & cpuid_bit::popcnt)
features |= 1 << static_cast<uint8_t>(X86CPUFeature::popcnt);
// Check for AVX following Intel's recommended detection steps:
// 1. Check if OSXSAVE is supported (CPUID.1:ECX.OSXSAVE[bit 27] = 1)
// 2. Check if OS has enabled XMM and YMM state support (XCR0[2:1] = '11b')
// 3. Check if CPU supports AVX instructions (CPUID.1:ECX.AVX[bit 28] = 1)
bool osxsave_supported = (ecx & cpuid_bit::osxsave) == cpuid_bit::osxsave;
bool avx_supported = (ecx & cpuid_bit::avx) == cpuid_bit::avx;
if (osxsave_supported && avx_supported) {
// Check if OS has enabled XMM and YMM state support
uint64_t xcr0 = xgetbv(0);
bool avx_enabled_by_os = (xcr0 & cpuid_bit::xcr0_bit::avx256_saved) == cpuid_bit::xcr0_bit::avx256_saved;
if (avx_enabled_by_os) {
features |= 1 << static_cast<uint8_t>(X86CPUFeature::avx);
// Check for AVX2 and AVX512 (CPUID leaf 7)
eax = 7;
ecx = 0;
cpuid(&eax, &ebx, &ecx, &edx);
if (ebx & cpuid_bit::ebx::avx2)
features |= 1 << static_cast<uint8_t>(X86CPUFeature::avx2);
// For AVX-512, we need to check both CPU support and OS support for the state
bool avx512f_supported = (ebx & cpuid_bit::ebx::avx512f) == cpuid_bit::ebx::avx512f;
bool avx512_enabled_by_os = (xcr0 & cpuid_bit::xcr0_bit::avx512_saved) == cpuid_bit::xcr0_bit::avx512_saved;
if (avx512f_supported && avx512_enabled_by_os)
features |= 1 << static_cast<uint8_t>(X86CPUFeature::avx512);
}
} else {
// If AVX is not supported or enabled, don't even check for AVX2 or AVX512
// as they depend on AVX support
}
return features;
}
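
For reference, the returned byte is a bitmask indexed by the X86CPUFeature enumerators used above (the enum itself is defined earlier in the file, outside this hunk). A minimal caller-side sketch of decoding it; the helper name here is hypothetical and not part of the diff:

// Hypothetical helper, for illustration only: test one bit of the mask
// produced by x86_cpu_features().
static bool hasFeature(uint8_t features, X86CPUFeature feature)
{
return (features & (1 << static_cast<uint8_t>(feature))) != 0;
}
// e.g. hasFeature(x86_cpu_features(), X86CPUFeature::avx2)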
@@ -68,48 +160,66 @@ static uint8_t x86_cpu_features()
#if CPU(ARM64)
#if OS(DARWIN)
#include <sys/sysctl.h>
#endif
static uint8_t aarch64_cpu_features()
{
uint8_t features = 0;
// On ARM64, we'll use a safer approach to avoid illegal instructions
// NEON and FP are always present in ARMv8-A
features |= 1 << static_cast<uint8_t>(AArch64CPUFeature::neon);
features |= 1 << static_cast<uint8_t>(AArch64CPUFeature::fp);
#if OS(DARWIN)
// On macOS/iOS, use sysctlbyname to detect CPU features
int value = 0;
size_t size = sizeof(value);
if (sysctlbyname("hw.optional.AdvSIMD", &value, &size, NULL, 0) == 0 && value == 1)
features |= 1 << static_cast<uint8_t>(AArch64CPUFeature::neon);
if (sysctlbyname("hw.optional.floatingpoint", &value, &size, NULL, 0) == 0 && value == 1)
features |= 1 << static_cast<uint8_t>(AArch64CPUFeature::fp);
if (sysctlbyname("hw.optional.arm.FEAT_AES", &value, &size, NULL, 0) == 0 && value == 1)
// Check for AES
if (sysctlbyname("hw.optional.arm.FEAT_AES", &value, &size, nullptr, 0) == 0 && value == 1)
features |= 1 << static_cast<uint8_t>(AArch64CPUFeature::aes);
if (sysctlbyname("hw.optional.armv8_crc32", &value, &size, NULL, 0) == 0 && value == 1)
// Check for CRC32
if (sysctlbyname("hw.optional.arm.FEAT_CRC32", &value, &size, nullptr, 0) == 0 && value == 1)
features |= 1 << static_cast<uint8_t>(AArch64CPUFeature::crc32);
if (sysctlbyname("hw.optional.arm.FEAT_LSE", &value, &size, NULL, 0) == 0 && value == 1)
// Check for LSE/Atomics
if (sysctlbyname("hw.optional.arm.FEAT_LSE", &value, &size, nullptr, 0) == 0 && value == 1)
features |= 1 << static_cast<uint8_t>(AArch64CPUFeature::atomics);
if (sysctlbyname("hw.optional.arm.FEAT_SVE", &value, &size, NULL, 0) == 0 && value == 1)
// Check for SVE
if (sysctlbyname("hw.optional.arm.FEAT_SVE", &value, &size, nullptr, 0) == 0 && value == 1)
features |= 1 << static_cast<uint8_t>(AArch64CPUFeature::sve);
#else
// For non-Apple ARM64 platforms, we can use the system register approach,
// but we need to be careful about illegal instructions. On Linux the kernel
// traps and emulates EL0 reads of these ID registers, so this does not fault.
uint64_t id_aa64isar0_el1 = 0;
uint64_t id_aa64isar1_el1 = 0;
uint64_t id_aa64pfr0_el1 = 0;
// Use inline assembly with constraints to safely read system registers
asm volatile("mrs %0, id_aa64isar0_el1" : "=r"(id_aa64isar0_el1));
asm volatile("mrs %0, id_aa64isar1_el1" : "=r"(id_aa64isar1_el1));
asm volatile("mrs %0, id_aa64pfr0_el1" : "=r"(id_aa64pfr0_el1));
// Check for AES (bits 7:4 of ID_AA64ISAR0_EL1)
if (((id_aa64isar0_el1 >> 4) & 0xf) > 0)
features |= 1 << static_cast<uint8_t>(AArch64CPUFeature::aes);
// Check for CRC32 (bits 19:16 of ID_AA64ISAR0_EL1)
if (((id_aa64isar0_el1 >> 16) & 0xf) > 0)
features |= 1 << static_cast<uint8_t>(AArch64CPUFeature::crc32);
// Check for LSE/Atomics (bits 23:20 of ID_AA64ISAR0_EL1)
if (((id_aa64isar0_el1 >> 20) & 0xf) > 0)
features |= 1 << static_cast<uint8_t>(AArch64CPUFeature::atomics);
// Check for SVE (bits 35:32 of ID_AA64PFR0_EL1)
if (((id_aa64pfr0_el1 >> 32) & 0xf) > 0)
features |= 1 << static_cast<uint8_t>(AArch64CPUFeature::sve);
#endif
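
Each ID-register check above extracts a 4-bit field; for the fields checked here, any non-zero value means the feature is present. The same pattern as a standalone helper, purely for illustration (the function name is hypothetical, not part of the diff):

// Hypothetical helper showing the 4-bit ID-register field reads above:
// each feature occupies a 4-bit field, and non-zero means implemented.
static inline bool idRegFieldNonZero(uint64_t reg, unsigned lowBit)
{
return ((reg >> lowBit) & 0xf) != 0;
}
// e.g. idRegFieldNonZero(id_aa64isar0_el1, 16) // CRC32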