From ad52afd27433d50b932708276fe4a3afbe7481cc Mon Sep 17 00:00:00 2001 From: anthonycanino Date: Wed, 8 Feb 2023 15:10:39 -0800 Subject: [PATCH] Enable AVX512 Additional 16 SIMD Registers (#79544) * Change regMask_enum and regMaskTP to unsigned __int64_t on AMD64. This allows for more registers to be encoded in the register allocator. * Add upper 16 SIMD registers to allocator. Commit includes refactoring code to use `const instrDesc *` instead of `instruction` so information about when EVEX is needed (due to high SIMD registers) is available to the emitter. * Limit high SIMD reg to compatible intrinsics lsra build. * Limit high SIMD reg to compatible intrinsics lsra build. * Limit high SIMD reg to compatible intrinsics and gentree nodes. Commit constrains certain hw intrinsics and gentree nodes to use lower SIMD registers even if upper SIMD registers are available due to limitations of EVEX encoding for certain instructions. For example, SSE `Reciprocal` lowers to `rcpps` which does not have an EVEX encoding form, hence, we cannot allow that hw intrinsic node to use a high SIMD register. These intrinsics are marked with `HW_Flag_NoEvexSemantics`. Other such instructions related to masking (typically marked with `HW_Flag_ReturnsPerElementMask`) also have similar issues (though they can be replaced with the EVEX k registers and associated masking when implemented). In addition, the callee/caller save registers have also been adjusted to properly handle the presence and absence of AVX512 upper simd registers at runtime. * Fix for X86 throughput. * Add upper simd stress test to the AVX512 testing pipeline. * Formatting. * Fix wrong-sized attr for simd mov instruction. * Fix non-AMD64 LSRA stress mask. * Update src/coreclr/jit/compiler.h Co-authored-by: Bruce Forstall * Update src/coreclr/jit/compiler.cpp Co-authored-by: Bruce Forstall * Update src/coreclr/jit/gentree.cpp Co-authored-by: Bruce Forstall * Update src/coreclr/jit/hwintrinsic.h Co-authored-by: Bruce Forstall * Update src/coreclr/jit/target.h Co-authored-by: Bruce Forstall * Update src/coreclr/jit/emitxarch.cpp Co-authored-by: Bruce Forstall * Remove unneeded vars * Address PR comments. * Allow `emitinl.h` access to the `rbm` variables. * Replace RBM_LOWSIMD with `BuildEvexIncompatibleMask`. * Move AVX512 dependent `targetamd.h` vars into compiler object. * Fixing some edge cases for `targetamd.h` variables. * Fix a merge/rebase bug. * Update src/coreclr/jit/compiler.h Co-authored-by: Bruce Forstall * Update src/coreclr/jit/lsra.cpp Co-authored-by: Bruce Forstall * Update src/coreclr/jit/compiler.h Co-authored-by: Bruce Forstall * Fix nits. * Trying VM changes. * VM hack. * VM hack. * Revert "VM hack." This reverts commit 91cf3db9115e94ca1d759045fe4da6a496228cfc. * Adjust ACTUAL_REG_COUNT based on availability of AVX512. * Use inline accessor functions instead of macros Convert from macros to accessor functions for RBM_ALLFLOAT, RBM_FLT_CALLEE_TRASH, CNT_CALLEE_TRASH_FLOAT. Convert LSRA use of ACTUAL_REG_COUNT to AVAILABLE_REG_COUNT, and create an accessor for that value for AMD64 as well. * Clarifying comments.
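Illustrative aside (a minimal sketch, not code from this change): widening regMaskTP to 64 bits is what makes room for xmm16-xmm31 as ordinary mask bits that the allocator can include or exclude at runtime. The names below follow the RBM_*/CNT_* conventions of targetamd64.h, but the bit positions and values are assumptions for illustration only:

    // Hypothetical layout of the widened mask (illustrative only): if the 16
    // integer registers occupy bits 0-15 and xmm0-xmm15 occupy bits 16-31,
    // the high SIMD registers land in bits 32-47 of the 64-bit mask.
    typedef unsigned __int64 regMaskTP;
    const regMaskTP RBM_XMM16     = 1ULL << 32;               // first high SIMD register
    const regMaskTP RBM_HIGHFLOAT = ((1ULL << 16) - 1) << 32; // xmm16..xmm31
    const unsigned  CNT_HIGHFLOAT = 16;
    // Mirroring compInitOptions in this diff: the dynamic masks gain the high
    // registers only when EVEX encoding is in use, e.g. rbmAllFloat |= RBM_HIGHFLOAT.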
--------- Co-authored-by: Bruce Forstall Co-authored-by: Kunal Pathak --- .../templates/runtimes/run-test-job.yml | 1 + src/coreclr/jit/codegen.h | 11 + src/coreclr/jit/codegenxarch.cpp | 4 +- src/coreclr/jit/compiler.cpp | 49 ++ src/coreclr/jit/compiler.h | 44 ++ src/coreclr/jit/emit.cpp | 23 +- src/coreclr/jit/emit.h | 27 + src/coreclr/jit/emitinl.h | 3 - src/coreclr/jit/emitxarch.cpp | 598 +++++++++++------- src/coreclr/jit/emitxarch.h | 63 +- src/coreclr/jit/gentree.cpp | 21 + src/coreclr/jit/gentree.h | 6 + src/coreclr/jit/hwintrinsic.h | 22 +- src/coreclr/jit/hwintrinsiclistxarch.h | 302 ++++----- src/coreclr/jit/instrsxarch.h | 20 +- src/coreclr/jit/lsra.cpp | 94 +-- src/coreclr/jit/lsra.h | 78 ++- src/coreclr/jit/lsrabuild.cpp | 13 +- src/coreclr/jit/lsraxarch.cpp | 127 +++- src/coreclr/jit/optimizer.cpp | 4 +- src/coreclr/jit/register.h | 22 +- src/coreclr/jit/target.h | 20 +- src/coreclr/jit/targetamd64.h | 52 +- src/coreclr/jit/utils.h | 10 + src/coreclr/vm/threadsuspend.cpp | 8 +- src/tests/Common/testenvironment.proj | 1 + 26 files changed, 1098 insertions(+), 525 deletions(-) diff --git a/eng/pipelines/common/templates/runtimes/run-test-job.yml b/eng/pipelines/common/templates/runtimes/run-test-job.yml index 6712f6c9d3c960..12454418098e1c 100644 --- a/eng/pipelines/common/templates/runtimes/run-test-job.yml +++ b/eng/pipelines/common/templates/runtimes/run-test-job.yml @@ -532,6 +532,7 @@ jobs: ${{ if in(parameters.testGroup, 'jitstress-isas-avx512') }}: scenarios: - jitstress_isas_avx512_forceevex + - jitstress_isas_avx512_forceevex_stresshighregs ${{ if in(parameters.testGroup, 'jitstressregs-x86') }}: scenarios: - jitstressregs1_x86_noavx diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index e24b7b54a5b565..90154ef3b653ff 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -35,6 +35,17 @@ class CodeGen final : public CodeGenInterface GenTree* addr, bool fold, bool* revPtr, GenTree** rv1Ptr, GenTree** rv2Ptr, unsigned* mulPtr, ssize_t* cnsPtr); private: +#if defined(TARGET_AMD64) + regMaskTP get_RBM_ALLFLOAT() const + { + return compiler->rbmAllFloat; + } + regMaskTP get_RBM_FLT_CALLEE_TRASH() const + { + return compiler->rbmFltCalleeTrash; + } +#endif // TARGET_AMD64 + #if defined(TARGET_XARCH) // Bit masks used in negating a float or double number. // This is to avoid creating more than one data constant for these bitmasks when a diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index c4a839fbb4044d..a40fda3b2bc4e0 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3535,7 +3535,7 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode) // this probably needs to be changed. // Load - genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmTmpReg, src, offset); + genCodeForLoadOffset(INS_movdqu, EA_16BYTE, xmmTmpReg, src, offset); // Store genStoreRegToStackArg(TYP_STRUCT, xmmTmpReg, offset); @@ -8358,7 +8358,7 @@ void CodeGen::genStoreRegToStackArg(var_types type, regNumber srcReg, int offset { ins = INS_movdqu; // This should be changed! 
- attr = EA_8BYTE; + attr = EA_16BYTE; size = 16; } else diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index facac4746bfb94..6a042eb8811114 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -3329,6 +3329,24 @@ void Compiler::compInitOptions(JitFlags* jitFlags) opts.compJitSaveFpLrWithCalleeSavedRegisters = JitConfig.JitSaveFpLrWithCalleeSavedRegisters(); } #endif // defined(DEBUG) && defined(TARGET_ARM64) + +#if defined(TARGET_AMD64) + rbmAllFloat = RBM_ALLFLOAT_INIT; + rbmFltCalleeTrash = RBM_FLT_CALLEE_TRASH_INIT; + cntCalleeTrashFloat = CNT_CALLEE_TRASH_FLOAT_INIT; + availableRegCount = ACTUAL_REG_COUNT; + + if (DoJitStressEvexEncoding()) + { + rbmAllFloat |= RBM_HIGHFLOAT; + rbmFltCalleeTrash |= RBM_HIGHFLOAT; + cntCalleeTrashFloat += CNT_CALLEE_TRASH_HIGHFLOAT; + } + else + { + availableRegCount -= CNT_HIGHFLOAT; + } +#endif // TARGET_AMD64 } #ifdef DEBUG @@ -3532,6 +3550,37 @@ bool Compiler::compPromoteFewerStructs(unsigned lclNum) return rejectThisPromo; } +//------------------------------------------------------------------------ +// dumpRegMask: display a register mask. For well-known sets of registers, display a well-known token instead of +// a potentially large number of registers. +// +// Arguments: +// regs - The set of registers to display +// +void Compiler::dumpRegMask(regMaskTP regs) const +{ + if (regs == RBM_ALLINT) + { + printf("[allInt]"); + } + else if (regs == (RBM_ALLINT & ~RBM_FPBASE)) + { + printf("[allIntButFP]"); + } + else if (regs == RBM_ALLFLOAT) + { + printf("[allFloat]"); + } + else if (regs == RBM_ALLDOUBLE) + { + printf("[allDouble]"); + } + else + { + dspRegMask(regs); + } +} + #endif // DEBUG void Compiler::compInitDebuggingInfo() diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index e565c419f06ab7..55d4ae161e92e9 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -10453,6 +10453,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX bool compJitHaltMethod(); + void dumpRegMask(regMaskTP regs) const; + #endif /* @@ -10727,6 +10729,48 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX GenTree* fgMorphMultiregStructArg(CallArg* arg); bool killGCRefs(GenTree* tree); + +#if defined(TARGET_AMD64) +private: + // The following are for initializing register allocator "constants" defined in targetamd64.h + // that now depend upon runtime ISA information, e.g., the presence of AVX512F/VL, which increases + // the number of SIMD (xmm, ymm, and zmm) registers from 16 to 32. + // As only 64-bit xarch has the capability to have the additional registers, we limit the changes + // to TARGET_AMD64 only. + // + // Users of these values need to define four accessor functions: + // + // regMaskTP get_RBM_ALLFLOAT(); + // regMaskTP get_RBM_FLT_CALLEE_TRASH(); + // unsigned get_CNT_CALLEE_TRASH_FLOAT(); + // unsigned get_AVAILABLE_REG_COUNT(); + // + // which return the values of these variables. + // + // This was done to avoid polluting all `targetXXX.h` macro definitions with a compiler parameter, where only + // TARGET_AMD64 requires one. 
+ // + regMaskTP rbmAllFloat; + regMaskTP rbmFltCalleeTrash; + unsigned cntCalleeTrashFloat; + unsigned availableRegCount; + +public: + regMaskTP get_RBM_ALLFLOAT() const + { + return rbmAllFloat; + } + regMaskTP get_RBM_FLT_CALLEE_TRASH() const + { + return rbmFltCalleeTrash; + } + unsigned get_CNT_CALLEE_TRASH_FLOAT() const + { + return cntCalleeTrashFloat; + } + +#endif // TARGET_AMD64 + }; // end of class Compiler //--------------------------------------------------------------------------------------------------------------------- diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index c0de6ca69a86ca..9acf7268e08a5f 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -120,6 +120,17 @@ void emitLocation::Print(LONG compMethodID) const } #endif // DEBUG +#if defined(TARGET_AMD64) +inline regMaskTP emitter::get_RBM_FLT_CALLEE_TRASH() const +{ + return emitComp->rbmFltCalleeTrash; +} +inline unsigned emitter::get_AVAILABLE_REG_COUNT() const +{ + return emitComp->availableRegCount; +} +#endif // TARGET_AMD64 + /***************************************************************************** * * Return the name of an instruction format. @@ -3226,11 +3237,19 @@ void emitter::emitDispRegSet(regMaskTP regs) for (reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) { - if ((regs & genRegMask(reg)) == 0) + if (regs == RBM_NONE) + { + break; + } + + regMaskTP curReg = genRegMask(reg); + if ((regs & curReg) == 0) { continue; } + regs -= curReg; + if (sp) { printf(" "); @@ -3400,6 +3419,7 @@ emitter::instrDesc* emitter::emitNewInstrCallInd(int argCnt, #endif // TARGET_XARCH /* Save the live GC registers in the unused register fields */ + assert((gcrefRegs & RBM_CALLEE_TRASH) == 0); emitEncodeCallGCregs(gcrefRegs, id); return id; @@ -3472,6 +3492,7 @@ emitter::instrDesc* emitter::emitNewInstrCallDir(int argCnt, assert(!id->idIsLargeCns()); /* Save the live GC registers in the unused register fields */ + assert((gcrefRegs & RBM_CALLEE_TRASH) == 0); emitEncodeCallGCregs(gcrefRegs, id); return id; diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 783d927f6661c7..a8ff3e2cf693f0 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -1138,6 +1138,28 @@ class emitter idAddr()->_idReg4 = reg; assert(reg == idAddr()->_idReg4); } + bool idHasReg3() const + { + switch (idInsFmt()) + { + case IF_RWR_RRD_RRD: + case IF_RWR_RRD_RRD_CNS: + case IF_RWR_RRD_RRD_RRD: + return true; + default: + return false; + } + } + bool idHasReg4() const + { + switch (idInsFmt()) + { + case IF_RWR_RRD_RRD_RRD: + return true; + default: + return false; + } + } #endif // defined(TARGET_XARCH) #ifdef TARGET_ARMARCH insOpts idInsOpt() const @@ -1968,6 +1990,11 @@ class emitter CORINFO_FIELD_HANDLE emitBlkConst(const void* cnsAddr, unsigned cnsSize, unsigned cnsAlign, var_types elemType); private: +#if defined(TARGET_AMD64) + regMaskTP get_RBM_FLT_CALLEE_TRASH() const; + unsigned get_AVAILABLE_REG_COUNT() const; +#endif // TARGET_AMD64 + CORINFO_FIELD_HANDLE emitFltOrDblConst(double constValue, emitAttr attr); CORINFO_FIELD_HANDLE emitSimd8Const(simd8_t constValue); CORINFO_FIELD_HANDLE emitSimd16Const(simd16_t constValue); diff --git a/src/coreclr/jit/emitinl.h b/src/coreclr/jit/emitinl.h index 82c78299efebd3..125c1ddd0fbd3f 100644 --- a/src/coreclr/jit/emitinl.h +++ b/src/coreclr/jit/emitinl.h @@ -211,11 +211,8 @@ inline ssize_t emitter::emitGetInsAmdAny(instrDesc* id) * * Convert between a register mask and a smaller version for storage. 
*/ - /*static*/ inline void emitter::emitEncodeCallGCregs(regMaskTP regmask, instrDesc* id) { - assert((regmask & RBM_CALLEE_TRASH) == 0); - unsigned encodeMask; #ifdef TARGET_X86 diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 2158504426aafa..90c681dbc620e7 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -217,10 +217,10 @@ bool emitter::IsEvexEncodedInstruction(instruction ins) const case INS_phminposuw: case INS_mpsadbw: case INS_pclmulqdq: - case INS_aesdec: - case INS_aesdeclast: case INS_aesenc: case INS_aesenclast: + case INS_aesdec: + case INS_aesdeclast: case INS_aesimc: case INS_aeskeygenassist: case INS_vzeroupper: @@ -260,21 +260,30 @@ bool emitter::IsEvexEncodedInstruction(instruction ins) const case INS_prefetcht2: case INS_sfence: // Might need new INS_*suffix* instructions for these. - case INS_por: // INS_pord, INS_porq. - case INS_pxor: // INS_pxord, INS_pxorq - case INS_movdqa: // INS_movdqa32, INS_movdqa64. - case INS_movdqu: // INS_movdqu8, INS_movdqu16, INS_movdqu32, INS_movdqu64. - case INS_pand: // INS_pandd, INS_pandq. - case INS_pandn: // INS_pandnd, INS_pandnq. - case INS_vextractf128: // INS_vextractf32x4, INS_vextractf64x2. - case INS_vextracti128: // INS_vextracti32x4, INS_vextracti64x2. - case INS_vinsertf128: // INS_vinsertf32x4, INS_vinsertf64x2. - case INS_vinserti128: // INS_vinserti32x4, INS_vinserti64x2. case INS_vbroadcastf128: // INS_vbroadcastf32x4, INS_vbroadcastf64x2. case INS_vbroadcasti128: // INS_vbroadcasti32x4, INS_vbroadcasti64x2. - { - return false; - } + + // TODO-XARCH-AVX512 these need to be encoded with the proper individual EVEX instructions (movdqu8, + // movdqu16, etc.) + // For implementation speed, the standing instruction currently defaults to the 32-bit operand type, + // i.e., movdqu => movdqu32, etc. + // Since we are not using k registers yet, this has no impact on correctness, but it will matter once + // k registers are used (as that is the point of the "broken out" operand types of these instructions). + // case INS_movdqa: // INS_movdqa32, INS_movdqa64. + // case INS_movdqu: // INS_movdqu8, INS_movdqu16, INS_movdqu32, INS_movdqu64. + // case INS_pand: // INS_pandd, INS_pandq. + // case INS_pandn: // INS_pandnd, INS_pandnq. + // case INS_por: // INS_pord, INS_porq. + // case INS_pxor: // INS_pxord, INS_pxorq + // case INS_vextractf128: // INS_vextractf32x4, INS_vextractf64x2. + // case INS_vextracti128: // INS_vextracti32x4, INS_vextracti64x2. + // case INS_vinsertf128: // INS_vinsertf32x4, INS_vinsertf64x2. + // case INS_vinserti128: // INS_vinserti32x4, INS_vinserti64x2. + { + return false; + } default: { break; } @@ -826,9 +835,11 @@ bool emitter::Is4ByteSSEInstruction(instruction ins) const // Return Value: // true if this instruction requires a VEX or EVEX prefix. // -bool emitter::TakesSimdPrefix(instruction ins) const +bool emitter::TakesSimdPrefix(const instrDesc* id) const { - return TakesEvexPrefix(ins) || TakesVexPrefix(ins); + instruction ins = id->idIns(); + + return TakesEvexPrefix(id) || TakesVexPrefix(ins); } //------------------------------------------------------------------------ @@ -850,13 +861,23 @@ bool emitter::TakesSimdPrefix(instruction ins) const // Return Value: // true if this instruction requires an EVEX prefix.
// -bool emitter::TakesEvexPrefix(instruction ins) const +bool emitter::TakesEvexPrefix(const instrDesc* id) const { if (!emitComp->DoJitStressEvexEncoding()) { return false; } + instruction ins = id->idIns(); + + if (HasHighSIMDReg(id)) + { + assert(IsEvexEncodedInstruction(ins)); + // TODO-XARCH-AVX512 remove this check once k registers have been implemented + assert(!HasKMaskRegisterDest(ins)); + return true; + } + // TODO-XArch-AVX512: Revisit 'HasKMaskRegisterDest()' check once KMask support is added. return IsEvexEncodedInstruction(ins) && !HasKMaskRegisterDest(ins); } @@ -1124,6 +1145,50 @@ bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr) #endif //! TARGET_AMD64 } +//------------------------------------------------------------------------ +// HasHighSIMDReg: Checks if an instruction uses high SIMD registers (xmm16-xmm31) +// and will require one of the EVEX high SIMD bits (EVEX.R', EVEX.V', EVEX.X) +// +// Arguments: +// id -- instruction descriptor for encoding +// +// Return Value: +// true if instruction will require EVEX encoding for its register operands. +bool emitter::HasHighSIMDReg(const instrDesc* id) const +{ +#if defined(TARGET_AMD64) + if (IsHighSIMDReg(id->idReg1()) || IsHighSIMDReg(id->idReg2())) + return true; + + if (id->idIsSmallDsc()) + return false; + + if ((id->idHasReg3() && IsHighSIMDReg(id->idReg3())) || (id->idHasReg4() && IsHighSIMDReg(id->idReg4()))) + return true; +#endif + // X86 JIT operates in 32-bit mode and hence extended reg are not available. + return false; +} + +//------------------------------------------------------------------------ +// IsHighSIMDReg: Checks if a register is strictly an EVEX encoded high SIMD +// register (xmm16-xmm31). +// +// Arguments: +// reg -- register to check +// +// Return Value: +// true if the register is strictly an EVEX encoded high SIMD register +bool emitter::IsHighSIMDReg(regNumber reg) const +{ +#ifdef TARGET_AMD64 + return ((reg >= REG_XMM16) && (reg <= REG_XMM31)); +#else + // X86 JIT operates in 32-bit mode and hence extended reg are not available. + return false; +#endif +} + // Returns true if using this register will require a REX.* prefix. // Since XMM registers overlap with YMM registers, this routine // can also be used to know whether a YMM register if the bool IsExtendedReg(regNumber reg) { #ifdef TARGET_AMD64 - return ((reg >= REG_R8) && (reg <= REG_R15)) || ((reg >= REG_XMM8) && (reg <= REG_XMM15)); + return ((reg >= REG_R8) && (reg <= REG_R15)) || ((reg >= REG_XMM8) && (reg <= REG_XMM31)); #else // X86 JIT operates in 32-bit mode and hence extended reg are not available. return false; @@ -1143,7 +1208,7 @@ bool IsExtendedReg(regNumber reg, emitAttr attr) { #ifdef TARGET_AMD64 // Not a register, so doesn't need a prefix - if (reg > REG_XMM15) + if (reg > REG_XMM31) { return false; } @@ -1184,12 +1249,29 @@ bool IsExtendedReg(regNumber reg, emitAttr attr) bool IsXMMReg(regNumber reg) { #ifdef TARGET_AMD64 - return (reg >= REG_XMM0) && (reg <= REG_XMM15); + return (reg >= REG_XMM0) && (reg <= REG_XMM31); #else // !TARGET_AMD64 return (reg >= REG_XMM0) && (reg <= REG_XMM7); #endif // !TARGET_AMD64 } +//------------------------------------------------------------------------ +// HighAwareRegEncoding: For EVEX encoded high SIMD registers (xmm16-xmm31), +// get a register encoding for bits 0-3, where the 5th bit is encoded via +// EVEX.R', EVEX.V', or EVEX.X.
+// +// Arguments: +// reg -- register to encode +// +// Return Value: +// bits 0-3 of the register encoding +// +unsigned HighAwareRegEncoding(regNumber reg) +{ + static_assert((REG_XMM0 & 0x7) == 0, "bad XMMBASE"); + return (unsigned)(reg & 0xF); +} + // Returns bits to be encoded in instruction for the given register. unsigned RegEncoding(regNumber reg) { @@ -1200,11 +1282,13 @@ unsigned RegEncoding(regNumber reg) // Utility routines that abstract the logic of adding REX.W, REX.R, REX.X, REX.B and REX prefixes // SSE2: separate 1-byte prefix gets added before opcode. // AVX: specific bits within VEX prefix need to be set in bit-inverted form. -emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code) +emitter::code_t emitter::AddRexWPrefix(const instrDesc* id, code_t code) { + instruction ins = id->idIns(); + if (UseEvexEncoding() && IsEvexEncodedInstruction(ins)) { - if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). + if (TakesEvexPrefix(id) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). { // W-bit is available in 4-byte EVEX prefix that starts with byte 62. assert(hasEvexPrefix(code)); @@ -1234,11 +1318,13 @@ emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code) #ifdef TARGET_AMD64 -emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code) +emitter::code_t emitter::AddRexRPrefix(const instrDesc* id, code_t code) { + instruction ins = id->idIns(); + if (UseEvexEncoding() && IsEvexEncodedInstruction(ins)) { - if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). + if (TakesEvexPrefix(id) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). { // R-bit is available in 4-byte EVEX prefix that starts with byte 62. assert(hasEvexPrefix(code)); @@ -1262,11 +1348,13 @@ emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code) return code | 0x4400000000ULL; } -emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code) +emitter::code_t emitter::AddRexXPrefix(const instrDesc* id, code_t code) { + instruction ins = id->idIns(); + if (UseEvexEncoding() && IsEvexEncodedInstruction(ins)) { - if (TakesEvexPrefix(ins)) + if (TakesEvexPrefix(id)) { // X-bit is available in 4-byte EVEX prefix that starts with byte 62. assert(hasEvexPrefix(code)); @@ -1289,11 +1377,13 @@ emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code) return code | 0x4200000000ULL; } -emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code) +emitter::code_t emitter::AddRexBPrefix(const instrDesc* id, code_t code) { + instruction ins = id->idIns(); + if (UseEvexEncoding() && IsEvexEncodedInstruction(ins)) { - if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). + if (TakesEvexPrefix(id) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck(). { // B-bit is available in 4-byte EVEX prefix that starts with byte 62. assert(hasEvexPrefix(code)); @@ -1325,6 +1415,46 @@ emitter::code_t emitter::AddRexPrefix(instruction ins, code_t code) return code | 0x4000000000ULL; } +//------------------------------------------------------------------------ +// AddEvexVPrimePrefix: Add the EVEX.V' bit to the EVEX prefix. EVEX.V' +// is encoded in inverted form. +// +// Arguments: +// code -- opcode to modify +// +// Return Value: +// code with EVEX.V' set in inverted form.
+// +emitter::code_t emitter::AddEvexVPrimePrefix(code_t code) +{ +#if defined(TARGET_AMD64) + assert(UseEvexEncoding() && hasEvexPrefix(code)); + return emitter::code_t(code & 0xFFFFFFF7FFFFFFFFULL); +#else + unreached(); +#endif +} + +//------------------------------------------------------------------------ +// AddEvexRPrimePrefix: Add the EVEX.R' bit to the EVEX prefix. EVEX.R' +// is encoded in inverted form. +// +// Arguments: +// code -- opcode to modify +// +// Return Value: +// code with EVEX.R' set in inverted form. +// +emitter::code_t emitter::AddEvexRPrimePrefix(code_t code) +{ +#if defined(TARGET_AMD64) + assert(UseEvexEncoding() && hasEvexPrefix(code)); + return emitter::code_t(code & 0xFFEFFFFFFFFFFFFFULL); +#else + unreached(); +#endif +} + #endif // TARGET_AMD64 bool isPrefix(BYTE b) @@ -1865,7 +1995,7 @@ unsigned emitter::emitGetAdjustedSize(instrDesc* id, code_t code) const // IsEvexEncodedInstruction(ins) is `true` for AVX/SSE instructions also, which need to be VEX encoded unless // explicitly // asked for EVEX. - if (IsEvexEncodedInstruction(ins) && TakesEvexPrefix(ins)) + if (IsEvexEncodedInstruction(ins) && TakesEvexPrefix(id)) { // EVEX prefix encodes some bytes of the opcode and as a result, overall size of the instruction reduces. // Therefore, to estimate the size adding EVEX prefix size and size of instruction opcode bytes will always @@ -2639,10 +2769,12 @@ bool emitter::EncodedBySSE38orSSE3A(instruction ins) const * part of an opcode. */ -inline unsigned emitter::insEncodeReg012(instruction ins, regNumber reg, emitAttr size, code_t* code) +inline unsigned emitter::insEncodeReg012(const instrDesc* id, regNumber reg, emitAttr size, code_t* code) { assert(reg < REG_STK); + instruction ins = id->idIns(); + #ifdef TARGET_AMD64 // Either code is not NULL or reg is not an extended reg. // If reg is an extended reg, instruction needs to be prefixed with 'REX' @@ -2651,7 +2783,14 @@ inline unsigned emitter::insEncodeReg012(instruction ins, regNumber reg, emitAtt if (IsExtendedReg(reg)) { - *code = AddRexBPrefix(ins, *code); // REX.B + if (IsHighSIMDReg(reg)) + { + *code = AddRexXPrefix(id, *code); // EVEX.X + } + if (reg & 0x8) + { + *code = AddRexBPrefix(id, *code); // REX.B + } } else if ((EA_SIZE(size) == EA_1BYTE) && (reg > REG_RBX) && (code != nullptr)) { @@ -2673,10 +2812,12 @@ inline unsigned emitter::insEncodeReg012(instruction ins, regNumber reg, emitAtt * part of an opcode. */ -inline unsigned emitter::insEncodeReg345(instruction ins, regNumber reg, emitAttr size, code_t* code) +inline unsigned emitter::insEncodeReg345(const instrDesc* id, regNumber reg, emitAttr size, code_t* code) { assert(reg < REG_STK); + instruction ins = id->idIns(); + #ifdef TARGET_AMD64 // Either code is not NULL or reg is not an extended reg. // If reg is an extended reg, instruction needs to be prefixed with 'REX' @@ -2685,7 +2826,14 @@ inline unsigned emitter::insEncodeReg345(instruction ins, regNumber reg, emitAtt if (IsExtendedReg(reg)) { - *code = AddRexRPrefix(ins, *code); // REX.R + if (IsHighSIMDReg(reg)) + { + *code = AddEvexRPrimePrefix(*code); // EVEX.R' + } + if (reg & 0x8) + { + *code = AddRexRPrefix(id, *code); // REX.R + } } else if ((EA_SIZE(size) == EA_1BYTE) && (reg > REG_RBX) && (code != nullptr)) { @@ -2706,8 +2854,10 @@ inline unsigned emitter::insEncodeReg345(instruction ins, regNumber reg, emitAtt * Returns modified SIMD opcode with the specified register encoded in bits 3-6 of * byte 2 of VEX and EVEX prefix.
*/ -inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg, emitAttr size, code_t code) +inline emitter::code_t emitter::insEncodeReg3456(const instrDesc* id, regNumber reg, emitAttr size, code_t code) { + instruction ins = id->idIns(); + assert(reg < REG_STK); assert(IsVexOrEvexEncodedInstruction(ins)); assert(hasVexOrEvexPrefix(code)); @@ -2725,10 +2875,21 @@ inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg, assert(regBits <= 0xF); if (UseEvexEncoding() && IsEvexEncodedInstruction(ins)) { - if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) + if (TakesEvexPrefix(id) && codeEvexMigrationCheck(code)) { - assert(hasEvexPrefix(code) && TakesEvexPrefix(ins)); + assert(hasEvexPrefix(code) && TakesEvexPrefix(id)); + +// TODO-XARCH-AVX512 I don't like that we redefine regBits in the EVEX case. +// Would rather see these paths cleaned up. +#if defined(TARGET_AMD64) + regBits = HighAwareRegEncoding(reg); + if (IsHighSIMDReg(reg)) + { + // Have to set the EVEX V' bit + code = AddEvexVPrimePrefix(code); + } +#endif + // Shift count = 5-bytes of opcode + 0-2 bits for EVEX regBits <<= 43; return code ^ regBits; @@ -2736,6 +2897,10 @@ inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg, } if (UseVEXEncoding() && IsVexEncodedInstruction(ins)) { + + // Both prefixes encode the register operand in 1's complement form + assert(regBits <= 0xF); + if (TakesVexPrefix(ins)) { assert(hasVexPrefix(code)); @@ -2755,8 +2920,10 @@ inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg, * Used exclusively to generate the REX.X bit and truncate the register. */ -inline unsigned emitter::insEncodeRegSIB(instruction ins, regNumber reg, code_t* code) +inline unsigned emitter::insEncodeRegSIB(const instrDesc* id, regNumber reg, code_t* code) { + instruction ins = id->idIns(); + assert(reg < REG_STK); #ifdef TARGET_AMD64 @@ -2767,7 +2934,14 @@ inline unsigned emitter::insEncodeRegSIB(instruction ins, regNumber reg, code_t* if (IsExtendedReg(reg)) { - *code = AddRexXPrefix(ins, *code); // REX.X + if (IsHighSIMDReg(reg)) + { + *code = AddEvexVPrimePrefix(*code); // EVEX.X + } + if (reg & 0x8) + { + *code = AddRexXPrefix(id, *code); // REX.X + } } unsigned regBits = RegEncoding(reg); #else // !TARGET_AMD64 @@ -2783,7 +2957,7 @@ inline unsigned emitter::insEncodeRegSIB(instruction ins, regNumber reg, code_t* * Returns the "[r/m]" opcode with the mod/RM field set to register. */ -inline emitter::code_t emitter::insEncodeMRreg(instruction ins, code_t code) +inline emitter::code_t emitter::insEncodeMRreg(const instrDesc* id, code_t code) { // If Byte 4 (which is 0xFF00) is 0, that's where the RM encoding goes. // Otherwise, it will be placed after the 4 byte encoding. @@ -2801,7 +2975,7 @@ inline emitter::code_t emitter::insEncodeMRreg(instruction ins, code_t code) * Returns the given "[r/m]" opcode with the mod/RM field set to register. */ -inline emitter::code_t emitter::insEncodeRMreg(instruction ins, code_t code) +inline emitter::code_t emitter::insEncodeRMreg(const instrDesc* id, code_t code) { // If Byte 4 (which is 0xFF00) is 0, that's where the RM encoding goes. // Otherwise, it will be placed after the 4 byte encoding. @@ -2819,11 +2993,11 @@ inline emitter::code_t emitter::insEncodeRMreg(instruction ins, code_t code) * the given register.
*/ -inline emitter::code_t emitter::insEncodeMRreg(instruction ins, regNumber reg, emitAttr size, code_t code) +inline emitter::code_t emitter::insEncodeMRreg(const instrDesc* id, regNumber reg, emitAttr size, code_t code) { assert((code & 0xC000) == 0); code |= 0xC000; - unsigned regcode = insEncodeReg012(ins, reg, size, &code) << 8; + unsigned regcode = insEncodeReg012(id, reg, size, &code) << 8; code |= regcode; return code; } @@ -2834,11 +3008,11 @@ inline emitter::code_t emitter::insEncodeMRreg(instruction ins, regNumber reg, e * the given register. */ -inline emitter::code_t emitter::insEncodeMIreg(instruction ins, regNumber reg, emitAttr size, code_t code) +inline emitter::code_t emitter::insEncodeMIreg(const instrDesc* id, regNumber reg, emitAttr size, code_t code) { assert((code & 0xC000) == 0); code |= 0xC000; - unsigned regcode = insEncodeReg012(ins, reg, size, &code) << 8; + unsigned regcode = insEncodeReg012(id, reg, size, &code) << 8; code |= regcode; return code; } @@ -2859,13 +3033,13 @@ inline bool insNeedsRRIb(instruction ins) * Returns the "reg,reg,imm8" opcode with both regs set to * the given register. */ -inline emitter::code_t emitter::insEncodeRRIb(instruction ins, regNumber reg, emitAttr size) +inline emitter::code_t emitter::insEncodeRRIb(const instrDesc* id, regNumber reg, emitAttr size) { assert(size == EA_4BYTE); // All we handle for now. - assert(insNeedsRRIb(ins)); + assert(insNeedsRRIb(id->idIns())); // If this list gets longer, use a switch, or a table lookup. code_t code = 0x69c0; - unsigned regcode = insEncodeReg012(ins, reg, size, &code); + unsigned regcode = insEncodeReg012(id, reg, size, &code); // We use the same register as source and destination. (Could have another version that does both regs...) code |= regcode; code |= (regcode << 3); @@ -2878,10 +3052,10 @@ inline emitter::code_t emitter::insEncodeRRIb(instruction ins, regNumber reg, em * nibble of the opcode */ -inline emitter::code_t emitter::insEncodeOpreg(instruction ins, regNumber reg, emitAttr size) +inline emitter::code_t emitter::insEncodeOpreg(const instrDesc* id, regNumber reg, emitAttr size) { - code_t code = insCodeRR(ins); - unsigned regcode = insEncodeReg012(ins, reg, size, &code); + code_t code = insCodeRR(id->idIns()); + unsigned regcode = insEncodeReg012(id, reg, size, &code); code |= regcode; return code; } @@ -3155,7 +3329,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id) } else { - sz += emitInsSize(id, insEncodeRMreg(ins, code), includeRexPrefixSize); + sz += emitInsSize(id, insEncodeRMreg(id, code), includeRexPrefixSize); } return sz; @@ -3284,7 +3458,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, assert(emitComp->lvaTempsHaveLargerOffsetThanVars()); // Check whether we can use compressed displacement if EVEX.
- if (TakesEvexPrefix(id->idIns())) + if (TakesEvexPrefix(id)) { bool compressedFitsInByte = false; TryEvexCompressDisp8Byte(id, ssize_t(offs), &compressedFitsInByte); @@ -3328,7 +3502,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, #endif // !FEATURE_FIXED_OUT_ARGS bool useSmallEncoding = false; - if (TakesEvexPrefix(id->idIns())) + if (TakesEvexPrefix(id)) { TryEvexCompressDisp8Byte(id, ssize_t(offs), &useSmallEncoding); } @@ -3481,7 +3655,7 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) } else { - if (TakesEvexPrefix(ins)) + if (TakesEvexPrefix(id)) { dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte); } @@ -5091,7 +5265,7 @@ void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg) /* We expect this to always be a 'big' opcode */ - assert(insEncodeMRreg(ins, reg, attr, insCodeMR(ins)) & 0x00FF0000); + assert(insEncodeMRreg(id, reg, attr, insCodeMR(ins)) & 0x00FF0000); size = attr; @@ -5111,7 +5285,7 @@ void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg) id->idReg1(reg); // Vex bytes - sz += emitGetAdjustedSize(id, insEncodeMRreg(ins, reg, attr, insCodeMR(ins))); + sz += emitGetAdjustedSize(id, insEncodeMRreg(id, reg, attr, insCodeMR(ins))); // REX byte if (IsExtendedReg(reg, attr) || TakesRexWPrefix(ins, attr)) @@ -8968,7 +9142,7 @@ void emitter::emitIns_Call(EmitCallType callType, { // Tailcall with addressing mode/register needs to be rex.w // prefixed to be recognized as part of epilog by unwinder. - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } sz = emitInsSizeAM(id, code); @@ -11330,13 +11504,13 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) { // tail call with addressing mode (or through register) needs rex.w // prefix to be recognized by unwinder as part of epilog. - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Special case: call via a register if (id->idIsCallRegPtr()) { - code = insEncodeMRreg(ins, reg, EA_PTRSIZE, code); + code = insEncodeMRreg(id, reg, EA_PTRSIZE, code); dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); dst += emitOutputWord(dst, code); goto DONE; @@ -11350,14 +11524,14 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Compute the REX prefix if it exists if (IsExtendedReg(reg, EA_PTRSIZE)) { - insEncodeReg012(ins, reg, EA_PTRSIZE, &code); + insEncodeReg012(id, reg, EA_PTRSIZE, &code); // TODO-Cleanup: stop casting RegEncoding() back to a regNumber. reg = (regNumber)RegEncoding(reg); } if (IsExtendedReg(rgx, EA_PTRSIZE)) { - insEncodeRegSIB(ins, rgx, &code); + insEncodeRegSIB(id, rgx, &code); // TODO-Cleanup: stop casting RegEncoding() back to a regNumber. 
rgx = (regNumber)RegEncoding(rgx); } @@ -11403,7 +11577,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) { case IF_RWR_ARD: - assert(code == (insCodeRM(ins) | (insEncodeReg345(ins, REG_EAX, EA_PTRSIZE, NULL) << 8))); + assert(code == (insCodeRM(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8))); code &= ~((code_t)0xFFFFFFFF); code |= 0xA0; @@ -11412,7 +11586,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) case IF_AWR_RRD: - assert(code == (insCodeMR(ins) | (insEncodeReg345(ins, REG_EAX, EA_PTRSIZE, NULL) << 8))); + assert(code == (insCodeMR(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8))); code &= ~((code_t)0xFFFFFFFF); code |= 0xA2; @@ -11429,10 +11603,10 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Emit SIMD prefix if required // There are some callers who already add SIMD prefix and call this routine. // Therefore, add the SIMD prefix if one is not already present. - code = AddSimdPrefixIfNeededAndNotPresent(ins, code, size); + code = AddSimdPrefixIfNeededAndNotPresent(id, code, size); // For this format, moves do not support a third operand, so we only need to handle the binary ops. - if (TakesSimdPrefix(ins)) + if (TakesSimdPrefix(id)) { if (IsDstDstSrcAVXInstruction(ins)) { @@ -11457,11 +11631,11 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, src1, size, code); + code = insEncodeReg3456(id, src1, size, code); } else if (IsDstSrcSrcAVXInstruction(ins)) { - code = insEncodeReg3456(ins, id->idReg2(), size, code); + code = insEncodeReg3456(id, id->idReg2(), size, code); } } @@ -11469,21 +11643,21 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix(). Not doing so currently // since we cannot differentiate EVEX vs VEX without 'code' until all paths have EVEX support. - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } if (IsExtendedReg(reg, EA_PTRSIZE)) { - insEncodeReg012(ins, reg, EA_PTRSIZE, &code); + insEncodeReg012(id, reg, EA_PTRSIZE, &code); // TODO-Cleanup: stop casting RegEncoding() back to a regNumber. reg = (regNumber)RegEncoding(reg); } if (IsExtendedReg(rgx, EA_PTRSIZE)) { - insEncodeRegSIB(ins, rgx, &code); + insEncodeRegSIB(id, rgx, &code); // TODO-Cleanup: stop casting RegEncoding() back to a regNumber.
rgx = (regNumber)RegEncoding(rgx); } @@ -11527,7 +11701,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } } } - unsigned regcode = insEncodeReg345(ins, reg345, size, &code); + unsigned regcode = insEncodeReg345(id, reg345, size, &code); dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -11657,7 +11831,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } else { - if (TakesEvexPrefix(ins)) + if (TakesEvexPrefix(id)) { dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte); } @@ -11888,7 +12062,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { // Put the register in the opcode - code |= insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr); + code |= insEncodeReg012(id, reg, EA_PTRSIZE, nullptr); // Is there a displacement? if (dspIsZero) @@ -11918,7 +12092,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) else { // Put the register in the opcode - code |= insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr) << 8; + code |= insEncodeReg012(id, reg, EA_PTRSIZE, nullptr) << 8; // Is there a displacement? if (dspIsZero) @@ -11964,8 +12138,8 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) if (reg != REG_NA) { // The address is "[reg + {2/4/8} * rgx + icon]" - regByte = insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr) | - insEncodeReg345(ins, rgx, EA_PTRSIZE, nullptr) | insSSval(mul); + regByte = insEncodeReg012(id, reg, EA_PTRSIZE, nullptr) | + insEncodeReg345(id, rgx, EA_PTRSIZE, nullptr) | insSSval(mul); if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { @@ -12031,8 +12205,8 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) else { // The address is "[{2/4/8} * rgx + icon]" - regByte = insEncodeReg012(ins, REG_EBP, EA_PTRSIZE, nullptr) | - insEncodeReg345(ins, rgx, EA_PTRSIZE, nullptr) | insSSval(mul); + regByte = insEncodeReg012(id, REG_EBP, EA_PTRSIZE, nullptr) | + insEncodeReg345(id, rgx, EA_PTRSIZE, nullptr) | insSSval(mul); if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { @@ -12061,7 +12235,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) else { // The address is "[reg+rgx+dsp]" - regByte = insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr) | insEncodeReg345(ins, rgx, EA_PTRSIZE, nullptr); + regByte = insEncodeReg012(id, reg, EA_PTRSIZE, nullptr) | insEncodeReg345(id, rgx, EA_PTRSIZE, nullptr); if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { @@ -12299,16 +12473,16 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Add VEX or EVEX prefix if required. // There are some callers who already add prefix and call this routine. // Therefore, add VEX or EVEX prefix if one is not already present. - code = AddSimdPrefixIfNeededAndNotPresent(ins, code, size); + code = AddSimdPrefixIfNeededAndNotPresent(id, code, size); // Compute the REX prefix // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix(). // Not doing so currently since we cannot differentiate EVEX vs VEX without // 'code' until all paths have EVEX support. 
- if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Special case emitting AVX instructions @@ -12335,9 +12509,9 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } else { - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - unsigned regcode = insEncodeReg345(ins, reg345, size, &code); + unsigned regcode = insEncodeReg345(id, reg345, size, &code); dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -12468,7 +12642,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // function, which the remainder of the emitter logic should handle properly. // TODO-XARCH-AVX512 : embedded broadcast might change this int dspAsByte = dsp; - if (TakesEvexPrefix(ins)) + if (TakesEvexPrefix(id)) { dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte)); } @@ -12522,7 +12696,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // TODO-XARCH-AVX512 : working to wrap up all adjusted disp8 compression logic into the following // function, which the remainder of the emitter logic should handle properly. // TODO-XARCH-AVX512 : embedded broadcast might change this - if (TakesEvexPrefix(ins)) + if (TakesEvexPrefix(id)) { dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte)); } @@ -12748,12 +12922,12 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Compute VEX/EVEX prefix // Some of its callers already add EVEX/VEX prefix and then call this routine. // Therefore add the EVEX/VEX prefix if one is not already present.
- code = AddSimdPrefixIfNeededAndNotPresent(ins, code, size); + code = AddSimdPrefixIfNeededAndNotPresent(id, code, size); // Compute the REX prefix - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // `addc` is used for two kinds of instructions @@ -12788,7 +12962,7 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) { case IF_RWR_MRD: - assert(code == (insCodeRM(ins) | (insEncodeReg345(ins, REG_EAX, EA_PTRSIZE, NULL) << 8) | 0x0500)); + assert(code == (insCodeRM(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8) | 0x0500)); code &= ~((code_t)0xFFFFFFFF); code |= 0xA0; @@ -12797,7 +12971,7 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) case IF_MWR_RRD: - assert(code == (insCodeMR(ins) | (insEncodeReg345(ins, REG_EAX, EA_PTRSIZE, NULL) << 8) | 0x0500)); + assert(code == (insCodeMR(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8) | 0x0500)); code &= ~((code_t)0xFFFFFFFF); code |= 0xA2; @@ -12835,9 +13009,9 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } else { - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - unsigned regcode = insEncodeReg345(ins, reg345, size, &code); + unsigned regcode = insEncodeReg345(id, reg345, size, &code); dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -13214,13 +13388,13 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) code |= 0x1; } - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Register... - unsigned regcode = insEncodeReg012(ins, reg, size, &code); + unsigned regcode = insEncodeReg012(id, reg, size, &code); // Output the REX prefix dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -13234,7 +13408,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) // Output a size prefix for a 16-bit operand dst += emitOutputByte(dst, 0x66); } - dst += emitOutputByte(dst, insCodeRR(ins) | insEncodeReg012(ins, reg, size, nullptr)); + dst += emitOutputByte(dst, insCodeRR(ins) | insEncodeReg012(id, reg, size, nullptr)); } break; @@ -13244,9 +13418,9 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) case INS_push: case INS_push_hide: assert(size == EA_PTRSIZE); - code = insEncodeOpreg(ins, reg, size); + code = insEncodeOpreg(id, reg, size); - assert(!TakesSimdPrefix(ins)); + assert(!TakesSimdPrefix(id)); assert(!TakesRexWPrefix(ins, size)); // Output the REX prefix @@ -13266,13 +13440,13 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) code = insCodeRR(ins); - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Register...
- unsigned regcode = insEncodeReg012(ins, reg, size, &code); + unsigned regcode = insEncodeReg012(id, reg, size, &code); // Output the REX prefix dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -13301,7 +13475,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) assert(id->idGCref() == GCT_NONE); assert(size == EA_1BYTE); - code = insEncodeMRreg(ins, reg, EA_1BYTE, insCodeMR(ins)); + code = insEncodeMRreg(id, reg, EA_1BYTE, insCodeMR(ins)); // Output the REX prefix dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -13327,7 +13501,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) assert(id->idGCref() == GCT_NONE); - code = insEncodeMRreg(ins, reg, size, insCodeMR(ins)); + code = insEncodeMRreg(id, reg, size, insCodeMR(ins)); if (size != EA_1BYTE) { @@ -13341,11 +13515,11 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) } } - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Output the REX prefix @@ -13432,36 +13606,36 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { code = insCodeMR(ins); } - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeRMreg(ins, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeRMreg(id, code); // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. // Explore moving IsWEvexOpcodeExtension() logic to instruction table as flag. - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } } else if ((ins == INS_movsx) || (ins == INS_movzx) || (insIsCMOV(ins))) { assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeRMreg(ins, code) | (int)(size == EA_2BYTE); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeRMreg(id, code) | (int)(size == EA_2BYTE); #ifdef TARGET_AMD64 assert((size < EA_4BYTE) || (insIsCMOV(ins))); if ((size == EA_8BYTE) || (ins == INS_movsx)) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } } else if (ins == INS_movsxd) { assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeRMreg(ins, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeRMreg(id, code); #endif // TARGET_AMD64 } @@ -13471,8 +13645,8 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeRMreg(ins, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeRMreg(id, code); if ((ins == INS_crc32) && (size > EA_1BYTE)) { code |= 0x0100; @@ -13485,15 +13659,15 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) } else if (size == EA_8BYTE) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } } #endif // FEATURE_HW_INTRINSICS else { - assert(!TakesSimdPrefix(ins)); + assert(!TakesSimdPrefix(id)); code = insCodeMR(ins); - 
code = insEncodeMRreg(ins, code); + code = insEncodeMRreg(id, code); if (ins != INS_test) { @@ -13523,7 +13697,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) // Don't need to zero out the high bits explicitly if ((ins != INS_xor) || (reg1 != reg2)) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } else { @@ -13560,10 +13734,10 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) } } - unsigned regCode = insEncodeReg345(ins, regFor345Bits, size, &code); - regCode |= insEncodeReg012(ins, regFor012Bits, size, &code); + unsigned regCode = insEncodeReg345(id, regFor345Bits, size, &code); + regCode |= insEncodeReg012(id, regFor012Bits, size, &code); - if (TakesSimdPrefix(ins)) + if (TakesSimdPrefix(id)) { // In case of AVX instructions that take 3 operands, we generally want to encode reg1 // as first source. In this case, reg1 is both a source and a destination. @@ -13575,12 +13749,12 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) if (IsDstDstSrcAVXInstruction(ins)) { // encode source/dest operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, reg1, size, code); + code = insEncodeReg3456(id, reg1, size, code); } else if (IsDstSrcSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, reg2, size, code); + code = insEncodeReg3456(id, reg2, size, code); } } @@ -13822,21 +13996,21 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) emitAttr size = id->idOpSize(); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); - code = insEncodeRMreg(ins, code); + code = insEncodeRMreg(id, code); // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. // Explore moving IsWEvexOpcodeExtension() logic to instruction table as flag. - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } - unsigned regCode = insEncodeReg345(ins, targetReg, size, &code); - regCode |= insEncodeReg012(ins, src2, size, &code); + unsigned regCode = insEncodeReg345(id, targetReg, size, &code); + regCode |= insEncodeReg012(id, src2, size, &code); // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, src1, size, code); + code = insEncodeReg3456(id, src1, size, code); // Output the REX/VEX/EVEX prefix dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -13930,17 +14104,17 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) // Get the 'base' opcode. code = insCodeMI(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeMIreg(ins, reg, size, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeMIreg(id, reg, size, code); assert(code & 0x00FF0000); - if (TakesSimdPrefix(ins)) + if (TakesSimdPrefix(id)) { // The 'vvvv' bits encode the destination register, which for this case (RI) // is the same as the source. 
- code = insEncodeReg3456(ins, reg, size, code); + code = insEncodeReg3456(id, reg, size, code); } - unsigned regcode = (insEncodeReg345(ins, regOpcode, size, &code) | insEncodeReg012(ins, reg, size, &code)) << 8; + unsigned regcode = (insEncodeReg345(id, regOpcode, size, &code) | insEncodeReg012(id, reg, size, &code)) << 8; // Output the REX prefix dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -13968,15 +14142,15 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) assert(code < 0x100); code |= 0x08; // Set the 'w' bit - unsigned regcode = insEncodeReg012(ins, reg, size, &code); + unsigned regcode = insEncodeReg012(id, reg, size, &code); code |= regcode; // This is INS_mov and will not take VEX prefix assert(!TakesVexPrefix(ins)); - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -14068,13 +14242,13 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) // r/m, immed form, but do have a dstReg,srcReg,imm8 form. if (valInByte && useSigned && insNeedsRRIb(ins)) { - code = insEncodeRRIb(ins, reg, size); + code = insEncodeRRIb(id, reg, size); } else { code = insCodeMI(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeMIreg(ins, reg, size, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeMIreg(id, reg, size, code); } } @@ -14098,7 +14272,7 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) /* Set the 'w' bit to get the large version */ /* and the REX.W bit to get the really large version */ - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); code |= 0x1; break; #endif @@ -14307,9 +14481,9 @@ BYTE* emitter::emitOutputIV(BYTE* dst, instrDesc* id) } else { - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); } @@ -14617,7 +14791,7 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) idAmd->idCodeSize(sz); code = insCodeRM(ins); - code |= (insEncodeReg345(ins, id->idReg1(), EA_PTRSIZE, &code) << 8); + code |= (insEncodeReg345(id, id->idReg1(), EA_PTRSIZE, &code) << 8); dst = emitOutputAM(dst, idAmd, code, nullptr); @@ -14733,7 +14907,7 @@ ssize_t emitter::GetInputSizeInBytes(instrDesc* id) // ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspInByte) { - assert(TakesEvexPrefix(id->idIns())); + assert(TakesEvexPrefix(id)); insTupleType tt = insTupleTypeInfo(id->idIns()); assert(hasTupleTypeInfo(id->idIns())); @@ -14944,12 +15118,12 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) #ifdef TARGET_AMD64 // Support only scalar AVX instructions and hence size is hard coded to 4-byte. 
- code = AddSimdPrefixIfNeeded(ins, code, EA_4BYTE); + code = AddSimdPrefixIfNeeded(id, code, EA_4BYTE); if (((ins == INS_cdq) || (ins == INS_cwde)) && - (TakesRexWPrefix(ins, id->idOpSize()) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins)))) + (TakesRexWPrefix(ins, id->idOpSize()) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id)))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); #endif @@ -15223,8 +15397,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_RRW_SHF: code = insCodeMR(ins); // Emit the VEX prefix if it exists - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeMRreg(ins, id->idReg1(), size, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeMRreg(id, id->idReg1(), size, code); // set the W bit if (size != EA_1BYTE) @@ -15233,9 +15407,9 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } // Emit the REX prefix if it exists - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Output a size prefix for a 16-bit operand @@ -15291,8 +15465,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { code = insCodeMR(ins); // Emit the VEX prefix if it exists - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeMRreg(ins, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeMRreg(id, code); mReg = id->idReg1(); rReg = id->idReg2(); } @@ -15301,7 +15475,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) code = insCodeMI(ins); // Emit the VEX prefix if it exists - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); assert((code & 0xC000) == 0); code |= 0xC000; @@ -15315,19 +15489,19 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { code = insCodeRM(ins); // Emit the VEX prefix if it exists - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeRMreg(ins, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeRMreg(id, code); mReg = id->idReg2(); rReg = id->idReg1(); } assert(code & 0x00FF0000); - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } - if (TakesSimdPrefix(ins)) + if (TakesSimdPrefix(id)) { if (IsDstDstSrcAVXInstruction(ins)) { @@ -15337,17 +15511,17 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) // (Though we will need to handle the few ops that can have the 'vvvv' bits as destination, // e.g. pslldq, when/if we support those instructions with 2 registers.) // (see x64 manual Table 2-9. Instructions with a VEX.vvvv destination) - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } else if (IsDstSrcSrcAVXInstruction(ins)) { // This is a "merge" move instruction. 
// Encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg2(), size, code); + code = insEncodeReg3456(id, id->idReg2(), size, code); } } - regcode = (insEncodeReg345(ins, rReg, size, &code) | insEncodeReg012(ins, mReg, size, &code)); + regcode = (insEncodeReg345(id, rReg, size, &code) | insEncodeReg012(id, mReg, size, &code)); // Output the REX prefix dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -15462,8 +15636,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + code = AddSimdPrefixIfNeeded(id, code, size); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode, &cnsVal); } @@ -15490,8 +15664,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + code = AddSimdPrefixIfNeeded(id, code, size); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode); } sz = emitSizeOfInsDsc(id); @@ -15519,8 +15693,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + code = AddSimdPrefixIfNeeded(id, code, size); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode, &cnsVal); } sz = emitSizeOfInsDsc(id); @@ -15531,8 +15705,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_AWR_RRD: case IF_ARW_RRD: code = insCodeMR(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + code = AddSimdPrefixIfNeeded(id, code, size); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode); sz = emitSizeOfInsDsc(id); break; @@ -15540,7 +15714,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_AWR_RRD_RRD: { code = insCodeMR(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); dst = emitOutputAM(dst, id, code); sz = emitSizeOfInsDsc(id); break; @@ -15629,7 +15803,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. 
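The "encode reg1 as first source" comment above recurs throughout these hunks. As a minimal standalone sketch (names and types here are illustrative, not the JIT's), this is the operand mapping it describes: a legacy two-operand RMW form such as `addps xmm1, xmm2` becomes the three-operand VEX/EVEX form `vaddps xmm1, xmm1, xmm2`, and it is the first source that travels in the inverted 'vvvv' field:

#include <cassert>

// Operand roles for a 3-operand VEX/EVEX instruction built from a legacy
// 2-operand read-modify-write form (dst op= src).
struct VexOperands
{
    unsigned dst;  // destination, encoded in ModRM.reg
    unsigned src1; // first source, carried 1's-complemented in 'vvvv'
    unsigned src2; // second source, encoded in ModRM.rm
};

VexOperands fromRmwForm(unsigned reg1, unsigned reg2)
{
    // reg1 is both destination and first source, per the comment above.
    return {reg1, reg1, reg2};
}

unsigned encodeVvvv(unsigned srcReg)
{
    assert(srcReg < 32);
    // Low 4 bits, inverted; EVEX adds an inverted V' bit for bit 4, which
    // is what makes xmm16-xmm31 expressible at all.
    return (~srcReg) & 0xF;
}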
@@ -15640,10 +15814,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode, &cnsVal); } @@ -15664,15 +15838,15 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode); } @@ -15685,8 +15859,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) assert(IsVexOrEvexEncodedInstruction(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeReg3456(ins, id->idReg2(), size, + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form // 4-byte AVX instructions are special cased inside emitOutputSV @@ -15697,7 +15871,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode); } break; @@ -15711,8 +15885,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) emitGetInsCns(id, &cnsVal); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeReg3456(ins, id->idReg2(), size, + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form // 4-byte AVX instructions are special cased inside emitOutputSV @@ -15723,7 +15897,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode, &cnsVal); } @@ -15735,7 +15909,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_SWR_RRD: case IF_SRW_RRD: code = insCodeMR(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. 
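Stepping back, every `ins`-to-`id` change in these output routines serves the same need: with 32 SIMD registers the VEX-versus-EVEX choice is no longer a function of the opcode alone, because only EVEX can name xmm16-xmm31. A minimal sketch of that predicate, with assumed register numbering (the real checks are `TakesEvexPrefix`/`IsHighSIMDReg` on the instrDesc):

constexpr unsigned kFirstHighSimdReg = 16; // xmm16; numbering is an assumption
constexpr unsigned kNumSimdRegs      = 32; // xmm0-xmm31 under AVX-512 VL

bool isHighSimdReg(unsigned regNum)
{
    return (regNum >= kFirstHighSimdReg) && (regNum < kNumSimdRegs);
}

// An instruction that could otherwise use the shorter VEX encoding must be
// promoted to EVEX as soon as any register operand is "high"; that is why
// the emitter now needs the instrDesc (which holds the registers) rather
// than just the instruction.
bool needsEvexForOperands(unsigned reg1, unsigned reg2, unsigned reg3)
{
    return isHighSimdReg(reg1) || isHighSimdReg(reg2) || isHighSimdReg(reg3);
}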
@@ -15746,10 +15920,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode); break; @@ -15783,7 +15957,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. @@ -15794,10 +15968,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500, &cnsVal); } @@ -15828,15 +16002,15 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500); } @@ -15850,8 +16024,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) assert(IsVexOrEvexEncodedInstruction(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeReg3456(ins, id->idReg2(), size, + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form // Special case 4-byte AVX instructions @@ -15861,7 +16035,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500); } sz = emitSizeOfInsDsc(id); @@ -15876,8 +16050,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) emitGetInsCns(id, &cnsVal); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeReg3456(ins, id->idReg2(), size, + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form // Special case 4-byte AVX instructions @@ -15887,7 +16061,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500, &cnsVal); } sz = 
emitSizeOfInsDsc(id); @@ -15896,7 +16070,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_RWR_MRD_OFF: code = insCode(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. @@ -15907,10 +16081,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = insEncodeReg012(id->idIns(), id->idReg1(), size, &code); + regcode = insEncodeReg012(id, id->idReg1(), size, &code); dst = emitOutputCV(dst, id, code | 0x30 | regcode); sz = emitSizeOfInsDsc(id); break; @@ -15919,7 +16093,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_MWR_RRD: case IF_MRW_RRD: code = insCodeMR(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. @@ -15930,10 +16104,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500); sz = emitSizeOfInsDsc(id); break; diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index dd4eec46dadb92..6741676dfce43f 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -75,16 +75,16 @@ unsigned emitGetAdjustedSize(instrDesc* id, code_t code) const; code_t emitExtractVexPrefix(instruction ins, code_t& code) const; code_t emitExtractEvexPrefix(instruction ins, code_t& code) const; -unsigned insEncodeReg012(instruction ins, regNumber reg, emitAttr size, code_t* code); -unsigned insEncodeReg345(instruction ins, regNumber reg, emitAttr size, code_t* code); -code_t insEncodeReg3456(instruction ins, regNumber reg, emitAttr size, code_t code); -unsigned insEncodeRegSIB(instruction ins, regNumber reg, code_t* code); +unsigned insEncodeReg012(const instrDesc* id, regNumber reg, emitAttr size, code_t* code); +unsigned insEncodeReg345(const instrDesc* id, regNumber reg, emitAttr size, code_t* code); +code_t insEncodeReg3456(const instrDesc* id, regNumber reg, emitAttr size, code_t code); +unsigned insEncodeRegSIB(const instrDesc* id, regNumber reg, code_t* code); -code_t insEncodeMRreg(instruction ins, code_t code); -code_t insEncodeRMreg(instruction ins, code_t code); -code_t insEncodeMRreg(instruction ins, regNumber reg, emitAttr size, code_t code); -code_t insEncodeRRIb(instruction ins, regNumber reg, emitAttr size); -code_t insEncodeOpreg(instruction ins, regNumber reg, emitAttr size); +code_t insEncodeMRreg(const instrDesc* id, code_t code); +code_t insEncodeRMreg(const instrDesc* id, code_t code); +code_t insEncodeMRreg(const instrDesc* id, regNumber reg, emitAttr size, code_t code); +code_t insEncodeRRIb(const instrDesc* id, regNumber reg, emitAttr size); 
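These prototypes switch from `instruction` to `const instrDesc*` because placing a 5-bit register number is spread across several prefix bits, and the fifth bit only exists under EVEX (the `AddEvexVPrimePrefix`/`AddEvexRPrimePrefix` helpers declared just below supply it). A self-contained sketch of the split, with descriptive rather than actual field names:

#include <cstdio>

// A SIMD register number 0-31 decomposes as: bits 0-2 in the ModRM/SIB
// field, bit 3 in the REX-equivalent R/X/B extension, and bit 4 (only
// expressible under EVEX) in the inverted R'/V'/X extension bit.
struct SimdRegBits
{
    unsigned low3; // ModRM.reg / ModRM.rm / SIB field
    unsigned bit3; // VEX/EVEX R, X, or B (stored inverted in the prefix)
    unsigned bit4; // EVEX R', V', or X for regs 16-31 (also stored inverted)
};

SimdRegBits splitSimdReg(unsigned regNum) // 0..31
{
    return {regNum & 0x7, (regNum >> 3) & 0x1, (regNum >> 4) & 0x1};
}

int main()
{
    SimdRegBits b = splitSimdReg(21); // xmm21 = 0b10101
    printf("low3=%u bit3=%u bit4=%u\n", b.low3, b.bit3, b.bit4); // 5 0 1
    return 0;
}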
+code_t insEncodeOpreg(const instrDesc* id, regNumber reg, emitAttr size); unsigned insSSval(unsigned scale); @@ -103,16 +103,19 @@ bool IsVexEncodedInstruction(instruction ins) const; bool IsEvexEncodedInstruction(instruction ins) const; bool IsVexOrEvexEncodedInstruction(instruction ins) const; -code_t insEncodeMIreg(instruction ins, regNumber reg, emitAttr size, code_t code); +code_t insEncodeMIreg(const instrDesc* id, regNumber reg, emitAttr size, code_t code); -code_t AddRexWPrefix(instruction ins, code_t code); -code_t AddRexRPrefix(instruction ins, code_t code); -code_t AddRexXPrefix(instruction ins, code_t code); -code_t AddRexBPrefix(instruction ins, code_t code); +code_t AddRexWPrefix(const instrDesc* id, code_t code); +code_t AddRexRPrefix(const instrDesc* id, code_t code); +code_t AddRexXPrefix(const instrDesc* id, code_t code); +code_t AddRexBPrefix(const instrDesc* id, code_t code); code_t AddRexPrefix(instruction ins, code_t code); bool EncodedBySSE38orSSE3A(instruction ins) const; bool Is4ByteSSEInstruction(instruction ins) const; +code_t AddEvexVPrimePrefix(code_t code); +code_t AddEvexRPrimePrefix(code_t code); + static bool IsMovInstruction(instruction ins); bool HasSideEffect(instruction ins, emitAttr size); bool IsRedundantMov( @@ -181,13 +184,15 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr // Returns: // `true` if W bit needs to be set to 1. // -bool IsWEvexOpcodeExtension(instruction ins) +bool IsWEvexOpcodeExtension(const instrDesc* id) { - if (!TakesEvexPrefix(ins)) + if (!TakesEvexPrefix(id)) { return false; } + instruction ins = id->idIns(); + switch (ins) { case INS_movq: @@ -486,7 +491,7 @@ bool UseSimdEncoding() const #define EVEX_PREFIX_MASK 0xFF00000000000000ULL #define EVEX_PREFIX_CODE 0x6200000000000000ULL -bool TakesEvexPrefix(instruction ins) const; +bool TakesEvexPrefix(const instrDesc* id) const; //------------------------------------------------------------------------ // hasEvexPrefix: Returns true if the instruction encoding already @@ -514,9 +519,13 @@ code_t AddEvexPrefix(instruction ins, code_t code, emitAttr attr); // // Returns: // code with prefix added. -code_t AddSimdPrefixIfNeeded(instruction ins, code_t code, emitAttr size) +// TODO-XARCH-AVX512 come back and check whether we can use `id` directly (no need +// to pass emitAttr size) +code_t AddSimdPrefixIfNeeded(const instrDesc* id, code_t code, emitAttr size) { - if (TakesEvexPrefix(ins)) + instruction ins = id->idIns(); + + if (TakesEvexPrefix(id)) { code = AddEvexPrefix(ins, code, size); } @@ -537,11 +546,14 @@ code_t AddSimdPrefixIfNeeded(instruction ins, code_t code, emitAttr size) // size - operand size // // Returns: -// `true` if code has an Evex prefix. -// -code_t AddSimdPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr size) +// code with the Evex prefix added if it is needed and not already present. +// TODO-XARCH-AVX512 come back and check whether we can use `id` directly (no need +// to pass emitAttr size) +code_t AddSimdPrefixIfNeededAndNotPresent(const instrDesc* id, code_t code, emitAttr size) { - if (TakesEvexPrefix(ins)) + instruction ins = id->idIns(); + + if (TakesEvexPrefix(id)) { code = !hasEvexPrefix(code) ?
AddEvexPrefix(ins, code, size) : code; } @@ -552,7 +564,7 @@ code_t AddSimdPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr return code; } -bool TakesSimdPrefix(instruction ins) const; +bool TakesSimdPrefix(const instrDesc* id) const; //------------------------------------------------------------------------ // hasVexOrEvexPrefix: Returns true if the instruction encoding already @@ -1024,4 +1036,7 @@ inline bool HasEmbeddedBroadcast(instrDesc* id) return false; } +inline bool HasHighSIMDReg(const instrDesc* id) const; +inline bool IsHighSIMDReg(regNumber) const; + #endif // TARGET_XARCH diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index 0edd913a00bab3..d36c2ae582b6d1 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -19106,6 +19106,27 @@ bool GenTree::isRMWHWIntrinsic(Compiler* comp) #endif } +//------------------------------------------------------------------------ +// isEvexCompatibleHWIntrinsic: Checks if the intrinsic has a compatible +// EVEX form for its intended lowering instruction. +// +// Return Value: +// true if the intrinsic node's lowering instruction has an EVEX form +// +bool GenTree::isEvexCompatibleHWIntrinsic() +{ + assert(gtOper == GT_HWINTRINSIC); + +// TODO-XARCH-AVX512 remove the ReturnsPerElementMask check once K registers have been properly +// implemented in the register allocator +#if defined(TARGET_AMD64) + return HWIntrinsicInfo::HasEvexSemantics(AsHWIntrinsic()->GetHWIntrinsicId()) && + !HWIntrinsicInfo::ReturnsPerElementMask(AsHWIntrinsic()->GetHWIntrinsicId()); +#else + return false; +#endif +} + GenTreeHWIntrinsic* Compiler::gtNewSimdHWIntrinsicNode(var_types type, NamedIntrinsic hwIntrinsicID, CorInfoType simdBaseJitType, diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 0e19fd87107827..7359a86970d09c 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -1509,6 +1509,7 @@ struct GenTree bool isCommutativeHWIntrinsic() const; bool isContainableHWIntrinsic() const; bool isRMWHWIntrinsic(Compiler* comp); + bool isEvexCompatibleHWIntrinsic(); #else bool isCommutativeHWIntrinsic() const { @@ -1524,6 +1525,11 @@ struct GenTree { return false; } + + bool isEvexCompatibleHWIntrinsic() + { + return false; + } #endif // FEATURE_HW_INTRINSICS static bool OperIsCommutative(genTreeOps gtOper) diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h index b1299df1c1f1cf..bacb22173cedfa 100644 --- a/src/coreclr/jit/hwintrinsic.h +++ b/src/coreclr/jit/hwintrinsic.h @@ -158,6 +158,9 @@ enum HWIntrinsicFlag : unsigned int // contained HW_Flag_MaybeCommutative = 0x80000, + // The intrinsic has no EVEX compatible form + HW_Flag_NoEvexSemantics = 0x100000 + #elif defined(TARGET_ARM64) // The intrinsic has an immediate operand // - the value can be (and should be) encoded in a corresponding instruction when the operand value is constant @@ -172,8 +175,7 @@ enum HWIntrinsicFlag : unsigned int HW_Flag_SIMDScalar = 0x1000, // The intrinsic supports some sort of containment analysis - HW_Flag_SupportsContainment = 0x2000 - + HW_Flag_SupportsContainment = 0x2000, #else #error Unsupported platform #endif @@ -758,6 +760,22 @@ struct HWIntrinsicInfo return (flags & HW_Flag_HasRMWSemantics) != 0; #else #error Unsupported platform +#endif + } + //------------------------------------------------------------------------ + // HasEvexSemantics: Checks if the NamedIntrinsic has a lowering + // to an instruction with an EVEX form.
+ // + // Return Value: + // true if the NamedIntrinsic lowering has an EVEX form. + // + static bool HasEvexSemantics(NamedIntrinsic id) + { +#if defined(TARGET_XARCH) + HWIntrinsicFlag flags = lookupFlags(id); + return (flags & HW_Flag_NoEvexSemantics) == 0; +#else + return false; #endif } diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 8d5c2d16a35cbd..f474d5387333f4 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -68,7 +68,7 @@ HARDWARE_INTRINSIC(Vector128, EqualsAll, HARDWARE_INTRINSIC(Vector128, EqualsAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, ExtractMostSignificantBits, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(Vector128, get_One, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, get_Zero, 16, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(Vector128, GetElement, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_extractps, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg) @@ -171,7 +171,7 @@ HARDWARE_INTRINSIC(Vector256, EqualsAll, HARDWARE_INTRINSIC(Vector256, EqualsAny, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, ExtractMostSignificantBits, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, Floor, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, 
HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) -HARDWARE_INTRINSIC(Vector256, get_AllBitsSet, 32, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_AvxOnlyCompatible) +HARDWARE_INTRINSIC(Vector256, get_AllBitsSet, 32, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_AvxOnlyCompatible|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(Vector256, get_One, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, get_Zero, 32, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, GetElement, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible) @@ -256,42 +256,42 @@ HARDWARE_INTRINSIC(SSE, Add, HARDWARE_INTRINSIC(SSE, AddScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE, And, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE, AndNot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedEqual, 16, 2, {INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE, CompareGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE, CompareGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareLessThan, 16, 2, {INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE, CompareLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE, CompareLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, CompareNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, 
HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, CompareNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, CompareOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, CompareUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, 
HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, 
HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, ConvertToInt32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, ConvertScalarToVector128Single, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss32, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE, ConvertToInt32WithTruncation, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) @@ -308,7 +308,7 @@ HARDWARE_INTRINSIC(SSE, Min, HARDWARE_INTRINSIC(SSE, MinScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE, MoveHighToLow, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment) HARDWARE_INTRINSIC(SSE, MoveLowToHigh, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment) -HARDWARE_INTRINSIC(SSE, MoveMask, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(SSE, MoveMask, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, MoveScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment) HARDWARE_INTRINSIC(SSE, Multiply, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE, MultiplyScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) @@ -317,10 +317,10 @@ HARDWARE_INTRINSIC(SSE, Prefetch0, HARDWARE_INTRINSIC(SSE, Prefetch1, 0, 1, {INS_invalid, INS_prefetcht1, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, Prefetch2, 0, 1, {INS_invalid, INS_prefetcht2, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, PrefetchNonTemporal, 0, 1, {INS_invalid, INS_prefetchnta, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, 
HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, Reciprocal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, ReciprocalScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, ReciprocalSqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, ReciprocalSqrtScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, Reciprocal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, ReciprocalScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, ReciprocalSqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, ReciprocalSqrtScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, Shuffle, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(SSE, Sqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, SqrtScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) @@ -357,42 +357,42 @@ HARDWARE_INTRINSIC(SSE2, AddScalar, HARDWARE_INTRINSIC(SSE2, And, 16, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE2, AndNot, 16, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE2, Average, 16, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(SSE2, CompareEqual, 16, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE2, 
CompareEqual, 16, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareScalarEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareScalarEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareGreaterThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE2, CompareGreaterThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareScalarGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareScalarGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, CompareGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareScalarGreaterThanOrEqual, 16, 2, {INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2, CompareScalarGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2, CompareLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(SSE2, CompareLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2, CompareScalarLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2, CompareScalarLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2, CompareScalarLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2, CompareScalarLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2, CompareScalarNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2, CompareScalarNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(SSE2, CompareScalarNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE2, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(SSE2, CompareScalarNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE2, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(SSE2, CompareScalarNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE2, CompareScalarNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE2, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE2, CompareScalarNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE2, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE2, CompareScalarNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(SSE2, CompareScalarNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2, CompareScalarNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(SSE2, CompareScalarOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2, CompareScalarOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(SSE2, CompareScalarUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2, CompareScalarUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, ConvertToInt32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsd2si}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2, ConvertToInt32WithTruncation, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttsd2si}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2, ConvertToUInt32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
@@ -420,7 +420,7 @@ HARDWARE_INTRINSIC(SSE2, MemoryFence,
 HARDWARE_INTRINSIC(SSE2, MaxScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE2, Min, 16, 2, {INS_invalid, INS_pminub, INS_pminsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative)
 HARDWARE_INTRINSIC(SSE2, MinScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE2, MoveMask, 16, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskpd}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(SSE2, MoveMask, 16, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskpd}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, MoveScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movq, INS_movq, INS_invalid, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_NoContainment)
 HARDWARE_INTRINSIC(SSE2, Multiply, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuludq, INS_invalid, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE2, MultiplyHigh, 16, 2, {INS_invalid, INS_invalid, INS_pmulhw, INS_pmulhuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
@@ -473,11 +473,11 @@ HARDWARE_INTRINSIC(SSE2_X64, StoreNonTemporal,
 //                     {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
 // ***************************************************************************************************************************************************************************************************************
 // SSE3 Intrinsics
-HARDWARE_INTRINSIC(SSE3, AddSubtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE3, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE3, HorizontalSubtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE3, LoadAndDuplicateToVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_lddqu, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE3, LoadDquVector128, 16, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE3, AddSubtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE3, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE3, HorizontalSubtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE3, LoadAndDuplicateToVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_lddqu, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE3, LoadDquVector128, 16, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE3, MoveAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE3, MoveHighAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movshdup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE3, MoveLowAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsldup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
@@ -489,54 +489,54 @@ HARDWARE_INTRINSIC(SSE3, MoveLowAndDuplicate,
 // SSSE3 Intrinsics
 HARDWARE_INTRINSIC(SSSE3, Abs, 16, 1, {INS_pabsb, INS_invalid, INS_pabsw, INS_invalid, INS_pabsd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(SSSE3, AlignRight, 16, 3, {INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(SSSE3, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSSE3, HorizontalAddSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSSE3, HorizontalSubtract, 16, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSSE3, HorizontalSubtractSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSSE3, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSSE3, HorizontalAddSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSSE3, HorizontalSubtract, 16, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSSE3, HorizontalSubtractSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSSE3, MultiplyAddAdjacent, 16, 2, {INS_invalid, INS_invalid, INS_pmaddubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSSE3, MultiplyHighRoundScale, 16, 2, {INS_invalid, INS_invalid, INS_pmulhrsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSSE3, Shuffle, 16, 2, {INS_pshufb, INS_pshufb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSSE3, Sign, 16, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSSE3, Sign, 16, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 // ***************************************************************************************************************************************************************************************************************
 // ISA Function name SIMD size NumArg Instructions Category Flags
 //                     {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
 // ***************************************************************************************************************************************************************************************************************
 // SSE41 Intrinsics
-HARDWARE_INTRINSIC(SSE41, Blend, 16, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(SSE41, BlendVariable, 16, 3, {INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_blendvps, INS_blendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE41, Ceiling, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, CeilingScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE41, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(SSE41, Blend, 16, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, BlendVariable, 16, 3, {INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_blendvps, INS_blendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, Ceiling, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, CeilingScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE41, ConvertToVector128Int16, 16, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(SSE41, ConvertToVector128Int32, 16, 1, {INS_pmovsxbd, INS_pmovzxbd, INS_pmovsxwd, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(SSE41, ConvertToVector128Int64, 16, 1, {INS_pmovsxbq, INS_pmovzxbq, INS_pmovsxwq, INS_pmovzxwq, INS_pmovsxdq, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad)
-HARDWARE_INTRINSIC(SSE41, DotProduct, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_dppd}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(SSE41, DotProduct, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_dppd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE41, Extract, 16, 2, {INS_pextrb, INS_pextrb, INS_invalid, INS_invalid, INS_pextrd, INS_pextrd, INS_invalid, INS_invalid, INS_extractps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_BaseTypeFromFirstArg|HW_Flag_MultiIns|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, FloorScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE41, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, FloorScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE41, Insert, 16, 3, {INS_pinsrb, INS_pinsrb, INS_invalid, INS_invalid, INS_pinsrd, INS_pinsrd, INS_invalid, INS_invalid, INS_insertps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(SSE41, LoadAlignedVector128NonTemporal, 16, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE41, Max, 16, 2, {INS_pmaxsb, INS_invalid, INS_invalid, INS_pmaxuw, INS_pmaxsd, INS_pmaxud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE41, Min, 16, 2, {INS_pminsb, INS_invalid, INS_invalid, INS_pminuw, INS_pminsd, INS_pminud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE41, MinHorizontal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_phminposuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, MultipleSumAbsoluteDifferences, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(SSE41, MinHorizontal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_phminposuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
+-HARDWARE_INTRINSIC(SSE41, MultipleSumAbsoluteDifferences, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(SSE41, MinHorizontal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_phminposuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, MultipleSumAbsoluteDifferences, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE41, Multiply, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE41, MultiplyLow, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmulld, INS_pmulld, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE41, PackUnsignedSaturate, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_packusdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE41, RoundCurrentDirection, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, RoundCurrentDirectionScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE41, RoundToNearestInteger, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, RoundToNearestIntegerScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE41, RoundToNegativeInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, RoundToNegativeInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE41, RoundToPositiveInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, RoundToPositiveInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE41, RoundToZero, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, RoundToZeroScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE41, TestC, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(SSE41, TestNotZAndNotC, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(SSE41, TestZ, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(SSE41, RoundCurrentDirection, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundCurrentDirectionScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToNearestInteger, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToNearestIntegerScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToNegativeInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToNegativeInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToPositiveInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToPositiveInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToZero, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToZeroScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, TestC, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, TestNotZAndNotC, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, TestZ, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
 // ***************************************************************************************************************************************************************************************************************
 // ISA Function name SIMD size NumArg Instructions Category Flags
@@ -568,15 +568,15 @@ HARDWARE_INTRINSIC(SSE42_X64, Crc32,
 // ***************************************************************************************************************************************************************************************************************
 // AVX Intrinsics
 HARDWARE_INTRINSIC(AVX, Add, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AVX, AddSubtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX, AddSubtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, And, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX, AndNot, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, Blend, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(AVX, BlendVariable, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, Ceiling, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX, Blend, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, BlendVariable, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, Ceiling, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, BroadcastScalarToVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX, BroadcastScalarToVector256, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, BroadcastVector128ToVector256, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_vbroadcastf128}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX, BroadcastVector128ToVector256, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_vbroadcastf128}, HW_Category_MemoryLoad, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, Compare, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_IMM, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX, CompareEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX, CompareGreaterThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
@@ -590,7 +590,7 @@ HARDWARE_INTRINSIC(AVX, CompareNotLessThan,
 HARDWARE_INTRINSIC(AVX, CompareNotLessThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX, CompareOrdered, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX, CompareUnordered, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(AVX, CompareScalar, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_cmpsd}, HW_Category_IMM, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(AVX, CompareScalar, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_cmpsd}, HW_Category_IMM, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, ConvertToVector128Int32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX, ConvertToVector128Single, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2ps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX, ConvertToVector256Int32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
@@ -599,43 +599,43 @@ HARDWARE_INTRINSIC(AVX, ConvertToVector256Double,
 HARDWARE_INTRINSIC(AVX, ConvertToVector128Int32WithTruncation, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttpd2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX, ConvertToVector256Int32WithTruncation, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttps2dq, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX, Divide, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divps, INS_divpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, DotProduct, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX, DotProduct, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, DuplicateEvenIndexed, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsldup, INS_movddup}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX, DuplicateOddIndexed, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movshdup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX, ExtractVector128, 32, 2, {INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(AVX, Floor, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, HorizontalAdd, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, HorizontalSubtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX, Floor, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, HorizontalAdd, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, HorizontalSubtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, InsertVector128, 32, 3, {INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128}, HW_Category_IMM, HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX, LoadAlignedVector256, 32, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, LoadDquVector256, 32, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX, LoadDquVector256, 32, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, LoadVector256, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(AVX, Max, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxps, INS_maxpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative)
 HARDWARE_INTRINSIC(AVX, Min, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minps, INS_minpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative)
-HARDWARE_INTRINSIC(AVX, MaskLoad, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, MaskStore, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_BaseTypeFromSecondArg)
-HARDWARE_INTRINSIC(AVX, MoveMask, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_movmskpd}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX, MaskLoad, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryLoad, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, MaskStore, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_BaseTypeFromSecondArg|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, MoveMask, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_movmskpd}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, Multiply, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulps, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX, Or, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX, Permute, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermilps, INS_vpermilpd}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(AVX, Permute2x128, 32, 3, {INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX, Permute2x128, 32, 3, {INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, PermuteVar, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermilpsvar, INS_vpermilpdvar}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, Reciprocal, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, ReciprocalSqrt, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, RoundCurrentDirection, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, RoundToNearestInteger, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, RoundToNegativeInfinity, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, RoundToPositiveInfinity, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, RoundToZero, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX, Reciprocal, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, ReciprocalSqrt, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, RoundCurrentDirection, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, RoundToNearestInteger, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, RoundToNegativeInfinity, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, RoundToPositiveInfinity, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, RoundToZero, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, Shuffle, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_shufpd}, HW_Category_IMM, HW_Flag_NoRMWSemantics|HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX, Sqrt, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_sqrtpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX, Store, 32, 2, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromSecondArg|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(AVX, StoreAligned, 32, 2, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg)
 HARDWARE_INTRINSIC(AVX, StoreAlignedNonTemporal, 32, 2, {INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntps, INS_movntpd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg)
 HARDWARE_INTRINSIC(AVX, Subtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subps, INS_subpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, TestC, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(AVX, TestNotZAndNotC, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(AVX, TestZ, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX, TestC, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, TestNotZAndNotC, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, TestZ, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, UnpackHigh, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpckhps, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX, UnpackLow, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpcklps, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX, Xor, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
@@ -648,47 +648,47 @@ HARDWARE_INTRINSIC(AVX, Xor,
 HARDWARE_INTRINSIC(AVX2, Abs, 32, 1, {INS_pabsb, INS_invalid, INS_pabsw, INS_invalid, INS_pabsd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(AVX2, Add, 32, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2, AddSaturate, 32, 2, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AVX2, AlignRight, 32, 3, {INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX2, AlignRight, 32, 3, {INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2, And, 32, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2, AndNot, 32, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2, Average, 32, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AVX2, Blend, -1, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_vpblendd, INS_vpblendd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(AVX2, BlendVariable, 32, 3, {INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX2, Blend, -1, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_vpblendd, INS_vpblendd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, BlendVariable, 32, 3, {INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, BroadcastScalarToVector128, 16, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_movddup}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(AVX2, BroadcastScalarToVector256, 32, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad)
-HARDWARE_INTRINSIC(AVX2, BroadcastVector128ToVector256, 32, 1, {INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2, CompareEqual, 32, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(AVX2, CompareGreaterThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(AVX2, CompareLessThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(AVX2, BroadcastVector128ToVector256, 32, 1, {INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, CompareEqual, 32, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, CompareGreaterThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, CompareLessThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, ExtractVector128, 32, 2, {INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2, ConvertToInt32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX2, ConvertToUInt32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX2, ConvertToVector256Int16, 32, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(AVX2, ConvertToVector256Int32, 32, 1, {INS_pmovsxbd, INS_pmovzxbd, INS_pmovsxwd, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(AVX2, ConvertToVector256Int64, 32, 1, {INS_pmovsxbq, INS_pmovzxbq, INS_pmovsxwq, INS_pmovzxwq, INS_pmovsxdq, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_MaybeMemoryLoad)
-HARDWARE_INTRINSIC(AVX2, GatherVector128, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX2, GatherVector256, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX2, GatherMaskVector128, 16, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX2, GatherMaskVector256, 32, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX2, HorizontalAdd, 32, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2, HorizontalAddSaturate, 32, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2, HorizontalSubtract, 32, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2, HorizontalSubtractSaturate, 32, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX2, GatherVector128, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_NoContainment|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, GatherVector256, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_NoContainment|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, GatherMaskVector128, 16, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, GatherMaskVector256, 32, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, HorizontalAdd, 32, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, HorizontalAddSaturate, 32, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, HorizontalSubtract, 32, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, HorizontalSubtractSaturate, 32, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, InsertVector128, 32, 3, {INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2, LoadAlignedVector256NonTemporal, 32, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2, MaskLoad, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2, MaskStore, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_BaseTypeFromSecondArg)
+HARDWARE_INTRINSIC(AVX2, MaskLoad, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, MaskStore, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_BaseTypeFromSecondArg|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, Max, 32, 2, {INS_pmaxsb, INS_pmaxub, INS_pmaxsw, INS_pmaxuw, INS_pmaxsd, INS_pmaxud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2, Min, 32, 2, {INS_pminsb, INS_pminub, INS_pminsw, INS_pminuw, INS_pminsd, INS_pminud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AVX2, MoveMask, 32, 1, {INS_pmovmskb, INS_pmovmskb,
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(AVX2, MoveMask, 32, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, Multiply, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_pmuludq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(AVX2, MultipleSumAbsoluteDifferences, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX2, MultipleSumAbsoluteDifferences, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, MultiplyAddAdjacent, 32, 2, {INS_invalid, INS_invalid, INS_pmaddubsw, INS_invalid, INS_pmaddwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2, MultiplyHigh, 32, 2, {INS_invalid, INS_invalid, INS_pmulhw, INS_pmulhuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX2, MultiplyHighRoundScale, 32, 2, {INS_invalid, INS_invalid, INS_pmulhrsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2, MultiplyLow, 32, 2, {INS_invalid, INS_invalid, INS_pmullw, INS_pmullw, INS_pmulld, INS_pmulld, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(AVX2, Or, 32, 2, {INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(AVX2, Permute2x128, 32, 3, {INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AVX2, Permute2x128, 32, 3, {INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, Permute4x64, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermq, INS_vpermq, INS_invalid, INS_vpermpd}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX2, PermuteVar8x32, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermd, INS_vpermd, INS_invalid, INS_invalid, INS_vpermps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport) HARDWARE_INTRINSIC(AVX2, PackSignedSaturate, 32, 2, {INS_packsswb, INS_invalid, INS_packssdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) @@ -704,7 +704,7 @@ HARDWARE_INTRINSIC(AVX2, ShiftRightLogicalVariable, HARDWARE_INTRINSIC(AVX2, Shuffle, 32, 2, {INS_pshufb, 
INS_pshufb, INS_invalid, INS_invalid, INS_pshufd, INS_pshufd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_MaybeIMM) HARDWARE_INTRINSIC(AVX2, ShuffleHigh, 32, 2, {INS_invalid, INS_invalid, INS_pshufhw, INS_pshufhw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(AVX2, ShuffleLow, 32, 2, {INS_invalid, INS_invalid, INS_pshuflw, INS_pshuflw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) -HARDWARE_INTRINSIC(AVX2, Sign, 32, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX2, Sign, 32, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(AVX2, SumAbsoluteDifferences, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_psadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2, Subtract, 32, 2, {INS_psubb, INS_psubb, INS_psubw, INS_psubw, INS_psubd, INS_psubd, INS_psubq, INS_psubq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(AVX2, SubtractSaturate, 32, 2, {INS_psubsb, INS_psubusb, INS_psubsw, INS_psubusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) @@ -723,56 +723,56 @@ HARDWARE_INTRINSIC(AVXVNNI, MultiplyWideningAndAddSaturate, // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // AES Intrinsics -HARDWARE_INTRINSIC(AES, Decrypt, 16, 2, {INS_invalid, INS_aesdec, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AES, DecryptLast, 16, 2, {INS_invalid, INS_aesdeclast, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AES, Encrypt, 16, 2, {INS_invalid, INS_aesenc, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AES, EncryptLast, 16, 2, {INS_invalid, INS_aesenclast, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AES, InverseMixColumns, 16, 1, {INS_invalid, INS_aesimc, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(AES, KeygenAssist, 16, 2, {INS_invalid, INS_aeskeygenassist, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AES, Decrypt, 16, 2, {INS_invalid, INS_aesdec, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AES, DecryptLast, 16, 2, {INS_invalid, INS_aesdeclast, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AES, Encrypt, 16, 2, {INS_invalid, INS_aesenc, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AES, EncryptLast, 16, 2, {INS_invalid, INS_aesenclast, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AES, InverseMixColumns, 16, 1, {INS_invalid, INS_aesimc, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AES, KeygenAssist, 16, 2, {INS_invalid, INS_aeskeygenassist, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // BMI1 Intrinsics -HARDWARE_INTRINSIC(BMI1, AndNot, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI1, ExtractLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(BMI1, AndNot, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI1, ExtractLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) 
HARDWARE_INTRINSIC(BMI1, GetMaskUpToLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsmsk, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(BMI1, ResetLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(BMI1, TrailingZeroCount, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_tzcnt, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns) -HARDWARE_INTRINSIC(BMI1, BitFieldExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(BMI1, BitFieldExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // BMI1 Intrinsics -HARDWARE_INTRINSIC(BMI1_X64, AndNot, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI1_X64, ExtractLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(BMI1_X64, AndNot, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI1_X64, ExtractLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(BMI1_X64, GetMaskUpToLowestSetBit, 0, 1, 
{INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsmsk, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(BMI1_X64, ResetLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(BMI1_X64, TrailingZeroCount, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_tzcnt, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns) -HARDWARE_INTRINSIC(BMI1_X64, BitFieldExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(BMI1_X64, BitFieldExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // BMI2 Intrinsics -HARDWARE_INTRINSIC(BMI2, ParallelBitDeposit, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pdep, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI2, ParallelBitExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pext, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI2, ZeroHighBits, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bzhi, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(BMI2, MultiplyNoFlags, 0, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulx, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoContainment|HW_Flag_MaybeMemoryStore|HW_Flag_MultiIns|HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(BMI2, ParallelBitDeposit, 0, 2, {INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_pdep, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2, ParallelBitExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pext, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2, ZeroHighBits, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bzhi, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2, MultiplyNoFlags, 0, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulx, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoContainment|HW_Flag_MaybeMemoryStore|HW_Flag_MultiIns|HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // BMI2 Intrinsics -HARDWARE_INTRINSIC(BMI2_X64, ParallelBitDeposit, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pdep, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI2_X64, ParallelBitExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pext, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI2_X64, ZeroHighBits, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bzhi, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(BMI2_X64, MultiplyNoFlags, 0, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulx, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoContainment|HW_Flag_MaybeMemoryStore|HW_Flag_MultiIns|HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(BMI2_X64, ParallelBitDeposit, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pdep, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, 
HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2_X64, ParallelBitExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pext, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2_X64, ZeroHighBits, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bzhi, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2_X64, MultiplyNoFlags, 0, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulx, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoContainment|HW_Flag_MaybeMemoryStore|HW_Flag_MultiIns|HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags @@ -809,7 +809,7 @@ HARDWARE_INTRINSIC(LZCNT_X64, LeadingZeroCount, // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // PCLMULQDQ Intrinsics -HARDWARE_INTRINSIC(PCLMULQDQ, CarrylessMultiply, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pclmulqdq, INS_pclmulqdq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(PCLMULQDQ, CarrylessMultiply, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pclmulqdq, INS_pclmulqdq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags @@ -842,8 +842,8 @@ HARDWARE_INTRINSIC(SSE, COMISS, HARDWARE_INTRINSIC(SSE, UCOMISS, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, COMISD, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, 
HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, UCOMISD, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE41, PTEST, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE41, PTEST, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) #endif // FEATURE_HW_INTRINSIC diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index a6968c123c7381..13ed02d75c6ead 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -202,8 +202,8 @@ INST3(movntdq, "movntdq", IUM_WR, PCKDBL(0xE7), BAD_CODE, INST3(movnti, "movnti", IUM_WR, PCKFLT(0xC3), BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) INST3(movntpd, "movntpd", IUM_WR, PCKDBL(0x2B), BAD_CODE, BAD_CODE, INS_TT_FULL_MEM, Input_64Bit) INST3(movntps, "movntps", IUM_WR, PCKFLT(0x2B), BAD_CODE, BAD_CODE, INS_TT_FULL_MEM, Input_32Bit) -INST3(movdqu, "movdqu", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_NONE, INS_FLAGS_None) -INST3(movdqa, "movdqa", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_NONE, INS_FLAGS_None) +INST3(movdqu, "movdqu", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_32Bit) // TODO-XARCH-AVX512 TT and IP encoded is movdqu32 +INST3(movdqa, "movdqa", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_32Bit) // TODO-XARCH-AVX512 TT and IP encoded is movdqa32 INST3(movlpd, "movlpd", IUM_WR, PCKDBL(0x13), BAD_CODE, PCKDBL(0x12), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction) INST3(movlps, "movlps", IUM_WR, PCKFLT(0x13), BAD_CODE, PCKFLT(0x12), INS_TT_TUPLE1_FIXED, Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction) INST3(movhpd, "movhpd", IUM_WR, PCKDBL(0x17), BAD_CODE, PCKDBL(0x16), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction) @@ -341,10 +341,10 @@ INST3(pmulhuw, "pmulhuw", IUM_WR, BAD_CODE, BAD_CODE, INST3(pmuludq, "pmuludq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF4), INS_TT_FULL_MEM, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit unsigned integers and store 64-bit result INST3(pmullw, "pmullw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD5), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 16 bit unsigned integers and store lower 16 bits of each result // TODO-XArch-AVX512: pand, pandn, por, and pxor have AVX512 instructions under different names, pandd, pandq etc -INST3(pand, "pand", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs -INST3(pandn, "pandn", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND 
NOT of two xmm regs -INST3(por, "por", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs -INST3(pxor, "pxor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs +INST3(pand, "pand", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs // TODO-XARCH-AVX512 TT and IP encoded is pand32 +INST3(pandn, "pandn", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs // TODO-XARCH-AVX512 TT and IP encoded is pandn32 +INST3(por, "por", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs // TODO-XARCH-AVX512 TT and IP encoded is por32 +INST3(pxor, "pxor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs INST3(psadbw, "psadbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF6), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Compute the sum of absolute differences of packed unsigned 8-bit integers INST3(psubsb, "psubsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE8), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation INST3(psubusb, "psubusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD8), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation @@ -493,10 +493,10 @@ INST3(vpbroadcastb, "pbroadcastb", IUM_WR, BAD_CODE, BAD_CODE, INST3(vpbroadcastw, "pbroadcastw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x79), INS_TT_TUPLE1_SCALAR, Input_16Bit | INS_FLAGS_None) // Broadcast int16 value from reg/memory to entire ymm register INST3(vpbroadcastd, "pbroadcastd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x58), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_FLAGS_None) // Broadcast int32 value from reg/memory to entire ymm register INST3(vpbroadcastq, "pbroadcastq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x59), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_FLAGS_None) // Broadcast int64 value from reg/memory to entire ymm register -INST3(vextractf128, "extractf128", IUM_WR, SSE3A(0x19), BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Extract 128-bit packed floating point values -INST3(vextracti128, "extracti128", IUM_WR, SSE3A(0x39), BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Extract 128-bit packed integer values -INST3(vinsertf128, "insertf128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x18), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed floating point values -INST3(vinserti128, "inserti128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x38), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed integer values +INST3(vextractf128, "extractf128", IUM_WR, SSE3A(0x19), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_32Bit ) // Extract 128-bit packed floating point values // TODO-XARCH-AVX512 TT and IP encoded is extractf32x4 +INST3(vextracti128, "extracti128", IUM_WR, SSE3A(0x39), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_32Bit ) // Extract 128-bit packed integer values // TODO-XARCH-AVX512 TT and IP encoded is extracti32x4 +INST3(vinsertf128, "insertf128",
IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x18), INS_TT_TUPLE4, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed floating point values // TODO-XARCH-AVX512 TT and IP encoded is insertf32x4 +INST3(vinserti128, "inserti128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x38), INS_TT_TUPLE4, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed integer values // TODO-XARCH-AVX512 TT and IP encoded is inserti32x4 INST3(vzeroupper, "zeroupper", IUM_WR, 0xC577F8, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix) INST3(vperm2i128, "perm2i128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x46), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Permute 128-bit halves of input register INST3(vpermq, "permq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x00), INS_TT_FULL, Input_64Bit | INS_FLAGS_None) // Permute 64-bit of input register diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index ecc946c65a1ff6..80c3a195166011 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -256,6 +256,23 @@ regMaskTP LinearScan::allSIMDRegs() return availableFloatRegs; } +//------------------------------------------------------------------------ +// lowSIMDRegs(): Return the set of SIMD registers associated with VEX +// encoding only, i.e., remove the high EVEX SIMD registers from the available +// set. +// +// Return Value: +// Register mask of the SSE/VEX-only SIMD registers +// +regMaskTP LinearScan::lowSIMDRegs() +{ +#if defined(TARGET_AMD64) + return (availableFloatRegs & RBM_LOWFLOAT); +#else + return availableFloatRegs; +#endif +} + void LinearScan::updateNextFixedRef(RegRecord* regRecord, RefPosition* nextRefPosition) { LsraLocation nextLocation; @@ -460,8 +477,19 @@ regMaskTP LinearScan::stressLimitRegs(RefPosition* refPosition, regMaskTP mask) } break; +#if defined(TARGET_AMD64) + case LSRA_LIMIT_UPPER_SIMD_SET: + if ((mask & LsraLimitUpperSimdSet) != RBM_NONE) + { + mask = getConstrainedRegMask(mask, LsraLimitUpperSimdSet, minRegCount); + } + break; +#endif + default: + { unreached(); + } } if (refPosition != nullptr && refPosition->isFixedRegRef) @@ -671,6 +699,17 @@ LinearScan::LinearScan(Compiler* theCompiler) } #endif // TARGET_AMD64 || TARGET_ARM64 +#if defined(TARGET_AMD64) + // TODO-XARCH-AVX512 switch this to canUseEvexEncoding() once we independently + // allow EVEX use from the stress flag (currently, if EVEX stress is turned off, + // we cannot use EVEX at all) + if (compiler->DoJitStressEvexEncoding()) + { + availableFloatRegs |= RBM_HIGHFLOAT; + availableDoubleRegs |= RBM_HIGHFLOAT; + } +#endif + for (unsigned int i = 0; i < TYP_COUNT; i++) { var_types thisType = (var_types)genActualTypes[i]; @@ -1848,7 +1887,7 @@ void LinearScan::identifyCandidates() } } JITDUMP(" "); - DBEXEC(VERBOSE, newInt->dump()); + DBEXEC(VERBOSE, newInt->dump(compiler)); } else { @@ -4025,7 +4064,7 @@ void LinearScan::processBlockStartLocations(BasicBlock* currentBlock) { // Just clear any constant registers and return. 
resetAvailableRegs(); - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* physRegRecord = getRegisterRecord(reg); Interval* assignedInterval = physRegRecord->assignedInterval; @@ -4273,7 +4312,7 @@ void LinearScan::processBlockStartLocations(BasicBlock* currentBlock) resetRegState(); setRegsInUse(liveRegs); } - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* physRegRecord = getRegisterRecord(reg); if ((liveRegs & genRegMask(reg)) == 0) @@ -4555,7 +4594,7 @@ void LinearScan::allocateRegisters() } resetRegState(); - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* physRegRecord = getRegisterRecord(reg); physRegRecord->recentRefPosition = nullptr; @@ -4718,7 +4757,7 @@ void LinearScan::allocateRegisters() #ifdef DEBUG // Validate the current state just after we've freed the registers. This ensures that any pending // freed registers will have had their state updated to reflect the intervals they were holding. - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { regMaskTP regMask = genRegMask(reg); // If this isn't available or if it's still waiting to be freed (i.e. it was in @@ -5647,7 +5686,7 @@ void LinearScan::allocateRegisters() if (interval.isActive) { printf("Active "); - interval.dump(); + interval.dump(this->compiler); } } @@ -6638,7 +6677,7 @@ void LinearScan::resolveRegisters() // are encountered. 
if (enregisterLocalVars) { - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* physRegRecord = getRegisterRecord(reg); Interval* assignedInterval = physRegRecord->assignedInterval; @@ -8969,29 +9008,6 @@ void LinearScan::dumpLsraStatsSummary(FILE* file) #endif // TRACK_LSRA_STATS #ifdef DEBUG -void dumpRegMask(regMaskTP regs) -{ - if (regs == RBM_ALLINT) - { - printf("[allInt]"); - } - else if (regs == (RBM_ALLINT & ~RBM_FPBASE)) - { - printf("[allIntButFP]"); - } - else if (regs == RBM_ALLFLOAT) - { - printf("[allFloat]"); - } - else if (regs == RBM_ALLDOUBLE) - { - printf("[allDouble]"); - } - else - { - dspRegMask(regs); - } -} static const char* getRefTypeName(RefType refType) { @@ -9063,7 +9079,7 @@ void RefPosition::dump(LinearScan* linearScan) printf(FMT_BB " ", this->bbNum); printf("regmask="); - dumpRegMask(registerAssignment); + linearScan->compiler->dumpRegMask(registerAssignment); printf(" minReg=%d", minRegCandidateCount); @@ -9126,7 +9142,7 @@ void RegRecord::dump() tinyDump(); } -void Interval::dump() +void Interval::dump(Compiler* compiler) { printf("Interval %2u:", intervalIndex); @@ -9199,7 +9215,7 @@ void Interval::dump() printf(" physReg:%s", getRegName(physReg)); printf(" Preferences="); - dumpRegMask(this->registerPreferences); + compiler->dumpRegMask(this->registerPreferences); if (relatedInterval) { @@ -9281,7 +9297,7 @@ void LinearScan::lsraDumpIntervals(const char* msg) { // only dump something if it has references // if (interval->firstRefPosition) - interval.dump(); + interval.dump(this->compiler); } printf("\n"); @@ -10417,7 +10433,7 @@ void LinearScan::verifyFinalAllocation() } // Clear register assignments. - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* physRegRecord = getRegisterRecord(reg); physRegRecord->assignedInterval = nullptr; @@ -10521,7 +10537,7 @@ void LinearScan::verifyFinalAllocation() } // Clear register assignments. - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* physRegRecord = getRegisterRecord(reg); physRegRecord->assignedInterval = nullptr; @@ -10846,7 +10862,7 @@ void LinearScan::verifyFinalAllocation() } // Clear register assignments. - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* physRegRecord = getRegisterRecord(reg); physRegRecord->assignedInterval = nullptr; @@ -11872,7 +11888,7 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, if (preferCalleeSave) { - regMaskTP calleeSaveCandidates = calleeSaveRegs(currentInterval->registerType); + regMaskTP calleeSaveCandidates = linearScan->calleeSaveRegs(currentInterval->registerType); if (currentInterval->isWriteThru) { // We'll only prefer a callee-save register if it's already been used. 
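These helpers move from free functions onto LinearScan because, with AVX512, the caller-save float mask on AMD64 is no longer a compile-time constant: it is read through the Compiler instance (rbmFltCalleeTrash), which a free function cannot reach. A minimal sketch of how such a dynamic mask could be seeded, with illustrative names (computeFltCalleeTrash and RBM_FLT_CALLEE_TRASH_INIT are not necessarily the actual helpers; RBM_HIGHFLOAT is the upper-16 SIMD mask used elsewhere in this change):

regMaskTP computeFltCalleeTrash(bool canUseEvexEncoding)
{
    // Start from the fixed VEX-era caller-save float set (the xmm0-xmm15 portion).
    regMaskTP mask = RBM_FLT_CALLEE_TRASH_INIT;
    if (canUseEvexEncoding)
    {
        // xmm16-xmm31 are not preserved across calls, so they join the
        // caller-save (callee-trash) set whenever AVX512 makes them available.
        mask |= RBM_HIGHFLOAT;
    }
    return mask;
}

The callee-save variant, by contrast, can stay static, since the callee-save float set does not grow when the upper SIMD registers appear.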
@@ -11888,7 +11904,7 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, } else { - callerCalleePrefs = callerSaveRegs(currentInterval->registerType); + callerCalleePrefs = linearScan->callerSaveRegs(currentInterval->registerType); } // If this has a delayed use (due to being used in a rmw position of a diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index c63d0755ba4b15..323bec544cfed5 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -64,22 +64,6 @@ inline bool registerTypesEquivalent(RegisterType a, RegisterType b) return varTypeIsIntegralOrI(a) == varTypeIsIntegralOrI(b); } -//------------------------------------------------------------------------ -// calleeSaveRegs: Get the set of callee-save registers of the given RegisterType -// -inline regMaskTP calleeSaveRegs(RegisterType rt) -{ - return varTypeIsIntegralOrI(rt) ? RBM_INT_CALLEE_SAVED : RBM_FLT_CALLEE_SAVED; -} - -//------------------------------------------------------------------------ -// callerSaveRegs: Get the set of caller-save registers of the given RegisterType -// -inline regMaskTP callerSaveRegs(RegisterType rt) -{ - return varTypeIsIntegralOrI(rt) ? RBM_INT_CALLEE_TRASH : RBM_FLT_CALLEE_TRASH; -} - //------------------------------------------------------------------------ // RefInfo: Captures the necessary information for a definition that is "in-flight" // during `buildIntervals` (i.e. a tree-node definition has been encountered, @@ -740,8 +724,19 @@ class LinearScan : public LinearScanInterface unsigned lsraStressMask; // This controls the registers available for allocation - enum LsraStressLimitRegs{LSRA_LIMIT_NONE = 0, LSRA_LIMIT_CALLEE = 0x1, LSRA_LIMIT_CALLER = 0x2, - LSRA_LIMIT_SMALL_SET = 0x3, LSRA_LIMIT_MASK = 0x3}; + enum LsraStressLimitRegs + { + LSRA_LIMIT_NONE = 0, + LSRA_LIMIT_CALLEE = 0x1, + LSRA_LIMIT_CALLER = 0x2, + LSRA_LIMIT_SMALL_SET = 0x3, +#if defined(TARGET_AMD64) + LSRA_LIMIT_UPPER_SIMD_SET = 0x2000, + LSRA_LIMIT_MASK = 0x2003 +#else + LSRA_LIMIT_MASK = 0x3 +#endif + }; // When LSRA_LIMIT_SMALL_SET is specified, it is desirable to select a "mixed" set of caller- and callee-save // registers, so as to get different coverage than limiting to callee or caller. @@ -761,6 +756,9 @@ class LinearScan : public LinearScanInterface (RBM_EAX | RBM_ECX | RBM_EBX | RBM_ETW_FRAMED_EBP | RBM_ESI | RBM_EDI); #endif // !UNIX_AMD64_ABI static const regMaskTP LsraLimitSmallFPSet = (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM6 | RBM_XMM7); + static const regMaskTP LsraLimitUpperSimdSet = + (RBM_XMM16 | RBM_XMM17 | RBM_XMM18 | RBM_XMM19 | RBM_XMM20 | RBM_XMM21 | RBM_XMM22 | RBM_XMM23 | RBM_XMM24 | + RBM_XMM25 | RBM_XMM26 | RBM_XMM27 | RBM_XMM28 | RBM_XMM29 | RBM_XMM30 | RBM_XMM31); #elif defined(TARGET_ARM) // On ARM, we may need two registers to set up the target register for a virtual call, so we need // to have at least the maximum number of arg registers, plus 2. 
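Note the bit layout above: LSRA_LIMIT_UPPER_SIMD_SET takes its own bit (0x2000) outside the existing two-bit limit field, so LSRA_LIMIT_MASK widens to 0x2003 on AMD64 and the upper-SIMD stress can be combined with the callee/caller/small-set limits. When the stress fires, stressLimitRegs() only narrows the candidate set if enough registers survive to satisfy the RefPosition's minimum register count; a simplified sketch of that constraint step (the real logic is LinearScan::getConstrainedRegMask):

regMaskTP constrainRegMask(regMaskTP candidates, regMaskTP limit, unsigned minRegCount)
{
    regMaskTP constrained = candidates & limit;
    if (genCountBits(constrained) >= minRegCount)
    {
        return constrained; // enough registers remain, so apply the stress limit
    }
    return candidates; // too few would remain; keep the unconstrained set
}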
@@ -1066,6 +1064,7 @@ class LinearScan : public LinearScanInterface regMaskTP allRegs(RegisterType rt); regMaskTP allByteRegs(); regMaskTP allSIMDRegs(); + regMaskTP lowSIMDRegs(); regMaskTP internalFloatRegCandidates(); void makeRegisterInactive(RegRecord* physRegRecord); @@ -1854,6 +1853,7 @@ class LinearScan : public LinearScanInterface int BuildCastUses(GenTreeCast* cast, regMaskTP candidates); #ifdef TARGET_XARCH int BuildRMWUses(GenTree* node, GenTree* op1, GenTree* op2, regMaskTP candidates = RBM_NONE); + inline regMaskTP BuildEvexIncompatibleMask(GenTree* tree); #endif // !TARGET_XARCH int BuildSelect(GenTreeOp* select); // This is the main entry point for building the RefPositions for a node. @@ -1934,6 +1934,40 @@ class LinearScan : public LinearScanInterface int BuildPutArgSplit(GenTreePutArgSplit* tree); #endif // FEATURE_ARG_SPLIT int BuildLclHeap(GenTree* tree); + +#if defined(TARGET_AMD64) + regMaskTP get_RBM_ALLFLOAT() const + { + return compiler->rbmAllFloat; + } + regMaskTP get_RBM_FLT_CALLEE_TRASH() const + { + return compiler->rbmFltCalleeTrash; + } + unsigned get_AVAILABLE_REG_COUNT() const + { + return compiler->availableRegCount; + } +#endif // TARGET_AMD64 + + //------------------------------------------------------------------------ + // calleeSaveRegs: Get the set of callee-save registers of the given RegisterType + // + // NOTE: we currently don't need a LinearScan `this` pointer for this definition, and some callers + // don't have one available, so make it static. + // + static regMaskTP calleeSaveRegs(RegisterType rt) + { + return varTypeIsIntegralOrI(rt) ? RBM_INT_CALLEE_SAVED : RBM_FLT_CALLEE_SAVED; + } + + //------------------------------------------------------------------------ + // callerSaveRegs: Get the set of caller-save registers of the given RegisterType + // + regMaskTP callerSaveRegs(RegisterType rt) const + { + return varTypeIsIntegralOrI(rt) ?
RBM_INT_CALLEE_TRASH : RBM_FLT_CALLEE_TRASH; + } }; /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX @@ -1984,7 +2018,7 @@ class Interval : public Referenceable #ifdef DEBUG // print out representation - void dump(); + void dump(Compiler* compiler); // concise representation for embedding void tinyDump(); // extremely concise representation @@ -2194,7 +2228,7 @@ class Interval : public Referenceable if (preferCalleeSave) { - regMaskTP calleeSaveMask = (calleeSaveRegs(this->registerType) & (newPreferences)); + regMaskTP calleeSaveMask = (LinearScan::calleeSaveRegs(this->registerType) & newPreferences); if (calleeSaveMask != RBM_NONE) { newPreferences = calleeSaveMask; @@ -2519,10 +2553,6 @@ class RefPosition #endif // DEBUG }; -#ifdef DEBUG -void dumpRegMask(regMaskTP regs); -#endif // DEBUG - /*****************************************************************************/ #endif //_LSRA_H_ /*****************************************************************************/ diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 3908f1998792a9..e6988402f5ce66 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -159,7 +159,7 @@ Interval* LinearScan::newInterval(RegisterType theRegisterType) newInt->intervalIndex = static_cast(intervals.size() - 1); #endif // DEBUG - DBEXEC(VERBOSE, newInt->dump()); + DBEXEC(VERBOSE, newInt->dump(this->compiler)); return newInt; } @@ -1212,7 +1212,7 @@ bool LinearScan::buildKillPositionsForNode(GenTree* tree, LsraLocation currentLo // If there are no callee-saved registers, the call could kill all the registers. // This is a valid state, so in that case assert should not trigger. The RA will spill in order // to free a register later. - assert(compiler->opts.compDbgEnC || (calleeSaveRegs(varDsc->lvType)) == RBM_NONE); + assert(compiler->opts.compDbgEnC || (calleeSaveRegs(varDsc->lvType) == RBM_NONE)); } } } @@ -1860,8 +1860,9 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, LsraLocation currentLoc JITDUMP("\n"); } -static const regNumber lsraRegOrder[] = {REG_VAR_ORDER}; -const unsigned lsraRegOrderSize = ArrLen(lsraRegOrder); +static const regNumber lsraRegOrder[] = {REG_VAR_ORDER}; +const unsigned lsraRegOrderSize = ArrLen(lsraRegOrder); +// TODO-XARCH-AVX512 we might want to move this to be configured with the rbm variables too static const regNumber lsraRegOrderFlt[] = {REG_VAR_ORDER_FLT}; const unsigned lsraRegOrderFltSize = ArrLen(lsraRegOrderFlt); @@ -1870,7 +1871,7 @@ const unsigned lsraRegOrderFltSize = ArrLen(lsraRegOrderFlt); // void LinearScan::buildPhysRegRecords() { - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* curr = &physRegs[reg]; curr->init(reg); @@ -3010,7 +3011,7 @@ void LinearScan::UpdatePreferencesOfDyingLocal(Interval* interval) { printf("Last use of V%02u between PUTARG and CALL. 
Removing occupied arg regs from preferences: ", compiler->lvaTrackedIndexToLclNum(varIndex)); - dumpRegMask(unpref); + compiler->dumpRegMask(unpref); printf("\n"); } #endif diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 524bbc8577e96a..94cd726d8434f4 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -156,7 +156,7 @@ int LinearScan::BuildNode(GenTree* tree) srcCount = 0; assert(dstCount == 1); assert(!tree->IsReuseRegVal()); - RefPosition* def = BuildDef(tree); + RefPosition* def = BuildDef(tree, BuildEvexIncompatibleMask(tree)); def->getInterval()->isConstant = true; } break; @@ -1885,21 +1885,24 @@ int LinearScan::BuildIntrinsic(GenTree* tree) break; } assert(tree->gtGetOp2IfPresent() == nullptr); + + // TODO-XARCH-AVX512 this is overly constraining the registers available, as NI_System_Math_Abs + // can be lowered to an EVEX-compatible instruction (the rest cannot) int srcCount; if (op1->isContained()) { - srcCount = BuildOperandUses(op1); + srcCount = BuildOperandUses(op1, BuildEvexIncompatibleMask(op1)); } else { - tgtPrefUse = BuildUse(op1); + tgtPrefUse = BuildUse(op1, BuildEvexIncompatibleMask(op1)); srcCount = 1; } if (internalFloatDef != nullptr) { buildInternalRegisterUses(); } - BuildDef(tree); + BuildDef(tree, BuildEvexIncompatibleMask(tree)); return srcCount; } @@ -2006,6 +2009,9 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // Determine whether this is an RMW operation where op2+ must be marked delayFree so that it // is not allocated the same register as the target. bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler); +#if defined(TARGET_AMD64) + bool isEvexCompatible = intrinsicTree->isEvexCompatibleHWIntrinsic(); +#endif // Create internal temps, and handle any other special requirements. // Note that the default case for building uses will handle the RMW flag, but if the uses @@ -2090,8 +2096,8 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou assert(!isRMW); // MaskMove hardcodes the destination (op3) in DI/EDI/RDI - srcCount += BuildOperandUses(op1); - srcCount += BuildOperandUses(op2); + srcCount += BuildOperandUses(op1, BuildEvexIncompatibleMask(op1)); + srcCount += BuildOperandUses(op2, BuildEvexIncompatibleMask(op2)); srcCount += BuildOperandUses(op3, RBM_EDI); buildUses = false; @@ -2107,10 +2113,11 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou assert(isRMW); // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 - tgtPrefUse = BuildUse(op1); + tgtPrefUse = BuildUse(op1, BuildEvexIncompatibleMask(op1)); srcCount += 1; - srcCount += op2->isContained() ? BuildOperandUses(op2) : BuildDelayFreeUses(op2, op1); + srcCount += op2->isContained() ?
BuildOperandUses(op2, BuildEvexIncompatibleMask(op2)) + : BuildDelayFreeUses(op2, op1, BuildEvexIncompatibleMask(op2)); srcCount += BuildDelayFreeUses(op3, op1, RBM_XMM0); buildUses = false; @@ -2305,14 +2312,14 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou assert(!isRMW); // Any pair of the index, mask, or destination registers should be different - srcCount += BuildOperandUses(op1); - srcCount += BuildDelayFreeUses(op2); + srcCount += BuildOperandUses(op1, BuildEvexIncompatibleMask(op1)); + srcCount += BuildDelayFreeUses(op2, nullptr, BuildEvexIncompatibleMask(op2)); // op3 should always be contained assert(op3->isContained()); // get a tmp register for mask that will be cleared by gather instructions - buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs()); + buildInternalFloatRegisterDefForNode(intrinsicTree, lowSIMDRegs()); setInternalRegsDelayFree = true; buildUses = false; @@ -2328,16 +2335,16 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou GenTree* op5 = intrinsicTree->Op(5); // Any pair of the index, mask, or destination registers should be different - srcCount += BuildOperandUses(op1); - srcCount += BuildDelayFreeUses(op2); - srcCount += BuildDelayFreeUses(op3); - srcCount += BuildDelayFreeUses(op4); + srcCount += BuildOperandUses(op1, BuildEvexIncompatibleMask(op1)); + srcCount += BuildDelayFreeUses(op2, nullptr, BuildEvexIncompatibleMask(op2)); + srcCount += BuildDelayFreeUses(op3, nullptr, BuildEvexIncompatibleMask(op3)); + srcCount += BuildDelayFreeUses(op4, nullptr, BuildEvexIncompatibleMask(op4)); // op5 should always be contained assert(op5->isContained()); // get a tmp register for mask that will be cleared by gather instructions - buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs()); + buildInternalFloatRegisterDefForNode(intrinsicTree, lowSIMDRegs()); setInternalRegsDelayFree = true; buildUses = false; @@ -2355,25 +2362,40 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou { assert((numArgs > 0) && (numArgs < 4)); + regMaskTP op1RegCandidates = RBM_NONE; +#if defined(TARGET_AMD64) + if (!isEvexCompatible) + { + op1RegCandidates = BuildEvexIncompatibleMask(op1); + } +#endif + if (intrinsicTree->OperIsMemoryLoadOrStore()) { - srcCount += BuildAddrUses(op1); + srcCount += BuildAddrUses(op1, op1RegCandidates); } else if (isRMW && !op1->isContained()) { - tgtPrefUse = BuildUse(op1); + tgtPrefUse = BuildUse(op1, op1RegCandidates); srcCount += 1; } else { - srcCount += BuildOperandUses(op1); + srcCount += BuildOperandUses(op1, op1RegCandidates); } if (op2 != nullptr) { + regMaskTP op2RegCandidates = RBM_NONE; +#if defined(TARGET_AMD64) + if (!isEvexCompatible) + { + op2RegCandidates = BuildEvexIncompatibleMask(op2); + } +#endif if (op2->OperIs(GT_HWINTRINSIC) && op2->AsHWIntrinsic()->OperIsMemoryLoad() && op2->isContained()) { - srcCount += BuildAddrUses(op2->AsHWIntrinsic()->Op(1)); + srcCount += BuildAddrUses(op2->AsHWIntrinsic()->Op(1), op2RegCandidates); } else if (isRMW) { @@ -2382,7 +2404,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // When op2 is not contained and we are commutative, we can set op2 // to also be a tgtPrefUse. Codegen will then swap the operands. 
- tgtPrefUse2 = BuildUse(op2); + tgtPrefUse2 = BuildUse(op2, op2RegCandidates); srcCount += 1; } else if (!op2->isContained() || varTypeIsArithmetic(intrinsicTree->TypeGet())) @@ -2390,7 +2412,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou { // When op2 is not contained or if we are producing a scalar value // we need to mark it as delay free because the operand and target // exist in the same register set. - srcCount += BuildDelayFreeUses(op2, op1); + srcCount += BuildDelayFreeUses(op2, op1, op2RegCandidates); } else { @@ -2398,17 +2420,25 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // have no concerns of overwriting op2 because they exist in different // register sets. - srcCount += BuildOperandUses(op2); + srcCount += BuildOperandUses(op2, op2RegCandidates); } } else { - srcCount += BuildOperandUses(op2); + srcCount += BuildOperandUses(op2, op2RegCandidates); } if (op3 != nullptr) { + regMaskTP op3RegCandidates = RBM_NONE; +#if defined(TARGET_AMD64) + if (!isEvexCompatible) + { + op3RegCandidates = BuildEvexIncompatibleMask(op3); + } +#endif + srcCount += isRMW ? BuildDelayFreeUses(op3, op1, op3RegCandidates) + : BuildOperandUses(op3, op3RegCandidates); } } } @@ -2418,6 +2448,14 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou if (dstCount == 1) { +#if defined(TARGET_AMD64) + if (!intrinsicTree->isEvexCompatibleHWIntrinsic() && + (varTypeIsFloating(intrinsicTree->gtType) || varTypeIsSIMD(intrinsicTree->gtType))) + { + dstCandidates = lowSIMDRegs(); + } +#endif + BuildDef(intrinsicTree, dstCandidates); } else @@ -2701,4 +2739,43 @@ void LinearScan::SetContainsAVXFlags(unsigned sizeOfSIMDVector /* = 0*/) } } +//------------------------------------------------------------------------------ +// BuildEvexIncompatibleMask: Returns RBM_NONE or a mask representing the +// lower SIMD registers for a node that lowers to an instruction that does not +// have an EVEX form (thus cannot use the upper SIMD registers). +// The caller invokes this function when it knows the node is EVEX incompatible. +// +// Simply using lowSIMDRegs() on an incompatible node's operand will incorrectly mask +// some cases, e.g., memory loads. +// +// Arguments: +// tree - tree to check for EVEX lowering compatibility +// +// Return Value: +// RBM_NONE if compatible with EVEX (or not a floating/SIMD register), +// lowSIMDRegs() (XMM0-XMM15) otherwise. +// +inline regMaskTP LinearScan::BuildEvexIncompatibleMask(GenTree* tree) +{ +#if defined(TARGET_AMD64) + if (!(varTypeIsFloating(tree->gtType) || varTypeIsSIMD(tree->gtType))) + { + return RBM_NONE; + } + + // If a node is contained and is a memory load etc., use RBM_NONE as it will use an integer register for the + // load, not a SIMD register.
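+    // E.g., a contained GT_IND or GT_LEA under an EVEX-incompatible intrinsic consumes only its address + // operands, which live in integer registers, so a low-SIMD candidate mask would be wrong for those uses.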
+ if (tree->isContained() && + (tree->OperIsIndir() || (tree->OperIs(GT_HWINTRINSIC) && tree->AsHWIntrinsic()->OperIsMemoryLoad()) || + tree->OperIs(GT_LEA))) + { + return RBM_NONE; + } + + return lowSIMDRegs(); +#else + return RBM_NONE; +#endif +} + #endif // TARGET_XARCH diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 36aa1b1fde4aff..4adb72e3095a67 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -7052,7 +7052,7 @@ bool Compiler::optIsProfitableToHoistTree(GenTree* tree, unsigned lnum) // Don't hoist expressions that are not heavy: tree->GetCostEx() < (2*IND_COST_EX) if (tree->GetCostEx() < (2 * IND_COST_EX)) { - JITDUMP(" tree cost too low: %d < %d (loopVarCount %u >= availableRegCount %u)\n", tree->GetCostEx(), + JITDUMP(" tree cost too low: %d < %d (loopVarCount %u >= availRegCount %u)\n", tree->GetCostEx(), 2 * IND_COST_EX, loopVarCount, availRegCount); return false; } @@ -7071,7 +7071,7 @@ bool Compiler::optIsProfitableToHoistTree(GenTree* tree, unsigned lnum) // Don't hoist expressions that barely meet CSE cost requirements: tree->GetCostEx() == MIN_CSE_COST if (tree->GetCostEx() <= MIN_CSE_COST + 1) { - JITDUMP(" tree not good CSE: %d <= %d (varInOutCount %u > availableRegCount %u)\n", tree->GetCostEx(), + JITDUMP(" tree not good CSE: %d <= %d (varInOutCount %u > availRegCount %u)\n", tree->GetCostEx(), 2 * MIN_CSE_COST + 1, varInOutCount, availRegCount) return false; } diff --git a/src/coreclr/jit/register.h b/src/coreclr/jit/register.h index 6f63bc51211d63..ca90673e85adfe 100644 --- a/src/coreclr/jit/register.h +++ b/src/coreclr/jit/register.h @@ -94,7 +94,27 @@ REGDEF(XMM12, 12+XMMBASE, XMMMASK(12), "mm12" ) REGDEF(XMM13, 13+XMMBASE, XMMMASK(13), "mm13" ) REGDEF(XMM14, 14+XMMBASE, XMMMASK(14), "mm14" ) REGDEF(XMM15, 15+XMMBASE, XMMMASK(15), "mm15" ) -REGDEF(STK, 16+XMMBASE, 0x0000, "STK" ) + +REGDEF(XMM16, 16+XMMBASE, XMMMASK(16), "mm16" ) +REGDEF(XMM17, 17+XMMBASE, XMMMASK(17), "mm17" ) +REGDEF(XMM18, 18+XMMBASE, XMMMASK(18), "mm18" ) +REGDEF(XMM19, 19+XMMBASE, XMMMASK(19), "mm19" ) +REGDEF(XMM20, 20+XMMBASE, XMMMASK(20), "mm20" ) +REGDEF(XMM21, 21+XMMBASE, XMMMASK(21), "mm21" ) +REGDEF(XMM22, 22+XMMBASE, XMMMASK(22), "mm22" ) +REGDEF(XMM23, 23+XMMBASE, XMMMASK(23), "mm23" ) + +REGDEF(XMM24, 24+XMMBASE, XMMMASK(24), "mm24" ) +REGDEF(XMM25, 25+XMMBASE, XMMMASK(25), "mm25" ) +REGDEF(XMM26, 26+XMMBASE, XMMMASK(26), "mm26" ) +REGDEF(XMM27, 27+XMMBASE, XMMMASK(27), "mm27" ) +REGDEF(XMM28, 28+XMMBASE, XMMMASK(28), "mm28" ) +REGDEF(XMM29, 29+XMMBASE, XMMMASK(29), "mm29" ) +REGDEF(XMM30, 30+XMMBASE, XMMMASK(30), "mm30" ) +REGDEF(XMM31, 31+XMMBASE, XMMMASK(31), "mm31" ) + +REGDEF(STK, 32+XMMBASE, 0x0000, "STK" ) + #endif // !TARGET_X86 #elif defined(TARGET_ARM) diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h index 392a5417141398..cc97831c9f5287 100644 --- a/src/coreclr/jit/target.h +++ b/src/coreclr/jit/target.h @@ -61,7 +61,11 @@ inline bool compUnixX86Abi() /*****************************************************************************/ // The following are intended to capture only those #defines that cannot be replaced // with static const members of Target -#if defined(TARGET_XARCH) +#if defined(TARGET_AMD64) +#define REGMASK_BITS 64 +#define CSE_CONST_SHARED_LOW_BITS 16 + +#elif defined(TARGET_X86) #define REGMASK_BITS 32 #define CSE_CONST_SHARED_LOW_BITS 16 @@ -146,13 +150,14 @@ enum _regNumber_enum : unsigned ACTUAL_REG_COUNT = REG_COUNT - 1 // everything but REG_STK (only real regs) }; -enum 
_regMask_enum : unsigned +enum _regMask_enum : uint64_t { RBM_NONE = 0, #define REGDEF(name, rnum, mask, sname) RBM_##name = mask, #define REGALIAS(alias, realname) RBM_##alias = RBM_##realname, #include "register.h" + }; #elif defined(TARGET_X86) @@ -181,6 +186,13 @@ enum _regMask_enum : unsigned #error Unsupported target architecture #endif +#if defined(TARGET_AMD64) +// AVAILABLE_REG_COUNT is defined to be dynamic, based on whether AVX-512 high registers are available. +#define AVAILABLE_REG_COUNT get_AVAILABLE_REG_COUNT() +#else +#define AVAILABLE_REG_COUNT ACTUAL_REG_COUNT +#endif + /*****************************************************************************/ // TODO-Cleanup: The types defined below are mildly confusing: why are there both? @@ -192,7 +204,7 @@ enum _regMask_enum : unsigned // In any case, we believe that is OK to freely cast between these types; no information will // be lost. -#if defined(TARGET_ARMARCH) || defined(TARGET_LOONGARCH64) +#if defined(TARGET_AMD64) || defined(TARGET_ARMARCH) || defined(TARGET_LOONGARCH64) typedef unsigned __int64 regMaskTP; #else typedef unsigned regMaskTP; @@ -528,7 +540,7 @@ inline regMaskTP genRegMask(regNumber reg) // (L1 latency on sandy bridge is 4 cycles for [base] and 5 for [base + index*c] ) // the reason this is AMD-only is because the x86 BE will try to get reg masks for REG_STK // and the result needs to be zero. - regMaskTP result = 1 << reg; + regMaskTP result = 1ULL << reg; assert(result == regMasks[reg]); return result; #else diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index 4ec128a6345d21..64af2659bd592d 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -78,10 +78,17 @@ #endif // !UNIX_AMD64_ABI #define CSE_CONSTS 1 // Enable if we want to CSE constants - #define RBM_ALLFLOAT (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7 | RBM_XMM8 | RBM_XMM9 | RBM_XMM10 | RBM_XMM11 | RBM_XMM12 | RBM_XMM13 | RBM_XMM14 | RBM_XMM15) + #define RBM_LOWFLOAT (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7 | RBM_XMM8 | RBM_XMM9 | RBM_XMM10 | RBM_XMM11 | RBM_XMM12 | RBM_XMM13 | RBM_XMM14 | RBM_XMM15 ) + #define RBM_HIGHFLOAT (RBM_XMM16 | RBM_XMM17 | RBM_XMM18 | RBM_XMM19 | RBM_XMM20 | RBM_XMM21 | RBM_XMM22 | RBM_XMM23 | RBM_XMM24 | RBM_XMM25 | RBM_XMM26 | RBM_XMM27 | RBM_XMM28 | RBM_XMM29 | RBM_XMM30 | RBM_XMM31) + #define CNT_HIGHFLOAT 16 + + #define RBM_ALLFLOAT_INIT RBM_LOWFLOAT + + #define RBM_ALLFLOAT get_RBM_ALLFLOAT() + #define RBM_ALLDOUBLE RBM_ALLFLOAT #define REG_FP_FIRST REG_XMM0 - #define REG_FP_LAST REG_XMM15 + #define REG_FP_LAST REG_XMM31 #define FIRST_FP_ARGREG REG_XMM0 #ifdef UNIX_AMD64_ABI @@ -117,8 +124,11 @@ #define RBM_INT_CALLEE_SAVED (RBM_EBX|RBM_ETW_FRAMED_EBP|RBM_R12|RBM_R13|RBM_R14|RBM_R15) #define RBM_INT_CALLEE_TRASH (RBM_EAX|RBM_RDI|RBM_RSI|RBM_EDX|RBM_ECX|RBM_R8|RBM_R9|RBM_R10|RBM_R11) #define RBM_FLT_CALLEE_SAVED (0) - #define RBM_FLT_CALLEE_TRASH (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5|RBM_XMM6|RBM_XMM7| \ + + /* NOTE: Sync with variable name defined in compiler.h */ + #define RBM_FLT_CALLEE_TRASH_INIT (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5|RBM_XMM6|RBM_XMM7| \ RBM_XMM8|RBM_XMM9|RBM_XMM10|RBM_XMM11|RBM_XMM12|RBM_XMM13|RBM_XMM14|RBM_XMM15) + #define REG_PROFILER_ENTER_ARG_0 REG_R14 #define RBM_PROFILER_ENTER_ARG_0 RBM_R14 #define REG_PROFILER_ENTER_ARG_1 REG_R15 @@ -132,15 +142,19 @@ #define RBM_INT_CALLEE_SAVED 
(RBM_EBX|RBM_ESI|RBM_EDI|RBM_ETW_FRAMED_EBP|RBM_R12|RBM_R13|RBM_R14|RBM_R15) #define RBM_INT_CALLEE_TRASH (RBM_EAX|RBM_ECX|RBM_EDX|RBM_R8|RBM_R9|RBM_R10|RBM_R11) #define RBM_FLT_CALLEE_SAVED (RBM_XMM6|RBM_XMM7|RBM_XMM8|RBM_XMM9|RBM_XMM10|RBM_XMM11|RBM_XMM12|RBM_XMM13|RBM_XMM14|RBM_XMM15) - #define RBM_FLT_CALLEE_TRASH (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5) + /* NOTE: Sync with variable name defined in compiler.h */ + #define RBM_FLT_CALLEE_TRASH_INIT (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5) #endif // !UNIX_AMD64_ABI + #define RBM_FLT_CALLEE_TRASH get_RBM_FLT_CALLEE_TRASH() + #define RBM_OSR_INT_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_EBP) #define REG_FLT_CALLEE_SAVED_FIRST REG_XMM6 #define REG_FLT_CALLEE_SAVED_LAST REG_XMM15 #define RBM_CALLEE_TRASH (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH) + #define RBM_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_FLT_CALLEE_SAVED) #define RBM_ALLINT (RBM_INT_CALLEE_SAVED | RBM_INT_CALLEE_TRASH) @@ -169,7 +183,7 @@ #define REG_WRITE_BARRIER_SRC REG_ARG_1 #define RBM_WRITE_BARRIER_SRC RBM_ARG_1 - #define RBM_CALLEE_TRASH_NOGC RBM_CALLEE_TRASH + #define RBM_CALLEE_TRASH_NOGC RBM_CALLEE_TRASH // Registers killed by CORINFO_HELP_ASSIGN_REF and CORINFO_HELP_CHECKED_ASSIGN_REF. #define RBM_CALLEE_TRASH_WRITEBARRIER RBM_CALLEE_TRASH_NOGC @@ -181,7 +195,6 @@ #define RBM_CALLEE_TRASH_WRITEBARRIER_BYREF (RBM_RSI | RBM_RDI | RBM_CALLEE_TRASH_NOGC) // Registers no longer containing GC pointers after CORINFO_HELP_ASSIGN_BYREF. - // Note that RDI and RSI are still valid byref pointers after this helper call, despite their value being changed. #define RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF (RBM_CALLEE_TRASH_NOGC & ~(RBM_RDI | RBM_RSI)) #if 0 @@ -203,7 +216,10 @@ #endif // !UNIX_AMD64_ABI #endif - #define REG_VAR_ORDER_FLT REG_XMM0,REG_XMM1,REG_XMM2,REG_XMM3,REG_XMM4,REG_XMM5,REG_XMM6,REG_XMM7,REG_XMM8,REG_XMM9,REG_XMM10,REG_XMM11,REG_XMM12,REG_XMM13,REG_XMM14,REG_XMM15 + #define REG_VAR_ORDER_FLT REG_XMM0,REG_XMM1,REG_XMM2,REG_XMM3,REG_XMM4,REG_XMM5,REG_XMM6,REG_XMM7, \ + REG_XMM8,REG_XMM9,REG_XMM10,REG_XMM11,REG_XMM12,REG_XMM13,REG_XMM14,REG_XMM15, \ + REG_XMM16,REG_XMM17,REG_XMM18,REG_XMM19,REG_XMM20,REG_XMM21,REG_XMM22,REG_XMM23, \ + REG_XMM24,REG_XMM25,REG_XMM26,REG_XMM27,REG_XMM28,REG_XMM29,REG_XMM30,REG_XMM31 #ifdef UNIX_AMD64_ABI #define CNT_CALLEE_SAVED (5 + REG_ETW_FRAMED_EBP_COUNT) @@ -211,7 +227,9 @@ #define CNT_CALLEE_ENREG (CNT_CALLEE_SAVED) #define CNT_CALLEE_SAVED_FLOAT (0) - #define CNT_CALLEE_TRASH_FLOAT (16) + #define CNT_CALLEE_TRASH_FLOAT_INIT (16) + #define CNT_CALLEE_TRASH_HIGHFLOAT (16) + /* NOTE: Sync with variable name defined in compiler.h */ #define REG_CALLEE_SAVED_ORDER REG_EBX,REG_ETW_FRAMED_EBP_LIST REG_R12,REG_R13,REG_R14,REG_R15 #define RBM_CALLEE_SAVED_ORDER RBM_EBX,RBM_ETW_FRAMED_EBP_LIST RBM_R12,RBM_R13,RBM_R14,RBM_R15 @@ -220,13 +238,16 @@ #define CNT_CALLEE_TRASH (7) #define CNT_CALLEE_ENREG (CNT_CALLEE_SAVED) - #define CNT_CALLEE_SAVED_FLOAT (10) - #define CNT_CALLEE_TRASH_FLOAT (6) - + #define CNT_CALLEE_SAVED_FLOAT (10) + #define CNT_CALLEE_TRASH_FLOAT_INIT (6) + #define CNT_CALLEE_TRASH_HIGHFLOAT (16) + /* NOTE: Sync with variable name defined in compiler.h */ #define REG_CALLEE_SAVED_ORDER REG_EBX,REG_ESI,REG_EDI,REG_ETW_FRAMED_EBP_LIST REG_R12,REG_R13,REG_R14,REG_R15 #define RBM_CALLEE_SAVED_ORDER RBM_EBX,RBM_ESI,RBM_EDI,RBM_ETW_FRAMED_EBP_LIST RBM_R12,RBM_R13,RBM_R14,RBM_R15 #endif // !UNIX_AMD64_ABI + #define CNT_CALLEE_TRASH_FLOAT get_CNT_CALLEE_TRASH_FLOAT() + #define CALLEE_SAVED_REG_MAXSZ 
(CNT_CALLEE_SAVED*REGSIZE_BYTES) #define CALLEE_SAVED_FLOAT_MAXSZ (CNT_CALLEE_SAVED_FLOAT*16) @@ -413,8 +434,9 @@ // The registers trashed by profiler enter/leave/tailcall hook // See vm\amd64\asmhelpers.asm for more details. - #define RBM_PROFILER_ENTER_TRASH RBM_CALLEE_TRASH - #define RBM_PROFILER_TAILCALL_TRASH RBM_PROFILER_LEAVE_TRASH + #define RBM_PROFILER_ENTER_TRASH RBM_CALLEE_TRASH + + #define RBM_PROFILER_TAILCALL_TRASH RBM_PROFILER_LEAVE_TRASH // The registers trashed by the CORINFO_HELP_STOP_FOR_GC helper. #ifdef UNIX_AMD64_ABI @@ -423,11 +445,11 @@ // On Unix a struct of size >=9 and <=16 bytes in size is returned in two return registers. // The return registers could be any two from the set { RAX, RDX, XMM0, XMM1 }. // STOP_FOR_GC helper preserves all the 4 possible return registers. - #define RBM_STOP_FOR_GC_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET | RBM_FLOATRET_1 | RBM_INTRET_1)) + #define RBM_STOP_FOR_GC_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET | RBM_FLOATRET_1 | RBM_INTRET_1)) #define RBM_PROFILER_LEAVE_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET | RBM_FLOATRET_1 | RBM_INTRET_1)) #else // See vm\amd64\asmhelpers.asm for more details. - #define RBM_STOP_FOR_GC_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET)) + #define RBM_STOP_FOR_GC_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET)) #define RBM_PROFILER_LEAVE_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET)) #endif diff --git a/src/coreclr/jit/utils.h b/src/coreclr/jit/utils.h index 19083f39f4e202..6f661eef4d15c4 100644 --- a/src/coreclr/jit/utils.h +++ b/src/coreclr/jit/utils.h @@ -419,6 +419,16 @@ class PhasedVar return *this; } + PhasedVar& operator|=(const T& value) + { +#ifdef DEBUG + assert(m_writePhase); + m_initialized = true; +#endif // DEBUG + m_value |= value; + return *this; + } + // Note: if you need more = functions, you can define them here, like operator&= // Assign a value, but don't assert if we're not in the write phase, and diff --git a/src/coreclr/vm/threadsuspend.cpp b/src/coreclr/vm/threadsuspend.cpp index 73f10f1ef4ce4f..89f2f9d33f7e2c 100644 --- a/src/coreclr/vm/threadsuspend.cpp +++ b/src/coreclr/vm/threadsuspend.cpp @@ -1970,14 +1970,14 @@ CONTEXT* AllocateOSContextHelper(BYTE** contextBuffer) // Determine if the processor supports AVX so we could // retrieve extended registers DWORD64 FeatureMask = GetEnabledXStateFeatures(); - if ((FeatureMask & XSTATE_MASK_AVX) != 0) + if ((FeatureMask & (XSTATE_MASK_AVX | XSTATE_MASK_AVX512)) != 0) { context = context | CONTEXT_XSTATE; } // Retrieve contextSize by passing NULL for Buffer DWORD contextSize = 0; - ULONG64 xStateCompactionMask = XSTATE_MASK_LEGACY | XSTATE_MASK_AVX; + ULONG64 xStateCompactionMask = XSTATE_MASK_LEGACY | XSTATE_MASK_AVX | XSTATE_MASK_MPX | XSTATE_MASK_AVX512; // The initialize call should fail but return contextSize BOOL success = g_pfnInitializeContext2 ? g_pfnInitializeContext2(NULL, context, NULL, &contextSize, xStateCompactionMask) : @@ -2899,7 +2899,7 @@ BOOL Thread::RedirectThreadAtHandledJITCase(PFN_REDIRECTTARGET pTgt) // This should not normally fail. // The system silently ignores any feature specified in the FeatureMask // which is not enabled on the processor. - SetXStateFeaturesMask(pCtx, XSTATE_MASK_AVX); + SetXStateFeaturesMask(pCtx, (XSTATE_MASK_AVX | XSTATE_MASK_AVX512)); #endif //defined(TARGET_X86) || defined(TARGET_AMD64) // Make sure we specify CONTEXT_EXCEPTION_REQUEST to detect "trap frame reporting". 
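Taken together, the threadsuspend.cpp hunks above and below apply one rule: every context the VM uses to suspend or redirect a thread must opt into the AVX-512 xstate components, or the new upper SIMD registers (XMM16-XMM31, which live in the Hi16_ZMM/ZMM_Hi256/opmask state) would be silently dropped on resume. A condensed sketch of that pattern, using the same Windows XState APIs as the surrounding hunks (it assumes `pCtx` is a CONTEXT* already initialized with CONTEXT_XSTATE, as in AllocateOSContextHelper; error handling elided):

    // Request AVX and AVX-512 state for a redirect context. Features in the
    // mask that are not enabled on the processor are silently ignored.
    DWORD64 enabled = GetEnabledXStateFeatures();
    if ((enabled & (XSTATE_MASK_AVX | XSTATE_MASK_AVX512)) != 0)
    {
        SetXStateFeaturesMask(pCtx, XSTATE_MASK_AVX | XSTATE_MASK_AVX512);
    }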
@@ -3035,7 +3035,7 @@ BOOL Thread::RedirectCurrentThreadAtHandledJITCase(PFN_REDIRECTTARGET pTgt, CONT // Get may return 0 if no XState is set, which Set would not accept. if (srcFeatures != 0) { - success = SetXStateFeaturesMask(pCurrentThreadCtx, srcFeatures & XSTATE_MASK_AVX); + success = SetXStateFeaturesMask(pCurrentThreadCtx, srcFeatures & (XSTATE_MASK_AVX | XSTATE_MASK_AVX512)); _ASSERTE(success); if (!success) return FALSE; diff --git a/src/tests/Common/testenvironment.proj b/src/tests/Common/testenvironment.proj index ca85289438a2ef..68783c754fe6a2 100644 --- a/src/tests/Common/testenvironment.proj +++ b/src/tests/Common/testenvironment.proj @@ -155,6 +155,7 @@ +
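A closing note on the register-mask plumbing above: with XMM16-XMM31 defined in register.h, AMD64 register indices now reach 47 (16 general-purpose registers followed by 32 XMM registers), so a mask no longer fits in 32 bits. That is why REGMASK_BITS becomes 64, `_regMask_enum` is based on uint64_t, and genRegMask shifts with `1ULL << reg`. A standalone sketch of the failure mode that shift width avoids (the numbering here is illustrative, mirroring the REGDEF layout):

    #include <cassert>
    #include <cstdint>

    typedef uint64_t regMaskTP; // matches the AMD64 change: unsigned __int64

    int main()
    {
        const unsigned XMMBASE = 16;  // XMM registers follow the 16 GPRs
        unsigned reg = XMMBASE + 31;  // XMM31 lands on bit 47

        regMaskTP mask = 1ULL << reg; // 64-bit shift: well-defined, reaches bit 47
        assert(mask == (regMaskTP(1) << 47));

        // By contrast, `1 << reg` shifts a 32-bit int by 47 bits: undefined
        // behavior, and a 32-bit mask could never hold XMM16-XMM31 anyway.
        return 0;
    }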