ARMeilleure: Add initial support for AVX512 (EVEX encoding) (cont) (#4147)

* ARMeilleure: Add AVX512{F,VL,DQ,BW} detection

Add `UseAvx512Ortho` and `UseAvx512OrthoFloat` optimization flags as
short-hands for `F+VL` and `F+VL+DQ`.

* ARMeilleure: Add initial support for EVEX instruction encoding

Does not implement rounding, or exception controls.

* ARMeilleure: Add `X86Vpternlogd`

Accelerates the vector-`Not` instruction.

* ARMeilleure: Add check for `OSXSAVE` for AVX{2,512}

* ARMeilleure: Add check for `XCR0` flags

Add XCR0 register checks for AVX and AVX512F, following the guidelines
from section 14.3 and 15.2 from the Intel Architecture Software
Developer's Manual.

* ARMeilleure: Remove redundant `ReProtect` and `Dispose`, formatting

* ARMeilleure: Move XCR0 procedure to GetXcr0Eax

* ARMeilleure: Add `XCR0` to `FeatureInfo` structure

* ARMeilleure: Utilize `ReadOnlySpan` for Xcr0 assembly

Avoids an additional allocation

* ARMeilleure: Formatting fixes

* ARMeilleure: Fix EVEX encoding src2 register index

> Just like in VEX prefix, vvvv is provided in inverted form.

* ARMeilleure: Add `X86Vpternlogd` acceleration to `Vmvn_I`

Passes unit tests, verified instruction utilization

* ARMeilleure: Fix EVEX register operand designations

Operand 2 was being sourced improperly.

EVEX encoded instructions source their operands like so:
Operand 1: ModRM:reg
Operand 2: EVEX.vvvvv
Operand 3: ModRM:r/m
Operand 4: Imm

This fixes the improper register designations when emitting vpternlog.
Now "dest", "src1", "src2" arguments emit in the proper order in EVEX instructions.

* ARMeilleure: Add `X86Vpternlogd` acceleration to `Orn_V`

* ARMeilleure: PTC version bump

* ARMeilleure: Update EVEX encoding Debug.Assert to Debug.Fail

* ARMeilleure: Update EVEX encoding comment capitalization
This commit is contained in:
Wunk 2023-03-20 12:09:24 -07:00 committed by GitHub
parent 9f1cf6458c
commit 17620d18db
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 226 additions and 11 deletions

View file

@ -1034,7 +1034,13 @@ namespace ARMeilleure.CodeGen.X86
Debug.Assert(opCode != BadOp, "Invalid opcode value.");
if ((flags & InstructionFlags.Vex) != 0 && HardwareCapabilities.SupportsVexEncoding)
if ((flags & InstructionFlags.Evex) != 0 && HardwareCapabilities.SupportsEvexEncoding)
{
WriteEvexInst(dest, src1, src2, type, flags, opCode);
opCode &= 0xff;
}
else if ((flags & InstructionFlags.Vex) != 0 && HardwareCapabilities.SupportsVexEncoding)
{
// In a vex encoding, only one prefix can be active at a time. The active prefix is encoded in the second byte using two bits.
@ -1153,6 +1159,103 @@ namespace ARMeilleure.CodeGen.X86
}
}
private void WriteEvexInst(
Operand dest,
Operand src1,
Operand src2,
OperandType type,
InstructionFlags flags,
int opCode,
bool broadcast = false,
int registerWidth = 128,
int maskRegisterIdx = 0,
bool zeroElements = false)
{
int op1Idx = dest.GetRegister().Index;
int op2Idx = src1.GetRegister().Index;
int op3Idx = src2.GetRegister().Index;
WriteByte(0x62);
// P0
// Extend operand 1 register
bool r = (op1Idx & 8) == 0;
// Extend operand 3 register
bool x = (op3Idx & 16) == 0;
// Extend operand 3 register
bool b = (op3Idx & 8) == 0;
// Extend operand 1 register
bool rp = (op1Idx & 16) == 0;
// Escape code index
byte mm = 0b00;
switch ((ushort)(opCode >> 8))
{
case 0xf00: mm = 0b01; break;
case 0xf38: mm = 0b10; break;
case 0xf3a: mm = 0b11; break;
default: Debug.Fail($"Failed to EVEX encode opcode 0x{opCode:X}."); break;
}
WriteByte(
(byte)(
(r ? 0x80 : 0) |
(x ? 0x40 : 0) |
(b ? 0x20 : 0) |
(rp ? 0x10 : 0) |
mm));
// P1
// Specify 64-bit lane mode
bool w = Is64Bits(type);
// Operand 2 register index
byte vvvv = (byte)(~op2Idx & 0b1111);
// Opcode prefix
byte pp = (flags & InstructionFlags.PrefixMask) switch
{
InstructionFlags.Prefix66 => 0b01,
InstructionFlags.PrefixF3 => 0b10,
InstructionFlags.PrefixF2 => 0b11,
_ => 0
};
WriteByte(
(byte)(
(w ? 0x80 : 0) |
(vvvv << 3) |
0b100 |
pp));
// P2
// Mask register determines what elements to zero, rather than what elements to merge
bool z = zeroElements;
// Specifies register-width
byte ll = 0b00;
switch (registerWidth)
{
case 128: ll = 0b00; break;
case 256: ll = 0b01; break;
case 512: ll = 0b10; break;
default: Debug.Fail($"Invalid EVEX vector register width {registerWidth}."); break;
}
// Embedded broadcast in the case of a memory operand
bool bcast = broadcast;
// Extend operand 2 register
bool vp = (op2Idx & 16) == 0;
// Mask register index
Debug.Assert(maskRegisterIdx < 8, $"Invalid mask register index {maskRegisterIdx}.");
byte aaa = (byte)(maskRegisterIdx & 0b111);
WriteByte(
(byte)(
(z ? 0x80 : 0) |
(ll << 5) |
(bcast ? 0x10 : 0) |
(vp ? 8 : 0) |
aaa));
}
private void WriteCompactInst(Operand operand, int opCode)
{
int regIndex = operand.GetRegister().Index;