Merge branch 'master' into pptc_and_pool_enhancements

This commit is contained in:
LDj3SNuD 2021-01-27 06:25:40 +01:00
commit b767d1a6b0
82 changed files with 3890 additions and 793 deletions

View file

@ -119,6 +119,7 @@ namespace ARMeilleure.CodeGen.X86
Add(Intrinsic.X86Popcnt, new IntrinsicInfo(X86Instruction.Popcnt, IntrinsicType.PopCount));
Add(Intrinsic.X86Por, new IntrinsicInfo(X86Instruction.Por, IntrinsicType.Binary));
Add(Intrinsic.X86Pshufb, new IntrinsicInfo(X86Instruction.Pshufb, IntrinsicType.Binary));
Add(Intrinsic.X86Pshufd, new IntrinsicInfo(X86Instruction.Pshufd, IntrinsicType.BinaryImm));
Add(Intrinsic.X86Pslld, new IntrinsicInfo(X86Instruction.Pslld, IntrinsicType.Binary));
Add(Intrinsic.X86Pslldq, new IntrinsicInfo(X86Instruction.Pslldq, IntrinsicType.Binary));
Add(Intrinsic.X86Psllq, new IntrinsicInfo(X86Instruction.Psllq, IntrinsicType.Binary));

View file

@ -144,9 +144,10 @@ namespace ARMeilleure.Decoders
SetA64("101100100xxxxxxxxxxxxxxxxxxxxxxx", InstName.Orr, InstEmit.Orr, OpCodeAluImm.Create);
SetA64("00101010xx0xxxxx0xxxxxxxxxxxxxxx", InstName.Orr, InstEmit.Orr, OpCodeAluRs.Create);
SetA64("10101010xx0xxxxxxxxxxxxxxxxxxxxx", InstName.Orr, InstEmit.Orr, OpCodeAluRs.Create);
SetA64("1111100110xxxxxxxxxxxxxxxxxxxxxx", InstName.Pfrm, InstEmit.Pfrm, OpCodeMemImm.Create);
SetA64("11111000100xxxxxxxxx00xxxxxxxxxx", InstName.Pfrm, InstEmit.Pfrm, OpCodeMemImm.Create);
SetA64("11011000xxxxxxxxxxxxxxxxxxxxxxxx", InstName.Pfrm, InstEmit.Pfrm, OpCodeMemLit.Create);
SetA64("1111100110xxxxxxxxxxxxxxxxxxxxxx", InstName.Prfm, InstEmit.Prfm, OpCodeMemImm.Create); // immediate
SetA64("11111000100xxxxxxxxx00xxxxxxxxxx", InstName.Prfm, InstEmit.Prfm, OpCodeMemImm.Create); // prfum (unscaled offset)
SetA64("11011000xxxxxxxxxxxxxxxxxxxxxxxx", InstName.Prfm, InstEmit.Prfm, OpCodeMemLit.Create); // literal
SetA64("11111000101xxxxxxxxx10xxxxxxxxxx", InstName.Prfm, InstEmit.Prfm, OpCodeMemReg.Create); // register
SetA64("x101101011000000000000xxxxxxxxxx", InstName.Rbit, InstEmit.Rbit, OpCodeAlu.Create);
SetA64("1101011001011111000000xxxxx00000", InstName.Ret, InstEmit.Ret, OpCodeBReg.Create);
SetA64("x101101011000000000001xxxxxxxxxx", InstName.Rev16, InstEmit.Rev16, OpCodeAlu.Create);
@ -311,6 +312,7 @@ namespace ARMeilleure.Decoders
SetA64("0>0011100<1xxxxx111101xxxxxxxxxx", InstName.Fmax_V, InstEmit.Fmax_V, OpCodeSimdReg.Create);
SetA64("000111100x1xxxxx011010xxxxxxxxxx", InstName.Fmaxnm_S, InstEmit.Fmaxnm_S, OpCodeSimdReg.Create);
SetA64("0>0011100<1xxxxx110001xxxxxxxxxx", InstName.Fmaxnm_V, InstEmit.Fmaxnm_V, OpCodeSimdReg.Create);
SetA64("011111100x110000110010xxxxxxxxxx", InstName.Fmaxnmp_S, InstEmit.Fmaxnmp_S, OpCodeSimd.Create);
SetA64("0>1011100<1xxxxx110001xxxxxxxxxx", InstName.Fmaxnmp_V, InstEmit.Fmaxnmp_V, OpCodeSimdReg.Create);
SetA64("0110111000110000110010xxxxxxxxxx", InstName.Fmaxnmv_V, InstEmit.Fmaxnmv_V, OpCodeSimd.Create);
SetA64("0>1011100<1xxxxx111101xxxxxxxxxx", InstName.Fmaxp_V, InstEmit.Fmaxp_V, OpCodeSimdReg.Create);
@ -319,6 +321,7 @@ namespace ARMeilleure.Decoders
SetA64("0>0011101<1xxxxx111101xxxxxxxxxx", InstName.Fmin_V, InstEmit.Fmin_V, OpCodeSimdReg.Create);
SetA64("000111100x1xxxxx011110xxxxxxxxxx", InstName.Fminnm_S, InstEmit.Fminnm_S, OpCodeSimdReg.Create);
SetA64("0>0011101<1xxxxx110001xxxxxxxxxx", InstName.Fminnm_V, InstEmit.Fminnm_V, OpCodeSimdReg.Create);
SetA64("011111101x110000110010xxxxxxxxxx", InstName.Fminnmp_S, InstEmit.Fminnmp_S, OpCodeSimd.Create);
SetA64("0>1011101<1xxxxx110001xxxxxxxxxx", InstName.Fminnmp_V, InstEmit.Fminnmp_V, OpCodeSimdReg.Create);
SetA64("0110111010110000110010xxxxxxxxxx", InstName.Fminnmv_V, InstEmit.Fminnmv_V, OpCodeSimd.Create);
SetA64("0>1011101<1xxxxx111101xxxxxxxxxx", InstName.Fminp_V, InstEmit.Fminp_V, OpCodeSimdReg.Create);

View file

@ -102,7 +102,7 @@ namespace ARMeilleure.Instructions
}
}
public static void Pfrm(ArmEmitterContext context)
// Emitter for the Prfm instruction.
// The body is intentionally left empty, so no IR is produced for it.
public static void Prfm(ArmEmitterContext context)
{
// Memory Prefetch, execute as no-op.
}

View file

@ -120,24 +120,155 @@ namespace ARMeilleure.Instructions
{
OpCodeSimd op = (OpCodeSimd)context.CurrOp;
Operand res = context.VectorZero();
int elems = op.GetBytesCount() >> op.Size;
int eSize = 8 << op.Size;
for (int index = 0; index < elems; index++)
Operand res = eSize switch {
8 => Clz_V_I8 (context, GetVec(op.Rn)),
16 => Clz_V_I16(context, GetVec(op.Rn)),
32 => Clz_V_I32(context, GetVec(op.Rn)),
_ => null
};
if (res != null)
{
Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
if (op.RegisterSize == RegisterSize.Simd64)
{
res = context.VectorZeroUpper64(res);
}
}
else
{
int elems = op.GetBytesCount() >> op.Size;
Operand de = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingZeros)), ne, Const(eSize));
res = context.VectorZero();
res = EmitVectorInsert(context, res, de, index, op.Size);
for (int index = 0; index < elems; index++)
{
Operand ne = EmitVectorExtractZx(context, op.Rn, index, op.Size);
Operand de = context.Call(typeof(SoftFallback).GetMethod(nameof(SoftFallback.CountLeadingZeros)), ne, Const(eSize));
res = EmitVectorInsert(context, res, de, index, op.Size);
}
}
context.Copy(GetVec(op.Rd), res);
}
private static Operand Clz_V_I8(ArmEmitterContext context, Operand arg)
{
    // The sequence below relies on PSHUFB; when SSSE3 is not enabled nothing is
    // emitted and null is returned so the caller can pick another path.
    if (!Optimizations.UseSsse3)
    {
        return null;
    }

    // Per-nibble leading zero counts, indexed by nibble value:
    // byte 0 = 4, byte 1 = 3, bytes 2..3 = 2, bytes 4..7 = 1.
    Operand nibbleClzLut = X86GetScalar(context, 0x01_01_01_01_02_02_03_04);
    Operand lowNibbleMask = X86GetAllElements(context, 0x0f_0f_0f_0f);
    Operand allFours = X86GetAllElements(context, 0x04_04_04_04);

    // Table lookup driven directly by each byte of arg (leading zero count of the low nibble).
    Operand lowNibbleClz = context.AddIntrinsic(Intrinsic.X86Pshufb, nibbleClzLut, arg);

    // Move the high nibble of every byte down into the low nibble position.
    Operand highNibble = context.AddIntrinsic(Intrinsic.X86Psrlw, arg, Const(4));
    highNibble = context.AddIntrinsic(Intrinsic.X86Pand, highNibble, lowNibbleMask);

    // Table lookup for the high nibble.
    Operand highNibbleClz = context.AddIntrinsic(Intrinsic.X86Pshufb, nibbleClzLut, highNibble);

    // The low nibble count only contributes when the high nibble count is 4,
    // i.e. when the high nibble is entirely zero.
    Operand highIsZero = context.AddIntrinsic(Intrinsic.X86Pcmpeqb, highNibbleClz, allFours);
    lowNibbleClz = context.AddIntrinsic(Intrinsic.X86Pand, lowNibbleClz, highIsZero);

    Operand byteClz = context.AddIntrinsic(Intrinsic.X86Paddb, lowNibbleClz, highNibbleClz);

    return byteClz;
}
private static Operand Clz_V_I16(ArmEmitterContext context, Operand arg)
{
    // Requires PSHUFB (SSSE3); null signals that no vector sequence was emitted.
    if (!Optimizations.UseSsse3)
    {
        return null;
    }

    // Shuffle control: the low byte of each 16-bit element takes the element's
    // odd (high) byte, while the 0x80 selectors zero the high byte.
    Operand highByteToLow = X86GetElements(context, 0x80_0f_80_0d_80_0b_80_09, 0x80_07_80_05_80_03_80_01);
    Operand lowByteMask = X86GetAllElements(context, 0x00ff_00ff);
    Operand allEights = X86GetAllElements(context, 0x0008_0008);

    // Byte-wise leading zero counts; each 16-bit element now holds a (high, low) pair of counts.
    Operand pairedClz = Clz_V_I8(context, arg);

    // Count belonging to the low byte of each element.
    Operand lowByteClz = context.AddIntrinsic(Intrinsic.X86Pand, pairedClz, lowByteMask);

    // Count belonging to the high byte of each element, moved into the low byte position.
    Operand highByteClz = context.AddIntrinsic(Intrinsic.X86Pshufb, pairedClz, highByteToLow);

    // The low byte count is kept only when the high byte count is 8 (high byte all zero).
    Operand highIsZero = context.AddIntrinsic(Intrinsic.X86Pcmpeqw, highByteClz, allEights);
    lowByteClz = context.AddIntrinsic(Intrinsic.X86Pand, lowByteClz, highIsZero);

    Operand wordClz = context.AddIntrinsic(Intrinsic.X86Paddw, lowByteClz, highByteClz);

    return wordClz;
}
// Emits a vector count-leading-zeros sequence for 32-bit elements of arg.
// Returns the operand holding the per-element counts, or null when
// Optimizations.UseSse2 is false (nothing is emitted in that case).
// Approach: smear the highest set bit downwards, invert, then population-count the result.
private static Operand Clz_V_I32(ArmEmitterContext context, Operand arg)
{
// TODO: Use vplzcntd when AVX-512 is supported.
if (!Optimizations.UseSse2)
{
return null;
}
// Small local wrappers so the bit-manipulation steps below read like plain arithmetic.
Operand AddVectorI32(Operand op0, Operand op1) => context.AddIntrinsic(Intrinsic.X86Paddd, op0, op1);
Operand SubVectorI32(Operand op0, Operand op1) => context.AddIntrinsic(Intrinsic.X86Psubd, op0, op1);
Operand ShiftRightVectorUI32(Operand op0, int imm8) => context.AddIntrinsic(Intrinsic.X86Psrld, op0, Const(imm8));
Operand OrVector(Operand op0, Operand op1) => context.AddIntrinsic(Intrinsic.X86Por, op0, op1);
Operand AndVector(Operand op0, Operand op1) => context.AddIntrinsic(Intrinsic.X86Pand, op0, op1);
// NOTE(review): relies on PANDN computing (~op0 & op1) and on context.VectorOne()
// being an all-ones vector, which together give a bitwise NOT - confirm.
Operand NotVector(Operand op0) => context.AddIntrinsic(Intrinsic.X86Pandn, op0, context.VectorOne());
// Masks used by the population count steps below.
Operand c55555555 = X86GetAllElements(context, 0x55555555);
Operand c33333333 = X86GetAllElements(context, 0x33333333);
Operand c0f0f0f0f = X86GetAllElements(context, 0x0f0f0f0f);
Operand c0000003f = X86GetAllElements(context, 0x0000003f);
Operand tmp0;
Operand tmp1;
Operand res;
// Set all bits after highest set bit to 1.
res = OrVector(ShiftRightVectorUI32(arg, 1), arg);
res = OrVector(ShiftRightVectorUI32(res, 2), res);
res = OrVector(ShiftRightVectorUI32(res, 4), res);
res = OrVector(ShiftRightVectorUI32(res, 8), res);
res = OrVector(ShiftRightVectorUI32(res, 16), res);
// Make leading 0s into leading 1s.
res = NotVector(res);
// Count leading 1s, which is the population count.
// Step 1: res - ((res >> 1) & 0x55555555) leaves a bit count in every 2-bit field.
tmp0 = ShiftRightVectorUI32(res, 1);
tmp0 = AndVector(tmp0, c55555555);
res = SubVectorI32(res, tmp0);
// Step 2: add neighbouring 2-bit fields into 4-bit fields.
tmp0 = ShiftRightVectorUI32(res, 2);
tmp0 = AndVector(tmp0, c33333333);
tmp1 = AndVector(res, c33333333);
res = AddVectorI32(tmp0, tmp1);
// Step 3: add neighbouring 4-bit fields and keep one count per byte.
tmp0 = ShiftRightVectorUI32(res, 4);
tmp0 = AddVectorI32(tmp0, res);
res = AndVector(tmp0, c0f0f0f0f);
// Step 4: fold the per-byte counts together with shifts by 8 and 16.
tmp0 = ShiftRightVectorUI32(res, 8);
res = AddVectorI32(tmp0, res);
tmp0 = ShiftRightVectorUI32(res, 16);
res = AddVectorI32(tmp0, res);
// Keep only the low 6 bits, discarding the partial sums left in the upper bits.
res = AndVector(res, c0000003f);
return res;
}
public static void Cnt_V(ArmEmitterContext context)
{
OpCodeSimd op = (OpCodeSimd)context.CurrOp;
@ -347,19 +478,17 @@ namespace ARMeilleure.Instructions
public static void Faddp_S(ArmEmitterContext context)
{
OpCodeSimd op = (OpCodeSimd)context.CurrOp;
int sizeF = op.Size & 1;
if (Optimizations.FastFP && Optimizations.UseSse3)
{
if (sizeF == 0)
OpCodeSimd op = (OpCodeSimd)context.CurrOp;
if ((op.Size & 1) == 0)
{
Operand res = context.AddIntrinsic(Intrinsic.X86Haddps, GetVec(op.Rn), GetVec(op.Rn));
context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
}
else /* if (sizeF == 1) */
else /* if ((op.Size & 1) == 1) */
{
Operand res = context.AddIntrinsic(Intrinsic.X86Haddpd, GetVec(op.Rn), GetVec(op.Rn));
@ -368,14 +497,10 @@ namespace ARMeilleure.Instructions
}
else
{
OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32;
Operand ne0 = context.VectorExtract(type, GetVec(op.Rn), 0);
Operand ne1 = context.VectorExtract(type, GetVec(op.Rn), 1);
Operand res = EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), ne0, ne1);
context.Copy(GetVec(op.Rd), context.VectorInsert(context.VectorZero(), res, 0));
EmitScalarPairwiseOpF(context, (op1, op2) =>
{
return EmitSoftFloatCall(context, nameof(SoftFloat32.FPAdd), op1, op2);
});
}
}
@ -552,6 +677,24 @@ namespace ARMeilleure.Instructions
}
}
public static void Fmaxnmp_S(ArmEmitterContext context)
{
    // Without fast FP and SSE4.1, reduce the pair through the soft-float FPMaxNum routine.
    if (!Optimizations.FastFP || !Optimizations.UseSse41)
    {
        EmitScalarPairwiseOpF(context, (lhs, rhs) =>
            EmitSoftFloatCall(context, nameof(SoftFloat32.FPMaxNum), lhs, rhs));

        return;
    }

    // Otherwise use the SSE4.1 MaxNum helper in scalar mode on the two elements.
    EmitSse2ScalarPairwiseOpF(context, (lhs, rhs) =>
        EmitSse41MaxMinNumOpF(context, isMaxNum: true, scalar: true, lhs, rhs));
}
public static void Fmaxnmp_V(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse41)
@ -708,6 +851,24 @@ namespace ARMeilleure.Instructions
}
}
public static void Fminnmp_S(ArmEmitterContext context)
{
    // Without fast FP and SSE4.1, reduce the pair through the soft-float FPMinNum routine.
    if (!Optimizations.FastFP || !Optimizations.UseSse41)
    {
        EmitScalarPairwiseOpF(context, (lhs, rhs) =>
            EmitSoftFloatCall(context, nameof(SoftFloat32.FPMinNum), lhs, rhs));

        return;
    }

    // Otherwise use the SSE4.1 MinNum helper in scalar mode on the two elements.
    EmitSse2ScalarPairwiseOpF(context, (lhs, rhs) =>
        EmitSse41MaxMinNumOpF(context, isMaxNum: false, scalar: true, lhs, rhs));
}
public static void Fminnmp_V(ArmEmitterContext context)
{
if (Optimizations.FastFP && Optimizations.UseSse41)

View file

@ -209,6 +209,11 @@ namespace ARMeilleure.Instructions
}
public static Operand X86GetElements(ArmEmitterContext context, long e1, long e0)
{
    // Convenience overload: cast both signed values to ulong and defer to the ulong overload.
    ulong unsignedE1 = (ulong)e1;
    ulong unsignedE0 = (ulong)e0;

    return X86GetElements(context, unsignedE1, unsignedE0);
}
public static Operand X86GetElements(ArmEmitterContext context, ulong e1, ulong e0)
{
Operand vector0 = context.VectorCreateScalar(Const(e0));
Operand vector1 = context.VectorCreateScalar(Const(e1));
@ -1118,6 +1123,49 @@ namespace ARMeilleure.Instructions
context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res));
}
public static void EmitScalarPairwiseOpF(ArmEmitterContext context, Func2I emit)
{
    OpCodeSimd op = (OpCodeSimd)context.CurrOp;

    // Bit 0 of Size selects the element type: clear -> FP32, set -> FP64.
    OperandType elemType = (op.Size & 1) == 0 ? OperandType.FP32 : OperandType.FP64;

    // Pull elements 0 and 1 out of the source vector.
    Operand first = context.VectorExtract(elemType, GetVec(op.Rn), 0);
    Operand second = context.VectorExtract(elemType, GetVec(op.Rn), 1);

    // Combine the pair with the supplied emitter and place the value in element 0 of a zeroed vector.
    Operand zeroVec = context.VectorZero();
    Operand combined = emit(first, second);
    Operand result = context.VectorInsert(zeroVec, combined, 0);

    context.Copy(GetVec(op.Rd), result);
}
public static void EmitSse2ScalarPairwiseOpF(ArmEmitterContext context, Func2I emit)
{
    OpCodeSimd op = (OpCodeSimd)context.CurrOp;

    Operand source = GetVec(op.Rn);

    Operand first;
    Operand second;

    if ((op.Size & 1) != 0)
    {
        // Size bit 0 set: build the two operands with MOVLHPS / MOVHLPS against a zero vector.
        Operand zeroVec = context.VectorZero();

        first = context.AddIntrinsic(Intrinsic.X86Movlhps, source, zeroVec);
        second = context.AddIntrinsic(Intrinsic.X86Movhlps, zeroVec, source);
    }
    else
    {
        // Size bit 0 clear: PSHUFD selectors (two bits per destination element).
        // Destination elements 1..3 pick source element 2; element 0 picks source element 0 or 1.
        const int pickElem0 = 0b10_10_10_00;
        const int pickElem1 = 0b10_10_10_01;

        // Clear the upper 64 bits first, so source element 2 is zero when it is replicated.
        Operand lowHalf = context.VectorZeroUpper64(source);

        first = context.AddIntrinsic(Intrinsic.X86Pshufd, lowHalf, Const(pickElem0));
        second = context.AddIntrinsic(Intrinsic.X86Pshufd, lowHalf, Const(pickElem1));
    }

    context.Copy(GetVec(op.Rd), emit(first, second));
}
public static void EmitVectorPairwiseOpF(ArmEmitterContext context, Func2I emit)
{
OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;

View file

@ -68,7 +68,7 @@ namespace ARMeilleure.Instructions
Nop,
Orn,
Orr,
Pfrm,
Prfm,
Rbit,
Ret,
Rev16,
@ -212,6 +212,7 @@ namespace ARMeilleure.Instructions
Fmax_V,
Fmaxnm_S,
Fmaxnm_V,
Fmaxnmp_S,
Fmaxnmp_V,
Fmaxnmv_V,
Fmaxp_V,
@ -220,6 +221,7 @@ namespace ARMeilleure.Instructions
Fmin_V,
Fminnm_S,
Fminnm_V,
Fminnmp_S,
Fminnmp_V,
Fminnmv_V,
Fminp_V,

View file

@ -108,6 +108,7 @@ namespace ARMeilleure.IntermediateRepresentation
X86Popcnt,
X86Por,
X86Pshufb,
X86Pshufd,
X86Pslld,
X86Pslldq,
X86Psllq,

View file

@ -23,7 +23,7 @@ namespace ARMeilleure.Translation.PTC
{
private const string HeaderMagicString = "PTChd\0\0\0";
private const uint InternalVersion = 1817; //! To be incremented manually for each change to the ARMeilleure project.
private const uint InternalVersion = 1968; //! To be incremented manually for each change to the ARMeilleure project.
private const string ActualDir = "0";
private const string BackupDir = "1";