Implement Fast Paths for most A32 SIMD instructions (#952)

* Begin work on A32 SIMD Intrinsics

* More instructions, some cleanup.

* Intrinsics for Move instructions (zip etc)

These pass the existing tests.

* Intrinsics for some of Cvt

While doing this I noticed that the conversion for int/fp was incorrect
in the slow path. I'll fix this in the original repo.

* Intrinsics for more Arithmetic instructions.

* Intrinsics for Vext

* Fix VEXT Intrinsic for double words.

* Use InsertPs to move scalar values.

* Cleanup, fix VPADD.f32 and VMIN signed integer.

* Cleanup, add SSE2 support for scalar insert.

Works similarly to the IR scalar insert, but obviously this one works
directly on V128.

* Minor cleanup.

* Enable intrinsic for FP64 to integer conversion.

* Address feedback apart from splitting out intrinsic float abs

Also: bad VREV encodings as undefined rather than throwing in translation.

* Move float abs to helper, fix bug with cvt

* Rename opc2 & 3 to match A32 docs, use ArgumentOutOfRangeException appropriately.

* Get name of variable at compilation rather than string literal.

* Use correct double sign mask.
This commit is contained in:
jduncanator 2020-03-05 11:41:33 +11:00 committed by GitHub
parent d9ed827696
commit 68e15c1a74
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 2077 additions and 400 deletions

View file

@ -1,9 +1,11 @@
using ARMeilleure.Decoders;
using ARMeilleure.IntermediateRepresentation;
using ARMeilleure.State;
using ARMeilleure.Translation;
using System;
using System.Diagnostics;
using static ARMeilleure.Instructions.InstEmitHelper;
using static ARMeilleure.Instructions.InstEmitSimdHelper;
using static ARMeilleure.Instructions.InstEmitSimdHelper32;
using static ARMeilleure.IntermediateRepresentation.OperandHelper;
@ -63,21 +65,56 @@ namespace ARMeilleure.Instructions
if (toInteger)
{
EmitVectorUnaryOpF32(context, (op1) =>
if (Optimizations.UseSse41)
{
return EmitSaturateFloatToInt(context, op1, unsigned);
});
EmitSse41ConvertVector32(context, FPRoundingMode.TowardsZero, !unsigned);
}
else
{
EmitVectorUnaryOpF32(context, (op1) =>
{
return EmitSaturateFloatToInt(context, op1, unsigned);
});
}
}
else
{
if (unsigned)
if (Optimizations.UseSse2)
{
EmitVectorUnaryOpZx32(context, (op1) => EmitFPConvert(context, op1, floatSize, false));
}
EmitVectorUnaryOpSimd32(context, (n) =>
{
if (unsigned)
{
Operand mask = X86GetAllElements(context, 0x47800000);
Operand res = context.AddIntrinsic(Intrinsic.X86Psrld, n, Const(16));
res = context.AddIntrinsic(Intrinsic.X86Cvtdq2ps, res);
res = context.AddIntrinsic(Intrinsic.X86Mulps, res, mask);
Operand res2 = context.AddIntrinsic(Intrinsic.X86Pslld, n, Const(16));
res2 = context.AddIntrinsic(Intrinsic.X86Psrld, res2, Const(16));
res2 = context.AddIntrinsic(Intrinsic.X86Cvtdq2ps, res2);
return context.AddIntrinsic(Intrinsic.X86Addps, res, res2);
}
else
{
return context.AddIntrinsic(Intrinsic.X86Cvtdq2ps, n);
}
});
}
else
{
EmitVectorUnaryOpSx32(context, (op1) => EmitFPConvert(context, op1, floatSize, true));
if (unsigned)
{
EmitVectorUnaryOpZx32(context, (op1) => EmitFPConvert(context, op1, floatSize, false));
}
else
{
EmitVectorUnaryOpSx32(context, (op1) => EmitFPConvert(context, op1, floatSize, true));
}
}
}
}
@ -123,44 +160,51 @@ namespace ARMeilleure.Instructions
bool unsigned = (op.Opc2 & 1) == 0;
bool roundWithFpscr = op.Opc != 1;
Operand toConvert = ExtractScalar(context, floatSize, op.Vm);
Operand asInteger;
// TODO: Fast Path.
if (roundWithFpscr)
if (!roundWithFpscr && Optimizations.UseSse41)
{
// These need to get the FPSCR value, so it's worth noting we'd need to do a c# call at some point.
if (floatSize == OperandType.FP64)
{
if (unsigned)
{
asInteger = context.Call(new _U32_F64(SoftFallback.DoubleToUInt32), toConvert);
}
else
{
asInteger = context.Call(new _S32_F64(SoftFallback.DoubleToInt32), toConvert);
}
}
else
{
if (unsigned)
{
asInteger = context.Call(new _U32_F32(SoftFallback.FloatToUInt32), toConvert);
}
else
{
asInteger = context.Call(new _S32_F32(SoftFallback.FloatToInt32), toConvert);
}
}
}
EmitSse41ConvertInt32(context, FPRoundingMode.TowardsZero, !unsigned);
}
else
{
// Round towards zero.
asInteger = EmitSaturateFloatToInt(context, toConvert, unsigned);
}
Operand toConvert = ExtractScalar(context, floatSize, op.Vm);
InsertScalar(context, op.Vd, asInteger);
Operand asInteger;
// TODO: Fast Path.
if (roundWithFpscr)
{
if (floatSize == OperandType.FP64)
{
if (unsigned)
{
asInteger = context.Call(new _U32_F64(SoftFallback.DoubleToUInt32), toConvert);
}
else
{
asInteger = context.Call(new _S32_F64(SoftFallback.DoubleToInt32), toConvert);
}
}
else
{
if (unsigned)
{
asInteger = context.Call(new _U32_F32(SoftFallback.FloatToUInt32), toConvert);
}
else
{
asInteger = context.Call(new _S32_F32(SoftFallback.FloatToInt32), toConvert);
}
}
}
else
{
// Round towards zero.
asInteger = EmitSaturateFloatToInt(context, toConvert, unsigned);
}
InsertScalar(context, op.Vd, asInteger);
}
}
else
{
@ -192,6 +236,26 @@ namespace ARMeilleure.Instructions
return context.Call(dlg, n, Const((int)roundMode));
}
private static FPRoundingMode RMToRoundMode(int rm)
{
FPRoundingMode roundMode;
switch (rm)
{
case 0b01:
roundMode = FPRoundingMode.ToNearest;
break;
case 0b10:
roundMode = FPRoundingMode.TowardsPlusInfinity;
break;
case 0b11:
roundMode = FPRoundingMode.TowardsMinusInfinity;
break;
default:
throw new ArgumentOutOfRangeException(nameof(rm));
}
return roundMode;
}
public static void Vcvt_R(ArmEmitterContext context)
{
OpCode32SimdCvtFI op = (OpCode32SimdCvtFI)context.CurrOp;
@ -199,30 +263,38 @@ namespace ARMeilleure.Instructions
OperandType floatSize = op.RegisterSize == RegisterSize.Int64 ? OperandType.FP64 : OperandType.FP32;
bool unsigned = (op.Opc & 1) == 0;
int rm = op.Opc2 & 3;
Operand toConvert = ExtractScalar(context, floatSize, op.Vm);
switch (op.Opc2)
if (Optimizations.UseSse41 && rm != 0b00)
{
case 0b00: // Away
toConvert = EmitRoundMathCall(context, MidpointRounding.AwayFromZero, toConvert);
break;
case 0b01: // Nearest
toConvert = EmitRoundMathCall(context, MidpointRounding.ToEven, toConvert);
break;
case 0b10: // Towards positive infinity
toConvert = EmitUnaryMathCall(context, MathF.Ceiling, Math.Ceiling, toConvert);
break;
case 0b11: // Towards negative infinity
toConvert = EmitUnaryMathCall(context, MathF.Floor, Math.Floor, toConvert);
break;
EmitSse41ConvertInt32(context, RMToRoundMode(rm), !unsigned);
}
else
{
Operand toConvert = ExtractScalar(context, floatSize, op.Vm);
Operand asInteger;
switch (rm)
{
case 0b00: // Away
toConvert = EmitRoundMathCall(context, MidpointRounding.AwayFromZero, toConvert);
break;
case 0b01: // Nearest
toConvert = EmitRoundMathCall(context, MidpointRounding.ToEven, toConvert);
break;
case 0b10: // Towards positive infinity
toConvert = EmitUnaryMathCall(context, MathF.Ceiling, Math.Ceiling, toConvert);
break;
case 0b11: // Towards negative infinity
toConvert = EmitUnaryMathCall(context, MathF.Floor, Math.Floor, toConvert);
break;
}
asInteger = EmitSaturateFloatToInt(context, toConvert, unsigned);
Operand asInteger;
InsertScalar(context, op.Vd, asInteger);
asInteger = EmitSaturateFloatToInt(context, toConvert, unsigned);
InsertScalar(context, op.Vd, asInteger);
}
}
public static void Vrint_RM(ArmEmitterContext context)
@ -231,30 +303,59 @@ namespace ARMeilleure.Instructions
OperandType floatSize = op.RegisterSize == RegisterSize.Int64 ? OperandType.FP64 : OperandType.FP32;
Operand toConvert = ExtractScalar(context, floatSize, op.Vm);
int rm = op.Opc2 & 3;
switch (op.Opc2)
if (Optimizations.UseSse2 && rm != 0b00)
{
case 0b00: // Away
toConvert = EmitRoundMathCall(context, MidpointRounding.AwayFromZero, toConvert);
break;
case 0b01: // Nearest
toConvert = EmitRoundMathCall(context, MidpointRounding.ToEven, toConvert);
break;
case 0b10: // Towards positive infinity
toConvert = EmitUnaryMathCall(context, MathF.Ceiling, Math.Ceiling, toConvert);
break;
case 0b11: // Towards negative infinity
toConvert = EmitUnaryMathCall(context, MathF.Floor, Math.Floor, toConvert);
break;
}
EmitScalarUnaryOpSimd32(context, (m) =>
{
Intrinsic inst = (op.Size & 1) == 0 ? Intrinsic.X86Roundss : Intrinsic.X86Roundsd;
InsertScalar(context, op.Vd, toConvert);
FPRoundingMode roundMode = RMToRoundMode(rm);
return context.AddIntrinsic(inst, m, Const(X86GetRoundControl(roundMode)));
});
}
else
{
Operand toConvert = ExtractScalar(context, floatSize, op.Vm);
switch (rm)
{
case 0b00: // Away
toConvert = EmitRoundMathCall(context, MidpointRounding.AwayFromZero, toConvert);
break;
case 0b01: // Nearest
toConvert = EmitRoundMathCall(context, MidpointRounding.ToEven, toConvert);
break;
case 0b10: // Towards positive infinity
toConvert = EmitUnaryMathCall(context, MathF.Ceiling, Math.Ceiling, toConvert);
break;
case 0b11: // Towards negative infinity
toConvert = EmitUnaryMathCall(context, MathF.Floor, Math.Floor, toConvert);
break;
}
InsertScalar(context, op.Vd, toConvert);
}
}
public static void Vrint_Z(ArmEmitterContext context)
{
EmitScalarUnaryOpF32(context, (op1) => EmitUnaryMathCall(context, MathF.Truncate, Math.Truncate, op1));
IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;
if (Optimizations.UseSse2)
{
EmitScalarUnaryOpSimd32(context, (m) =>
{
Intrinsic inst = (op.Size & 1) == 0 ? Intrinsic.X86Roundss : Intrinsic.X86Roundsd;
return context.AddIntrinsic(inst, m, Const(X86GetRoundControl(FPRoundingMode.TowardsZero)));
});
}
else
{
EmitScalarUnaryOpF32(context, (op1) => EmitUnaryMathCall(context, MathF.Truncate, Math.Truncate, op1));
}
}
private static Operand EmitFPConvert(ArmEmitterContext context, Operand value, OperandType type, bool signed)
@ -270,5 +371,211 @@ namespace ARMeilleure.Instructions
return context.ConvertToFPUI(type, value);
}
}
private static void EmitSse41ConvertInt32(ArmEmitterContext context, FPRoundingMode roundMode, bool signed)
{
// A port of the similar round function in InstEmitSimdCvt.
OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;
bool doubleSize = (op.Size & 1) != 0;
int shift = doubleSize ? 1 : 2;
Operand n = GetVecA32(op.Vm >> shift);
n = EmitSwapScalar(context, n, op.Vm, doubleSize);
if (!doubleSize)
{
Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmpss, n, n, Const((int)CmpCondition.OrderedQ));
nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n);
nRes = context.AddIntrinsic(Intrinsic.X86Roundss, nRes, Const(X86GetRoundControl(roundMode)));
Operand zero = context.VectorZero();
Operand nCmp;
Operand nIntOrLong2 = null;
if (!signed)
{
nCmp = context.AddIntrinsic(Intrinsic.X86Cmpss, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual));
nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp);
}
int fpMaxVal = 0x4F000000; // 2.14748365E9f (2147483648)
Operand fpMaxValMask = X86GetScalar(context, fpMaxVal);
Operand nIntOrLong = context.AddIntrinsicInt(Intrinsic.X86Cvtss2si, nRes);
if (!signed)
{
nRes = context.AddIntrinsic(Intrinsic.X86Subss, nRes, fpMaxValMask);
nCmp = context.AddIntrinsic(Intrinsic.X86Cmpss, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual));
nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp);
nIntOrLong2 = context.AddIntrinsicInt(Intrinsic.X86Cvtss2si, nRes);
}
nRes = context.AddIntrinsic(Intrinsic.X86Cmpss, nRes, fpMaxValMask, Const((int)CmpCondition.NotLessThan));
Operand nInt = context.AddIntrinsicInt(Intrinsic.X86Cvtsi2si, nRes);
Operand dRes;
if (signed)
{
dRes = context.BitwiseExclusiveOr(nIntOrLong, nInt);
}
else
{
dRes = context.BitwiseExclusiveOr(nIntOrLong2, nInt);
dRes = context.Add(dRes, nIntOrLong);
}
InsertScalar(context, op.Vd, dRes);
}
else
{
Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmpsd, n, n, Const((int)CmpCondition.OrderedQ));
nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n);
nRes = context.AddIntrinsic(Intrinsic.X86Roundsd, nRes, Const(X86GetRoundControl(roundMode)));
Operand zero = context.VectorZero();
Operand nCmp;
Operand nIntOrLong2 = null;
if (!signed)
{
nCmp = context.AddIntrinsic(Intrinsic.X86Cmpsd, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual));
nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp);
}
long fpMaxVal = 0x41E0000000000000L; // 2147483648.0000000d (2147483648)
Operand fpMaxValMask = X86GetScalar(context, fpMaxVal);
Operand nIntOrLong = context.AddIntrinsicInt(Intrinsic.X86Cvtsd2si, nRes);
if (!signed)
{
nRes = context.AddIntrinsic(Intrinsic.X86Subsd, nRes, fpMaxValMask);
nCmp = context.AddIntrinsic(Intrinsic.X86Cmpsd, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual));
nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp);
nIntOrLong2 = context.AddIntrinsicInt(Intrinsic.X86Cvtsd2si, nRes);
}
nRes = context.AddIntrinsic(Intrinsic.X86Cmpsd, nRes, fpMaxValMask, Const((int)CmpCondition.NotLessThan));
Operand nLong = context.AddIntrinsicLong(Intrinsic.X86Cvtsi2si, nRes);
nLong = context.ConvertI64ToI32(nLong);
Operand dRes;
if (signed)
{
dRes = context.BitwiseExclusiveOr(nIntOrLong, nLong);
}
else
{
dRes = context.BitwiseExclusiveOr(nIntOrLong2, nLong);
dRes = context.Add(dRes, nIntOrLong);
}
InsertScalar(context, op.Vd, dRes);
}
}
private static void EmitSse41ConvertVector32(ArmEmitterContext context, FPRoundingMode roundMode, bool signed)
{
OpCode32Simd op = (OpCode32Simd)context.CurrOp;
EmitVectorUnaryOpSimd32(context, (n) =>
{
int sizeF = op.Size & 1;
if (sizeF == 0)
{
Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmpps, n, n, Const((int)CmpCondition.OrderedQ));
nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n);
nRes = context.AddIntrinsic(Intrinsic.X86Roundps, nRes, Const(X86GetRoundControl(roundMode)));
Operand zero = context.VectorZero();
Operand nCmp;
if (!signed)
{
nCmp = context.AddIntrinsic(Intrinsic.X86Cmpps, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual));
nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp);
}
Operand fpMaxValMask = X86GetAllElements(context, 0x4F000000); // 2.14748365E9f (2147483648)
Operand nInt = context.AddIntrinsic(Intrinsic.X86Cvtps2dq, nRes);
Operand nInt2 = null;
if (!signed)
{
nRes = context.AddIntrinsic(Intrinsic.X86Subps, nRes, fpMaxValMask);
nCmp = context.AddIntrinsic(Intrinsic.X86Cmpps, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual));
nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp);
nInt2 = context.AddIntrinsic(Intrinsic.X86Cvtps2dq, nRes);
}
nRes = context.AddIntrinsic(Intrinsic.X86Cmpps, nRes, fpMaxValMask, Const((int)CmpCondition.NotLessThan));
if (signed)
{
return context.AddIntrinsic(Intrinsic.X86Pxor, nInt, nRes);
}
else
{
Operand dRes = context.AddIntrinsic(Intrinsic.X86Pxor, nInt2, nRes);
return context.AddIntrinsic(Intrinsic.X86Paddd, dRes, nInt);
}
}
else /* if (sizeF == 1) */
{
Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmppd, n, n, Const((int)CmpCondition.OrderedQ));
nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n);
nRes = context.AddIntrinsic(Intrinsic.X86Roundpd, nRes, Const(X86GetRoundControl(roundMode)));
Operand zero = context.VectorZero();
Operand nCmp;
if (!signed)
{
nCmp = context.AddIntrinsic(Intrinsic.X86Cmppd, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual));
nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp);
}
Operand fpMaxValMask = X86GetAllElements(context, 0x43E0000000000000L); // 9.2233720368547760E18d (9223372036854775808)
Operand nLong = InstEmit.EmitSse2CvtDoubleToInt64OpF(context, nRes, false);
Operand nLong2 = null;
if (!signed)
{
nRes = context.AddIntrinsic(Intrinsic.X86Subpd, nRes, fpMaxValMask);
nCmp = context.AddIntrinsic(Intrinsic.X86Cmppd, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual));
nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp);
nLong2 = InstEmit.EmitSse2CvtDoubleToInt64OpF(context, nRes, false);
}
nRes = context.AddIntrinsic(Intrinsic.X86Cmppd, nRes, fpMaxValMask, Const((int)CmpCondition.NotLessThan));
if (signed)
{
return context.AddIntrinsic(Intrinsic.X86Pxor, nLong, nRes);
}
else
{
Operand dRes = context.AddIntrinsic(Intrinsic.X86Pxor, nLong2, nRes);
return context.AddIntrinsic(Intrinsic.X86Paddq, dRes, nLong);
}
}
});
}
}
}