Implement Fast Paths for most A32 SIMD instructions (#952)

* Begin work on A32 SIMD Intrinsics * More instructions, some cleanup. * Intrinsics for Move instructions (zip etc) These pass the existing tests. * Intrinsics for some of Cvt While doing this I noticed that the conversion for int/fp was incorrect in the slow path. I'll fix this in the original repo. * Intrinsics for more Arithmetic instructions. * Intrinsics for Vext * Fix VEXT Intrinsic for double words. * Use InsertPs to move scalar values. * Cleanup, fix VPADD.f32 and VMIN signed integer. * Cleanup, add SSE2 support for scalar insert. Works similarly to the IR scalar insert, but obviously this one works directly on V128. * Minor cleanup. * Enable intrinsic for FP64 to integer conversion. * Address feedback apart from splitting out intrinsic float abs Also: treat bad VREV encodings as undefined rather than throwing in translation. * Move float abs to helper, fix bug with cvt * Rename opc2 & 3 to match A32 docs, use ArgumentOutOfRangeException appropriately. * Get name of variable at compilation rather than string literal. * Use correct double sign mask.
2020-03-05 11:41:33 +11:00 · 2020-03-05 11:41:33 +11:00 · 68e15c1a74
commit 68e15c1a74
parent d9ed827696
12 changed files with 2077 additions and 400 deletions
--- a/ARMeilleure/Instructions/InstEmitSimdCvt32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdCvt32.cs
@ -1,9 +1,11 @@
 using ARMeilleure.Decoders;
 using ARMeilleure.IntermediateRepresentation;
+using ARMeilleure.State;
 using ARMeilleure.Translation;
 using System;
 using System.Diagnostics;

+using static ARMeilleure.Instructions.InstEmitHelper;
 using static ARMeilleure.Instructions.InstEmitSimdHelper;
 using static ARMeilleure.Instructions.InstEmitSimdHelper32;
 using static ARMeilleure.IntermediateRepresentation.OperandHelper;
@ -63,21 +65,56 @@ namespace ARMeilleure.Instructions

            if (toInteger)
            {
-                EmitVectorUnaryOpF32(context, (op1) =>
+                if (Optimizations.UseSse41)
                {
-                    return EmitSaturateFloatToInt(context, op1, unsigned);
-                });
+                    EmitSse41ConvertVector32(context, FPRoundingMode.TowardsZero, !unsigned);
+                }
+                else
+                {
+                    EmitVectorUnaryOpF32(context, (op1) =>
+                    {
+                        return EmitSaturateFloatToInt(context, op1, unsigned);
+                    });
+                }
            }
            else
            {
-                if (unsigned)
+                if (Optimizations.UseSse2)
                {
-                    EmitVectorUnaryOpZx32(context, (op1) => EmitFPConvert(context, op1, floatSize, false));
-                } 
+                    EmitVectorUnaryOpSimd32(context, (n) =>
+                    {
+                        if (unsigned)
+                        {
+                            Operand mask = X86GetAllElements(context, 0x47800000);
+
+                            Operand res = context.AddIntrinsic(Intrinsic.X86Psrld, n, Const(16));
+                            res = context.AddIntrinsic(Intrinsic.X86Cvtdq2ps, res);
+                            res = context.AddIntrinsic(Intrinsic.X86Mulps, res, mask);
+
+                            Operand res2 = context.AddIntrinsic(Intrinsic.X86Pslld, n, Const(16));
+                            res2 = context.AddIntrinsic(Intrinsic.X86Psrld, res2, Const(16));
+                            res2 = context.AddIntrinsic(Intrinsic.X86Cvtdq2ps, res2);
+
+                            return context.AddIntrinsic(Intrinsic.X86Addps, res, res2);
+                        } 
+                        else
+                        {
+                            return context.AddIntrinsic(Intrinsic.X86Cvtdq2ps, n);
+                        }
+                    });
+                }
                else
                {
-                    EmitVectorUnaryOpSx32(context, (op1) => EmitFPConvert(context, op1, floatSize, true));
+                    if (unsigned)
+                    {
+                        EmitVectorUnaryOpZx32(context, (op1) => EmitFPConvert(context, op1, floatSize, false));
+                    }
+                    else
+                    {
+                        EmitVectorUnaryOpSx32(context, (op1) => EmitFPConvert(context, op1, floatSize, true));
+                    }
                }
+
            }
            
        }
@ -123,44 +160,51 @@ namespace ARMeilleure.Instructions
                bool unsigned = (op.Opc2 & 1) == 0;
                bool roundWithFpscr = op.Opc != 1;

-                Operand toConvert = ExtractScalar(context, floatSize, op.Vm);
-
-                Operand asInteger;
-
-                // TODO: Fast Path.
-                if (roundWithFpscr)
+                if (!roundWithFpscr && Optimizations.UseSse41)
                {
-                    // These need to get the FPSCR value, so it's worth noting we'd need to do a c# call at some point.
-                    if (floatSize == OperandType.FP64)
-                    {
-                        if (unsigned)
-                        {
-                            asInteger = context.Call(new _U32_F64(SoftFallback.DoubleToUInt32), toConvert);
-                        } 
-                        else
-                        {
-                            asInteger = context.Call(new _S32_F64(SoftFallback.DoubleToInt32), toConvert);
-                        }
-                    } 
-                    else
-                    {
-                        if (unsigned)
-                        {
-                            asInteger = context.Call(new _U32_F32(SoftFallback.FloatToUInt32), toConvert);
-                        } 
-                        else
-                        {
-                            asInteger = context.Call(new _S32_F32(SoftFallback.FloatToInt32), toConvert);
-                        }
-                    }
-                } 
+                    EmitSse41ConvertInt32(context, FPRoundingMode.TowardsZero, !unsigned);
+                }
                else
                {
-                    // Round towards zero.
-                    asInteger = EmitSaturateFloatToInt(context, toConvert, unsigned);
-                }
+                    Operand toConvert = ExtractScalar(context, floatSize, op.Vm);

-                InsertScalar(context, op.Vd, asInteger);
+                    Operand asInteger;
+
+                    // TODO: Fast Path.
+                    if (roundWithFpscr)
+                    {
+                        if (floatSize == OperandType.FP64)
+                        {
+                            if (unsigned)
+                            {
+                                asInteger = context.Call(new _U32_F64(SoftFallback.DoubleToUInt32), toConvert);
+                            }
+                            else
+                            {
+                                asInteger = context.Call(new _S32_F64(SoftFallback.DoubleToInt32), toConvert);
+                            }
+
+                        }
+                        else
+                        {
+                            if (unsigned)
+                            {
+                                asInteger = context.Call(new _U32_F32(SoftFallback.FloatToUInt32), toConvert);
+                            }
+                            else
+                            {
+                                asInteger = context.Call(new _S32_F32(SoftFallback.FloatToInt32), toConvert);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        // Round towards zero.
+                        asInteger = EmitSaturateFloatToInt(context, toConvert, unsigned);
+                    }
+
+                    InsertScalar(context, op.Vd, asInteger);
+                }
            } 
            else
            {
@ -192,6 +236,26 @@ namespace ARMeilleure.Instructions
            return context.Call(dlg, n, Const((int)roundMode));
        }

+        // Translates the two-bit "rm" rounding field into an FPRoundingMode.
+        // Only 0b01, 0b10 and 0b11 are mapped; anything else (0b00 included) throws.
+        private static FPRoundingMode RMToRoundMode(int rm)
+        {
+            switch (rm)
+            {
+                case 0b01:
+                    return FPRoundingMode.ToNearest;
+
+                case 0b10:
+                    return FPRoundingMode.TowardsPlusInfinity;
+
+                case 0b11:
+                    return FPRoundingMode.TowardsMinusInfinity;
+
+                default:
+                    throw new ArgumentOutOfRangeException(nameof(rm));
+            }
+        }
+
        public static void Vcvt_R(ArmEmitterContext context)
        {
            OpCode32SimdCvtFI op = (OpCode32SimdCvtFI)context.CurrOp;
@ -199,30 +263,38 @@ namespace ARMeilleure.Instructions
            OperandType floatSize = op.RegisterSize == RegisterSize.Int64 ? OperandType.FP64 : OperandType.FP32;

            bool unsigned = (op.Opc & 1) == 0;
+            int rm = op.Opc2 & 3;

-            Operand toConvert = ExtractScalar(context, floatSize, op.Vm);
-
-            switch (op.Opc2)
+            if (Optimizations.UseSse41 && rm != 0b00)
            {
-                case 0b00: // Away
-                    toConvert = EmitRoundMathCall(context, MidpointRounding.AwayFromZero, toConvert);
-                    break;
-                case 0b01: // Nearest
-                    toConvert = EmitRoundMathCall(context, MidpointRounding.ToEven, toConvert);
-                    break;
-                case 0b10: // Towards positive infinity
-                    toConvert = EmitUnaryMathCall(context, MathF.Ceiling, Math.Ceiling, toConvert);
-                    break;
-                case 0b11: // Towards negative infinity
-                    toConvert = EmitUnaryMathCall(context, MathF.Floor, Math.Floor, toConvert);
-                    break;
+                EmitSse41ConvertInt32(context, RMToRoundMode(rm), !unsigned);
            }
+            else
+            {
+                Operand toConvert = ExtractScalar(context, floatSize, op.Vm);

-            Operand asInteger;
+                switch (rm)
+                {
+                    case 0b00: // Away
+                        toConvert = EmitRoundMathCall(context, MidpointRounding.AwayFromZero, toConvert);
+                        break;
+                    case 0b01: // Nearest
+                        toConvert = EmitRoundMathCall(context, MidpointRounding.ToEven, toConvert);
+                        break;
+                    case 0b10: // Towards positive infinity
+                        toConvert = EmitUnaryMathCall(context, MathF.Ceiling, Math.Ceiling, toConvert);
+                        break;
+                    case 0b11: // Towards negative infinity
+                        toConvert = EmitUnaryMathCall(context, MathF.Floor, Math.Floor, toConvert);
+                        break;
+                }

-            asInteger = EmitSaturateFloatToInt(context, toConvert, unsigned);
+                Operand asInteger;

-            InsertScalar(context, op.Vd, asInteger);
+                asInteger = EmitSaturateFloatToInt(context, toConvert, unsigned);
+
+                InsertScalar(context, op.Vd, asInteger);
+            }
        }

        public static void Vrint_RM(ArmEmitterContext context)
@ -231,30 +303,59 @@ namespace ARMeilleure.Instructions

            OperandType floatSize = op.RegisterSize == RegisterSize.Int64 ? OperandType.FP64 : OperandType.FP32;

-            Operand toConvert = ExtractScalar(context, floatSize, op.Vm);
+            int rm = op.Opc2 & 3;

-            switch (op.Opc2)
+            if (Optimizations.UseSse2 && rm != 0b00)
            {
-                case 0b00: // Away
-                    toConvert = EmitRoundMathCall(context, MidpointRounding.AwayFromZero, toConvert);
-                    break;
-                case 0b01: // Nearest
-                    toConvert = EmitRoundMathCall(context, MidpointRounding.ToEven, toConvert);
-                    break;
-                case 0b10: // Towards positive infinity
-                    toConvert = EmitUnaryMathCall(context, MathF.Ceiling, Math.Ceiling, toConvert);
-                    break;
-                case 0b11: // Towards negative infinity
-                    toConvert = EmitUnaryMathCall(context, MathF.Floor, Math.Floor, toConvert);
-                    break;
-            }
+                EmitScalarUnaryOpSimd32(context, (m) =>
+                {
+                    Intrinsic inst = (op.Size & 1) == 0 ? Intrinsic.X86Roundss : Intrinsic.X86Roundsd;

-            InsertScalar(context, op.Vd, toConvert);
+                    FPRoundingMode roundMode = RMToRoundMode(rm);
+
+                    return context.AddIntrinsic(inst, m, Const(X86GetRoundControl(roundMode)));
+                });
+            }
+            else 
+            {
+                Operand toConvert = ExtractScalar(context, floatSize, op.Vm);
+
+                switch (rm)
+                {
+                    case 0b00: // Away
+                        toConvert = EmitRoundMathCall(context, MidpointRounding.AwayFromZero, toConvert);
+                        break;
+                    case 0b01: // Nearest
+                        toConvert = EmitRoundMathCall(context, MidpointRounding.ToEven, toConvert);
+                        break;
+                    case 0b10: // Towards positive infinity
+                        toConvert = EmitUnaryMathCall(context, MathF.Ceiling, Math.Ceiling, toConvert);
+                        break;
+                    case 0b11: // Towards negative infinity
+                        toConvert = EmitUnaryMathCall(context, MathF.Floor, Math.Floor, toConvert);
+                        break;
+                }
+
+                InsertScalar(context, op.Vd, toConvert);
+            }
        }

        public static void Vrint_Z(ArmEmitterContext context)
        {
-            EmitScalarUnaryOpF32(context, (op1) => EmitUnaryMathCall(context, MathF.Truncate, Math.Truncate, op1));
+            IOpCodeSimd op = (IOpCodeSimd)context.CurrOp;
+
+            if (Optimizations.UseSse2)
+            {
+                EmitScalarUnaryOpSimd32(context, (m) =>
+                {
+                    Intrinsic inst = (op.Size & 1) == 0 ? Intrinsic.X86Roundss : Intrinsic.X86Roundsd;
+                    return context.AddIntrinsic(inst, m, Const(X86GetRoundControl(FPRoundingMode.TowardsZero)));
+                });
+            } 
+            else
+            {
+                EmitScalarUnaryOpF32(context, (op1) => EmitUnaryMathCall(context, MathF.Truncate, Math.Truncate, op1));
+            }
        }

        private static Operand EmitFPConvert(ArmEmitterContext context, Operand value, OperandType type, bool signed)
@ -270,5 +371,211 @@ namespace ARMeilleure.Instructions
                return context.ConvertToFPUI(type, value);
            }
        }
+
+        private static void EmitSse41ConvertInt32(ArmEmitterContext context, FPRoundingMode roundMode, bool signed)
+        {
+            // A port of the similar round function in InstEmitSimdCvt: rounds the scalar in Vm using roundMode, converts it to a 32-bit integer (signed or unsigned per the flag) and inserts the result into Vd.
+            OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;
+
+            bool doubleSize = (op.Size & 1) != 0; // Low bit of Size selects the double-precision path.
+            int shift = doubleSize ? 1 : 2; // Vm >> shift is the vector register index (presumably 2 doubles / 4 singles per vector).
+            Operand n = GetVecA32(op.Vm >> shift);
+            n = EmitSwapScalar(context, n, op.Vm, doubleSize); // NOTE(review): presumably brings scalar Vm into the low lane - confirm in the helper.
+
+            if (!doubleSize)
+            {
+                Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmpss, n, n, Const((int)CmpCondition.OrderedQ)); // Ordered self-compare: mask is clear only when n is NaN.
+                nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n); // A NaN input becomes zero.
+
+                nRes = context.AddIntrinsic(Intrinsic.X86Roundss, nRes, Const(X86GetRoundControl(roundMode))); // Round with the requested mode before converting.
+
+                Operand zero = context.VectorZero();
+
+                Operand nCmp;
+                Operand nIntOrLong2 = null; // Only assigned and used on the unsigned path.
+                if (!signed)
+                {
+                    nCmp = context.AddIntrinsic(Intrinsic.X86Cmpss, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual)); // Mask is set where nRes is not <= 0.
+                    nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); // Non-positive values become zero.
+                }
+
+                int fpMaxVal = 0x4F000000; // 2.14748365E9f (2147483648), i.e. 2^31 as a float bit pattern
+
+                Operand fpMaxValMask = X86GetScalar(context, fpMaxVal);
+
+                Operand nIntOrLong = context.AddIntrinsicInt(Intrinsic.X86Cvtss2si, nRes); // First conversion of the rounded value.
+
+                if (!signed)
+                {
+                    nRes = context.AddIntrinsic(Intrinsic.X86Subss, nRes, fpMaxValMask); // Unsigned only: rebase by 2^31 and convert the remainder separately.
+
+                    nCmp = context.AddIntrinsic(Intrinsic.X86Cmpss, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual));
+                    nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); // Remainders that are not > 0 become zero.
+
+                    nIntOrLong2 = context.AddIntrinsicInt(Intrinsic.X86Cvtss2si, nRes);
+                }
+
+                nRes = context.AddIntrinsic(Intrinsic.X86Cmpss, nRes, fpMaxValMask, Const((int)CmpCondition.NotLessThan)); // Mask is set where nRes is not less than 2^31.
+
+                Operand nInt = context.AddIntrinsicInt(Intrinsic.X86Cvtsi2si, nRes); // Reads the mask back as a 32-bit integer.
+
+                Operand dRes;
+                if (signed)
+                {
+                    dRes = context.BitwiseExclusiveOr(nIntOrLong, nInt); // Inverts the converted bits when the mask is set.
+                } 
+                else
+                {
+                    dRes = context.BitwiseExclusiveOr(nIntOrLong2, nInt); // Same fix-up, applied to the rebased part...
+                    dRes = context.Add(dRes, nIntOrLong); // ...then the first conversion is added back in.
+                }
+
+                InsertScalar(context, op.Vd, dRes);
+            }
+            else
+            {
+                Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmpsd, n, n, Const((int)CmpCondition.OrderedQ)); // Ordered self-compare: mask is clear only when n is NaN.
+                nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n); // A NaN input becomes zero.
+
+                nRes = context.AddIntrinsic(Intrinsic.X86Roundsd, nRes, Const(X86GetRoundControl(roundMode))); // Round with the requested mode before converting.
+
+                Operand zero = context.VectorZero();
+
+                Operand nCmp;
+                Operand nIntOrLong2 = null; // Only assigned and used on the unsigned path.
+                if (!signed)
+                {
+                    nCmp = context.AddIntrinsic(Intrinsic.X86Cmpsd, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual)); // Mask is set where nRes is not <= 0.
+                    nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); // Non-positive values become zero.
+                }
+
+                long fpMaxVal = 0x41E0000000000000L; // 2147483648.0000000d (2147483648), i.e. 2^31 as a double bit pattern
+
+                Operand fpMaxValMask = X86GetScalar(context, fpMaxVal);
+
+                Operand nIntOrLong = context.AddIntrinsicInt(Intrinsic.X86Cvtsd2si, nRes); // First conversion of the rounded value (32-bit result).
+
+                if (!signed)
+                {
+                    nRes = context.AddIntrinsic(Intrinsic.X86Subsd, nRes, fpMaxValMask); // Unsigned only: rebase by 2^31 and convert the remainder separately.
+
+                    nCmp = context.AddIntrinsic(Intrinsic.X86Cmpsd, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual));
+                    nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); // Remainders that are not > 0 become zero.
+
+                    nIntOrLong2 = context.AddIntrinsicInt(Intrinsic.X86Cvtsd2si, nRes);
+                }
+
+                nRes = context.AddIntrinsic(Intrinsic.X86Cmpsd, nRes, fpMaxValMask, Const((int)CmpCondition.NotLessThan)); // Mask is set where nRes is not less than 2^31.
+
+                Operand nLong = context.AddIntrinsicLong(Intrinsic.X86Cvtsi2si, nRes); // Reads the 64-bit mask back as an integer.
+                nLong = context.ConvertI64ToI32(nLong); // Narrowed to 32 bits to match the converted value.
+
+                Operand dRes;
+                if (signed)
+                {
+                    dRes = context.BitwiseExclusiveOr(nIntOrLong, nLong); // Inverts the converted bits when the mask is set.
+                }
+                else
+                {
+                    dRes = context.BitwiseExclusiveOr(nIntOrLong2, nLong); // Same fix-up, applied to the rebased part...
+                    dRes = context.Add(dRes, nIntOrLong); // ...then the first conversion is added back in.
+                }
+
+                InsertScalar(context, op.Vd, dRes);
+            }
+        }
+
+        private static void EmitSse41ConvertVector32(ArmEmitterContext context, FPRoundingMode roundMode, bool signed)
+        {
+            OpCode32Simd op = (OpCode32Simd)context.CurrOp; // Only op.Size is read here; operands are supplied by EmitVectorUnaryOpSimd32.
+
+            EmitVectorUnaryOpSimd32(context, (n) =>
+            {
+                int sizeF = op.Size & 1; // 0 selects the packed-single path, 1 the packed-double path.
+
+                if (sizeF == 0)
+                {
+                    Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmpps, n, n, Const((int)CmpCondition.OrderedQ)); // Ordered self-compare: mask is clear only in NaN lanes.
+                    nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n); // NaN lanes become zero.
+
+                    nRes = context.AddIntrinsic(Intrinsic.X86Roundps, nRes, Const(X86GetRoundControl(roundMode))); // Round with the requested mode before converting.
+
+                    Operand zero = context.VectorZero();
+                    Operand nCmp;
+                    if (!signed)
+                    {
+                        nCmp = context.AddIntrinsic(Intrinsic.X86Cmpps, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual)); // Mask is set in lanes that are not <= 0.
+                        nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); // Non-positive lanes become zero.
+                    }
+
+                    Operand fpMaxValMask = X86GetAllElements(context, 0x4F000000); // 2.14748365E9f (2147483648), i.e. 2^31 in every lane
+
+                    Operand nInt = context.AddIntrinsic(Intrinsic.X86Cvtps2dq, nRes); // First conversion of the rounded lanes.
+                    Operand nInt2 = null; // Only assigned and used on the unsigned path.
+                    if (!signed)
+                    {
+                        nRes = context.AddIntrinsic(Intrinsic.X86Subps, nRes, fpMaxValMask); // Unsigned only: rebase by 2^31 and convert the remainder separately.
+
+                        nCmp = context.AddIntrinsic(Intrinsic.X86Cmpps, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual));
+                        nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); // Remainders that are not > 0 become zero.
+
+                        nInt2 = context.AddIntrinsic(Intrinsic.X86Cvtps2dq, nRes);
+                    }
+
+                    nRes = context.AddIntrinsic(Intrinsic.X86Cmpps, nRes, fpMaxValMask, Const((int)CmpCondition.NotLessThan)); // Mask is set in lanes that are not less than 2^31.
+
+                    if (signed)
+                    {
+                        return context.AddIntrinsic(Intrinsic.X86Pxor, nInt, nRes); // Inverts the converted bits in masked lanes.
+                    } 
+                    else
+                    {
+                        Operand dRes = context.AddIntrinsic(Intrinsic.X86Pxor, nInt2, nRes); // Same fix-up, applied to the rebased part...
+                        return context.AddIntrinsic(Intrinsic.X86Paddd, dRes, nInt); // ...then the first conversion is added back in.
+                    }
+                }
+                else /* if (sizeF == 1) */ // NOTE(review): this path produces 64-bit integer lanes - confirm which encodings reach it.
+                {
+                    Operand nRes = context.AddIntrinsic(Intrinsic.X86Cmppd, n, n, Const((int)CmpCondition.OrderedQ)); // Ordered self-compare: mask is clear only in NaN lanes.
+                    nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, n); // NaN lanes become zero.
+
+                    nRes = context.AddIntrinsic(Intrinsic.X86Roundpd, nRes, Const(X86GetRoundControl(roundMode))); // Round with the requested mode before converting.
+
+                    Operand zero = context.VectorZero();
+                    Operand nCmp;
+                    if (!signed)
+                    {
+                        nCmp = context.AddIntrinsic(Intrinsic.X86Cmppd, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual)); // Mask is set in lanes that are not <= 0.
+                        nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); // Non-positive lanes become zero.
+                    }
+
+                    Operand fpMaxValMask = X86GetAllElements(context, 0x43E0000000000000L); // 9.2233720368547760E18d (9223372036854775808), i.e. 2^63 in every lane
+
+                    Operand nLong = InstEmit.EmitSse2CvtDoubleToInt64OpF(context, nRes, false); // First conversion of the rounded lanes.
+                    Operand nLong2 = null; // Only assigned and used on the unsigned path.
+                    if (!signed)
+                    {
+                        nRes = context.AddIntrinsic(Intrinsic.X86Subpd, nRes, fpMaxValMask); // Unsigned only: rebase by 2^63 and convert the remainder separately.
+
+                        nCmp = context.AddIntrinsic(Intrinsic.X86Cmppd, nRes, zero, Const((int)CmpCondition.NotLessThanOrEqual));
+                        nRes = context.AddIntrinsic(Intrinsic.X86Pand, nRes, nCmp); // Remainders that are not > 0 become zero.
+
+                        nLong2 = InstEmit.EmitSse2CvtDoubleToInt64OpF(context, nRes, false);
+                    }
+
+                    nRes = context.AddIntrinsic(Intrinsic.X86Cmppd, nRes, fpMaxValMask, Const((int)CmpCondition.NotLessThan)); // Mask is set in lanes that are not less than 2^63.
+
+                    if (signed)
+                    {
+                        return context.AddIntrinsic(Intrinsic.X86Pxor, nLong, nRes); // Inverts the converted bits in masked lanes.
+                    }
+                    else
+                    {
+                        Operand dRes = context.AddIntrinsic(Intrinsic.X86Pxor, nLong2, nRes); // Same fix-up, applied to the rebased part...
+                        return context.AddIntrinsic(Intrinsic.X86Paddq, dRes, nLong); // ...then the first conversion is added back in.
+                    }
+                }
+            });
+        }
    }
 }