Implement Fast Paths for most A32 SIMD instructions (#952)

* Begin work on A32 SIMD Intrinsics * More instructions, some cleanup. * Intrinsics for Move instructions (zip etc) These pass the existing tests. * Intrinsics for some of Cvt While doing this I noticed that the conversion for int/fp was incorrect in the slow path. I'll fix this in the original repo. * Intrinsics for more Arithmetic instructions. * Intrinsics for Vext * Fix VEXT Intrinsic for double words. * Use InsertPs to move scalar values. * Cleanup, fix VPADD.f32 and VMIN signed integer. * Cleanup, add SSE2 support for scalar insert. Works similarly to the IR scalar insert, but obviously this one works directly on V128. * Minor cleanup. * Enable intrinsic for FP64 to integer conversion. * Address feedback apart from splitting out intrinsic float abs Also: treat bad VREV encodings as undefined rather than throwing in translation. * Move float abs to helper, fix bug with cvt * Rename opc2 & 3 to match A32 docs, use ArgumentOutOfRangeException appropriately. * Get name of variable at compilation rather than string literal. * Use correct double sign mask.
2020-03-05 11:41:33 +11:00 · 2020-03-05 11:41:33 +11:00 · 68e15c1a74
commit 68e15c1a74
parent d9ed827696
12 changed files with 2077 additions and 400 deletions
--- a/ARMeilleure/Instructions/InstEmitSimdCmp32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdCmp32.cs
@ -5,6 +5,7 @@ using ARMeilleure.Translation;
 using System;

 using static ARMeilleure.Instructions.InstEmitHelper;
+using static ARMeilleure.Instructions.InstEmitSimdHelper;
 using static ARMeilleure.Instructions.InstEmitSimdHelper32;
 using static ARMeilleure.IntermediateRepresentation.OperandHelper;

@ -16,7 +17,14 @@ namespace ARMeilleure.Instructions
    {
        public static void Vceq_V(ArmEmitterContext context)
        {
-            EmitCmpOpF32(context, SoftFloat32.FPCompareEQFpscr, SoftFloat64.FPCompareEQFpscr, false);
+            if (Optimizations.FastFP && Optimizations.UseSse2) // Fast path is taken only when both the FastFP and UseSse2 optimization flags are set.
+            {
+                EmitSse2CmpOpF32(context, CmpCondition.Equal, false); // zero: false, so the n and m operands are compared with each other.
+            }
+            else // Otherwise keep the SoftFloat-based compare that was used unconditionally before.
+            {
+                EmitCmpOpF32(context, SoftFloat32.FPCompareEQFpscr, SoftFloat64.FPCompareEQFpscr, false);
+            }
        }

        public static void Vceq_I(ArmEmitterContext context)
@ -30,7 +38,14 @@ namespace ARMeilleure.Instructions

            if (op.F)
            {
-                EmitCmpOpF32(context, SoftFloat32.FPCompareEQFpscr, SoftFloat64.FPCompareEQFpscr, true);
+                if (Optimizations.FastFP && Optimizations.UseSse2)
+                {
+                    EmitSse2CmpOpF32(context, CmpCondition.Equal, true);
+                }
+                else
+                {
+                    EmitCmpOpF32(context, SoftFloat32.FPCompareEQFpscr, SoftFloat64.FPCompareEQFpscr, true);
+                }
            }
            else
            {
@ -40,7 +55,14 @@ namespace ARMeilleure.Instructions

        public static void Vcge_V(ArmEmitterContext context)
        {
-            EmitCmpOpF32(context, SoftFloat32.FPCompareGEFpscr, SoftFloat64.FPCompareGEFpscr, false);
+            if (Optimizations.FastFP && Optimizations.UseSse2) // Fast path is taken only when both the FastFP and UseSse2 optimization flags are set.
+            {
+                EmitSse2CmpOpF32(context, CmpCondition.GreaterThanOrEqual, false); // zero: false, so the n and m operands are compared with each other.
+            }
+            else // Otherwise keep the SoftFloat-based compare that was used unconditionally before.
+            {
+                EmitCmpOpF32(context, SoftFloat32.FPCompareGEFpscr, SoftFloat64.FPCompareGEFpscr, false);
+            }
        }

        public static void Vcge_I(ArmEmitterContext context)
@ -56,7 +78,14 @@ namespace ARMeilleure.Instructions

            if (op.F)
            {
-                EmitCmpOpF32(context, SoftFloat32.FPCompareGEFpscr, SoftFloat64.FPCompareGEFpscr, true);
+                if (Optimizations.FastFP && Optimizations.UseSse2)
+                {
+                    EmitSse2CmpOpF32(context, CmpCondition.GreaterThanOrEqual, true);
+                }
+                else
+                {
+                    EmitCmpOpF32(context, SoftFloat32.FPCompareGEFpscr, SoftFloat64.FPCompareGEFpscr, true);
+                }
            } 
            else
            {
@ -66,7 +95,14 @@ namespace ARMeilleure.Instructions

        public static void Vcgt_V(ArmEmitterContext context)
        {
-            EmitCmpOpF32(context, SoftFloat32.FPCompareGTFpscr, SoftFloat64.FPCompareGTFpscr, false);
+            if (Optimizations.FastFP && Optimizations.UseSse2) // Fast path is taken only when both the FastFP and UseSse2 optimization flags are set.
+            {
+                EmitSse2CmpOpF32(context, CmpCondition.GreaterThan, false); // zero: false, so the n and m operands are compared with each other.
+            }
+            else // Otherwise keep the SoftFloat-based compare that was used unconditionally before.
+            {
+                EmitCmpOpF32(context, SoftFloat32.FPCompareGTFpscr, SoftFloat64.FPCompareGTFpscr, false);
+            }
        }

        public static void Vcgt_I(ArmEmitterContext context)
@ -82,7 +118,14 @@ namespace ARMeilleure.Instructions

            if (op.F)
            {
-                EmitCmpOpF32(context, SoftFloat32.FPCompareGTFpscr, SoftFloat64.FPCompareGTFpscr, true);
+                if (Optimizations.FastFP && Optimizations.UseSse2)
+                {
+                    EmitSse2CmpOpF32(context, CmpCondition.GreaterThan, true);
+                }
+                else
+                {
+                    EmitCmpOpF32(context, SoftFloat32.FPCompareGTFpscr, SoftFloat64.FPCompareGTFpscr, true);
+                }
            }
            else
            {
@ -96,7 +139,14 @@ namespace ARMeilleure.Instructions

            if (op.F)
            {
-                EmitCmpOpF32(context, SoftFloat32.FPCompareLEFpscr, SoftFloat64.FPCompareLEFpscr, true);
+                if (Optimizations.FastFP && Optimizations.UseSse2)
+                {
+                    EmitSse2CmpOpF32(context, CmpCondition.LessThanOrEqual, true);
+                }
+                else
+                {
+                    EmitCmpOpF32(context, SoftFloat32.FPCompareLEFpscr, SoftFloat64.FPCompareLEFpscr, true);
+                }
            }
            else
            {
@ -110,7 +160,14 @@ namespace ARMeilleure.Instructions

            if (op.F)
            {
-                EmitCmpOpF32(context, SoftFloat32.FPCompareLTFpscr, SoftFloat64.FPCompareLTFpscr, true);
+                if (Optimizations.FastFP && Optimizations.UseSse2)
+                {
+                    EmitSse2CmpOpF32(context, CmpCondition.LessThan, true);
+                }
+                else
+                {
+                    EmitCmpOpF32(context, SoftFloat32.FPCompareLTFpscr, SoftFloat64.FPCompareLTFpscr, true);
+                }
            }
            else
            {
@ -224,23 +281,77 @@ namespace ARMeilleure.Instructions
            OpCode32SimdS op = (OpCode32SimdS)context.CurrOp;

            bool cmpWithZero = (op.Opc & 2) != 0;
+            int sizeF = op.Size & 1;
+
+            if (Optimizations.FastFP && (signalNaNs ? Optimizations.UseAvx : Optimizations.UseSse2))
            {
-                int fSize = op.Size & 1;
-                OperandType type = fSize != 0 ? OperandType.FP64 : OperandType.FP32;
+                CmpCondition cmpOrdered = signalNaNs ? CmpCondition.OrderedS : CmpCondition.OrderedQ;
+
+                bool doubleSize = sizeF != 0;
+                int shift = doubleSize ? 1 : 2;
+                Operand m = GetVecA32(op.Vm >> shift);
+                Operand n = GetVecA32(op.Vd >> shift);
+
+                n = EmitSwapScalar(context, n, op.Vd, doubleSize);
+                m = cmpWithZero ? context.VectorZero() : EmitSwapScalar(context, m, op.Vm, doubleSize);
+
+                Operand lblNaN = Label();
+                Operand lblEnd = Label();
+
+                if (!doubleSize)
+                {
+                    Operand ordMask = context.AddIntrinsic(Intrinsic.X86Cmpss, n, m, Const((int)cmpOrdered));
+
+                    Operand isOrdered = context.AddIntrinsicInt(Intrinsic.X86Cvtsi2si, ordMask);
+
+                    context.BranchIfFalse(lblNaN, isOrdered);
+
+                    Operand cf = context.AddIntrinsicInt(Intrinsic.X86Comissge, n, m);
+                    Operand zf = context.AddIntrinsicInt(Intrinsic.X86Comisseq, n, m);
+                    Operand nf = context.AddIntrinsicInt(Intrinsic.X86Comisslt, n, m);
+
+                    EmitSetFPSCRFlags(context, nf, zf, cf, Const(0));
+                }
+                else
+                {
+                    Operand ordMask = context.AddIntrinsic(Intrinsic.X86Cmpsd, n, m, Const((int)cmpOrdered));
+
+                    Operand isOrdered = context.AddIntrinsicLong(Intrinsic.X86Cvtsi2si, ordMask);
+
+                    context.BranchIfFalse(lblNaN, isOrdered);
+
+                    Operand cf = context.AddIntrinsicInt(Intrinsic.X86Comisdge, n, m);
+                    Operand zf = context.AddIntrinsicInt(Intrinsic.X86Comisdeq, n, m);
+                    Operand nf = context.AddIntrinsicInt(Intrinsic.X86Comisdlt, n, m);
+
+                    EmitSetFPSCRFlags(context, nf, zf, cf, Const(0));
+                }
+
+                context.Branch(lblEnd);
+
+                context.MarkLabel(lblNaN);
+
+                EmitSetFPSCRFlags(context, Const(3));
+
+                context.MarkLabel(lblEnd);
+            }
+            else
+            {
+                OperandType type = sizeF != 0 ? OperandType.FP64 : OperandType.FP32;

                Operand ne = ExtractScalar(context, type, op.Vd);
                Operand me;

                if (cmpWithZero)
                {
-                    me = fSize == 0 ? ConstF(0f) : ConstF(0d);
+                    me = sizeF == 0 ? ConstF(0f) : ConstF(0d);
                }
                else
                {
                    me = ExtractScalar(context, type, op.Vm);
                }

-                Delegate dlg = fSize != 0
+                Delegate dlg = sizeF != 0
                    ? (Delegate)new _S32_F64_F64_Bool(SoftFloat64.FPCompare)
                    : (Delegate)new _S32_F32_F32_Bool(SoftFloat32.FPCompare);

@ -269,5 +380,36 @@ namespace ARMeilleure.Instructions
            SetFpFlag(context, FPState.ZFlag, Extract(nzcv, 2));
            SetFpFlag(context, FPState.NFlag, Extract(nzcv, 3));
        }
+
+        private static void EmitSetFPSCRFlags(ArmEmitterContext context, Operand n, Operand z, Operand c, Operand v) // Overload that stores four separately supplied flag operands into the FP state flags.
+        {
+            SetFpFlag(context, FPState.VFlag, v); // Each flag is written directly from its own operand; no bit extraction happens in this overload.
+            SetFpFlag(context, FPState.CFlag, c);
+            SetFpFlag(context, FPState.ZFlag, z);
+            SetFpFlag(context, FPState.NFlag, n);
+        }
+
+        private static void EmitSse2CmpOpF32(ArmEmitterContext context, CmpCondition cond, bool zero) // Emits a vector FP compare as one x86 compare intrinsic; 'zero' selects the compare-against-zero form.
+        {
+            OpCode32Simd op = (OpCode32Simd)context.CurrOp;
+
+            int sizeF = op.Size & 1; // Low bit of Size picks the intrinsic on the next line: 0 -> X86Cmpps, 1 -> X86Cmppd.
+            Intrinsic inst = (sizeF == 0) ? Intrinsic.X86Cmpps : Intrinsic.X86Cmppd;
+
+            if (zero) // Unary form: the single operand m is compared against a zero vector.
+            {
+                EmitVectorUnaryOpSimd32(context, (m) =>
+                {
+                    return context.AddIntrinsic(inst, m, context.VectorZero(), Const((int)cond)); // cond is passed as the intrinsic's constant third operand.
+                });
+            }
+            else // Binary form: operands n and m are compared with each other.
+            {
+                EmitVectorBinaryOpSimd32(context, (n, m) =>
+                {
+                    return context.AddIntrinsic(inst, n, m, Const((int)cond));
+                });
+            }
+        }
    }
 }