Add Tbx Inst. (fast & slow paths), with Tests. (#782)

* Update OpCodeTable.cs

* Update InstName.cs

* Update InstEmitSimdMove.cs

* Update SoftFallback.cs

* Update DelegateTypes.cs

* Update CpuTestSimdTbl.cs

* Update CpuTest.cs

* Update Ryujinx.Tests.csproj

* Nit.
This commit is contained in:
LDj3SNuD 2019-10-04 16:43:20 +02:00 committed by gdkchan
parent 92e5e3c505
commit 16869402bf
8 changed files with 255 additions and 192 deletions

View file

@ -518,6 +518,7 @@ namespace ARMeilleure.Decoders
SetA64("01011110xx100000001110xxxxxxxxxx", InstName.Suqadd_S, InstEmit.Suqadd_S, typeof(OpCodeSimd));
SetA64("0>001110<<100000001110xxxxxxxxxx", InstName.Suqadd_V, InstEmit.Suqadd_V, typeof(OpCodeSimd));
// Tbl_V and Tbx_V are both decoded as OpCodeSimdTbl; their encoding patterns are identical
// except for a single position ('0' for Tbl, '1' for Tbx), which is bit 12 if the pattern
// string is written most-significant-bit first (NOTE(review): confirm bit order in SetA64).
SetA64("0x001110000xxxxx0xx000xxxxxxxxxx", InstName.Tbl_V, InstEmit.Tbl_V, typeof(OpCodeSimdTbl));
SetA64("0x001110000xxxxx0xx100xxxxxxxxxx", InstName.Tbx_V, InstEmit.Tbx_V, typeof(OpCodeSimdTbl));
SetA64("0>001110<<0xxxxx001010xxxxxxxxxx", InstName.Trn1_V, InstEmit.Trn1_V, typeof(OpCodeSimdReg));
SetA64("0>001110<<0xxxxx011010xxxxxxxxxx", InstName.Trn2_V, InstEmit.Trn2_V, typeof(OpCodeSimdReg));
SetA64("0x101110<<1xxxxx011111xxxxxxxxxx", InstName.Uaba_V, InstEmit.Uaba_V, typeof(OpCodeSimdReg));

View file

@ -61,11 +61,17 @@ namespace ARMeilleure.Instructions
// Naming scheme used by the delegates below: the name lists the return type first,
// followed by each parameter type in order, separated by underscores
// (S32 = int, U32 = uint, U64 = ulong, U8 = byte, Void = void).
// The "..._S32_V128..." families take an int followed by one to four V128 values;
// the variants with one extra leading V128 carry an additional vector as first argument.
delegate V128 _V128_U64(ulong a1);
delegate V128 _V128_V128(V128 a1);
delegate V128 _V128_V128_S32_V128(V128 a1, int a2, V128 a3);
delegate V128 _V128_V128_S32_V128_V128(V128 a1, int a2, V128 a3, V128 a4);
delegate V128 _V128_V128_S32_V128_V128_V128(V128 a1, int a2, V128 a3, V128 a4, V128 a5);
delegate V128 _V128_V128_S32_V128_V128_V128_V128(V128 a1, int a2, V128 a3, V128 a4, V128 a5, V128 a6);
delegate V128 _V128_V128_U32_V128(V128 a1, uint a2, V128 a3);
delegate V128 _V128_V128_V128(V128 a1, V128 a2);
delegate V128 _V128_V128_V128_S32_V128(V128 a1, V128 a2, int a3, V128 a4);
delegate V128 _V128_V128_V128_S32_V128_V128(V128 a1, V128 a2, int a3, V128 a4, V128 a5);
delegate V128 _V128_V128_V128_S32_V128_V128_V128(V128 a1, V128 a2, int a3, V128 a4, V128 a5, V128 a6);
delegate V128 _V128_V128_V128_S32_V128_V128_V128_V128(V128 a1, V128 a2, int a3, V128 a4, V128 a5, V128 a6, V128 a7);
delegate V128 _V128_V128_V128_V128(V128 a1, V128 a2, V128 a3);
delegate V128 _V128_V128_V128_V128_V128(V128 a1, V128 a2, V128 a3, V128 a4);
delegate V128 _V128_V128_V128_V128_V128_V128(V128 a1, V128 a2, V128 a3, V128 a4, V128 a5);
delegate void _Void();
delegate void _Void_U64(ulong a1);
@ -75,4 +81,4 @@ namespace ARMeilleure.Instructions
delegate void _Void_U64_U64(ulong a1, ulong a2);
delegate void _Void_U64_U8(ulong a1, byte a2);
delegate void _Void_U64_V128(ulong a1, V128 a2);
}
}

View file

@ -2,6 +2,7 @@ using ARMeilleure.Decoders;
using ARMeilleure.IntermediateRepresentation;
using ARMeilleure.Translation;
using System;
using System.Collections.Generic;
using static ARMeilleure.Instructions.InstEmitHelper;
using static ARMeilleure.Instructions.InstEmitSimdHelper;
@ -384,79 +385,12 @@ namespace ARMeilleure.Instructions
public static void Tbl_V(ArmEmitterContext context)
{
OpCodeSimdTbl op = (OpCodeSimdTbl)context.CurrOp;
EmitTableVectorLookup(context, isTbl: true);
}
if (Optimizations.UseSsse3)
{
Operand n = GetVec(op.Rn);
Operand m = GetVec(op.Rm);
Operand mask = X86GetAllElements(context, 0x0F0F0F0F0F0F0F0FL);
Operand mMask = context.AddIntrinsic(Intrinsic.X86Pcmpgtb, m, mask);
mMask = context.AddIntrinsic(Intrinsic.X86Por, mMask, m);
Operand res = context.AddIntrinsic(Intrinsic.X86Pshufb, n, mMask);
for (int index = 1; index < op.Size; index++)
{
Operand ni = GetVec((op.Rn + index) & 0x1f);
Operand indexMask = X86GetAllElements(context, 0x1010101010101010L * index);
Operand mMinusMask = context.AddIntrinsic(Intrinsic.X86Psubb, m, indexMask);
Operand mMask2 = context.AddIntrinsic(Intrinsic.X86Pcmpgtb, mMinusMask, mask);
mMask2 = context.AddIntrinsic(Intrinsic.X86Por, mMask2, mMinusMask);
Operand res2 = context.AddIntrinsic(Intrinsic.X86Pshufb, ni, mMask2);
res = context.AddIntrinsic(Intrinsic.X86Por, res, res2);
}
if (op.RegisterSize == RegisterSize.Simd64)
{
res = context.VectorZeroUpper64(res);
}
context.Copy(GetVec(op.Rd), res);
}
else
{
Operand[] args = new Operand[1 + op.Size];
args[0] = GetVec(op.Rm);
for (int index = 0; index < op.Size; index++)
{
args[1 + index] = GetVec((op.Rn + index) & 0x1f);
}
Delegate dlg = null;
switch (op.Size)
{
case 1: dlg = op.RegisterSize == RegisterSize.Simd64
? (Delegate)new _V128_V128_V128(SoftFallback.Tbl1_V64)
: (Delegate)new _V128_V128_V128(SoftFallback.Tbl1_V128); break;
case 2: dlg = op.RegisterSize == RegisterSize.Simd64
? (Delegate)new _V128_V128_V128_V128(SoftFallback.Tbl2_V64)
: (Delegate)new _V128_V128_V128_V128(SoftFallback.Tbl2_V128); break;
case 3: dlg = op.RegisterSize == RegisterSize.Simd64
? (Delegate)new _V128_V128_V128_V128_V128(SoftFallback.Tbl3_V64)
: (Delegate)new _V128_V128_V128_V128_V128(SoftFallback.Tbl3_V128); break;
case 4: dlg = op.RegisterSize == RegisterSize.Simd64
? (Delegate)new _V128_V128_V128_V128_V128_V128(SoftFallback.Tbl4_V64)
: (Delegate)new _V128_V128_V128_V128_V128_V128(SoftFallback.Tbl4_V128); break;
}
context.Copy(GetVec(op.Rd), context.Call(dlg, args));
}
// Emitter entry point for Tbx_V: forwards to the shared table-lookup emitter
// with isTbl set to false so that the Tbx variant is generated.
public static void Tbx_V(ArmEmitterContext context)
    => EmitTableVectorLookup(context, isTbl: false);
public static void Trn1_V(ArmEmitterContext context)
@ -577,6 +511,116 @@ namespace ARMeilleure.Instructions
context.Copy(GetVec(op.Rd), mask);
}
// Shared emitter for the table vector lookup instructions.
// isTbl == true emits the Tbl variant, isTbl == false emits the Tbx variant.
// The two variants differ only in how out-of-range index bytes are handled here:
// for Tbx the corresponding bytes of the destination register (Rd) are merged into the result,
// for Tbl nothing is merged. The table consists of op.Size consecutive vector registers
// starting at Rn (register numbers wrap with "& 0x1F"); Rm holds the index bytes.
// When Optimizations.UseSsse3 is set the lookup is emitted inline with x86 intrinsics,
// otherwise a call to a SoftFallback.TblN / SoftFallback.TbxN helper is emitted.
private static void EmitTableVectorLookup(ArmEmitterContext context, bool isTbl)
{
OpCodeSimdTbl op = (OpCodeSimdTbl)context.CurrOp;
if (Optimizations.UseSsse3)
{
Operand d = GetVec(op.Rd);
Operand m = GetVec(op.Rm);
Operand res;
// 0x0F in every byte: the highest index that can address a single 16-byte table register.
Operand mask = X86GetAllElements(context, 0x0F0F0F0F0F0F0F0FL);
// Fast path for single register table.
// Index bytes greater than 0x0F (signed compare) are turned into 0xFF and OR-ed back into the
// indices, so their top bit is set; PSHUFB is documented to write zero for selector bytes
// whose top bit is set, so every index outside 0..15 selects zero from the first register.
{
Operand n = GetVec(op.Rn);
Operand mMask = context.AddIntrinsic(Intrinsic.X86Pcmpgtb, m, mask);
mMask = context.AddIntrinsic(Intrinsic.X86Por, mMask, m);
res = context.AddIntrinsic(Intrinsic.X86Pshufb, n, mMask);
}
// Remaining table registers: rebase the indices by subtracting 16 * index from every byte so
// that the indices belonging to this register land in 0..15, apply the same masking, and OR
// the partial lookup into the accumulated result.
for (int index = 1; index < op.Size; index++)
{
Operand ni = GetVec((op.Rn + index) & 0x1F);
Operand idxMask = X86GetAllElements(context, 0x1010101010101010L * index);
Operand mSubMask = context.AddIntrinsic(Intrinsic.X86Psubb, m, idxMask);
Operand mMask = context.AddIntrinsic(Intrinsic.X86Pcmpgtb, mSubMask, mask);
mMask = context.AddIntrinsic(Intrinsic.X86Por, mMask, mSubMask);
Operand res2 = context.AddIntrinsic(Intrinsic.X86Pshufb, ni, mMask);
res = context.AddIntrinsic(Intrinsic.X86Por, res, res2);
}
// Tbx only: build a byte mask of the out-of-range indices and use it to merge the
// corresponding destination bytes into the result.
if (!isTbl)
{
// (16 * op.Size) - 1 in every byte, i.e. the last valid table index.
Operand idxMask = X86GetAllElements(context, (0x1010101010101010L * op.Size) - 0x0101010101010101L);
Operand zeroMask = context.VectorZero();
// Indices above the last valid index (signed compare)...
Operand mPosMask = context.AddIntrinsic(Intrinsic.X86Pcmpgtb, m, idxMask);
// ...and indices that are negative as signed bytes (top bit set).
Operand mNegMask = context.AddIntrinsic(Intrinsic.X86Pcmpgtb, zeroMask, m);
Operand mMask = context.AddIntrinsic(Intrinsic.X86Por, mPosMask, mNegMask);
// Keep only the destination bytes at out-of-range positions and OR them into the result.
Operand dMask = context.AddIntrinsic(Intrinsic.X86Pand, d, mMask);
res = context.AddIntrinsic(Intrinsic.X86Por, res, dMask);
}
// For the 64-bit register size the result is passed through VectorZeroUpper64 before the copy.
if (op.RegisterSize == RegisterSize.Simd64)
{
res = context.VectorZeroUpper64(res);
}
context.Copy(d, res);
}
else
{
Operand d = GetVec(op.Rd);
// Argument order must match the SoftFallback signatures:
// [dest (Tbx only)], index vector, byte count, table register(s).
List<Operand> args = new List<Operand>();
if (!isTbl)
{
args.Add(d);
}
args.Add(GetVec(op.Rm));
// Byte count passed to the helper: 8 for the 64-bit register size, 16 otherwise.
args.Add(Const(op.RegisterSize == RegisterSize.Simd64 ? 8 : 16));
for (int index = 0; index < op.Size; index++)
{
args.Add(GetVec((op.Rn + index) & 0x1F));
}
// Pick the helper whose arity matches the number of table registers.
// NOTE(review): there is no default case, so dlg stays null if op.Size is outside 1..4;
// presumably OpCodeSimdTbl guarantees that range - confirm against the decoder.
Delegate dlg = null;
switch (op.Size)
{
case 1: dlg = isTbl
? (Delegate)new _V128_V128_S32_V128 (SoftFallback.Tbl1)
: (Delegate)new _V128_V128_V128_S32_V128(SoftFallback.Tbx1);
break;
case 2: dlg = isTbl
? (Delegate)new _V128_V128_S32_V128_V128 (SoftFallback.Tbl2)
: (Delegate)new _V128_V128_V128_S32_V128_V128(SoftFallback.Tbx2);
break;
case 3: dlg = isTbl
? (Delegate)new _V128_V128_S32_V128_V128_V128 (SoftFallback.Tbl3)
: (Delegate)new _V128_V128_V128_S32_V128_V128_V128(SoftFallback.Tbx3);
break;
case 4: dlg = isTbl
? (Delegate)new _V128_V128_S32_V128_V128_V128_V128 (SoftFallback.Tbl4)
: (Delegate)new _V128_V128_V128_S32_V128_V128_V128_V128(SoftFallback.Tbx4);
break;
}
context.Copy(d, context.Call(dlg, args.ToArray()));
}
}
private static void EmitVectorTranspose(ArmEmitterContext context, int part)
{
OpCodeSimdReg op = (OpCodeSimdReg)context.CurrOp;
@ -791,4 +835,4 @@ namespace ARMeilleure.Instructions
}
}
}
}
}

View file

@ -375,6 +375,7 @@ namespace ARMeilleure.Instructions
Suqadd_S,
Suqadd_V,
Tbl_V,
Tbx_V,
Trn1_V,
Trn2_V,
Uaba_V,
@ -456,4 +457,4 @@ namespace ARMeilleure.Instructions
Strd,
Strh
}
}
}

View file

@ -837,49 +837,55 @@ namespace ARMeilleure.Instructions
#endregion
#region "Table"
public static V128 Tbl1_V64(V128 vector, V128 tb0)
public static V128 Tbl1(V128 vector, int bytes, V128 tb0)
{
return Tbl(vector, 8, tb0);
return TblOrTbx(default, vector, bytes, tb0);
}
public static V128 Tbl1_V128(V128 vector, V128 tb0)
public static V128 Tbl2(V128 vector, int bytes, V128 tb0, V128 tb1)
{
return Tbl(vector, 16, tb0);
return TblOrTbx(default, vector, bytes, tb0, tb1);
}
public static V128 Tbl2_V64(V128 vector, V128 tb0, V128 tb1)
public static V128 Tbl3(V128 vector, int bytes, V128 tb0, V128 tb1, V128 tb2)
{
return Tbl(vector, 8, tb0, tb1);
return TblOrTbx(default, vector, bytes, tb0, tb1, tb2);
}
public static V128 Tbl2_V128(V128 vector, V128 tb0, V128 tb1)
public static V128 Tbl4(V128 vector, int bytes, V128 tb0, V128 tb1, V128 tb2, V128 tb3)
{
return Tbl(vector, 16, tb0, tb1);
return TblOrTbx(default, vector, bytes, tb0, tb1, tb2, tb3);
}
public static V128 Tbl3_V64(V128 vector, V128 tb0, V128 tb1, V128 tb2)
public static V128 Tbx1(V128 dest, V128 vector, int bytes, V128 tb0)
{
return Tbl(vector, 8, tb0, tb1, tb2);
return TblOrTbx(dest, vector, bytes, tb0);
}
public static V128 Tbl3_V128(V128 vector, V128 tb0, V128 tb1, V128 tb2)
public static V128 Tbx2(V128 dest, V128 vector, int bytes, V128 tb0, V128 tb1)
{
return Tbl(vector, 16, tb0, tb1, tb2);
return TblOrTbx(dest, vector, bytes, tb0, tb1);
}
public static V128 Tbl4_V64(V128 vector, V128 tb0, V128 tb1, V128 tb2, V128 tb3)
public static V128 Tbx3(V128 dest, V128 vector, int bytes, V128 tb0, V128 tb1, V128 tb2)
{
return Tbl(vector, 8, tb0, tb1, tb2, tb3);
return TblOrTbx(dest, vector, bytes, tb0, tb1, tb2);
}
public static V128 Tbl4_V128(V128 vector, V128 tb0, V128 tb1, V128 tb2, V128 tb3)
public static V128 Tbx4(V128 dest, V128 vector, int bytes, V128 tb0, V128 tb1, V128 tb2, V128 tb3)
{
return Tbl(vector, 16, tb0, tb1, tb2, tb3);
return TblOrTbx(dest, vector, bytes, tb0, tb1, tb2, tb3);
}
private static V128 Tbl(V128 vector, int bytes, params V128[] tb)
private static V128 TblOrTbx(V128 dest, V128 vector, int bytes, params V128[] tb)
{
byte[] res = new byte[16];
byte[] res = new byte[16];
if (dest != default)
{
Buffer.BlockCopy(dest.ToArray(), 0, res, 0, bytes);
}
byte[] table = new byte[tb.Length * 16];
for (byte index = 0; index < tb.Length; index++)