Add SSE4.2 Path for CRC32, add A32 variant, add tests for non-castagnoli variants. (#1328)
* Add CRC32 A32 instructions. * Fix CRC32 instructions. * Add CRC intrinsic and fast path. Loop is currently unrolled, will look into adding temp vars after tests are added. * Begin work on Crc tests * Fix SSE4.2 path for CRC32C, finialize tests. * Remove unused IR path. * Fix spacing between prefix checks. * This should be Src. * PTC Version * OpCodeTable Order * Integer check improvement. Value and Crc can be either 32 or 64 size. * This wasn't necessary... * If size is 3, value type must be I64. * Fix same src+dest handling for non crc intrinsics. * Pre-fix (ha) issue with vex encodings
This commit is contained in:
parent
30d4f752f4
commit
d7044b10a2
15 changed files with 448 additions and 161 deletions
|
@ -28,10 +28,10 @@ namespace ARMeilleure.CodeGen.X86
|
|||
Vex = 1 << 4,
|
||||
|
||||
PrefixBit = 16,
|
||||
PrefixMask = 3 << PrefixBit,
|
||||
PrefixMask = 7 << PrefixBit,
|
||||
Prefix66 = 1 << PrefixBit,
|
||||
PrefixF3 = 2 << PrefixBit,
|
||||
PrefixF2 = 3 << PrefixBit
|
||||
PrefixF2 = 4 << PrefixBit
|
||||
}
|
||||
|
||||
private struct InstructionInfo
|
||||
|
@ -104,6 +104,9 @@ namespace ARMeilleure.CodeGen.X86
|
|||
Add(X86Instruction.Comisd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f2f, InstructionFlags.Vex | InstructionFlags.Prefix66));
|
||||
Add(X86Instruction.Comiss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f2f, InstructionFlags.Vex));
|
||||
Add(X86Instruction.Cpuid, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fa2, InstructionFlags.RegOnly));
|
||||
Add(X86Instruction.Crc32, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38f1, InstructionFlags.PrefixF2));
|
||||
Add(X86Instruction.Crc32_16, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38f1, InstructionFlags.PrefixF2 | InstructionFlags.Prefix66));
|
||||
Add(X86Instruction.Crc32_8, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38f0, InstructionFlags.PrefixF2 | InstructionFlags.Reg8Src));
|
||||
Add(X86Instruction.Cvtdq2pd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fe6, InstructionFlags.Vex | InstructionFlags.PrefixF3));
|
||||
Add(X86Instruction.Cvtdq2ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f5b, InstructionFlags.Vex));
|
||||
Add(X86Instruction.Cvtpd2dq, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000fe6, InstructionFlags.Vex | InstructionFlags.PrefixF2));
|
||||
|
@ -1172,7 +1175,15 @@ namespace ARMeilleure.CodeGen.X86
|
|||
|
||||
if ((flags & InstructionFlags.Vex) != 0 && HardwareCapabilities.SupportsVexEncoding)
|
||||
{
|
||||
int vexByte2 = (int)(flags & InstructionFlags.PrefixMask) >> (int)InstructionFlags.PrefixBit;
|
||||
// In a vex encoding, only one prefix can be active at a time. The active prefix is encoded in the second byte using two bits.
|
||||
|
||||
int vexByte2 = (flags & InstructionFlags.PrefixMask) switch
|
||||
{
|
||||
InstructionFlags.Prefix66 => 1,
|
||||
InstructionFlags.PrefixF3 => 2,
|
||||
InstructionFlags.PrefixF2 => 3,
|
||||
_ => 0
|
||||
};
|
||||
|
||||
if (src1 != null)
|
||||
{
|
||||
|
@ -1220,11 +1231,19 @@ namespace ARMeilleure.CodeGen.X86
|
|||
}
|
||||
else
|
||||
{
|
||||
switch (flags & InstructionFlags.PrefixMask)
|
||||
if (flags.HasFlag(InstructionFlags.Prefix66))
|
||||
{
|
||||
case InstructionFlags.Prefix66: WriteByte(0x66); break;
|
||||
case InstructionFlags.PrefixF2: WriteByte(0xf2); break;
|
||||
case InstructionFlags.PrefixF3: WriteByte(0xf3); break;
|
||||
WriteByte(0x66);
|
||||
}
|
||||
|
||||
if (flags.HasFlag(InstructionFlags.PrefixF2))
|
||||
{
|
||||
WriteByte(0xf2);
|
||||
}
|
||||
|
||||
if (flags.HasFlag(InstructionFlags.PrefixF3))
|
||||
{
|
||||
WriteByte(0xf3);
|
||||
}
|
||||
|
||||
if (rexPrefix != 0)
|
||||
|
|
|
@ -333,6 +333,21 @@ namespace ARMeilleure.CodeGen.X86
|
|||
break;
|
||||
}
|
||||
|
||||
case IntrinsicType.Crc32:
|
||||
{
|
||||
Operand dest = operation.Destination;
|
||||
Operand src1 = operation.GetSource(0);
|
||||
Operand src2 = operation.GetSource(1);
|
||||
|
||||
EnsureSameReg(dest, src1);
|
||||
|
||||
Debug.Assert(dest.Type.IsInteger() && src1.Type.IsInteger() && src2.Type.IsInteger());
|
||||
|
||||
context.Assembler.WriteInstruction(info.Inst, dest, src2, dest.Type);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
case IntrinsicType.BinaryImm:
|
||||
{
|
||||
Operand dest = operation.Destination;
|
||||
|
|
|
@ -38,6 +38,9 @@ namespace ARMeilleure.CodeGen.X86
|
|||
Add(Intrinsic.X86Comisseq, new IntrinsicInfo(X86Instruction.Comiss, IntrinsicType.Comis_));
|
||||
Add(Intrinsic.X86Comissge, new IntrinsicInfo(X86Instruction.Comiss, IntrinsicType.Comis_));
|
||||
Add(Intrinsic.X86Comisslt, new IntrinsicInfo(X86Instruction.Comiss, IntrinsicType.Comis_));
|
||||
Add(Intrinsic.X86Crc32, new IntrinsicInfo(X86Instruction.Crc32, IntrinsicType.Crc32));
|
||||
Add(Intrinsic.X86Crc32_16, new IntrinsicInfo(X86Instruction.Crc32_16, IntrinsicType.Crc32));
|
||||
Add(Intrinsic.X86Crc32_8, new IntrinsicInfo(X86Instruction.Crc32_8, IntrinsicType.Crc32));
|
||||
Add(Intrinsic.X86Cvtdq2pd, new IntrinsicInfo(X86Instruction.Cvtdq2pd, IntrinsicType.Unary));
|
||||
Add(Intrinsic.X86Cvtdq2ps, new IntrinsicInfo(X86Instruction.Cvtdq2ps, IntrinsicType.Unary));
|
||||
Add(Intrinsic.X86Cvtpd2dq, new IntrinsicInfo(X86Instruction.Cvtpd2dq, IntrinsicType.Unary));
|
||||
|
|
|
@ -9,6 +9,7 @@ namespace ARMeilleure.CodeGen.X86
|
|||
Binary,
|
||||
BinaryGpr,
|
||||
BinaryImm,
|
||||
Crc32,
|
||||
Ternary,
|
||||
TernaryImm
|
||||
}
|
||||
|
|
|
@ -1294,11 +1294,22 @@ namespace ARMeilleure.CodeGen.X86
|
|||
case Instruction.VectorInsert16:
|
||||
case Instruction.VectorInsert8:
|
||||
return !HardwareCapabilities.SupportsVexEncoding;
|
||||
|
||||
case Instruction.Extended:
|
||||
return IsIntrinsicSameOperandDestSrc1(operation);
|
||||
}
|
||||
|
||||
return IsVexSameOperandDestSrc1(operation);
|
||||
}
|
||||
|
||||
private static bool IsIntrinsicSameOperandDestSrc1(Operation operation)
|
||||
{
|
||||
IntrinsicOperation intrinOp = (IntrinsicOperation)operation;
|
||||
IntrinsicInfo info = IntrinsicTable.GetInfo(intrinOp.Intrinsic);
|
||||
|
||||
return info.Type == IntrinsicType.Crc32 || IsVexSameOperandDestSrc1(operation);
|
||||
}
|
||||
|
||||
private static bool IsVexSameOperandDestSrc1(Operation operation)
|
||||
{
|
||||
if (IsIntrinsic(operation.Instruction))
|
||||
|
|
|
@ -33,6 +33,9 @@ namespace ARMeilleure.CodeGen.X86
|
|||
Comisd,
|
||||
Comiss,
|
||||
Cpuid,
|
||||
Crc32,
|
||||
Crc32_16,
|
||||
Crc32_8,
|
||||
Cvtdq2pd,
|
||||
Cvtdq2ps,
|
||||
Cvtpd2dq,
|
||||
|
|
|
@ -659,6 +659,12 @@ namespace ARMeilleure.Decoders
|
|||
SetA32("<<<<00110101xxxx0000xxxxxxxxxxxx", InstName.Cmp, InstEmit32.Cmp, typeof(OpCode32AluImm));
|
||||
SetA32("<<<<00010101xxxx0000xxxxxxx0xxxx", InstName.Cmp, InstEmit32.Cmp, typeof(OpCode32AluRsImm));
|
||||
SetA32("<<<<00010101xxxx0000xxxx0xx1xxxx", InstName.Cmp, InstEmit32.Cmp, typeof(OpCode32AluRsReg));
|
||||
SetA32("<<<<00010000xxxxxxxx00000100xxxx", InstName.Crc32b, InstEmit32.Crc32b, typeof(OpCode32AluReg));
|
||||
SetA32("<<<<00010000xxxxxxxx00100100xxxx", InstName.Crc32cb, InstEmit32.Crc32cb, typeof(OpCode32AluReg));
|
||||
SetA32("<<<<00010010xxxxxxxx00100100xxxx", InstName.Crc32ch, InstEmit32.Crc32ch, typeof(OpCode32AluReg));
|
||||
SetA32("<<<<00010100xxxxxxxx00100100xxxx", InstName.Crc32cw, InstEmit32.Crc32cw, typeof(OpCode32AluReg));
|
||||
SetA32("<<<<00010010xxxxxxxx00000100xxxx", InstName.Crc32h, InstEmit32.Crc32h, typeof(OpCode32AluReg));
|
||||
SetA32("<<<<00010100xxxxxxxx00000100xxxx", InstName.Crc32w, InstEmit32.Crc32w, typeof(OpCode32AluReg));
|
||||
SetA32("1111010101111111111100000101xxxx", InstName.Dmb, InstEmit32.Dmb, typeof(OpCode32));
|
||||
SetA32("1111010101111111111100000100xxxx", InstName.Dsb, InstEmit32.Dsb, typeof(OpCode32));
|
||||
SetA32("<<<<0010001xxxxxxxxxxxxxxxxxxxxx", InstName.Eor, InstEmit32.Eor, typeof(OpCode32AluImm));
|
||||
|
|
|
@ -1,182 +1,67 @@
|
|||
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
|
||||
|
||||
using ARMeilleure.Decoders;
|
||||
using ARMeilleure.IntermediateRepresentation;
|
||||
using ARMeilleure.Translation;
|
||||
|
||||
using static ARMeilleure.Instructions.InstEmitHashHelper;
|
||||
using static ARMeilleure.Instructions.InstEmitHelper;
|
||||
using static ARMeilleure.Instructions.InstEmitSimdHelper;
|
||||
using static ARMeilleure.IntermediateRepresentation.OperandHelper;
|
||||
|
||||
namespace ARMeilleure.Instructions
|
||||
{
|
||||
static partial class InstEmit
|
||||
{
|
||||
private const int ByteSizeLog2 = 0;
|
||||
private const int HWordSizeLog2 = 1;
|
||||
private const int WordSizeLog2 = 2;
|
||||
private const int DWordSizeLog2 = 3;
|
||||
|
||||
public static void Crc32b(ArmEmitterContext context)
|
||||
{
|
||||
if (Optimizations.UsePclmulqdq)
|
||||
{
|
||||
EmitCrc32Optimized(context, false, 8);
|
||||
}
|
||||
else
|
||||
{
|
||||
EmitCrc32Call(context, nameof(SoftFallback.Crc32b));
|
||||
}
|
||||
EmitCrc32Call(context, ByteSizeLog2, false);
|
||||
}
|
||||
|
||||
public static void Crc32h(ArmEmitterContext context)
|
||||
{
|
||||
if (Optimizations.UsePclmulqdq)
|
||||
{
|
||||
EmitCrc32Optimized(context, false, 16);
|
||||
}
|
||||
else
|
||||
{
|
||||
EmitCrc32Call(context, nameof(SoftFallback.Crc32h));
|
||||
}
|
||||
EmitCrc32Call(context, HWordSizeLog2, false);
|
||||
}
|
||||
|
||||
public static void Crc32w(ArmEmitterContext context)
|
||||
{
|
||||
if (Optimizations.UsePclmulqdq)
|
||||
{
|
||||
EmitCrc32Optimized(context, false, 32);
|
||||
}
|
||||
else
|
||||
{
|
||||
EmitCrc32Call(context, nameof(SoftFallback.Crc32w));
|
||||
}
|
||||
EmitCrc32Call(context, WordSizeLog2, false);
|
||||
}
|
||||
|
||||
public static void Crc32x(ArmEmitterContext context)
|
||||
{
|
||||
if (Optimizations.UsePclmulqdq)
|
||||
{
|
||||
EmitCrc32Optimized64(context, false);
|
||||
}
|
||||
else
|
||||
{
|
||||
EmitCrc32Call(context, nameof(SoftFallback.Crc32x));
|
||||
}
|
||||
EmitCrc32Call(context, DWordSizeLog2, false);
|
||||
}
|
||||
|
||||
public static void Crc32cb(ArmEmitterContext context)
|
||||
{
|
||||
if (Optimizations.UsePclmulqdq)
|
||||
{
|
||||
EmitCrc32Optimized(context, true, 8);
|
||||
}
|
||||
else
|
||||
{
|
||||
EmitCrc32Call(context, nameof(SoftFallback.Crc32cb));
|
||||
}
|
||||
EmitCrc32Call(context, ByteSizeLog2, true);
|
||||
}
|
||||
|
||||
public static void Crc32ch(ArmEmitterContext context)
|
||||
{
|
||||
if (Optimizations.UsePclmulqdq)
|
||||
{
|
||||
EmitCrc32Optimized(context, true, 16);
|
||||
}
|
||||
else
|
||||
{
|
||||
EmitCrc32Call(context, nameof(SoftFallback.Crc32ch));
|
||||
}
|
||||
EmitCrc32Call(context, HWordSizeLog2, true);
|
||||
}
|
||||
|
||||
public static void Crc32cw(ArmEmitterContext context)
|
||||
{
|
||||
if (Optimizations.UsePclmulqdq)
|
||||
{
|
||||
EmitCrc32Optimized(context, true, 32);
|
||||
}
|
||||
else
|
||||
{
|
||||
EmitCrc32Call(context, nameof(SoftFallback.Crc32cw));
|
||||
}
|
||||
EmitCrc32Call(context, WordSizeLog2, true);
|
||||
}
|
||||
|
||||
public static void Crc32cx(ArmEmitterContext context)
|
||||
{
|
||||
if (Optimizations.UsePclmulqdq)
|
||||
{
|
||||
EmitCrc32Optimized64(context, true);
|
||||
}
|
||||
else
|
||||
{
|
||||
EmitCrc32Call(context, nameof(SoftFallback.Crc32cx));
|
||||
}
|
||||
EmitCrc32Call(context, DWordSizeLog2, true);
|
||||
}
|
||||
|
||||
private static void EmitCrc32Optimized(ArmEmitterContext context, bool castagnoli, int bitsize)
|
||||
{
|
||||
OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp;
|
||||
|
||||
long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))'
|
||||
long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1
|
||||
|
||||
Operand crc = GetIntOrZR(context, op.Rn);
|
||||
Operand data = GetIntOrZR(context, op.Rm);
|
||||
|
||||
crc = context.VectorInsert(context.VectorZero(), crc, 0);
|
||||
|
||||
switch (bitsize)
|
||||
{
|
||||
case 8: data = context.VectorInsert8(context.VectorZero(), data, 0); break;
|
||||
case 16: data = context.VectorInsert16(context.VectorZero(), data, 0); break;
|
||||
case 32: data = context.VectorInsert(context.VectorZero(), data, 0); break;
|
||||
}
|
||||
|
||||
Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data);
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(64 - bitsize));
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(0));
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
|
||||
|
||||
if (bitsize < 32)
|
||||
{
|
||||
crc = context.AddIntrinsic(Intrinsic.X86Pslldq, crc, Const((64 - bitsize) / 8));
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, crc);
|
||||
}
|
||||
|
||||
SetIntOrZR(context, op.Rd, context.VectorExtract(OperandType.I32, tmp, 2));
|
||||
}
|
||||
|
||||
private static void EmitCrc32Optimized64(ArmEmitterContext context, bool castagnoli)
|
||||
{
|
||||
OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp;
|
||||
|
||||
long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))'
|
||||
long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1
|
||||
|
||||
Operand crc = GetIntOrZR(context, op.Rn);
|
||||
Operand data = GetIntOrZR(context, op.Rm);
|
||||
|
||||
crc = context.VectorInsert(context.VectorZero(), crc, 0);
|
||||
data = context.VectorInsert(context.VectorZero(), data, 0);
|
||||
|
||||
Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data);
|
||||
Operand res = context.AddIntrinsic(Intrinsic.X86Pslldq, tmp, Const(4));
|
||||
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, res, X86GetScalar(context, mu), Const(0));
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
|
||||
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, res);
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(32));
|
||||
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(1));
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
|
||||
|
||||
SetIntOrZR(context, op.Rd, context.VectorExtract(OperandType.I32, tmp, 2));
|
||||
}
|
||||
|
||||
private static void EmitCrc32Call(ArmEmitterContext context, string name)
|
||||
private static void EmitCrc32Call(ArmEmitterContext context, int size, bool c)
|
||||
{
|
||||
OpCodeAluBinary op = (OpCodeAluBinary)context.CurrOp;
|
||||
|
||||
Operand n = GetIntOrZR(context, op.Rn);
|
||||
Operand m = GetIntOrZR(context, op.Rm);
|
||||
|
||||
Operand d = context.Call(typeof(SoftFallback).GetMethod(name), n, m);
|
||||
Operand d = EmitCrc32(context, n, m, size, c);
|
||||
|
||||
SetIntOrZR(context, op.Rd, d);
|
||||
}
|
||||
|
|
54
ARMeilleure/Instructions/InstEmitHash32.cs
Normal file
54
ARMeilleure/Instructions/InstEmitHash32.cs
Normal file
|
@ -0,0 +1,54 @@
|
|||
using ARMeilleure.Decoders;
|
||||
using ARMeilleure.IntermediateRepresentation;
|
||||
using ARMeilleure.Translation;
|
||||
|
||||
using static ARMeilleure.Instructions.InstEmitHelper;
|
||||
using static ARMeilleure.Instructions.InstEmitHashHelper;
|
||||
|
||||
namespace ARMeilleure.Instructions
|
||||
{
|
||||
static partial class InstEmit32
|
||||
{
|
||||
public static void Crc32b(ArmEmitterContext context)
|
||||
{
|
||||
EmitCrc32Call(context, ByteSizeLog2, false);
|
||||
}
|
||||
|
||||
public static void Crc32h(ArmEmitterContext context)
|
||||
{
|
||||
EmitCrc32Call(context, HWordSizeLog2, false);
|
||||
}
|
||||
|
||||
public static void Crc32w(ArmEmitterContext context)
|
||||
{
|
||||
EmitCrc32Call(context, WordSizeLog2, false);
|
||||
}
|
||||
|
||||
public static void Crc32cb(ArmEmitterContext context)
|
||||
{
|
||||
EmitCrc32Call(context, ByteSizeLog2, true);
|
||||
}
|
||||
|
||||
public static void Crc32ch(ArmEmitterContext context)
|
||||
{
|
||||
EmitCrc32Call(context, HWordSizeLog2, true);
|
||||
}
|
||||
|
||||
public static void Crc32cw(ArmEmitterContext context)
|
||||
{
|
||||
EmitCrc32Call(context, WordSizeLog2, true);
|
||||
}
|
||||
|
||||
private static void EmitCrc32Call(ArmEmitterContext context, int size, bool c)
|
||||
{
|
||||
IOpCode32AluReg op = (IOpCode32AluReg)context.CurrOp;
|
||||
|
||||
Operand n = GetIntA32(context, op.Rn);
|
||||
Operand m = GetIntA32(context, op.Rm);
|
||||
|
||||
Operand d = EmitCrc32(context, n, m, size, c);
|
||||
|
||||
EmitAluStore(context, d);
|
||||
}
|
||||
}
|
||||
}
|
119
ARMeilleure/Instructions/InstEmitHashHelper.cs
Normal file
119
ARMeilleure/Instructions/InstEmitHashHelper.cs
Normal file
|
@ -0,0 +1,119 @@
|
|||
// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
|
||||
|
||||
using ARMeilleure.IntermediateRepresentation;
|
||||
using ARMeilleure.Translation;
|
||||
using System;
|
||||
using System.Diagnostics;
|
||||
|
||||
using static ARMeilleure.IntermediateRepresentation.OperandHelper;
|
||||
using static ARMeilleure.Instructions.InstEmitSimdHelper;
|
||||
|
||||
namespace ARMeilleure.Instructions
|
||||
{
|
||||
static class InstEmitHashHelper
|
||||
{
|
||||
public const uint Crc32RevPoly = 0xedb88320;
|
||||
public const uint Crc32cRevPoly = 0x82f63b78;
|
||||
|
||||
public static Operand EmitCrc32(ArmEmitterContext context, Operand crc, Operand value, int size, bool castagnoli)
|
||||
{
|
||||
Debug.Assert(crc.Type.IsInteger() && value.Type.IsInteger());
|
||||
Debug.Assert(size >= 0 && size < 4);
|
||||
Debug.Assert((size < 3) || (value.Type == OperandType.I64));
|
||||
|
||||
if (castagnoli && Optimizations.UseSse42)
|
||||
{
|
||||
// The CRC32 instruction does not have an immediate variant, so ensure both inputs are in registers.
|
||||
value = (value.Kind == OperandKind.Constant) ? context.Copy(value) : value;
|
||||
crc = (crc.Kind == OperandKind.Constant) ? context.Copy(crc) : crc;
|
||||
|
||||
Intrinsic op = size switch
|
||||
{
|
||||
0 => Intrinsic.X86Crc32_8,
|
||||
1 => Intrinsic.X86Crc32_16,
|
||||
_ => Intrinsic.X86Crc32,
|
||||
};
|
||||
|
||||
return (size == 3) ? context.ConvertI64ToI32(context.AddIntrinsicLong(op, crc, value)) : context.AddIntrinsicInt(op, crc, value);
|
||||
}
|
||||
else if (Optimizations.UsePclmulqdq)
|
||||
{
|
||||
return size switch
|
||||
{
|
||||
3 => EmitCrc32Optimized64(context, crc, value, castagnoli),
|
||||
_ => EmitCrc32Optimized(context, crc, value, castagnoli, size),
|
||||
};
|
||||
}
|
||||
else
|
||||
{
|
||||
string name = (size, castagnoli) switch
|
||||
{
|
||||
(0, false) => nameof(SoftFallback.Crc32b),
|
||||
(1, false) => nameof(SoftFallback.Crc32h),
|
||||
(2, false) => nameof(SoftFallback.Crc32w),
|
||||
(3, false) => nameof(SoftFallback.Crc32x),
|
||||
(0, true) => nameof(SoftFallback.Crc32cb),
|
||||
(1, true) => nameof(SoftFallback.Crc32ch),
|
||||
(2, true) => nameof(SoftFallback.Crc32cw),
|
||||
(3, true) => nameof(SoftFallback.Crc32cx),
|
||||
_ => throw new ArgumentOutOfRangeException(nameof(size))
|
||||
};
|
||||
|
||||
return context.Call(typeof(SoftFallback).GetMethod(name), crc, value);
|
||||
}
|
||||
}
|
||||
|
||||
private static Operand EmitCrc32Optimized(ArmEmitterContext context, Operand crc, Operand data, bool castagnoli, int size)
|
||||
{
|
||||
long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))'
|
||||
long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1
|
||||
|
||||
crc = context.VectorInsert(context.VectorZero(), crc, 0);
|
||||
|
||||
switch (size)
|
||||
{
|
||||
case 0: data = context.VectorInsert8(context.VectorZero(), data, 0); break;
|
||||
case 1: data = context.VectorInsert16(context.VectorZero(), data, 0); break;
|
||||
case 2: data = context.VectorInsert(context.VectorZero(), data, 0); break;
|
||||
}
|
||||
|
||||
int bitsize = 8 << size;
|
||||
|
||||
Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data);
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(64 - bitsize));
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(0));
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
|
||||
|
||||
if (bitsize < 32)
|
||||
{
|
||||
crc = context.AddIntrinsic(Intrinsic.X86Pslldq, crc, Const((64 - bitsize) / 8));
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, crc);
|
||||
}
|
||||
|
||||
return context.VectorExtract(OperandType.I32, tmp, 2);
|
||||
}
|
||||
|
||||
private static Operand EmitCrc32Optimized64(ArmEmitterContext context, Operand crc, Operand data, bool castagnoli)
|
||||
{
|
||||
long mu = castagnoli ? 0x0DEA713F1 : 0x1F7011641; // mu' = floor(x^64/P(x))'
|
||||
long polynomial = castagnoli ? 0x105EC76F0 : 0x1DB710641; // P'(x) << 1
|
||||
|
||||
crc = context.VectorInsert(context.VectorZero(), crc, 0);
|
||||
data = context.VectorInsert(context.VectorZero(), data, 0);
|
||||
|
||||
Operand tmp = context.AddIntrinsic(Intrinsic.X86Pxor, crc, data);
|
||||
Operand res = context.AddIntrinsic(Intrinsic.X86Pslldq, tmp, Const(4));
|
||||
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, res, X86GetScalar(context, mu), Const(0));
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
|
||||
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pxor, tmp, res);
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Psllq, tmp, Const(32));
|
||||
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, mu), Const(1));
|
||||
tmp = context.AddIntrinsic(Intrinsic.X86Pclmulqdq, tmp, X86GetScalar(context, polynomial), Const(0));
|
||||
|
||||
return context.VectorExtract(OperandType.I32, tmp, 2);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -27,6 +27,9 @@ namespace ARMeilleure.IntermediateRepresentation
|
|||
X86Comisseq,
|
||||
X86Comissge,
|
||||
X86Comisslt,
|
||||
X86Crc32,
|
||||
X86Crc32_16,
|
||||
X86Crc32_8,
|
||||
X86Cvtdq2pd,
|
||||
X86Cvtdq2ps,
|
||||
X86Cvtpd2dq,
|
||||
|
|
|
@ -20,7 +20,7 @@ namespace ARMeilleure.Translation.PTC
|
|||
{
|
||||
private const string HeaderMagic = "PTChd";
|
||||
|
||||
private const int InternalVersion = 8; //! To be incremented manually for each change to the ARMeilleure project.
|
||||
private const int InternalVersion = 9; //! To be incremented manually for each change to the ARMeilleure project.
|
||||
|
||||
private const string BaseDir = "Ryujinx";
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue