Reduce JIT GC allocations (#2515)
* Turn `MemoryOperand` into a struct * Remove `IntrinsicOperation` * Remove `PhiNode` * Remove `Node` * Turn `Operand` into a struct * Turn `Operation` into a struct * Clean up pool management methods * Add `Arena` allocator * Move `OperationHelper` to `Operation.Factory` * Move `OperandHelper` to `Operand.Factory` * Optimize `Operation` a bit * Fix `Arena` initialization * Rename `NativeList<T>` to `ArenaList<T>` * Reduce `Operand` size from 88 to 56 bytes * Reduce `Operation` size from 56 to 40 bytes * Add optimistic interning of Register & Constant operands * Optimize `RegisterUsage` pass a bit * Optimize `RemoveUnusedNodes` pass a bit Iterating in reverse order allows killing dependency chains in a single pass. * Fix PPTC symbols * Optimize `BasicBlock` a bit Reduce allocations from `_successor` & `DominanceFrontiers` * Fix `Operation` resize * Make `Arena` expandable Change the arena allocator to be expandable by allocating in pages, with some of them being pooled. Currently 32 pages are pooled. An LRU removal mechanism should probably be added to it. Apparently MHR can allocate bitmaps large enough to exceed the 16MB limit for the type. * Move `Arena` & `ArenaList` to `Common` * Remove `ThreadStaticPool` & co * Add `PhiOperation` * Reduce `Operand` size from 56 to 48 bytes * Add linear-probing to `Operand` intern table * Optimize `HybridAllocator` a bit * Add `Allocators` class * Tune `ArenaAllocator` sizes * Add page removal mechanism to `ArenaAllocator` Remove pages which have not been used for more than 5s after each reset. I am on the fence about whether it would be better to trim the pool using a Gen2 callback object like the one in System.Buffers.ArrayPool<T>. Because right now if a large translation happens, the pages will be freed only after a reset. This reset may not happen for a while because no new translation is hit, but the arena base sizes are rather small.
* Fix `OOM` when allocating larger than page size in `ArenaAllocator` Tweak resizing mechanism for Operand.Uses and Assignments. * Optimize `Optimizer` a bit * Optimize `Operand.Add<T>/Remove<T>` a bit * Clean up `PreAllocator` * Fix phi insertion order Reduce codegen diffs. * Fix code alignment * Use new heuristics for degree of parallelism * Suppress warnings * Address gdkchan's feedback Renamed `GetValue()` to `GetValueUnsafe()` to make it more clear that `Operand.Value` should usually not be modified directly. * Add fast path to `ArenaAllocator` * Assembly for `ArenaAllocator.Allocate(ulong)`: .L0: mov rax, [rcx+0x18] lea r8, [rax+rdx] cmp r8, [rcx+0x10] ja short .L2 .L1: mov rdx, [rcx+8] add rax, [rdx+8] mov [rcx+0x18], r8 ret .L2: jmp ArenaAllocator.AllocateSlow(UInt64) A few variables/fields had to be changed to ulong so that RyuJIT avoids emitting zero-extends. * Implement a new heuristic to free pooled pages. If an arena is used often, it is more likely that its pages will be needed, so the pages are kept for longer (e.g: during PPTC rebuild or bursts of compilations). If it is not used often, then it is more likely that its pages will not be needed (e.g: after PPTC rebuild or bursts of compilations). * Address riperiperi's feedback * Use `EqualityComparer<T>` in `IntrusiveList<T>` Avoids a potential GC hole in `Equals(T, T)`.
This commit is contained in:
parent
cd4530f29c
commit
22b2cb39af
91 changed files with 2354 additions and 2142 deletions
|
@ -1,9 +1,11 @@
|
|||
using ARMeilleure.IntermediateRepresentation;
|
||||
using ARMeilleure.State;
|
||||
using System;
|
||||
|
||||
using static ARMeilleure.IntermediateRepresentation.OperandHelper;
|
||||
using static ARMeilleure.IntermediateRepresentation.OperationHelper;
|
||||
using System.Numerics;
|
||||
using System.Runtime.Intrinsics;
|
||||
using System.Runtime.Intrinsics.X86;
|
||||
using static ARMeilleure.IntermediateRepresentation.Operand.Factory;
|
||||
using static ARMeilleure.IntermediateRepresentation.Operation.Factory;
|
||||
|
||||
namespace ARMeilleure.Translation
|
||||
{
|
||||
|
@ -14,27 +16,48 @@ namespace ARMeilleure.Translation
|
|||
|
||||
private struct RegisterMask : IEquatable<RegisterMask>
|
||||
{
|
||||
public long IntMask { get; set; }
|
||||
public long VecMask { get; set; }
|
||||
public long IntMask => Mask.GetElement(0);
|
||||
public long VecMask => Mask.GetElement(1);
|
||||
|
||||
public Vector128<long> Mask { get; }
|
||||
|
||||
public RegisterMask(Vector128<long> mask)
|
||||
{
|
||||
Mask = mask;
|
||||
}
|
||||
|
||||
public RegisterMask(long intMask, long vecMask)
|
||||
{
|
||||
IntMask = intMask;
|
||||
VecMask = vecMask;
|
||||
Mask = Vector128.Create(intMask, vecMask);
|
||||
}
|
||||
|
||||
public static RegisterMask operator &(RegisterMask x, RegisterMask y)
|
||||
{
|
||||
if (Sse2.IsSupported)
|
||||
{
|
||||
return new RegisterMask(Sse2.And(x.Mask, y.Mask));
|
||||
}
|
||||
|
||||
return new RegisterMask(x.IntMask & y.IntMask, x.VecMask & y.VecMask);
|
||||
}
|
||||
|
||||
public static RegisterMask operator |(RegisterMask x, RegisterMask y)
|
||||
{
|
||||
if (Sse2.IsSupported)
|
||||
{
|
||||
return new RegisterMask(Sse2.Or(x.Mask, y.Mask));
|
||||
}
|
||||
|
||||
return new RegisterMask(x.IntMask | y.IntMask, x.VecMask | y.VecMask);
|
||||
}
|
||||
|
||||
public static RegisterMask operator ~(RegisterMask x)
|
||||
{
|
||||
if (Sse2.IsSupported)
|
||||
{
|
||||
return new RegisterMask(Sse2.AndNot(x.Mask, Vector128<long>.AllBitsSet));
|
||||
}
|
||||
|
||||
return new RegisterMask(~x.IntMask, ~x.VecMask);
|
||||
}
|
||||
|
||||
|
@ -55,12 +78,12 @@ namespace ARMeilleure.Translation
|
|||
|
||||
public bool Equals(RegisterMask other)
|
||||
{
|
||||
return IntMask == other.IntMask && VecMask == other.VecMask;
|
||||
return Mask.Equals(other.Mask);
|
||||
}
|
||||
|
||||
public override int GetHashCode()
|
||||
{
|
||||
return HashCode.Combine(IntMask, VecMask);
|
||||
return Mask.GetHashCode();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -72,27 +95,23 @@ namespace ARMeilleure.Translation
|
|||
|
||||
for (BasicBlock block = cfg.Blocks.First; block != null; block = block.ListNext)
|
||||
{
|
||||
for (Node node = block.Operations.First; node != null; node = node.ListNext)
|
||||
for (Operation node = block.Operations.First; node != default; node = node.ListNext)
|
||||
{
|
||||
Operation operation = node as Operation;
|
||||
|
||||
for (int srcIndex = 0; srcIndex < operation.SourcesCount; srcIndex++)
|
||||
for (int index = 0; index < node.SourcesCount; index++)
|
||||
{
|
||||
Operand source = operation.GetSource(srcIndex);
|
||||
Operand source = node.GetSource(index);
|
||||
|
||||
if (source.Kind != OperandKind.Register)
|
||||
if (source.Kind == OperandKind.Register)
|
||||
{
|
||||
continue;
|
||||
Register register = source.GetRegister();
|
||||
|
||||
localInputs[block.Index] |= GetMask(register) & ~localOutputs[block.Index];
|
||||
}
|
||||
|
||||
Register register = source.GetRegister();
|
||||
|
||||
localInputs[block.Index] |= GetMask(register) & ~localOutputs[block.Index];
|
||||
}
|
||||
|
||||
if (operation.Destination != null && operation.Destination.Kind == OperandKind.Register)
|
||||
if (node.Destination != default && node.Destination.Kind == OperandKind.Register)
|
||||
{
|
||||
localOutputs[block.Index] |= GetMask(operation.Destination.GetRegister());
|
||||
localOutputs[block.Index] |= GetMask(node.Destination.GetRegister());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -104,7 +123,6 @@ namespace ARMeilleure.Translation
|
|||
RegisterMask[] globalOutputs = new RegisterMask[cfg.Blocks.Count];
|
||||
|
||||
bool modified;
|
||||
|
||||
bool firstPass = true;
|
||||
|
||||
do
|
||||
|
@ -121,7 +139,6 @@ namespace ARMeilleure.Translation
|
|||
BasicBlock predecessor = block.Predecessors[0];
|
||||
|
||||
RegisterMask cmnOutputs = localOutputs[predecessor.Index] | globalCmnOutputs[predecessor.Index];
|
||||
|
||||
RegisterMask outputs = globalOutputs[predecessor.Index];
|
||||
|
||||
for (int pIndex = 1; pIndex < block.Predecessors.Count; pIndex++)
|
||||
|
@ -129,7 +146,6 @@ namespace ARMeilleure.Translation
|
|||
predecessor = block.Predecessors[pIndex];
|
||||
|
||||
cmnOutputs &= localOutputs[predecessor.Index] | globalCmnOutputs[predecessor.Index];
|
||||
|
||||
outputs |= globalOutputs[predecessor.Index];
|
||||
}
|
||||
|
||||
|
@ -140,21 +156,13 @@ namespace ARMeilleure.Translation
|
|||
cmnOutputs &= globalCmnOutputs[block.Index];
|
||||
}
|
||||
|
||||
if (Exchange(globalCmnOutputs, block.Index, cmnOutputs))
|
||||
{
|
||||
modified = true;
|
||||
}
|
||||
|
||||
modified |= Exchange(globalCmnOutputs, block.Index, cmnOutputs);
|
||||
outputs |= localOutputs[block.Index];
|
||||
|
||||
if (Exchange(globalOutputs, block.Index, globalOutputs[block.Index] | outputs))
|
||||
{
|
||||
modified = true;
|
||||
}
|
||||
modified |= Exchange(globalOutputs, block.Index, globalOutputs[block.Index] | outputs);
|
||||
}
|
||||
else if (Exchange(globalOutputs, block.Index, localOutputs[block.Index]))
|
||||
else
|
||||
{
|
||||
modified = true;
|
||||
modified |= Exchange(globalOutputs, block.Index, localOutputs[block.Index]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -165,17 +173,14 @@ namespace ARMeilleure.Translation
|
|||
|
||||
RegisterMask inputs = localInputs[block.Index];
|
||||
|
||||
for (int i = 0; i < block.SuccessorCount; i++)
|
||||
for (int i = 0; i < block.SuccessorsCount; i++)
|
||||
{
|
||||
inputs |= globalInputs[block.GetSuccessor(i).Index];
|
||||
}
|
||||
|
||||
inputs &= ~globalCmnOutputs[block.Index];
|
||||
|
||||
if (Exchange(globalInputs, block.Index, globalInputs[block.Index] | inputs))
|
||||
{
|
||||
modified = true;
|
||||
}
|
||||
modified |= Exchange(globalInputs, block.Index, globalInputs[block.Index] | inputs);
|
||||
}
|
||||
|
||||
firstPass = false;
|
||||
|
@ -192,12 +197,18 @@ namespace ARMeilleure.Translation
|
|||
block.Operations.Remove(block.Operations.First);
|
||||
}
|
||||
|
||||
Operand arg = default;
|
||||
|
||||
// The only block without any predecessor should be the entry block.
|
||||
// It always needs a context load as it is the first block to run.
|
||||
if (block.Predecessors.Count == 0 || hasContextLoad)
|
||||
{
|
||||
LoadLocals(block, globalInputs[block.Index].VecMask, RegisterType.Vector, mode);
|
||||
LoadLocals(block, globalInputs[block.Index].IntMask, RegisterType.Integer, mode);
|
||||
arg = Local(OperandType.I64);
|
||||
|
||||
Operation loadArg = block.Operations.AddFirst(Operation(Instruction.LoadArgument, arg, Const(0)));
|
||||
|
||||
LoadLocals(block, globalInputs[block.Index].VecMask, RegisterType.Vector, mode, loadArg, arg);
|
||||
LoadLocals(block, globalInputs[block.Index].IntMask, RegisterType.Integer, mode, loadArg, arg);
|
||||
}
|
||||
|
||||
bool hasContextStore = HasContextStore(block);
|
||||
|
@ -209,8 +220,15 @@ namespace ARMeilleure.Translation
|
|||
|
||||
if (EndsWithReturn(block) || hasContextStore)
|
||||
{
|
||||
StoreLocals(block, globalOutputs[block.Index].IntMask, RegisterType.Integer, mode);
|
||||
StoreLocals(block, globalOutputs[block.Index].VecMask, RegisterType.Vector, mode);
|
||||
if (arg == default)
|
||||
{
|
||||
arg = Local(OperandType.I64);
|
||||
|
||||
block.Append(Operation(Instruction.LoadArgument, arg, Const(0)));
|
||||
}
|
||||
|
||||
StoreLocals(block, globalOutputs[block.Index].IntMask, RegisterType.Integer, mode, arg);
|
||||
StoreLocals(block, globalOutputs[block.Index].VecMask, RegisterType.Vector, mode, arg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -222,27 +240,31 @@ namespace ARMeilleure.Translation
|
|||
|
||||
private static bool HasContextStore(BasicBlock block)
|
||||
{
|
||||
return EndsWith(block, Instruction.StoreToContext) && block.GetLastOp().SourcesCount == 0;
|
||||
return EndsWith(block, Instruction.StoreToContext) && block.Operations.Last.SourcesCount == 0;
|
||||
}
|
||||
|
||||
private static bool StartsWith(BasicBlock block, Instruction inst)
|
||||
{
|
||||
if (block.Operations.Count == 0)
|
||||
if (block.Operations.Count > 0)
|
||||
{
|
||||
return false;
|
||||
Operation first = block.Operations.First;
|
||||
|
||||
return first != default && first.Instruction == inst;
|
||||
}
|
||||
|
||||
return block.Operations.First is Operation operation && operation.Instruction == inst;
|
||||
return false;
|
||||
}
|
||||
|
||||
private static bool EndsWith(BasicBlock block, Instruction inst)
|
||||
{
|
||||
if (block.Operations.Count == 0)
|
||||
if (block.Operations.Count > 0)
|
||||
{
|
||||
return false;
|
||||
Operation last = block.Operations.Last;
|
||||
|
||||
return last != default && last.Instruction == inst;
|
||||
}
|
||||
|
||||
return block.Operations.Last is Operation operation && operation.Instruction == inst;
|
||||
return false;
|
||||
}
|
||||
|
||||
private static RegisterMask GetMask(Register register)
|
||||
|
@ -263,76 +285,57 @@ namespace ARMeilleure.Translation
|
|||
|
||||
private static bool Exchange(RegisterMask[] masks, int blkIndex, RegisterMask value)
|
||||
{
|
||||
RegisterMask oldValue = masks[blkIndex];
|
||||
ref RegisterMask curValue = ref masks[blkIndex];
|
||||
|
||||
masks[blkIndex] = value;
|
||||
bool changed = curValue != value;
|
||||
|
||||
return oldValue != value;
|
||||
curValue = value;
|
||||
|
||||
return changed;
|
||||
}
|
||||
|
||||
private static void LoadLocals(BasicBlock block, long inputs, RegisterType baseType, ExecutionMode mode)
|
||||
private static void LoadLocals(
|
||||
BasicBlock block,
|
||||
long inputs,
|
||||
RegisterType baseType,
|
||||
ExecutionMode mode,
|
||||
Operation loadArg,
|
||||
Operand arg)
|
||||
{
|
||||
Operand arg0 = Local(OperandType.I64);
|
||||
|
||||
for (int bit = 63; bit >= 0; bit--)
|
||||
while (inputs != 0)
|
||||
{
|
||||
long mask = 1L << bit;
|
||||
|
||||
if ((inputs & mask) == 0)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
int bit = 63 - BitOperations.LeadingZeroCount((ulong)inputs);
|
||||
|
||||
Operand dest = GetRegFromBit(bit, baseType, mode);
|
||||
|
||||
long offset = NativeContext.GetRegisterOffset(dest.GetRegister());
|
||||
|
||||
Operand offset = Const((long)NativeContext.GetRegisterOffset(dest.GetRegister()));
|
||||
Operand addr = Local(OperandType.I64);
|
||||
|
||||
Operation loadOp = Operation(Instruction.Load, dest, addr);
|
||||
block.Operations.AddAfter(loadArg, Operation(Instruction.Load, dest, addr));
|
||||
block.Operations.AddAfter(loadArg, Operation(Instruction.Add, addr, arg, offset));
|
||||
|
||||
block.Operations.AddFirst(loadOp);
|
||||
|
||||
Operation calcOffsOp = Operation(Instruction.Add, addr, arg0, Const(offset));
|
||||
|
||||
block.Operations.AddFirst(calcOffsOp);
|
||||
inputs &= ~(1L << bit);
|
||||
}
|
||||
|
||||
Operation loadArg0 = Operation(Instruction.LoadArgument, arg0, Const(0));
|
||||
|
||||
block.Operations.AddFirst(loadArg0);
|
||||
}
|
||||
|
||||
private static void StoreLocals(BasicBlock block, long outputs, RegisterType baseType, ExecutionMode mode)
|
||||
private static void StoreLocals(
|
||||
BasicBlock block,
|
||||
long outputs,
|
||||
RegisterType baseType,
|
||||
ExecutionMode mode,
|
||||
Operand arg)
|
||||
{
|
||||
Operand arg0 = Local(OperandType.I64);
|
||||
|
||||
Operation loadArg0 = Operation(Instruction.LoadArgument, arg0, Const(0));
|
||||
|
||||
block.Append(loadArg0);
|
||||
|
||||
for (int bit = 0; bit < 64; bit++)
|
||||
while (outputs != 0)
|
||||
{
|
||||
long mask = 1L << bit;
|
||||
|
||||
if ((outputs & mask) == 0)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
int bit = BitOperations.TrailingZeroCount(outputs);
|
||||
|
||||
Operand source = GetRegFromBit(bit, baseType, mode);
|
||||
|
||||
long offset = NativeContext.GetRegisterOffset(source.GetRegister());
|
||||
|
||||
Operand offset = Const((long)NativeContext.GetRegisterOffset(source.GetRegister()));
|
||||
Operand addr = Local(OperandType.I64);
|
||||
|
||||
Operation calcOffsOp = Operation(Instruction.Add, addr, arg0, Const(offset));
|
||||
block.Append(Operation(Instruction.Add, addr, arg, offset));
|
||||
block.Append(Operation(Instruction.Store, default, addr, source));
|
||||
|
||||
block.Append(calcOffsOp);
|
||||
|
||||
Operation storeOp = Operation(Instruction.Store, null, addr, source);
|
||||
|
||||
block.Append(storeOp);
|
||||
outputs &= ~(1L << bit);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -340,15 +343,15 @@ namespace ARMeilleure.Translation
|
|||
{
|
||||
if (bit < RegsCount)
|
||||
{
|
||||
return OperandHelper.Register(bit, baseType, GetOperandType(baseType, mode));
|
||||
return Register(bit, baseType, GetOperandType(baseType, mode));
|
||||
}
|
||||
else if (baseType == RegisterType.Integer)
|
||||
{
|
||||
return OperandHelper.Register(bit & RegsMask, RegisterType.Flag, OperandType.I32);
|
||||
return Register(bit & RegsMask, RegisterType.Flag, OperandType.I32);
|
||||
}
|
||||
else if (baseType == RegisterType.Vector)
|
||||
{
|
||||
return OperandHelper.Register(bit & RegsMask, RegisterType.FpFlag, OperandType.I32);
|
||||
return Register(bit & RegsMask, RegisterType.FpFlag, OperandType.I32);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -371,12 +374,9 @@ namespace ARMeilleure.Translation
|
|||
|
||||
private static bool EndsWithReturn(BasicBlock block)
|
||||
{
|
||||
if (!(block.GetLastOp() is Operation operation))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
Operation last = block.Operations.Last;
|
||||
|
||||
return operation.Instruction == Instruction.Return;
|
||||
return last != default && last.Instruction == Instruction.Return;
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue