Reduce JIT GC allocations (#2515)

* Turn `MemoryOperand` into a struct

* Remove `IntrinsicOperation`

* Remove `PhiNode`

* Remove `Node`

* Turn `Operand` into a struct

* Turn `Operation` into a struct

* Clean up pool management methods

* Add `Arena` allocator

* Move `OperationHelper` to `Operation.Factory`

* Move `OperandHelper` to `Operand.Factory`

* Optimize `Operation` a bit

* Fix `Arena` initialization

* Rename `NativeList<T>` to `ArenaList<T>`

* Reduce `Operand` size from 88 to 56 bytes

* Reduce `Operation` size from 56 to 40 bytes

* Add optimistic interning of Register & Constant operands

* Optimize `RegisterUsage` pass a bit

* Optimize `RemoveUnusedNodes` pass a bit

Iterating in reverse-order allows killing dependency chains in a single
pass.

* Fix PPTC symbols

* Optimize `BasicBlock` a bit

Reduce allocations from `_successor` & `DominanceFrontiers`

* Fix `Operation` resize

* Make `Arena` expandable

Change the arena allocator to be expandable by allocating in pages, with
some of them being pooled. Currently 32 pages are pooled. An LRU removal
mechanism should probably be added to it.

Apparently MHR can allocate bitmaps large enough to exceed the 16MB
limit for the type.

* Move `Arena` & `ArenaList` to `Common`

* Remove `ThreadStaticPool` & co

* Add `PhiOperation`

* Reduce `Operand` size from 56 from 48 bytes

* Add linear-probing to `Operand` intern table

* Optimize `HybridAllocator` a bit

* Add `Allocators` class

* Tune `ArenaAllocator` sizes

* Add page removal mechanism to `ArenaAllocator`

Remove pages which have not been used for more than 5s after each reset.

I am on fence if this would be better using a Gen2 callback object like
the one in System.Buffers.ArrayPool<T>, to trim the pool. Because right
now if a large translation happens, the pages will be freed only after a
reset. This reset may not happen for a while because no new translation
is hit, but the arena base sizes are rather small.

* Fix `OOM` when allocating larger than page size in `ArenaAllocator`

Tweak resizing mechanism for Operand.Uses and Assignemnts.

* Optimize `Optimizer` a bit

* Optimize `Operand.Add<T>/Remove<T>` a bit

* Clean up `PreAllocator`

* Fix phi insertion order

Reduce codegen diffs.

* Fix code alignment

* Use new heuristics for degree of parallelism

* Suppress warnings

* Address gdkchan's feedback

Renamed `GetValue()` to `GetValueUnsafe()` to make it more clear that
`Operand.Value` should usually not be modified directly.

* Add fast path to `ArenaAllocator`

* Assembly for `ArenaAllocator.Allocate(ulong)`:

  .L0:
    mov rax, [rcx+0x18]
    lea r8, [rax+rdx]
    cmp r8, [rcx+0x10]
    ja short .L2
  .L1:
    mov rdx, [rcx+8]
    add rax, [rdx+8]
    mov [rcx+0x18], r8
    ret
  .L2:
    jmp ArenaAllocator.AllocateSlow(UInt64)

  A few variable/field had to be changed to ulong so that RyuJIT avoids
  emitting zero-extends.

* Implement a new heuristic to free pooled pages.

  If an arena is used often, it is more likely that its pages will be
  needed, so the pages are kept for longer (e.g: during PPTC rebuild or
  burst sof compilations). If is not used often, then it is more likely
  that its pages will not be needed (e.g: after PPTC rebuild or bursts
  of compilations).

* Address riperiperi's feedback

* Use `EqualityComparer<T>` in `IntrusiveList<T>`

Avoids a potential GC hole in `Equals(T, T)`.
This commit is contained in:
FICTURE7 2021-08-17 22:08:34 +04:00 committed by GitHub
parent cd4530f29c
commit 22b2cb39af
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
91 changed files with 2354 additions and 2142 deletions

View file

@ -1,9 +1,11 @@
using ARMeilleure.IntermediateRepresentation;
using ARMeilleure.State;
using System;
using static ARMeilleure.IntermediateRepresentation.OperandHelper;
using static ARMeilleure.IntermediateRepresentation.OperationHelper;
using System.Numerics;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using static ARMeilleure.IntermediateRepresentation.Operand.Factory;
using static ARMeilleure.IntermediateRepresentation.Operation.Factory;
namespace ARMeilleure.Translation
{
@ -14,27 +16,48 @@ namespace ARMeilleure.Translation
private struct RegisterMask : IEquatable<RegisterMask>
{
public long IntMask { get; set; }
public long VecMask { get; set; }
public long IntMask => Mask.GetElement(0);
public long VecMask => Mask.GetElement(1);
public Vector128<long> Mask { get; }
public RegisterMask(Vector128<long> mask)
{
Mask = mask;
}
public RegisterMask(long intMask, long vecMask)
{
IntMask = intMask;
VecMask = vecMask;
Mask = Vector128.Create(intMask, vecMask);
}
public static RegisterMask operator &(RegisterMask x, RegisterMask y)
{
if (Sse2.IsSupported)
{
return new RegisterMask(Sse2.And(x.Mask, y.Mask));
}
return new RegisterMask(x.IntMask & y.IntMask, x.VecMask & y.VecMask);
}
public static RegisterMask operator |(RegisterMask x, RegisterMask y)
{
if (Sse2.IsSupported)
{
return new RegisterMask(Sse2.Or(x.Mask, y.Mask));
}
return new RegisterMask(x.IntMask | y.IntMask, x.VecMask | y.VecMask);
}
public static RegisterMask operator ~(RegisterMask x)
{
if (Sse2.IsSupported)
{
return new RegisterMask(Sse2.AndNot(x.Mask, Vector128<long>.AllBitsSet));
}
return new RegisterMask(~x.IntMask, ~x.VecMask);
}
@ -55,12 +78,12 @@ namespace ARMeilleure.Translation
public bool Equals(RegisterMask other)
{
return IntMask == other.IntMask && VecMask == other.VecMask;
return Mask.Equals(other.Mask);
}
public override int GetHashCode()
{
return HashCode.Combine(IntMask, VecMask);
return Mask.GetHashCode();
}
}
@ -72,27 +95,23 @@ namespace ARMeilleure.Translation
for (BasicBlock block = cfg.Blocks.First; block != null; block = block.ListNext)
{
for (Node node = block.Operations.First; node != null; node = node.ListNext)
for (Operation node = block.Operations.First; node != default; node = node.ListNext)
{
Operation operation = node as Operation;
for (int srcIndex = 0; srcIndex < operation.SourcesCount; srcIndex++)
for (int index = 0; index < node.SourcesCount; index++)
{
Operand source = operation.GetSource(srcIndex);
Operand source = node.GetSource(index);
if (source.Kind != OperandKind.Register)
if (source.Kind == OperandKind.Register)
{
continue;
Register register = source.GetRegister();
localInputs[block.Index] |= GetMask(register) & ~localOutputs[block.Index];
}
Register register = source.GetRegister();
localInputs[block.Index] |= GetMask(register) & ~localOutputs[block.Index];
}
if (operation.Destination != null && operation.Destination.Kind == OperandKind.Register)
if (node.Destination != default && node.Destination.Kind == OperandKind.Register)
{
localOutputs[block.Index] |= GetMask(operation.Destination.GetRegister());
localOutputs[block.Index] |= GetMask(node.Destination.GetRegister());
}
}
}
@ -104,7 +123,6 @@ namespace ARMeilleure.Translation
RegisterMask[] globalOutputs = new RegisterMask[cfg.Blocks.Count];
bool modified;
bool firstPass = true;
do
@ -121,7 +139,6 @@ namespace ARMeilleure.Translation
BasicBlock predecessor = block.Predecessors[0];
RegisterMask cmnOutputs = localOutputs[predecessor.Index] | globalCmnOutputs[predecessor.Index];
RegisterMask outputs = globalOutputs[predecessor.Index];
for (int pIndex = 1; pIndex < block.Predecessors.Count; pIndex++)
@ -129,7 +146,6 @@ namespace ARMeilleure.Translation
predecessor = block.Predecessors[pIndex];
cmnOutputs &= localOutputs[predecessor.Index] | globalCmnOutputs[predecessor.Index];
outputs |= globalOutputs[predecessor.Index];
}
@ -140,21 +156,13 @@ namespace ARMeilleure.Translation
cmnOutputs &= globalCmnOutputs[block.Index];
}
if (Exchange(globalCmnOutputs, block.Index, cmnOutputs))
{
modified = true;
}
modified |= Exchange(globalCmnOutputs, block.Index, cmnOutputs);
outputs |= localOutputs[block.Index];
if (Exchange(globalOutputs, block.Index, globalOutputs[block.Index] | outputs))
{
modified = true;
}
modified |= Exchange(globalOutputs, block.Index, globalOutputs[block.Index] | outputs);
}
else if (Exchange(globalOutputs, block.Index, localOutputs[block.Index]))
else
{
modified = true;
modified |= Exchange(globalOutputs, block.Index, localOutputs[block.Index]);
}
}
@ -165,17 +173,14 @@ namespace ARMeilleure.Translation
RegisterMask inputs = localInputs[block.Index];
for (int i = 0; i < block.SuccessorCount; i++)
for (int i = 0; i < block.SuccessorsCount; i++)
{
inputs |= globalInputs[block.GetSuccessor(i).Index];
}
inputs &= ~globalCmnOutputs[block.Index];
if (Exchange(globalInputs, block.Index, globalInputs[block.Index] | inputs))
{
modified = true;
}
modified |= Exchange(globalInputs, block.Index, globalInputs[block.Index] | inputs);
}
firstPass = false;
@ -192,12 +197,18 @@ namespace ARMeilleure.Translation
block.Operations.Remove(block.Operations.First);
}
Operand arg = default;
// The only block without any predecessor should be the entry block.
// It always needs a context load as it is the first block to run.
if (block.Predecessors.Count == 0 || hasContextLoad)
{
LoadLocals(block, globalInputs[block.Index].VecMask, RegisterType.Vector, mode);
LoadLocals(block, globalInputs[block.Index].IntMask, RegisterType.Integer, mode);
arg = Local(OperandType.I64);
Operation loadArg = block.Operations.AddFirst(Operation(Instruction.LoadArgument, arg, Const(0)));
LoadLocals(block, globalInputs[block.Index].VecMask, RegisterType.Vector, mode, loadArg, arg);
LoadLocals(block, globalInputs[block.Index].IntMask, RegisterType.Integer, mode, loadArg, arg);
}
bool hasContextStore = HasContextStore(block);
@ -209,8 +220,15 @@ namespace ARMeilleure.Translation
if (EndsWithReturn(block) || hasContextStore)
{
StoreLocals(block, globalOutputs[block.Index].IntMask, RegisterType.Integer, mode);
StoreLocals(block, globalOutputs[block.Index].VecMask, RegisterType.Vector, mode);
if (arg == default)
{
arg = Local(OperandType.I64);
block.Append(Operation(Instruction.LoadArgument, arg, Const(0)));
}
StoreLocals(block, globalOutputs[block.Index].IntMask, RegisterType.Integer, mode, arg);
StoreLocals(block, globalOutputs[block.Index].VecMask, RegisterType.Vector, mode, arg);
}
}
}
@ -222,27 +240,31 @@ namespace ARMeilleure.Translation
private static bool HasContextStore(BasicBlock block)
{
return EndsWith(block, Instruction.StoreToContext) && block.GetLastOp().SourcesCount == 0;
return EndsWith(block, Instruction.StoreToContext) && block.Operations.Last.SourcesCount == 0;
}
private static bool StartsWith(BasicBlock block, Instruction inst)
{
if (block.Operations.Count == 0)
if (block.Operations.Count > 0)
{
return false;
Operation first = block.Operations.First;
return first != default && first.Instruction == inst;
}
return block.Operations.First is Operation operation && operation.Instruction == inst;
return false;
}
private static bool EndsWith(BasicBlock block, Instruction inst)
{
if (block.Operations.Count == 0)
if (block.Operations.Count > 0)
{
return false;
Operation last = block.Operations.Last;
return last != default && last.Instruction == inst;
}
return block.Operations.Last is Operation operation && operation.Instruction == inst;
return false;
}
private static RegisterMask GetMask(Register register)
@ -263,76 +285,57 @@ namespace ARMeilleure.Translation
private static bool Exchange(RegisterMask[] masks, int blkIndex, RegisterMask value)
{
RegisterMask oldValue = masks[blkIndex];
ref RegisterMask curValue = ref masks[blkIndex];
masks[blkIndex] = value;
bool changed = curValue != value;
return oldValue != value;
curValue = value;
return changed;
}
private static void LoadLocals(BasicBlock block, long inputs, RegisterType baseType, ExecutionMode mode)
private static void LoadLocals(
BasicBlock block,
long inputs,
RegisterType baseType,
ExecutionMode mode,
Operation loadArg,
Operand arg)
{
Operand arg0 = Local(OperandType.I64);
for (int bit = 63; bit >= 0; bit--)
while (inputs != 0)
{
long mask = 1L << bit;
if ((inputs & mask) == 0)
{
continue;
}
int bit = 63 - BitOperations.LeadingZeroCount((ulong)inputs);
Operand dest = GetRegFromBit(bit, baseType, mode);
long offset = NativeContext.GetRegisterOffset(dest.GetRegister());
Operand offset = Const((long)NativeContext.GetRegisterOffset(dest.GetRegister()));
Operand addr = Local(OperandType.I64);
Operation loadOp = Operation(Instruction.Load, dest, addr);
block.Operations.AddAfter(loadArg, Operation(Instruction.Load, dest, addr));
block.Operations.AddAfter(loadArg, Operation(Instruction.Add, addr, arg, offset));
block.Operations.AddFirst(loadOp);
Operation calcOffsOp = Operation(Instruction.Add, addr, arg0, Const(offset));
block.Operations.AddFirst(calcOffsOp);
inputs &= ~(1L << bit);
}
Operation loadArg0 = Operation(Instruction.LoadArgument, arg0, Const(0));
block.Operations.AddFirst(loadArg0);
}
private static void StoreLocals(BasicBlock block, long outputs, RegisterType baseType, ExecutionMode mode)
private static void StoreLocals(
BasicBlock block,
long outputs,
RegisterType baseType,
ExecutionMode mode,
Operand arg)
{
Operand arg0 = Local(OperandType.I64);
Operation loadArg0 = Operation(Instruction.LoadArgument, arg0, Const(0));
block.Append(loadArg0);
for (int bit = 0; bit < 64; bit++)
while (outputs != 0)
{
long mask = 1L << bit;
if ((outputs & mask) == 0)
{
continue;
}
int bit = BitOperations.TrailingZeroCount(outputs);
Operand source = GetRegFromBit(bit, baseType, mode);
long offset = NativeContext.GetRegisterOffset(source.GetRegister());
Operand offset = Const((long)NativeContext.GetRegisterOffset(source.GetRegister()));
Operand addr = Local(OperandType.I64);
Operation calcOffsOp = Operation(Instruction.Add, addr, arg0, Const(offset));
block.Append(Operation(Instruction.Add, addr, arg, offset));
block.Append(Operation(Instruction.Store, default, addr, source));
block.Append(calcOffsOp);
Operation storeOp = Operation(Instruction.Store, null, addr, source);
block.Append(storeOp);
outputs &= ~(1L << bit);
}
}
@ -340,15 +343,15 @@ namespace ARMeilleure.Translation
{
if (bit < RegsCount)
{
return OperandHelper.Register(bit, baseType, GetOperandType(baseType, mode));
return Register(bit, baseType, GetOperandType(baseType, mode));
}
else if (baseType == RegisterType.Integer)
{
return OperandHelper.Register(bit & RegsMask, RegisterType.Flag, OperandType.I32);
return Register(bit & RegsMask, RegisterType.Flag, OperandType.I32);
}
else if (baseType == RegisterType.Vector)
{
return OperandHelper.Register(bit & RegsMask, RegisterType.FpFlag, OperandType.I32);
return Register(bit & RegsMask, RegisterType.FpFlag, OperandType.I32);
}
else
{
@ -371,12 +374,9 @@ namespace ARMeilleure.Translation
private static bool EndsWithReturn(BasicBlock block)
{
if (!(block.GetLastOp() is Operation operation))
{
return false;
}
Operation last = block.Operations.Last;
return operation.Instruction == Instruction.Return;
return last != default && last.Instruction == Instruction.Return;
}
}
}