Optimize texture format conversion, and MethodCopyBuffer (#1274)

* Improve performance when converting texture formats.

Still more work to do.

* Speed up buffer -> texture copies.

No longer copies byte by byte, and adds a fast path for when formats are identical.

* Fix a few things and add a 64-byte block fast copy.

* Spacing cleanup, unrelated change.

* Fix base offset calculation for region copies.

* Fix Linear -> BlockLinear

* Fix some nits. (part 1 of review feedback)

* Use a generic version of the Convert* functions rather than lambdas.

This is some real monkey's paw shit.

* Remove unnecessary span constructor.

* Revert "Use a generic version of the Convert* functions rather than lambdas."

This reverts commit aa43dcfbe8.

* Fix a bug with rectangle destination writing and improve the rectangle calculation for linear textures.
This commit is contained in:
riperiperi 2020-06-13 23:31:06 +01:00 committed by GitHub
parent ce983f360b
commit bea1fc2e8d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 356 additions and 61 deletions

View file

@ -1,6 +1,6 @@
using Ryujinx.Common;
using System;
using System.Runtime.Intrinsics;
using static Ryujinx.Graphics.Texture.BlockLinearConstants;
namespace Ryujinx.Graphics.Texture
@ -64,11 +64,14 @@ namespace Ryujinx.Graphics.Texture
}
int strideTrunc = BitUtils.AlignDown(w * bytesPerPixel, 16);
int strideTrunc64 = BitUtils.AlignDown(w * bytesPerPixel, 64);
int xStart = strideTrunc / bytesPerPixel;
int stride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment);
int outStrideGap = stride - w * bytesPerPixel;
int alignment = gobWidth;
if (d < gobBlocksInZ || w <= gobWidth || h <= gobHeight)
@ -86,36 +89,74 @@ namespace Ryujinx.Graphics.Texture
mipGobBlocksInZ,
bytesPerPixel);
for (int layer = 0; layer < layers; layer++)
unsafe bool Convert<T>(Span<byte> output, ReadOnlySpan<byte> data) where T : unmanaged
{
int inBaseOffset = layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level);
for (int z = 0; z < d; z++)
for (int y = 0; y < h; y++)
fixed (byte* outputPtr = output, dataPtr = data)
{
for (int x = 0; x < strideTrunc; x += 16)
byte* outPtr = outputPtr + outOffs;
for (int layer = 0; layer < layers; layer++)
{
int offset = inBaseOffset + layoutConverter.GetOffsetWithLineOffset(x, y, z);
byte* inBaseOffset = dataPtr + (layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level));
Span<byte> dest = output.Slice(outOffs + x, 16);
for (int z = 0; z < d; z++)
{
layoutConverter.SetZ(z);
for (int y = 0; y < h; y++)
{
layoutConverter.SetY(y);
data.Slice(offset, 16).CopyTo(dest);
for (int x = 0; x < strideTrunc64; x += 64, outPtr += 64)
{
byte* offset = inBaseOffset + layoutConverter.GetOffsetWithLineOffset64(x);
byte* offset2 = offset + 0x20;
byte* offset3 = offset + 0x100;
byte* offset4 = offset + 0x120;
Vector128<byte> value = *(Vector128<byte>*)offset;
Vector128<byte> value2 = *(Vector128<byte>*)offset2;
Vector128<byte> value3 = *(Vector128<byte>*)offset3;
Vector128<byte> value4 = *(Vector128<byte>*)offset4;
*(Vector128<byte>*)outPtr = value;
*(Vector128<byte>*)(outPtr + 16) = value2;
*(Vector128<byte>*)(outPtr + 32) = value3;
*(Vector128<byte>*)(outPtr + 48) = value4;
}
for (int x = strideTrunc64; x < strideTrunc; x += 16, outPtr += 16)
{
byte* offset = inBaseOffset + layoutConverter.GetOffsetWithLineOffset16(x);
*(Vector128<byte>*)outPtr = *(Vector128<byte>*)offset;
}
for (int x = xStart; x < w; x++, outPtr += bytesPerPixel)
{
byte* offset = inBaseOffset + layoutConverter.GetOffset(x);
*(T*)outPtr = *(T*)offset;
}
outPtr += outStrideGap;
}
}
}
for (int x = xStart; x < w; x++)
{
int offset = inBaseOffset + layoutConverter.GetOffset(x, y, z);
Span<byte> dest = output.Slice(outOffs + x * bytesPerPixel, bytesPerPixel);
data.Slice(offset, bytesPerPixel).CopyTo(dest);
}
outOffs += stride;
outOffs += stride * h * d * layers;
}
return true;
}
}
bool _ = bytesPerPixel switch
{
1 => Convert<byte>(output, data),
2 => Convert<ushort>(output, data),
4 => Convert<uint>(output, data),
8 => Convert<ulong>(output, data),
12 => Convert<Bpp12Pixel>(output, data),
16 => Convert<Vector128<byte>>(output, data),
_ => throw new NotSupportedException($"Unable to convert ${bytesPerPixel} bpp pixel format.")
};
}
return output;
}
@ -132,22 +173,18 @@ namespace Ryujinx.Graphics.Texture
int h = BitUtils.DivRoundUp(height, blockHeight);
int outStride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment);
int lineSize = w * bytesPerPixel;
Span<byte> output = new byte[h * outStride];
int outOffs = 0;
int inOffs = 0;
for (int y = 0; y < h; y++)
{
for (int x = 0; x < w; x++)
{
int offset = y * stride + x * bytesPerPixel;
Span<byte> dest = output.Slice(outOffs + x * bytesPerPixel, bytesPerPixel);
data.Slice(offset, bytesPerPixel).CopyTo(dest);
}
data.Slice(inOffs, lineSize).CopyTo(output.Slice(outOffs, lineSize));
inOffs += stride;
outOffs += outStride;
}
@ -198,8 +235,15 @@ namespace Ryujinx.Graphics.Texture
mipGobBlocksInZ >>= 1;
}
int strideTrunc = BitUtils.AlignDown(w * bytesPerPixel, 16);
int strideTrunc64 = BitUtils.AlignDown(w * bytesPerPixel, 64);
int xStart = strideTrunc / bytesPerPixel;
int stride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment);
int inStrideGap = stride - w * bytesPerPixel;
int alignment = gobWidth;
if (d < gobBlocksInZ || w <= gobWidth || h <= gobHeight)
@ -217,25 +261,73 @@ namespace Ryujinx.Graphics.Texture
mipGobBlocksInZ,
bytesPerPixel);
for (int layer = 0; layer < layers; layer++)
unsafe bool Convert<T>(Span<byte> output, ReadOnlySpan<byte> data) where T : unmanaged
{
int outBaseOffset = layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level);
for (int z = 0; z < d; z++)
for (int y = 0; y < h; y++)
fixed (byte* outputPtr = output, dataPtr = data)
{
for (int x = 0; x < w; x++)
byte* inPtr = dataPtr + inOffs;
for (int layer = 0; layer < layers; layer++)
{
int offset = outBaseOffset + layoutConverter.GetOffset(x, y, z);
byte* outBaseOffset = outputPtr + (layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level));
Span<byte> dest = output.Slice(offset, bytesPerPixel);
for (int z = 0; z < d; z++)
{
layoutConverter.SetZ(z);
for (int y = 0; y < h; y++)
{
layoutConverter.SetY(y);
data.Slice(inOffs + x * bytesPerPixel, bytesPerPixel).CopyTo(dest);
for (int x = 0; x < strideTrunc64; x += 64, inPtr += 64)
{
byte* offset = outBaseOffset + layoutConverter.GetOffsetWithLineOffset64(x);
byte* offset2 = offset + 0x20;
byte* offset3 = offset + 0x100;
byte* offset4 = offset + 0x120;
Vector128<byte> value = *(Vector128<byte>*)inPtr;
Vector128<byte> value2 = *(Vector128<byte>*)(inPtr + 16);
Vector128<byte> value3 = *(Vector128<byte>*)(inPtr + 32);
Vector128<byte> value4 = *(Vector128<byte>*)(inPtr + 48);
*(Vector128<byte>*)offset = value;
*(Vector128<byte>*)offset2 = value2;
*(Vector128<byte>*)offset3 = value3;
*(Vector128<byte>*)offset4 = value4;
}
for (int x = strideTrunc64; x < strideTrunc; x += 16, inPtr += 16)
{
byte* offset = outBaseOffset + layoutConverter.GetOffsetWithLineOffset16(x);
*(Vector128<byte>*)offset = *(Vector128<byte>*)inPtr;
}
for (int x = xStart; x < w; x++, inPtr += bytesPerPixel)
{
byte* offset = outBaseOffset + layoutConverter.GetOffset(x);
*(T*)offset = *(T*)inPtr;
}
inPtr += inStrideGap;
}
}
}
inOffs += stride;
inOffs += stride * h * d * layers;
}
return true;
}
bool _ = bytesPerPixel switch
{
1 => Convert<byte>(output, data),
2 => Convert<ushort>(output, data),
4 => Convert<uint>(output, data),
8 => Convert<ulong>(output, data),
12 => Convert<Bpp12Pixel>(output, data),
16 => Convert<Vector128<byte>>(output, data),
_ => throw new NotSupportedException($"Unable to convert ${bytesPerPixel} bpp pixel format.")
};
}
return output;
@ -254,23 +346,19 @@ namespace Ryujinx.Graphics.Texture
int h = BitUtils.DivRoundUp(height, blockHeight);
int inStride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment);
int lineSize = width * bytesPerPixel;
Span<byte> output = new byte[h * stride];
int inOffs = 0;
int outOffs = 0;
for (int y = 0; y < h; y++)
{
for (int x = 0; x < w; x++)
{
int offset = y * stride + x * bytesPerPixel;
Span<byte> dest = output.Slice(offset, bytesPerPixel);
data.Slice(inOffs + x * bytesPerPixel, bytesPerPixel).CopyTo(dest);
}
data.Slice(inOffs, lineSize).CopyTo(output.Slice(outOffs, lineSize));
inOffs += inStride;
outOffs += stride;
}
return output;