New NVDEC and VIC implementation (#1384)

* Initial NVDEC and VIC implementation

* Update FFmpeg.AutoGen to 4.3.0

* Add nvdec dependencies for Windows

* Unify some VP9 structures

* Rename VP9 structure fields

* Improvements to Video API

* XML docs for Common.Memory

* Remove now unused or redundant overloads from MemoryAccessor

* NVDEC UV surface read/write scalar paths

* Add FIXME comments about hacky things/stuff that will need to be fixed in the future

* Cleaned up VP9 memory allocation

* Remove some debug logs

* Rename some VP9 structs

* Remove unused struct

* No need to compile Ryujinx.Graphics.Host1x with unsafe anymore

* Name AsyncWorkQueue threads to make debugging easier

* Make Vp9PictureInfo a ref struct

* LayoutConverter no longer needs the depth argument (broken by rebase)

* Pooling of VP9 buffers, plus fix a memory leak on VP9

* Really wish VS could rename projects properly...

* Address feedback

* Remove using

* Catch OperationCanceledException

* Add licensing information

* Add THIRDPARTY.md to release too

Co-authored-by: Thog <me@thog.eu>
This commit is contained in:
gdkchan 2020-07-12 00:07:01 -03:00 committed by GitHub
parent 38b26cf424
commit 4d02a2d2c0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
202 changed files with 20563 additions and 2567 deletions

View file

@ -0,0 +1,151 @@
using Ryujinx.Graphics.Gpu.Memory;
using Ryujinx.Graphics.Video;
using System;
using System.Diagnostics;
namespace Ryujinx.Graphics.Nvdec.Image
{
/// <summary>
/// Fixed-size pool of decoder surfaces. Slots are kept ordered from the most recently
/// requested (index 0) to the least recently requested (last index).
/// </summary>
class SurfaceCache
{
    // Must be equal to at least the maximum number of surfaces
    // that can be in use simultaneously (which is 17, since H264
    // can have up to 16 reference frames, and we need another one
    // for the current frame).
    // Realistically, most codecs won't ever use more than 4 simultaneously.
    private const int MaxItems = 17;

    /// <summary>
    /// One pool slot. A slot whose <c>Surface</c> is null has never been populated.
    /// </summary>
    private struct CacheItem
    {
        public int ReferenceCount;
        public uint LumaOffset;
        public uint ChromaOffset;
        public int Width;
        public int Height;
        public CodecId CodecId;
        public ISurface Surface;
    }

    private readonly CacheItem[] _pool = new CacheItem[MaxItems];
    private readonly MemoryManager _gmm;

    /// <summary>
    /// Creates a new, empty surface cache.
    /// </summary>
    /// <param name="gmm">Memory manager used to fill surfaces through <c>SurfaceReader.Read</c></param>
    public SurfaceCache(MemoryManager gmm)
    {
        _gmm = gmm;
    }

    /// <summary>
    /// Gets a surface for the given codec, offsets and size, and takes a reference on it.
    /// Every successful call should be balanced by a call to <see cref="Put"/>.
    /// </summary>
    /// <remarks>
    /// The lookup is done in three steps: an exact match (same offsets and parameters),
    /// then reuse of an unused surface with the same parameters, and finally creation of
    /// a new surface that replaces the least recently requested unused slot.
    /// </remarks>
    /// <param name="decoder">Decoder used to create a new surface when none can be reused</param>
    /// <param name="codecId">Codec the surface is used with</param>
    /// <param name="lumaOffset">Luma offset passed to <c>SurfaceReader.Read</c></param>
    /// <param name="chromaOffset">Chroma offset passed to <c>SurfaceReader.Read</c></param>
    /// <param name="width">Surface width</param>
    /// <param name="height">Surface height</param>
    /// <returns>The cached or newly created surface</returns>
    /// <exception cref="InvalidOperationException">Every slot of the pool is currently referenced</exception>
    public ISurface Get(IDecoder decoder, CodecId codecId, uint lumaOffset, uint chromaOffset, int width, int height)
    {
        return FindExactMatch(codecId, lumaOffset, chromaOffset, width, height)
            ?? RecycleCompatible(codecId, lumaOffset, chromaOffset, width, height)
            ?? CreateAndInsert(decoder, codecId, lumaOffset, chromaOffset, width, height);
    }

    /// <summary>
    /// Releases a reference previously taken with <see cref="Get"/>.
    /// Surfaces that are not on the pool (including null) are ignored.
    /// </summary>
    /// <param name="surface">Surface to release</param>
    public void Put(ISurface surface)
    {
        // A null surface would otherwise match the first never-populated slot
        // and push its reference count below zero.
        if (surface == null)
        {
            return;
        }

        for (int i = 0; i < MaxItems; i++)
        {
            ref CacheItem item = ref _pool[i];

            if (item.Surface == surface)
            {
                item.ReferenceCount--;
                Debug.Assert(item.ReferenceCount >= 0);
                break;
            }
        }
    }

    /// <summary>
    /// Looks for a populated slot with the same parameters and the same offsets.
    /// On success the slot gains one reference and is moved to the front.
    /// </summary>
    private ISurface FindExactMatch(CodecId codecId, uint lumaOffset, uint chromaOffset, int width, int height)
    {
        for (int i = 0; i < MaxItems; i++)
        {
            ref CacheItem item = ref _pool[i];

            // Never-populated slots are all zeroes and must not be mistaken for a match.
            if (item.Surface != null &&
                item.LumaOffset == lumaOffset &&
                item.ChromaOffset == chromaOffset &&
                item.CodecId == codecId &&
                item.Width == width &&
                item.Height == height)
            {
                item.ReferenceCount++;

                ISurface surface = item.Surface;

                MoveToFront(i);

                return surface;
            }
        }

        return null;
    }

    /// <summary>
    /// Looks for an unused, populated slot with the same parameters, ignoring the offsets.
    /// The search goes backwards to replace the oldest compatible surface, which avoids
    /// thrashing frequently used surfaces. The slot must not be in use, as its data is replaced.
    /// </summary>
    private ISurface RecycleCompatible(CodecId codecId, uint lumaOffset, uint chromaOffset, int width, int height)
    {
        for (int i = MaxItems - 1; i >= 0; i--)
        {
            ref CacheItem item = ref _pool[i];

            if (item.Surface != null &&
                item.ReferenceCount == 0 &&
                item.CodecId == codecId &&
                item.Width == width &&
                item.Height == height)
            {
                item.ReferenceCount = 1;
                item.LumaOffset = lumaOffset;
                item.ChromaOffset = chromaOffset;

                ISurface surface = item.Surface;

                // The read is skipped only when both offsets are zero.
                if ((lumaOffset | chromaOffset) != 0)
                {
                    SurfaceReader.Read(_gmm, surface, lumaOffset, chromaOffset);
                }

                MoveToFront(i);

                return surface;
            }
        }

        return null;
    }

    /// <summary>
    /// Creates a new surface and stores it on the least recently requested slot that is not
    /// in use, disposing whatever surface that slot was holding.
    /// </summary>
    /// <exception cref="InvalidOperationException">Every slot of the pool is currently referenced</exception>
    private ISurface CreateAndInsert(IDecoder decoder, CodecId codecId, uint lumaOffset, uint chromaOffset, int width, int height)
    {
        // Slots are ordered by last request, not by release, so the very last slot may
        // still be referenced while more recently requested ones are already free.
        // Pick the oldest slot that is actually unused instead of only checking the last one.
        int victim = -1;

        for (int i = MaxItems - 1; i >= 0; i--)
        {
            if (_pool[i].ReferenceCount == 0)
            {
                victim = i;
                break;
            }
        }

        if (victim < 0)
        {
            throw new InvalidOperationException("No free slot on the surface pool.");
        }

        ISurface surface = decoder.CreateSurface(width, height);

        try
        {
            if ((lumaOffset | chromaOffset) != 0)
            {
                SurfaceReader.Read(_gmm, surface, lumaOffset, chromaOffset);
            }
        }
        catch
        {
            // The surface was not inserted on the pool yet, so nobody else would dispose it.
            surface.Dispose();
            throw;
        }

        MoveToFront(victim);

        ref CacheItem item = ref _pool[0];

        item.Surface?.Dispose();
        item.ReferenceCount = 1;
        item.LumaOffset = lumaOffset;
        item.ChromaOffset = chromaOffset;
        item.Width = width;
        item.Height = height;
        item.CodecId = codecId;
        item.Surface = surface;

        return surface;
    }

    /// <summary>
    /// Moves the slot at <paramref name="index"/> to index 0, shifting the slots
    /// that were before it one position towards the end.
    /// </summary>
    /// <param name="index">Index of the slot to move</param>
    private void MoveToFront(int index)
    {
        // If index is 0 we don't need to do anything,
        // as it's already on the front.
        if (index != 0)
        {
            CacheItem temp = _pool[index];
            Array.Copy(_pool, 0, _pool, 1, index);
            _pool[0] = temp;
        }
    }
}
}

View file

@ -0,0 +1,26 @@
using Ryujinx.Graphics.Texture;
using Ryujinx.Graphics.Video;
using System;
namespace Ryujinx.Graphics.Nvdec.Image
{
/// <summary>
/// Helpers shared by the surface reading and writing code.
/// </summary>
static class SurfaceCommon
{
    /// <summary>
    /// Gets the <c>TotalSize</c> that <c>SizeCalculator.GetBlockLinearTextureSize</c> reports
    /// for the given dimensions and bytes per pixel.
    /// </summary>
    /// <param name="width">Width forwarded to the size calculator</param>
    /// <param name="height">Height forwarded to the size calculator</param>
    /// <param name="bytesPerPixel">Bytes per pixel forwarded to the size calculator</param>
    /// <returns>Total size reported by the size calculator</returns>
    public static int GetBlockLinearSize(int width, int height, int bytesPerPixel)
    {
        var sizeInfo = SizeCalculator.GetBlockLinearTextureSize(width, height, 1, 1, 1, 1, 1, bytesPerPixel, 2, 1, 1);

        return sizeInfo.TotalSize;
    }

    /// <summary>
    /// Copies the Y, U and V planes (in that order) from one surface to another.
    /// </summary>
    /// <param name="src">Surface to copy from</param>
    /// <param name="dst">Surface to copy to</param>
    public static void Copy(ISurface src, ISurface dst)
    {
        CopyPlane(src.YPlane.AsSpan(), dst.YPlane.AsSpan());
        CopyPlane(src.UPlane.AsSpan(), dst.UPlane.AsSpan());
        CopyPlane(src.VPlane.AsSpan(), dst.VPlane.AsSpan());
    }

    /// <summary>
    /// Exposes the memory described by a plane as a span of bytes.
    /// </summary>
    /// <param name="plane">Plane providing the pointer and length</param>
    /// <returns>Span starting at the plane pointer, with the plane length</returns>
    public static unsafe Span<byte> AsSpan(this Plane plane)
    {
        // NOTE(review): nothing here keeps the pointed-to memory alive, presumably
        // the owner of the plane guarantees that - confirm against the callers.
        void* start = (void*)plane.Pointer;
        int byteCount = plane.Length;

        return new Span<byte>(start, byteCount);
    }

    /// <summary>
    /// Copies all bytes of one plane over the start of another.
    /// </summary>
    private static void CopyPlane(Span<byte> source, Span<byte> destination)
    {
        source.CopyTo(destination);
    }
}
}

View file

@ -0,0 +1,133 @@
using Ryujinx.Common;
using Ryujinx.Graphics.Gpu.Memory;
using Ryujinx.Graphics.Texture;
using Ryujinx.Graphics.Video;
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using static Ryujinx.Graphics.Nvdec.Image.SurfaceCommon;
namespace Ryujinx.Graphics.Nvdec.Image
{
/// <summary>
/// Fills the planes of a surface with luma and chroma data obtained from a memory manager.
/// </summary>
static class SurfaceReader
{
    /// <summary>
    /// Reads the luma and chroma data at the given offsets into the Y, U and V planes of a surface.
    /// </summary>
    /// <param name="gmm">Memory manager the data is read from</param>
    /// <param name="surface">Surface whose planes are written</param>
    /// <param name="lumaOffset">Offset of the luma data</param>
    /// <param name="chromaOffset">Offset of the interleaved chroma data</param>
    public static void Read(MemoryManager gmm, ISurface surface, uint lumaOffset, uint chromaOffset)
    {
        int lumaWidth = surface.Width;
        int lumaHeight = surface.Height;
        int lumaStride = surface.Stride;

        int lumaSize = GetBlockLinearSize(lumaWidth, lumaHeight, 1);
        ReadOnlySpan<byte> lumaData = gmm.DeviceGetSpan(lumaOffset, lumaSize);

        ReadLuma(surface.YPlane.AsSpan(), lumaData, lumaStride, lumaWidth, lumaHeight);

        int chromaWidth = surface.UvWidth;
        int chromaHeight = surface.UvHeight;
        int chromaStride = surface.UvStride;

        // Two bytes per element here, as each chroma element is a U byte followed by a V byte.
        int chromaSize = GetBlockLinearSize(chromaWidth, chromaHeight, 2);
        ReadOnlySpan<byte> chromaData = gmm.DeviceGetSpan(chromaOffset, chromaSize);

        ReadChroma(surface.UPlane.AsSpan(), surface.VPlane.AsSpan(), chromaData, chromaStride, chromaWidth, chromaHeight);
    }

    /// <summary>
    /// Converts the block linear luma data into the linear destination plane.
    /// </summary>
    private static void ReadLuma(Span<byte> dst, ReadOnlySpan<byte> src, int dstStride, int width, int height)
        => LayoutConverter.ConvertBlockLinearToLinear(dst, width, height, dstStride, 1, 2, src);

    /// <summary>
    /// Splits the interleaved U/V source into two separate linear planes.
    /// </summary>
    /// <param name="dstU">Destination U plane</param>
    /// <param name="dstV">Destination V plane</param>
    /// <param name="src">Source data, addressed through <c>OffsetCalculator</c></param>
    /// <param name="dstStride">Distance in bytes between the rows of each destination plane</param>
    /// <param name="width">Number of U/V pairs per row</param>
    /// <param name="height">Number of rows</param>
    private static unsafe void ReadChroma(
        Span<byte> dstU,
        Span<byte> dstV,
        ReadOnlySpan<byte> src,
        int dstStride,
        int width,
        int height)
    {
        var layout = new OffsetCalculator(width, height, 0, false, 2, 2);

        if (!Sse2.IsSupported)
        {
            // Scalar path, one U/V pair at a time.
            for (int row = 0; row < height; row++)
            {
                int rowBase = row * dstStride;

                layout.SetY(row);

                for (int col = 0; col < width; col++)
                {
                    int pairOffset = layout.GetOffset(col);

                    dstU[rowBase + col] = src[pairOffset];
                    dstV[rowBase + col] = src[pairOffset + 1];
                }
            }

            return;
        }

        // Number of source bytes per row handled by the vector loop (64 bytes, or 32 pairs, per step).
        int vectorBytes = BitUtils.AlignDown(width * 2, 64);

        // Bytes to skip on the destination planes after each row.
        int rowPadding = dstStride - width;

        fixed (byte* uBase = dstU, vBase = dstV, srcBase = src)
        {
            byte* uOut = uBase;
            byte* vOut = vBase;

            for (int row = 0; row < height; row++)
            {
                layout.SetY(row);

                int byteX = 0;

                while (byteX < vectorBytes)
                {
                    // The 64 bytes are fetched as four 16 byte pieces at +0, +0x20, +0x100 and +0x120,
                    // presumably where the block linear layout places them - confirm against OffsetCalculator.
                    byte* chunk = srcBase + layout.GetOffsetWithLineOffset64(byteX);

                    Vector128<byte> in0 = *(Vector128<byte>*)chunk;
                    Vector128<byte> in1 = *(Vector128<byte>*)(chunk + 0x20);
                    Vector128<byte> in2 = *(Vector128<byte>*)(chunk + 0x100);
                    Vector128<byte> in3 = *(Vector128<byte>*)(chunk + 0x120);

                    // Four rounds of unpack low/high over a pair of vectors move the even
                    // bytes (U) to the first result and the odd bytes (V) to the second one.
                    Vector128<byte> a1Lo = Sse2.UnpackLow(in0, in1);
                    Vector128<byte> a1Hi = Sse2.UnpackHigh(in0, in1);
                    Vector128<byte> a2Lo = Sse2.UnpackLow(a1Lo, a1Hi);
                    Vector128<byte> a2Hi = Sse2.UnpackHigh(a1Lo, a1Hi);
                    Vector128<byte> a3Lo = Sse2.UnpackLow(a2Lo, a2Hi);
                    Vector128<byte> a3Hi = Sse2.UnpackHigh(a2Lo, a2Hi);
                    Vector128<byte> uFirst = Sse2.UnpackLow(a3Lo, a3Hi);
                    Vector128<byte> vFirst = Sse2.UnpackHigh(a3Lo, a3Hi);

                    // Same thing for the second pair of vectors.
                    Vector128<byte> b1Lo = Sse2.UnpackLow(in2, in3);
                    Vector128<byte> b1Hi = Sse2.UnpackHigh(in2, in3);
                    Vector128<byte> b2Lo = Sse2.UnpackLow(b1Lo, b1Hi);
                    Vector128<byte> b2Hi = Sse2.UnpackHigh(b1Lo, b1Hi);
                    Vector128<byte> b3Lo = Sse2.UnpackLow(b2Lo, b2Hi);
                    Vector128<byte> b3Hi = Sse2.UnpackHigh(b2Lo, b2Hi);
                    Vector128<byte> uSecond = Sse2.UnpackLow(b3Lo, b3Hi);
                    Vector128<byte> vSecond = Sse2.UnpackHigh(b3Lo, b3Hi);

                    *(Vector128<byte>*)uOut = uFirst;
                    *(Vector128<byte>*)(uOut + 16) = uSecond;
                    *(Vector128<byte>*)vOut = vFirst;
                    *(Vector128<byte>*)(vOut + 16) = vSecond;

                    byteX += 64;
                    uOut += 32;
                    vOut += 32;
                }

                // Pairs that did not fit on a full 64 byte step are copied one by one.
                for (int col = vectorBytes / 2; col < width; col++)
                {
                    byte* pair = srcBase + layout.GetOffset(col);

                    *uOut++ = pair[0];
                    *vOut++ = pair[1];
                }

                uOut += rowPadding;
                vOut += rowPadding;
            }
        }
    }
}
}

View file

@ -0,0 +1,126 @@
using Ryujinx.Common;
using Ryujinx.Graphics.Gpu.Memory;
using Ryujinx.Graphics.Texture;
using Ryujinx.Graphics.Video;
using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using static Ryujinx.Graphics.Nvdec.Image.SurfaceCommon;
using static Ryujinx.Graphics.Nvdec.MemoryExtensions;
namespace Ryujinx.Graphics.Nvdec.Image
{
/// <summary>
/// Writes the planes of a surface as luma and chroma data through a memory manager.
/// </summary>
static class SurfaceWriter
{
    /// <summary>
    /// Writes the Y plane at the luma offset, and the U and V planes (interleaved) at the chroma offset.
    /// </summary>
    /// <param name="gmm">Memory manager providing the writable regions</param>
    /// <param name="surface">Surface whose planes are read</param>
    /// <param name="lumaOffset">Offset of the luma data, extended with <c>ExtendOffset</c></param>
    /// <param name="chromaOffset">Offset of the chroma data, extended with <c>ExtendOffset</c></param>
    public static void Write(MemoryManager gmm, ISurface surface, uint lumaOffset, uint chromaOffset)
    {
        int lumaBytes = GetBlockLinearSize(surface.Width, surface.Height, 1);

        // Both regions are using declarations, so they stay alive until the end of the method.
        using var lumaRegion = gmm.GetWritableRegion(ExtendOffset(lumaOffset), lumaBytes);

        WriteLuma(lumaRegion.Memory.Span, surface.YPlane.AsSpan(), surface.Stride, surface.Width, surface.Height);

        // Two bytes per element here, as each chroma element is a U byte followed by a V byte.
        int chromaBytes = GetBlockLinearSize(surface.UvWidth, surface.UvHeight, 2);

        using var chromaRegion = gmm.GetWritableRegion(ExtendOffset(chromaOffset), chromaBytes);

        WriteChroma(
            chromaRegion.Memory.Span,
            surface.UPlane.AsSpan(),
            surface.VPlane.AsSpan(),
            surface.UvStride,
            surface.UvWidth,
            surface.UvHeight);
    }

    /// <summary>
    /// Converts the linear luma plane into the block linear destination.
    /// </summary>
    private static void WriteLuma(Span<byte> dst, ReadOnlySpan<byte> src, int srcStride, int width, int height)
        => LayoutConverter.ConvertLinearToBlockLinear(dst, width, height, srcStride, 1, 2, src);

    /// <summary>
    /// Interleaves two separate linear U and V planes into the destination.
    /// </summary>
    /// <param name="dst">Destination data, addressed through <c>OffsetCalculator</c></param>
    /// <param name="srcU">Source U plane</param>
    /// <param name="srcV">Source V plane</param>
    /// <param name="srcStride">Distance in bytes between the rows of each source plane</param>
    /// <param name="width">Number of U/V pairs per row</param>
    /// <param name="height">Number of rows</param>
    private static unsafe void WriteChroma(
        Span<byte> dst,
        ReadOnlySpan<byte> srcU,
        ReadOnlySpan<byte> srcV,
        int srcStride,
        int width,
        int height)
    {
        var layout = new OffsetCalculator(width, height, 0, false, 2, 2);

        if (!Sse2.IsSupported)
        {
            // Scalar path, one U/V pair at a time.
            for (int row = 0; row < height; row++)
            {
                int rowBase = row * srcStride;

                layout.SetY(row);

                for (int col = 0; col < width; col++)
                {
                    int pairOffset = layout.GetOffset(col);

                    dst[pairOffset + 0] = srcU[rowBase + col];
                    dst[pairOffset + 1] = srcV[rowBase + col];
                }
            }

            return;
        }

        // Number of destination bytes per row handled by the vector loop (64 bytes, or 32 pairs, per step).
        int vectorBytes = BitUtils.AlignDown(width * 2, 64);

        // Bytes to skip on the source planes after each row.
        int rowPadding = srcStride - width;

        fixed (byte* dstBase = dst, uBase = srcU, vBase = srcV)
        {
            byte* uIn = uBase;
            byte* vIn = vBase;

            for (int row = 0; row < height; row++)
            {
                layout.SetY(row);

                int byteX = 0;

                while (byteX < vectorBytes)
                {
                    // The 64 bytes are stored as four 16 byte pieces at +0, +0x20, +0x100 and +0x120,
                    // presumably where the block linear layout places them - confirm against OffsetCalculator.
                    byte* chunk = dstBase + layout.GetOffsetWithLineOffset64(byteX);

                    Vector128<byte> uFirst = *(Vector128<byte>*)uIn;
                    Vector128<byte> vFirst = *(Vector128<byte>*)vIn;
                    Vector128<byte> uSecond = *(Vector128<byte>*)(uIn + 16);
                    Vector128<byte> vSecond = *(Vector128<byte>*)(vIn + 16);

                    // Unpack low/high interleave the bytes of U and V, producing U/V pairs.
                    Vector128<byte> pairs0 = Sse2.UnpackLow(uFirst, vFirst);
                    Vector128<byte> pairs1 = Sse2.UnpackHigh(uFirst, vFirst);
                    Vector128<byte> pairs2 = Sse2.UnpackLow(uSecond, vSecond);
                    Vector128<byte> pairs3 = Sse2.UnpackHigh(uSecond, vSecond);

                    *(Vector128<byte>*)chunk = pairs0;
                    *(Vector128<byte>*)(chunk + 0x20) = pairs1;
                    *(Vector128<byte>*)(chunk + 0x100) = pairs2;
                    *(Vector128<byte>*)(chunk + 0x120) = pairs3;

                    byteX += 64;
                    uIn += 32;
                    vIn += 32;
                }

                // Pairs that did not fit on a full 64 byte step are copied one by one.
                for (int col = vectorBytes / 2; col < width; col++)
                {
                    byte* pair = dstBase + layout.GetOffset(col);

                    pair[0] = *uIn++;
                    pair[1] = *vIn++;
                }

                uIn += rowPadding;
                vIn += rowPadding;
            }
        }
    }
}
}