Mirror of https://github.com/shadps4-emu/shadPS4.git
vector_alu: Improve handling of mbcnt append/consume patterns (#3184)
* vector_alu: Improve handling of mbcnt append/consume patterns

The existing implementation was written to handle a single pattern, with the mbcnt pair before the DS_APPEND instruction:

```
v_mbcnt_hi_u32_b32 vX, exec_hi, 0
v_mbcnt_lo_u32_b32 vX, exec_lo, vX
ds_append vY offset:4 gds
v_add_i32 vX, vcc, vY, vX
```

In this case, however, the DS_APPEND comes before the mbcnt pair:

```
ds_append vX gds
v_mbcnt_hi_u32_b32 vY, exec_hi, vX
v_mbcnt_lo_u32_b32 vZ, exec_lo, vY
```

The mbcnt instructions always come in hi/lo pairs and are quite flexible in general, but they assume a subgroup size of 64, so they are not recompiled literally. Together with DS_APPEND they are used to derive a unique per-thread index into a buffer (as opposed to using thread_id, since the ordering could be random). DS_APPEND works at the subgroup level: it adds the number of active threads of the subgroup to the GDS counter, essentially giving a multiple-of-64 base index to all threads. Each thread then executes the mbcnt pair, which returns the number of active threads with an id lower than its own, and adds that count to the base.

The recompiler translates DS_APPEND into an atomic increment of a storage buffer counter, which already yields the desired unique index, so this pattern is a no-op. On main, the mbcnt result was set to zero, as per the first pattern, to avoid altering the DS_APPEND result. The new handling passes through the initial value of the pattern instead, which has the same effect but works in either case.

* vk_rasterizer: Always sync DMA buffers
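As a concrete illustration of why the pattern is a no-op, here is a minimal standalone C++ sketch, not shadPS4 code: the exec mask and counter values are made-up examples. It shows how ds_append plus the mbcnt pair assign per-thread indices, and why one atomic increment per active lane hands out the same set of indices:

```cpp
#include <bit>
#include <cstdint>
#include <cstdio>

int main() {
    const uint64_t exec = 0b1011'0110;  // example EXEC mask: active lanes of one subgroup
    uint32_t gds_counter = 100;         // example GDS counter value before the append

    // ds_append: a single atomic add of popcount(EXEC) for the whole
    // subgroup; the pre-add value is the shared base index.
    const uint32_t base = gds_counter;
    gds_counter += static_cast<uint32_t>(std::popcount(exec));

    // v_mbcnt_hi/lo pair: each active lane counts the active lanes with
    // an id lower than its own, then adds that count to the base.
    for (uint32_t lane = 0; lane < 64; ++lane) {
        if (!(exec & (uint64_t{1} << lane))) {
            continue;  // inactive lane: does not execute, gets no index
        }
        const uint64_t lower_lanes = (uint64_t{1} << lane) - 1;
        const uint32_t mbcnt = static_cast<uint32_t>(std::popcount(exec & lower_lanes));
        std::printf("lane %2u -> index %u\n", lane, base + mbcnt);
    }
    // A recompiled per-invocation atomic increment of the counter yields the
    // same index set {base, ..., base + popcount(exec) - 1}, merely in an
    // unspecified order, so the mbcnt pair can pass its input through.
}
```

Because the atomic counter increment already returns a unique slot per invocation, the translator can forward the mbcnt source operand (`SetDst(inst.dst[0], GetSrc(inst.src[1]))` in the diff below) instead of forcing the result to zero.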
Parent: 7431b30005
Commit: 48460d1cbe
2 changed files with 16 additions and 12 deletions
vector_alu:

```diff
@@ -558,27 +558,31 @@ void Translator::V_BCNT_U32_B32(const GcnInst& inst) {
 
 void Translator::V_MBCNT_U32_B32(bool is_low, const GcnInst& inst) {
     if (!is_low) {
-        // v_mbcnt_hi_u32_b32 v2, -1, 0
+        // v_mbcnt_hi_u32_b32 vX, -1, 0
         if (inst.src[0].field == OperandField::SignedConstIntNeg && inst.src[0].code == 193 &&
             inst.src[1].field == OperandField::ConstZero) {
             return;
         }
-        // v_mbcnt_hi_u32_b32 vX, exec_hi, 0
-        if (inst.src[0].field == OperandField::ExecHi &&
-            inst.src[1].field == OperandField::ConstZero) {
-            return;
+        // v_mbcnt_hi_u32_b32 vX, exec_hi, 0/vZ
+        if ((inst.src[0].field == OperandField::ExecHi ||
+             inst.src[0].field == OperandField::VccHi) &&
+            (inst.src[1].field == OperandField::ConstZero ||
+             inst.src[1].field == OperandField::VectorGPR)) {
+            return SetDst(inst.dst[0], GetSrc(inst.src[1]));
         }
         UNREACHABLE();
     } else {
-        // v_mbcnt_lo_u32_b32 v2, -1, vX
+        // v_mbcnt_lo_u32_b32 vY, -1, vX
         // used combined with above to fetch lane id in non-compute stages
         if (inst.src[0].field == OperandField::SignedConstIntNeg && inst.src[0].code == 193) {
-            SetDst(inst.dst[0], ir.LaneId());
+            return SetDst(inst.dst[0], ir.LaneId());
         }
-        // v_mbcnt_lo_u32_b32 v20, exec_lo, vX
-        // used combined in above for append buffer indexing.
-        if (inst.src[0].field == OperandField::ExecLo) {
-            SetDst(inst.dst[0], ir.Imm32(0));
+        // v_mbcnt_lo_u32_b32 vY, exec_lo, vX
+        // used combined with above for append buffer indexing.
+        if (inst.src[0].field == OperandField::ExecLo || inst.src[0].field == OperandField::VccLo) {
+            return SetDst(inst.dst[0], GetSrc(inst.src[1]));
         }
         UNREACHABLE();
     }
 }
```
vk_rasterizer:

```diff
@@ -471,7 +471,7 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) {
         uses_dma |= stage->uses_dma;
     }
 
-    if (uses_dma && !fault_process_pending) {
+    if (uses_dma) {
         // We only use fault buffer for DMA right now.
         {
             Common::RecursiveSharedLock lock{mapped_ranges_mutex};
```