From be12305f65a779a0b2a13c2f0885d02cd120a909 Mon Sep 17 00:00:00 2001 From: Lander Gallastegi Date: Fri, 20 Jun 2025 12:00:23 +0200 Subject: [PATCH] video_core: Page manager/region manager optimization (#3070) * Bit array test * Some corrections * Fix AVX path on SetRange * Finish bitArray * Batched protect progress * Inclusion fix * Last logic fixes for BitArray * Page manager: batch protect, masked ranges * Page manager bitarray * clang-format * Fix out of bounds read * clang * clang * Lock during callbacks * Rename untracked to writeable * Construct and mask in one step * Sync on region mutex for thw whole protection This is a temporary workarround until a fix is found for the page manager having issues when multiple threads update the same page at the same time. * Bring back the gpu masking until properly handled * Sync page manager protections * clang-format * Rename and fixups * I fucked up clang-formatting one more time... * kek --- CMakeLists.txt | 6 +- src/common/bit_array.h | 411 ++++++++++++++++++ src/video_core/buffer_cache/buffer_cache.h | 2 +- ...memory_tracker_base.h => memory_tracker.h} | 2 +- .../buffer_cache/region_definitions.h | 28 ++ src/video_core/buffer_cache/region_manager.h | 208 +++++++++ src/video_core/buffer_cache/word_manager.h | 296 ------------- src/video_core/page_manager.cpp | 175 +++++--- src/video_core/page_manager.h | 8 +- .../texture_cache/texture_cache.cpp | 6 +- 10 files changed, 781 insertions(+), 361 deletions(-) create mode 100644 src/common/bit_array.h rename src/video_core/buffer_cache/{memory_tracker_base.h => memory_tracker.h} (99%) create mode 100644 src/video_core/buffer_cache/region_definitions.h create mode 100644 src/video_core/buffer_cache/region_manager.h delete mode 100644 src/video_core/buffer_cache/word_manager.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 09fddb3d7..d8fe5f68b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -653,6 +653,7 @@ set(COMMON src/common/logging/backend.cpp src/common/arch.h src/common/assert.cpp src/common/assert.h + src/common/bit_array.h src/common/bit_field.h src/common/bounded_threadsafe_queue.h src/common/concepts.h @@ -913,9 +914,10 @@ set(VIDEO_CORE src/video_core/amdgpu/liverpool.cpp src/video_core/buffer_cache/buffer.h src/video_core/buffer_cache/buffer_cache.cpp src/video_core/buffer_cache/buffer_cache.h - src/video_core/buffer_cache/memory_tracker_base.h + src/video_core/buffer_cache/memory_tracker.h src/video_core/buffer_cache/range_set.h - src/video_core/buffer_cache/word_manager.h + src/video_core/buffer_cache/region_definitions.h + src/video_core/buffer_cache/region_manager.h src/video_core/renderer_vulkan/liverpool_to_vk.cpp src/video_core/renderer_vulkan/liverpool_to_vk.h src/video_core/renderer_vulkan/vk_common.cpp diff --git a/src/common/bit_array.h b/src/common/bit_array.h new file mode 100644 index 000000000..f211bbf95 --- /dev/null +++ b/src/common/bit_array.h @@ -0,0 +1,411 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include +#include "common/types.h" + +#ifdef __AVX2__ +#define BIT_ARRAY_USE_AVX +#include +#endif + +namespace Common { + +template +class BitArray { + static_assert(N % 64 == 0, "BitArray size must be a multiple of 64 bits."); + + static constexpr size_t BITS_PER_WORD = 64; + static constexpr size_t WORD_COUNT = N / BITS_PER_WORD; + static constexpr size_t WORDS_PER_AVX = 4; + static constexpr size_t AVX_WORD_COUNT = WORD_COUNT / WORDS_PER_AVX; + +public: + using Range = std::pair; + + class Iterator { + public: + explicit Iterator(const BitArray& bit_array_, u64 start) : bit_array(bit_array_) { + range = bit_array.FirstRangeFrom(start); + } + + Iterator& operator++() { + range = bit_array.FirstRangeFrom(range.second); + return *this; + } + + bool operator==(const Iterator& other) const { + return range == other.range; + } + + bool operator!=(const Iterator& other) const { + return !(*this == other); + } + + const Range& operator*() const { + return range; + } + + const Range* operator->() const { + return ⦥ + } + + private: + const BitArray& bit_array; + Range range; + }; + + using const_iterator = Iterator; + using iterator_category = std::forward_iterator_tag; + using value_type = Range; + using difference_type = std::ptrdiff_t; + using pointer = const Range*; + using reference = const Range&; + + BitArray() = default; + BitArray(const BitArray& other) = default; + BitArray& operator=(const BitArray& other) = default; + BitArray(BitArray&& other) noexcept = default; + BitArray& operator=(BitArray&& other) noexcept = default; + ~BitArray() = default; + + BitArray(const BitArray& other, size_t start, size_t end) { + if (start >= end || end > N) { + return; + } + const size_t first_word = start / BITS_PER_WORD; + const size_t last_word = (end - 1) / BITS_PER_WORD; + const size_t start_bit = start % BITS_PER_WORD; + const size_t end_bit = (end - 1) % BITS_PER_WORD; + const u64 start_mask = ~((1ULL << start_bit) - 1); + const u64 end_mask = end_bit == BITS_PER_WORD - 1 ? ~0ULL : (1ULL << (end_bit + 1)) - 1; + if (first_word == last_word) { + data[first_word] = other.data[first_word] & (start_mask & end_mask); + } else { + data[first_word] = other.data[first_word] & start_mask; + size_t i = first_word + 1; +#ifdef BIT_ARRAY_USE_AVX + for (; i + WORDS_PER_AVX <= last_word; i += WORDS_PER_AVX) { + const __m256i current = + _mm256_loadu_si256(reinterpret_cast(&other.data[i])); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(&data[i]), current); + } +#endif + for (; i < last_word; ++i) { + data[i] = other.data[i]; + } + data[last_word] = other.data[last_word] & end_mask; + } + } + + BitArray(const BitArray& other, const Range& range) + : BitArray(other, range.first, range.second) {} + + const_iterator begin() const { + return Iterator(*this, 0); + } + const_iterator end() const { + return Iterator(*this, N); + } + + inline constexpr void Set(size_t idx) { + data[idx / BITS_PER_WORD] |= (1ULL << (idx % BITS_PER_WORD)); + } + + inline constexpr void Unset(size_t idx) { + data[idx / BITS_PER_WORD] &= ~(1ULL << (idx % BITS_PER_WORD)); + } + + inline constexpr bool Get(size_t idx) const { + return (data[idx / BITS_PER_WORD] & (1ULL << (idx % BITS_PER_WORD))) != 0; + } + + inline void SetRange(size_t start, size_t end) { + if (start >= end || end > N) { + return; + } + const size_t first_word = start / BITS_PER_WORD; + const size_t last_word = (end - 1) / BITS_PER_WORD; + const size_t start_bit = start % BITS_PER_WORD; + const size_t end_bit = (end - 1) % BITS_PER_WORD; + const u64 start_mask = ~((1ULL << start_bit) - 1); + const u64 end_mask = end_bit == BITS_PER_WORD - 1 ? ~0ULL : (1ULL << (end_bit + 1)) - 1; + if (first_word == last_word) { + data[first_word] |= start_mask & end_mask; + } else { + data[first_word] |= start_mask; + size_t i = first_word + 1; +#ifdef BIT_ARRAY_USE_AVX + const __m256i value = _mm256_set1_epi64x(-1); + for (; i + WORDS_PER_AVX <= last_word; i += WORDS_PER_AVX) { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(&data[i]), value); + } +#endif + for (; i < last_word; ++i) { + data[i] = ~0ULL; + } + data[last_word] |= end_mask; + } + } + + inline void UnsetRange(size_t start, size_t end) { + if (start >= end || end > N) { + return; + } + size_t first_word = start / BITS_PER_WORD; + const size_t last_word = (end - 1) / BITS_PER_WORD; + const size_t start_bit = start % BITS_PER_WORD; + const size_t end_bit = (end - 1) % BITS_PER_WORD; + const u64 start_mask = (1ULL << start_bit) - 1; + const u64 end_mask = end_bit == BITS_PER_WORD - 1 ? 0ULL : ~((1ULL << (end_bit + 1)) - 1); + if (first_word == last_word) { + data[first_word] &= start_mask | end_mask; + } else { + data[first_word] &= start_mask; + size_t i = first_word + 1; +#ifdef BIT_ARRAY_USE_AVX + const __m256i value = _mm256_setzero_si256(); + for (; i + WORDS_PER_AVX <= last_word; i += WORDS_PER_AVX) { + _mm256_storeu_si256(reinterpret_cast<__m256i*>(&data[i]), value); + } +#endif + for (; i < last_word; ++i) { + data[i] = 0ULL; + } + data[last_word] &= end_mask; + } + } + + inline constexpr void SetRange(const Range& range) { + SetRange(range.first, range.second); + } + + inline constexpr void UnsetRange(const Range& range) { + UnsetRange(range.first, range.second); + } + + inline constexpr void Clear() { + data.fill(0); + } + + inline constexpr void Fill() { + data.fill(~0ULL); + } + + inline constexpr bool None() const { + u64 result = 0; + for (const auto& word : data) { + result |= word; + } + return result == 0; + } + + inline constexpr bool Any() const { + return !None(); + } + + Range FirstRangeFrom(size_t start) const { + if (start >= N) { + return {N, N}; + } + const auto find_end_bit = [&](size_t word) { +#ifdef BIT_ARRAY_USE_AVX + const __m256i all_one = _mm256_set1_epi64x(-1); + for (; word + WORDS_PER_AVX <= WORD_COUNT; word += WORDS_PER_AVX) { + const __m256i current = + _mm256_loadu_si256(reinterpret_cast(&data[word])); + const __m256i cmp = _mm256_cmpeq_epi64(current, all_one); + if (_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) { + break; + } + } +#endif + for (; word < WORD_COUNT; ++word) { + if (data[word] != ~0ULL) { + return (word * BITS_PER_WORD) + std::countr_one(data[word]); + } + } + return N; + }; + + const auto word_bits = [&](size_t index, u64 word) { + const int empty_bits = std::countr_zero(word); + const int ones_count = std::countr_one(word >> empty_bits); + const size_t start_bit = index * BITS_PER_WORD + empty_bits; + if (ones_count + empty_bits < BITS_PER_WORD) { + return Range{start_bit, start_bit + ones_count}; + } + return Range{start_bit, find_end_bit(index + 1)}; + }; + + const size_t start_word = start / BITS_PER_WORD; + const size_t start_bit = start % BITS_PER_WORD; + const u64 masked_first = data[start_word] & (~((1ULL << start_bit) - 1)); + if (masked_first) { + return word_bits(start_word, masked_first); + } + + size_t word = start_word + 1; +#ifdef BIT_ARRAY_USE_AVX + for (; word + WORDS_PER_AVX <= WORD_COUNT; word += WORDS_PER_AVX) { + const __m256i current = + _mm256_loadu_si256(reinterpret_cast(&data[word])); + if (!_mm256_testz_si256(current, current)) { + break; + } + } +#endif + for (; word < WORD_COUNT; ++word) { + if (data[word] != 0) { + return word_bits(word, data[word]); + } + } + return {N, N}; + } + + inline constexpr Range FirstRange() const { + return FirstRangeFrom(0); + } + + Range LastRangeFrom(size_t end) const { + if (end == 0) { + return {0, 0}; + } + if (end > N) { + end = N; + } + const auto find_start_bit = [&](size_t word) { +#ifdef BIT_ARRAY_USE_AVX + const __m256i all_zero = _mm256_setzero_si256(); + for (; word >= WORDS_PER_AVX; word -= WORDS_PER_AVX) { + const __m256i current = _mm256_loadu_si256( + reinterpret_cast(&data[word - WORDS_PER_AVX])); + const __m256i cmp = _mm256_cmpeq_epi64(current, all_zero); + if (_mm256_movemask_epi8(cmp) != 0xFFFFFFFF) { + break; + } + } +#endif + for (; word > 0; --word) { + if (data[word - 1] != ~0ULL) { + return word * BITS_PER_WORD - std::countl_one(data[word - 1]); + } + } + return size_t(0); + }; + const auto word_bits = [&](size_t index, u64 word) { + const int empty_bits = std::countl_zero(word); + const int ones_count = std::countl_one(word << empty_bits); + const size_t end_bit = index * BITS_PER_WORD - empty_bits; + if (empty_bits + ones_count < BITS_PER_WORD) { + return Range{end_bit - ones_count, end_bit}; + } + return Range{find_start_bit(index - 1), end_bit}; + }; + const size_t end_word = ((end - 1) / BITS_PER_WORD) + 1; + const size_t end_bit = (end - 1) % BITS_PER_WORD; + u64 masked_last = data[end_word - 1]; + if (end_bit < BITS_PER_WORD - 1) { + masked_last &= (1ULL << (end_bit + 1)) - 1; + } + if (masked_last) { + return word_bits(end_word, masked_last); + } + size_t word = end_word - 1; +#ifdef BIT_ARRAY_USE_AVX + for (; word >= WORDS_PER_AVX; word -= WORDS_PER_AVX) { + const __m256i current = + _mm256_loadu_si256(reinterpret_cast(&data[word - WORDS_PER_AVX])); + if (!_mm256_testz_si256(current, current)) { + break; + } + } +#endif + for (; word > 0; --word) { + if (data[word - 1] != 0) { + return word_bits(word, data[word - 1]); + } + } + return {0, 0}; + } + + inline constexpr Range LastRange() const { + return LastRangeFrom(N); + } + + inline constexpr size_t Size() const { + return N; + } + + inline constexpr BitArray& operator|=(const BitArray& other) { + for (size_t i = 0; i < WORD_COUNT; ++i) { + data[i] |= other.data[i]; + } + return *this; + } + + inline constexpr BitArray& operator&=(const BitArray& other) { + for (size_t i = 0; i < WORD_COUNT; ++i) { + data[i] &= other.data[i]; + } + return *this; + } + + inline constexpr BitArray& operator^=(const BitArray& other) { + for (size_t i = 0; i < WORD_COUNT; ++i) { + data[i] ^= other.data[i]; + } + return *this; + } + + inline constexpr BitArray& operator~() { + for (size_t i = 0; i < WORD_COUNT; ++i) { + data[i] = ~data[i]; + } + return *this; + } + + inline constexpr BitArray operator|(const BitArray& other) const { + BitArray result = *this; + result |= other; + return result; + } + + inline constexpr BitArray operator&(const BitArray& other) const { + BitArray result = *this; + result &= other; + return result; + } + + inline constexpr BitArray operator^(const BitArray& other) const { + BitArray result = *this; + result ^= other; + return result; + } + + inline constexpr BitArray operator~() const { + BitArray result = *this; + result = ~result; + return result; + } + + inline constexpr bool operator==(const BitArray& other) const { + u64 result = 0; + for (size_t i = 0; i < WORD_COUNT; ++i) { + result |= data[i] ^ other.data[i]; + } + return result == 0; + } + + inline constexpr bool operator!=(const BitArray& other) const { + return !(*this == other); + } + +private: + std::array data{}; +}; + +} // namespace Common \ No newline at end of file diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index d7d753213..651ba84dc 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -9,7 +9,7 @@ #include "common/slot_vector.h" #include "common/types.h" #include "video_core/buffer_cache/buffer.h" -#include "video_core/buffer_cache/memory_tracker_base.h" +#include "video_core/buffer_cache/memory_tracker.h" #include "video_core/buffer_cache/range_set.h" #include "video_core/multi_level_page_table.h" diff --git a/src/video_core/buffer_cache/memory_tracker_base.h b/src/video_core/buffer_cache/memory_tracker.h similarity index 99% rename from src/video_core/buffer_cache/memory_tracker_base.h rename to src/video_core/buffer_cache/memory_tracker.h index c60aa9c80..37fafa2d6 100644 --- a/src/video_core/buffer_cache/memory_tracker_base.h +++ b/src/video_core/buffer_cache/memory_tracker.h @@ -9,7 +9,7 @@ #include #include "common/debug.h" #include "common/types.h" -#include "video_core/buffer_cache/word_manager.h" +#include "video_core/buffer_cache/region_manager.h" namespace VideoCore { diff --git a/src/video_core/buffer_cache/region_definitions.h b/src/video_core/buffer_cache/region_definitions.h new file mode 100644 index 000000000..80c6afdc6 --- /dev/null +++ b/src/video_core/buffer_cache/region_definitions.h @@ -0,0 +1,28 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include "common/bit_array.h" +#include "common/types.h" + +namespace VideoCore { + +constexpr u64 PAGES_PER_WORD = 64; +constexpr u64 BYTES_PER_PAGE = 4_KB; + +constexpr u64 HIGHER_PAGE_BITS = 22; +constexpr u64 HIGHER_PAGE_SIZE = 1ULL << HIGHER_PAGE_BITS; +constexpr u64 HIGHER_PAGE_MASK = HIGHER_PAGE_SIZE - 1ULL; +constexpr u64 NUM_REGION_PAGES = HIGHER_PAGE_SIZE / BYTES_PER_PAGE; + +enum class Type { + CPU, + GPU, + Writeable, +}; + +using RegionBits = Common::BitArray; + +} // namespace VideoCore \ No newline at end of file diff --git a/src/video_core/buffer_cache/region_manager.h b/src/video_core/buffer_cache/region_manager.h new file mode 100644 index 000000000..07ffee36b --- /dev/null +++ b/src/video_core/buffer_cache/region_manager.h @@ -0,0 +1,208 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include +#include "common/div_ceil.h" + +#ifdef __linux__ +#include "common/adaptive_mutex.h" +#else +#include "common/spin_lock.h" +#endif +#include "common/debug.h" +#include "common/types.h" +#include "video_core/buffer_cache/region_definitions.h" +#include "video_core/page_manager.h" + +namespace VideoCore { + +/** + * Allows tracking CPU and GPU modification of pages in a contigious 4MB virtual address region. + * Information is stored in bitsets for spacial locality and fast update of single pages. + */ +class RegionManager { +public: + explicit RegionManager(PageManager* tracker_, VAddr cpu_addr_) + : tracker{tracker_}, cpu_addr{cpu_addr_} { + cpu.Fill(); + gpu.Clear(); + writeable.Fill(); + } + explicit RegionManager() = default; + + void SetCpuAddress(VAddr new_cpu_addr) { + cpu_addr = new_cpu_addr; + } + + VAddr GetCpuAddr() const { + return cpu_addr; + } + + static constexpr size_t SanitizeAddress(size_t address) { + return static_cast(std::max(static_cast(address), 0LL)); + } + + template + RegionBits& GetRegionBits() noexcept { + static_assert(type != Type::Writeable); + if constexpr (type == Type::CPU) { + return cpu; + } else if constexpr (type == Type::GPU) { + return gpu; + } else if constexpr (type == Type::Writeable) { + return writeable; + } else { + static_assert(false, "Invalid type"); + } + } + + template + const RegionBits& GetRegionBits() const noexcept { + static_assert(type != Type::Writeable); + if constexpr (type == Type::CPU) { + return cpu; + } else if constexpr (type == Type::GPU) { + return gpu; + } else if constexpr (type == Type::Writeable) { + return writeable; + } else { + static_assert(false, "Invalid type"); + } + } + + /** + * Change the state of a range of pages + * + * @param dirty_addr Base address to mark or unmark as modified + * @param size Size in bytes to mark or unmark as modified + */ + template + void ChangeRegionState(u64 dirty_addr, u64 size) noexcept(type == Type::GPU) { + RENDERER_TRACE; + const size_t offset = dirty_addr - cpu_addr; + const size_t start_page = SanitizeAddress(offset) / BYTES_PER_PAGE; + const size_t end_page = Common::DivCeil(SanitizeAddress(offset + size), BYTES_PER_PAGE); + if (start_page >= NUM_REGION_PAGES || end_page <= start_page) { + return; + } + std::scoped_lock lk{lock}; + static_assert(type != Type::Writeable); + + RegionBits& bits = GetRegionBits(); + if constexpr (enable) { + bits.SetRange(start_page, end_page); + } else { + bits.UnsetRange(start_page, end_page); + } + if constexpr (type == Type::CPU) { + UpdateProtection(); + } + } + + /** + * Loop over each page in the given range, turn off those bits and notify the tracker if + * needed. Call the given function on each turned off range. + * + * @param query_cpu_range Base CPU address to loop over + * @param size Size in bytes of the CPU range to loop over + * @param func Function to call for each turned off region + */ + template + void ForEachModifiedRange(VAddr query_cpu_range, s64 size, auto&& func) { + RENDERER_TRACE; + const size_t offset = query_cpu_range - cpu_addr; + const size_t start_page = SanitizeAddress(offset) / BYTES_PER_PAGE; + const size_t end_page = Common::DivCeil(SanitizeAddress(offset + size), BYTES_PER_PAGE); + if (start_page >= NUM_REGION_PAGES || end_page <= start_page) { + return; + } + std::scoped_lock lk{lock}; + static_assert(type != Type::Writeable); + + RegionBits& bits = GetRegionBits(); + RegionBits mask(bits, start_page, end_page); + + // TODO: this will not be needed once we handle readbacks + if constexpr (type == Type::GPU) { + mask &= ~writeable; + } + + for (const auto& [start, end] : mask) { + func(cpu_addr + start * BYTES_PER_PAGE, (end - start) * BYTES_PER_PAGE); + } + + if constexpr (clear) { + bits.UnsetRange(start_page, end_page); + if constexpr (type == Type::CPU) { + UpdateProtection(); + } + } + } + + /** + * Returns true when a region has been modified + * + * @param offset Offset in bytes from the start of the buffer + * @param size Size in bytes of the region to query for modifications + */ + template + [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { + RENDERER_TRACE; + const size_t start_page = SanitizeAddress(offset) / BYTES_PER_PAGE; + const size_t end_page = Common::DivCeil(SanitizeAddress(offset + size), BYTES_PER_PAGE); + if (start_page >= NUM_REGION_PAGES || end_page <= start_page) { + return false; + } + // std::scoped_lock lk{lock}; // Is this needed? + static_assert(type != Type::Writeable); + + const RegionBits& bits = GetRegionBits(); + RegionBits test(bits, start_page, end_page); + + // TODO: this will not be needed once we handle readbacks + if constexpr (type == Type::GPU) { + test &= ~writeable; + } + + return test.Any(); + } + +private: + /** + * Notify tracker about changes in the CPU tracking state of a word in the buffer + * + * @param word_index Index to the word to notify to the tracker + * @param current_bits Current state of the word + * @param new_bits New state of the word + * + * @tparam add_to_tracker True when the tracker should start tracking the new pages + */ + template + void UpdateProtection() { + RENDERER_TRACE; + RegionBits mask = cpu ^ writeable; + + if (mask.None()) { + return; // No changes to the CPU tracking state + } + + writeable = cpu; + tracker->UpdatePageWatchersForRegion(cpu_addr, mask); + } + +#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP + Common::AdaptiveMutex lock; +#else + Common::SpinLock lock; +#endif + PageManager* tracker; + VAddr cpu_addr = 0; + RegionBits cpu; + RegionBits gpu; + RegionBits writeable; +}; + +} // namespace VideoCore diff --git a/src/video_core/buffer_cache/word_manager.h b/src/video_core/buffer_cache/word_manager.h deleted file mode 100644 index 51a912c62..000000000 --- a/src/video_core/buffer_cache/word_manager.h +++ /dev/null @@ -1,296 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#pragma once - -#include -#include -#include -#include - -#ifdef __linux__ -#include "common/adaptive_mutex.h" -#else -#include "common/spin_lock.h" -#endif -#include "common/debug.h" -#include "common/types.h" -#include "video_core/page_manager.h" - -namespace VideoCore { - -constexpr u64 PAGES_PER_WORD = 64; -constexpr u64 BYTES_PER_PAGE = 4_KB; -constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; - -constexpr u64 HIGHER_PAGE_BITS = 22; -constexpr u64 HIGHER_PAGE_SIZE = 1ULL << HIGHER_PAGE_BITS; -constexpr u64 HIGHER_PAGE_MASK = HIGHER_PAGE_SIZE - 1ULL; -constexpr u64 NUM_REGION_WORDS = HIGHER_PAGE_SIZE / BYTES_PER_WORD; - -enum class Type { - CPU, - GPU, - Untracked, -}; - -using WordsArray = std::array; - -/** - * Allows tracking CPU and GPU modification of pages in a contigious 4MB virtual address region. - * Information is stored in bitsets for spacial locality and fast update of single pages. - */ -class RegionManager { -public: - explicit RegionManager(PageManager* tracker_, VAddr cpu_addr_) - : tracker{tracker_}, cpu_addr{cpu_addr_} { - cpu.fill(~u64{0}); - gpu.fill(0); - untracked.fill(~u64{0}); - } - explicit RegionManager() = default; - - void SetCpuAddress(VAddr new_cpu_addr) { - cpu_addr = new_cpu_addr; - } - - VAddr GetCpuAddr() const { - return cpu_addr; - } - - static constexpr u64 ExtractBits(u64 word, size_t page_start, size_t page_end) { - constexpr size_t number_bits = sizeof(u64) * 8; - const size_t limit_page_end = number_bits - std::min(page_end, number_bits); - u64 bits = (word >> page_start) << page_start; - bits = (bits << limit_page_end) >> limit_page_end; - return bits; - } - - static constexpr std::pair GetWordPage(VAddr address) { - const size_t converted_address = static_cast(address); - const size_t word_number = converted_address / BYTES_PER_WORD; - const size_t amount_pages = converted_address % BYTES_PER_WORD; - return std::make_pair(word_number, amount_pages / BYTES_PER_PAGE); - } - - template - void IterateWords(size_t offset, size_t size, Func&& func) const { - RENDERER_TRACE; - using FuncReturn = std::invoke_result_t; - static constexpr bool BOOL_BREAK = std::is_same_v; - const size_t start = static_cast(std::max(static_cast(offset), 0LL)); - const size_t end = static_cast(std::max(static_cast(offset + size), 0LL)); - if (start >= HIGHER_PAGE_SIZE || end <= start) { - return; - } - auto [start_word, start_page] = GetWordPage(start); - auto [end_word, end_page] = GetWordPage(end + BYTES_PER_PAGE - 1ULL); - constexpr size_t num_words = NUM_REGION_WORDS; - start_word = std::min(start_word, num_words); - end_word = std::min(end_word, num_words); - const size_t diff = end_word - start_word; - end_word += (end_page + PAGES_PER_WORD - 1ULL) / PAGES_PER_WORD; - end_word = std::min(end_word, num_words); - end_page += diff * PAGES_PER_WORD; - constexpr u64 base_mask{~0ULL}; - for (size_t word_index = start_word; word_index < end_word; word_index++) { - const u64 mask = ExtractBits(base_mask, start_page, end_page); - start_page = 0; - end_page -= PAGES_PER_WORD; - if constexpr (BOOL_BREAK) { - if (func(word_index, mask)) { - return; - } - } else { - func(word_index, mask); - } - } - } - - void IteratePages(u64 mask, auto&& func) const { - RENDERER_TRACE; - size_t offset = 0; - while (mask != 0) { - const size_t empty_bits = std::countr_zero(mask); - offset += empty_bits; - mask >>= empty_bits; - - const size_t continuous_bits = std::countr_one(mask); - func(offset, continuous_bits); - mask = continuous_bits < PAGES_PER_WORD ? (mask >> continuous_bits) : 0; - offset += continuous_bits; - } - } - - /** - * Change the state of a range of pages - * - * @param dirty_addr Base address to mark or unmark as modified - * @param size Size in bytes to mark or unmark as modified - */ - template - void ChangeRegionState(u64 dirty_addr, u64 size) noexcept(type == Type::GPU) { - std::scoped_lock lk{lock}; - std::span state_words = Span(); - IterateWords(dirty_addr - cpu_addr, size, [&](size_t index, u64 mask) { - if constexpr (type == Type::CPU) { - UpdateProtection(index, untracked[index], mask); - } - if constexpr (enable) { - state_words[index] |= mask; - if constexpr (type == Type::CPU) { - untracked[index] |= mask; - } - } else { - state_words[index] &= ~mask; - if constexpr (type == Type::CPU) { - untracked[index] &= ~mask; - } - } - }); - } - - /** - * Loop over each page in the given range, turn off those bits and notify the tracker if - * needed. Call the given function on each turned off range. - * - * @param query_cpu_range Base CPU address to loop over - * @param size Size in bytes of the CPU range to loop over - * @param func Function to call for each turned off region - */ - template - void ForEachModifiedRange(VAddr query_cpu_range, s64 size, auto&& func) { - RENDERER_TRACE; - std::scoped_lock lk{lock}; - static_assert(type != Type::Untracked); - - std::span state_words = Span(); - const size_t offset = query_cpu_range - cpu_addr; - bool pending = false; - size_t pending_offset{}; - size_t pending_pointer{}; - const auto release = [&]() { - func(cpu_addr + pending_offset * BYTES_PER_PAGE, - (pending_pointer - pending_offset) * BYTES_PER_PAGE); - }; - IterateWords(offset, size, [&](size_t index, u64 mask) { - RENDERER_TRACE; - if constexpr (type == Type::GPU) { - mask &= ~untracked[index]; - } - const u64 word = state_words[index] & mask; - if constexpr (clear) { - if constexpr (type == Type::CPU) { - UpdateProtection(index, untracked[index], mask); - untracked[index] &= ~mask; - } - state_words[index] &= ~mask; - } - const size_t base_offset = index * PAGES_PER_WORD; - IteratePages(word, [&](size_t pages_offset, size_t pages_size) { - RENDERER_TRACE; - const auto reset = [&]() { - pending_offset = base_offset + pages_offset; - pending_pointer = base_offset + pages_offset + pages_size; - }; - if (!pending) { - reset(); - pending = true; - return; - } - if (pending_pointer == base_offset + pages_offset) { - pending_pointer += pages_size; - return; - } - release(); - reset(); - }); - }); - if (pending) { - release(); - } - } - - /** - * Returns true when a region has been modified - * - * @param offset Offset in bytes from the start of the buffer - * @param size Size in bytes of the region to query for modifications - */ - template - [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { - static_assert(type != Type::Untracked); - - const std::span state_words = Span(); - bool result = false; - IterateWords(offset, size, [&](size_t index, u64 mask) { - if constexpr (type == Type::GPU) { - mask &= ~untracked[index]; - } - const u64 word = state_words[index] & mask; - if (word != 0) { - result = true; - return true; - } - return false; - }); - return result; - } - -private: - /** - * Notify tracker about changes in the CPU tracking state of a word in the buffer - * - * @param word_index Index to the word to notify to the tracker - * @param current_bits Current state of the word - * @param new_bits New state of the word - * - * @tparam add_to_tracker True when the tracker should start tracking the new pages - */ - template - void UpdateProtection(u64 word_index, u64 current_bits, u64 new_bits) const { - RENDERER_TRACE; - constexpr s32 delta = add_to_tracker ? 1 : -1; - u64 changed_bits = (add_to_tracker ? current_bits : ~current_bits) & new_bits; - VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; - IteratePages(changed_bits, [&](size_t offset, size_t size) { - tracker->UpdatePageWatchers(addr + offset * BYTES_PER_PAGE, - size * BYTES_PER_PAGE); - }); - } - - template - std::span Span() noexcept { - if constexpr (type == Type::CPU) { - return cpu; - } else if constexpr (type == Type::GPU) { - return gpu; - } else if constexpr (type == Type::Untracked) { - return untracked; - } - } - - template - std::span Span() const noexcept { - if constexpr (type == Type::CPU) { - return cpu; - } else if constexpr (type == Type::GPU) { - return gpu; - } else if constexpr (type == Type::Untracked) { - return untracked; - } - } - -#ifdef PTHREAD_ADAPTIVE_MUTEX_INITIALIZER_NP - Common::AdaptiveMutex lock; -#else - Common::SpinLock lock; -#endif - PageManager* tracker; - VAddr cpu_addr = 0; - WordsArray cpu; - WordsArray gpu; - WordsArray untracked; -}; - -} // namespace VideoCore diff --git a/src/video_core/page_manager.cpp b/src/video_core/page_manager.cpp index 39c03e7da..145779070 100644 --- a/src/video_core/page_manager.cpp +++ b/src/video_core/page_manager.cpp @@ -48,19 +48,15 @@ struct PageManager::Impl { u8 AddDelta() { if constexpr (delta == 1) { return ++num_watchers; - } else { + } else if constexpr (delta == -1) { ASSERT_MSG(num_watchers > 0, "Not enough watchers"); return --num_watchers; + } else { + return num_watchers; } } }; - struct UpdateProtectRange { - VAddr addr; - u64 size; - Core::MemoryPermission perms; - }; - static constexpr size_t ADDRESS_BITS = 40; static constexpr size_t NUM_ADDRESS_PAGES = 1ULL << (40 - PAGE_BITS); inline static Vulkan::Rasterizer* rasterizer; @@ -190,66 +186,122 @@ struct PageManager::Impl { } #endif - template + template void UpdatePageWatchers(VAddr addr, u64 size) { RENDERER_TRACE; - boost::container::small_vector update_ranges; - { - std::scoped_lock lk(lock); - size_t page = addr >> PAGE_BITS; - auto perms = cached_pages[page].Perm(); - u64 range_begin = 0; - u64 range_bytes = 0; + size_t page = addr >> PAGE_BITS; + auto perms = cached_pages[page].Perm(); + u64 range_begin = 0; + u64 range_bytes = 0; - const auto release_pending = [&] { - if (range_bytes > 0) { - RENDERER_TRACE; - // Add pending (un)protect action - update_ranges.push_back({range_begin << PAGE_BITS, range_bytes, perms}); - range_bytes = 0; - } - }; + const auto release_pending = [&] { + if (range_bytes > 0) { + RENDERER_TRACE; + // Perform pending (un)protect action + Protect(range_begin << PAGE_BITS, range_bytes, perms); + range_bytes = 0; + } + }; - // Iterate requested pages - const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE); - const u64 aligned_addr = page << PAGE_BITS; - const u64 aligned_end = page_end << PAGE_BITS; - ASSERT_MSG(rasterizer->IsMapped(aligned_addr, aligned_end - aligned_addr), - "Attempted to track non-GPU memory at address {:#x}, size {:#x}.", - aligned_addr, aligned_end - aligned_addr); + std::scoped_lock lk(lock); - for (; page != page_end; ++page) { - PageState& state = cached_pages[page]; + // Iterate requested pages + const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE); + const u64 aligned_addr = page << PAGE_BITS; + const u64 aligned_end = page_end << PAGE_BITS; + ASSERT_MSG(rasterizer->IsMapped(aligned_addr, aligned_end - aligned_addr), + "Attempted to track non-GPU memory at address {:#x}, size {:#x}.", aligned_addr, + aligned_end - aligned_addr); - // Apply the change to the page state - const u8 new_count = state.AddDelta(); + for (; page != page_end; ++page) { + PageState& state = cached_pages[page]; + // Apply the change to the page state + const u8 new_count = state.AddDelta(); + + if (auto new_perms = state.Perm(); new_perms != perms) [[unlikely]] { // If the protection changed add pending (un)protect action - if (auto new_perms = state.Perm(); new_perms != perms) [[unlikely]] { - release_pending(); - perms = new_perms; - } - - // If the page must be (un)protected, add it to the pending range - if ((new_count == 0 && delta < 0) || (new_count == 1 && delta > 0)) { - if (range_bytes == 0) { - range_begin = page; - } - range_bytes += PAGE_SIZE; - } else { - release_pending(); - } + release_pending(); + perms = new_perms; + } else if (range_bytes != 0) { + // If the protection did not change, extend the current range + range_bytes += PAGE_SIZE; } - // Add pending (un)protect action - release_pending(); + // Only start a new range if the page must be (un)protected + if (range_bytes == 0 && ((new_count == 0 && !track) || (new_count == 1 && track))) { + range_begin = page; + range_bytes = PAGE_SIZE; + } } - // Flush deferred protects - for (const auto& range : update_ranges) { - Protect(range.addr, range.size, range.perms); + // Add pending (un)protect action + release_pending(); + } + + template + void UpdatePageWatchersForRegion(VAddr base_addr, RegionBits& mask) { + RENDERER_TRACE; + auto start_range = mask.FirstRange(); + auto end_range = mask.LastRange(); + + if (start_range.second == end_range.second) { + // Optimization: if all pages are contiguous, use the regular UpdatePageWatchers + const VAddr start_addr = base_addr + (start_range.first << PAGE_BITS); + const u64 size = (start_range.second - start_range.first) << PAGE_BITS; + + UpdatePageWatchers(start_addr, size); + return; } + + size_t base_page = (base_addr >> PAGE_BITS); + auto perms = cached_pages[base_page + start_range.first].Perm(); + u64 range_begin = 0; + u64 range_bytes = 0; + + const auto release_pending = [&] { + if (range_bytes > 0) { + RENDERER_TRACE; + // Perform pending (un)protect action + Protect((range_begin << PAGE_BITS), range_bytes, perms); + range_bytes = 0; + } + }; + + std::scoped_lock lk(lock); + + // Iterate pages + for (size_t page = start_range.first; page < end_range.second; ++page) { + PageState& state = cached_pages[base_page + page]; + const bool update = mask.Get(page); + + // Apply the change to the page state + const u8 new_count = update ? state.AddDelta() : state.AddDelta<0>(); + + if (auto new_perms = state.Perm(); new_perms != perms) [[unlikely]] { + // If the protection changed add pending (un)protect action + release_pending(); + perms = new_perms; + } else if (range_bytes != 0) { + // If the protection did not change, extend the current range + range_bytes += PAGE_SIZE; + } + + // If the page is not being updated, skip it + if (!update) { + continue; + } + + // Only start a new range if the page must be (un)protected + if (range_bytes == 0 && ((new_count == 0 && !track) || (new_count == 1 && track))) { + range_begin = base_page + page; + range_bytes = PAGE_SIZE; + } + } + + // Add pending (un)protect action + release_pending(); } std::array cached_pages{}; @@ -273,12 +325,21 @@ void PageManager::OnGpuUnmap(VAddr address, size_t size) { impl->OnUnmap(address, size); } -template +template void PageManager::UpdatePageWatchers(VAddr addr, u64 size) const { - impl->UpdatePageWatchers(addr, size); + impl->UpdatePageWatchers(addr, size); } -template void PageManager::UpdatePageWatchers<1>(VAddr addr, u64 size) const; -template void PageManager::UpdatePageWatchers<-1>(VAddr addr, u64 size) const; +template +void PageManager::UpdatePageWatchersForRegion(VAddr base_addr, RegionBits& mask) const { + impl->UpdatePageWatchersForRegion(base_addr, mask); +} + +template void PageManager::UpdatePageWatchers(VAddr addr, u64 size) const; +template void PageManager::UpdatePageWatchers(VAddr addr, u64 size) const; +template void PageManager::UpdatePageWatchersForRegion(VAddr base_addr, + RegionBits& mask) const; +template void PageManager::UpdatePageWatchersForRegion(VAddr base_addr, + RegionBits& mask) const; } // namespace VideoCore diff --git a/src/video_core/page_manager.h b/src/video_core/page_manager.h index 98dd099af..157b34984 100644 --- a/src/video_core/page_manager.h +++ b/src/video_core/page_manager.h @@ -6,6 +6,7 @@ #include #include "common/alignment.h" #include "common/types.h" +#include "video_core/buffer_cache//region_definitions.h" namespace Vulkan { class Rasterizer; @@ -28,9 +29,14 @@ public: void OnGpuUnmap(VAddr address, size_t size); /// Updates watches in the pages touching the specified region. - template + template void UpdatePageWatchers(VAddr addr, u64 size) const; + /// Updates watches in the pages touching the specified region + /// using a mask. + template + void UpdatePageWatchersForRegion(VAddr base_addr, RegionBits& mask) const; + /// Returns page aligned address. static constexpr VAddr GetPageAddr(VAddr addr) { return Common::AlignDown(addr, PAGE_SIZE); diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index a1ff5db8a..a50601af6 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -761,7 +761,7 @@ void TextureCache::UntrackImage(ImageId image_id) { image.track_addr = 0; image.track_addr_end = 0; if (size != 0) { - tracker.UpdatePageWatchers<-1>(addr, size); + tracker.UpdatePageWatchers(addr, size); } } @@ -780,7 +780,7 @@ void TextureCache::UntrackImageHead(ImageId image_id) { // Cehck its hash later. MarkAsMaybeDirty(image_id, image); } - tracker.UpdatePageWatchers<-1>(image_begin, size); + tracker.UpdatePageWatchers(image_begin, size); } void TextureCache::UntrackImageTail(ImageId image_id) { @@ -799,7 +799,7 @@ void TextureCache::UntrackImageTail(ImageId image_id) { // Cehck its hash later. MarkAsMaybeDirty(image_id, image); } - tracker.UpdatePageWatchers<-1>(addr, size); + tracker.UpdatePageWatchers(addr, size); } void TextureCache::DeleteImage(ImageId image_id) {