mirror of
https://git.eden-emu.dev/eden-emu/eden
synced 2026-05-31 23:07:06 +02:00
[buffer_cache] Add batching support for memory tracker updates (#3288)
Added batching/coalescing of ranges in WordManager to reduce the number of per-page calls to UpdatePagesCachedCount. Also added a test that verifies FlushCachedWrites coalesces its updates (fewer calls to UpdatePagesCachedCount) and records each call so it can be inspected. Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3288 Reviewed-by: Maufeat <sahyno1996@gmail.com> Reviewed-by: DraVee <dravee@eden-emu.dev> Co-authored-by: CamilleLaVey <camillelavey99@gmail.com> Co-committed-by: CamilleLaVey <camillelavey99@gmail.com>
This commit is contained in:
parent
51cc1bc6be
commit
1a9b4b37e1
4 changed files with 155 additions and 6 deletions
|
|
@ -1,3 +1,6 @@
|
||||||
|
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
|
||||||
|
// SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
|
||||||
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
|
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
|
||||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||||
|
|
||||||
|
|
@ -9,6 +12,7 @@
|
||||||
#include <deque>
|
#include <deque>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#include "common/common_types.h"
|
#include "common/common_types.h"
|
||||||
#include "common/range_mutex.h"
|
#include "common/range_mutex.h"
|
||||||
|
|
@ -44,6 +48,7 @@ public:
|
||||||
~DeviceMemoryManager();
|
~DeviceMemoryManager();
|
||||||
|
|
||||||
static constexpr bool HAS_FLUSH_INVALIDATION = true;
|
static constexpr bool HAS_FLUSH_INVALIDATION = true;
|
||||||
|
static constexpr size_t AS_BITS = Traits::device_virtual_bits;
|
||||||
|
|
||||||
void BindInterface(DeviceInterface* device_inter);
|
void BindInterface(DeviceInterface* device_inter);
|
||||||
|
|
||||||
|
|
@ -117,7 +122,12 @@ public:
|
||||||
|
|
||||||
void UpdatePagesCachedCount(DAddr addr, size_t size, s32 delta);
|
void UpdatePagesCachedCount(DAddr addr, size_t size, s32 delta);
|
||||||
|
|
||||||
static constexpr size_t AS_BITS = Traits::device_virtual_bits;
|
// New batch API to update multiple ranges with a single lock acquisition.
|
||||||
|
void UpdatePagesCachedBatch(const std::vector<std::pair<DAddr, size_t>>& ranges, s32 delta);
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Internal helper that performs the update assuming the caller already holds the necessary lock.
|
||||||
|
void UpdatePagesCachedCountNoLock(DAddr addr, size_t size, s32 delta);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static constexpr size_t device_virtual_bits = Traits::device_virtual_bits;
|
static constexpr size_t device_virtual_bits = Traits::device_virtual_bits;
|
||||||
|
|
@ -214,6 +224,8 @@ private:
|
||||||
std::unique_ptr<CachedPages> cached_pages;
|
std::unique_ptr<CachedPages> cached_pages;
|
||||||
Common::RangeMutex counter_guard;
|
Common::RangeMutex counter_guard;
|
||||||
std::mutex mapping_guard;
|
std::mutex mapping_guard;
|
||||||
|
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace Core
|
} // namespace Core
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,6 @@
|
||||||
|
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
|
||||||
|
// SPDX-License-Identifier: GPL-3.0-or-later
|
||||||
|
|
||||||
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
|
// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
|
||||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||||
|
|
||||||
|
|
@ -5,6 +8,8 @@
|
||||||
#include <limits>
|
#include <limits>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <type_traits>
|
#include <type_traits>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#include "common/address_space.h"
|
#include "common/address_space.h"
|
||||||
#include "common/address_space.inc"
|
#include "common/address_space.inc"
|
||||||
|
|
@ -507,8 +512,7 @@ void DeviceMemoryManager<Traits>::UnregisterProcess(Asid asid) {
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename Traits>
|
template <typename Traits>
|
||||||
void DeviceMemoryManager<Traits>::UpdatePagesCachedCount(DAddr addr, size_t size, s32 delta) {
|
void DeviceMemoryManager<Traits>::UpdatePagesCachedCountNoLock(DAddr addr, size_t size, s32 delta) {
|
||||||
Common::ScopedRangeLock lk(counter_guard, addr, size);
|
|
||||||
u64 uncache_begin = 0;
|
u64 uncache_begin = 0;
|
||||||
u64 cache_begin = 0;
|
u64 cache_begin = 0;
|
||||||
u64 uncache_bytes = 0;
|
u64 uncache_bytes = 0;
|
||||||
|
|
@ -586,4 +590,47 @@ void DeviceMemoryManager<Traits>::UpdatePagesCachedCount(DAddr addr, size_t size
|
||||||
release_pending();
|
release_pending();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename Traits>
|
||||||
|
void DeviceMemoryManager<Traits>::UpdatePagesCachedCount(DAddr addr, size_t size, s32 delta) {
|
||||||
|
Common::ScopedRangeLock lk(counter_guard, addr, size);
|
||||||
|
UpdatePagesCachedCountNoLock(addr, size, delta);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename Traits>
|
||||||
|
void DeviceMemoryManager<Traits>::UpdatePagesCachedBatch(const std::vector<std::pair<DAddr, size_t>>& ranges, s32 delta) {
|
||||||
|
if (ranges.empty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// Make a local copy and sort by address
|
||||||
|
std::vector<std::pair<DAddr, size_t>> tmp = ranges;
|
||||||
|
std::sort(tmp.begin(), tmp.end(), [](const auto& a, const auto& b) { return a.first < b.first; });
|
||||||
|
|
||||||
|
// Coalesce adjacent/overlapping ranges
|
||||||
|
std::vector<std::pair<DAddr, size_t>> coalesced;
|
||||||
|
DAddr cur_addr = tmp[0].first;
|
||||||
|
size_t cur_size = tmp[0].second;
|
||||||
|
for (size_t i = 1; i < tmp.size(); ++i) {
|
||||||
|
DAddr next_addr = tmp[i].first;
|
||||||
|
size_t next_size = tmp[i].second;
|
||||||
|
if (cur_addr + cur_size >= next_addr) {
|
||||||
|
// overlapping or contiguous
|
||||||
|
const DAddr end = std::max(cur_addr + cur_size, next_addr + next_size);
|
||||||
|
cur_size = end - cur_addr;
|
||||||
|
} else {
|
||||||
|
coalesced.emplace_back(cur_addr, cur_size);
|
||||||
|
cur_addr = next_addr;
|
||||||
|
cur_size = next_size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
coalesced.emplace_back(cur_addr, cur_size);
|
||||||
|
|
||||||
|
const DAddr lock_begin = coalesced.front().first;
|
||||||
|
const DAddr lock_end = coalesced.back().first + coalesced.back().second;
|
||||||
|
Common::ScopedRangeLock lk(counter_guard, lock_begin, static_cast<size_t>(lock_end - lock_begin));
|
||||||
|
|
||||||
|
for (const auto& [addr, size] : coalesced) {
|
||||||
|
UpdatePagesCachedCountNoLock(addr, size, delta);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace Core
|
} // namespace Core
|
||||||
|
|
|
||||||
|
|
@ -4,11 +4,15 @@
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
|
#include <tuple>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#include <catch2/catch_test_macros.hpp>
|
#include <catch2/catch_test_macros.hpp>
|
||||||
|
|
||||||
#include "common/common_types.h"
|
#include "common/common_types.h"
|
||||||
#include "video_core/buffer_cache/memory_tracker_base.h"
|
#include "video_core/buffer_cache/memory_tracker_base.h"
|
||||||
|
#include "core/device_memory.h"
|
||||||
|
#include "video_core/host1x/gpu_device_memory_manager.h"
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
using Range = std::pair<u64, u64>;
|
using Range = std::pair<u64, u64>;
|
||||||
|
|
@ -23,6 +27,8 @@ constexpr VAddr c = 16 * HIGH_PAGE_SIZE;
|
||||||
class RasterizerInterface {
|
class RasterizerInterface {
|
||||||
public:
|
public:
|
||||||
void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {
|
void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {
|
||||||
|
++update_calls;
|
||||||
|
calls.emplace_back(addr, size, delta);
|
||||||
const u64 page_start{addr >> Core::DEVICE_PAGEBITS};
|
const u64 page_start{addr >> Core::DEVICE_PAGEBITS};
|
||||||
const u64 page_end{(addr + size + Core::DEVICE_PAGESIZE - 1) >> Core::DEVICE_PAGEBITS};
|
const u64 page_end{(addr + size + Core::DEVICE_PAGESIZE - 1) >> Core::DEVICE_PAGEBITS};
|
||||||
for (u64 page = page_start; page < page_end; ++page) {
|
for (u64 page = page_start; page < page_end; ++page) {
|
||||||
|
|
@ -36,6 +42,9 @@ public:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[[nodiscard]] size_t UpdateCalls() const noexcept { return update_calls; }
|
||||||
|
[[nodiscard]] const std::vector<std::tuple<VAddr, u64, int>>& UpdateCallsList() const noexcept { return calls; }
|
||||||
|
|
||||||
[[nodiscard]] int Count(VAddr addr) const noexcept {
|
[[nodiscard]] int Count(VAddr addr) const noexcept {
|
||||||
const auto it = page_table.find(addr >> Core::DEVICE_PAGEBITS);
|
const auto it = page_table.find(addr >> Core::DEVICE_PAGEBITS);
|
||||||
return it == page_table.end() ? 0 : it->second;
|
return it == page_table.end() ? 0 : it->second;
|
||||||
|
|
@ -51,7 +60,10 @@ public:
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::unordered_map<u64, int> page_table;
|
std::unordered_map<u64, int> page_table;
|
||||||
|
std::vector<std::tuple<VAddr, u64, int>> calls;
|
||||||
|
size_t update_calls = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // Anonymous namespace
|
} // Anonymous namespace
|
||||||
|
|
||||||
using MemoryTracker = VideoCommon::MemoryTrackerBase<RasterizerInterface>;
|
using MemoryTracker = VideoCommon::MemoryTrackerBase<RasterizerInterface>;
|
||||||
|
|
@ -544,3 +556,34 @@ TEST_CASE("MemoryTracker: Cached write downloads") {
|
||||||
memory_track->MarkRegionAsCpuModified(c, WORD);
|
memory_track->MarkRegionAsCpuModified(c, WORD);
|
||||||
REQUIRE(rasterizer.Count() == 0);
|
REQUIRE(rasterizer.Count() == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Checks that FlushCachedWrites reports a contiguous run of cached pages to the rasterizer as a
// single update rather than one update per CachedCpuWrite call.
TEST_CASE("MemoryTracker: FlushCachedWrites batching") {
    RasterizerInterface rasterizer;
    std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer));
    memory_track->UnmarkRegionAsCpuModified(c, WORD * 2);

    // Three back-to-back pages starting at c + PAGE. The writes must be contiguous for the
    // assertions below (one update covering PAGE * 3 bytes from c + PAGE) to be satisfiable;
    // the previous third write at c + PAGE * 4 left a one-page gap and fell outside that range.
    memory_track->CachedCpuWrite(c + PAGE, PAGE);
    memory_track->CachedCpuWrite(c + PAGE * 2, PAGE);
    memory_track->CachedCpuWrite(c + PAGE * 3, PAGE);

    // NOTE(review): this assumes neither UnmarkRegionAsCpuModified nor CachedCpuWrite has
    // reported anything to the rasterizer yet - confirm against the tracker implementation.
    REQUIRE(rasterizer.UpdateCalls() == 0);

    memory_track->FlushCachedWrites();

    // Now we expect a single batch call (coalesced ranges) to the device memory manager
    // NOTE(review): the tracker forwards through UpdatePagesCachedBatch; confirm the
    // RasterizerInterface mock exposes that entry point and records it in UpdateCalls().
    REQUIRE(rasterizer.UpdateCalls() == 1);
    const auto& calls = rasterizer.UpdateCallsList();
    REQUIRE(std::get<0>(calls[0]) == c + PAGE);
    REQUIRE(std::get<1>(calls[0]) == PAGE * 3);
}
|
||||||
|
|
||||||
|
// Smoke test for the batch API: an empty batch and a batch of two adjacent page-sized ranges
// must both run to completion. Nothing about the resulting counters is asserted.
TEST_CASE("DeviceMemoryManager: UpdatePagesCachedBatch basic") {
    using RangeList = std::vector<std::pair<Core::DAddr, size_t>>;

    Core::DeviceMemory device_memory;
    Tegra::MaxwellDeviceMemoryManager manager(device_memory);

    // An empty batch is expected to return without doing any work.
    RangeList no_ranges;
    manager.UpdatePagesCachedBatch(no_ranges, 1);

    // Two adjacent page-sized ranges starting at address 0.
    const auto page_size = Core::Memory::YUZU_PAGESIZE;
    RangeList two_pages;
    two_pages.emplace_back(0, page_size);
    two_pages.emplace_back(page_size, page_size);
    manager.UpdatePagesCachedBatch(two_pages, 1);

    SUCCEED("UpdatePagesCachedBatch executed without error");
}
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@
|
||||||
#include <limits>
|
#include <limits>
|
||||||
#include <span>
|
#include <span>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#include "common/alignment.h"
|
#include "common/alignment.h"
|
||||||
#include "common/common_funcs.h"
|
#include "common/common_funcs.h"
|
||||||
|
|
@ -256,9 +257,10 @@ public:
|
||||||
std::span<u64> state_words = words.template Span<type>();
|
std::span<u64> state_words = words.template Span<type>();
|
||||||
[[maybe_unused]] std::span<u64> untracked_words = words.template Span<Type::Untracked>();
|
[[maybe_unused]] std::span<u64> untracked_words = words.template Span<Type::Untracked>();
|
||||||
[[maybe_unused]] std::span<u64> cached_words = words.template Span<Type::CachedCPU>();
|
[[maybe_unused]] std::span<u64> cached_words = words.template Span<Type::CachedCPU>();
|
||||||
|
std::vector<std::pair<VAddr, u64>> ranges;
|
||||||
IterateWords(dirty_addr - cpu_addr, size, [&](size_t index, u64 mask) {
|
IterateWords(dirty_addr - cpu_addr, size, [&](size_t index, u64 mask) {
|
||||||
if constexpr (type == Type::CPU || type == Type::CachedCPU) {
|
if constexpr (type == Type::CPU || type == Type::CachedCPU) {
|
||||||
NotifyRasterizer<!enable>(index, untracked_words[index], mask);
|
CollectChangedRanges<(!enable)>(index, untracked_words[index], mask, ranges);
|
||||||
}
|
}
|
||||||
if constexpr (enable) {
|
if constexpr (enable) {
|
||||||
state_words[index] |= mask;
|
state_words[index] |= mask;
|
||||||
|
|
@ -279,6 +281,9 @@ public:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
if (!ranges.empty()) {
|
||||||
|
ApplyCollectedRanges(ranges, (!enable) ? 1 : -1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -304,6 +309,7 @@ public:
|
||||||
func(cpu_addr + pending_offset * BYTES_PER_PAGE,
|
func(cpu_addr + pending_offset * BYTES_PER_PAGE,
|
||||||
(pending_pointer - pending_offset) * BYTES_PER_PAGE);
|
(pending_pointer - pending_offset) * BYTES_PER_PAGE);
|
||||||
};
|
};
|
||||||
|
std::vector<std::pair<VAddr, u64>> ranges;
|
||||||
IterateWords(offset, size, [&](size_t index, u64 mask) {
|
IterateWords(offset, size, [&](size_t index, u64 mask) {
|
||||||
if constexpr (type == Type::GPU) {
|
if constexpr (type == Type::GPU) {
|
||||||
mask &= ~untracked_words[index];
|
mask &= ~untracked_words[index];
|
||||||
|
|
@ -311,7 +317,7 @@ public:
|
||||||
const u64 word = state_words[index] & mask;
|
const u64 word = state_words[index] & mask;
|
||||||
if constexpr (clear) {
|
if constexpr (clear) {
|
||||||
if constexpr (type == Type::CPU || type == Type::CachedCPU) {
|
if constexpr (type == Type::CPU || type == Type::CachedCPU) {
|
||||||
NotifyRasterizer<true>(index, untracked_words[index], mask);
|
CollectChangedRanges<true>(index, untracked_words[index], mask, ranges);
|
||||||
}
|
}
|
||||||
state_words[index] &= ~mask;
|
state_words[index] &= ~mask;
|
||||||
if constexpr (type == Type::CPU || type == Type::CachedCPU) {
|
if constexpr (type == Type::CPU || type == Type::CachedCPU) {
|
||||||
|
|
@ -343,6 +349,9 @@ public:
|
||||||
if (pending) {
|
if (pending) {
|
||||||
release();
|
release();
|
||||||
}
|
}
|
||||||
|
if (!ranges.empty()) {
|
||||||
|
ApplyCollectedRanges(ranges, 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -425,13 +434,17 @@ public:
|
||||||
u64* const cached_words = Array<Type::CachedCPU>();
|
u64* const cached_words = Array<Type::CachedCPU>();
|
||||||
u64* const untracked_words = Array<Type::Untracked>();
|
u64* const untracked_words = Array<Type::Untracked>();
|
||||||
u64* const cpu_words = Array<Type::CPU>();
|
u64* const cpu_words = Array<Type::CPU>();
|
||||||
|
std::vector<std::pair<VAddr, u64>> ranges;
|
||||||
for (u64 word_index = 0; word_index < num_words; ++word_index) {
|
for (u64 word_index = 0; word_index < num_words; ++word_index) {
|
||||||
const u64 cached_bits = cached_words[word_index];
|
const u64 cached_bits = cached_words[word_index];
|
||||||
NotifyRasterizer<false>(word_index, untracked_words[word_index], cached_bits);
|
CollectChangedRanges<false>(word_index, untracked_words[word_index], cached_bits, ranges);
|
||||||
untracked_words[word_index] |= cached_bits;
|
untracked_words[word_index] |= cached_bits;
|
||||||
cpu_words[word_index] |= cached_bits;
|
cpu_words[word_index] |= cached_bits;
|
||||||
cached_words[word_index] = 0;
|
cached_words[word_index] = 0;
|
||||||
}
|
}
|
||||||
|
if (!ranges.empty()) {
|
||||||
|
ApplyCollectedRanges(ranges, -1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
@ -470,6 +483,40 @@ private:
|
||||||
*
|
*
|
||||||
* @tparam add_to_tracker True when the tracker should start tracking the new pages
|
* @tparam add_to_tracker True when the tracker should start tracking the new pages
|
||||||
*/
|
*/
|
||||||
|
template <bool add_to_tracker>
|
||||||
|
void CollectChangedRanges(u64 word_index, u64 current_bits, u64 new_bits,
|
||||||
|
std::vector<std::pair<VAddr, u64>>& out_ranges) const {
|
||||||
|
u64 changed_bits = (add_to_tracker ? current_bits : ~current_bits) & new_bits;
|
||||||
|
VAddr addr = cpu_addr + word_index * BYTES_PER_WORD;
|
||||||
|
IteratePages(changed_bits, [&](size_t offset, size_t size) {
|
||||||
|
out_ranges.emplace_back(addr + offset * BYTES_PER_PAGE, size * BYTES_PER_PAGE);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
void ApplyCollectedRanges(std::vector<std::pair<VAddr, u64>>& ranges, int delta) const {
|
||||||
|
if (ranges.empty()) return;
|
||||||
|
std::sort(ranges.begin(), ranges.end(),
|
||||||
|
[](const auto& a, const auto& b) { return a.first < b.first; });
|
||||||
|
// Coalesce adjacent/contiguous ranges
|
||||||
|
std::vector<std::pair<VAddr, size_t>> coalesced;
|
||||||
|
coalesced.reserve(ranges.size());
|
||||||
|
VAddr cur_addr = ranges[0].first;
|
||||||
|
size_t cur_size = static_cast<size_t>(ranges[0].second);
|
||||||
|
for (size_t i = 1; i < ranges.size(); ++i) {
|
||||||
|
if (cur_addr + cur_size == ranges[i].first) {
|
||||||
|
cur_size += static_cast<size_t>(ranges[i].second);
|
||||||
|
} else {
|
||||||
|
coalesced.emplace_back(cur_addr, cur_size);
|
||||||
|
cur_addr = ranges[i].first;
|
||||||
|
cur_size = static_cast<size_t>(ranges[i].second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
coalesced.emplace_back(cur_addr, cur_size);
|
||||||
|
// Use batch API to reduce lock acquisitions and contention.
|
||||||
|
tracker->UpdatePagesCachedBatch(coalesced, delta);
|
||||||
|
ranges.clear();
|
||||||
|
}
|
||||||
|
|
||||||
template <bool add_to_tracker>
|
template <bool add_to_tracker>
|
||||||
void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const {
|
void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const {
|
||||||
u64 changed_bits = (add_to_tracker ? current_bits : ~current_bits) & new_bits;
|
u64 changed_bits = (add_to_tracker ? current_bits : ~current_bits) & new_bits;
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue