[texture_cache, buffer_cache] Added TLS handling + changed command queue for GPU threading. (#3579)

(Merge of #3495 + #3108) This PR simplifies the math operations on hot pointers in accesses and requests to the buffer cache and texture cache, removing the previous indirection logic and replacing it with a POD approach. This ensures that less CPU time is spent on the same request, so it flows directly into the next chain of the render. In the same way, the command queue currently uses an internal mutex that constrains the flow of data within the GPU threads; we're moving over to a single command, and I verified that it keeps using mutexes instead of an internal mutex + a mutex per operation, which are resolved by themselves. In simpler words, this aims to improve performance on those games and devices where the waits for the next orders on GPU commands were heavier than a single verification. Co-Authored-by: @CamilleLaVey Co-Authored-by: @Lizzie Co-authored-by: CamilleLaVey <camillelavey99@gmail.com> Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/3579 Reviewed-by: CamilleLaVey <camillelavey99@gmail.com> Co-authored-by: lizzie <lizzie@eden-emu.dev> Co-committed-by: lizzie <lizzie@eden-emu.dev>
2026-05-31 14:27:07 +02:00 · 2026-02-20 00:52:07 +01:00 · 2026-02-20 00:52:07 +01:00 · c9c136bea7
commit c9c136bea7
parent 6f9d025ad2
12 changed files with 266 additions and 72 deletions
--- a/src/video_core/buffer_cache/buffer_base.h
+++ b/src/video_core/buffer_cache/buffer_base.h
@ -1,3 +1,6 @@
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
 // SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-3.0-or-later

@ -39,7 +42,8 @@ public:
    static constexpr u64 BASE_PAGE_SIZE = 1ULL << BASE_PAGE_BITS;

    explicit BufferBase(VAddr cpu_addr_, u64 size_bytes_)
-        : cpu_addr{cpu_addr_}, size_bytes{size_bytes_} {}
+        : cpu_addr_cached{static_cast<DAddr>(cpu_addr_)}, cpu_addr{cpu_addr_},
+          size_bytes{size_bytes_} {}

    explicit BufferBase(NullBufferParams) {}

@ -97,6 +101,8 @@ public:
        return cpu_addr;
    }

+    DAddr cpu_addr_cached = 0;
+
    /// Returns the offset relative to the given CPU address
    /// @pre IsInBounds returns true
    [[nodiscard]] u32 Offset(VAddr other_cpu_addr) const noexcept {
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@ -1,4 +1,4 @@
-// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
+// SPDX-FileCopyrightText: Copyright 2026 Eden Emulator Project
 // SPDX-License-Identifier: GPL-3.0-or-later

 // SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
@ -382,6 +382,10 @@ void BufferCache<P>::BindHostComputeBuffers() {
    BindHostComputeUniformBuffers();
    BindHostComputeStorageBuffers();
    BindHostComputeTextureBuffers();
+    if (any_buffer_uploaded) {
+        runtime.PostCopyBarrier();
+        any_buffer_uploaded = false;
+    }
 }

 template <class P>
@ -763,45 +767,85 @@ void BufferCache<P>::BindHostIndexBuffer() {
    }
 }

+template <class P>
+void BufferCache<P>::BindHostVertexBuffer(u32 index, Buffer& buffer, u32 offset, u32 size,
+                                          u32 stride) {
+    if constexpr (IS_OPENGL) {
+        runtime.BindVertexBuffer(index, buffer, offset, size, stride);
+    } else {
+        runtime.BindVertexBuffer(index, buffer.Handle(), offset, size, stride);
+    }
+}
+
+template <class P>
+Binding& BufferCache<P>::VertexBufferSlot(u32 index) {
+    ASSERT(index < NUM_VERTEX_BUFFERS);
+    return v_buffer[index];
+}
+
+template <class P>
+const Binding& BufferCache<P>::VertexBufferSlot(u32 index) const {
+    ASSERT(index < NUM_VERTEX_BUFFERS);
+    return v_buffer[index];
+}
+
+template <class P>
+void BufferCache<P>::UpdateVertexBufferSlot(u32 index, const Binding& binding) {
+    Binding& slot = VertexBufferSlot(index);
+    if (slot.device_addr != binding.device_addr || slot.size != binding.size) {
+        ++vertex_buffers_serial;
+    }
+    slot = binding;
+    if (binding.buffer_id != NULL_BUFFER_ID && binding.size != 0) {
+        enabled_vertex_buffers_mask |= (1u << index);
+    } else {
+        enabled_vertex_buffers_mask &= ~(1u << index);
+    }
+}
+
 template <class P>
 void BufferCache<P>::BindHostVertexBuffers() {
-    HostBindings<typename P::Buffer> host_bindings;
-    bool any_valid{false};
    auto& flags = maxwell3d->dirty.flags;
-    for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
-        const Binding& binding = channel_state->vertex_buffers[index];
+    u32 enabled_mask = enabled_vertex_buffers_mask;
+    HostBindings<Buffer> bindings{};
+    u32 last_index = std::numeric_limits<u32>::max();
+    const auto flush_bindings = [&]() {
+        if (bindings.buffers.empty()) {
+            return;
+        }
+        bindings.max_index = bindings.min_index + static_cast<u32>(bindings.buffers.size());
+        runtime.BindVertexBuffers(bindings);
+        bindings = HostBindings<Buffer>{};
+        last_index = std::numeric_limits<u32>::max();
+    };
+    while (enabled_mask != 0) {
+        const u32 index = std::countr_zero(enabled_mask);
+        enabled_mask &= (enabled_mask - 1);
+        const Binding& binding = VertexBufferSlot(index);
        Buffer& buffer = slot_buffers[binding.buffer_id];
        TouchBuffer(buffer, binding.buffer_id);
        SynchronizeBuffer(buffer, binding.device_addr, binding.size);
        if (!flags[Dirty::VertexBuffer0 + index]) {
+            flush_bindings();
            continue;
        }
        flags[Dirty::VertexBuffer0 + index] = false;
-
-        host_bindings.min_index = (std::min)(host_bindings.min_index, index);
-        host_bindings.max_index = (std::max)(host_bindings.max_index, index);
-        any_valid = true;
-    }
-
-    if (any_valid) {
-        host_bindings.max_index++;
-        for (u32 index = host_bindings.min_index; index < host_bindings.max_index; index++) {
-            flags[Dirty::VertexBuffer0 + index] = false;
-
-            const Binding& binding = channel_state->vertex_buffers[index];
-            Buffer& buffer = slot_buffers[binding.buffer_id];
-
-            const u32 stride = maxwell3d->regs.vertex_streams[index].stride;
-            const u32 offset = buffer.Offset(binding.device_addr);
-            buffer.MarkUsage(offset, binding.size);
-
-            host_bindings.buffers.push_back(&buffer);
-            host_bindings.offsets.push_back(offset);
-            host_bindings.sizes.push_back(binding.size);
-            host_bindings.strides.push_back(stride);
+        const u32 stride = maxwell3d->regs.vertex_streams[index].stride;
+        const u32 offset = buffer.Offset(binding.device_addr);
+        buffer.MarkUsage(offset, binding.size);
+        if (!bindings.buffers.empty() && index != last_index + 1) {
+            flush_bindings();
        }
-        runtime.BindVertexBuffers(host_bindings);
+        if (bindings.buffers.empty()) {
+            bindings.min_index = index;
+        }
+        bindings.buffers.push_back(&buffer);
+        bindings.offsets.push_back(offset);
+        bindings.sizes.push_back(binding.size);
+        bindings.strides.push_back(stride);
+        last_index = index;
    }
+    flush_bindings();
 }

 template <class P>
@ -1205,17 +1249,20 @@ void BufferCache<P>::UpdateVertexBuffer(u32 index) {
    u32 size = address_size; // TODO: Analyze stride and number of vertices
    if (array.enable == 0 || size == 0 || !device_addr) {
        channel_state->vertex_buffers[index] = NULL_BINDING;
+        UpdateVertexBufferSlot(index, NULL_BINDING);
        return;
    }
    if (!gpu_memory->IsWithinGPUAddressRange(gpu_addr_end) || size >= 64_MiB) {
        size = static_cast<u32>(gpu_memory->MaxContinuousRange(gpu_addr_begin, size));
    }
    const BufferId buffer_id = FindBuffer(*device_addr, size);
-    channel_state->vertex_buffers[index] = Binding{
+    const Binding binding{
        .device_addr = *device_addr,
        .size = size,
        .buffer_id = buffer_id,
    };
+    channel_state->vertex_buffers[index] = binding;
+    UpdateVertexBufferSlot(index, binding);
 }

 template <class P>
@ -1528,12 +1575,12 @@ void BufferCache<P>::TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept {

 template <class P>
 bool BufferCache<P>::SynchronizeBuffer(Buffer& buffer, DAddr device_addr, u32 size) {
-    boost::container::small_vector<BufferCopy, 4> copies;
+    upload_copies.clear();
    u64 total_size_bytes = 0;
    u64 largest_copy = 0;
-    DAddr buffer_start = buffer.CpuAddr();
+    const DAddr buffer_start = buffer.cpu_addr_cached;
    memory_tracker.ForEachUploadRange(device_addr, size, [&](u64 device_addr_out, u64 range_size) {
-        copies.push_back(BufferCopy{
+        upload_copies.push_back(BufferCopy{
            .src_offset = total_size_bytes,
            .dst_offset = device_addr_out - buffer_start,
            .size = range_size,
@ -1544,8 +1591,9 @@ bool BufferCache<P>::SynchronizeBuffer(Buffer& buffer, DAddr device_addr, u32 si
    if (total_size_bytes == 0) {
        return true;
    }
-    const std::span<BufferCopy> copies_span(copies.data(), copies.size());
+    const std::span<BufferCopy> copies_span(upload_copies.data(), upload_copies.size());
    UploadMemory(buffer, total_size_bytes, largest_copy, copies_span);
+    any_buffer_uploaded = true;
    return false;
 }

@ -1735,6 +1783,7 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id, bool do_not_mark) {
        auto& binding = channel_state->vertex_buffers[index];
        if (binding.buffer_id == buffer_id) {
            binding.buffer_id = BufferId{};
+            UpdateVertexBufferSlot(index, binding);
            dirty_vertex_buffers.push_back(index);
        }
    }
--- a/src/video_core/buffer_cache/buffer_cache_base.h
+++ b/src/video_core/buffer_cache/buffer_cache_base.h
@ -320,6 +320,7 @@ public:

    std::recursive_mutex mutex;
    Runtime& runtime;
+    bool any_buffer_uploaded = false;

 private:
    template <typename Func>
@ -372,6 +373,8 @@ private:

    void BindHostTransformFeedbackBuffers();

+    void BindHostVertexBuffer(u32 index, Buffer& buffer, u32 offset, u32 size, u32 stride);
+
    void BindHostComputeUniformBuffers();

    void BindHostComputeStorageBuffers();
@ -453,6 +456,12 @@ private:

    [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept;

+    [[nodiscard]] Binding& VertexBufferSlot(u32 index);
+
+    [[nodiscard]] const Binding& VertexBufferSlot(u32 index) const;
+
+    void UpdateVertexBufferSlot(u32 index, const Binding& binding);
+
    void ClearDownload(DAddr base_addr, u64 size);

    void InlineMemoryImplementation(DAddr dest_address, size_t copy_size,
@ -472,6 +481,12 @@ private:

    u32 last_index_count = 0;

+    u32 enabled_vertex_buffers_mask = 0;
+    u64 vertex_buffers_serial = 0;
+    std::array<Binding, 32> v_buffer{};
+
+    boost::container::small_vector<BufferCopy, 4> upload_copies;
+
    MemoryTracker memory_tracker;
    Common::RangeSet<DAddr> uncommitted_gpu_modified_ranges;
    Common::RangeSet<DAddr> gpu_modified_ranges;