Implement overhead-free sequenced buffer updates with megabuffers

Previously constant buffer updates would be handled on the CPU and only the end result would be synced to the GPU before execute. This caused issues as if the constant buffer contents was changed between each draw in a renderpass (e.g. text rendering) the draws themselves would only see the final resulting constant buffer. We had earlier tried to fix this by using vkCmdUpdateBuffer however this caused significant performance loss due to an oversight in Adreno drivers. We could have worked around this simply by using vkCmdCopy buffer however there would still be a performance loss due to renderpasses being split up with copies inbetween. To avoid this we introduce 'megabuffers', a brand new technique not done before in any other switch emulators. Rather than replaying the copies in sequence on the GPU, we take advantage of the fact that buffers are generally small in order to replay buffers on the GPU instead. Each write and subsequent usage of a buffer will cause a copy of the buffer with that write, and all prior applied to be pushed into the megabuffer, this way at the start of execute the megabuffer will hold all used states of the buffer simultaneously. Draws then reference these individual states in sequence to allow everything to work without any copies. In order to support this buffers have been moved to an immediate sync model, with synchronisation being done at usage-time rather than execute (in order to keep contents properly sequenced) and GPU-side writes now need to be explictly marked (since they prevent megabuffering). It should also be noted that a fallback path using cmdCopyBuffer exists for the cases where buffers are too large or GPU dirty.
2025-07-17 08:46:39 +00:00 · 2022-04-23 18:10:39 +01:00
parent 0d9992cb8e
commit de796cd2cd
7 changed files with 363 additions and 59 deletions
--- a/app/src/main/cpp/skyline/gpu/buffer.cpp
+++ b/app/src/main/cpp/skyline/gpu/buffer.cpp
@ -8,6 +8,11 @@
 #include "buffer.h"

 namespace skyline::gpu {
+    void Buffer::TryEnableMegaBuffering() {
+        megaBufferOffset = 0;
+        megaBufferingEnabled = backing.size() < MegaBufferingDisableThreshold;
+    }
+
    void Buffer::SetupGuestMappings() {
        u8 *alignedData{util::AlignDown(guest->data(), PAGE_SIZE)};
        size_t alignedSize{static_cast<size_t>(util::AlignUp(guest->data() + guest->size(), PAGE_SIZE) - alignedData)};
@ -28,10 +33,64 @@ namespace skyline::gpu {
    }

    Buffer::Buffer(GPU &gpu, GuestBuffer guest) : gpu(gpu), backing(gpu.memory.AllocateBuffer(guest.size())), guest(guest) {
+        TryEnableMegaBuffering();
        SetupGuestMappings();
    }

+    Buffer::Buffer(GPU &gpu, const std::shared_ptr<FenceCycle> &pCycle, GuestBuffer guest, span<std::shared_ptr<Buffer>> srcBuffers) : gpu(gpu), backing(gpu.memory.AllocateBuffer(guest.size())), guest(guest) {
+        std::scoped_lock bufLock{*this};
+
+        TryEnableMegaBuffering();
+        SetupGuestMappings();
+
+        // Source buffers don't necessarily fully overlap with us so we have to perform a sync here to prevent any gaps
+        SynchronizeHost(false);
+
+        // Copies between two buffers based off of their mappings in guest memory
+        auto copyBuffer{[](auto dstGuest, auto srcGuest, auto dstPtr, auto srcPtr) {
+            if (dstGuest.begin().base() <= srcGuest.begin().base()) {
+                size_t dstOffset{static_cast<size_t>(srcGuest.begin().base() - dstGuest.begin().base())};
+                size_t copySize{std::min(dstGuest.size() - dstOffset, srcGuest.size())};
+                std::memcpy(dstPtr + dstOffset, srcPtr, copySize);
+            } else if (dstGuest.begin().base() > srcGuest.begin().base()) {
+                size_t srcOffset{static_cast<size_t>(dstGuest.begin().base() - srcGuest.begin().base())};
+                size_t copySize{std::min(dstGuest.size(), srcGuest.size() - srcOffset)};
+                std::memcpy(dstPtr, srcPtr + srcOffset, copySize);
+            }
+        }};
+
+        // Transfer data/state from source buffers
+        for (const auto &srcBuffer : srcBuffers) {
+            std::scoped_lock lock{*srcBuffer};
+            if (srcBuffer->guest) {
+                if (!srcBuffer->megaBufferingEnabled)
+                    megaBufferingEnabled = false;
+
+                if (srcBuffer->dirtyState == Buffer::DirtyState::GpuDirty) {
+                    // If the source buffer is GPU dirty we cannot directly copy over its GPU backing contents
+
+                    // Only sync back the buffer if it's not attched to the current fence cycle, otherwise propagate the GPU dirtiness
+                    if (!srcBuffer->cycle.owner_before(pCycle)) {
+                        // Perform a GPU -> CPU sync on the source then do a CPU -> GPU sync for the region occupied by the source
+                        // This is required since if we were created from a two buffers: one GPU dirty in the current cycle, and one GPU dirty in the previous cycle, if we marked ourselves as CPU dirty here then the GPU dirtiness from the current cycle buffer would be ignored and cause writes to be missed
+                        srcBuffer->SynchronizeGuest(true);
+                        copyBuffer(guest, *srcBuffer->guest, backing.data(), srcBuffer->mirror.data());
+                    } else {
+                        MarkGpuDirty();
+                    }
+                } else if (srcBuffer->dirtyState == Buffer::DirtyState::Clean) {
+                    // For clean buffers we can just copy over the GPU backing data directly
+                    // This is necessary since clean buffers may not have matching GPU/CPU data in the case of non-megabuffered inline updates
+                    copyBuffer(guest, *srcBuffer->guest, backing.data(), srcBuffer->backing.data());
+                }
+
+                // CPU dirty buffers are already synchronized in the initial SynchronizeHost call so don't need special handling
+            }
+        }
+    }
+
    Buffer::Buffer(GPU &gpu, vk::DeviceSize size) : gpu(gpu), backing(gpu.memory.AllocateBuffer(size)) {
+        TryEnableMegaBuffering();
        dirtyState = DirtyState::Clean; // Since this is a host-only buffer it's always going to be clean
    }

@ -47,6 +106,8 @@ namespace skyline::gpu {
    void Buffer::MarkGpuDirty() {
        if (dirtyState == DirtyState::GpuDirty || !guest)
            return;
+
+        megaBufferingEnabled = false; // We can no longer megabuffer this buffer after it has been written by the GPU
        gpu.state.nce->RetrapRegions(*trapHandle, false);
        dirtyState = DirtyState::GpuDirty;
    }
@ -61,6 +122,15 @@ namespace skyline::gpu {
        }
    }

+    bool Buffer::PollFence() {
+        auto lCycle{cycle.lock()};
+        if (lCycle && lCycle->Poll()) {
+            cycle.reset();
+            return true;
+        }
+        return false;
+    }
+
    void Buffer::SynchronizeHost(bool rwTrap) {
        if (dirtyState != DirtyState::CpuDirty || !guest)
            return; // If the buffer has not been modified on the CPU or there's no guest buffer, there is no need to synchronize it
@ -69,9 +139,13 @@ namespace skyline::gpu {

        TRACE_EVENT("gpu", "Buffer::SynchronizeHost");

+        // If we have performed a CPU->GPU sync and megabuffering is enabled for this buffer the megabuffer copy of the buffer will no longer be up-to-date
+        InvalidateMegaBuffer();
+
        std::memcpy(backing.data(), mirror.data(), mirror.size());

        if (rwTrap) {
+            megaBufferingEnabled = false; // We can't megabuffer a buffer written by the GPU
            gpu.state.nce->RetrapRegions(*trapHandle, false);
            dirtyState = DirtyState::GpuDirty;
        } else {
@ -89,9 +163,13 @@ namespace skyline::gpu {

        TRACE_EVENT("gpu", "Buffer::SynchronizeHostWithCycle");

+        // If we have performed a CPU->GPU sync and megabuffering is enabled for this buffer the megabuffer copy of the buffer will no longer be up-to-date so force a recreation
+        InvalidateMegaBuffer();
+
        std::memcpy(backing.data(), mirror.data(), mirror.size());

        if (rwTrap) {
+            megaBufferingEnabled = false; // We can't megabuffer a buffer written by the GPU
            gpu.state.nce->RetrapRegions(*trapHandle, false);
            dirtyState = DirtyState::GpuDirty;
        } else {
@ -100,11 +178,13 @@ namespace skyline::gpu {
        }
    }

-    void Buffer::SynchronizeGuest(bool skipTrap, bool skipFence) {
+    void Buffer::SynchronizeGuest(bool skipTrap, bool nonBlocking) {
        if (dirtyState != DirtyState::GpuDirty || !guest)
            return; // If the buffer has not been used on the GPU or there's no guest buffer, there is no need to synchronize it

-        if (!skipFence)
+        if (nonBlocking && !PollFence())
+            return;
+        else if (!nonBlocking)
            WaitOnFence();

        TRACE_EVENT("gpu", "Buffer::SynchronizeGuest");
@ -113,7 +193,9 @@ namespace skyline::gpu {

        if (!skipTrap)
            gpu.state.nce->RetrapRegions(*trapHandle, true);
+
        dirtyState = DirtyState::Clean;
+        TryEnableMegaBuffering(); // If megaBuffering was disabled due to potential GPU dirtiness we can safely try to re-enable it now that the buffer is clean
    }

    /**
@ -138,18 +220,45 @@ namespace skyline::gpu {
        cycle = pCycle;
    }

-    void Buffer::Read(span<u8> data, vk::DeviceSize offset) {
-        if (dirtyState == DirtyState::CpuDirty || dirtyState == DirtyState::Clean)
+    void Buffer::Read(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) {
+        if (dirtyState == DirtyState::CpuDirty || dirtyState == DirtyState::Clean) {
            std::memcpy(data.data(), mirror.data() + offset, data.size());
-        else if (dirtyState == DirtyState::GpuDirty)
+        } else if (dirtyState == DirtyState::GpuDirty) {
+            // If this buffer was attached to the current cycle, flush all pending host GPU work and wait to ensure that we read valid data
+            if (!cycle.owner_before(pCycle))
+                flushHostCallback();
+
+            SynchronizeGuest();
+
            std::memcpy(data.data(), backing.data() + offset, data.size());
+        }
    }

-    void Buffer::Write(span<u8> data, vk::DeviceSize offset) {
-        if (dirtyState == DirtyState::CpuDirty || dirtyState == DirtyState::Clean)
-            std::memcpy(mirror.data() + offset, data.data(), data.size());
-        if (dirtyState == DirtyState::GpuDirty || dirtyState == DirtyState::Clean)
+    void Buffer::Write(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) {
+        InvalidateMegaBuffer(); // Since we're writing to the backing buffer the megabuffer contents will require refresh
+
+        if (dirtyState == DirtyState::CpuDirty) {
+            SynchronizeHostWithCycle(pCycle); // Perform a CPU -> GPU sync to ensure correct ordering of writes
+        } else if (dirtyState == DirtyState::GpuDirty) {
+            // If this buffer was attached to the current cycle, flush all pending host GPU work and wait to ensure that writes are correctly ordered
+            if (!cycle.owner_before(pCycle))
+                flushHostCallback();
+
+            SynchronizeGuest();
+        }
+
+        if (dirtyState != DirtyState::Clean)
+            Logger::Error("Attempting to write to a dirty buffer"); // This should never happen since we do syncs in both directions above
+
+        std::memcpy(mirror.data() + offset, data.data(), data.size()); // Always copy to mirror since any CPU side reads will need the up-to-date contents
+
+        if (megaBufferingEnabled) {
+            // If megabuffering is enabled then we don't need to do any special sequencing here, we can write directly to the backing and the sequencing for it will be handled at usage time
            std::memcpy(backing.data() + offset, data.data(), data.size());
+        } else {
+            // Fallback to a GPU-side inline update for the buffer contents to ensure correct sequencing with draws
+            gpuCopyCallback();
+        }
    }

    Buffer::BufferViewStorage::BufferViewStorage(vk::DeviceSize offset, vk::DeviceSize size, vk::Format format) : offset(offset), size(size), format(format) {}
@ -207,6 +316,25 @@ namespace skyline::gpu {
        return BufferView{shared_from_this(), &views.back()};
    }

+    vk::DeviceSize Buffer::AcquireMegaBuffer() {
+        SynchronizeGuest(false, true); // First try and enable megabuffering by doing an immediate sync
+
+        if (!megaBufferingEnabled)
+            return 0; // Bail out if megabuffering is disabled for this buffer
+
+        SynchronizeHost(); // Since pushes to the megabuffer use the GPU backing contents ensure they're up-to-date by performing a CPU -> GPU sync
+
+        if (megaBufferOffset)
+            return megaBufferOffset; // If the current buffer contents haven't been changed since the last acquire, we can just return the existing offset
+
+        megaBufferOffset = gpu.buffer.megaBuffer.Push(backing, true); // Buffers are required to be page aligned in the megabuffer
+        return megaBufferOffset;
+    }
+
+    void Buffer::InvalidateMegaBuffer() {
+        megaBufferOffset = 0;
+    }
+
    BufferView::BufferView(std::shared_ptr<Buffer> buffer, Buffer::BufferViewStorage *view) : bufferDelegate(std::make_shared<Buffer::BufferDelegate>(std::move(buffer), view)) {}

    void BufferView::AttachCycle(const std::shared_ptr<FenceCycle> &cycle) {
@ -230,11 +358,21 @@ namespace skyline::gpu {
        }
    }

-    void BufferView::Read(span<u8> data, vk::DeviceSize offset) const {
-        bufferDelegate->buffer->Read(data, offset + bufferDelegate->view->offset);
+    void BufferView::Read(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) const {
+        bufferDelegate->buffer->Read(pCycle, flushHostCallback, data, offset + bufferDelegate->view->offset);
    }

-    void BufferView::Write(span<u8> data, vk::DeviceSize offset) const {
-        bufferDelegate->buffer->Write(data, offset + bufferDelegate->view->offset);
+    void BufferView::Write(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) const {
+        bufferDelegate->buffer->Write(pCycle, flushHostCallback, gpuCopyCallback, data, offset + bufferDelegate->view->offset);
+    }
+
+    vk::DeviceSize BufferView::AcquireMegaBuffer() const {
+        vk::DeviceSize bufferOffset{bufferDelegate->buffer->AcquireMegaBuffer()};
+
+        // Propagate 0 results since they signify that megabuffering isn't supported for a buffer
+        if (bufferOffset)
+            return bufferOffset + bufferDelegate->view->offset;
+        else
+            return 0;
    }
 }