Rewrite buffer megabuffering to be per view and more efficient

This commit implements several key optimisations to megabuffering that are all inherently interlinked:
- Megabuffering is moved from per-buffer to per-view copies; this makes megabuffering possible for small views into larger underlying buffers, which is often the case with even the simplest of games.
- Megabuffering is no longer the default option; it is only enabled for buffer views that have had inline GPU writes applied to them in the past, as that is the only case where it is beneficial. In any other case the cost of copying, even with a 128KiB limit, can be significant.
- With both of these changes, there is now the possibility of overlapping views where one uses megabuffering and one does not. In order to allow GPU inline writes to work consistently in such cases, a system of 'host immutability' has been implemented (see the sketch after this list): when a buffer is marked as host immutable for a given cycle, all writes to the buffer from that point until the cycle is signalled will be performed on the GPU, ensuring that the backing contents are correctly sequenced.
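Since 'host immutability' is the core new mechanism, here is a condensed, self-contained sketch of it (simplified standalone types of my own, not the real skyline::gpu interfaces; the diff below implements the same flow on the emulator's Buffer and FenceCycle classes):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <functional>
    #include <memory>
    #include <vector>

    struct FenceCycle {
        bool signalled{};
        bool Poll() const { return signalled; } // True once the GPU has finished executing the cycle
    };

    class Buffer {
        std::vector<std::uint8_t> backing;
        std::shared_ptr<FenceCycle> hostImmutableCycle;

      public:
        explicit Buffer(std::size_t size) : backing(size) {}

        // Forbids host-side writes to the backing until `cycle` is signalled
        void MarkHostImmutable(std::shared_ptr<FenceCycle> cycle) {
            hostImmutableCycle = std::move(cycle);
        }

        // Clears the tracked cycle once it has signalled and reports whether the buffer is still host immutable
        bool CheckHostImmutable() {
            if (hostImmutableCycle && hostImmutableCycle->Poll())
                hostImmutableCycle.reset();
            return hostImmutableCycle != nullptr;
        }

        // While host immutable, writes are routed through a caller-supplied GPU-side copy so they stay correctly sequenced
        void Write(const std::uint8_t *data, std::size_t size, std::size_t offset, const std::function<void()> &gpuCopyCallback) {
            if (CheckHostImmutable())
                gpuCopyCallback(); // Sequence the write on the GPU instead of touching the backing
            else
                std::memcpy(backing.data() + offset, data, size);
        }
    };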
Billy Laws
2022-06-10 21:26:19 +01:00
committed by PixelyIon
parent 2e356b8f0b
commit 7709dc8cf6
5 changed files with 128 additions and 77 deletions


@@ -8,9 +8,11 @@
#include "buffer.h"
namespace skyline::gpu {
void Buffer::TryEnableMegaBuffering() {
megaBufferOffset = 0;
megaBufferingEnabled = backing.size() < MegaBufferingDisableThreshold;
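/**
 * @brief Checks whether the buffer is currently host immutable, clearing the tracked cycle if it has since been signalled
 */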
bool Buffer::CheckHostImmutable() {
if (hostImmutableCycle && hostImmutableCycle->Poll())
hostImmutableCycle.reset();
return hostImmutableCycle != nullptr;
}
void Buffer::SetupGuestMappings() {
@@ -33,14 +35,11 @@ namespace skyline::gpu {
}
Buffer::Buffer(GPU &gpu, GuestBuffer guest) : gpu(gpu), backing(gpu.memory.AllocateBuffer(guest.size())), guest(guest) {
TryEnableMegaBuffering();
SetupGuestMappings();
}
Buffer::Buffer(GPU &gpu, const std::shared_ptr<FenceCycle> &pCycle, GuestBuffer guest, span<std::shared_ptr<Buffer>> srcBuffers) : gpu(gpu), backing(gpu.memory.AllocateBuffer(guest.size())), guest(guest) {
std::scoped_lock bufLock{*this};
TryEnableMegaBuffering();
SetupGuestMappings();
// Source buffers don't necessarily fully overlap with us so we have to perform a sync here to prevent any gaps
@@ -63,8 +62,15 @@ namespace skyline::gpu {
for (const auto &srcBuffer : srcBuffers) {
std::scoped_lock lock{*srcBuffer};
if (srcBuffer->guest) {
if (!srcBuffer->megaBufferingEnabled)
megaBufferingEnabled = false;
if (srcBuffer->hostImmutableCycle) {
// Propagate any host immutability
if (hostImmutableCycle) {
if (srcBuffer->hostImmutableCycle.owner_before(hostImmutableCycle))
hostImmutableCycle = srcBuffer->hostImmutableCycle;
} else {
hostImmutableCycle = srcBuffer->hostImmutableCycle;
}
}
if (srcBuffer->dirtyState == Buffer::DirtyState::GpuDirty) {
// If the source buffer is GPU dirty we cannot directly copy over its GPU backing contents
@@ -80,7 +86,7 @@ namespace skyline::gpu {
}
} else if (srcBuffer->dirtyState == Buffer::DirtyState::Clean) {
// For clean buffers we can just copy over the GPU backing data directly
// This is necessary since clean buffers may not have matching GPU/CPU data in the case of non-megabuffered inline updates
// This is necessary since clean buffers may not have matching GPU/CPU data in the case of inline updates for host immutable buffers
copyBuffer(guest, *srcBuffer->guest, backing.data(), srcBuffer->backing.data());
}
@@ -90,7 +96,6 @@ namespace skyline::gpu {
}
Buffer::Buffer(GPU &gpu, vk::DeviceSize size) : gpu(gpu), backing(gpu.memory.AllocateBuffer(size)) {
TryEnableMegaBuffering();
dirtyState = DirtyState::Clean; // Since this is a host-only buffer it's always going to be clean
}
@@ -107,7 +112,7 @@ namespace skyline::gpu {
if (dirtyState == DirtyState::GpuDirty || !guest)
return;
megaBufferingEnabled = false; // We can no longer megabuffer this buffer after it has been written by the GPU
AdvanceSequence(); // The GPU will modify buffer contents so advance to the next sequence
gpu.state.nce->RetrapRegions(*trapHandle, false);
dirtyState = DirtyState::GpuDirty;
}
@@ -139,13 +144,10 @@ namespace skyline::gpu {
TRACE_EVENT("gpu", "Buffer::SynchronizeHost");
// If we have performed a CPU->GPU sync and megabuffering is enabled for this buffer the megabuffer copy of the buffer will no longer be up-to-date
InvalidateMegaBuffer();
AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence
std::memcpy(backing.data(), mirror.data(), mirror.size());
if (rwTrap) {
megaBufferingEnabled = false; // We can't megabuffer a buffer written by the GPU
gpu.state.nce->RetrapRegions(*trapHandle, false);
dirtyState = DirtyState::GpuDirty;
} else {
@@ -163,13 +165,10 @@ namespace skyline::gpu {
TRACE_EVENT("gpu", "Buffer::SynchronizeHostWithCycle");
// If we have performed a CPU->GPU sync and megabuffering is enabled for this buffer the megabuffer copy of the buffer will no longer be up-to-date so force a recreation
InvalidateMegaBuffer();
AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence
std::memcpy(backing.data(), mirror.data(), mirror.size());
if (rwTrap) {
megaBufferingEnabled = false; // We can't megabuffer a buffer written by the GPU
gpu.state.nce->RetrapRegions(*trapHandle, false);
dirtyState = DirtyState::GpuDirty;
} else {
@@ -195,7 +194,6 @@ namespace skyline::gpu {
gpu.state.nce->RetrapRegions(*trapHandle, true);
dirtyState = DirtyState::Clean;
TryEnableMegaBuffering(); // If megaBuffering was disabled due to potential GPU dirtiness we can safely try to re-enable it now that the buffer is clean
}
/**
@@ -236,7 +234,8 @@ namespace skyline::gpu {
}
void Buffer::Write(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) {
InvalidateMegaBuffer(); // Since we're writing to the backing buffer the megabuffer contents will require refresh
AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence
everHadInlineUpdate = true;
// Perform syncs in both directions to ensure correct ordering of writes
if (dirtyState == DirtyState::CpuDirty)
@@ -249,13 +248,12 @@ namespace skyline::gpu {
std::memcpy(mirror.data() + offset, data.data(), data.size()); // Always copy to mirror since any CPU side reads will need the up-to-date contents
if (megaBufferingEnabled) {
// If megabuffering is enabled then we don't need to do any special sequencing here, we can write directly to the backing and the sequencing for it will be handled at usage time
std::memcpy(backing.data() + offset, data.data(), data.size());
} else {
// Fallback to a GPU-side inline update for the buffer contents to ensure correct sequencing with draws
if (CheckHostImmutable())
// Perform a GPU-side inline update for the buffer contents if this buffer is host immutable since we can't directly modify the backing
gpuCopyCallback();
}
else
// If that's not the case we don't need to do any GPU-side sequencing here, we can write directly to the backing and the sequencing for it will be handled at usage time
std::memcpy(backing.data() + offset, data.data(), data.size());
}
BufferView Buffer::GetView(vk::DeviceSize offset, vk::DeviceSize size, vk::Format format) {
@@ -264,23 +262,20 @@ namespace skyline::gpu {
return BufferView{shared_from_this(), &(*it)};
}
vk::DeviceSize Buffer::AcquireMegaBuffer(MegaBuffer &megaBuffer) {
SynchronizeGuest(false, true); // First try and enable megabuffering by doing an immediate sync
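/**
 * @brief Acquires the buffer's current sequence number together with a span over its host mirror
 * @return A {sequence number, mirror span} pair, or a default-constructed pair if the buffer is GPU dirty and its contents are indeterminate
 */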
std::pair<u64, span<u8>> Buffer::AcquireCurrentSequence() {
SynchronizeGuest(false, true); // First try to remove GPU dirtiness by doing an immediate sync
if (!megaBufferingEnabled)
return 0; // Bail out if megabuffering is disabled for this buffer
if (dirtyState == DirtyState::GpuDirty)
// Bail out if the buffer is GPU dirty - since we don't know the contents ahead of time the sequence is indeterminate
return {};
SynchronizeHost(); // Since pushes to the megabuffer use the GPU backing contents ensure they're up-to-date by performing a CPU -> GPU sync
SynchronizeHost(); // Ensure that the returned mirror is fully up-to-date by performing a CPU -> GPU sync
if (megaBufferOffset)
return megaBufferOffset; // If the current buffer contents haven't been changed since the last acquire, we can just return the existing offset
megaBufferOffset = megaBuffer.Push(backing, true); // Buffers are required to be page aligned in the megabuffer
return megaBufferOffset;
return {sequenceNumber, mirror};
}
void Buffer::InvalidateMegaBuffer() {
megaBufferOffset = 0;
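/**
 * @brief Increments the buffer's sequence number, invalidating any megabuffer copies of views that were tagged with an older sequence
 */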
void Buffer::AdvanceSequence() {
sequenceNumber++;
}
span<u8> Buffer::GetReadOnlyBackingSpan(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback) {
@@ -290,6 +285,10 @@ namespace skyline::gpu {
return mirror;
}
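/**
 * @brief Marks the buffer as host immutable for the duration of the supplied fence cycle, forcing writes within it to be sequenced on the GPU
 */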
void Buffer::MarkHostImmutable(const std::shared_ptr<FenceCycle> &pCycle) {
hostImmutableCycle = pCycle;
}
Buffer::BufferViewStorage::BufferViewStorage(vk::DeviceSize offset, vk::DeviceSize size, vk::Format format) : offset(offset), size(size), format(format) {}
Buffer::BufferDelegate::BufferDelegate(std::shared_ptr<Buffer> pBuffer, const Buffer::BufferViewStorage *view) : buffer(std::move(pBuffer)), view(view) {
@@ -347,7 +346,10 @@ namespace skyline::gpu {
}
}
void BufferView::RegisterUsage(const std::function<void(const Buffer::BufferViewStorage &, const std::shared_ptr<Buffer> &)> &usageCallback) {
void BufferView::RegisterUsage(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void(const Buffer::BufferViewStorage &, const std::shared_ptr<Buffer> &)> &usageCallback) {
// Users of RegisterUsage expect the buffer contents to be sequenced as the guest GPU would be, so force any further writes in the current cycle to occur on the GPU
bufferDelegate->buffer->MarkHostImmutable(pCycle);
usageCallback(*bufferDelegate->view, bufferDelegate->buffer);
if (!bufferDelegate->usageCallback) {
bufferDelegate->usageCallback = usageCallback;
@@ -364,17 +366,39 @@ namespace skyline::gpu {
}
void BufferView::Write(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) const {
// If megabuffering can't be enabled we have to do a GPU-side copy to ensure sequencing
bool gpuCopy{bufferDelegate->view->size > MegaBufferingDisableThreshold};
if (gpuCopy)
// This will force the host buffer contents to stay as is for the current cycle, requiring that write operations are instead sequenced on the GPU for the entire buffer
bufferDelegate->buffer->MarkHostImmutable(pCycle);
bufferDelegate->buffer->Write(pCycle, flushHostCallback, gpuCopyCallback, data, offset + bufferDelegate->view->offset);
}
vk::DeviceSize BufferView::AcquireMegaBuffer(MegaBuffer &megaBuffer) const {
vk::DeviceSize bufferOffset{bufferDelegate->buffer->AcquireMegaBuffer(megaBuffer)};
// Propagate 0 results since they signify that megabuffering isn't supported for a buffer
if (bufferOffset)
return bufferOffset + bufferDelegate->view->offset;
else
if (!bufferDelegate->buffer->EverHadInlineUpdate())
// Don't megabuffer buffers that have never had inline updates since performance is only going to be harmed as a result of the constant copying, and there won't be any benefit since there are no GPU inline updates that would be avoided
return 0;
if (bufferDelegate->view->size > MegaBufferingDisableThreshold)
return 0;
auto[newSequence, sequenceSpan]{bufferDelegate->buffer->AcquireCurrentSequence()};
if (!newSequence)
return 0; // If the sequence can't be acquired then the buffer is GPU dirty and we can't megabuffer
// If a copy of the view for the current sequence is already in the megabuffer then we can just use that
if (newSequence == bufferDelegate->view->lastAcquiredSequence && bufferDelegate->view->megabufferOffset)
return bufferDelegate->view->megabufferOffset;
// If the view is not in the megabuffer then we need to allocate a new copy
auto viewBackingSpan{sequenceSpan.subspan(bufferDelegate->view->offset, bufferDelegate->view->size)};
// TODO: we could optimise the alignment requirements here based on buffer usage
bufferDelegate->view->megabufferOffset = megaBuffer.Push(viewBackingSpan, true);
bufferDelegate->view->lastAcquiredSequence = newSequence;
return bufferDelegate->view->megabufferOffset; // Success!
}
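For reference, a condensed standalone sketch of the per-view caching scheme above (simplified types of my own, not the real skyline::gpu interfaces; this MegaBuffer::Push is a naive stub):

    #include <cstddef>
    #include <cstdint>
    #include <span>
    #include <vector>

    struct MegaBuffer {
        std::vector<std::uint8_t> storage;
        MegaBuffer() : storage(0x1000) {} // Offset 0 is reserved to signal "not megabuffered", so start past the first page

        // Appends an (optionally page-aligned) copy of `data`, returning its offset within the megabuffer
        std::uint64_t Push(std::span<const std::uint8_t> data, bool pageAlign) {
            if (pageAlign)
                storage.resize((storage.size() + 0xFFF) & ~std::size_t{0xFFF});
            std::uint64_t offset{storage.size()};
            storage.insert(storage.end(), data.begin(), data.end());
            return offset;
        }
    };

    struct BufferViewStorage {
        std::size_t offset, size;
        std::uint64_t lastAcquiredSequence{}; // Buffer sequence number the cached copy was taken at
        std::uint64_t megabufferOffset{};     // 0 means there is no valid cached copy
    };

    // Reuses the cached megabuffer copy while the underlying buffer's sequence
    // number hasn't advanced, otherwise pushes a fresh copy of just the view
    std::uint64_t AcquireViewMegaBuffer(BufferViewStorage &view, std::uint64_t bufferSequence,
                                        std::span<const std::uint8_t> bufferMirror, MegaBuffer &megaBuffer) {
        if (view.lastAcquiredSequence == bufferSequence && view.megabufferOffset)
            return view.megabufferOffset;

        view.megabufferOffset = megaBuffer.Push(bufferMirror.subspan(view.offset, view.size), true);
        view.lastAcquiredSequence = bufferSequence;
        return view.megabufferOffset;
    }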
span<u8> BufferView::GetReadOnlyBackingSpan(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback) {