Mirror of https://github.com/Takiiiiiiii/strato.git, synced 2025-07-17 08:46:39 +00:00
Rework Texture & Buffer for Context and FenceCycle Chaining
GPU resources were designed with fence-based locking in mind: fences were treated as implicit locks on a GPU resource, and design paradigms such as `GraphicsContext` simply unlocking the texture mutex after attaching it (which set the fence cycle) were previously considered fine. This is suboptimal, as it forces a `FenceCycle` to effectively ensure exclusivity, conflating the function of the mutex (mutual exclusion) with that of the fence (tracking GPU-side completion). As a result, whether it was acceptable to use a GPU resource was tied to GPU completion rather than simply to whether the CPU was currently using it, which is the mutex's job to answer.

This rework fixes that, building on the groundwork laid by previous commits: `Context` semantics are used to move back to mutexes for locking resources, and GPU usage is tracked in a cleaner way rather than through arbitrary fence comparisons. Many methods that involved fences no longer require them and can be removed entirely, further cleaning up the codebase. It also opens the door to future improvements such as removing `hostImmutableCycle` in favor of a better solution, the current implementation of which is broken regardless.

While moving to `Context`-based locking, the question of multiple in-flight GPU workloads using overlapping resources exposed a fundamental limitation of `FenceCycle`: only one resource could be concurrently attached to a cycle, and multi-cycle dependencies could not be adequately represented. `FenceCycle` chaining was designed to fix this inadequacy; it allows several GPU workloads to be in flight concurrently while utilizing the same resources, as long as GPU-GPU synchronization between them can be ensured.
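As a rough illustration of the chaining mechanism described above, here is a minimal sketch; the member names and structure are assumptions for illustration, not the repository's actual `FenceCycle` implementation, which wraps a Vulkan fence and its attached resources:

#include <memory>
#include <mutex>
#include <vector>

// Minimal sketch of FenceCycle chaining (hypothetical member names):
// a cycle records the cycles its workload depends on, so waiting on it
// transitively waits on every chained predecessor.
struct FenceCycle {
    std::mutex mutex;
    std::vector<std::shared_ptr<FenceCycle>> chainedCycles; // Cycles this cycle's workload depends on

    // Record a GPU-GPU dependency on an earlier cycle
    void ChainCycle(const std::shared_ptr<FenceCycle> &dependency) {
        std::scoped_lock lock{mutex};
        chainedCycles.push_back(dependency);
    }

    void Wait() {
        std::scoped_lock lock{mutex};
        for (const auto &cycle : chainedCycles)
            cycle->Wait(); // Chained workloads must complete first
        chainedCycles.clear();
        // ... then wait on this cycle's own fence (elided here)
    }
};

This is what allows several in-flight workloads to share resources: a later cycle chains the earlier ones instead of each resource being exclusively bound to a single cycle.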
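The mutex/fence separation itself can be pictured with a second hypothetical sketch (again illustrative names, not the repository's actual classes): the mutex alone answers "is the CPU currently using this resource?", while the attached cycle only answers "has the GPU finished with it?".

#include <memory>
#include <mutex>

struct FenceCycle; // As sketched above

// Hypothetical sketch of the reworked locking split
struct GpuResource {
    std::mutex mutex; // CPU-side mutual exclusion, held by a Context while recording
    std::shared_ptr<FenceCycle> cycle; // GPU-side completion tracking only

    // Attaching a cycle no longer doubles as an unlock: a Context locks
    // the resource, records usage, attaches the cycle, then unlocks
    void AttachCycle(std::shared_ptr<FenceCycle> pCycle) {
        cycle = std::move(pCycle);
    }
};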
@@ -64,8 +64,7 @@ namespace skyline::gpu {
         if (srcBuffer->hostImmutableCycle) {
             // Propagate any host immutability
             if (hostImmutableCycle) {
-                if (srcBuffer->hostImmutableCycle.owner_before(hostImmutableCycle))
-                    hostImmutableCycle = srcBuffer->hostImmutableCycle;
+                srcBuffer->hostImmutableCycle->Wait();
             } else {
                 hostImmutableCycle = srcBuffer->hostImmutableCycle;
             }
@@ -119,17 +118,15 @@ namespace skyline::gpu {
     void Buffer::WaitOnFence() {
         TRACE_EVENT("gpu", "Buffer::WaitOnFence");
 
-        auto lCycle{cycle.lock()};
-        if (lCycle) {
-            lCycle->Wait();
-            cycle.reset();
+        if (cycle) {
+            cycle->Wait();
+            cycle = nullptr;
         }
     }
 
     bool Buffer::PollFence() {
-        auto lCycle{cycle.lock()};
-        if (lCycle && lCycle->Poll()) {
-            cycle.reset();
+        if (cycle && cycle->Poll()) {
+            cycle = nullptr;
             return true;
         }
         return false;
@@ -155,27 +152,6 @@ namespace skyline::gpu {
         }
     }
 
-    void Buffer::SynchronizeHostWithCycle(const std::shared_ptr<FenceCycle> &pCycle, bool rwTrap) {
-        if (dirtyState != DirtyState::CpuDirty || !guest)
-            return;
-
-        if (!cycle.owner_before(pCycle))
-            WaitOnFence();
-
-        TRACE_EVENT("gpu", "Buffer::SynchronizeHostWithCycle");
-
-        AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence
-        std::memcpy(backing.data(), mirror.data(), mirror.size());
-
-        if (rwTrap) {
-            gpu.state.nce->RetrapRegions(*trapHandle, false);
-            dirtyState = DirtyState::GpuDirty;
-        } else {
-            gpu.state.nce->RetrapRegions(*trapHandle, true);
-            dirtyState = DirtyState::Clean;
-        }
-    }
-
     void Buffer::SynchronizeGuest(bool skipTrap, bool nonBlocking) {
         if (dirtyState != DirtyState::GpuDirty || !guest)
             return; // If the buffer has not been used on the GPU or there's no guest buffer, there is no need to synchronize it
@@ -195,52 +171,30 @@ namespace skyline::gpu {
         dirtyState = DirtyState::Clean;
     }
 
-    /**
-     * @brief A FenceCycleDependency that synchronizes the contents of a host buffer with the guest buffer
-     */
-    struct BufferGuestSync {
-        std::shared_ptr<Buffer> buffer;
-
-        explicit BufferGuestSync(std::shared_ptr<Buffer> buffer) : buffer(std::move(buffer)) {}
-
-        ~BufferGuestSync() {
-            TRACE_EVENT("gpu", "Buffer::BufferGuestSync");
-            buffer->SynchronizeGuest();
-        }
-    };
-
-    void Buffer::SynchronizeGuestWithCycle(const std::shared_ptr<FenceCycle> &pCycle) {
-        if (!cycle.owner_before(pCycle))
-            WaitOnFence();
-
-        pCycle->AttachObject(std::make_shared<BufferGuestSync>(shared_from_this()));
-        cycle = pCycle;
-    }
-
-    void Buffer::SynchronizeGuestImmediate(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback) {
+    void Buffer::SynchronizeGuestImmediate(bool isFirstUsage, const std::function<void()> &flushHostCallback) {
         // If this buffer was attached to the current cycle, flush all pending host GPU work and wait to ensure that we read valid data
-        if (cycle.owner_before(pCycle))
+        if (!isFirstUsage)
             flushHostCallback();
 
         SynchronizeGuest();
     }
 
-    void Buffer::Read(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) {
+    void Buffer::Read(bool isFirstUsage, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) {
         if (dirtyState == DirtyState::GpuDirty)
-            SynchronizeGuestImmediate(pCycle, flushHostCallback);
+            SynchronizeGuestImmediate(isFirstUsage, flushHostCallback);
 
         std::memcpy(data.data(), mirror.data() + offset, data.size());
     }
 
-    void Buffer::Write(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) {
+    void Buffer::Write(bool isFirstUsage, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) {
         AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence
         everHadInlineUpdate = true;
 
         // Perform a syncs in both directions to ensure correct ordering of writes
         if (dirtyState == DirtyState::CpuDirty)
-            SynchronizeHostWithCycle(pCycle);
+            SynchronizeHost();
         else if (dirtyState == DirtyState::GpuDirty)
-            SynchronizeGuestImmediate(pCycle, flushHostCallback);
+            SynchronizeGuestImmediate(isFirstUsage, flushHostCallback);
 
         if (dirtyState != DirtyState::Clean)
             Logger::Error("Attempting to write to a dirty buffer"); // This should never happen since we do syncs in both directions above
@@ -277,9 +231,9 @@ namespace skyline::gpu {
         sequenceNumber++;
     }
 
-    span<u8> Buffer::GetReadOnlyBackingSpan(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback) {
+    span<u8> Buffer::GetReadOnlyBackingSpan(bool isFirstUsage, const std::function<void()> &flushHostCallback) {
         if (dirtyState == DirtyState::GpuDirty)
-            SynchronizeGuestImmediate(pCycle, flushHostCallback);
+            SynchronizeGuestImmediate(isFirstUsage, flushHostCallback);
 
         return mirror;
     }
@@ -372,18 +326,9 @@ namespace skyline::gpu {
 
     BufferView::BufferView(std::shared_ptr<Buffer> buffer, const Buffer::BufferViewStorage *view) : bufferDelegate(std::make_shared<Buffer::BufferDelegate>(std::move(buffer), view)) {}
 
-    void BufferView::AttachCycle(const std::shared_ptr<FenceCycle> &cycle) {
-        auto buffer{bufferDelegate->buffer.get()};
-        if (!buffer->cycle.owner_before(cycle)) {
-            buffer->WaitOnFence();
-            buffer->cycle = cycle;
-            cycle->AttachObject(bufferDelegate);
-        }
-    }
-
-    void BufferView::RegisterUsage(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void(const Buffer::BufferViewStorage &, const std::shared_ptr<Buffer> &)> &usageCallback) {
+    void BufferView::RegisterUsage(const std::shared_ptr<FenceCycle> &cycle, const std::function<void(const Buffer::BufferViewStorage &, const std::shared_ptr<Buffer> &)> &usageCallback) {
         // Users of RegisterUsage expect the buffer contents to be sequenced as the guest GPU would be, so force any further writes in the current cycle to occur on the GPU
-        bufferDelegate->buffer->MarkHostImmutable(pCycle);
+        bufferDelegate->buffer->MarkHostImmutable(cycle);
 
         usageCallback(*bufferDelegate->view, bufferDelegate->buffer);
         if (!bufferDelegate->usageCallback) {
@@ -396,18 +341,18 @@ namespace skyline::gpu {
         }
     }
 
-    void BufferView::Read(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) const {
-        bufferDelegate->buffer->Read(pCycle, flushHostCallback, data, offset + bufferDelegate->view->offset);
+    void BufferView::Read(bool isFirstUsage, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) const {
+        bufferDelegate->buffer->Read(isFirstUsage, flushHostCallback, data, offset + bufferDelegate->view->offset);
     }
 
-    void BufferView::Write(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) const {
+    void BufferView::Write(bool isFirstUsage, const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback, const std::function<void()> &gpuCopyCallback, span<u8> data, vk::DeviceSize offset) const {
         // If megabuffering can't be enabled we have to do a GPU-side copy to ensure sequencing
         bool gpuCopy{bufferDelegate->view->size > MegaBufferingDisableThreshold};
         if (gpuCopy)
             // This will force the host buffer contents to stay as is for the current cycle, requiring that write operations are instead sequenced on the GPU for the entire buffer
             bufferDelegate->buffer->MarkHostImmutable(pCycle);
 
-        bufferDelegate->buffer->Write(pCycle, flushHostCallback, gpuCopyCallback, data, offset + bufferDelegate->view->offset);
+        bufferDelegate->buffer->Write(isFirstUsage, flushHostCallback, gpuCopyCallback, data, offset + bufferDelegate->view->offset);
     }
 
     vk::DeviceSize BufferView::AcquireMegaBuffer(MegaBuffer &megaBuffer) const {
@@ -436,8 +381,8 @@ namespace skyline::gpu {
         return bufferDelegate->view->megabufferOffset; // Success!
     }
 
-    span<u8> BufferView::GetReadOnlyBackingSpan(const std::shared_ptr<FenceCycle> &pCycle, const std::function<void()> &flushHostCallback) {
-        auto backing{bufferDelegate->buffer->GetReadOnlyBackingSpan(pCycle, flushHostCallback)};
+    span<u8> BufferView::GetReadOnlyBackingSpan(bool isFirstUsage, const std::function<void()> &flushHostCallback) {
+        auto backing{bufferDelegate->buffer->GetReadOnlyBackingSpan(isFirstUsage, flushHostCallback)};
         return backing.subspan(bufferDelegate->view->offset, bufferDelegate->view->size);
     }
 }
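To show how the reworked signatures read at a call site, here is a hedged usage sketch against the `BufferView::Read` overload introduced in the diff; the caller and its flush lambda are assumed stand-ins, not code from the repository:

// Hypothetical caller: instead of handing Read the current FenceCycle for
// owner comparisons, the caller states whether this is the first usage in
// the current context and provides a callback that flushes pending host
// GPU work when it is not
void ReadbackSketch(BufferView &view, span<u8> output, bool isFirstUsage) {
    view.Read(isFirstUsage, [] {
        // Submit all in-flight host GPU work and wait on it here so the
        // mirror read inside Buffer::Read observes valid data
    }, output, 0);
}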