Implement an alternative buffer path using direct memory importing

By importing guest memory directly onto the host GPU we can avoid many of the complexities that occur with memory tracking, as well as its heavy performance overhead in some situations. Since it's still desired to support the traditional buffer method, as it's faster in some cases and more widely supported, most of the exposed buffer methods have been split into two variants with just a small amount of shared code. While in most cases the code is simpler, one area with more complexity is handling CPU accesses that need to be sequenced, since we don't have any place we can easily apply writes to on the GPFIFO thread that won't also impact the buffer on the GPU. To solve this, when the GPU is actively using a buffer's contents, an interval list is used to keep track of any GPFIFO-written regions on the CPU, and any CPU reads to them will instead be directed to a shadow of the buffer with just those writes applied. Once the GPU has finished using the buffer contents the shadow can then be removed, as all writes will have been done by the GPU. The main caveat of this is that it requires tying host sync to guest sync; this can reduce performance in games which double buffer command buffers, as it prevents us from fully saturating the CPU with the GPFIFO thread.
2025-07-17 08:46:39 +00:00 · 2022-12-27 18:21:58 +00:00
parent b3f7e990cc
commit 3d31ade35f
9 changed files with 481 additions and 152 deletions
--- a/app/src/main/cpp/skyline/gpu/buffer.cpp
+++ b/app/src/main/cpp/skyline/gpu/buffer.cpp
@ -1,6 +1,7 @@
 // SPDX-License-Identifier: MPL-2.0
 // Copyright © 2021 Skyline Team and Contributors (https://github.com/skyline-emu/)

+#include <adrenotools/driver.h>
 #include <gpu.h>
 #include <kernel/memory.h>
 #include <kernel/types/KProcess.h>
@ -18,12 +19,9 @@ namespace skyline::gpu {
        unifiedMegaBuffer = {};
    }

-    void Buffer::SetupGuestMappings() {
-        u8 *alignedData{util::AlignDown(guest->data(), constant::PageSize)};
-        size_t alignedSize{static_cast<size_t>(util::AlignUp(guest->data() + guest->size(), constant::PageSize) - alignedData)};
-
-        alignedMirror = gpu.state.process->memory.CreateMirror(span<u8>{alignedData, alignedSize});
-        mirror = alignedMirror.subspan(static_cast<size_t>(guest->data() - alignedData), guest->size());
+    void Buffer::SetupStagedTraps() {
+        if (isDirect)
+            return;

        // We can't just capture this in the lambda since the lambda could exceed the lifetime of the buffer
        std::weak_ptr<Buffer> weakThis{shared_from_this()};
@ -99,7 +97,7 @@ namespace skyline::gpu {
            if (buffer->accumulatedGuestWaitTime > FastReadbackHackWaitTimeThreshold && *buffer->gpu.state.settings->enableFastGpuReadbackHack) {
                // As opposed to skipping readback as we do for textures, with buffers we can still perform the readback but just without syncing the GPU
                // While the read data may be invalid it's still better than nothing and works in most cases
-                memcpy(buffer->mirror.data(), buffer->backing.data(), buffer->mirror.size());
+                memcpy(buffer->mirror.data(), buffer->backing->data(), buffer->mirror.size());
                buffer->dirtyState = DirtyState::Clean;
                return true;
            }
@ -118,37 +116,266 @@ namespace skyline::gpu {
        });
    }

-    Buffer::Buffer(LinearAllocatorState<> &delegateAllocator, GPU &gpu, GuestBuffer guest, size_t id)
-        : gpu{gpu},
-          backing{gpu.memory.AllocateBuffer(guest.size())},
-          guest{guest},
-          delegate{delegateAllocator.EmplaceUntracked<BufferDelegate>(this)},
-          id{id},
-          megaBufferTableShift{std::max(std::bit_width(guest.size() / MegaBufferTableMaxEntries - 1), MegaBufferTableShiftMin)} {
-        megaBufferTable.resize(guest.size() / (1 << megaBufferTableShift));
-    }
+    void Buffer::InsertWriteIntervalDirect(WriteTrackingInterval entry) {
+        auto firstIt{std::lower_bound(directTrackedWrites.begin(), directTrackedWrites.end(), entry, [](const auto &lhs, const auto &rhs) {
+            return lhs.end < rhs.offset;
+        })}; // Lowest offset entry that (maybe) overlaps with the new entry

-    Buffer::Buffer(LinearAllocatorState<> &delegateAllocator, GPU &gpu, vk::DeviceSize size, size_t id)
-        : gpu{gpu},
-          backing{gpu.memory.AllocateBuffer(size)},
-          delegate{delegateAllocator.EmplaceUntracked<BufferDelegate>(this)},
-          id{id} {
-        dirtyState = DirtyState::Clean; // Since this is a host-only buffer it's always going to be clean
-    }
-
-    Buffer::~Buffer() {
-        if (trapHandle)
-            gpu.state.nce->DeleteTrap(*trapHandle);
-        SynchronizeGuest(true);
-        if (alignedMirror.valid())
-            munmap(alignedMirror.data(), alignedMirror.size());
-        WaitOnFence();
-    }
-
-    void Buffer::MarkGpuDirty() {
-        if (!guest)
+        if (firstIt == directTrackedWrites.end() || firstIt->offset >= entry.end) {
+            directTrackedWrites.insert(firstIt, entry);
            return;
+        }
+        // Now firstIt will always overlap
+        
+        auto lastIt{firstIt}; // Highest offset entry that overlaps with the new entry
+        while (std::next(lastIt) != directTrackedWrites.end() && std::next(lastIt)->offset < entry.end)
+            lastIt++;

+        // Since firstIt and lastIt both are guaranteed to overlap, max them to get the new entry's end
+        size_t end{std::max(std::max(firstIt->end, entry.end), lastIt->end)};
+
+        // Erase all overlapping entries but the first
+        auto eraseStartIt{std::next(firstIt)};
+        auto eraseEndIt{std::next(lastIt)};
+        if (eraseStartIt != eraseEndIt) {
+            lastIt = directTrackedWrites.erase(eraseStartIt, eraseEndIt);
+            firstIt = std::prev(lastIt);
+        }
+
+        firstIt->offset = std::min(entry.offset, firstIt->offset);
+        firstIt->end = end;
+    }
+
+    // Reports whether `offset` lies inside a tracked write interval and for how many bytes (starting at `offset`) that answer stays the same
+    // Tracked intervals are half-open ([offset, end)) as they are inserted as {offset, offset + size}
+    Buffer::QueryIntervalResult Buffer::QueryWriteIntervalDirect(u64 offset) {
+        // An entry that ends exactly at `offset` doesn't cover it, so it has to be skipped here; selecting it would report a zero-length overlap and a caller advancing by the returned size would never make progress
+        auto it{std::lower_bound(directTrackedWrites.begin(), directTrackedWrites.end(), offset, [](const auto &lhs, const auto &rhs) {
+            return lhs.end <= rhs;
+        })}; // Lowest offset entry that ends past the queried offset
+
+        if (it == directTrackedWrites.end()) // No overlaps for the entire rest of buffer
+            return {false, mirror.size() - offset};
+        else if (it->offset > offset) // No overlap, return the distance to the next possible overlap
+            return {false, it->offset - offset};
+        else // Overlap, return the (non-zero) distance to the end of the overlap
+            return {true, it->end - offset};
+    }
+
+    // Makes sure the shadow used for tracked writes exists, sizing it to the guest buffer the first time it's needed
+    void Buffer::EnableTrackedShadowDirect() {
+        if (directTrackedShadowActive)
+            return; // Already set up, nothing to do
+
+        directTrackedShadow.resize(guest->size());
+        directTrackedShadowActive = true;
+    }
+
+    // Records [offset, offset + size) as a tracked write and hands back the matching region of the shadow for the caller to fill in
+    span<u8> Buffer::BeginWriteCpuSequencedDirect(size_t offset, size_t size) {
+        EnableTrackedShadowDirect(); // The shadow has to exist before a pointer into it can be returned
+
+        WriteTrackingInterval writtenRegion{offset, offset + size};
+        InsertWriteIntervalDirect(writtenRegion);
+
+        u8 *shadowStart{directTrackedShadow.data() + offset};
+        return {shadowStart, size};
+    }
+
+    // Returns true while sequenced CPU backing writes are blocked or the fence hasn't signalled; otherwise drops the shadow and all tracked write intervals and returns false
+    bool Buffer::RefreshGpuReadsActiveDirect() {
+        if (SequencedCpuBackingWritesBlocked() || !PollFence())
+            return true;
+
+        // Nothing is pending any more so the shadow storage can be released
+        if (directTrackedShadowActive) {
+            directTrackedShadowActive = false;
+            directTrackedShadow.clear();
+            directTrackedShadow.shrink_to_fit();
+        }
+
+        directTrackedWrites.clear();
+        return false;
+    }
+    
+    // Re-evaluates `directGpuWritesActive`: when writes are still pending this either reports that (non-waiting) or flushes/waits until they are done (waiting), after which the flag is cleared
+    bool Buffer::RefreshGpuWritesActiveDirect(bool wait, const std::function<void()> &flushHostCallback) {
+        const bool writesPending{directGpuWritesActive && (!PollFence() || AllCpuBackingWritesBlocked())};
+
+        if (writesPending) {
+            if (!wait)
+                return true;
+
+            // If we are dirty in the current cycle we'll need to flush before the fence can be waited on
+            if (AllCpuBackingWritesBlocked())
+                flushHostCallback();
+
+            WaitOnFence();
+        }
+
+        // Either there was nothing pending or it has been waited out
+        directGpuWritesActive = false;
+        return false;
+    }
+
+    // Decides whether a view of `size` bytes may be megabuffered on the direct path
+    bool Buffer::ValidateMegaBufferViewImplDirect(vk::DeviceSize size) {
+        // Only buffers that have had inline updates are candidates, and only for views smaller than a megabuffer chunk
+        const bool candidate{everHadInlineUpdate && size < MegaBufferChunkSize};
+        if (!candidate)
+            return false;
+
+        // A buffer that is currently being written to by the GPU can't be megabuffered
+        if (RefreshGpuWritesActiveDirect())
+            return false;
+
+        // With an active shadow the mirror is missing the shadow-tracked writes, so it can't be used as the source
+        return !directTrackedShadowActive;
+    }
+
+    // Decides whether a view of `size` bytes may be megabuffered on the staged path
+    bool Buffer::ValidateMegaBufferViewImplStaged(vk::DeviceSize size) {
+        // Buffers that never had inline updates and aren't frequently synced gain nothing from megabuffering, the constant copying would only hurt performance
+        const bool noBenefit{!everHadInlineUpdate && sequenceNumber < FrequentlySyncedThreshold};
+        if (noBenefit || size >= MegaBufferChunkSize)
+            return false;
+
+        // Dirty state is read without taking the lock here: it is only ever set GPU dirty with the buffer locked and from the active GPFIFO thread, and the lock is slightly expensive
+        // A GPU dirty buffer has contents we don't know ahead of time, so it can't be megabuffered
+        return dirtyState != DirtyState::GpuDirty;
+    }
+
+    // Dispatches the megabuffer eligibility check to the direct or staged implementation
+    bool Buffer::ValidateMegaBufferView(vk::DeviceSize size) {
+        if (isDirect)
+            return ValidateMegaBufferViewImplDirect(size);
+
+        return ValidateMegaBufferViewImplStaged(size);
+    }
+
+    // Direct-path copy of `size` bytes from `src` at `srcOffset` into this buffer at `dstOffset`, choosing between a GPU copy, a GPU copy plus shadow-tracked CPU write, or a plain write into the mirror
+    void Buffer::CopyFromImplDirect(vk::DeviceSize dstOffset, Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size, const std::function<void()> &gpuCopyCallback) {
+        everHadInlineUpdate = true;
+
+        const bool gpuWritesPending{src->RefreshGpuWritesActiveDirect() || RefreshGpuWritesActiveDirect()};
+        const bool gpuReadsPending{RefreshGpuReadsActiveDirect()}; // Always refreshed, even when the result ends up unused
+
+        if (gpuWritesPending) {
+            // Force buffer to be dirty for this cycle if either of the sources are dirty, this is needed as otherwise it could have just been dirty from the previous cycle
+            MarkGpuDirty();
+            gpuCopyCallback();
+            return;
+        }
+
+        if (gpuReadsPending) {
+            // The copy itself is done on the GPU while the CPU-visible result is recorded in the shadow
+            gpuCopyCallback();
+            src->Read(false, {}, BeginWriteCpuSequencedDirect(dstOffset, size), srcOffset);
+            return;
+        }
+
+        // Neither side is in use by the GPU so the data can go straight into the mirror
+        src->Read(false, {}, {mirror.data() + dstOffset, size}, srcOffset);
+    }
+
+    // Staged-path copy of `size` bytes from `src` at `srcOffset` into this buffer at `dstOffset`
+    void Buffer::CopyFromImplStaged(vk::DeviceSize dstOffset, Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size, const std::function<void()> &gpuCopyCallback) {
+        std::scoped_lock lock{stateMutex, src->stateMutex}; // Fine even if src and dst are same since recursive mutex
+
+        // If the buffer is used in sequence directly on the GPU, SynchronizeHost before modifying the mirror contents to ensure proper sequencing. This write will then be sequenced on the GPU instead (the buffer will be kept clean for the rest of the execution due to gpuCopyCallback blocking all writes)
+        if (dirtyState == DirtyState::CpuDirty && SequencedCpuBackingWritesBlocked())
+            SynchronizeHost();
+
+        // When either side is GPU dirty the copy has to be done entirely on the GPU
+        if (dirtyState == DirtyState::GpuDirty || src->dirtyState == DirtyState::GpuDirty) {
+            MarkGpuDirty();
+            gpuCopyCallback();
+            return;
+        }
+
+        auto srcData{src->mirror.data() + srcOffset};
+        std::memcpy(mirror.data() + dstOffset, srcData, size);
+
+        // Skip updating backing if the changes are gonna be updated later by SynchroniseHost in executor anyway
+        if (dirtyState == DirtyState::CpuDirty && !SequencedCpuBackingWritesBlocked())
+            return;
+
+        // We can write directly to the backing as long as this resource isn't being actively used by a past workload (in the current context or another)
+        const bool backingWritable{!SequencedCpuBackingWritesBlocked() && PollFence()};
+        if (backingWritable)
+            std::memcpy(backing->data() + dstOffset, srcData, size);
+        else
+            gpuCopyCallback();
+    }
+
+    // Direct-path write of `data` at `offset`; returns true when the write couldn't be performed because a GPU copy was required but no `gpuCopyCallback` was supplied
+    bool Buffer::WriteImplDirect(span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback) {
+        // If the buffer is GPU dirty the write has to happen on the GPU
+        if (RefreshGpuWritesActiveDirect()) {
+            if (!gpuCopyCallback)
+                return true;
+
+            // Propagate dirtiness to the current cycle, since if this is only dirty in a previous cycle that could change at any time and we would need to have the write saved somewhere for CPU reads
+            // By propagating the dirtiness to the current cycle we can avoid this and force a wait on any reads
+            MarkGpuDirty();
+            gpuCopyCallback();
+            return false;
+        }
+
+        // If the GPU could read the buffer we need to track the write in the shadow and do the actual write on the GPU
+        if (RefreshGpuReadsActiveDirect()) {
+            if (!gpuCopyCallback)
+                return true;
+
+            gpuCopyCallback();
+
+            auto shadowRegion{BeginWriteCpuSequencedDirect(offset, data.size())};
+            shadowRegion.copy_from(data);
+            return false;
+        }
+
+        // If the GPU isn't accessing the mirror we can just write directly to it
+        std::memcpy(mirror.data() + offset, data.data(), data.size());
+        return false;
+    }
+
+    // Staged-path write of `data` at `offset`; returns true when the caller should repeat the write with an appropriate `gpuCopyCallback`
+    bool Buffer::WriteImplStaged(span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback) {
+        // We cannot have *ANY* state changes for the duration of this function, if the buffer became CPU dirty partway through the GPU writes would mismatch the CPU writes
+        std::scoped_lock lock{stateMutex};
+
+        // If the buffer is GPU dirty the write is done on the GPU
+        // NOTE(review): execution carries on past this point after the callback has run rather than returning, so the callback may be invoked a second time further down - confirm this is intended
+        if (dirtyState == DirtyState::GpuDirty) {
+            if (!gpuCopyCallback)
+                return true;
+            gpuCopyCallback();
+        }
+
+        // If the buffer is used in sequence directly on the GPU, SynchronizeHost before modifying the mirror contents to ensure proper sequencing. This write will then be sequenced on the GPU instead (the buffer will be kept clean for the rest of the execution due to gpuCopyCallback blocking all writes)
+        if (dirtyState == DirtyState::CpuDirty && SequencedCpuBackingWritesBlocked())
+            SynchronizeHost();
+
+        // Always copy to mirror since any CPU side reads will need the up-to-date contents
+        std::memcpy(mirror.data() + offset, data.data(), data.size());
+
+        // Skip updating backing if the changes are gonna be updated later by SynchroniseHost in executor anyway
+        if (dirtyState == DirtyState::CpuDirty && !SequencedCpuBackingWritesBlocked())
+            return false;
+
+        // We can write directly to the backing as long as this resource isn't being actively used by a past workload (in the current context or another)
+        if (!SequencedCpuBackingWritesBlocked() && PollFence()) {
+            std::memcpy(backing->data() + offset, data.data(), data.size());
+            return false;
+        }
+
+        // The backing can't be modified directly so a GPU-side inline update is needed
+        // If no copy callback is supplied, return true to indicate that the caller should repeat the write with an appropriate callback
+        if (!gpuCopyCallback)
+            return true;
+
+        gpuCopyCallback();
+        return false;
+    }
+
+    // Direct-path read of `data.size()` bytes starting at `offset` into `data`
+    // While the shadow is active and the GPU may still be reading, bytes inside tracked write intervals are taken from the shadow and everything else from the mirror
+    void Buffer::ReadImplDirect(const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) {
+        // If GPU writes are active then wait until that's no longer the case
+        RefreshGpuWritesActiveDirect(true, flushHostCallback);
+
+        if (directTrackedShadowActive && RefreshGpuReadsActiveDirect()) {
+            const size_t endOffset{offset + data.size()};
+            size_t curOffset{offset};
+            while (curOffset < endOffset) {
+                auto result{QueryWriteIntervalDirect(curOffset)};
+                auto srcData{result.useShadow ? directTrackedShadow.data() : mirror.data()};
+
+                // The query reports how far the current region extends within the buffer, which can reach past the end of this read, so clamp it to what's left to avoid overrunning `data`
+                const size_t copySize{std::min<size_t>(result.size, endOffset - curOffset)};
+                std::memcpy(data.data() + curOffset - offset, srcData + curOffset, copySize);
+                curOffset += copySize;
+            }
+        } else [[likely]] {
+            std::memcpy(data.data(), mirror.data() + offset, data.size());
+        }
+    }
+
+    // Staged-path read: syncs the mirror first when the buffer is GPU dirty, then copies `data.size()` bytes from the mirror at `offset`
+    void Buffer::ReadImplStaged(bool isFirstUsage, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) {
+        const bool mirrorOutdated{dirtyState == DirtyState::GpuDirty};
+        if (mirrorOutdated)
+            SynchronizeGuestImmediate(isFirstUsage, flushHostCallback);
+
+        auto source{mirror.data() + offset};
+        std::memcpy(data.data(), source, data.size());
+    }
+
+    void Buffer::MarkGpuDirtyImplDirect() { // Direct-path implementation of MarkGpuDirty
+        directGpuWritesActive = true; // Checked (and later cleared) by RefreshGpuWritesActiveDirect
+        BlockAllCpuBackingWrites();
+        AdvanceSequence(); // Buffer contents are changing so move on to the next sequence
+    }
+
+    void Buffer::MarkGpuDirtyImplStaged() {
        std::scoped_lock lock{stateMutex}; // stateMutex is locked to prevent state changes at any point during this function

        if (dirtyState == DirtyState::GpuDirty)
@ -166,6 +393,49 @@ namespace skyline::gpu {
        AdvanceSequence(); // The GPU will modify buffer contents so advance to the next sequence
    }

+    Buffer::Buffer(LinearAllocatorState<> &delegateAllocator, GPU &gpu, GuestBuffer guest, size_t id, bool direct) // Guest-backed buffer; `direct` selects between importing the mirror and allocating a separate backing
+        : gpu{gpu},
+          guest{guest},
+          mirror{gpu.state.process->memory.CreateMirror(guest)}, // The mirror is created in both modes
+          delegate{delegateAllocator.EmplaceUntracked<BufferDelegate>(this)},
+          isDirect{direct},
+          id{id},
+          megaBufferTableShift{std::max(std::bit_width(guest.size() / MegaBufferTableMaxEntries - 1), MegaBufferTableShiftMin)} { // NOTE(review): if the division yields 0 the `- 1` presumably wraps for an unsigned size - confirm guest sizes can't be that small
+        if (isDirect) // Direct: the mirror itself is imported as the backing
+            directBacking = gpu.memory.ImportBuffer(mirror);
+        else // Staged: a separate backing with the mirror's size is allocated
+            backing = gpu.memory.AllocateBuffer(mirror.size());
+
+        megaBufferTable.resize(guest.size() / (1 << megaBufferTableShift)); // One table entry per (1 << megaBufferTableShift) bytes of the guest buffer
+    }
+
+    Buffer::Buffer(LinearAllocatorState<> &delegateAllocator, GPU &gpu, vk::DeviceSize size, size_t id) // Host-only buffer: no guest or mirror is set up, only a backing of `size` bytes
+        : gpu{gpu},
+          backing{gpu.memory.AllocateBuffer(size)},
+          delegate{delegateAllocator.EmplaceUntracked<BufferDelegate>(this)},
+          id{id} {
+        dirtyState = DirtyState::Clean; // Since this is a host-only buffer it's always going to be clean
+    }
+
+    Buffer::~Buffer() { // Tears the buffer down: removes the trap, syncs back to the guest, unmaps the mirror and waits on the fence
+        if (trapHandle) // Only buffers that had a trap set up have one to delete
+            gpu.state.nce->DeleteTrap(*trapHandle);
+        SynchronizeGuest(true); // Returns immediately for host-only and direct buffers
+        if (mirror.valid()) // Host-only buffers are constructed without a mirror
+            munmap(mirror.data(), mirror.size());
+        WaitOnFence();
+    }
+
+    // Marks the buffer as dirty on the GPU by dispatching to the direct or staged implementation, does nothing for buffers without a guest
+    void Buffer::MarkGpuDirty() {
+        if (!guest)
+            return;
+
+        if (isDirect) {
+            MarkGpuDirtyImplDirect();
+            return;
+        }
+
+        MarkGpuDirtyImplStaged();
+    }
+
    void Buffer::WaitOnFence() {
        TRACE_EVENT("gpu", "Buffer::WaitOnFence");

@ -198,7 +468,7 @@ namespace skyline::gpu {
    }

    void Buffer::SynchronizeHost(bool skipTrap) {
-        if (!guest)
+        if (!guest || isDirect)
            return;

        TRACE_EVENT("gpu", "Buffer::SynchronizeHost");
@ -217,11 +487,11 @@ namespace skyline::gpu {
                gpu.state.nce->TrapRegions(*trapHandle, true); // Trap any future CPU writes to this buffer, must be done before the memcpy so that any modifications during the copy are tracked
        }

-        std::memcpy(backing.data(), mirror.data(), mirror.size());
+        std::memcpy(backing->data(), mirror.data(), mirror.size());
    }

    bool Buffer::SynchronizeGuest(bool skipTrap, bool nonBlocking) {
-        if (!guest)
+        if (!guest || isDirect)
            return false;

        TRACE_EVENT("gpu", "Buffer::SynchronizeGuest");
@ -236,7 +506,7 @@ namespace skyline::gpu {
                return false; // If the fence is not signalled and non-blocking behaviour is requested then bail out

            WaitOnFence();
-            std::memcpy(mirror.data(), backing.data(), mirror.size());
+            std::memcpy(mirror.data(), backing->data(), mirror.size());

            dirtyState = DirtyState::Clean;
        }
@ -248,6 +518,9 @@ namespace skyline::gpu {
    }

    void Buffer::SynchronizeGuestImmediate(bool isFirstUsage, const std::function<void()> &flushHostCallback) {
+        if (isDirect)
+            return;
+
        // If this buffer was attached to the current cycle, flush all pending host GPU work and wait to ensure that we read valid data
        if (!isFirstUsage)
            flushHostCallback();
@ -256,82 +529,32 @@ namespace skyline::gpu {
    }

    void Buffer::Read(bool isFirstUsage, const std::function<void()> &flushHostCallback, span<u8> data, vk::DeviceSize offset) {
-        if (dirtyState == DirtyState::GpuDirty)
-            SynchronizeGuestImmediate(isFirstUsage, flushHostCallback);
-
-        std::memcpy(data.data(), mirror.data() + offset, data.size());
+        if (isDirect)
+            ReadImplDirect(flushHostCallback, data, offset);
+        else
+            ReadImplStaged(isFirstUsage, flushHostCallback, data, offset);
    }

    bool Buffer::Write(span<u8> data, vk::DeviceSize offset, const std::function<void()> &gpuCopyCallback) {
        AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence
        everHadInlineUpdate = true;

-        // We cannot have *ANY* state changes for the duration of this function, if the buffer became CPU dirty partway through the GPU writes would mismatch the CPU writes
-        std::scoped_lock lock{stateMutex};
-
-        // If the buffer is GPU dirty do the write on the GPU and we're done
-        if (dirtyState == DirtyState::GpuDirty) {
-            if (gpuCopyCallback)
-                gpuCopyCallback();
-            else
-                return true;
-        }
-
-        if (dirtyState == DirtyState::CpuDirty && SequencedCpuBackingWritesBlocked())
-            // If the buffer is used in sequence directly on the GPU, SynchronizeHost before modifying the mirror contents to ensure proper sequencing. This write will then be sequenced on the GPU instead (the buffer will be kept clean for the rest of the execution due to gpuCopyCallback blocking all writes)
-            SynchronizeHost();
-
-        std::memcpy(mirror.data() + offset, data.data(), data.size()); // Always copy to mirror since any CPU side reads will need the up-to-date contents
-
-        if (dirtyState == DirtyState::CpuDirty && !SequencedCpuBackingWritesBlocked())
-            // Skip updating backing if the changes are gonna be updated later by SynchroniseHost in executor anyway
-            return false;
-
-        if (!SequencedCpuBackingWritesBlocked() && PollFence()) {
-            // We can write directly to the backing as long as this resource isn't being actively used by a past workload (in the current context or another)
-            std::memcpy(backing.data() + offset, data.data(), data.size());
-        } else {
-            // If this buffer is host immutable, perform a GPU-side inline update for the buffer contents since we can't directly modify the backing
-            // If no copy callback is supplied, return true to indicate that the caller should repeat the write with an appropriate callback
-            if (gpuCopyCallback)
-                gpuCopyCallback();
-            else
-                return true;
-        }
-
-        return false;
+        if (isDirect)
+            return WriteImplDirect(data, offset, gpuCopyCallback);
+        else
+            return WriteImplStaged(data, offset, gpuCopyCallback);
    }

    void Buffer::CopyFrom(vk::DeviceSize dstOffset, Buffer *src, vk::DeviceSize srcOffset, vk::DeviceSize size, const std::function<void()> &gpuCopyCallback) {
        AdvanceSequence(); // We are modifying GPU backing contents so advance to the next sequence
        everHadInlineUpdate = true;

-        std::scoped_lock lock{stateMutex, src->stateMutex}; // Fine even if src and dst are same since recursive mutex
-
-        if (dirtyState == DirtyState::CpuDirty && SequencedCpuBackingWritesBlocked())
-            // If the buffer is used in sequence directly on the GPU, SynchronizeHost before modifying the mirror contents to ensure proper sequencing. This write will then be sequenced on the GPU instead (the buffer will be kept clean for the rest of the execution due to gpuCopyCallback blocking all writes)
-            SynchronizeHost();
-
-        if (dirtyState != DirtyState::GpuDirty && src->dirtyState != DirtyState::GpuDirty) {
-            std::memcpy(mirror.data() + dstOffset, src->mirror.data() + srcOffset, size);
-
-            if (dirtyState == DirtyState::CpuDirty && !SequencedCpuBackingWritesBlocked())
-                // Skip updating backing if the changes are gonna be updated later by SynchroniseHost in executor anyway
-                return;
-
-            if (!SequencedCpuBackingWritesBlocked() && PollFence()) {
-                // We can write directly to the backing as long as this resource isn't being actively used by a past workload (in the current context or another)
-                std::memcpy(backing.data() + dstOffset, src->mirror.data() + srcOffset, size);
-            } else {
-                gpuCopyCallback();
-            }
-        } else {
-            MarkGpuDirty();
-            gpuCopyCallback();
-        }
+        if (isDirect)
+            CopyFromImplDirect(dstOffset, src, srcOffset, size, gpuCopyCallback);
+        else
+            CopyFromImplStaged(dstOffset, src, srcOffset, size, gpuCopyCallback);
    }

-
    BufferView Buffer::GetView(vk::DeviceSize offset, vk::DeviceSize size) { // Builds a BufferView from this buffer's delegate and the supplied offset and size
        return BufferView{delegate, offset, size};
    }
@ -345,13 +568,7 @@ namespace skyline::gpu {

    BufferBinding Buffer::TryMegaBufferView(const std::shared_ptr<FenceCycle> &pCycle, MegaBufferAllocator &allocator, ContextTag executionTag,
                                            vk::DeviceSize offset, vk::DeviceSize size) {
-        if ((!everHadInlineUpdate && sequenceNumber < FrequentlySyncedThreshold) || size >= MegaBufferChunkSize)
-            // Don't megabuffer buffers that have never had inline updates and are not frequently synced since performance is only going to be harmed as a result of the constant copying and there wont be any benefit since there are no GPU inline updates that would be avoided
-            return {};
-
-        // We are safe to check dirty state here since it will only ever be set GPU dirty with the buffer locked and from the active GPFIFO thread. This helps with perf since the lock ends up being slightly expensive
-        if (dirtyState == DirtyState::GpuDirty)
-            // Bail out if buffer cannot be synced, we don't know the contents ahead of time so the sequence is indeterminate
+        if (!ValidateMegaBufferView(size))
            return {};

        // If the active execution has changed all previous allocations are now invalid
@ -361,7 +578,7 @@ namespace skyline::gpu {
        }

        // If more than half the buffer has been megabuffered in chunks within the same execution assume this will generally be the case for this buffer and just megabuffer the whole thing without chunking
-        if (unifiedMegaBufferEnabled || (megaBufferViewAccumulatedSize > (backing.size() / 2) && backing.size() < MegaBufferChunkSize)) {
+        if (unifiedMegaBufferEnabled || (megaBufferViewAccumulatedSize > (mirror.size() / 2) && mirror.size() < MegaBufferChunkSize)) {
            if (!unifiedMegaBuffer) {
                unifiedMegaBuffer = allocator.Push(pCycle, mirror, true);
                unifiedMegaBufferEnabled = true;
@ -502,5 +719,4 @@ namespace skyline::gpu {
            throw exception("Copy size mismatch!");
        return GetBuffer()->CopyFrom(GetOffset(), src.GetBuffer(), src.GetOffset(), size, gpuCopyCallback);
    }
-
 }