Implement accelerated uploads/copies through buffer manager

Previously, both I2M uploads and DMA copies would force GPU serialisation if they happened to hit a trap or were used to copy GPU dirty buffers. By using the buffer manager to implement them on the host GPU, we can avoid such slowdowns entirely.
Billy Laws
2022-10-25 20:57:30 +01:00
parent c5ec484d9a
commit cac287d9fd
10 changed files with 205 additions and 15 deletions

inline2memory.cpp
@@ -0,0 +1,50 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include <gpu/buffer_manager.h>
#include <soc/gm20b/gmmu.h>
#include <soc/gm20b/channel.h>
#include "inline2memory.h"
namespace skyline::gpu::interconnect {
    using IOVA = soc::gm20b::IOVA;

    Inline2Memory::Inline2Memory(GPU &gpu, soc::gm20b::ChannelContext &channelCtx)
        : gpu{gpu},
          channelCtx{channelCtx},
          executor{channelCtx.executor} {}

    void Inline2Memory::Upload(IOVA dst, span<u32> src) {
        auto dstMappings{channelCtx.asCtx->gmmu.TranslateRange(dst, src.size_bytes())};

        if (dstMappings.size() > 1)
            Logger::Warn("Split mappings are unsupported for I2M uploads");
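
        // Find (or create) a host buffer view covering the destination mapping; freshly created buffers are attached to the executor by the callback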
        auto dstBuf{gpu.buffer.FindOrCreate(dstMappings.front(), executor.tag, [this](std::shared_ptr<Buffer> buffer, ContextLock<Buffer> &&lock) {
            executor.AttachLockedBuffer(buffer, std::move(lock));
        })};
        ContextLock dstBufLock{executor.tag, dstBuf};
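
        // Write attempts a direct CPU-side write first; the callback below only runs when the write has to be performed on the GPU instead (e.g. the buffer is GPU dirty)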
        dstBuf.Write(src.cast<u8>(), 0, [&]() {
            executor.AttachLockedBufferView(dstBuf, std::move(dstBufLock));
            // This will prevent any CPU accesses to backing for the duration of the usage
            dstBuf.GetBuffer()->BlockAllCpuBackingWrites();
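
            // Stage the inline data into the megabuffer so the copy source is resident in GPU-visible memory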
            auto srcGpuAllocation{gpu.megaBufferAllocator.Push(executor.cycle, src.cast<u8>())};
            executor.AddOutsideRpCommand([srcGpuAllocation, dstBuf, src](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &) {
                vk::BufferCopy copyRegion{
                    .srcOffset = srcGpuAllocation.offset,
                    .dstOffset = dstBuf.GetOffset(),
                    .size = src.size_bytes()
                };
                commandBuffer.copyBuffer(srcGpuAllocation.buffer, dstBuf.GetBuffer()->GetBacking(), copyRegion);
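
                // Make the upload visible to all subsequent GPU reads and writes of the destination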
                commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eAllCommands, {}, vk::MemoryBarrier{
                    .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
                    .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite
                }, {}, {});
            });
        });
    }
}

inline2memory.h
@@ -0,0 +1,36 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#pragma once

#include <soc/gm20b/gmmu.h>

namespace skyline::gpu {
    class GPU;
}

namespace skyline::soc::gm20b {
    struct ChannelContext;
}
namespace skyline::gpu::interconnect {
    class CommandExecutor;

    /**
     * @brief Handles translating I2M operations to Vulkan
     */
    class Inline2Memory {
      private:
        using IOVA = soc::gm20b::IOVA;

        GPU &gpu;
        soc::gm20b::ChannelContext &channelCtx;
        gpu::interconnect::CommandExecutor &executor;

      public:
        Inline2Memory(GPU &gpu, soc::gm20b::ChannelContext &channelCtx);
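
        /**
         * @brief Uploads the supplied data to the guest GPU address `dst`, falling back to a copy on the host GPU when the destination can't be written from the CPU
         */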
        void Upload(IOVA dst, span<u32> src);
    };
}

maxwell_dma.cpp
@@ -0,0 +1,61 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include <gpu/buffer_manager.h>
#include <soc/gm20b/gmmu.h>
#include <soc/gm20b/channel.h>
#include "maxwell_dma.h"
namespace skyline::gpu::interconnect {
    using IOVA = soc::gm20b::IOVA;

    MaxwellDma::MaxwellDma(GPU &gpu, soc::gm20b::ChannelContext &channelCtx)
        : gpu{gpu},
          channelCtx{channelCtx},
          executor{channelCtx.executor} {}

    void MaxwellDma::Copy(IOVA dst, IOVA src, size_t size) {
        auto srcMappings{channelCtx.asCtx->gmmu.TranslateRange(src, size)};
        auto dstMappings{channelCtx.asCtx->gmmu.TranslateRange(dst, size)};

        if (srcMappings.size() > 1 || dstMappings.size() > 1)
            Logger::Warn("Split mappings are unsupported for DMA copies");
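
        // Find (or create) host buffer views covering the source and destination mappings; freshly created buffers are attached to the executor by the callback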
        auto srcBuf{gpu.buffer.FindOrCreate(srcMappings.front(), executor.tag, [this](std::shared_ptr<Buffer> buffer, ContextLock<Buffer> &&lock) {
            executor.AttachLockedBuffer(buffer, std::move(lock));
        })};
        ContextLock srcBufLock{executor.tag, srcBuf};

        auto dstBuf{gpu.buffer.FindOrCreate(dstMappings.front(), executor.tag, [this](std::shared_ptr<Buffer> buffer, ContextLock<Buffer> &&lock) {
            executor.AttachLockedBuffer(buffer, std::move(lock));
        })};
        ContextLock dstBufLock{executor.tag, dstBuf};
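
        // CopyFrom performs a CPU-side copy when it can; the callback below only runs when the copy instead needs to be recorded on the host GPU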
        dstBuf.CopyFrom(srcBuf, [&]() {
            executor.AttachLockedBufferView(srcBuf, std::move(srcBufLock));
            executor.AttachLockedBufferView(dstBuf, std::move(dstBufLock));
            // This will prevent any CPU accesses to backing for the duration of the usage
            // GPU dirtiness will be handled on the CopyFrom end as it's not always necessary
            srcBuf.GetBuffer()->BlockAllCpuBackingWrites();
            dstBuf.GetBuffer()->BlockAllCpuBackingWrites();

            executor.AddOutsideRpCommand([srcBuf, dstBuf](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &) {
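                // Order the transfer after any prior GPU commands that read from these buffers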
                commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eTransfer, {}, vk::MemoryBarrier{
                    .srcAccessMask = vk::AccessFlagBits::eMemoryRead,
                    .dstAccessMask = vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite
                }, {}, {});

                vk::BufferCopy copyRegion{
                    .srcOffset = srcBuf.GetOffset(),
                    .dstOffset = dstBuf.GetOffset(),
                    .size = srcBuf.size
                };
                commandBuffer.copyBuffer(srcBuf.GetBuffer()->GetBacking(), dstBuf.GetBuffer()->GetBacking(), copyRegion);
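
                // Make the copied data visible to all subsequent GPU reads and writes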
                commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eAllCommands, {}, vk::MemoryBarrier{
                    .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
                    .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite
                }, {}, {});
            });
        });
    }
}

maxwell_dma.h
@@ -0,0 +1,36 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#pragma once

#include <soc/gm20b/gmmu.h>

namespace skyline::gpu {
    class GPU;
}

namespace skyline::soc::gm20b {
    struct ChannelContext;
}
namespace skyline::gpu::interconnect {
    class CommandExecutor;

    /**
     * @brief Handles translating Maxwell DMA operations to Vulkan
     */
    class MaxwellDma {
      private:
        using IOVA = soc::gm20b::IOVA;

        GPU &gpu;
        soc::gm20b::ChannelContext &channelCtx;
        gpu::interconnect::CommandExecutor &executor;

      public:
        MaxwellDma(GPU &gpu, soc::gm20b::ChannelContext &channelCtx);
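
        /**
         * @brief Copies `size` bytes from the guest GPU address `src` to `dst`, falling back to a copy on the host GPU when a CPU-side copy isn't possible
         */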
        void Copy(IOVA dst, IOVA src, size_t size);
    };
}