Implement accelerated uploads/copies through buffer manager

Previously, both I2M uploads and DMA copies would force GPU serialisation if they happened to hit a trap or were used to copy GPU dirty buffers. By using the buffer manager to implement them on the host GPU, we can avoid such slowdowns entirely.
Billy Laws
2022-10-25 20:57:30 +01:00
parent c5ec484d9a
commit cac287d9fd
10 changed files with 205 additions and 15 deletions

inline2memory.cpp
@@ -0,0 +1,50 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include <gpu/buffer_manager.h>
#include <soc/gm20b/gmmu.h>
#include <soc/gm20b/channel.h>
#include "inline2memory.h"
namespace skyline::gpu::interconnect {
    using IOVA = soc::gm20b::IOVA;

    Inline2Memory::Inline2Memory(GPU &gpu, soc::gm20b::ChannelContext &channelCtx)
        : gpu{gpu},
          channelCtx{channelCtx},
          executor{channelCtx.executor} {}

    void Inline2Memory::Upload(IOVA dst, span<u32> src) {
        auto dstMappings{channelCtx.asCtx->gmmu.TranslateRange(dst, src.size_bytes())};

        if (dstMappings.size() > 1)
            Logger::Warn("Split mappings are unsupported for I2M uploads");
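
        // Find (or create) a host buffer view covering the destination mapping; freshly created buffers are attached to the executor by the callback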
        auto dstBuf{gpu.buffer.FindOrCreate(dstMappings.front(), executor.tag, [this](std::shared_ptr<Buffer> buffer, ContextLock<Buffer> &&lock) {
            executor.AttachLockedBuffer(buffer, std::move(lock));
        })};
        ContextLock dstBufLock{executor.tag, dstBuf};
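
        // Write attempts a direct CPU-side write first; the callback below only runs when the write has to be performed on the GPU instead (e.g. the buffer is GPU dirty)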
        dstBuf.Write(src.cast<u8>(), 0, [&]() {
            executor.AttachLockedBufferView(dstBuf, std::move(dstBufLock));
            // This will prevent any CPU accesses to backing for the duration of the usage
            dstBuf.GetBuffer()->BlockAllCpuBackingWrites();
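
            // Stage the inline data into the megabuffer so the copy source is resident in GPU-visible memory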
            auto srcGpuAllocation{gpu.megaBufferAllocator.Push(executor.cycle, src.cast<u8>())};
            executor.AddOutsideRpCommand([srcGpuAllocation, dstBuf, src](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &) {
                vk::BufferCopy copyRegion{
                    .srcOffset = srcGpuAllocation.offset,
                    .dstOffset = dstBuf.GetOffset(),
                    .size = src.size_bytes()
                };
                commandBuffer.copyBuffer(srcGpuAllocation.buffer, dstBuf.GetBuffer()->GetBacking(), copyRegion);
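
                // Make the upload visible to all subsequent GPU reads and writes of the destination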
                commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eAllCommands, {}, vk::MemoryBarrier{
                    .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
                    .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite
                }, {}, {});
            });
        });
    }
}

inline2memory.h
@@ -0,0 +1,36 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#pragma once

#include <soc/gm20b/gmmu.h>

namespace skyline::gpu {
    class GPU;
}

namespace skyline::soc::gm20b {
    struct ChannelContext;
}
namespace skyline::gpu::interconnect {
    class CommandExecutor;

    /**
     * @brief Handles translating I2M operations to Vulkan
     */
    class Inline2Memory {
      private:
        using IOVA = soc::gm20b::IOVA;

        GPU &gpu;
        soc::gm20b::ChannelContext &channelCtx;
        gpu::interconnect::CommandExecutor &executor;

      public:
        Inline2Memory(GPU &gpu, soc::gm20b::ChannelContext &channelCtx);
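
        /**
         * @brief Uploads the supplied data to the guest GPU address `dst`, falling back to a copy on the host GPU when the destination can't be written from the CPU
         */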
        void Upload(IOVA dst, span<u32> src);
    };
}

maxwell_dma.cpp
@@ -0,0 +1,61 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#include <gpu/buffer_manager.h>
#include <soc/gm20b/gmmu.h>
#include <soc/gm20b/channel.h>
#include "maxwell_dma.h"
namespace skyline::gpu::interconnect {
    using IOVA = soc::gm20b::IOVA;

    MaxwellDma::MaxwellDma(GPU &gpu, soc::gm20b::ChannelContext &channelCtx)
        : gpu{gpu},
          channelCtx{channelCtx},
          executor{channelCtx.executor} {}

    void MaxwellDma::Copy(IOVA dst, IOVA src, size_t size) {
        auto srcMappings{channelCtx.asCtx->gmmu.TranslateRange(src, size)};
        auto dstMappings{channelCtx.asCtx->gmmu.TranslateRange(dst, size)};

        if (srcMappings.size() > 1 || dstMappings.size() > 1)
            Logger::Warn("Split mappings are unsupported for DMA copies");
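
        // Find (or create) host buffer views covering the source and destination mappings; freshly created buffers are attached to the executor by the callback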
        auto srcBuf{gpu.buffer.FindOrCreate(srcMappings.front(), executor.tag, [this](std::shared_ptr<Buffer> buffer, ContextLock<Buffer> &&lock) {
            executor.AttachLockedBuffer(buffer, std::move(lock));
        })};
        ContextLock srcBufLock{executor.tag, srcBuf};

        auto dstBuf{gpu.buffer.FindOrCreate(dstMappings.front(), executor.tag, [this](std::shared_ptr<Buffer> buffer, ContextLock<Buffer> &&lock) {
            executor.AttachLockedBuffer(buffer, std::move(lock));
        })};
        ContextLock dstBufLock{executor.tag, dstBuf};
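
        // CopyFrom performs a CPU-side copy when it can; the callback below only runs when the copy instead needs to be recorded on the host GPU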
        dstBuf.CopyFrom(srcBuf, [&]() {
            executor.AttachLockedBufferView(srcBuf, std::move(srcBufLock));
            executor.AttachLockedBufferView(dstBuf, std::move(dstBufLock));
            // This will prevent any CPU accesses to backing for the duration of the usage
            // GPU dirtiness will be handled on the CopyFrom end as it's not always necessary
            srcBuf.GetBuffer()->BlockAllCpuBackingWrites();
            dstBuf.GetBuffer()->BlockAllCpuBackingWrites();

            executor.AddOutsideRpCommand([srcBuf, dstBuf](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &) {
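                // Order the transfer after any prior GPU commands that read from these buffers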
                commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eTransfer, {}, vk::MemoryBarrier{
                    .srcAccessMask = vk::AccessFlagBits::eMemoryRead,
                    .dstAccessMask = vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite
                }, {}, {});

                vk::BufferCopy copyRegion{
                    .srcOffset = srcBuf.GetOffset(),
                    .dstOffset = dstBuf.GetOffset(),
                    .size = srcBuf.size
                };
                commandBuffer.copyBuffer(srcBuf.GetBuffer()->GetBacking(), dstBuf.GetBuffer()->GetBacking(), copyRegion);
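
                // Make the copied data visible to all subsequent GPU reads and writes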
                commandBuffer.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eAllCommands, {}, vk::MemoryBarrier{
                    .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
                    .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite
                }, {}, {});
            });
        });
    }
}

maxwell_dma.h
@@ -0,0 +1,36 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Ryujinx Team and Contributors (https://github.com/ryujinx/)
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
#pragma once

#include <soc/gm20b/gmmu.h>

namespace skyline::gpu {
    class GPU;
}

namespace skyline::soc::gm20b {
    struct ChannelContext;
}
namespace skyline::gpu::interconnect {
    class CommandExecutor;

    /**
     * @brief Handles translating Maxwell DMA operations to Vulkan
     */
    class MaxwellDma {
      private:
        using IOVA = soc::gm20b::IOVA;

        GPU &gpu;
        soc::gm20b::ChannelContext &channelCtx;
        gpu::interconnect::CommandExecutor &executor;

      public:
        MaxwellDma(GPU &gpu, soc::gm20b::ChannelContext &channelCtx);
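
        /**
         * @brief Copies `size` bytes from the guest GPU address `src` to `dst`, falling back to a copy on the host GPU when a CPU-side copy isn't possible
         */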
        void Copy(IOVA dst, IOVA src, size_t size);
    };
}