Support using Vulkan semaphores with fence cycles

In some cases, such as presentation, it may be possible to avoid waiting on the CPU by using a semaphore to indicate GPU completion. Due to the binary nature of Vulkan semaphores this requires a fair amount of code, as we need to ensure semaphores are always unsignalled before they are waited on and signalled again. This is achieved with a special kind of chained cycle that can be added even after guest GPFIFO processing for a given cycle: the main cycle's semaphore can be waited on, the cycle for that wait is then attached to the main cycle, and it will be waited on before the main cycle signals.
This commit is contained in:
Billy Laws
2022-10-16 20:31:36 +01:00
parent 5b72be88c3
commit 0670e0e0dc
5 changed files with 101 additions and 100 deletions

View File

@ -32,10 +32,11 @@ namespace skyline::gpu {
}
/**
 * @brief Constructs a reusable command buffer slot
 * @note The slot wraps a pool-allocated command buffer in a RAII handle, and owns the fence and
 * binary semaphore that back the slot's FenceCycle; the semaphore allows GPU-side waits on
 * completion (see FenceCycle) in addition to CPU-side fence polling
 */
CommandScheduler::CommandBufferSlot::CommandBufferSlot(vk::raii::Device &device, vk::CommandBuffer commandBuffer, vk::raii::CommandPool &pool)
    : device{device},
      commandBuffer{device, static_cast<VkCommandBuffer>(commandBuffer), static_cast<VkCommandPool>(*pool)},
      fence{device, vk::FenceCreateInfo{}},
      semaphore{device, vk::SemaphoreCreateInfo{}},
      cycle{std::make_shared<FenceCycle>(device, *fence, *semaphore)} {}
CommandScheduler::CommandScheduler(const DeviceState &state, GPU &pGpu)
: state{state},
@ -55,7 +56,7 @@ namespace skyline::gpu {
if (!slot.active.test_and_set(std::memory_order_acq_rel)) {
if (slot.cycle->Poll()) {
slot.commandBuffer.reset();
slot.cycle = std::make_shared<FenceCycle>(slot.device, *slot.fence);
slot.cycle = std::make_shared<FenceCycle>(*slot.cycle);
return {slot};
} else {
slot.active.clear(std::memory_order_release);
@ -76,12 +77,29 @@ namespace skyline::gpu {
return {pool->buffers.emplace_back(gpu.vkDevice, commandBuffer, pool->vkCommandPool)};
}
/**
 * @brief Submits a command buffer to the GPU queue, signalling the cycle's fence and semaphore on completion
 * @param waitSemaphores Semaphores the submission must wait on before executing
 * @param signalSemaphores Semaphores signalled by the submission, in addition to the cycle's own semaphore
 * @note If the cycle's semaphore may still be signalled from a prior use (semaphoreSubmitWait), an extra
 * wait is appended to unsignal it, since binary semaphores must be unsignalled before being signalled again
 */
void CommandScheduler::SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, std::shared_ptr<FenceCycle> cycle, span<vk::Semaphore> waitSemaphores, span<vk::Semaphore> signalSemaphores) {
    boost::container::small_vector<vk::Semaphore, 3> fullWaitSemaphores{waitSemaphores.begin(), waitSemaphores.end()};
    boost::container::small_vector<vk::PipelineStageFlags, 3> fullWaitStages{waitSemaphores.size(), vk::PipelineStageFlagBits::eAllCommands};

    if (cycle->semaphoreSubmitWait) {
        fullWaitSemaphores.push_back(cycle->semaphore);
        // We don't need a full barrier since this wait is only done to ensure the semaphore is unsignalled
        fullWaitStages.push_back(vk::PipelineStageFlagBits::eTopOfPipe);
    }

    boost::container::small_vector<vk::Semaphore, 2> fullSignalSemaphores{signalSemaphores.begin(), signalSemaphores.end()};
    fullSignalSemaphores.push_back(cycle->semaphore);

    {
        std::scoped_lock lock{gpu.queueMutex};
        gpu.vkQueue.submit(vk::SubmitInfo{
            // Counts must match the full* vectors the pointers reference (caller waits + any appended
            // unsignal-wait), not just the caller-supplied spans, or appended entries would be dropped
            .waitSemaphoreCount = static_cast<u32>(fullWaitSemaphores.size()),
            .pWaitSemaphores = fullWaitSemaphores.data(),
            .pWaitDstStageMask = fullWaitStages.data(),
            .commandBufferCount = 1,
            .pCommandBuffers = &*commandBuffer,
            .signalSemaphoreCount = static_cast<u32>(fullSignalSemaphores.size()),
            .pSignalSemaphores = fullSignalSemaphores.data(),
        }, cycle->fence);
    }