Support using Vulkan semaphores with fence cycles

In some cases, such as presentation, it may be possible to avoid waiting on the CPU by using a semaphore to indicate GPU completion. Due to the binary nature of Vulkan semaphores this requires a fair amount of code, as we need to ensure semaphores are always unsignalled before they are waited on and signalled again. This is achieved with a special kind of chained cycle that can be added even after guest GPFIFO processing for a given cycle: the main cycle's semaphore can be waited on, the cycle for that wait is then attached to the main cycle, and it will be waited on before the main cycle signals.
This commit is contained in:
Billy Laws
2022-10-16 20:31:36 +01:00
parent 5b72be88c3
commit 0670e0e0dc
5 changed files with 101 additions and 100 deletions

View File

@ -32,10 +32,11 @@ namespace skyline::gpu {
}
/**
 * @brief Constructs a reusable command buffer slot
 * @note The slot wraps a pool-allocated command buffer in a RAII handle, and owns the fence and
 * binary semaphore that back the slot's FenceCycle; the semaphore allows GPU-side waits on
 * completion (see FenceCycle) in addition to CPU-side fence polling
 */
CommandScheduler::CommandBufferSlot::CommandBufferSlot(vk::raii::Device &device, vk::CommandBuffer commandBuffer, vk::raii::CommandPool &pool)
    : device{device},
      commandBuffer{device, static_cast<VkCommandBuffer>(commandBuffer), static_cast<VkCommandPool>(*pool)},
      fence{device, vk::FenceCreateInfo{}},
      semaphore{device, vk::SemaphoreCreateInfo{}},
      cycle{std::make_shared<FenceCycle>(device, *fence, *semaphore)} {}
CommandScheduler::CommandScheduler(const DeviceState &state, GPU &pGpu)
: state{state},
@ -55,7 +56,7 @@ namespace skyline::gpu {
if (!slot.active.test_and_set(std::memory_order_acq_rel)) {
if (slot.cycle->Poll()) {
slot.commandBuffer.reset();
slot.cycle = std::make_shared<FenceCycle>(slot.device, *slot.fence);
slot.cycle = std::make_shared<FenceCycle>(*slot.cycle);
return {slot};
} else {
slot.active.clear(std::memory_order_release);
@ -76,12 +77,29 @@ namespace skyline::gpu {
return {pool->buffers.emplace_back(gpu.vkDevice, commandBuffer, pool->vkCommandPool)};
}
/**
 * @brief Submits a command buffer to the GPU queue, signalling the cycle's fence and semaphore on completion
 * @param waitSemaphores Semaphores the submission must wait on before executing
 * @param signalSemaphores Semaphores signalled by the submission, in addition to the cycle's own semaphore
 * @note If the cycle's semaphore may still be signalled from a prior use (semaphoreSubmitWait), an extra
 * wait is appended to unsignal it, since binary semaphores must be unsignalled before being signalled again
 */
void CommandScheduler::SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, std::shared_ptr<FenceCycle> cycle, span<vk::Semaphore> waitSemaphores, span<vk::Semaphore> signalSemaphores) {
    boost::container::small_vector<vk::Semaphore, 3> fullWaitSemaphores{waitSemaphores.begin(), waitSemaphores.end()};
    boost::container::small_vector<vk::PipelineStageFlags, 3> fullWaitStages{waitSemaphores.size(), vk::PipelineStageFlagBits::eAllCommands};

    if (cycle->semaphoreSubmitWait) {
        fullWaitSemaphores.push_back(cycle->semaphore);
        // We don't need a full barrier since this wait is only done to ensure the semaphore is unsignalled
        fullWaitStages.push_back(vk::PipelineStageFlagBits::eTopOfPipe);
    }

    boost::container::small_vector<vk::Semaphore, 2> fullSignalSemaphores{signalSemaphores.begin(), signalSemaphores.end()};
    fullSignalSemaphores.push_back(cycle->semaphore);

    {
        std::scoped_lock lock{gpu.queueMutex};
        gpu.vkQueue.submit(vk::SubmitInfo{
            // Counts must match the full* vectors the pointers reference (caller waits + any appended
            // unsignal-wait), not just the caller-supplied spans, or appended entries would be dropped
            .waitSemaphoreCount = static_cast<u32>(fullWaitSemaphores.size()),
            .pWaitSemaphores = fullWaitSemaphores.data(),
            .pWaitDstStageMask = fullWaitStages.data(),
            .commandBufferCount = 1,
            .pCommandBuffers = &*commandBuffer,
            .signalSemaphoreCount = static_cast<u32>(fullSignalSemaphores.size()),
            .pSignalSemaphores = fullSignalSemaphores.data(),
        }, cycle->fence);
    }