Introduce GPU checkpoints for crash debugging

When GPU crashes aren't reproducible in RenderDoc, it helps to have some way to figure out what exactly is going on when a crash happens or what operation caused it. Add a checkpoint system that reports the GPU execution state in Perfetto in time with actual GPU execution, and use flow events to show the event's path through the execution, Vulkan record and executor record stages.
2025-07-17 08:46:39 +00:00 · 2023-02-04 21:10:36 +00:00
parent d5b6c68ae4
commit 49cd2a71cc
9 changed files with 161 additions and 23 deletions
--- a/app/src/main/cpp/skyline/gpu/command_scheduler.cpp
+++ b/app/src/main/cpp/skyline/gpu/command_scheduler.cpp
@ -3,7 +3,9 @@

 #include <gpu.h>
 #include <loader/loader.h>
+#include <vulkan/vulkan.hpp>
 #include "command_scheduler.h"
+#include "common/exception.h"

 namespace skyline::gpu {
    void CommandScheduler::WaiterThread() {
@ -91,16 +93,22 @@ namespace skyline::gpu {
        fullSignalSemaphores.push_back(cycle->semaphore);

        {
-            std::scoped_lock lock{gpu.queueMutex};
-            gpu.vkQueue.submit(vk::SubmitInfo{
-                .commandBufferCount = 1,
-                .pCommandBuffers = &*commandBuffer,
-                .waitSemaphoreCount = static_cast<u32>(fullWaitSemaphores.size()),
-                .pWaitSemaphores = fullWaitSemaphores.data(),
-                .pWaitDstStageMask = fullWaitStages.data(),
-                .signalSemaphoreCount = static_cast<u32>(fullSignalSemaphores.size()),
-                .pSignalSemaphores = fullSignalSemaphores.data(),
-            }, cycle->fence);
+            try {
+                std::scoped_lock lock{gpu.queueMutex};
+                gpu.vkQueue.submit(vk::SubmitInfo{
+                    .commandBufferCount = 1,
+                    .pCommandBuffers = &*commandBuffer,
+                    .waitSemaphoreCount = static_cast<u32>(fullWaitSemaphores.size()),
+                    .pWaitSemaphores = fullWaitSemaphores.data(),
+                    .pWaitDstStageMask = fullWaitStages.data(),
+                    .signalSemaphoreCount = static_cast<u32>(fullSignalSemaphores.size()),
+                    .pSignalSemaphores = fullSignalSemaphores.data(),
+                }, cycle->fence);
+            } catch (const vk::DeviceLostError &e) {
+                // Wait 5 seconds to give traces etc. time to settle
+                std::this_thread::sleep_for(std::chrono::seconds(5));
+                throw exception("Vulkan device lost!");
+            }
        }

        cycle->NotifySubmitted();