Introduce ThreadLocal Class + Fix Several GPU Bugs

* Fix `AddClearColorSubpass` bug where it would not generate a `vkCmdNextSubpass` when an attachment clear was utilized
* Fix `AddSubpass` bug where the Depth Stencil texture would not be synced
* Respect `VkCommandPool` external synchronization requirements by making it thread-local with a custom RAII wrapper
* Fix linear RT width calculation as it's provided in terms of bytes rather than format units
* Fix `AllocateStagingBuffer` bug where it would not supply `eTransferDst` as a usage flag
* Fix `AllocateMappedImage` bug where `VkMemoryPropertyFlags` were not respected resulting in non-`eHostVisible` memory being utilized
* Change feature requirement in `AndroidManifest.xml` to Vulkan 1.1 from OGL 3.1 as the previous requirement was incorrect
2025-07-17 08:46:39 +00:00 · 2021-10-11 09:13:25 +05:30
parent eb25f60033
commit 9b9bf8d300
17 changed files with 248 additions and 38 deletions
--- a/app/src/main/cpp/skyline/gpu/command_scheduler.cpp
+++ b/app/src/main/cpp/skyline/gpu/command_scheduler.cpp
@ -20,21 +20,20 @@ namespace skyline::gpu {
        return false;
    }

-    CommandScheduler::CommandScheduler(GPU &pGpu) : gpu(pGpu), vkCommandPool(pGpu.vkDevice, vk::CommandPoolCreateInfo{
+    CommandScheduler::CommandScheduler(GPU &pGpu) : gpu(pGpu), pool(std::ref(pGpu.vkDevice), vk::CommandPoolCreateInfo{
        .flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer, // Command buffers from the pool are short-lived and can be individually reset
        .queueFamilyIndex = pGpu.vkQueueFamilyIndex,
    }) {}

    CommandScheduler::ActiveCommandBuffer CommandScheduler::AllocateCommandBuffer() {
-        std::scoped_lock lock(mutex);
-        auto slot{std::find_if(commandBuffers.begin(), commandBuffers.end(), CommandBufferSlot::AllocateIfFree)};
-        auto slotId{std::distance(commandBuffers.begin(), slot)};
-        if (slot != commandBuffers.end())
+        auto slot{std::find_if(pool->buffers.begin(), pool->buffers.end(), CommandBufferSlot::AllocateIfFree)};
+        auto slotId{std::distance(pool->buffers.begin(), slot)};
+        if (slot != pool->buffers.end())
            return ActiveCommandBuffer(*slot);

        vk::CommandBuffer commandBuffer;
        vk::CommandBufferAllocateInfo commandBufferAllocateInfo{
-            .commandPool = *vkCommandPool,
+            .commandPool = *pool->vkCommandPool,
            .level = vk::CommandBufferLevel::ePrimary,
            .commandBufferCount = 1,
        };
@ -42,7 +41,7 @@ namespace skyline::gpu {
        auto result{(*gpu.vkDevice).allocateCommandBuffers(&commandBufferAllocateInfo, &commandBuffer, *gpu.vkDevice.getDispatcher())};
        if (result != vk::Result::eSuccess)
            vk::throwResultException(result, __builtin_FUNCTION());
-        return ActiveCommandBuffer(commandBuffers.emplace_back(gpu.vkDevice, commandBuffer, vkCommandPool));
+        return ActiveCommandBuffer(pool->buffers.emplace_back(gpu.vkDevice, commandBuffer, pool->vkCommandPool));
    }

    void CommandScheduler::SubmitCommandBuffer(const vk::raii::CommandBuffer &commandBuffer, vk::Fence fence) {
--- a/app/src/main/cpp/skyline/gpu/command_scheduler.h
+++ b/app/src/main/cpp/skyline/gpu/command_scheduler.h
@ -3,6 +3,7 @@

 #pragma once

+#include <common/thread_local.h>
 #include "fence_cycle.h"

 namespace skyline::gpu {
@ -62,9 +63,19 @@ namespace skyline::gpu {
        };

        GPU &gpu;
-        std::mutex mutex; //!< Synchronizes mutations to the command pool due to allocations
-        vk::raii::CommandPool vkCommandPool;
-        std::list<CommandBufferSlot> commandBuffers;
+
+        /**
+         * @brief A command pool designed to be thread-local to respect external synchronization for all command buffers and the associated pool
+         * @note If we utilized a single global pool there would need to be a mutex around command buffer recording which would incur significant costs
+         * @note The constructor forwards all of its arguments to the vk::raii::CommandPool constructor while the list of buffers starts out empty
+         */
+        struct CommandPool {
+            vk::raii::CommandPool vkCommandPool; //!< The Vulkan command pool that every command buffer in the list below is allocated from
+            std::list<CommandBufferSlot> buffers; //!< The slots for all command buffers that have been allocated from this pool
+
+            template<typename... Args>
+            constexpr CommandPool(Args &&... args) : vkCommandPool(std::forward<Args>(args)...) {}
+        };
+        ThreadLocal<CommandPool> pool;

        /**
         * @brief Allocates an existing or new primary command buffer from the pool
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_executor.cpp
@ -20,16 +20,18 @@ namespace skyline::gpu::interconnect {
    }

    void CommandExecutor::AddSubpass(const std::function<void(vk::raii::CommandBuffer &, const std::shared_ptr<FenceCycle> &, GPU &)> &function, vk::Rect2D renderArea, std::vector<TextureView> inputAttachments, std::vector<TextureView> colorAttachments, std::optional<TextureView> depthStencilAttachment) {
-        for (const auto& attachments : {inputAttachments, colorAttachments})
-            for (const auto& attachment : attachments)
+        for (const auto &attachments : {inputAttachments, colorAttachments})
+            for (const auto &attachment : attachments)
                syncTextures.emplace(attachment.backing.get());
+        if (depthStencilAttachment) // The depth stencil attachment is optional but its backing must be tracked like the other attachments
+            syncTextures.emplace(depthStencilAttachment->backing.get());

        bool newRenderpass{CreateRenderpass(renderArea)};
        renderpass->AddSubpass(inputAttachments, colorAttachments, depthStencilAttachment ? &*depthStencilAttachment : nullptr);
        if (newRenderpass)
            nodes.emplace_back(std::in_place_type_t<node::FunctionNode>(), function); // No subpass transition is recorded when a new renderpass was created
        else
-            nodes.emplace_back(std::in_place_type_t<node::NextSubpassNode>(), function);
+            nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), function); // Progress to the next subpass of the existing renderpass prior to calling the function
    }

    void CommandExecutor::AddClearColorSubpass(TextureView attachment, const vk::ClearColorValue &value) {
@ -38,7 +40,10 @@ namespace skyline::gpu::interconnect {
        })};
        renderpass->AddSubpass({}, attachment, nullptr);

-        if (!renderpass->ClearColorAttachment(0, value)) {
+        if (renderpass->ClearColorAttachment(0, value)) {
+            if (!newRenderpass)
+                nodes.emplace_back(std::in_place_type_t<node::NextSubpassNode>());
+        } else {
            auto function{[scissor = attachment.backing->dimensions, value](vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &, GPU &) {
                commandBuffer.clearAttachments(vk::ClearAttachment{
                    .aspectMask = vk::ImageAspectFlagBits::eColor,
@ -54,7 +59,7 @@ namespace skyline::gpu::interconnect {
            if (newRenderpass)
                nodes.emplace_back(std::in_place_type_t<node::FunctionNode>(), function);
            else
-                nodes.emplace_back(std::in_place_type_t<node::NextSubpassNode>(), function);
+                nodes.emplace_back(std::in_place_type_t<node::NextSubpassFunctionNode>(), function);
        }
    }

@ -73,12 +78,15 @@ namespace skyline::gpu::interconnect {

                using namespace node;
                for (NodeVariant &node : nodes) {
+                    #define NODE(name) [&](name& node) { node(commandBuffer, cycle, gpu); }
                    std::visit(VariantVisitor{
-                        [&](FunctionNode &node) { node(commandBuffer, cycle, gpu); },
-                        [&](RenderpassNode &node) { node(commandBuffer, cycle, gpu); },
-                        [&](NextSubpassNode &node) { node(commandBuffer, cycle, gpu); },
-                        [&](RenderpassEndNode &node) { node(commandBuffer, cycle, gpu); },
+                        NODE(FunctionNode),
+                        NODE(RenderpassNode),
+                        NODE(NextSubpassNode),
+                        NODE(NextSubpassFunctionNode),
+                        NODE(RenderpassEndNode),
                    }, node);
+                    #undef NODE
                }

                for (auto texture : syncTextures)
--- a/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/command_nodes.h
@ -286,10 +286,19 @@ namespace skyline::gpu::interconnect::node {
        }
    };

+    /**
+     * @brief A node which progresses to the next subpass during a renderpass
+     * @note Only the subpass transition is recorded with inline subpass contents, the cycle and GPU arguments are unused
+     */
+    struct NextSubpassNode {
+        void operator()(vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle, GPU &gpu) {
+            commandBuffer.nextSubpass(vk::SubpassContents::eInline);
+        }
+    };
+
    /**
     * @brief A FunctionNode which progresses to the next subpass prior to calling the function
     */
-    struct NextSubpassNode : private FunctionNode {
+    struct NextSubpassFunctionNode : private FunctionNode {
        using FunctionNode::FunctionNode;

        void operator()(vk::raii::CommandBuffer &commandBuffer, const std::shared_ptr<FenceCycle> &cycle, GPU &gpu) {
@ -307,5 +316,5 @@ namespace skyline::gpu::interconnect::node {
        }
    };

-    using NodeVariant = std::variant<FunctionNode, RenderpassNode, NextSubpassNode, RenderpassEndNode>; //!< A variant encompassing all command nodes types
+    using NodeVariant = std::variant<FunctionNode, RenderpassNode, NextSubpassNode, NextSubpassFunctionNode, RenderpassEndNode>; //!< A variant encompassing all command node types
 }
--- a/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h
+++ b/app/src/main/cpp/skyline/gpu/interconnect/graphics_context.h
@ -32,6 +32,7 @@ namespace skyline::gpu::interconnect {
                    u32 gpuAddressHigh;
                };
            };
+            u32 widthBytes; //!< The width in bytes for linear textures
            GuestTexture guest;
            std::optional<TextureView> view;

@ -74,6 +75,9 @@ namespace skyline::gpu::interconnect {

        void SetRenderTargetWidth(size_t index, u32 value) {
            auto &renderTarget{renderTargets.at(index)};
+            renderTarget.widthBytes = value; // Retain the raw value so the width can be recomputed when the format or tile mode is set later
+            if (renderTarget.guest.tileConfig.mode == texture::TileMode::Linear && renderTarget.guest.format)
+                value /= renderTarget.guest.format->bpb; // Width is in bytes rather than format units for linear textures
            renderTarget.guest.dimensions.width = value;
            renderTarget.view.reset();
        }
@ -134,6 +138,10 @@ namespace skyline::gpu::interconnect {
                        throw exception("Cannot translate the supplied RT format: 0x{:X}", static_cast<u32>(format));
                }
            }();
+
+            if (renderTarget.guest.tileConfig.mode == texture::TileMode::Linear && renderTarget.guest.format)
+                renderTarget.guest.dimensions.width = renderTarget.widthBytes / renderTarget.guest.format->bpb;
+
            renderTarget.disabled = !renderTarget.guest.format;
            renderTarget.view.reset();
        }
@ -142,8 +150,17 @@ namespace skyline::gpu::interconnect {
            auto &renderTarget{renderTargets.at(index)};
            auto &config{renderTarget.guest.tileConfig};
            if (mode.isLinear) {
+                if (config.mode != texture::TileMode::Linear && renderTarget.guest.format) {
+                    // Width is provided in bytes rather than format units for linear textures
+                    renderTarget.widthBytes = renderTarget.guest.dimensions.width;
+                    renderTarget.guest.dimensions.width /= renderTarget.guest.format->bpb;
+                }
+
                config.mode = texture::TileMode::Linear;
            } else [[likely]] {
+                if (config.mode == texture::TileMode::Linear && renderTarget.guest.format)
+                    renderTarget.guest.dimensions.width = renderTarget.widthBytes;
+
                config = texture::TileConfig{
                    .mode = texture::TileMode::Block,
                    .blockHeight = static_cast<u8>(1U << mode.blockHeightLog2),
--- a/app/src/main/cpp/skyline/gpu/memory_manager.cpp
+++ b/app/src/main/cpp/skyline/gpu/memory_manager.cpp
@ -78,7 +78,7 @@ namespace skyline::gpu::memory {
    std::shared_ptr<StagingBuffer> MemoryManager::AllocateStagingBuffer(vk::DeviceSize size) {
        vk::BufferCreateInfo bufferCreateInfo{
            .size = size,
-            .usage = vk::BufferUsageFlagBits::eTransferSrc,
+            .usage = vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst,
            .sharingMode = vk::SharingMode::eExclusive,
            .queueFamilyIndexCount = 1,
            .pQueueFamilyIndices = &gpu.vkQueueFamilyIndex,
@ -112,7 +112,7 @@ namespace skyline::gpu::memory {
    Image MemoryManager::AllocateMappedImage(const vk::ImageCreateInfo &createInfo) {
        VmaAllocationCreateInfo allocationCreateInfo{
            .usage = VMA_MEMORY_USAGE_UNKNOWN,
-            .memoryTypeBits = static_cast<u32>(vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eDeviceLocal),
+            .requiredFlags = static_cast<VkMemoryPropertyFlags>(vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eDeviceLocal),
        };

        VkImage image;
--- a/app/src/main/cpp/skyline/gpu/texture/texture.cpp
+++ b/app/src/main/cpp/skyline/gpu/texture/texture.cpp
@ -484,6 +484,10 @@ namespace skyline::gpu {
        cycle = lCycle;
    }

+    Texture::~Texture() {
+        WaitOnFence(); // NOTE(review): presumably blocks until any pending usage tracked by the fence cycle has completed prior to destruction, confirm against WaitOnFence
+    }
+
    TextureView::TextureView(std::shared_ptr<Texture> backing, vk::ImageViewType type, vk::ImageSubresourceRange range, texture::Format format, vk::ComponentMapping mapping) : backing(std::move(backing)), type(type), format(format), mapping(mapping), range(range) {}

    vk::ImageView TextureView::GetView() {
--- a/app/src/main/cpp/skyline/gpu/texture/texture.h
+++ b/app/src/main/cpp/skyline/gpu/texture/texture.h
@ -371,6 +371,8 @@ namespace skyline::gpu {
         */
        Texture(GPU &gpu, texture::Dimensions dimensions, texture::Format format, vk::ImageLayout initialLayout = vk::ImageLayout::eGeneral, vk::ImageUsageFlags usage = {}, vk::ImageTiling tiling = vk::ImageTiling::eOptimal, u32 mipLevels = 1, u32 layerCount = 1, vk::SampleCountFlagBits sampleCount = vk::SampleCountFlagBits::e1);

+        ~Texture();
+
        /**
         * @note The handle returned is nullable and the appropriate precautions should be taken
         */