fix(vulkan-rt): configurable recursion depth + per-shader TLAS push for compute (#21)

Two gaps in the Vulkan RT path that fault the device on the NVIDIA proprietary driver with a non-trivial pipeline (simple VulkanTriangle never hit them):

1. maxPipelineRayRecursionDepth was hardcoded to 1, so any closest-hit shader that traces a secondary ray (shadow ray — a very common pattern) recursed past the pipeline limit (UB → device fault). PipelineRTVulkan::Init now takes a maxRecursionDepth parameter (default 1, clamped to the device's maxRayRecursionDepth).

2. The NVIDIA descriptor-heap AS-read workaround rewrites every shader that reads an accelerationStructureEXT from the heap — including compute shaders — to read the TLAS device address from a push constant, but only RTPass pushed that address. A compute shader that ray-queries the TLAS (rayQueryEXT) therefore ran against an unwritten push slot → garbage AS handle → VK_ERROR_DEVICE_LOST.

WorkaroundNvidiaAS::Patch now returns a per-shader PatchResult {patched, tlasPushOffset} instead of writing the clobber-prone global Device::workaroundTlasPushOffset (removed). VulkanShader stores it; ShaderBindingTableVulkan/PipelineRTVulkan carry it for RTPass, and ComputeShader tracks its own offset and pushes the caller-supplied TLAS address in Dispatch (new defaulted tlasAddress parameter), mirroring RTPass::Record.

The PushConstantRewrite regression test now asserts Patch's returned patched/offset and adds two ray-querying compute-shader cases, proving the rewrite is stage-agnostic and the per-shader offset is correct.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-03 18:35:39 +00:00 · 2026-06-03 18:35:39 +00:00 · 1c310762a7
commit 1c310762a7
parent 2790bbd576
8 changed files with 248 additions and 75 deletions
--- a/implementations/Crafter.Graphics-ComputeShader.cpp
+++ b/implementations/Crafter.Graphics-ComputeShader.cpp
@ -26,7 +26,10 @@ import std;

 using namespace Crafter;

-ComputeShader::ComputeShader(ComputeShader&& other) noexcept : pipeline(other.pipeline) {
+ComputeShader::ComputeShader(ComputeShader&& other) noexcept
+    : pipeline(other.pipeline),
+      workaroundNeedsTlas(other.workaroundNeedsTlas),
+      workaroundTlasPushOffset(other.workaroundTlasPushOffset) {
    other.pipeline = VK_NULL_HANDLE;
 }

@ -36,6 +39,8 @@ ComputeShader& ComputeShader::operator=(ComputeShader&& other) noexcept {
            vkDestroyPipeline(Device::device, pipeline, nullptr);
        }
        pipeline = other.pipeline;
+        workaroundNeedsTlas = other.workaroundNeedsTlas;
+        workaroundTlasPushOffset = other.workaroundTlasPushOffset;
        other.pipeline = VK_NULL_HANDLE;
    }
    return *this;
@ -51,6 +56,13 @@ ComputeShader::~ComputeShader() {
 void ComputeShader::Load(const std::filesystem::path& spvPath) {
    VulkanShader shader(spvPath, "main", VK_SHADER_STAGE_COMPUTE_BIT, nullptr);

+    // NVIDIA descriptor-heap AS-read workaround (issue #15 / #7): remember
+    // whether VulkanShader rewrote a heap acceleration-structure read in this
+    // module, and where it expects the TLAS address pushed, so Dispatch can
+    // feed it the per-frame TLAS. Per-shader, not a global — see ComputeShader.
+    workaroundNeedsTlas = shader.patchedAS;
+    workaroundTlasPushOffset = shader.tlasPushOffset;
+
    // Spec: with VK_PIPELINE_CREATE_2_DESCRIPTOR_HEAP_BIT_EXT, layout MUST be
    // VK_NULL_HANDLE — bindings come from the bound descriptor heap and push
    // constants are pushed via vkCmdPushDataEXT instead of vkCmdPushConstants.
@ -77,7 +89,8 @@ void ComputeShader::Dispatch(VkCommandBuffer cmd,
                             const void* push, std::uint32_t pushBytes,
                             std::uint32_t gx,
                             std::uint32_t gy,
-                             std::uint32_t gz) const {
+                             std::uint32_t gz,
+                             VkDeviceAddress tlasAddress) const {
    vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
    if (push != nullptr && pushBytes > 0) {
        VkPushDataInfoEXT pushInfo {
@ -87,5 +100,18 @@ void ComputeShader::Dispatch(VkCommandBuffer cmd,
        };
        Device::vkCmdPushDataEXT(cmd, &pushInfo);
    }
+    // NVIDIA descriptor-heap AS-read workaround (issue #15 / #7): if this shader
+    // ray-queries the TLAS through the heap it was rewritten to read the TLAS
+    // device address from a push constant; push the caller-supplied address
+    // where the rewrite reads it (after any user payload, or offset 0 if none).
+    // Mirrors RTPass::Record for the RT pipeline. Inert on every other driver.
+    if (Device::workaroundDescriptorHeapAS && workaroundNeedsTlas) {
+        VkPushDataInfoEXT tlasPush {
+            .sType  = VK_STRUCTURE_TYPE_PUSH_DATA_INFO_EXT,
+            .offset = workaroundTlasPushOffset,
+            .data   = { .address = &tlasAddress, .size = sizeof(tlasAddress) },
+        };
+        Device::vkCmdPushDataEXT(cmd, &tlasPush);
+    }
    vkCmdDispatch(cmd, gx, gy, gz);
 }