From 1c310762a79490abc5bd87f90a09e49e915470b2 Mon Sep 17 00:00:00 2001 From: catbot Date: Wed, 3 Jun 2026 18:35:39 +0000 Subject: [PATCH] fix(vulkan-rt): configurable recursion depth + per-shader TLAS push for compute (#21) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two gaps in the Vulkan RT path that fault the device on the NVIDIA proprietary driver with a non-trivial pipeline (simple VulkanTriangle never hit them): 1. maxPipelineRayRecursionDepth was hardcoded to 1, so any closest-hit shader that traces a secondary ray (shadow ray — a very common pattern) recursed past the pipeline limit (UB → device fault). PipelineRTVulkan::Init now takes a maxRecursionDepth parameter (default 1, clamped to the device's maxRayRecursionDepth). 2. The NVIDIA descriptor-heap AS-read workaround rewrites every shader that reads an accelerationStructureEXT from the heap — including compute shaders — to read the TLAS device address from a push constant, but only RTPass pushed that address. A compute shader that ray-queries the TLAS (rayQueryEXT) therefore ran against an unwritten push slot → garbage AS handle → VK_ERROR_DEVICE_LOST. WorkaroundNvidiaAS::Patch now returns a per-shader PatchResult {patched, tlasPushOffset} instead of writing the clobber-prone global Device::workaroundTlasPushOffset (removed). VulkanShader stores it; ShaderBindingTableVulkan/PipelineRTVulkan carry it for RTPass, and ComputeShader tracks its own offset and pushes the caller-supplied TLAS address in Dispatch (new defaulted tlasAddress parameter), mirroring RTPass::Record. The PushConstantRewrite regression test now asserts Patch's returned patched/offset and adds two ray-querying compute-shader cases, proving the rewrite is stage-agnostic and the per-shader offset is correct. Co-Authored-By: Claude Opus 4.8 --- .../Crafter.Graphics-ComputeShader.cpp | 30 +++- .../Crafter.Graphics-ComputeShader.cppm | 22 ++- interfaces/Crafter.Graphics-Device.cppm | 12 +- .../Crafter.Graphics-PipelineRTVulkan.cppm | 22 ++- interfaces/Crafter.Graphics-RTPass.cppm | 6 +- ...ter.Graphics-ShaderBindingTableVulkan.cppm | 12 ++ interfaces/Crafter.Graphics-ShaderVulkan.cppm | 49 +++-- tests/PushConstantRewrite/main.cpp | 170 ++++++++++++------ 8 files changed, 248 insertions(+), 75 deletions(-) diff --git a/implementations/Crafter.Graphics-ComputeShader.cpp b/implementations/Crafter.Graphics-ComputeShader.cpp index 258af04..3edc369 100644 --- a/implementations/Crafter.Graphics-ComputeShader.cpp +++ b/implementations/Crafter.Graphics-ComputeShader.cpp @@ -26,7 +26,10 @@ import std; using namespace Crafter; -ComputeShader::ComputeShader(ComputeShader&& other) noexcept : pipeline(other.pipeline) { +ComputeShader::ComputeShader(ComputeShader&& other) noexcept + : pipeline(other.pipeline), + workaroundNeedsTlas(other.workaroundNeedsTlas), + workaroundTlasPushOffset(other.workaroundTlasPushOffset) { other.pipeline = VK_NULL_HANDLE; } @@ -36,6 +39,8 @@ ComputeShader& ComputeShader::operator=(ComputeShader&& other) noexcept { vkDestroyPipeline(Device::device, pipeline, nullptr); } pipeline = other.pipeline; + workaroundNeedsTlas = other.workaroundNeedsTlas; + workaroundTlasPushOffset = other.workaroundTlasPushOffset; other.pipeline = VK_NULL_HANDLE; } return *this; @@ -51,6 +56,13 @@ ComputeShader::~ComputeShader() { void ComputeShader::Load(const std::filesystem::path& spvPath) { VulkanShader shader(spvPath, "main", VK_SHADER_STAGE_COMPUTE_BIT, nullptr); + // NVIDIA descriptor-heap AS-read workaround (issue #15 / #7): remember + // whether VulkanShader rewrote a heap acceleration-structure read in this + // module, and where it expects the TLAS address pushed, so Dispatch can + // feed it the per-frame TLAS. Per-shader, not a global — see ComputeShader. + workaroundNeedsTlas = shader.patchedAS; + workaroundTlasPushOffset = shader.tlasPushOffset; + // Spec: with VK_PIPELINE_CREATE_2_DESCRIPTOR_HEAP_BIT_EXT, layout MUST be // VK_NULL_HANDLE — bindings come from the bound descriptor heap and push // constants are pushed via vkCmdPushDataEXT instead of vkCmdPushConstants. @@ -77,7 +89,8 @@ void ComputeShader::Dispatch(VkCommandBuffer cmd, const void* push, std::uint32_t pushBytes, std::uint32_t gx, std::uint32_t gy, - std::uint32_t gz) const { + std::uint32_t gz, + VkDeviceAddress tlasAddress) const { vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); if (push != nullptr && pushBytes > 0) { VkPushDataInfoEXT pushInfo { @@ -87,5 +100,18 @@ void ComputeShader::Dispatch(VkCommandBuffer cmd, }; Device::vkCmdPushDataEXT(cmd, &pushInfo); } + // NVIDIA descriptor-heap AS-read workaround (issue #15 / #7): if this shader + // ray-queries the TLAS through the heap it was rewritten to read the TLAS + // device address from a push constant; push the caller-supplied address + // where the rewrite reads it (after any user payload, or offset 0 if none). + // Mirrors RTPass::Record for the RT pipeline. Inert on every other driver. + if (Device::workaroundDescriptorHeapAS && workaroundNeedsTlas) { + VkPushDataInfoEXT tlasPush { + .sType = VK_STRUCTURE_TYPE_PUSH_DATA_INFO_EXT, + .offset = workaroundTlasPushOffset, + .data = { .address = &tlasAddress, .size = sizeof(tlasAddress) }, + }; + Device::vkCmdPushDataEXT(cmd, &tlasPush); + } vkCmdDispatch(cmd, gx, gy, gz); } diff --git a/interfaces/Crafter.Graphics-ComputeShader.cppm b/interfaces/Crafter.Graphics-ComputeShader.cppm index 1e8df9c..8ca6336 100644 --- a/interfaces/Crafter.Graphics-ComputeShader.cppm +++ b/interfaces/Crafter.Graphics-ComputeShader.cppm @@ -36,6 +36,16 @@ export namespace Crafter { public: VkPipeline pipeline = VK_NULL_HANDLE; + // NVIDIA descriptor-heap AS-read workaround (issue #15 / #7): set by + // Load when this shader ray-queries the TLAS through the descriptor + // heap and was rewritten to read its device address from a push + // constant. `workaroundTlasPushOffset` is the byte offset of that member + // (after the caller's own push payload, or 0 if the shader had none). + // Tracked per-shader — a global is clobbered by whichever shader was + // patched last. Both inert (false/0) on every other driver. + bool workaroundNeedsTlas = false; + std::uint32_t workaroundTlasPushOffset = 0; + ComputeShader() = default; ComputeShader(const ComputeShader&) = delete; ComputeShader& operator=(const ComputeShader&) = delete; @@ -50,11 +60,21 @@ export namespace Crafter { // Bind, push constants (if any), dispatch. Caller computes group counts // and is responsible for any inter-dispatch barriers (UIRenderer::Dispatch // wraps this with the standard write-after-write barrier). + // + // tlasAddress is the NVIDIA descriptor-heap AS-read workaround hook + // (issue #15 / #7): a shader that ray-queries the TLAS through the + // descriptor heap is rewritten to read its device address from a push + // constant, so the caller must supply the active frame's TLAS address + // (RenderingElement3D::tlases[frameIdx].address) here. It is pushed at + // the shader's workaroundTlasPushOffset only when the shader was + // rewritten (workaroundNeedsTlas) — ignored otherwise and on every + // other driver, so shaders that don't touch an AS pass nothing. void Dispatch(VkCommandBuffer cmd, const void* push, std::uint32_t pushBytes, std::uint32_t gx, std::uint32_t gy = 1, - std::uint32_t gz = 1) const; + std::uint32_t gz = 1, + VkDeviceAddress tlasAddress = 0) const; }; } #endif // !CRAFTER_GRAPHICS_WINDOW_DOM diff --git a/interfaces/Crafter.Graphics-Device.cppm b/interfaces/Crafter.Graphics-Device.cppm index ed331ff..3708507 100644 --- a/interfaces/Crafter.Graphics-Device.cppm +++ b/interfaces/Crafter.Graphics-Device.cppm @@ -178,12 +178,12 @@ export namespace Crafter { // path and RTPass pushes the active TLAS address as push data. Delete // this flag and everything keyed on it once a fixed driver ships. inline static bool workaroundDescriptorHeapAS = false; - // Byte offset of the TLAS-address member inside the patched raygen's - // push-constant block — 0 for a freshly synthesized block, or the end - // of the user's own block when the address is appended to it (the - // shader can't have two push-constant blocks). VulkanShader sets this - // at module load; RTPass feeds it to vkCmdPushDataEXT. - inline static std::uint32_t workaroundTlasPushOffset = 0; + // The byte offset of the TLAS-address member inside a patched shader's + // push-constant block is tracked per-shader (VulkanShader::tlasPushOffset), + // not here: a single global is clobbered by whichever shader was patched + // last and so cannot serve several shaders with differing push layouts + // (e.g. an RT raygen and a ray-querying compute shader). RTPass and + // ComputeShader read the offset off the pipeline they record. static void CheckVkResult(VkResult result); static std::uint32_t GetMemoryType(std::uint32_t typeBits, VkMemoryPropertyFlags properties); diff --git a/interfaces/Crafter.Graphics-PipelineRTVulkan.cppm b/interfaces/Crafter.Graphics-PipelineRTVulkan.cppm index 0c5ddec..c0374ec 100644 --- a/interfaces/Crafter.Graphics-PipelineRTVulkan.cppm +++ b/interfaces/Crafter.Graphics-PipelineRTVulkan.cppm @@ -39,7 +39,25 @@ export namespace Crafter { VkStridedDeviceAddressRegionKHR hitRegion; VkStridedDeviceAddressRegionKHR callableRegion; - void Init(VkCommandBuffer cmd, std::span raygenGroups, std::span missGroups, std::span hitGroups, ShaderBindingTableVulkan& shaderTable) { + // NVIDIA descriptor-heap AS-read workaround (issue #15 / #7): copied + // from the shader table at Init so RTPass can push the active TLAS + // device address into the patched shaders' push constant. Inert on + // every other driver. + bool workaroundNeedsTlas = false; + std::uint32_t workaroundTlasPushOffset = 0; + + // maxRecursionDepth: the maximum ray-recursion depth the pipeline must + // support — i.e. the deepest chain of nested traceRayEXT calls. The + // raygen counts as depth 1, so a closest-hit shader that traces a shadow + // ray needs 2. Tracing beyond the value the pipeline was created with is + // undefined behaviour and faults the device, so a consumer with any + // recursion past the raygen must raise this. Defaults to 1 (raygen-only, + // matching the simple examples) and is clamped to the device's + // maxRayRecursionDepth. + void Init(VkCommandBuffer cmd, std::span raygenGroups, std::span missGroups, std::span hitGroups, ShaderBindingTableVulkan& shaderTable, std::uint32_t maxRecursionDepth = 1) { + workaroundNeedsTlas = shaderTable.workaroundNeedsTlas; + workaroundTlasPushOffset = shaderTable.workaroundTlasPushOffset; + std::vector groups; groups.reserve(raygenGroups.size() + missGroups.size() + hitGroups.size()); @@ -60,7 +78,7 @@ export namespace Crafter { .pStages = shaderTable.shaderStages.data(), .groupCount = static_cast(groups.size()), .pGroups = groups.data(), - .maxPipelineRayRecursionDepth = 1, + .maxPipelineRayRecursionDepth = std::min(maxRecursionDepth, Device::rayTracingProperties.maxRayRecursionDepth), .layout = VK_NULL_HANDLE }; diff --git a/interfaces/Crafter.Graphics-RTPass.cppm b/interfaces/Crafter.Graphics-RTPass.cppm index d436387..f03f2c9 100644 --- a/interfaces/Crafter.Graphics-RTPass.cppm +++ b/interfaces/Crafter.Graphics-RTPass.cppm @@ -42,14 +42,16 @@ export namespace Crafter { // block that VulkanShader synthesizes, so the rewritten raygen can // reach the acceleration structure by address instead of through // the faulting heap descriptor. Inert on every other driver. - if (Device::workaroundDescriptorHeapAS) { + if (Device::workaroundDescriptorHeapAS && pipeline->workaroundNeedsTlas) { VkDeviceAddress tlasAddr = RenderingElement3D::tlases[frameIdx].address; VkPushDataInfoEXT pushInfo { .sType = VK_STRUCTURE_TYPE_PUSH_DATA_INFO_EXT, // Where the rewritten raygen reads the TLAS address: 0 when // VulkanShader synthesized a fresh block, or the offset of // the member it appended to the shader's existing block. - .offset = Device::workaroundTlasPushOffset, + // Tracked per-pipeline (copied from the shader table) so a + // later-loaded shader can't clobber it. + .offset = pipeline->workaroundTlasPushOffset, .data = { .address = &tlasAddr, .size = sizeof(tlasAddr) }, }; Device::vkCmdPushDataEXT(cmd, &pushInfo); diff --git a/interfaces/Crafter.Graphics-ShaderBindingTableVulkan.cppm b/interfaces/Crafter.Graphics-ShaderBindingTableVulkan.cppm index 92875fe..aad6b3f 100644 --- a/interfaces/Crafter.Graphics-ShaderBindingTableVulkan.cppm +++ b/interfaces/Crafter.Graphics-ShaderBindingTableVulkan.cppm @@ -33,10 +33,22 @@ export namespace Crafter { class ShaderBindingTableVulkan { public: std::vector shaderStages; + // NVIDIA descriptor-heap AS-read workaround (issue #15 / #7): true when + // any stage in this table reads an acceleration structure and was + // rewritten to fetch the TLAS address from a push constant, with the + // byte offset that stage expects it at. PipelineRTVulkan copies these so + // RTPass can push the address without consulting a clobber-prone global. + // Both inert (false/0) on every other driver. + bool workaroundNeedsTlas = false; + std::uint32_t workaroundTlasPushOffset = 0; void Init(const std::span shaders) { shaderStages.reserve(shaders.size()); for(const VulkanShader& shader: shaders) { shaderStages.emplace_back(VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, nullptr, 0, shader.stage, shader.shader, shader.entrypoint.c_str(), shader.specilizationInfo); + if (shader.patchedAS) { + workaroundNeedsTlas = true; + workaroundTlasPushOffset = shader.tlasPushOffset; + } } } }; diff --git a/interfaces/Crafter.Graphics-ShaderVulkan.cppm b/interfaces/Crafter.Graphics-ShaderVulkan.cppm index 49a6699..2dc2d79 100644 --- a/interfaces/Crafter.Graphics-ShaderVulkan.cppm +++ b/interfaces/Crafter.Graphics-ShaderVulkan.cppm @@ -52,8 +52,12 @@ import :Types; // append a ulong member (the TLAS address) to the *existing* block and read // from there; only shaders with no push constant of their own get a freshly // synthesized single-member block. Its byte offset is the offset of that -// member (published via Crafter::Device::workaroundTlasPushOffset) which RTPass feeds to -// vkCmdPushDataEXT so the address lands where the rewritten load reads it. +// member, returned in PatchResult::tlasPushOffset so the caller (RTPass for the +// RT pipeline, ComputeShader::Dispatch for a compute pipeline) can feed it to +// vkCmdPushDataEXT — landing the address exactly where the rewritten load reads +// it. The offset is per-shader rather than a global: a global is clobbered by +// whichever shader was patched last and so cannot serve several shaders whose +// push-constant layouts differ. // // Exported so tests/PushConstantRewrite can drive Patch() over real compiled // SPIR-V and check the result with spirv-val; nothing in the engine calls it @@ -87,15 +91,24 @@ export namespace WorkaroundNvidiaAS { return (v + a - 1u) & ~(a - 1u); } - inline void Patch(std::vector& words) { - if (words.size() < 5) return; // not a SPIR-V module we understand. + // Outcome of patching one shader module. `patched` is true only when the + // shader read an acceleration structure and was rewritten; `tlasPushOffset` + // is then the byte offset of the TLAS-address member in the (possibly + // pre-existing) push-constant block the caller must write. + struct PatchResult { + bool patched = false; + std::uint32_t tlasPushOffset = 0; + }; + + inline PatchResult Patch(std::vector& words) { + if (words.size() < 5) return {}; // not a SPIR-V module we understand. // Split header (5 words) from the instruction stream. std::uint32_t bound = words[3]; std::vector instrs; for (std::size_t i = 5; i < words.size();) { std::uint32_t len = words[i] >> 16; - if (len == 0 || i + len > words.size()) return; // malformed — bail. + if (len == 0 || i + len > words.size()) return {}; // malformed — bail. instrs.emplace_back(words.begin() + i, words.begin() + i + len); i += len; } @@ -163,7 +176,10 @@ export namespace WorkaroundNvidiaAS { if (op == 54 /*OpFunction*/ && firstFuncIdx == instrs.size()) firstFuncIdx = k; } - if (asTypeId == 0) return; // shader never reads an acceleration structure. + if (asTypeId == 0) return {}; // shader never reads an acceleration structure. + + // Set on whichever path runs below; returned to the caller. + std::uint32_t tlasPushOffset = 0; auto newId = [&] { return bound++; }; auto mk = [](std::initializer_list ops) { @@ -230,7 +246,7 @@ export namespace WorkaroundNvidiaAS { pcVarId = existingPcVarId; const Instr* structInstr = typeInstr[existingPcStructId]; memberIdx = static_cast(structInstr->size() - 2); - Crafter::Device::workaroundTlasPushOffset = AlignUp(footprint(existingPcStructId), 8); + tlasPushOffset = AlignUp(footprint(existingPcStructId), 8); ptrPushUlongId = existingPtrUlongId; if (ptrPushUlongId == 0) { @@ -247,7 +263,7 @@ export namespace WorkaroundNvidiaAS { memberIdxConstId = newId(); typeDefs.push_back(mk({OpConstant, uintTypeId, memberIdxConstId, memberIdx})); } - decorations.push_back(mk({OpMemberDecorate, existingPcStructId, memberIdx, DecorationOffset, Crafter::Device::workaroundTlasPushOffset})); + decorations.push_back(mk({OpMemberDecorate, existingPcStructId, memberIdx, DecorationOffset, tlasPushOffset})); } else { // No user push constant — synthesize a fresh single-member block. if (uintZeroId == 0) { uintZeroId = newId(); typeDefs.push_back(mk({OpConstant, uintTypeId, uintZeroId, 0})); } @@ -262,7 +278,7 @@ export namespace WorkaroundNvidiaAS { decorations.push_back(mk({OpMemberDecorate, pcStructId, 0, DecorationOffset, 0})); decorations.push_back(mk({OpDecorate, pcStructId, DecorationBlock})); memberIdxConstId = uintZeroId; - Crafter::Device::workaroundTlasPushOffset = 0; + tlasPushOffset = 0; } // ── Rewrite each `OpLoad %asType ` into address-load + convert, and @@ -327,6 +343,8 @@ export namespace WorkaroundNvidiaAS { out[3] = bound; for (const Instr& in : instrs) out.insert(out.end(), in.begin(), in.end()); words.swap(out); + + return {true, tlasPushOffset}; } } // ─── END NVIDIA descriptor-heap AS-read workaround ──────────────────────── @@ -339,6 +357,15 @@ export namespace Crafter { VkShaderStageFlagBits stage; std::string entrypoint; VkShaderModule shader; + // NVIDIA descriptor-heap AS-read workaround (issue #15 / #7): set when + // this module read an acceleration structure and was rewritten to fetch + // the TLAS device address from a push constant. `tlasPushOffset` is the + // byte offset of that member, which whoever records the dispatch + // (RTPass / ComputeShader) must write with vkCmdPushDataEXT. Per-shader + // rather than a global because each shader's push-constant layout — and + // therefore the offset — can differ. Both false/0 on every other driver. + bool patchedAS = false; + std::uint32_t tlasPushOffset = 0; VulkanShader(const std::filesystem::path& path, std::string entrypoint, VkShaderStageFlagBits stage, VkSpecializationInfo* specilizationInfo) : stage(stage), entrypoint(entrypoint), specilizationInfo(specilizationInfo) { std::ifstream file(path, std::ios::binary); if (!file) { @@ -364,7 +391,9 @@ export namespace Crafter { // acceleration structure. Remove with the rest of the workaround // once a fixed NVIDIA driver ships. if (Device::workaroundDescriptorHeapAS) { - WorkaroundNvidiaAS::Patch(spirv); + WorkaroundNvidiaAS::PatchResult patch = WorkaroundNvidiaAS::Patch(spirv); + patchedAS = patch.patched; + tlasPushOffset = patch.tlasPushOffset; } VkShaderModuleCreateInfo module_info{VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO}; diff --git a/tests/PushConstantRewrite/main.cpp b/tests/PushConstantRewrite/main.cpp index d55ef9f..78bee47 100644 --- a/tests/PushConstantRewrite/main.cpp +++ b/tests/PushConstantRewrite/main.cpp @@ -29,7 +29,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA // them through the real Patch(), and asserts with spirv-val that the result is // valid and contains exactly one push-constant variable — both for shaders // that already have a push constant (merge path) and for those that don't -// (synthesize path). It also checks the published TLAS push-constant offset. +// (synthesize path). It also checks the returned TLAS push-constant offset. +// +// It additionally covers ray-querying *compute* shaders (issue #21): the +// rewrite is stage-agnostic, and ComputeShader::Dispatch now pushes the TLAS +// address at the per-shader offset Patch returns, so a compute shader that +// reads an acceleration structure through the descriptor heap must be patched +// and report a correct offset exactly like a raygen does. // // Delete this test together with the rest of the workaround once a fixed NVIDIA // driver ships. @@ -89,7 +95,7 @@ struct Case { std::string_view glsl; bool readsAccelStruct; // whether Patch should rewrite anything bool hasExistingPushConst; // whether the source already declares a push block - std::uint32_t expectedOffset; // expected Device::workaroundTlasPushOffset (only checked when readsAccelStruct) + std::uint32_t expectedOffset; // expected PatchResult::tlasPushOffset (only checked when readsAccelStruct) }; // Shared raygen scaffolding: a heap AS + heap image, traced and stored to. @@ -156,6 +162,110 @@ std::string BuildSource(const Case& c) { return s; } +// Compute counterpart of the raygen cases (issue #21): a shader that ray-queries +// the heap TLAS via rayQueryEXT. Shares the offset math with the raygen merge +// path, so we only need one merge case and one synthesize case to prove compute +// stages are handled identically. +struct ComputeCase { + std::string_view name; + std::string_view glsl; // optional push-constant declaration + bool hasExistingPushConst; + std::uint32_t expectedOffset; // expected PatchResult::tlasPushOffset +}; + +const std::array kComputeCases = {{ + // No push constant → fresh single-member block synthesized at offset 0. + { "compute-no-push", std::string_view{""}, false, 0 }, + // Existing block {uint f; @0}; ends at 4, TLAS rounds up to the next 8. + { "compute-merge-uint", + std::string_view{"layout(push_constant) uniform PC { uint f; } pc;\n"}, true, 8 }, +}}; + +std::string BuildComputeSource(const ComputeCase& c) { + std::string s = + "#version 460\n" + "#extension GL_EXT_ray_query : enable\n" + "#extension GL_EXT_shader_image_load_formatted : enable\n" + "#extension GL_EXT_descriptor_heap : enable\n" + "#extension GL_EXT_nonuniform_qualifier : enable\n" + "layout(descriptor_heap) uniform accelerationStructureEXT topLevelAS[];\n" + "layout(descriptor_heap) uniform writeonly image2D image[];\n"; + s += c.glsl; + s += "layout(local_size_x = 64) in;\n"; + s += "void main() {\n"; + s += " vec3 origin = vec3(0.0);\n"; + s += " vec3 dir = vec3(0.0, 0.0, 1.0);\n"; + s += " rayQueryEXT rq;\n"; + s += " rayQueryInitializeEXT(rq, topLevelAS[0], gl_RayFlagsNoneEXT, 0xFF, origin, 0.001, dir, 10000.0);\n"; + s += " while (rayQueryProceedEXT(rq)) {}\n"; + float pushRef = 0; (void)pushRef; + std::string val = c.hasExistingPushConst ? "float(pc.f)" : "1.0"; + s += " imageStore(image[0], ivec2(gl_GlobalInvocationID.xy), vec4(" + val + "));\n"; + s += "}\n"; + return s; +} + +// Compile `source` for `stage`, run Patch(), and assert: spirv-val accepts it, +// exactly one push-constant variable survives, and Patch reports patched/offset +// matching expectations. Returns true on success. +bool RunCase(const fs::path& dir, std::string_view name, std::string_view stage, + const std::string& source, bool readsAccelStruct, + std::uint32_t expectedOffset) { + const fs::path glslPath = dir / (std::string(name) + "." + std::string(stage) + ".glsl"); + const fs::path spvPath = dir / (std::string(name) + ".spv"); + const fs::path patched = dir / (std::string(name) + ".patched.spv"); + + { std::ofstream f(glslPath); f << source; } + + std::string compile = "glslang --target-env vulkan1.4 -V -S " + std::string(stage) + + " \"" + glslPath.string() + "\" -o \"" + spvPath.string() + "\" > /dev/null"; + if (RunCommand(compile) != 0) { + std::println(std::cerr, "[{}] glslang failed to compile the source shader", name); + return false; + } + + std::vector words = ReadSpirv(spvPath); + if (words.size() < 5) { + std::println(std::cerr, "[{}] could not read compiled SPIR-V", name); + return false; + } + + WorkaroundNvidiaAS::PatchResult patch = WorkaroundNvidiaAS::Patch(words); + WriteSpirv(patched, words); + + // 1. The patched module must pass spirv-val under the engine's flags. + std::string validate = "spirv-val \"" + patched.string() + + "\" --relax-block-layout --scalar-block-layout --target-env vulkan1.4"; + if (RunCommand(validate) != 0) { + std::println(std::cerr, "[{}] spirv-val rejected the patched module", name); + return false; + } + + // 2. Exactly one push-constant variable — the whole point of issue #18. + int pcVars = CountPushConstantVariables(words); + if (pcVars != 1) { + std::println(std::cerr, "[{}] expected exactly 1 push-constant variable, found {}", name, pcVars); + return false; + } + + // 3. Patch must report it rewrote the shader exactly when it reads an AS. + if (patch.patched != readsAccelStruct) { + std::println(std::cerr, "[{}] expected patched={}, got {}", name, readsAccelStruct, patch.patched); + return false; + } + + // 4. The returned TLAS offset must match the expected layout end. + if (readsAccelStruct && patch.tlasPushOffset != expectedOffset) { + std::println(std::cerr, "[{}] expected TLAS push offset {}, got {}", + name, expectedOffset, patch.tlasPushOffset); + return false; + } + + std::println(std::cout, "[{}] ok (push-constant vars: {}, tlas offset: {})", + name, pcVars, readsAccelStruct ? patch.tlasPushOffset : 0u); + return true; +} + } // namespace int main() { @@ -165,58 +275,14 @@ int main() { int failures = 0; for (const Case& c : kCases) { - const fs::path glslPath = dir / (std::string(c.name) + ".rgen.glsl"); - const fs::path spvPath = dir / (std::string(c.name) + ".spv"); - const fs::path patched = dir / (std::string(c.name) + ".patched.spv"); - - { std::ofstream f(glslPath); f << BuildSource(c); } - - std::string compile = "glslang --target-env vulkan1.4 -V -S rgen \"" - + glslPath.string() + "\" -o \"" + spvPath.string() + "\" > /dev/null"; - if (RunCommand(compile) != 0) { - std::println(std::cerr, "[{}] glslang failed to compile the source shader", c.name); + if (!RunCase(dir, c.name, "rgen", BuildSource(c), c.readsAccelStruct, c.expectedOffset)) ++failures; - continue; - } - - std::vector words = ReadSpirv(spvPath); - if (words.size() < 5) { - std::println(std::cerr, "[{}] could not read compiled SPIR-V", c.name); + } + // Ray-querying compute shaders (issue #21) — must be patched and report a + // correct per-shader offset just like the raygen cases above. + for (const ComputeCase& c : kComputeCases) { + if (!RunCase(dir, c.name, "comp", BuildComputeSource(c), /*readsAccelStruct=*/true, c.expectedOffset)) ++failures; - continue; - } - - Device::workaroundTlasPushOffset = 0xDEADBEEFu; // poison so we know Patch set it - WorkaroundNvidiaAS::Patch(words); - WriteSpirv(patched, words); - - // 1. The patched module must pass spirv-val under the engine's flags. - std::string validate = "spirv-val \"" + patched.string() - + "\" --relax-block-layout --scalar-block-layout --target-env vulkan1.4"; - if (RunCommand(validate) != 0) { - std::println(std::cerr, "[{}] spirv-val rejected the patched module", c.name); - ++failures; - continue; - } - - // 2. Exactly one push-constant variable — the whole point of issue #18. - int pcVars = CountPushConstantVariables(words); - if (pcVars != 1) { - std::println(std::cerr, "[{}] expected exactly 1 push-constant variable, found {}", c.name, pcVars); - ++failures; - continue; - } - - // 3. The TLAS offset Patch published must match the expected layout end. - if (c.readsAccelStruct && Device::workaroundTlasPushOffset != c.expectedOffset) { - std::println(std::cerr, "[{}] expected TLAS push offset {}, got {}", - c.name, c.expectedOffset, Device::workaroundTlasPushOffset); - ++failures; - continue; - } - - std::println(std::cout, "[{}] ok (push-constant vars: {}, tlas offset: {})", - c.name, pcVars, c.readsAccelStruct ? Device::workaroundTlasPushOffset : 0u); } if (failures != 0) { -- 2.54.0