fix(vulkan-rt): configurable recursion depth + per-shader TLAS push for compute (#21) #22
8 changed files with 248 additions and 75 deletions
fix(vulkan-rt): configurable recursion depth + per-shader TLAS push for compute (#21)
Two gaps in the Vulkan RT path that fault the device on the NVIDIA
proprietary driver with a non-trivial pipeline (simple VulkanTriangle
never hit them):
1. maxPipelineRayRecursionDepth was hardcoded to 1, so any closest-hit
shader that traces a secondary ray (shadow ray — a very common
pattern) recursed past the pipeline limit (UB → device fault).
PipelineRTVulkan::Init now takes a maxRecursionDepth parameter
(default 1, clamped to the device's maxRayRecursionDepth).
2. The NVIDIA descriptor-heap AS-read workaround rewrites every shader
that reads an accelerationStructureEXT from the heap — including
compute shaders — to read the TLAS device address from a push
constant, but only RTPass pushed that address. A compute shader that
ray-queries the TLAS (rayQueryEXT) therefore ran against an unwritten
push slot → garbage AS handle → VK_ERROR_DEVICE_LOST.
WorkaroundNvidiaAS::Patch now returns a per-shader PatchResult
{patched, tlasPushOffset} instead of writing the clobber-prone global
Device::workaroundTlasPushOffset (removed). VulkanShader stores it;
ShaderBindingTableVulkan/PipelineRTVulkan carry it for RTPass, and
ComputeShader tracks its own offset and pushes the caller-supplied
TLAS address in Dispatch (new defaulted tlasAddress parameter),
mirroring RTPass::Record.
The PushConstantRewrite regression test now asserts Patch's returned
patched/offset and adds two ray-querying compute-shader cases, proving
the rewrite is stage-agnostic and the per-shader offset is correct.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
commit
1c310762a7
|
|
@ -26,7 +26,10 @@ import std;
|
||||||
|
|
||||||
using namespace Crafter;
|
using namespace Crafter;
|
||||||
|
|
||||||
// Move constructor: takes over the source's pipeline handle and copies its
// NVIDIA AS-read workaround state. The source is left holding VK_NULL_HANDLE;
// its workaround fields are left as they were.
ComputeShader::ComputeShader(ComputeShader&& other) noexcept
    : pipeline(std::exchange(other.pipeline, VK_NULL_HANDLE)),
      workaroundNeedsTlas(other.workaroundNeedsTlas),
      workaroundTlasPushOffset(other.workaroundTlasPushOffset) {}
|
||||||
|
|
||||||
|
|
@ -36,6 +39,8 @@ ComputeShader& ComputeShader::operator=(ComputeShader&& other) noexcept {
|
||||||
vkDestroyPipeline(Device::device, pipeline, nullptr);
|
vkDestroyPipeline(Device::device, pipeline, nullptr);
|
||||||
}
|
}
|
||||||
pipeline = other.pipeline;
|
pipeline = other.pipeline;
|
||||||
|
workaroundNeedsTlas = other.workaroundNeedsTlas;
|
||||||
|
workaroundTlasPushOffset = other.workaroundTlasPushOffset;
|
||||||
other.pipeline = VK_NULL_HANDLE;
|
other.pipeline = VK_NULL_HANDLE;
|
||||||
}
|
}
|
||||||
return *this;
|
return *this;
|
||||||
|
|
@ -51,6 +56,13 @@ ComputeShader::~ComputeShader() {
|
||||||
void ComputeShader::Load(const std::filesystem::path& spvPath) {
|
void ComputeShader::Load(const std::filesystem::path& spvPath) {
|
||||||
VulkanShader shader(spvPath, "main", VK_SHADER_STAGE_COMPUTE_BIT, nullptr);
|
VulkanShader shader(spvPath, "main", VK_SHADER_STAGE_COMPUTE_BIT, nullptr);
|
||||||
|
|
||||||
|
// NVIDIA descriptor-heap AS-read workaround (issue #15 / #7): remember
|
||||||
|
// whether VulkanShader rewrote a heap acceleration-structure read in this
|
||||||
|
// module, and where it expects the TLAS address pushed, so Dispatch can
|
||||||
|
// feed it the per-frame TLAS. Per-shader, not a global — see the field comments in the ComputeShader class.
|
||||||
|
workaroundNeedsTlas = shader.patchedAS;
|
||||||
|
workaroundTlasPushOffset = shader.tlasPushOffset;
|
||||||
|
|
||||||
// Spec: with VK_PIPELINE_CREATE_2_DESCRIPTOR_HEAP_BIT_EXT, layout MUST be
|
// Spec: with VK_PIPELINE_CREATE_2_DESCRIPTOR_HEAP_BIT_EXT, layout MUST be
|
||||||
// VK_NULL_HANDLE — bindings come from the bound descriptor heap and push
|
// VK_NULL_HANDLE — bindings come from the bound descriptor heap and push
|
||||||
// constants are pushed via vkCmdPushDataEXT instead of vkCmdPushConstants.
|
// constants are pushed via vkCmdPushDataEXT instead of vkCmdPushConstants.
|
||||||
|
|
@ -77,7 +89,8 @@ void ComputeShader::Dispatch(VkCommandBuffer cmd,
|
||||||
const void* push, std::uint32_t pushBytes,
|
const void* push, std::uint32_t pushBytes,
|
||||||
std::uint32_t gx,
|
std::uint32_t gx,
|
||||||
std::uint32_t gy,
|
std::uint32_t gy,
|
||||||
std::uint32_t gz) const {
|
std::uint32_t gz,
|
||||||
|
VkDeviceAddress tlasAddress) const {
|
||||||
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
|
vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
|
||||||
if (push != nullptr && pushBytes > 0) {
|
if (push != nullptr && pushBytes > 0) {
|
||||||
VkPushDataInfoEXT pushInfo {
|
VkPushDataInfoEXT pushInfo {
|
||||||
|
|
@ -87,5 +100,18 @@ void ComputeShader::Dispatch(VkCommandBuffer cmd,
|
||||||
};
|
};
|
||||||
Device::vkCmdPushDataEXT(cmd, &pushInfo);
|
Device::vkCmdPushDataEXT(cmd, &pushInfo);
|
||||||
}
|
}
|
||||||
|
// NVIDIA descriptor-heap AS-read workaround (issue #15 / #7): if this shader
|
||||||
|
// ray-queries the TLAS through the heap it was rewritten to read the TLAS
|
||||||
|
// device address from a push constant; push the caller-supplied address
|
||||||
|
// where the rewrite reads it (after any user payload, or offset 0 if none).
|
||||||
|
// Mirrors RTPass::Record for the RT pipeline. Inert on every other driver.
|
||||||
|
if (Device::workaroundDescriptorHeapAS && workaroundNeedsTlas) {
|
||||||
|
VkPushDataInfoEXT tlasPush {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_PUSH_DATA_INFO_EXT,
|
||||||
|
.offset = workaroundTlasPushOffset,
|
||||||
|
.data = { .address = &tlasAddress, .size = sizeof(tlasAddress) },
|
||||||
|
};
|
||||||
|
Device::vkCmdPushDataEXT(cmd, &tlasPush);
|
||||||
|
}
|
||||||
vkCmdDispatch(cmd, gx, gy, gz);
|
vkCmdDispatch(cmd, gx, gy, gz);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -36,6 +36,16 @@ export namespace Crafter {
|
||||||
public:
|
public:
|
||||||
VkPipeline pipeline = VK_NULL_HANDLE;
|
VkPipeline pipeline = VK_NULL_HANDLE;
|
||||||
|
|
||||||
|
// NVIDIA descriptor-heap AS-read workaround (issue #15 / #7): set by
|
||||||
|
// Load when this shader ray-queries the TLAS through the descriptor
|
||||||
|
// heap and was rewritten to read its device address from a push
|
||||||
|
// constant. `workaroundTlasPushOffset` is the byte offset of that member
|
||||||
|
// (after the caller's own push payload, or 0 if the shader had none).
|
||||||
|
// Tracked per-shader — a global is clobbered by whichever shader was
|
||||||
|
// patched last. Both inert (false/0) on every other driver.
|
||||||
|
bool workaroundNeedsTlas = false;
|
||||||
|
std::uint32_t workaroundTlasPushOffset = 0;
|
||||||
|
|
||||||
ComputeShader() = default;
|
ComputeShader() = default;
|
||||||
ComputeShader(const ComputeShader&) = delete;
|
ComputeShader(const ComputeShader&) = delete;
|
||||||
ComputeShader& operator=(const ComputeShader&) = delete;
|
ComputeShader& operator=(const ComputeShader&) = delete;
|
||||||
|
|
@ -50,11 +60,21 @@ export namespace Crafter {
|
||||||
// Bind, push constants (if any), dispatch. Caller computes group counts
|
// Bind, push constants (if any), dispatch. Caller computes group counts
|
||||||
// and is responsible for any inter-dispatch barriers (UIRenderer::Dispatch
|
// and is responsible for any inter-dispatch barriers (UIRenderer::Dispatch
|
||||||
// wraps this with the standard write-after-write barrier).
|
// wraps this with the standard write-after-write barrier).
|
||||||
|
//
|
||||||
|
// tlasAddress is the NVIDIA descriptor-heap AS-read workaround hook
|
||||||
|
// (issue #15 / #7): a shader that ray-queries the TLAS through the
|
||||||
|
// descriptor heap is rewritten to read its device address from a push
|
||||||
|
// constant, so the caller must supply the active frame's TLAS address
|
||||||
|
// (RenderingElement3D::tlases[frameIdx].address) here. It is pushed at
|
||||||
|
// the shader's workaroundTlasPushOffset only when the shader was
|
||||||
|
// rewritten (workaroundNeedsTlas) — ignored otherwise and on every
|
||||||
|
// other driver, so shaders that don't touch an AS pass nothing.
|
||||||
void Dispatch(VkCommandBuffer cmd,
|
void Dispatch(VkCommandBuffer cmd,
|
||||||
const void* push, std::uint32_t pushBytes,
|
const void* push, std::uint32_t pushBytes,
|
||||||
std::uint32_t gx,
|
std::uint32_t gx,
|
||||||
std::uint32_t gy = 1,
|
std::uint32_t gy = 1,
|
||||||
std::uint32_t gz = 1) const;
|
std::uint32_t gz = 1,
|
||||||
|
VkDeviceAddress tlasAddress = 0) const;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
#endif // !CRAFTER_GRAPHICS_WINDOW_DOM
|
#endif // !CRAFTER_GRAPHICS_WINDOW_DOM
|
||||||
|
|
|
||||||
|
|
@ -178,12 +178,12 @@ export namespace Crafter {
|
||||||
// path and RTPass pushes the active TLAS address as push data. Delete
|
// path and RTPass pushes the active TLAS address as push data. Delete
|
||||||
// this flag and everything keyed on it once a fixed driver ships.
|
// this flag and everything keyed on it once a fixed driver ships.
|
||||||
inline static bool workaroundDescriptorHeapAS = false;
|
inline static bool workaroundDescriptorHeapAS = false;
|
||||||
// Byte offset of the TLAS-address member inside the patched raygen's
|
// The byte offset of the TLAS-address member inside a patched shader's
|
||||||
// push-constant block — 0 for a freshly synthesized block, or the end
|
// push-constant block is tracked per-shader (VulkanShader::tlasPushOffset),
|
||||||
// of the user's own block when the address is appended to it (the
|
// not here: a single global is clobbered by whichever shader was patched
|
||||||
// shader can't have two push-constant blocks). VulkanShader sets this
|
// last and so cannot serve several shaders with differing push layouts
|
||||||
// at module load; RTPass feeds it to vkCmdPushDataEXT.
|
// (e.g. an RT raygen and a ray-querying compute shader). RTPass and
|
||||||
inline static std::uint32_t workaroundTlasPushOffset = 0;
|
// ComputeShader read the offset off the pipeline they record.
|
||||||
|
|
||||||
static void CheckVkResult(VkResult result);
|
static void CheckVkResult(VkResult result);
|
||||||
static std::uint32_t GetMemoryType(std::uint32_t typeBits, VkMemoryPropertyFlags properties);
|
static std::uint32_t GetMemoryType(std::uint32_t typeBits, VkMemoryPropertyFlags properties);
|
||||||
|
|
|
||||||
|
|
@ -39,7 +39,25 @@ export namespace Crafter {
|
||||||
VkStridedDeviceAddressRegionKHR hitRegion;
|
VkStridedDeviceAddressRegionKHR hitRegion;
|
||||||
VkStridedDeviceAddressRegionKHR callableRegion;
|
VkStridedDeviceAddressRegionKHR callableRegion;
|
||||||
|
|
||||||
void Init(VkCommandBuffer cmd, std::span<VkRayTracingShaderGroupCreateInfoKHR> raygenGroups, std::span<VkRayTracingShaderGroupCreateInfoKHR> missGroups, std::span<VkRayTracingShaderGroupCreateInfoKHR> hitGroups, ShaderBindingTableVulkan& shaderTable) {
|
// NVIDIA descriptor-heap AS-read workaround (issue #15 / #7): copied
|
||||||
|
// from the shader table at Init so RTPass can push the active TLAS
|
||||||
|
// device address into the patched shaders' push constant. Inert on
|
||||||
|
// every other driver.
|
||||||
|
bool workaroundNeedsTlas = false;
|
||||||
|
std::uint32_t workaroundTlasPushOffset = 0;
|
||||||
|
|
||||||
|
// maxRecursionDepth: the maximum ray-recursion depth the pipeline must
|
||||||
|
// support — i.e. the deepest chain of nested traceRayEXT calls. The
|
||||||
|
// raygen's own trace is depth 1, so a closest-hit shader that traces a shadow
|
||||||
|
// ray needs 2. Tracing beyond the value the pipeline was created with is
|
||||||
|
// undefined behaviour and faults the device, so a consumer with any
|
||||||
|
// recursion past the raygen must raise this. Defaults to 1 (raygen-only,
|
||||||
|
// matching the simple examples) and is clamped to the device's
|
||||||
|
// maxRayRecursionDepth.
|
||||||
|
void Init(VkCommandBuffer cmd, std::span<VkRayTracingShaderGroupCreateInfoKHR> raygenGroups, std::span<VkRayTracingShaderGroupCreateInfoKHR> missGroups, std::span<VkRayTracingShaderGroupCreateInfoKHR> hitGroups, ShaderBindingTableVulkan& shaderTable, std::uint32_t maxRecursionDepth = 1) {
|
||||||
|
workaroundNeedsTlas = shaderTable.workaroundNeedsTlas;
|
||||||
|
workaroundTlasPushOffset = shaderTable.workaroundTlasPushOffset;
|
||||||
|
|
||||||
std::vector<VkRayTracingShaderGroupCreateInfoKHR> groups;
|
std::vector<VkRayTracingShaderGroupCreateInfoKHR> groups;
|
||||||
groups.reserve(raygenGroups.size() + missGroups.size() + hitGroups.size());
|
groups.reserve(raygenGroups.size() + missGroups.size() + hitGroups.size());
|
||||||
|
|
||||||
|
|
@ -60,7 +78,7 @@ export namespace Crafter {
|
||||||
.pStages = shaderTable.shaderStages.data(),
|
.pStages = shaderTable.shaderStages.data(),
|
||||||
.groupCount = static_cast<std::uint32_t>(groups.size()),
|
.groupCount = static_cast<std::uint32_t>(groups.size()),
|
||||||
.pGroups = groups.data(),
|
.pGroups = groups.data(),
|
||||||
.maxPipelineRayRecursionDepth = 1,
|
.maxPipelineRayRecursionDepth = std::min(maxRecursionDepth, Device::rayTracingProperties.maxRayRecursionDepth),
|
||||||
.layout = VK_NULL_HANDLE
|
.layout = VK_NULL_HANDLE
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -42,14 +42,16 @@ export namespace Crafter {
|
||||||
// block that VulkanShader synthesizes, so the rewritten raygen can
|
// block that VulkanShader synthesizes, so the rewritten raygen can
|
||||||
// reach the acceleration structure by address instead of through
|
// reach the acceleration structure by address instead of through
|
||||||
// the faulting heap descriptor. Inert on every other driver.
|
// the faulting heap descriptor. Inert on every other driver.
|
||||||
if (Device::workaroundDescriptorHeapAS) {
|
if (Device::workaroundDescriptorHeapAS && pipeline->workaroundNeedsTlas) {
|
||||||
VkDeviceAddress tlasAddr = RenderingElement3D::tlases[frameIdx].address;
|
VkDeviceAddress tlasAddr = RenderingElement3D::tlases[frameIdx].address;
|
||||||
VkPushDataInfoEXT pushInfo {
|
VkPushDataInfoEXT pushInfo {
|
||||||
.sType = VK_STRUCTURE_TYPE_PUSH_DATA_INFO_EXT,
|
.sType = VK_STRUCTURE_TYPE_PUSH_DATA_INFO_EXT,
|
||||||
// Where the rewritten raygen reads the TLAS address: 0 when
|
// Where the rewritten raygen reads the TLAS address: 0 when
|
||||||
// VulkanShader synthesized a fresh block, or the offset of
|
// VulkanShader synthesized a fresh block, or the offset of
|
||||||
// the member it appended to the shader's existing block.
|
// the member it appended to the shader's existing block.
|
||||||
.offset = Device::workaroundTlasPushOffset,
|
// Tracked per-pipeline (copied from the shader table) so a
|
||||||
|
// later-loaded shader can't clobber it.
|
||||||
|
.offset = pipeline->workaroundTlasPushOffset,
|
||||||
.data = { .address = &tlasAddr, .size = sizeof(tlasAddr) },
|
.data = { .address = &tlasAddr, .size = sizeof(tlasAddr) },
|
||||||
};
|
};
|
||||||
Device::vkCmdPushDataEXT(cmd, &pushInfo);
|
Device::vkCmdPushDataEXT(cmd, &pushInfo);
|
||||||
|
|
|
||||||
|
|
@ -33,10 +33,22 @@ export namespace Crafter {
|
||||||
class ShaderBindingTableVulkan {
|
class ShaderBindingTableVulkan {
|
||||||
public:
|
public:
|
||||||
std::vector<VkPipelineShaderStageCreateInfo> shaderStages;
|
std::vector<VkPipelineShaderStageCreateInfo> shaderStages;
|
||||||
|
// NVIDIA descriptor-heap AS-read workaround (issue #15 / #7): true when
|
||||||
|
// any stage in this table reads an acceleration structure and was
|
||||||
|
// rewritten to fetch the TLAS address from a push constant, with the
|
||||||
|
// byte offset that stage expects it at. PipelineRTVulkan copies these so
|
||||||
|
// RTPass can push the address without consulting a clobber-prone global.
|
||||||
|
// Both inert (false/0) on every other driver.
|
||||||
|
bool workaroundNeedsTlas = false;
|
||||||
|
std::uint32_t workaroundTlasPushOffset = 0;
|
||||||
void Init(const std::span<const VulkanShader> shaders) {
|
void Init(const std::span<const VulkanShader> shaders) {
|
||||||
shaderStages.reserve(shaders.size());
|
shaderStages.reserve(shaders.size());
|
||||||
for(const VulkanShader& shader: shaders) {
|
for(const VulkanShader& shader: shaders) {
|
||||||
shaderStages.emplace_back(VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, nullptr, 0, shader.stage, shader.shader, shader.entrypoint.c_str(), shader.specilizationInfo);
|
shaderStages.emplace_back(VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, nullptr, 0, shader.stage, shader.shader, shader.entrypoint.c_str(), shader.specilizationInfo);
|
||||||
|
if (shader.patchedAS) {
|
||||||
|
workaroundNeedsTlas = true;
|
||||||
|
workaroundTlasPushOffset = shader.tlasPushOffset;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
|
||||||
|
|
@ -52,8 +52,12 @@ import :Types;
|
||||||
// append a ulong member (the TLAS address) to the *existing* block and read
|
// append a ulong member (the TLAS address) to the *existing* block and read
|
||||||
// from there; only shaders with no push constant of their own get a freshly
|
// from there; only shaders with no push constant of their own get a freshly
|
||||||
// synthesized single-member block. Its byte offset is the offset of that
|
// synthesized single-member block. Its byte offset is the offset of that
|
||||||
// member (published via Crafter::Device::workaroundTlasPushOffset) which RTPass feeds to
|
// member, returned in PatchResult::tlasPushOffset so the caller (RTPass for the
|
||||||
// vkCmdPushDataEXT so the address lands where the rewritten load reads it.
|
// RT pipeline, ComputeShader::Dispatch for a compute pipeline) can feed it to
|
||||||
|
// vkCmdPushDataEXT — landing the address exactly where the rewritten load reads
|
||||||
|
// it. The offset is per-shader rather than a global: a global is clobbered by
|
||||||
|
// whichever shader was patched last and so cannot serve several shaders whose
|
||||||
|
// push-constant layouts differ.
|
||||||
//
|
//
|
||||||
// Exported so tests/PushConstantRewrite can drive Patch() over real compiled
|
// Exported so tests/PushConstantRewrite can drive Patch() over real compiled
|
||||||
// SPIR-V and check the result with spirv-val; nothing in the engine calls it
|
// SPIR-V and check the result with spirv-val; nothing in the engine calls it
|
||||||
|
|
@ -87,15 +91,24 @@ export namespace WorkaroundNvidiaAS {
|
||||||
return (v + a - 1u) & ~(a - 1u);
|
return (v + a - 1u) & ~(a - 1u);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void Patch(std::vector<std::uint32_t>& words) {
|
// Outcome of patching one shader module. `patched` is true only when the
|
||||||
if (words.size() < 5) return; // not a SPIR-V module we understand.
|
// shader read an acceleration structure and was rewritten; `tlasPushOffset`
|
||||||
|
// is then the byte offset of the TLAS-address member in the (possibly
|
||||||
|
// pre-existing) push-constant block the caller must write.
|
||||||
|
// Marked [[nodiscard]]: a caller that drops the result never learns whether
// (or where) it must push the TLAS address, so discarding it should at least
// produce a compiler warning.
struct [[nodiscard]] PatchResult {
    bool patched = false;              // true only when the module was rewritten
    std::uint32_t tlasPushOffset = 0;  // byte offset of the TLAS-address member; meaningful when patched
};
|
||||||
|
|
||||||
|
inline PatchResult Patch(std::vector<std::uint32_t>& words) {
|
||||||
|
if (words.size() < 5) return {}; // not a SPIR-V module we understand.
|
||||||
|
|
||||||
// Split header (5 words) from the instruction stream.
|
// Split header (5 words) from the instruction stream.
|
||||||
std::uint32_t bound = words[3];
|
std::uint32_t bound = words[3];
|
||||||
std::vector<Instr> instrs;
|
std::vector<Instr> instrs;
|
||||||
for (std::size_t i = 5; i < words.size();) {
|
for (std::size_t i = 5; i < words.size();) {
|
||||||
std::uint32_t len = words[i] >> 16;
|
std::uint32_t len = words[i] >> 16;
|
||||||
if (len == 0 || i + len > words.size()) return; // malformed — bail.
|
if (len == 0 || i + len > words.size()) return {}; // malformed — bail.
|
||||||
instrs.emplace_back(words.begin() + i, words.begin() + i + len);
|
instrs.emplace_back(words.begin() + i, words.begin() + i + len);
|
||||||
i += len;
|
i += len;
|
||||||
}
|
}
|
||||||
|
|
@ -163,7 +176,10 @@ export namespace WorkaroundNvidiaAS {
|
||||||
if (op == 54 /*OpFunction*/ && firstFuncIdx == instrs.size()) firstFuncIdx = k;
|
if (op == 54 /*OpFunction*/ && firstFuncIdx == instrs.size()) firstFuncIdx = k;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (asTypeId == 0) return; // shader never reads an acceleration structure.
|
if (asTypeId == 0) return {}; // shader never reads an acceleration structure.
|
||||||
|
|
||||||
|
// Set on whichever path runs below; returned to the caller.
|
||||||
|
std::uint32_t tlasPushOffset = 0;
|
||||||
|
|
||||||
auto newId = [&] { return bound++; };
|
auto newId = [&] { return bound++; };
|
||||||
auto mk = [](std::initializer_list<std::uint32_t> ops) {
|
auto mk = [](std::initializer_list<std::uint32_t> ops) {
|
||||||
|
|
@ -230,7 +246,7 @@ export namespace WorkaroundNvidiaAS {
|
||||||
pcVarId = existingPcVarId;
|
pcVarId = existingPcVarId;
|
||||||
const Instr* structInstr = typeInstr[existingPcStructId];
|
const Instr* structInstr = typeInstr[existingPcStructId];
|
||||||
memberIdx = static_cast<std::uint32_t>(structInstr->size() - 2);
|
memberIdx = static_cast<std::uint32_t>(structInstr->size() - 2);
|
||||||
Crafter::Device::workaroundTlasPushOffset = AlignUp(footprint(existingPcStructId), 8);
|
tlasPushOffset = AlignUp(footprint(existingPcStructId), 8);
|
||||||
|
|
||||||
ptrPushUlongId = existingPtrUlongId;
|
ptrPushUlongId = existingPtrUlongId;
|
||||||
if (ptrPushUlongId == 0) {
|
if (ptrPushUlongId == 0) {
|
||||||
|
|
@ -247,7 +263,7 @@ export namespace WorkaroundNvidiaAS {
|
||||||
memberIdxConstId = newId();
|
memberIdxConstId = newId();
|
||||||
typeDefs.push_back(mk({OpConstant, uintTypeId, memberIdxConstId, memberIdx}));
|
typeDefs.push_back(mk({OpConstant, uintTypeId, memberIdxConstId, memberIdx}));
|
||||||
}
|
}
|
||||||
decorations.push_back(mk({OpMemberDecorate, existingPcStructId, memberIdx, DecorationOffset, Crafter::Device::workaroundTlasPushOffset}));
|
decorations.push_back(mk({OpMemberDecorate, existingPcStructId, memberIdx, DecorationOffset, tlasPushOffset}));
|
||||||
} else {
|
} else {
|
||||||
// No user push constant — synthesize a fresh single-member block.
|
// No user push constant — synthesize a fresh single-member block.
|
||||||
if (uintZeroId == 0) { uintZeroId = newId(); typeDefs.push_back(mk({OpConstant, uintTypeId, uintZeroId, 0})); }
|
if (uintZeroId == 0) { uintZeroId = newId(); typeDefs.push_back(mk({OpConstant, uintTypeId, uintZeroId, 0})); }
|
||||||
|
|
@ -262,7 +278,7 @@ export namespace WorkaroundNvidiaAS {
|
||||||
decorations.push_back(mk({OpMemberDecorate, pcStructId, 0, DecorationOffset, 0}));
|
decorations.push_back(mk({OpMemberDecorate, pcStructId, 0, DecorationOffset, 0}));
|
||||||
decorations.push_back(mk({OpDecorate, pcStructId, DecorationBlock}));
|
decorations.push_back(mk({OpDecorate, pcStructId, DecorationBlock}));
|
||||||
memberIdxConstId = uintZeroId;
|
memberIdxConstId = uintZeroId;
|
||||||
Crafter::Device::workaroundTlasPushOffset = 0;
|
tlasPushOffset = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Rewrite each `OpLoad %asType <ptr>` into address-load + convert, and
|
// ── Rewrite each `OpLoad %asType <ptr>` into address-load + convert, and
|
||||||
|
|
@ -327,6 +343,8 @@ export namespace WorkaroundNvidiaAS {
|
||||||
out[3] = bound;
|
out[3] = bound;
|
||||||
for (const Instr& in : instrs) out.insert(out.end(), in.begin(), in.end());
|
for (const Instr& in : instrs) out.insert(out.end(), in.begin(), in.end());
|
||||||
words.swap(out);
|
words.swap(out);
|
||||||
|
|
||||||
|
return {true, tlasPushOffset};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// ─── END NVIDIA descriptor-heap AS-read workaround ────────────────────────
|
// ─── END NVIDIA descriptor-heap AS-read workaround ────────────────────────
|
||||||
|
|
@ -339,6 +357,15 @@ export namespace Crafter {
|
||||||
VkShaderStageFlagBits stage;
|
VkShaderStageFlagBits stage;
|
||||||
std::string entrypoint;
|
std::string entrypoint;
|
||||||
VkShaderModule shader;
|
VkShaderModule shader;
|
||||||
|
// NVIDIA descriptor-heap AS-read workaround (issue #15 / #7): set when
|
||||||
|
// this module read an acceleration structure and was rewritten to fetch
|
||||||
|
// the TLAS device address from a push constant. `tlasPushOffset` is the
|
||||||
|
// byte offset of that member, which whoever records the dispatch
|
||||||
|
// (RTPass / ComputeShader) must write with vkCmdPushDataEXT. Per-shader
|
||||||
|
// rather than a global because each shader's push-constant layout — and
|
||||||
|
// therefore the offset — can differ. Both false/0 on every other driver.
|
||||||
|
bool patchedAS = false;
|
||||||
|
std::uint32_t tlasPushOffset = 0;
|
||||||
VulkanShader(const std::filesystem::path& path, std::string entrypoint, VkShaderStageFlagBits stage, VkSpecializationInfo* specilizationInfo) : stage(stage), entrypoint(entrypoint), specilizationInfo(specilizationInfo) {
|
VulkanShader(const std::filesystem::path& path, std::string entrypoint, VkShaderStageFlagBits stage, VkSpecializationInfo* specilizationInfo) : stage(stage), entrypoint(entrypoint), specilizationInfo(specilizationInfo) {
|
||||||
std::ifstream file(path, std::ios::binary);
|
std::ifstream file(path, std::ios::binary);
|
||||||
if (!file) {
|
if (!file) {
|
||||||
|
|
@ -364,7 +391,9 @@ export namespace Crafter {
|
||||||
// acceleration structure. Remove with the rest of the workaround
|
// acceleration structure. Remove with the rest of the workaround
|
||||||
// once a fixed NVIDIA driver ships.
|
// once a fixed NVIDIA driver ships.
|
||||||
if (Device::workaroundDescriptorHeapAS) {
|
if (Device::workaroundDescriptorHeapAS) {
|
||||||
WorkaroundNvidiaAS::Patch(spirv);
|
WorkaroundNvidiaAS::PatchResult patch = WorkaroundNvidiaAS::Patch(spirv);
|
||||||
|
patchedAS = patch.patched;
|
||||||
|
tlasPushOffset = patch.tlasPushOffset;
|
||||||
}
|
}
|
||||||
|
|
||||||
VkShaderModuleCreateInfo module_info{VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO};
|
VkShaderModuleCreateInfo module_info{VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO};
|
||||||
|
|
|
||||||
|
|
@ -29,7 +29,13 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
// them through the real Patch(), and asserts with spirv-val that the result is
|
// them through the real Patch(), and asserts with spirv-val that the result is
|
||||||
// valid and contains exactly one push-constant variable — both for shaders
|
// valid and contains exactly one push-constant variable — both for shaders
|
||||||
// that already have a push constant (merge path) and for those that don't
|
// that already have a push constant (merge path) and for those that don't
|
||||||
// (synthesize path). It also checks the published TLAS push-constant offset.
|
// (synthesize path). It also checks the returned TLAS push-constant offset.
|
||||||
|
//
|
||||||
|
// It additionally covers ray-querying *compute* shaders (issue #21): the
|
||||||
|
// rewrite is stage-agnostic, and ComputeShader::Dispatch now pushes the TLAS
|
||||||
|
// address at the per-shader offset Patch returns, so a compute shader that
|
||||||
|
// reads an acceleration structure through the descriptor heap must be patched
|
||||||
|
// and report a correct offset exactly like a raygen does.
|
||||||
//
|
//
|
||||||
// Delete this test together with the rest of the workaround once a fixed NVIDIA
|
// Delete this test together with the rest of the workaround once a fixed NVIDIA
|
||||||
// driver ships.
|
// driver ships.
|
||||||
|
|
@ -89,7 +95,7 @@ struct Case {
|
||||||
std::string_view glsl;
|
std::string_view glsl;
|
||||||
bool readsAccelStruct; // whether Patch should rewrite anything
|
bool readsAccelStruct; // whether Patch should rewrite anything
|
||||||
bool hasExistingPushConst; // whether the source already declares a push block
|
bool hasExistingPushConst; // whether the source already declares a push block
|
||||||
std::uint32_t expectedOffset; // expected Device::workaroundTlasPushOffset (only checked when readsAccelStruct)
|
std::uint32_t expectedOffset; // expected PatchResult::tlasPushOffset (only checked when readsAccelStruct)
|
||||||
};
|
};
|
||||||
|
|
||||||
// Shared raygen scaffolding: a heap AS + heap image, traced and stored to.
|
// Shared raygen scaffolding: a heap AS + heap image, traced and stored to.
|
||||||
|
|
@ -156,6 +162,110 @@ std::string BuildSource(const Case& c) {
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Compute counterpart of the raygen cases (issue #21): a shader that ray-queries
|
||||||
|
// the heap TLAS via rayQueryEXT. Shares the offset math with the raygen merge
|
||||||
|
// path, so we only need one merge case and one synthesize case to prove compute
|
||||||
|
// stages are handled identically.
|
||||||
|
// One ray-querying compute-shader scenario.
struct ComputeCase {
    std::string_view name;         // case label
    std::string_view glsl;         // optional push-constant declaration
    bool hasExistingPushConst;     // whether `glsl` declares a push block named `pc`
    std::uint32_t expectedOffset;  // expected PatchResult::tlasPushOffset
};

constexpr std::array<ComputeCase, 2> kComputeCases = {{
    // No push constant → fresh single-member block synthesized at offset 0.
    { "compute-no-push", std::string_view{""}, false, 0 },
    // Existing block {uint f; @0}; ends at 4, TLAS rounds up to the next 8.
    { "compute-merge-uint",
      std::string_view{"layout(push_constant) uniform PC { uint f; } pc;\n"}, true, 8 },
}};

// Builds the GLSL source of a compute shader that ray-queries topLevelAS[0]
// from the descriptor heap and stores one value to image[0]. The case's own
// push-constant declaration (if any) is spliced in before main(); when the
// case has a push block, the stored value reads pc.f, otherwise it is the
// literal 1.0.
std::string BuildComputeSource(const ComputeCase& c) {
    std::string s =
        "#version 460\n"
        "#extension GL_EXT_ray_query : enable\n"
        "#extension GL_EXT_shader_image_load_formatted : enable\n"
        "#extension GL_EXT_descriptor_heap : enable\n"
        "#extension GL_EXT_nonuniform_qualifier : enable\n"
        "layout(descriptor_heap) uniform accelerationStructureEXT topLevelAS[];\n"
        "layout(descriptor_heap) uniform writeonly image2D image[];\n";
    s += c.glsl;
    s += "layout(local_size_x = 64) in;\n"
         "void main() {\n"
         " vec3 origin = vec3(0.0);\n"
         " vec3 dir = vec3(0.0, 0.0, 1.0);\n"
         " rayQueryEXT rq;\n"
         " rayQueryInitializeEXT(rq, topLevelAS[0], gl_RayFlagsNoneEXT, 0xFF, origin, 0.001, dir, 10000.0);\n"
         " while (rayQueryProceedEXT(rq)) {}\n";

    // Reference the user's push member when there is one.
    const std::string_view storedValue = c.hasExistingPushConst ? "float(pc.f)" : "1.0";
    s += " imageStore(image[0], ivec2(gl_GlobalInvocationID.xy), vec4(";
    s += storedValue;
    s += "));\n";
    s += "}\n";
    return s;
}
|
||||||
|
|
||||||
|
// Runs one end-to-end regression case for WorkaroundNvidiaAS::Patch().
//
// Writes `source` into `dir`, compiles it with glslang for shader stage
// `stage`, applies Patch() to the compiled SPIR-V, writes the patched module
// back out, and then checks, in this order:
//   1. spirv-val accepts the patched module,
//   2. exactly one push-constant variable remains,
//   3. Patch's `patched` flag equals `readsAccelStruct`,
//   4. if `readsAccelStruct` is set, Patch's `tlasPushOffset` equals
//      `expectedOffset`.
// Every failure is reported on stderr, prefixed with the case `name`.
// Returns true only when all steps succeed.
bool RunCase(const fs::path& dir, std::string_view name, std::string_view stage,
             const std::string& source, bool readsAccelStruct,
             std::uint32_t expectedOffset) {
    const std::string caseName(name);
    const std::string stageName(stage);

    const fs::path glslFile    = dir / (caseName + "." + stageName + ".glsl");
    const fs::path spirvFile   = dir / (caseName + ".spv");
    const fs::path patchedFile = dir / (caseName + ".patched.spv");

    // Scoped so the stream is closed before glslang reads the file.
    {
        std::ofstream out(glslFile);
        out << source;
    }

    std::string compileCmd = "glslang --target-env vulkan1.4 -V -S ";
    compileCmd += stageName;
    compileCmd += " \"";
    compileCmd += glslFile.string();
    compileCmd += "\" -o \"";
    compileCmd += spirvFile.string();
    compileCmd += "\" > /dev/null";
    if (RunCommand(compileCmd) != 0) {
        std::println(std::cerr, "[{}] glslang failed to compile the source shader", name);
        return false;
    }

    // Fewer than five words cannot even hold the SPIR-V module header.
    std::vector<std::uint32_t> spirv = ReadSpirv(spirvFile);
    if (spirv.size() < 5) {
        std::println(std::cerr, "[{}] could not read compiled SPIR-V", name);
        return false;
    }

    // Patch() rewrites `spirv` in place and reports what it did per shader.
    WorkaroundNvidiaAS::PatchResult result = WorkaroundNvidiaAS::Patch(spirv);
    WriteSpirv(patchedFile, spirv);

    // Check 1: the patched module must pass spirv-val under the engine's flags.
    std::string validateCmd = "spirv-val \"";
    validateCmd += patchedFile.string();
    validateCmd += "\" --relax-block-layout --scalar-block-layout --target-env vulkan1.4";
    if (RunCommand(validateCmd) != 0) {
        std::println(std::cerr, "[{}] spirv-val rejected the patched module", name);
        return false;
    }

    // Check 2: exactly one push-constant variable — the whole point of issue #18.
    int pushConstantVars = CountPushConstantVariables(spirv);
    if (pushConstantVars != 1) {
        std::println(std::cerr, "[{}] expected exactly 1 push-constant variable, found {}", name, pushConstantVars);
        return false;
    }

    // Check 3: Patch must report a rewrite exactly when the shader reads an AS.
    if (result.patched != readsAccelStruct) {
        std::println(std::cerr, "[{}] expected patched={}, got {}", name, readsAccelStruct, result.patched);
        return false;
    }

    // Check 4: the returned TLAS offset must match the expected layout end.
    const bool offsetMismatch = readsAccelStruct && result.tlasPushOffset != expectedOffset;
    if (offsetMismatch) {
        std::println(std::cerr, "[{}] expected TLAS push offset {}, got {}",
                     name, expectedOffset, result.tlasPushOffset);
        return false;
    }

    // The offset is only meaningful for shaders that read an AS; report 0 otherwise.
    const auto reportedOffset = readsAccelStruct ? result.tlasPushOffset : 0u;
    std::println(std::cout, "[{}] ok (push-constant vars: {}, tlas offset: {})",
                 name, pushConstantVars, reportedOffset);
    return true;
}
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
|
|
@ -165,58 +275,14 @@ int main() {
|
||||||
|
|
||||||
int failures = 0;
|
int failures = 0;
|
||||||
for (const Case& c : kCases) {
|
for (const Case& c : kCases) {
|
||||||
const fs::path glslPath = dir / (std::string(c.name) + ".rgen.glsl");
|
if (!RunCase(dir, c.name, "rgen", BuildSource(c), c.readsAccelStruct, c.expectedOffset))
|
||||||
const fs::path spvPath = dir / (std::string(c.name) + ".spv");
|
|
||||||
const fs::path patched = dir / (std::string(c.name) + ".patched.spv");
|
|
||||||
|
|
||||||
{ std::ofstream f(glslPath); f << BuildSource(c); }
|
|
||||||
|
|
||||||
std::string compile = "glslang --target-env vulkan1.4 -V -S rgen \""
|
|
||||||
+ glslPath.string() + "\" -o \"" + spvPath.string() + "\" > /dev/null";
|
|
||||||
if (RunCommand(compile) != 0) {
|
|
||||||
std::println(std::cerr, "[{}] glslang failed to compile the source shader", c.name);
|
|
||||||
++failures;
|
++failures;
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
// Ray-querying compute shaders (issue #21) — must be patched and report a
|
||||||
std::vector<std::uint32_t> words = ReadSpirv(spvPath);
|
// correct per-shader offset just like the raygen cases above.
|
||||||
if (words.size() < 5) {
|
for (const ComputeCase& c : kComputeCases) {
|
||||||
std::println(std::cerr, "[{}] could not read compiled SPIR-V", c.name);
|
if (!RunCase(dir, c.name, "comp", BuildComputeSource(c), /*readsAccelStruct=*/true, c.expectedOffset))
|
||||||
++failures;
|
++failures;
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
Device::workaroundTlasPushOffset = 0xDEADBEEFu; // poison so we know Patch set it
|
|
||||||
WorkaroundNvidiaAS::Patch(words);
|
|
||||||
WriteSpirv(patched, words);
|
|
||||||
|
|
||||||
// 1. The patched module must pass spirv-val under the engine's flags.
|
|
||||||
std::string validate = "spirv-val \"" + patched.string()
|
|
||||||
+ "\" --relax-block-layout --scalar-block-layout --target-env vulkan1.4";
|
|
||||||
if (RunCommand(validate) != 0) {
|
|
||||||
std::println(std::cerr, "[{}] spirv-val rejected the patched module", c.name);
|
|
||||||
++failures;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 2. Exactly one push-constant variable — the whole point of issue #18.
|
|
||||||
int pcVars = CountPushConstantVariables(words);
|
|
||||||
if (pcVars != 1) {
|
|
||||||
std::println(std::cerr, "[{}] expected exactly 1 push-constant variable, found {}", c.name, pcVars);
|
|
||||||
++failures;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// 3. The TLAS offset Patch published must match the expected layout end.
|
|
||||||
if (c.readsAccelStruct && Device::workaroundTlasPushOffset != c.expectedOffset) {
|
|
||||||
std::println(std::cerr, "[{}] expected TLAS push offset {}, got {}",
|
|
||||||
c.name, c.expectedOffset, Device::workaroundTlasPushOffset);
|
|
||||||
++failures;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::println(std::cout, "[{}] ok (push-constant vars: {}, tlas offset: {})",
|
|
||||||
c.name, pcVars, c.readsAccelStruct ? Device::workaroundTlasPushOffset : 0u);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (failures != 0) {
|
if (failures != 0) {
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue