fix(vulkan-rt): work around NVIDIA descriptor-heap AS-read device-loss (#15)

Reading an acceleration structure through VK_EXT_descriptor_heap aborts
with VK_ERROR_DEVICE_LOST on NVIDIA 610.43.02. This is a driver fault in the
brand-new extension, isolated in #7: engine setup is correct and
validation-clean, images/buffers through the same heap work, and both
traceRayEXT and inline rayQuery fault identically on the AS read.

An acceleration structure can equally be reached by its device address via
OpConvertUToAccelerationStructureKHR, which reads no descriptor and so never
touches the faulting heap path. glslang has no GLSL spelling for that
conversion, so VulkanShader rewrites the compiled SPIR-V at module-load
time: every `OpLoad %accelStruct <heap-ptr>` becomes a load of the TLAS
device address from a synthesized push-constant block followed by the
convert. RTPass pushes the active frame's TLAS address into that push
constant. User GLSL and example code are unchanged; acceleration structures
still bind into the heap normally.

The workaround is gated on Device::workaroundDescriptorHeapAS (true only on
the NVIDIA proprietary driver) and confined to one fenced block in
Crafter.Graphics-ShaderVulkan.cppm plus the RTPass push and the shaderInt64
feature toggle — delete those once a fixed NVIDIA driver ships and the heap
AS read becomes the direct path again.

Verified: VulkanTriangle ray-traces correctly on native NVIDIA (RTX 4090),
validation-layer-clean, no device loss. The SPIR-V rewrite was independently
validated with spirv-val on both the VulkanTriangle and Sponza raygen
modules.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
catbot 2026-06-03 01:59:54 +00:00
commit 950059c86e
7 changed files with 270 additions and 30 deletions

View file

@ -27,6 +27,174 @@ import std;
import :Device;
import :Types;
// ─── BEGIN NVIDIA descriptor-heap AS-read workaround (issue #15 / #7) ─────
// Remove this whole block (and its call below, Device::workaroundDescriptorHeapAS,
// and the RTPass push-data) once NVIDIA ships a driver that fixes the
// VK_EXT_descriptor_heap acceleration-structure read fault.
//
// On the affected driver, reading an `accelerationStructureEXT` out of the
// descriptor heap aborts the device. The build, the heap descriptor write and
// everything else are correct (proven in #7); only the in-shader heap AS read
// is broken — buffers/images through the same heap work. Acceleration
// structures can equally be addressed by their device address, and
// OpConvertUToAccelerationStructureKHR (which reads no descriptor) sidesteps
// the faulting path entirely.
//
// glslang has no GLSL spelling for that conversion, so we rewrite the compiled
// SPIR-V at module-load time: every `OpLoad %accelStruct <heap-ptr>` becomes a
// load of the TLAS device address from a synthesized push-constant block
// followed by OpConvertUToAccelerationStructureKHR. RTPass pushes the active
// frame's TLAS address into that push constant. Shaders that never touch an
// acceleration structure (no OpTypeAccelerationStructureKHR) are left untouched.
namespace WorkaroundNvidiaAS {
    // SPIR-V magic number, opcodes and enumerant values used by the rewrite
    // (numeric values as assigned by the SPIR-V specification).
    enum : std::uint32_t {
        MagicNumber = 0x07230203u,
        OpEntryPoint = 15, OpCapability = 17,
        OpTypeVoid = 19, OpTypeInt = 21, OpTypeStruct = 30, OpTypePointer = 32,
        OpTypeForwardPointer = 39,
        OpConstant = 43, OpFunction = 54, OpVariable = 59, OpLoad = 61, OpAccessChain = 65,
        OpDecorate = 71, OpMemberDecorate = 72, OpDecorationGroup = 73,
        OpGroupDecorate = 74, OpGroupMemberDecorate = 75, OpDecorateId = 332,
        OpConvertUToAccelerationStructureKHR = 4447,
        OpTypeAccelerationStructureKHR = 5341,
        OpDecorateString = 5632, OpMemberDecorateString = 5633,
        CapabilityInt64 = 11,
        StorageClassPushConstant = 9,
        DecorationBlock = 2, DecorationOffset = 35,
    };

    // True for every opcode that belongs to the module's annotation section.
    inline bool IsAnnotation(std::uint32_t op) {
        switch (op) {
            case OpDecorate:
            case OpMemberDecorate:
            case OpDecorationGroup:
            case OpGroupDecorate:
            case OpGroupMemberDecorate:
            case OpDecorateId:
            case OpDecorateString:
            case OpMemberDecorateString:
                return true;
            default:
                return false;
        }
    }

    // True for the core OpType* range and the acceleration-structure type.
    // Only used to locate where the type section starts when the module has
    // no annotation to anchor on.
    inline bool IsTypeDeclaration(std::uint32_t op) {
        return (op >= OpTypeVoid && op <= OpTypeForwardPointer)
            || op == OpTypeAccelerationStructureKHR;
    }

    // One SPIR-V instruction: word 0 is (wordCount << 16) | opcode.
    using Instr = std::vector<std::uint32_t>;

    // Rewrites `words` (a SPIR-V module) in place so that every
    // `OpLoad %accelStruct <ptr>` becomes: access-chain into a synthesized
    // push-constant block { uint64 address; } at offset 0, load of that
    // address, then OpConvertUToAccelerationStructureKHR producing the
    // original result id.
    //
    // `words` is left untouched when it is not a SPIR-V module, is malformed
    // (bad word counts / truncated instructions / sections out of order),
    // declares no acceleration-structure type, or never loads one.
    //
    // NOTE(review): if a shader already declares its own PushConstant
    // variable the synthesized block becomes a second one in the entry-point
    // interface, which Vulkan's SPIR-V environment appears to forbid —
    // confirm no patched shader uses push constants of its own.
    inline void Patch(std::vector<std::uint32_t>& words) {
        constexpr std::size_t kHeaderWords = 5;
        constexpr std::size_t kNone = static_cast<std::size_t>(-1);
        constexpr std::size_t kMaxWordCount = 0xFFFFu;  // word count is a 16-bit field.
        if (words.size() < kHeaderWords || words[0] != MagicNumber) return;

        // ── Split the instruction stream (everything after the header).
        std::vector<Instr> instrs;
        for (std::size_t i = kHeaderWords; i < words.size();) {
            const std::size_t len = words[i] >> 16;
            if (len == 0 || len > words.size() - i) return;  // malformed — bail.
            const auto first = words.begin() + static_cast<std::ptrdiff_t>(i);
            instrs.emplace_back(first, first + static_cast<std::ptrdiff_t>(len));
            i += len;
        }

        // ── Scan for the AS type, reusable int types/constants, and the
        //    section boundaries we need to insert at. Operand reads are
        //    guarded by the instruction's own word count.
        std::uint32_t asTypeId = 0, ulongTypeId = 0, uintTypeId = 0, uintZeroId = 0;
        bool hasInt64Capability = false;
        std::size_t lastCapIdx = kNone, lastAnnotIdx = kNone, firstTypeIdx = kNone;
        std::size_t firstFuncIdx = instrs.size();
        std::vector<std::size_t> entryPoints;
        for (std::size_t k = 0; k < instrs.size(); ++k) {
            const Instr& in = instrs[k];
            const std::uint32_t op = in[0] & 0xFFFFu;
            switch (op) {
                case OpTypeAccelerationStructureKHR:
                    if (in.size() < 2) return;
                    asTypeId = in[1];
                    break;
                case OpTypeInt:
                    if (in.size() < 4) return;
                    if (in[3] == 0) {  // unsigned only
                        if (in[2] == 64) ulongTypeId = in[1];
                        else if (in[2] == 32) uintTypeId = in[1];
                    }
                    break;
                case OpConstant:
                    if (in.size() < 4) return;
                    if (uintTypeId != 0 && in[1] == uintTypeId && in[3] == 0) uintZeroId = in[2];
                    break;
                case OpCapability:
                    if (in.size() < 2) return;
                    lastCapIdx = k;
                    if (in[1] == CapabilityInt64) hasInt64Capability = true;
                    break;
                case OpEntryPoint:
                    entryPoints.push_back(k);
                    break;
                case OpFunction:
                    if (firstFuncIdx == instrs.size()) firstFuncIdx = k;
                    break;
                default:
                    break;
            }
            if (IsAnnotation(op)) lastAnnotIdx = k;
            if (firstTypeIdx == kNone && IsTypeDeclaration(op)) firstTypeIdx = k;
        }
        if (asTypeId == 0) return;  // module declares no acceleration structure.

        // ── Where the synthesized instructions go (indices into `instrs`;
        //    new content is emitted immediately before that index).
        const std::size_t capInsertIdx = (lastCapIdx == kNone) ? 0 : lastCapIdx + 1;
        const std::size_t typeInsertIdx = firstFuncIdx;
        // Decorations follow the last existing annotation; with none present
        // they go right before the first type declaration, which is where the
        // annotation section would end.
        const std::size_t annotInsertIdx =
            (lastAnnotIdx != kNone) ? lastAnnotIdx + 1
          : (firstTypeIdx != kNone) ? firstTypeIdx
          : firstFuncIdx;
        if (capInsertIdx > annotInsertIdx || annotInsertIdx > typeInsertIdx) return;  // unexpected layout.

        std::uint32_t bound = words[3];
        const auto newId = [&bound] { return bound++; };
        const auto mk = [](std::initializer_list<std::uint32_t> ops) {
            Instr in(ops);
            in[0] = static_cast<std::uint32_t>(in.size() << 16) | (in[0] & 0xFFFFu);
            return in;
        };

        // ── Synthesize the types/constants/push-constant we need, reusing any
        //    the module already defines (SPIR-V forbids duplicate type defs).
        std::vector<Instr> typeDefs;
        if (uintTypeId == 0) {
            uintTypeId = newId();
            typeDefs.push_back(mk({OpTypeInt, uintTypeId, 32, 0}));
        }
        if (uintZeroId == 0) {
            uintZeroId = newId();
            typeDefs.push_back(mk({OpConstant, uintTypeId, uintZeroId, 0}));
        }
        if (ulongTypeId == 0) {
            ulongTypeId = newId();
            typeDefs.push_back(mk({OpTypeInt, ulongTypeId, 64, 0}));
        }
        const std::uint32_t pcStructId = newId();
        const std::uint32_t ptrPushStructId = newId();
        const std::uint32_t ptrPushUlongId = newId();
        const std::uint32_t pcVarId = newId();
        typeDefs.push_back(mk({OpTypeStruct, pcStructId, ulongTypeId}));
        typeDefs.push_back(mk({OpTypePointer, ptrPushStructId, StorageClassPushConstant, pcStructId}));
        typeDefs.push_back(mk({OpTypePointer, ptrPushUlongId, StorageClassPushConstant, ulongTypeId}));
        typeDefs.push_back(mk({OpVariable, ptrPushStructId, pcVarId, StorageClassPushConstant}));
        const std::vector<Instr> decorations = {
            mk({OpMemberDecorate, pcStructId, 0, DecorationOffset, 0}),
            mk({OpDecorate, pcStructId, DecorationBlock}),
        };

        // ── SPIR-V ≥ 1.4 requires every referenced global in the entry
        //    point's interface list. Append to every entry point: the list
        //    may be a superset of what a given entry point actually uses.
        if (words[1] >= 0x00010400u) {
            for (const std::size_t idx : entryPoints) {
                Instr& ep = instrs[idx];
                if (ep.size() >= kMaxWordCount) return;  // would overflow the word count.
                ep.push_back(pcVarId);
                ep[0] = static_cast<std::uint32_t>(ep.size() << 16) | OpEntryPoint;
            }
        }

        // ── Single pass: emit the header, splice the synthesized sections in
        //    at their anchors, and rewrite each AS load as we go.
        std::vector<std::uint32_t> out;
        out.reserve(words.size() + 64);
        out.assign(words.begin(), words.begin() + static_cast<std::ptrdiff_t>(kHeaderWords));
        const auto emit = [&out](const Instr& in) { out.insert(out.end(), in.begin(), in.end()); };
        bool rewroteAny = false;
        for (std::size_t k = 0; k <= instrs.size(); ++k) {
            if (k == capInsertIdx && !hasInt64Capability) emit(mk({OpCapability, CapabilityInt64}));
            if (k == annotInsertIdx) for (const Instr& d : decorations) emit(d);
            if (k == typeInsertIdx) for (const Instr& t : typeDefs) emit(t);
            if (k == instrs.size()) break;

            const Instr& in = instrs[k];
            if ((in[0] & 0xFFFFu) == OpLoad) {
                if (in.size() < 4) return;  // truncated OpLoad — bail.
                if (in[1] == asTypeId) {
                    const std::uint32_t resultId = in[2];  // keep the id so later uses still resolve.
                    const std::uint32_t chainId = newId();
                    const std::uint32_t addrId = newId();
                    emit(mk({OpAccessChain, ptrPushUlongId, chainId, pcVarId, uintZeroId}));
                    emit(mk({OpLoad, ulongTypeId, addrId, chainId}));
                    emit(mk({OpConvertUToAccelerationStructureKHR, asTypeId, resultId, addrId}));
                    rewroteAny = true;
                    continue;
                }
            }
            emit(in);
        }
        // No AS read to work around: leave the module byte-for-byte unchanged.
        if (!rewroteAny) return;

        out[3] = bound;  // header id bound now covers the synthesized ids.
        words.swap(out);
    }
}
// ─── END NVIDIA descriptor-heap AS-read workaround ────────────────────────
export namespace Crafter {
class VulkanShader {
public:
@ -54,7 +222,15 @@ export namespace Crafter {
}
file.close();
// NVIDIA descriptor-heap AS-read workaround (issue #15 / #7).
// No-op on every other driver and on shaders that don't read an
// acceleration structure. Remove with the rest of the workaround
// once a fixed NVIDIA driver ships.
if (Device::workaroundDescriptorHeapAS) {
WorkaroundNvidiaAS::Patch(spirv);
}
VkShaderModuleCreateInfo module_info{VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO};
module_info.codeSize = spirv.size() * sizeof(uint32_t);
module_info.pCode = spirv.data();