/* Crafter®.Graphics Copyright (C) 2026 Catcrafts® catcrafts.net This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License version 3.0 as published by the Free Software Foundation; This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ module; #ifndef CRAFTER_GRAPHICS_WINDOW_DOM #include "vulkan/vulkan.h" #endif // !CRAFTER_GRAPHICS_WINDOW_DOM export module Crafter.Graphics:ShaderVulkan; #ifndef CRAFTER_GRAPHICS_WINDOW_DOM import std; import :Device; import :Types; // ─── BEGIN NVIDIA descriptor-heap AS-read workaround (issue #15 / #7) ───── // Remove this whole block (and its call below, Device::workaroundDescriptorHeapAS, // and the RTPass push-data) once NVIDIA ships a driver that fixes the // VK_EXT_descriptor_heap acceleration-structure read fault. // // On the affected driver, reading an `accelerationStructureEXT` out of the // descriptor heap aborts the device. The build, the heap descriptor write and // everything else are correct (proven in #7); only the in-shader heap AS read // is broken — buffers/images through the same heap work. Acceleration // structures can equally be addressed by their device address, and // OpConvertUToAccelerationStructureKHR (which reads no descriptor) sidesteps // the faulting path entirely. // // glslang has no GLSL spelling for that conversion, so we rewrite the compiled // SPIR-V at module-load time: every `OpLoad %accelStruct ` becomes a // load of the TLAS device address from a push-constant block followed by // OpConvertUToAccelerationStructureKHR. RTPass pushes the active frame's TLAS // address into that push constant. Shaders that never touch an acceleration // structure (no OpTypeAccelerationStructureKHR) are left untouched. // // SPIR-V allows at most one push-constant variable per entry point, so we never // add a second one: if the shader already declares a push-constant block we // append a ulong member (the TLAS address) to the *existing* block and read // from there; only shaders with no push constant of their own get a freshly // synthesized single-member block. Its byte offset is the offset of that // member, returned in PatchResult::tlasPushOffset so the caller (RTPass for the // RT pipeline, ComputeShader::Dispatch for a compute pipeline) can feed it to // vkCmdPushDataEXT — landing the address exactly where the rewritten load reads // it. The offset is per-shader rather than a global: a global is clobbered by // whichever shader was patched last and so cannot serve several shaders whose // push-constant layouts differ. // // Exported so tests/PushConstantRewrite can drive Patch() over real compiled // SPIR-V and check the result with spirv-val; nothing in the engine calls it // from outside this file. Goes away with the rest of the workaround. export namespace WorkaroundNvidiaAS { // SPIR-V numeric opcodes / enums used below. enum : std::uint32_t { OpEntryPoint = 15, OpCapability = 17, OpTypeInt = 21, OpTypeFloat = 22, OpTypeVector = 23, OpTypeMatrix = 24, OpTypeArray = 28, OpTypeStruct = 30, OpTypePointer = 32, OpConstant = 43, OpVariable = 59, OpLoad = 61, OpAccessChain = 65, OpDecorate = 71, OpMemberDecorate = 72, OpConvertUToAccelerationStructureKHR = 4447, OpTypeAccelerationStructureKHR = 5341, CapabilityInt64 = 11, StorageClassPushConstant = 9, DecorationBlock = 2, DecorationMatrixStride = 7, DecorationArrayStride = 6, DecorationOffset = 35, }; inline bool IsAnnotation(std::uint32_t op) { // OpDecorate/OpMemberDecorate/OpDecorationGroup/OpGroupDecorate/ // OpGroupMemberDecorate/OpDecorateId/OpDecorate(Member)String. return op == 71 || op == 72 || op == 73 || op == 74 || op == 75 || op == 332 || op == 5632 || op == 5633; } using Instr = std::vector; inline std::uint32_t AlignUp(std::uint32_t v, std::uint32_t a) { return (v + a - 1u) & ~(a - 1u); } // Outcome of patching one shader module. `patched` is true only when the // shader read an acceleration structure and was rewritten; `tlasPushOffset` // is then the byte offset of the TLAS-address member in the (possibly // pre-existing) push-constant block the caller must write. struct PatchResult { bool patched = false; std::uint32_t tlasPushOffset = 0; }; inline PatchResult Patch(std::vector& words) { if (words.size() < 5) return {}; // not a SPIR-V module we understand. // Split header (5 words) from the instruction stream. std::uint32_t bound = words[3]; std::vector instrs; for (std::size_t i = 5; i < words.size();) { std::uint32_t len = words[i] >> 16; if (len == 0 || i + len > words.size()) return {}; // malformed — bail. instrs.emplace_back(words.begin() + i, words.begin() + i + len); i += len; } // ── Scan for the AS type, reusable int/long types+constants, any // existing push-constant block, the type/decoration/constant tables // needed to size that block, and the section boundaries to insert into. std::uint32_t asTypeId = 0, ulongTypeId = 0, uintTypeId = 0, uintZeroId = 0; std::uint32_t existingPcVarId = 0, existingPcStructId = 0, existingPtrUlongId = 0; std::size_t lastCapIdx = 0, lastAnnotIdx = 0, firstFuncIdx = instrs.size(); std::size_t entryIdx = instrs.size(); std::map typeInstr; // type-result-id → defining instr std::map constU32; // OpConstant id → 32-bit value std::map uintConstByValue; // uint value → OpConstant id std::map arrayStride; // array type id → ArrayStride std::map memberOffset; // (struct<<32|idx) → Offset std::map memberMatStride; // (struct<<32|idx) → MatrixStride std::map ptrPointee; // pointer type id → pointee type id for (std::size_t k = 0; k < instrs.size(); ++k) { const Instr& in = instrs[k]; std::uint32_t op = in[0] & 0xFFFFu; switch (op) { case OpTypeAccelerationStructureKHR: asTypeId = in[1]; typeInstr[in[1]] = ∈ break; case OpTypeInt: if (in[2] == 64 && in[3] == 0) ulongTypeId = in[1]; else if (in[2] == 32 && in[3] == 0) uintTypeId = in[1]; typeInstr[in[1]] = ∈ break; case OpTypeFloat: case OpTypeVector: case OpTypeMatrix: case OpTypeArray: case OpTypeStruct: typeInstr[in[1]] = ∈ break; case OpTypePointer: typeInstr[in[1]] = ∈ ptrPointee[in[1]] = in[3]; if (in[2] == StorageClassPushConstant && in[3] == ulongTypeId) existingPtrUlongId = in[1]; break; case OpConstant: if (in.size() >= 4) constU32[in[2]] = in[3]; if (uintTypeId && in[1] == uintTypeId && in.size() >= 4) { uintConstByValue.emplace(in[3], in[2]); if (in[3] == 0) uintZeroId = in[2]; } break; case OpVariable: if (in[3] == StorageClassPushConstant) { existingPcVarId = in[2]; existingPcStructId = ptrPointee.count(in[1]) ? ptrPointee[in[1]] : 0; } break; case OpDecorate: if (in.size() >= 4 && in[2] == DecorationArrayStride) arrayStride[in[1]] = in[3]; break; case OpMemberDecorate: { std::uint64_t key = (static_cast(in[1]) << 32) | in[2]; if (in.size() >= 5 && in[3] == DecorationOffset) memberOffset[key] = in[4]; if (in.size() >= 5 && in[3] == DecorationMatrixStride) memberMatStride[key] = in[4]; break; } case OpCapability: lastCapIdx = k; break; case OpEntryPoint: if (entryIdx == instrs.size()) entryIdx = k; break; default: break; } if (IsAnnotation(op)) lastAnnotIdx = k; if (op == 54 /*OpFunction*/ && firstFuncIdx == instrs.size()) firstFuncIdx = k; } if (asTypeId == 0) return {}; // shader never reads an acceleration structure. // Set on whichever path runs below; returned to the caller. std::uint32_t tlasPushOffset = 0; auto newId = [&] { return bound++; }; auto mk = [](std::initializer_list ops) { Instr in(ops); in[0] = static_cast(in.size() << 16) | (in[0] & 0xFFFFu); return in; }; // Byte footprint of a type, honouring the explicit Array/Matrix strides // glslang emits so the result is correct under both scalar and std140 // block layout. Used only to find where an existing push block ends. std::function footprint = [&](std::uint32_t tid) -> std::uint32_t { auto it = typeInstr.find(tid); if (it == typeInstr.end()) return 0; const Instr& t = *it->second; switch (t[0] & 0xFFFFu) { case OpTypeInt: case OpTypeFloat: return t[2] / 8u; case OpTypeVector: return t[3] * footprint(t[2]); case OpTypeMatrix: return t[3] * footprint(t[2]); // cols × column-vec case OpTypeArray: { std::uint32_t len = constU32.count(t[3]) ? constU32[t[3]] : 0; std::uint32_t stride = arrayStride.count(tid) ? arrayStride[tid] : footprint(t[2]); return len * stride; } case OpTypeStruct: { std::uint32_t end = 0; for (std::size_t m = 2; m < t.size(); ++m) { std::uint32_t idx = static_cast(m - 2); std::uint64_t key = (static_cast(t[1]) << 32) | idx; std::uint32_t off = memberOffset.count(key) ? memberOffset[key] : 0; std::uint32_t sz; auto mt = typeInstr.find(t[m]); if (mt != typeInstr.end() && (mt->second->at(0) & 0xFFFFu) == OpTypeMatrix && memberMatStride.count(key)) sz = memberMatStride[key] * (*mt->second)[3]; else sz = footprint(t[m]); end = std::max(end, off + sz); } return end; } case OpTypePointer: return 8; default: return 0; } }; bool merge = existingPcVarId != 0 && existingPcStructId != 0 && typeInstr.count(existingPcStructId) && (typeInstr[existingPcStructId]->at(0) & 0xFFFFu) == OpTypeStruct; // ── Synthesize/ensure the int/long types and constants we need, reusing // any the module already defines (SPIR-V forbids duplicate type defs). std::vector typeDefs; if (uintTypeId == 0) { uintTypeId = newId(); typeDefs.push_back(mk({OpTypeInt, uintTypeId, 32, 0})); } if (ulongTypeId == 0) { ulongTypeId = newId(); typeDefs.push_back(mk({OpTypeInt, ulongTypeId, 64, 0})); } std::uint32_t pcVarId, ptrPushUlongId, memberIdxConstId, memberIdx; std::vector decorations; if (merge) { // Append a ulong member to the user's existing block; read from it. pcVarId = existingPcVarId; const Instr* structInstr = typeInstr[existingPcStructId]; memberIdx = static_cast(structInstr->size() - 2); tlasPushOffset = AlignUp(footprint(existingPcStructId), 8); ptrPushUlongId = existingPtrUlongId; if (ptrPushUlongId == 0) { ptrPushUlongId = newId(); typeDefs.push_back(mk({OpTypePointer, ptrPushUlongId, StorageClassPushConstant, ulongTypeId})); } // Member index constant for the access chain — reuse an existing // uint constant of the right value, else mint one (must be an // integer constant, so only uint-typed ones qualify for reuse). auto found = uintConstByValue.find(memberIdx); if (found != uintConstByValue.end()) { memberIdxConstId = found->second; } else { memberIdxConstId = newId(); typeDefs.push_back(mk({OpConstant, uintTypeId, memberIdxConstId, memberIdx})); } decorations.push_back(mk({OpMemberDecorate, existingPcStructId, memberIdx, DecorationOffset, tlasPushOffset})); } else { // No user push constant — synthesize a fresh single-member block. if (uintZeroId == 0) { uintZeroId = newId(); typeDefs.push_back(mk({OpConstant, uintTypeId, uintZeroId, 0})); } std::uint32_t pcStructId = newId(); std::uint32_t ptrPushStructId = newId(); ptrPushUlongId = newId(); pcVarId = newId(); typeDefs.push_back(mk({OpTypeStruct, pcStructId, ulongTypeId})); typeDefs.push_back(mk({OpTypePointer, ptrPushStructId, StorageClassPushConstant, pcStructId})); typeDefs.push_back(mk({OpTypePointer, ptrPushUlongId, StorageClassPushConstant, ulongTypeId})); typeDefs.push_back(mk({OpVariable, ptrPushStructId, pcVarId, StorageClassPushConstant})); decorations.push_back(mk({OpMemberDecorate, pcStructId, 0, DecorationOffset, 0})); decorations.push_back(mk({OpDecorate, pcStructId, DecorationBlock})); memberIdxConstId = uintZeroId; tlasPushOffset = 0; } // ── Rewrite each `OpLoad %asType ` into address-load + convert, and // (when merging) append the ulong member to the existing struct type. std::vector rebuilt; rebuilt.reserve(instrs.size() + 8); for (Instr in : instrs) { std::uint32_t op = in[0] & 0xFFFFu; if (op == OpLoad && in[1] == asTypeId) { std::uint32_t resultId = in[2]; std::uint32_t chainId = newId(); std::uint32_t addrId = newId(); rebuilt.push_back(mk({OpAccessChain, ptrPushUlongId, chainId, pcVarId, memberIdxConstId})); rebuilt.push_back(mk({OpLoad, ulongTypeId, addrId, chainId})); rebuilt.push_back(mk({OpConvertUToAccelerationStructureKHR, asTypeId, resultId, addrId})); } else { if (merge && op == OpTypeStruct && in[1] == existingPcStructId) { in.push_back(ulongTypeId); in[0] = static_cast(in.size() << 16) | OpTypeStruct; } rebuilt.push_back(std::move(in)); } } instrs.swap(rebuilt); // Recompute structural anchors (the rewrite above shifted indices). lastCapIdx = 0; lastAnnotIdx = 0; firstFuncIdx = instrs.size(); entryIdx = instrs.size(); std::size_t structIdx = instrs.size(); for (std::size_t k = 0; k < instrs.size(); ++k) { std::uint32_t op = instrs[k][0] & 0xFFFFu; if (op == OpCapability) lastCapIdx = k; if (op == OpEntryPoint && entryIdx == instrs.size()) entryIdx = k; if (IsAnnotation(op)) lastAnnotIdx = k; if (op == 54 && firstFuncIdx == instrs.size()) firstFuncIdx = k; if (merge && op == OpTypeStruct && instrs[k][1] == existingPcStructId) structIdx = k; } // The newly-defined types (notably ulong) must precede every use. When // merging, the user's struct — now carrying the appended ulong member — // already sits in the type section, so the defs go in just before it; // for a fresh block the whole bundle can go at the end of the type // section (right before the first function). std::size_t typeDefsIdx = (merge && structIdx != instrs.size()) ? structIdx : firstFuncIdx; // A freshly synthesized push-constant variable must join the entry // point's interface list (required for SPIR-V ≥ 1.4 — raygen is 1.4). // A merged-into variable is already used, so it is already listed. if (!merge && entryIdx != instrs.size() && words[1] >= 0x00010400u) { instrs[entryIdx].push_back(pcVarId); instrs[entryIdx][0] = static_cast(instrs[entryIdx].size() << 16) | OpEntryPoint; } // Insert highest-index-first so earlier anchors stay valid (typeDefsIdx // ≥ lastAnnotIdx+1 ≥ lastCapIdx+1 in both the merge and synthesize cases). instrs.insert(instrs.begin() + typeDefsIdx, typeDefs.begin(), typeDefs.end()); instrs.insert(instrs.begin() + lastAnnotIdx + 1, decorations.begin(), decorations.end()); instrs.insert(instrs.begin() + lastCapIdx + 1, mk({OpCapability, CapabilityInt64})); // ── Reassemble: header (with updated bound) + instruction stream. std::vector out(words.begin(), words.begin() + 5); out[3] = bound; for (const Instr& in : instrs) out.insert(out.end(), in.begin(), in.end()); words.swap(out); return {true, tlasPushOffset}; } } // ─── END NVIDIA descriptor-heap AS-read workaround ──────────────────────── export namespace Crafter { class VulkanShader { public: std::vector specilizations; VkSpecializationInfo* specilizationInfo; VkShaderStageFlagBits stage; std::string entrypoint; VkShaderModule shader; // NVIDIA descriptor-heap AS-read workaround (issue #15 / #7): set when // this module read an acceleration structure and was rewritten to fetch // the TLAS device address from a push constant. `tlasPushOffset` is the // byte offset of that member, which whoever records the dispatch // (RTPass / ComputeShader) must write with vkCmdPushDataEXT. Per-shader // rather than a global because each shader's push-constant layout — and // therefore the offset — can differ. Both false/0 on every other driver. bool patchedAS = false; std::uint32_t tlasPushOffset = 0; VulkanShader(const std::filesystem::path& path, std::string entrypoint, VkShaderStageFlagBits stage, VkSpecializationInfo* specilizationInfo) : stage(stage), entrypoint(entrypoint), specilizationInfo(specilizationInfo) { std::ifstream file(path, std::ios::binary); if (!file) { std::cerr << "Error: Could not open file " << path << std::endl; } // Move to the end of the file to determine its size file.seekg(0, std::ios::end); std::streamsize size = file.tellg(); file.seekg(0, std::ios::beg); std::vector spirv(size / sizeof(std::uint32_t)); // Read the data into the vector if (!file.read(reinterpret_cast(spirv.data()), size)) { std::cerr << "Error: Could not read data from file" << std::endl; } file.close(); // NVIDIA descriptor-heap AS-read workaround (issue #15 / #7). // No-op on every other driver and on shaders that don't read an // acceleration structure. Remove with the rest of the workaround // once a fixed NVIDIA driver ships. if (Device::workaroundDescriptorHeapAS) { WorkaroundNvidiaAS::PatchResult patch = WorkaroundNvidiaAS::Patch(spirv); patchedAS = patch.patched; tlasPushOffset = patch.tlasPushOffset; } VkShaderModuleCreateInfo module_info{VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO}; module_info.codeSize = spirv.size() * sizeof(uint32_t); module_info.pCode = spirv.data(); Device::CheckVkResult(vkCreateShaderModule(Device::device, &module_info, nullptr, &shader)); } }; } #endif // !CRAFTER_GRAPHICS_WINDOW_DOM