diff --git a/additional/dom-webgpu.js b/additional/dom-webgpu.js
index 625a552..a15142f 100644
--- a/additional/dom-webgpu.js
+++ b/additional/dom-webgpu.js
@@ -1834,9 +1834,18 @@ fn _wfReadRay(i: u32) -> WfRay {
 // PREP — publish indirect args for the upcoming TRACE/SHADE; zero the next
 // buffer's emit counter.
 fn _wfPrep() {
-    let n = _wfCurCount();
-    wfIndirect[0] = (n + 63u) / 64u;
-    wfIndirect[1] = 1u;
+    let n  = _wfCurCount();
+    let wg = (n + 63u) / 64u;
+    // maxComputeWorkgroupsPerDimension defaults to 65535 (Dawn/Firefox), so
+    // a 1-D dispatch of ceil(W*H/64) exceeds it past 4,194,240 rays
+    // (~2560x1640) and WebGPU silently drops the indirect dispatch -> black
+    // screen. Spread across a 2-D grid (gx*gy >= wg, may leave surplus
+    // groups); wfTrace/wfShade rebuild the linear index from num_workgroups.
+    let MAXDIM = 65535u;
+    let gx = min(wg, MAXDIM);
+    let gy = (wg + MAXDIM - 1u) / MAXDIM;   // = 1 when wg <= MAXDIM
+    wfIndirect[0] = gx;
+    wfIndirect[1] = gy;
     wfIndirect[2] = 1u;
     if (wfParams.curIsA == 1u) { atomicStore(&wfCounters[1], 0u); }
     else                       { atomicStore(&wfCounters[0], 0u); }
diff --git a/implementations/Crafter.Graphics-PipelineRTWebGPU.cpp b/implementations/Crafter.Graphics-PipelineRTWebGPU.cpp
index 8dd949d..373249a 100644
--- a/implementations/Crafter.Graphics-PipelineRTWebGPU.cpp
+++ b/implementations/Crafter.Graphics-PipelineRTWebGPU.cpp
@@ -231,13 +231,13 @@ void PipelineRTWebGPU::Init(WebGPUCommandEncoderRef                 /*cmd*/,
     // TRACE — zero user code: pure traversal + intersection. One thread
     // per live ray; writes a HitResult into wfHits[i].
     wgsl += "\n@compute @workgroup_size(64)\n";
-    wgsl += "fn wfTrace(@builtin(global_invocation_id) gid: vec3<u32>) { _wfTrace(gid.x); }\n";
+    wgsl += "fn wfTrace(@builtin(global_invocation_id) gid: vec3<u32>, @builtin(num_workgroups) nwg: vec3<u32>) { _wfTrace(gid.y * nwg.x * 64u + gid.x); }\n";
 
     // SHADE — one thread per live ray; loads the ray + its hit + payload,
     // dispatches to runMiss / runClosestHit, which may rtAccumulate and
     // rtEmitRay continuation/shadow rays into the next buffer.
     wgsl += "\n@compute @workgroup_size(64)\n";
-    wgsl += "fn wfShade(@builtin(global_invocation_id) gid: vec3<u32>) { _wfShade(gid.x); }\n";
+    wgsl += "fn wfShade(@builtin(global_invocation_id) gid: vec3<u32>, @builtin(num_workgroups) nwg: vec3<u32>) { _wfShade(gid.y * nwg.x * 64u + gid.x); }\n";
 
     // RESOLVE — one thread per pixel; runs the user resolve (or identity)
     // over the linear accumulator and stores to the output image.