@@ -181,6 +181,7 @@ int main(int argc, char* argv[])
181
181
// clear the histogram to 0s
182
182
driver->fillBuffer (histogramBuffer.get (),0u ,HistogramBufferSize,0u );
183
183
184
+ constexpr uint32_t kernelSetDescCount = 4u ;
184
185
constexpr auto SharedDescriptorSetDescCount = 5u ;
185
186
core::smart_refctd_ptr<IGPUDescriptorSetLayout> kernelDescriptorSetLayout,sharedDescriptorSetLayout;
186
187
core::smart_refctd_ptr<IGPUPipelineLayout> kernelPipelineLayout,sharedPipelineLayout;
@@ -195,6 +196,7 @@ int main(int argc, char* argv[])
195
196
};
196
197
{
197
198
auto firstKernelFFTShader = driver->createGPUShader (core::make_smart_refctd_ptr<ICPUShader>(R"===(
199
+ #version 450 core
198
200
#define _NBL_GLSL_WORKGROUP_SIZE_ 256
199
201
layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=1, local_size_z=1) in;
200
202
@@ -205,19 +207,33 @@ layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=1, local_size_z=1) i
205
207
layout(set=0, binding=0) uniform sampler2D inputImage;
206
208
#define _NBL_GLSL_EXT_FFT_INPUT_DESCRIPTOR_DEFINED_
207
209
210
+ #include "nbl/builtin/glsl/ext/FFT/parameters_struct.glsl"
211
+ layout(push_constant) uniform PushConstants
212
+ {
213
+ nbl_glsl_ext_FFT_Parameters_t params;
214
+ float kernelScale;
215
+ } pc;
216
+ #define _NBL_GLSL_EXT_FFT_PUSH_CONSTANTS_DEFINED_
217
+ nbl_glsl_ext_FFT_Parameters_t nbl_glsl_ext_FFT_getParameters()
218
+ {
219
+ return pc.params;
220
+ }
221
+ #define _NBL_GLSL_EXT_FFT_GET_PARAMETERS_DEFINED_
222
+
208
223
#include <nbl/builtin/glsl/math/complex.glsl>
209
224
nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(in ivec3 coordinate, in uint channel)
210
225
{
211
226
ivec2 inputImageSize = textureSize(inputImage,0);
212
- vec2 normalizedCoords = (vec2(coordinate.xy)+vec2(0.5f))/(vec2(inputImageSize)*KERNEL_SCALE );
213
- vec4 texelValue = textureLod(inputImage, normalizedCoords+vec2(0.5-0.5/KERNEL_SCALE ), -log2(KERNEL_SCALE ));
227
+ vec2 normalizedCoords = (vec2(coordinate.xy)+vec2(0.5f))/(vec2(inputImageSize)*pc.kernelScale );
228
+ vec4 texelValue = textureLod(inputImage, normalizedCoords+vec2(0.5-0.5/pc.kernelScale ), -log2(pc.kernelScale ));
214
229
return nbl_glsl_complex(texelValue[channel], 0.0f);
215
230
}
216
231
#define _NBL_GLSL_EXT_FFT_GET_PADDED_DATA_DEFINED_
217
232
218
233
#include "nbl/builtin/glsl/ext/FFT/default_compute_fft.comp"
219
234
)===" ));
220
235
auto lastKernelFFTShader = driver->createGPUShader (core::make_smart_refctd_ptr<ICPUShader>(R"===(
236
+ #version 450 core
221
237
#define _NBL_GLSL_WORKGROUP_SIZE_ 256
222
238
layout(local_size_x=_NBL_GLSL_WORKGROUP_SIZE_, local_size_y=1, local_size_z=1) in;
223
239
@@ -240,6 +256,7 @@ layout(set=0, binding=2) writeonly restrict buffer OutputBuffer
240
256
#include "nbl/builtin/glsl/ext/FFT/default_compute_fft.comp"
241
257
)===" ));
242
258
auto kernelNormalizationShader = driver->createGPUShader (core::make_smart_refctd_ptr<ICPUShader>(R"===(
259
+ #version 450 core
243
260
layout(local_size_x=16, local_size_y=16, local_size_z=1) in;
244
261
245
262
#include <nbl/builtin/glsl/ext/FFT/types.glsl>
@@ -260,12 +277,12 @@ layout(push_constant) uniform PushConstants
260
277
261
278
void main()
262
279
{
263
- nbl_glsl_complex value = in_data [nbl_glsl_dot(gl_GlobalInvocationID,pc.strides.xyz)];
280
+ nbl_glsl_complex value = inData [nbl_glsl_dot(gl_GlobalInvocationID,pc.strides.xyz)];
264
281
265
282
// imaginary component will be 0, image shall be positive
266
283
vec3 avg;
267
284
for (uint i=0u; i<3u; i++)
268
- avg[i] = in_data [pc.strides.z*i].x;
285
+ avg[i] = inData [pc.strides.z*i].x;
269
286
const float power = (nbl_glsl_scRGBtoXYZ*avg).y;
270
287
271
288
const uvec2 coord = bitfieldReverse(gl_GlobalInvocationID.xy)>>pc.bitreverse_shift.xy;
@@ -295,7 +312,6 @@ void main()
295
312
}
296
313
};
297
314
auto sampler = driver->createGPUSampler (std::move (params));
298
- constexpr uint32_t kernelSetDescCount = 4u ;
299
315
IGPUDescriptorSetLayout::SBinding binding[kernelSetDescCount] = {
300
316
{0u ,EDT_COMBINED_IMAGE_SAMPLER,1u ,IGPUSpecializedShader::ESS_COMPUTE,&sampler},
301
317
{1u ,EDT_STORAGE_BUFFER,1u ,IGPUSpecializedShader::ESS_COMPUTE,nullptr },
@@ -306,13 +322,13 @@ void main()
306
322
}
307
323
308
324
{
309
- SPushConstantRange pcRange[1 ] = {IGPUSpecializedShader::ESS_COMPUTE,0u ,core::max (sizeof (FFTClass::Parameters_t),sizeof (NormalizationPushConstants))};
325
+ SPushConstantRange pcRange[1 ] = {IGPUSpecializedShader::ESS_COMPUTE,0u ,core::max (sizeof (FFTClass::Parameters_t)+ sizeof ( float ) ,sizeof (NormalizationPushConstants))};
310
326
kernelPipelineLayout = driver->createGPUPipelineLayout (pcRange,pcRange+1u ,core::smart_refctd_ptr (kernelDescriptorSetLayout));
311
327
}
312
328
313
- firstKernelFFTPipeline = driver->createGPUComputePipeline (nullptr ,core::smart_refctd_ptr (sharedPipelineLayout ),std::move (firstKernelFFTSpecializedShader));
314
- lastKernelFFTPipeline = driver->createGPUComputePipeline (nullptr ,core::smart_refctd_ptr (sharedPipelineLayout ),std::move (lastKernelFFTSpecializedShader));
315
- kernelNormalizationPipeline = driver->createGPUComputePipeline (nullptr ,core::smart_refctd_ptr (sharedPipelineLayout ),std::move (kernelNormalizationSpecializedShader));
329
+ firstKernelFFTPipeline = driver->createGPUComputePipeline (nullptr ,core::smart_refctd_ptr (kernelPipelineLayout ),std::move (firstKernelFFTSpecializedShader));
330
+ lastKernelFFTPipeline = driver->createGPUComputePipeline (nullptr ,core::smart_refctd_ptr (kernelPipelineLayout ),std::move (lastKernelFFTSpecializedShader));
331
+ kernelNormalizationPipeline = driver->createGPUComputePipeline (nullptr ,core::smart_refctd_ptr (kernelPipelineLayout ),std::move (kernelNormalizationSpecializedShader));
316
332
317
333
318
334
auto deinterleaveShader = driver->createGPUShader (core::make_smart_refctd_ptr<ICPUShader>(R"===(
@@ -1016,6 +1032,8 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
1016
1032
}
1017
1033
return tmp;
1018
1034
}();
1035
+ outParam.bloomScale = bloomScale;
1036
+ fftScratchSize = core::max (FFTClass::getOutputBufferSize (usingHalfFloatFFTStorage,kerDim,colorChannelsFFT)*2u ,fftScratchSize);
1019
1037
fftScratchSize = core::max (FFTClass::getOutputBufferSize (usingHalfFloatFFTStorage,marginSrcDim,colorChannelsFFT),fftScratchSize);
1020
1038
{
1021
1039
auto * fftPushConstants = outParam.fftPushConstants ;
@@ -1319,12 +1337,32 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
1319
1337
{
1320
1338
auto kernelDescriptorSet = driver->createGPUDescriptorSet (core::smart_refctd_ptr (kernelDescriptorSetLayout));
1321
1339
{
1340
+ IGPUDescriptorSet::SDescriptorInfo infos[kernelSetDescCount+colorChannelsFFT-1u ];
1341
+ infos[0 ].desc = kerImageView;
1342
+ infos[0 ].image .sampler = nullptr ; // immutable
1343
+ infos[1 ].desc = core::smart_refctd_ptr<IGPUBuffer>(temporaryPixelBuffer.getObject ());
1344
+ infos[1 ].buffer = {0u ,fftScratchSize>>1u };
1345
+ infos[2 ].desc = core::smart_refctd_ptr<IGPUBuffer>(temporaryPixelBuffer.getObject ());
1346
+ infos[2 ].buffer = {fftScratchSize>>1u ,fftScratchSize};
1347
+ for (uint32_t i=0u ; i<colorChannelsFFT; i++)
1348
+ {
1349
+ infos[3 +i].desc = kernelNormalizedSpectrums[i];
1350
+ infos[3 +i].image .sampler = nullptr ; // storage
1351
+ }
1352
+ IGPUDescriptorSet::SWriteDescriptorSet writes[kernelSetDescCount] =
1353
+ {
1354
+ {kernelDescriptorSet.get (),0u ,0u ,1u ,EDT_COMBINED_IMAGE_SAMPLER,infos+0u },
1355
+ {kernelDescriptorSet.get (),1u ,0u ,1u ,EDT_STORAGE_BUFFER,infos+1u },
1356
+ {kernelDescriptorSet.get (),2u ,0u ,1u ,EDT_STORAGE_BUFFER,infos+2u },
1357
+ {kernelDescriptorSet.get (),3u ,0u ,colorChannelsFFT,EDT_STORAGE_IMAGE,infos+3u }
1358
+ };
1359
+ driver->updateDescriptorSets (kernelSetDescCount,writes,0u ,nullptr );
1322
1360
}
1323
1361
driver->bindDescriptorSets (EPBP_COMPUTE,kernelPipelineLayout.get (),0u ,1u ,&kernelDescriptorSet.get (),nullptr );
1324
1362
1325
1363
// Ker Image First Axis FFT
1326
1364
driver->bindComputePipeline (firstKernelFFTPipeline.get ());
1327
- driver->pushConstants (firstKernelFFTPipeline-> getLayout (),ICPUSpecializedShader::ESS_COMPUTE,sizeof (FFTClass::Parameters_t),sizeof (float ),&param.bloomScale );
1365
+ driver->pushConstants (kernelPipelineLayout. get (),ICPUSpecializedShader::ESS_COMPUTE,sizeof (FFTClass::Parameters_t),sizeof (float ),&param.bloomScale );
1328
1366
FFTClass::dispatchHelper (driver,kernelPipelineLayout.get (),fftPushConstants[0 ],fftDispatchInfo[0 ]);
1329
1367
1330
1368
// Ker Image Last Axis FFT
@@ -1468,7 +1506,7 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
1468
1506
denoiserOutput.rowStrideInBytes = param.width * forcedOptiXFormatPixelStride;
1469
1507
denoiserOutput.format = forcedOptiXFormat;
1470
1508
denoiserOutput.pixelStrideInBytes = forcedOptiXFormatPixelStride;
1471
- #if 0 // for easy debug with renderdoc disable optix stuff
1509
+ #if 1 // for easy debug with renderdoc disable optix stuff
1472
1510
// invoke
1473
1511
if (denoiser.m_denoiser ->tileAndInvoke (
1474
1512
m_cudaStream,
0 commit comments