Skip to content

Commit dad401f

Browse files
it works just the kernel left
1 parent 59c5565 commit dad401f

File tree

2 files changed

+103
-63
lines changed

2 files changed

+103
-63
lines changed

examples_tests/39.DenoiserTonemapper/CommonPushConstants.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,16 @@
55
#ifdef __cplusplus
66
#define int int32_t
77
#define uint uint32_t
8+
struct vec2 {float x,y;};
89
#define mat3 nbl::core::matrix3x4SIMD
910
#endif
1011
struct CommonPushConstants
1112
{
1213
uint inImageTexelPitch[3];
1314
uint imageWidth;
1415
uint imageHeight;
16+
uint fftSizeLog2; // TODO: use this
17+
vec2 kernel_half_pixel_size;
1518

1619
// luma meter and tonemapping var but also for denoiser
1720
uint percentileRange[2];

examples_tests/39.DenoiserTonemapper/main.cpp

Lines changed: 100 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ int main(int argc, char* argv[])
181181
// clear the histogram to 0s
182182
driver->fillBuffer(histogramBuffer.get(),0u,HistogramBufferSize,0u);
183183

184-
constexpr auto SharedDescriptorSetDescCount = 4u;
184+
constexpr auto SharedDescriptorSetDescCount = 5u;
185185
core::smart_refctd_ptr<IGPUDescriptorSetLayout> sharedDescriptorSetLayout;
186186
core::smart_refctd_ptr<IGPUPipelineLayout> sharedPipelineLayout;
187187
core::smart_refctd_ptr<IGPUComputePipeline> deinterleavePipeline,intensityPipeline,secondLumaMeterAndFirstFFTPipeline,convolvePipeline,interleaveAndLastFFTPipeline;
@@ -398,14 +398,16 @@ void main()
398398
shared uint _NBL_GLSL_SCRATCH_SHARED_DEFINED_[_NBL_GLSL_SCRATCH_SHARED_SIZE_DEFINED_];
399399
400400
#include "../ShaderCommon.glsl"
401-
layout(binding = 1, std430) restrict buffer SpectrumOutputBuffer
401+
layout(binding = 1, std430) restrict buffer SpectrumBuffer
402402
{
403403
vec2 spectrum[];
404404
};
405405
#define _NBL_GLSL_EXT_FFT_INPUT_DESCRIPTOR_DEFINED_
406406
#define _NBL_GLSL_EXT_FFT_OUTPUT_DESCRIPTOR_DEFINED_
407407
408408
409+
layout(binding=4) uniform sampler2D NormalizedKernel[3];
410+
409411
410412
#include <nbl/builtin/glsl/math/complex.glsl>
411413
@@ -448,13 +450,10 @@ void convolve(in uint item_per_thread_count, in uint ch)
448450
//
449451
const uvec3 coords = nbl_glsl_ext_FFT_getCoordinates(tid);
450452
vec2 uv = vec2(bitfieldReverse(coords.xy))/vec2(4294967296.f);
451-
#ifdef CONVOLVE
453+
452454
uv += pc.params.kernel_half_pixel_size;
453455
//
454456
nbl_glsl_complex convSpectrum = textureLod(NormalizedKernel[ch],uv,0).xy;
455-
#else
456-
nbl_glsl_complex convSpectrum = nbl_glsl_complex(1.f,0.f);
457-
#endif
458457
nbl_glsl_ext_FFT_impl_values[t] = nbl_glsl_complex_mul(sourceSpectrum,convSpectrum);
459458
}
460459
}
@@ -484,7 +483,12 @@ void main()
484483
for(uint t=0u; t<item_per_thread_count; t++)
485484
{
486485
const uint tid = (t<<_NBL_GLSL_WORKGROUP_SIZE_LOG2_)|gl_LocalInvocationIndex;
487-
nbl_glsl_ext_FFT_setData(nbl_glsl_ext_FFT_getCoordinates(tid),channel,nbl_glsl_ext_FFT_impl_values[t]);
486+
const uint trueDim = nbl_glsl_ext_FFT_Parameters_t_getDimensions()[nbl_glsl_ext_FFT_Parameters_t_getDirection()];
487+
// we also prevent certain threads from writing the memory out
488+
const uint padding = ((0x1u<<log2FFTSize)-trueDim)>>1u;
489+
const uint shifted = tid-padding;
490+
if (tid>=padding && shifted<trueDim)
491+
nbl_glsl_ext_FFT_setData(ivec3(nbl_glsl_ext_FFT_getCoordinates(shifted)),channel,nbl_glsl_ext_FFT_impl_values[t]);
488492
}
489493
}
490494
}
@@ -549,7 +553,7 @@ void nbl_glsl_ext_FFT_setData(in uvec3 coordinate, in uint channel, in nbl_glsl_
549553
ivec2 coords = ivec2(coordinate.xy);
550554
const uint padding_size = (0x1u<<nbl_glsl_ext_FFT_Parameters_t_getLog2FFTSize())-pc.data.imageWidth;
551555
coords.x -= int(padding_size>>1u);
552-
if (coords.x<0 || coords.x>int(pc.data.imageWidth))
556+
if (coords.x<0 || coords.x>=int(pc.data.imageWidth))
553557
return;
554558
555559
uint dataOffset = coords.y*pc.data.inImageTexelPitch[EII_COLOR]+coords.x;
@@ -654,12 +658,33 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
654658
auto secondLumaMeterAndFirstFFTSpecializedShader = driver->createGPUSpecializedShader(secondLumaMeterAndFirstFFTShader.get(),specInfo);
655659
auto convolveSpecializedShader = driver->createGPUSpecializedShader(convolveShader.get(),specInfo);
656660
auto interleaveAndLastFFTSpecializedShader = driver->createGPUSpecializedShader(interleaveAndLastFFTShader.get(),specInfo);
657-
661+
662+
core::smart_refctd_ptr<IGPUSampler> samplers[colorChannelsFFT];
663+
{
664+
IGPUSampler::SParams params =
665+
{
666+
{
667+
ISampler::ETC_REPEAT,
668+
ISampler::ETC_REPEAT,
669+
ISampler::ETC_REPEAT,
670+
ISampler::ETBC_FLOAT_OPAQUE_BLACK,
671+
ISampler::ETF_LINEAR, // is it needed?
672+
ISampler::ETF_LINEAR,
673+
ISampler::ESMM_NEAREST,
674+
0u,
675+
0u,
676+
ISampler::ECO_ALWAYS
677+
}
678+
};
679+
auto sampler = driver->createGPUSampler(std::move(params));
680+
std::fill_n(samplers,colorChannelsFFT,sampler);
681+
}
658682
IGPUDescriptorSetLayout::SBinding binding[SharedDescriptorSetDescCount] = {
659-
{0u,EDT_STORAGE_BUFFER,3u,IGPUSpecializedShader::ESS_COMPUTE,nullptr},
660-
{1u,EDT_STORAGE_BUFFER,3u,IGPUSpecializedShader::ESS_COMPUTE,nullptr},
683+
{0u,EDT_STORAGE_BUFFER,EII_COUNT,IGPUSpecializedShader::ESS_COMPUTE,nullptr},
684+
{1u,EDT_STORAGE_BUFFER,EII_COUNT,IGPUSpecializedShader::ESS_COMPUTE,nullptr},
661685
{2u,EDT_STORAGE_BUFFER,1u,IGPUSpecializedShader::ESS_COMPUTE,nullptr},
662-
{3u,EDT_STORAGE_BUFFER,1u,IGPUSpecializedShader::ESS_COMPUTE,nullptr}
686+
{3u,EDT_STORAGE_BUFFER,1u,IGPUSpecializedShader::ESS_COMPUTE,nullptr},
687+
{4u,EDT_COMBINED_IMAGE_SAMPLER,colorChannelsFFT,IGPUSpecializedShader::ESS_COMPUTE,samplers}
663688
};
664689
sharedDescriptorSetLayout = driver->createGPUDescriptorSetLayout(binding,binding+SharedDescriptorSetDescCount);
665690
SPushConstantRange pcRange[1] = {IGPUSpecializedShader::ESS_COMPUTE,0u,sizeof(CommonPushConstants)};
@@ -1068,7 +1093,7 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
10681093
cuda::CCUDAHandler::GraphicsAPIObjLink<IGPUBuffer> retval = core::smart_refctd_ptr<IGPUBuffer>(gpubuffers->operator[](ix)->getBuffer());
10691094
if (!cuda::CCUDAHandler::defaultHandleResult(cuda::CCUDAHandler::registerBuffer(&retval)))
10701095
{
1071-
os::Printer::log(makeImageIDString(i) + "Could register the image data buffer with CUDA, skipping image!", ELL_ERROR);
1096+
os::Printer::log(makeImageIDString(i) + "Could not register the image data buffer with CUDA, skipping image!", ELL_ERROR);
10721097
skip = true;
10731098
}
10741099
return retval;
@@ -1094,57 +1119,93 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
10941119
shaderConstants.inImageTexelPitch[j] = image->getRegions().begin()[0].bufferRowLength;
10951120
inImageByteOffset[j] = offsetPair->getOffset();
10961121
}
1097-
// upload the constants to the GPU
1098-
driver->pushConstants(sharedPipelineLayout.get(), video::IGPUSpecializedShader::ESS_COMPUTE, 0u, sizeof(CommonPushConstants), &shaderConstants);
10991122
}
11001123

11011124
// process
11021125
{
1126+
// get the bloom kernel FFT Spectrum
1127+
core::smart_refctd_ptr<IGPUImageView> kernelNormalizedSpectrums[colorChannelsFFT];
1128+
{
1129+
core::smart_refctd_ptr<IGPUImageView> kerImageView;
1130+
{
1131+
auto kerGpuImages = driver->getGPUObjectsFromAssets(&param.kernel, &param.kernel + 1u, &assetConverter);
1132+
1133+
1134+
IGPUImageView::SCreationParams kerImgViewInfo;
1135+
kerImgViewInfo.flags = static_cast<IGPUImageView::E_CREATE_FLAGS>(0u);
1136+
kerImgViewInfo.image = kerGpuImages->operator[](0u);
1137+
1138+
// make sure cache doesn't retain the GPU object paired to CPU object (could have used a custom IGPUObjectFromAssetConverter derived class with overrides to achieve this)
1139+
am->removeCachedGPUObject(param.kernel.get(), kerImgViewInfo.image);
1140+
1141+
kerImgViewInfo.viewType = IGPUImageView::ET_2D;
1142+
kerImgViewInfo.format = kerImgViewInfo.image->getCreationParameters().format;
1143+
kerImgViewInfo.subresourceRange.aspectMask = static_cast<IImage::E_ASPECT_FLAGS>(0u);
1144+
kerImgViewInfo.subresourceRange.baseMipLevel = 0;
1145+
kerImgViewInfo.subresourceRange.levelCount = kerImgViewInfo.image->getCreationParameters().mipLevels;
1146+
kerImgViewInfo.subresourceRange.baseArrayLayer = 0;
1147+
kerImgViewInfo.subresourceRange.layerCount = 1;
1148+
kerImageView = driver->createGPUImageView(std::move(kerImgViewInfo));
1149+
}
1150+
1151+
// TODO: the FFTs
1152+
kernelNormalizedSpectrums[] = ;
1153+
}
1154+
11031155
uint32_t outImageByteOffset[EII_COUNT];
11041156
// bind shader resources
11051157
{
11061158
// create descriptor set
11071159
auto descriptorSet = driver->createGPUDescriptorSet(core::smart_refctd_ptr(sharedDescriptorSetLayout));
11081160
// write descriptor set
11091161
{
1110-
IGPUDescriptorSet::SDescriptorInfo infos[SharedDescriptorSetDescCount+EII_COUNT*2u-2u];
1111-
auto attachBufferImageRange = [param,&infos](auto ix, IGPUBuffer* buff, uint64_t offset, uint64_t pixelByteSize) -> void
1162+
IGPUDescriptorSet::SDescriptorInfo infos[SharedDescriptorSetDescCount+EII_COUNT*2u-2u+colorChannelsFFT];
1163+
auto attachBufferImageRange = [param,&infos](auto* pInfo, IGPUBuffer* buff, uint64_t offset, uint64_t pixelByteSize) -> void
1164+
{
1165+
pInfo->desc = core::smart_refctd_ptr<IGPUBuffer>(buff);
1166+
pInfo->buffer = {offset,param.width*param.height*pixelByteSize};
1167+
};
1168+
auto attachWholeBuffer = [&infos](auto* pInfo, IGPUBuffer* buff) -> void
11121169
{
1113-
infos[ix].desc = core::smart_refctd_ptr<IGPUBuffer>(buff);
1114-
infos[ix].buffer = {offset,param.width*param.height*pixelByteSize};
1170+
pInfo->desc = core::smart_refctd_ptr<IGPUBuffer>(buff);
1171+
pInfo->buffer = {0ull,buff->getMemoryReqs().vulkanReqs.size};
11151172
};
1116-
auto attachWholeBuffer = [&infos](auto ix, IGPUBuffer* buff) -> void
1173+
IGPUDescriptorSet::SWriteDescriptorSet writes[SharedDescriptorSetDescCount] =
11171174
{
1118-
infos[ix].desc = core::smart_refctd_ptr<IGPUBuffer>(buff);
1119-
infos[ix].buffer = {0ull,buff->getMemoryReqs().vulkanReqs.size};
1175+
{descriptorSet.get(),0u,0u,denoiserInputCount,EDT_STORAGE_BUFFER,infos+0},
1176+
{descriptorSet.get(),1u,0u,denoiserInputCount,EDT_STORAGE_BUFFER,infos+EII_COUNT},
1177+
{descriptorSet.get(),2u,0u,1u,EDT_STORAGE_BUFFER,infos+EII_COUNT*2u},
1178+
{descriptorSet.get(),3u,0u,1u,EDT_STORAGE_BUFFER,infos+EII_COUNT*2u+1u},
1179+
{descriptorSet.get(),4u,0u,colorChannelsFFT,EDT_COMBINED_IMAGE_SAMPLER,infos+EII_COUNT*2u+2u}
11201180
};
11211181
uint64_t interleavedPixelBytesize = getTexelOrBlockBytesize<EF_R16G16B16A16_SFLOAT>();
1122-
attachBufferImageRange(EII_COLOR,colorPixelBuffer.getObject(),inImageByteOffset[EII_COLOR],interleavedPixelBytesize);
1182+
attachBufferImageRange(writes[0].info+EII_COLOR,colorPixelBuffer.getObject(),inImageByteOffset[EII_COLOR],interleavedPixelBytesize);
11231183
if (denoiserInputCount>EII_ALBEDO)
1124-
attachBufferImageRange(EII_ALBEDO,albedoPixelBuffer.getObject(),inImageByteOffset[EII_ALBEDO],interleavedPixelBytesize);
1184+
attachBufferImageRange(writes[0].info+EII_ALBEDO,albedoPixelBuffer.getObject(),inImageByteOffset[EII_ALBEDO],interleavedPixelBytesize);
11251185
if (denoiserInputCount>EII_NORMAL)
1126-
attachBufferImageRange(EII_NORMAL,normalPixelBuffer.getObject(),inImageByteOffset[EII_NORMAL],interleavedPixelBytesize);
1186+
attachBufferImageRange(writes[0].info+EII_NORMAL,normalPixelBuffer.getObject(),inImageByteOffset[EII_NORMAL],interleavedPixelBytesize);
11271187
for (uint32_t j=0u; j<denoiserInputCount; j++)
11281188
{
11291189
outImageByteOffset[j] = j*param.width*param.height*forcedOptiXFormatPixelStride;
1130-
attachBufferImageRange(EII_COUNT+j,temporaryPixelBuffer.getObject(),outImageByteOffset[j],forcedOptiXFormatPixelStride);
1190+
attachBufferImageRange(writes[1].info+j,temporaryPixelBuffer.getObject(),outImageByteOffset[j],forcedOptiXFormatPixelStride);
11311191
if (j==0u)
11321192
infos[EII_COUNT].buffer.size = fftScratchSize;
11331193
}
1134-
attachWholeBuffer(EII_COUNT*2u,histogramBuffer.get());
1135-
attachWholeBuffer(EII_COUNT*2u+1u,intensityBuffer.getObject());
1136-
IGPUDescriptorSet::SWriteDescriptorSet writes[SharedDescriptorSetDescCount] =
1194+
attachWholeBuffer(writes[2].info,histogramBuffer.get());
1195+
attachWholeBuffer(writes[3].info,intensityBuffer.getObject());
1196+
for (auto j=0u; j< colorChannelsFFT; j++)
11371197
{
1138-
{descriptorSet.get(),0u,0u,denoiserInputCount,EDT_STORAGE_BUFFER,infos+0},
1139-
{descriptorSet.get(),1u,0u,denoiserInputCount,EDT_STORAGE_BUFFER,infos+EII_COUNT},
1140-
{descriptorSet.get(),2u,0u,1u,EDT_STORAGE_BUFFER,infos+EII_COUNT*2u},
1141-
{descriptorSet.get(),3u,0u,1u,EDT_STORAGE_BUFFER,infos+EII_COUNT*2u+1u}
1142-
};
1198+
writes[4].info[j].desc = core::smart_refctd_ptr(kernelNormalizedSpectrums[j]); // binding 4 (NormalizedKernel[]), array element j; was writes[0].info[4], which overwrote a storage-buffer info on every iteration
1199+
//writes[4].info[j].image.imageLayout = ;
1200+
writes[4].info[j].image.sampler = nullptr; //immutable (samplers come from the descriptor set layout binding)
1201+
}
11431202
driver->updateDescriptorSets(SharedDescriptorSetDescCount,writes,0u,nullptr);
11441203
}
11451204
// bind descriptor set (for all shaders)
11461205
driver->bindDescriptorSets(video::EPBP_COMPUTE,sharedPipelineLayout.get(),0u,1u,&descriptorSet.get(),nullptr);
11471206
}
1207+
// upload the constants to the GPU
1208+
driver->pushConstants(sharedPipelineLayout.get(), video::IGPUSpecializedShader::ESS_COMPUTE, 0u, sizeof(CommonPushConstants), &shaderConstants);
11481209
// compute shader pre-preprocess (transform normals and compute luminosity)
11491210
{
11501211
// bind deinterleave pipeline
@@ -1211,7 +1272,7 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
12111272
denoiserOutput.rowStrideInBytes = param.width * forcedOptiXFormatPixelStride;
12121273
denoiserOutput.format = forcedOptiXFormat;
12131274
denoiserOutput.pixelStrideInBytes = forcedOptiXFormatPixelStride;
1214-
#if 1 // for easy debug with renderdoc disable optix stuff
1275+
#if 0 // for easy debug with renderdoc disable optix stuff
12151276
//invoke
12161277
if (denoiser.m_denoiser->tileAndInvoke(
12171278
m_cudaStream,
@@ -1240,48 +1301,24 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
12401301
shaderConstants.flags &= 0b10u;
12411302
driver->pushConstants(sharedPipelineLayout.get(), video::IGPUSpecializedShader::ESS_COMPUTE, offsetof(CommonPushConstants,flags), sizeof(uint32_t), &shaderConstants.flags);
12421303
// Bloom
1243-
uint32_t workgroupCounts[2] = { (param.width + kComputeWGSize - 1u) / kComputeWGSize,param.height };
1304+
uint32_t workgroupCounts[2] = { (param.width+kComputeWGSize-1u)/kComputeWGSize,param.height };
12441305
{
1245-
core::smart_refctd_ptr<IGPUImageView> kerImageView;
1246-
{
1247-
auto kerGpuImages = driver->getGPUObjectsFromAssets(&param.kernel,&param.kernel+1u,&assetConverter);
1248-
1249-
1250-
IGPUImageView::SCreationParams kerImgViewInfo;
1251-
kerImgViewInfo.flags = static_cast<IGPUImageView::E_CREATE_FLAGS>(0u);
1252-
kerImgViewInfo.image = kerGpuImages->operator[](0u);
1253-
1254-
// make sure cache doesn't retain the GPU object paired to CPU object (could have used a custom IGPUObjectFromAssetConverter derived class with overrides to achieve this)
1255-
am->removeCachedGPUObject(param.kernel.get(),kerImgViewInfo.image);
1256-
1257-
kerImgViewInfo.viewType = IGPUImageView::ET_2D;
1258-
kerImgViewInfo.format = kerImgViewInfo.image->getCreationParameters().format;
1259-
kerImgViewInfo.subresourceRange.aspectMask = static_cast<IImage::E_ASPECT_FLAGS>(0u);
1260-
kerImgViewInfo.subresourceRange.baseMipLevel = 0;
1261-
kerImgViewInfo.subresourceRange.levelCount = kerImgViewInfo.image->getCreationParameters().mipLevels;
1262-
kerImgViewInfo.subresourceRange.baseArrayLayer = 0;
1263-
kerImgViewInfo.subresourceRange.layerCount = 1;
1264-
kerImageView = driver->createGPUImageView(std::move(kerImgViewInfo));
1265-
}
1266-
12671306
driver->bindComputePipeline(secondLumaMeterAndFirstFFTPipeline.get());
12681307
// dispatch
12691308
driver->dispatch(param.fftDispatchInfo[0].workGroupCount[0],param.fftDispatchInfo[0].workGroupCount[1],1u);
12701309
COpenGLExtensionHandler::extGlMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
12711310

1272-
// TODO: Y-axis FFT, multiply the spectra together, y-axis iFFT
1311+
// Y-axis FFT, multiply the spectra together, y-axis iFFT
12731312
driver->bindComputePipeline(convolvePipeline.get());
1274-
#if 0
12751313
{
12761314
const auto& kernelImgExtent = kernelNormalizedSpectrums[0]->getCreationParameters().image->getCreationParameters().extent;
12771315
vec2 kernel_half_pixel_size{0.5f,0.5f};
12781316
kernel_half_pixel_size.x /= kernelImgExtent.width;
12791317
kernel_half_pixel_size.y /= kernelImgExtent.height;
1280-
driver->pushConstants(convolvePipeline->getLayout(),ISpecializedShader::ESS_COMPUTE,offsetof(convolve_parameters_t,kernel_half_pixel_size),sizeof(convolve_parameters_t::kernel_half_pixel_size),&kernel_half_pixel_size);
1318+
driver->pushConstants(convolvePipeline->getLayout(),ISpecializedShader::ESS_COMPUTE,offsetof(CommonPushConstants,kernel_half_pixel_size),sizeof(CommonPushConstants::kernel_half_pixel_size),&kernel_half_pixel_size);
12811319
}
1282-
#endif
12831320
// dispatch
1284-
//!driver->dispatch(param.fftDispatchInfo[1].workGroupCount[0],param.fftDispatchInfo[1].workGroupCount[1],1u);
1321+
driver->dispatch(param.fftDispatchInfo[1].workGroupCount[0],param.fftDispatchInfo[1].workGroupCount[1],1u);
12851322
COpenGLExtensionHandler::extGlMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
12861323

12871324
// bind intensity pipeline

0 commit comments

Comments
 (0)