@@ -181,7 +181,7 @@ int main(int argc, char* argv[])
181
181
// clear the histogram to 0s
182
182
driver->fillBuffer (histogramBuffer.get (),0u ,HistogramBufferSize,0u );
183
183
184
- constexpr auto SharedDescriptorSetDescCount = 4u ;
184
+ constexpr auto SharedDescriptorSetDescCount = 5u ;
185
185
core::smart_refctd_ptr<IGPUDescriptorSetLayout> sharedDescriptorSetLayout;
186
186
core::smart_refctd_ptr<IGPUPipelineLayout> sharedPipelineLayout;
187
187
core::smart_refctd_ptr<IGPUComputePipeline> deinterleavePipeline,intensityPipeline,secondLumaMeterAndFirstFFTPipeline,convolvePipeline,interleaveAndLastFFTPipeline;
@@ -398,14 +398,16 @@ void main()
398
398
shared uint _NBL_GLSL_SCRATCH_SHARED_DEFINED_[_NBL_GLSL_SCRATCH_SHARED_SIZE_DEFINED_];
399
399
400
400
#include "../ShaderCommon.glsl"
401
- layout(binding = 1, std430) restrict buffer SpectrumOutputBuffer
401
+ layout(binding = 1, std430) restrict buffer SpectrumBuffer
402
402
{
403
403
vec2 spectrum[];
404
404
};
405
405
#define _NBL_GLSL_EXT_FFT_INPUT_DESCRIPTOR_DEFINED_
406
406
#define _NBL_GLSL_EXT_FFT_OUTPUT_DESCRIPTOR_DEFINED_
407
407
408
408
409
+ layout(binding=4) uniform sampler2D NormalizedKernel[3];
410
+
409
411
410
412
#include <nbl/builtin/glsl/math/complex.glsl>
411
413
@@ -448,13 +450,10 @@ void convolve(in uint item_per_thread_count, in uint ch)
448
450
//
449
451
const uvec3 coords = nbl_glsl_ext_FFT_getCoordinates(tid);
450
452
vec2 uv = vec2(bitfieldReverse(coords.xy))/vec2(4294967296.f);
451
- #ifdef CONVOLVE
453
+
452
454
uv += pc.params.kernel_half_pixel_size;
453
455
//
454
456
nbl_glsl_complex convSpectrum = textureLod(NormalizedKernel[ch],uv,0).xy;
455
- #else
456
- nbl_glsl_complex convSpectrum = nbl_glsl_complex(1.f,0.f);
457
- #endif
458
457
nbl_glsl_ext_FFT_impl_values[t] = nbl_glsl_complex_mul(sourceSpectrum,convSpectrum);
459
458
}
460
459
}
@@ -484,7 +483,12 @@ void main()
484
483
for(uint t=0u; t<item_per_thread_count; t++)
485
484
{
486
485
const uint tid = (t<<_NBL_GLSL_WORKGROUP_SIZE_LOG2_)|gl_LocalInvocationIndex;
487
- nbl_glsl_ext_FFT_setData(nbl_glsl_ext_FFT_getCoordinates(tid),channel,nbl_glsl_ext_FFT_impl_values[t]);
486
+ const uint trueDim = nbl_glsl_ext_FFT_Parameters_t_getDimensions()[nbl_glsl_ext_FFT_Parameters_t_getDirection()];
487
+ // we also prevent certain threads from writing the memory out
488
+ const uint padding = ((0x1u<<log2FFTSize)-trueDim)>>1u;
489
+ const uint shifted = tid-padding;
490
+ if (tid>=padding && shifted<trueDim)
491
+ nbl_glsl_ext_FFT_setData(ivec3(nbl_glsl_ext_FFT_getCoordinates(shifted)),channel,nbl_glsl_ext_FFT_impl_values[t]);
488
492
}
489
493
}
490
494
}
@@ -549,7 +553,7 @@ void nbl_glsl_ext_FFT_setData(in uvec3 coordinate, in uint channel, in nbl_glsl_
549
553
ivec2 coords = ivec2(coordinate.xy);
550
554
const uint padding_size = (0x1u<<nbl_glsl_ext_FFT_Parameters_t_getLog2FFTSize())-pc.data.imageWidth;
551
555
coords.x -= int(padding_size>>1u);
552
- if (coords.x<0 || coords.x>int(pc.data.imageWidth))
556
+ if (coords.x<0 || coords.x>= int(pc.data.imageWidth))
553
557
return;
554
558
555
559
uint dataOffset = coords.y*pc.data.inImageTexelPitch[EII_COLOR]+coords.x;
@@ -654,12 +658,33 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
654
658
auto secondLumaMeterAndFirstFFTSpecializedShader = driver->createGPUSpecializedShader (secondLumaMeterAndFirstFFTShader.get (),specInfo);
655
659
auto convolveSpecializedShader = driver->createGPUSpecializedShader (convolveShader.get (),specInfo);
656
660
auto interleaveAndLastFFTSpecializedShader = driver->createGPUSpecializedShader (interleaveAndLastFFTShader.get (),specInfo);
657
-
661
+
662
+ core::smart_refctd_ptr<IGPUSampler> samplers[colorChannelsFFT];
663
+ {
664
+ IGPUSampler::SParams params =
665
+ {
666
+ {
667
+ ISampler::ETC_REPEAT,
668
+ ISampler::ETC_REPEAT,
669
+ ISampler::ETC_REPEAT,
670
+ ISampler::ETBC_FLOAT_OPAQUE_BLACK,
671
+ ISampler::ETF_LINEAR, // is it needed?
672
+ ISampler::ETF_LINEAR,
673
+ ISampler::ESMM_NEAREST,
674
+ 0u ,
675
+ 0u ,
676
+ ISampler::ECO_ALWAYS
677
+ }
678
+ };
679
+ auto sampler = driver->createGPUSampler (std::move (params));
680
+ std::fill_n (samplers,colorChannelsFFT,sampler);
681
+ }
658
682
IGPUDescriptorSetLayout::SBinding binding[SharedDescriptorSetDescCount] = {
659
- {0u ,EDT_STORAGE_BUFFER,3u ,IGPUSpecializedShader::ESS_COMPUTE,nullptr },
660
- {1u ,EDT_STORAGE_BUFFER,3u ,IGPUSpecializedShader::ESS_COMPUTE,nullptr },
683
+ {0u ,EDT_STORAGE_BUFFER,EII_COUNT ,IGPUSpecializedShader::ESS_COMPUTE,nullptr },
684
+ {1u ,EDT_STORAGE_BUFFER,EII_COUNT ,IGPUSpecializedShader::ESS_COMPUTE,nullptr },
661
685
{2u ,EDT_STORAGE_BUFFER,1u ,IGPUSpecializedShader::ESS_COMPUTE,nullptr },
662
- {3u ,EDT_STORAGE_BUFFER,1u ,IGPUSpecializedShader::ESS_COMPUTE,nullptr }
686
+ {3u ,EDT_STORAGE_BUFFER,1u ,IGPUSpecializedShader::ESS_COMPUTE,nullptr },
687
+ {4u ,EDT_COMBINED_IMAGE_SAMPLER,colorChannelsFFT,IGPUSpecializedShader::ESS_COMPUTE,samplers}
663
688
};
664
689
sharedDescriptorSetLayout = driver->createGPUDescriptorSetLayout (binding,binding+SharedDescriptorSetDescCount);
665
690
SPushConstantRange pcRange[1 ] = {IGPUSpecializedShader::ESS_COMPUTE,0u ,sizeof (CommonPushConstants)};
@@ -1068,7 +1093,7 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
1068
1093
cuda::CCUDAHandler::GraphicsAPIObjLink<IGPUBuffer> retval = core::smart_refctd_ptr<IGPUBuffer>(gpubuffers->operator [](ix)->getBuffer ());
1069
1094
if (!cuda::CCUDAHandler::defaultHandleResult (cuda::CCUDAHandler::registerBuffer (&retval)))
1070
1095
{
1071
- os::Printer::log (makeImageIDString (i) + " Could register the image data buffer with CUDA, skipping image!" , ELL_ERROR);
1096
+ os::Printer::log (makeImageIDString (i) + " Could not register the image data buffer with CUDA, skipping image!" , ELL_ERROR);
1072
1097
skip = true ;
1073
1098
}
1074
1099
return retval;
@@ -1094,57 +1119,93 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
1094
1119
shaderConstants.inImageTexelPitch [j] = image->getRegions ().begin ()[0 ].bufferRowLength ;
1095
1120
inImageByteOffset[j] = offsetPair->getOffset ();
1096
1121
}
1097
- // upload the constants to the GPU
1098
- driver->pushConstants (sharedPipelineLayout.get (), video::IGPUSpecializedShader::ESS_COMPUTE, 0u , sizeof (CommonPushConstants), &shaderConstants);
1099
1122
}
1100
1123
1101
1124
// process
1102
1125
{
1126
+ // get the bloom kernel FFT Spectrum
1127
+ core::smart_refctd_ptr<IGPUImageView> kernelNormalizedSpectrums[colorChannelsFFT];
1128
+ {
1129
+ core::smart_refctd_ptr<IGPUImageView> kerImageView;
1130
+ {
1131
+ auto kerGpuImages = driver->getGPUObjectsFromAssets (¶m.kernel , ¶m.kernel + 1u , &assetConverter);
1132
+
1133
+
1134
+ IGPUImageView::SCreationParams kerImgViewInfo;
1135
+ kerImgViewInfo.flags = static_cast <IGPUImageView::E_CREATE_FLAGS>(0u );
1136
+ kerImgViewInfo.image = kerGpuImages->operator [](0u );
1137
+
1138
+ // make sure cache doesn't retain the GPU object paired to CPU object (could have used a custom IGPUObjectFromAssetConverter derived class with overrides to achieve this)
1139
+ am->removeCachedGPUObject (param.kernel .get (), kerImgViewInfo.image );
1140
+
1141
+ kerImgViewInfo.viewType = IGPUImageView::ET_2D;
1142
+ kerImgViewInfo.format = kerImgViewInfo.image ->getCreationParameters ().format ;
1143
+ kerImgViewInfo.subresourceRange .aspectMask = static_cast <IImage::E_ASPECT_FLAGS>(0u );
1144
+ kerImgViewInfo.subresourceRange .baseMipLevel = 0 ;
1145
+ kerImgViewInfo.subresourceRange .levelCount = kerImgViewInfo.image ->getCreationParameters ().mipLevels ;
1146
+ kerImgViewInfo.subresourceRange .baseArrayLayer = 0 ;
1147
+ kerImgViewInfo.subresourceRange .layerCount = 1 ;
1148
+ kerImageView = driver->createGPUImageView (std::move (kerImgViewInfo));
1149
+ }
1150
+
1151
+ // TODO: the FFTs
1152
+ kernelNormalizedSpectrums[] = ;
1153
+ }
1154
+
1103
1155
uint32_t outImageByteOffset[EII_COUNT];
1104
1156
// bind shader resources
1105
1157
{
1106
1158
// create descriptor set
1107
1159
auto descriptorSet = driver->createGPUDescriptorSet (core::smart_refctd_ptr (sharedDescriptorSetLayout));
1108
1160
// write descriptor set
1109
1161
{
1110
- IGPUDescriptorSet::SDescriptorInfo infos[SharedDescriptorSetDescCount+EII_COUNT*2u -2u ];
1111
- auto attachBufferImageRange = [param,&infos](auto ix, IGPUBuffer* buff, uint64_t offset, uint64_t pixelByteSize) -> void
1162
+ IGPUDescriptorSet::SDescriptorInfo infos[SharedDescriptorSetDescCount+EII_COUNT*2u -2u +colorChannelsFFT];
1163
+ auto attachBufferImageRange = [param,&infos](auto * pInfo, IGPUBuffer* buff, uint64_t offset, uint64_t pixelByteSize) -> void
1164
+ {
1165
+ pInfo->desc = core::smart_refctd_ptr<IGPUBuffer>(buff);
1166
+ pInfo->buffer = {offset,param.width *param.height *pixelByteSize};
1167
+ };
1168
+ auto attachWholeBuffer = [&infos](auto * pInfo, IGPUBuffer* buff) -> void
1112
1169
{
1113
- infos[ix]. desc = core::smart_refctd_ptr<IGPUBuffer>(buff);
1114
- infos[ix]. buffer = {offset,param. width *param. height *pixelByteSize };
1170
+ pInfo-> desc = core::smart_refctd_ptr<IGPUBuffer>(buff);
1171
+ pInfo-> buffer = {0ull ,buff-> getMemoryReqs (). vulkanReqs . size };
1115
1172
};
1116
- auto attachWholeBuffer = [&infos]( auto ix, IGPUBuffer* buff) -> void
1173
+ IGPUDescriptorSet::SWriteDescriptorSet writes[SharedDescriptorSetDescCount] =
1117
1174
{
1118
- infos[ix].desc = core::smart_refctd_ptr<IGPUBuffer>(buff);
1119
- infos[ix].buffer = {0ull ,buff->getMemoryReqs ().vulkanReqs .size };
1175
+ {descriptorSet.get (),0u ,0u ,denoiserInputCount,EDT_STORAGE_BUFFER,infos+0 },
1176
+ {descriptorSet.get (),1u ,0u ,denoiserInputCount,EDT_STORAGE_BUFFER,infos+EII_COUNT},
1177
+ {descriptorSet.get (),2u ,0u ,1u ,EDT_STORAGE_BUFFER,infos+EII_COUNT*2u },
1178
+ {descriptorSet.get (),3u ,0u ,1u ,EDT_STORAGE_BUFFER,infos+EII_COUNT*2u +1u },
1179
+ {descriptorSet.get (),4u ,0u ,colorChannelsFFT,EDT_COMBINED_IMAGE_SAMPLER,infos+EII_COUNT*2u +2u }
1120
1180
};
1121
1181
uint64_t interleavedPixelBytesize = getTexelOrBlockBytesize<EF_R16G16B16A16_SFLOAT>();
1122
- attachBufferImageRange (EII_COLOR,colorPixelBuffer.getObject (),inImageByteOffset[EII_COLOR],interleavedPixelBytesize);
1182
+ attachBufferImageRange (writes[ 0 ]. info + EII_COLOR,colorPixelBuffer.getObject (),inImageByteOffset[EII_COLOR],interleavedPixelBytesize);
1123
1183
if (denoiserInputCount>EII_ALBEDO)
1124
- attachBufferImageRange (EII_ALBEDO,albedoPixelBuffer.getObject (),inImageByteOffset[EII_ALBEDO],interleavedPixelBytesize);
1184
+ attachBufferImageRange (writes[ 0 ]. info + EII_ALBEDO,albedoPixelBuffer.getObject (),inImageByteOffset[EII_ALBEDO],interleavedPixelBytesize);
1125
1185
if (denoiserInputCount>EII_NORMAL)
1126
- attachBufferImageRange (EII_NORMAL,normalPixelBuffer.getObject (),inImageByteOffset[EII_NORMAL],interleavedPixelBytesize);
1186
+ attachBufferImageRange (writes[ 0 ]. info + EII_NORMAL,normalPixelBuffer.getObject (),inImageByteOffset[EII_NORMAL],interleavedPixelBytesize);
1127
1187
for (uint32_t j=0u ; j<denoiserInputCount; j++)
1128
1188
{
1129
1189
outImageByteOffset[j] = j*param.width *param.height *forcedOptiXFormatPixelStride;
1130
- attachBufferImageRange (EII_COUNT +j,temporaryPixelBuffer.getObject (),outImageByteOffset[j],forcedOptiXFormatPixelStride);
1190
+ attachBufferImageRange (writes[ 1 ]. info +j,temporaryPixelBuffer.getObject (),outImageByteOffset[j],forcedOptiXFormatPixelStride);
1131
1191
if (j==0u )
1132
1192
infos[EII_COUNT].buffer .size = fftScratchSize;
1133
1193
}
1134
- attachWholeBuffer (EII_COUNT* 2u ,histogramBuffer.get ());
1135
- attachWholeBuffer (EII_COUNT* 2u + 1u ,intensityBuffer.getObject ());
1136
- IGPUDescriptorSet::SWriteDescriptorSet writes[SharedDescriptorSetDescCount] =
1194
+ attachWholeBuffer (writes[ 2 ]. info ,histogramBuffer.get ());
1195
+ attachWholeBuffer (writes[ 3 ]. info ,intensityBuffer.getObject ());
1196
+ for ( auto j= 0u ; j< colorChannelsFFT; j++)
1137
1197
{
1138
- {descriptorSet.get (),0u ,0u ,denoiserInputCount,EDT_STORAGE_BUFFER,infos+0 },
1139
- {descriptorSet.get (),1u ,0u ,denoiserInputCount,EDT_STORAGE_BUFFER,infos+EII_COUNT},
1140
- {descriptorSet.get (),2u ,0u ,1u ,EDT_STORAGE_BUFFER,infos+EII_COUNT*2u },
1141
- {descriptorSet.get (),3u ,0u ,1u ,EDT_STORAGE_BUFFER,infos+EII_COUNT*2u +1u }
1142
- };
1198
+ writes[4].info[j].desc = core::smart_refctd_ptr(kernelNormalizedSpectrums[j]); // j-th array element of binding 4 (combined image samplers), one kernel spectrum view per FFT colour channel
1199
+ // writes[4].info[j].image.imageLayout = ; // left unset here, as in the original
1200
+ writes[4].info[j].image.sampler = nullptr; // no per-descriptor sampler: binding 4 was declared with the `samplers` array in the set layout (immutable)
1201
+ }
1143
1202
driver->updateDescriptorSets (SharedDescriptorSetDescCount,writes,0u ,nullptr );
1144
1203
}
1145
1204
// bind descriptor set (for all shaders)
1146
1205
driver->bindDescriptorSets (video::EPBP_COMPUTE,sharedPipelineLayout.get (),0u ,1u ,&descriptorSet.get (),nullptr );
1147
1206
}
1207
+ // upload the constants to the GPU
1208
+ driver->pushConstants (sharedPipelineLayout.get (), video::IGPUSpecializedShader::ESS_COMPUTE, 0u , sizeof (CommonPushConstants), &shaderConstants);
1148
1209
// compute shader pre-preprocess (transform normals and compute luminosity)
1149
1210
{
1150
1211
// bind deinterleave pipeline
@@ -1211,7 +1272,7 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
1211
1272
denoiserOutput.rowStrideInBytes = param.width * forcedOptiXFormatPixelStride;
1212
1273
denoiserOutput.format = forcedOptiXFormat;
1213
1274
denoiserOutput.pixelStrideInBytes = forcedOptiXFormatPixelStride;
1214
- #if 1 // for easy debug with renderdoc disable optix stuff
1275
+ #if 0 // for easy debug with renderdoc disable optix stuff
1215
1276
//invoke
1216
1277
if (denoiser.m_denoiser->tileAndInvoke(
1217
1278
m_cudaStream,
@@ -1240,48 +1301,24 @@ nbl_glsl_complex nbl_glsl_ext_FFT_getPaddedData(ivec3 coordinate, in uint channe
1240
1301
shaderConstants.flags &= 0b10u ;
1241
1302
driver->pushConstants (sharedPipelineLayout.get (), video::IGPUSpecializedShader::ESS_COMPUTE, offsetof (CommonPushConstants,flags), sizeof (uint32_t ), &shaderConstants.flags );
1242
1303
// Bloom
1243
- uint32_t workgroupCounts[2 ] = { (param.width + kComputeWGSize - 1u ) / kComputeWGSize ,param.height };
1304
+ uint32_t workgroupCounts[2 ] = { (param.width + kComputeWGSize - 1u )/ kComputeWGSize ,param.height };
1244
1305
{
1245
- core::smart_refctd_ptr<IGPUImageView> kerImageView;
1246
- {
1247
- auto kerGpuImages = driver->getGPUObjectsFromAssets (¶m.kernel ,¶m.kernel +1u ,&assetConverter);
1248
-
1249
-
1250
- IGPUImageView::SCreationParams kerImgViewInfo;
1251
- kerImgViewInfo.flags = static_cast <IGPUImageView::E_CREATE_FLAGS>(0u );
1252
- kerImgViewInfo.image = kerGpuImages->operator [](0u );
1253
-
1254
- // make sure cache doesn't retain the GPU object paired to CPU object (could have used a custom IGPUObjectFromAssetConverter derived class with overrides to achieve this)
1255
- am->removeCachedGPUObject (param.kernel .get (),kerImgViewInfo.image );
1256
-
1257
- kerImgViewInfo.viewType = IGPUImageView::ET_2D;
1258
- kerImgViewInfo.format = kerImgViewInfo.image ->getCreationParameters ().format ;
1259
- kerImgViewInfo.subresourceRange .aspectMask = static_cast <IImage::E_ASPECT_FLAGS>(0u );
1260
- kerImgViewInfo.subresourceRange .baseMipLevel = 0 ;
1261
- kerImgViewInfo.subresourceRange .levelCount = kerImgViewInfo.image ->getCreationParameters ().mipLevels ;
1262
- kerImgViewInfo.subresourceRange .baseArrayLayer = 0 ;
1263
- kerImgViewInfo.subresourceRange .layerCount = 1 ;
1264
- kerImageView = driver->createGPUImageView (std::move (kerImgViewInfo));
1265
- }
1266
-
1267
1306
driver->bindComputePipeline (secondLumaMeterAndFirstFFTPipeline.get ());
1268
1307
// dispatch
1269
1308
driver->dispatch (param.fftDispatchInfo [0 ].workGroupCount [0 ],param.fftDispatchInfo [0 ].workGroupCount [1 ],1u );
1270
1309
COpenGLExtensionHandler::extGlMemoryBarrier (GL_SHADER_STORAGE_BARRIER_BIT);
1271
1310
1272
- // TODO: Y-axis FFT, multiply the spectra together, y-axis iFFT
1311
+ // Y-axis FFT, multiply the spectra together, y-axis iFFT
1273
1312
driver->bindComputePipeline (convolvePipeline.get ());
1274
- #if 0
1275
1313
{
1276
1314
const auto & kernelImgExtent = kernelNormalizedSpectrums[0 ]->getCreationParameters ().image ->getCreationParameters ().extent ;
1277
1315
vec2 kernel_half_pixel_size{0 .5f ,0 .5f };
1278
1316
kernel_half_pixel_size.x /= kernelImgExtent.width ;
1279
1317
kernel_half_pixel_size.y /= kernelImgExtent.height ;
1280
- driver->pushConstants(convolvePipeline->getLayout(),ISpecializedShader::ESS_COMPUTE,offsetof(convolve_parameters_t ,kernel_half_pixel_size),sizeof(convolve_parameters_t ::kernel_half_pixel_size),&kernel_half_pixel_size);
1318
+ driver->pushConstants (convolvePipeline->getLayout (),ISpecializedShader::ESS_COMPUTE,offsetof (CommonPushConstants ,kernel_half_pixel_size),sizeof (CommonPushConstants ::kernel_half_pixel_size),&kernel_half_pixel_size);
1281
1319
}
1282
- #endif
1283
1320
// dispatch
1284
- // ! driver->dispatch(param.fftDispatchInfo[1].workGroupCount[0],param.fftDispatchInfo[1].workGroupCount[1],1u);
1321
+ driver->dispatch (param.fftDispatchInfo [1 ].workGroupCount [0 ],param.fftDispatchInfo [1 ].workGroupCount [1 ],1u );
1285
1322
COpenGLExtensionHandler::extGlMemoryBarrier (GL_SHADER_STORAGE_BARRIER_BIT);
1286
1323
1287
1324
// bind intensity pipeline
0 commit comments