diff --git a/examples_tests b/examples_tests index 82bf36b2a0..498ffd21a0 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 82bf36b2a0cc3f0b02d19f91487352223937d22d +Subproject commit 498ffd21a06b9e9c74d20b37860421d17fe7cf49 diff --git a/include/nbl/asset/utils/IMeshPacker.h b/include/nbl/asset/utils/IMeshPacker.h index 3f09062b18..355d792782 100644 --- a/include/nbl/asset/utils/IMeshPacker.h +++ b/include/nbl/asset/utils/IMeshPacker.h @@ -6,7 +6,7 @@ #define __NBL_ASSET_I_MESH_PACKER_H_INCLUDED__ #include "nbl/asset/utils/IMeshManipulator.h" -#include "nbl/core/math/morton.h" +#include "nbl/builtin/hlsl/math/morton.hlsl" namespace nbl { diff --git a/include/nbl/builtin/hlsl/luma_meter/common.hlsl b/include/nbl/builtin/hlsl/luma_meter/common.hlsl new file mode 100644 index 0000000000..55d1713619 --- /dev/null +++ b/include/nbl/builtin/hlsl/luma_meter/common.hlsl @@ -0,0 +1,35 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_LUMA_METER_COMMON_INCLUDED_ +#define _NBL_BUILTIN_HLSL_LUMA_METER_COMMON_INCLUDED_ + +#include "nbl/builtin/hlsl/cpp_compat.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace luma_meter +{ + +struct MeteringWindow +{ + using this_t = MeteringWindow; + float32_t2 meteringWindowScale; + float32_t2 meteringWindowOffset; + + static this_t create(float32_t2 scale, float32_t2 offset) { + this_t retval; + retval.meteringWindowScale = scale; + retval.meteringWindowOffset = offset; + return retval; + } +}; + +} +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl new file mode 100644 index 0000000000..20af804603 --- /dev/null +++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl @@ -0,0 +1,287 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_ +#define _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_ + +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_arithmetic.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" +#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" +#include "nbl/builtin/hlsl/type_traits.hlsl" +#include "nbl/builtin/hlsl/math/morton.hlsl" +#include "nbl/builtin/hlsl/luma_meter/common.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace luma_meter +{ + +template +struct geom_meter { + using float_t = typename SharedAccessor::type; + using float_t2 = typename conditional, float32_t2, float16_t2>::type; + using float_t3 = typename conditional, float32_t3, float16_t3>::type; + using this_t = geom_meter; + + static this_t create(float_t2 lumaMinMax, float_t sampleCount) + { + this_t retval; + retval.lumaMinMax = lumaMinMax; + retval.sampleCount = sampleCount; + return retval; + } + + float_t __reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata) + { + return workgroup::reduction < plus < float_t >, GroupSize >:: + template __call (value, sdata); + } + + float_t __computeLumaLog2( + NBL_CONST_REF_ARG(MeteringWindow) window, + NBL_REF_ARG(TexAccessor) tex, + float_t2 shiftedCoord + ) + { + float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset; + float_t3 color = tex.get(uvPos); + float_t luma = (float_t)TexAccessor::toXYZ(color); + + luma = clamp(luma, lumaMinMax.x, lumaMinMax.y); + + return log2(luma); + } + + void __uploadFloat( + NBL_REF_ARG(ValueAccessor) val_accessor, + float_t val, + float_t minLog2, + float_t rangeLog2 + ) + { + uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); + uint32_t workgroupIndex = (workGroupCount.x * workGroupCount.y * workGroupCount.z) / 64; + uint32_t fixedPointBitsLeft = 32 - 
uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); + + uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1))); + + val_accessor.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern); + } + + float_t __downloadFloat( + NBL_REF_ARG(ValueAccessor) val_accessor, + uint32_t index, + float_t minLog2, + float_t rangeLog2 + ) + { + float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1)); + return luma / rangeLog2 + minLog2; + } + + void sampleLuma( + NBL_CONST_REF_ARG(MeteringWindow) window, + NBL_REF_ARG(ValueAccessor) val, + NBL_REF_ARG(TexAccessor) tex, + NBL_REF_ARG(SharedAccessor) sdata, + float_t2 tileOffset, + float_t2 viewportSize + ) + { + uint32_t tid = workgroup::SubgroupContiguousIndex(); + uint32_t2 coord = { + morton2d_decode_x(tid), + morton2d_decode_y(tid) + }; + + float_t luma = 0.0f; + float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; + float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord); + float_t lumaLog2Sum = __reduction(lumaLog2, sdata); + + if (tid == 0) { + __uploadFloat( + val, + lumaLog2Sum, + log2(lumaMinMax.x), + log2(lumaMinMax.y / lumaMinMax.x) + ); + } + } + + float_t gatherLuma( + NBL_REF_ARG(ValueAccessor) val + ) + { + uint32_t tid = glsl::gl_SubgroupInvocationID(); + float_t luma = glsl::subgroupAdd( + __downloadFloat( + val, + tid, + log2(lumaMinMax.x), + log2(lumaMinMax.y / lumaMinMax.x) + ) + ); + + uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); + uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); + + return (luma / (1 << fixedPointBitsLeft)) / sampleCount; + } + + float_t sampleCount; + float_t2 lumaMinMax; +}; + +template +struct median_meter { + using int_t = typename SharedAccessor::type; + using float_t = 
float32_t; + using float_t2 = typename conditional, float32_t2, float16_t2>::type; + using float_t3 = typename conditional, float32_t3, float16_t3>::type; + using this_t = median_meter; + + static this_t create(float_t2 lumaMinMax) { + this_t retval; + retval.lumaMinMax = lumaMinMax; + return retval; + } + + int_t __inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) { + return workgroup::inclusive_scan < plus < int_t >, GroupSize >:: + template __call (value, sdata); + } + + float_t __computeLuma( + NBL_CONST_REF_ARG(MeteringWindow) window, + NBL_REF_ARG(TexAccessor) tex, + float_t2 shiftedCoord + ) { + float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset; + float_t3 color = tex.get(uvPos); + float_t luma = (float_t)TexAccessor::toXYZ(color); + + return clamp(luma, lumaMinMax.x, lumaMinMax.y); + } + + int_t __float2Int( + float_t val, + float_t minLog2, + float_t rangeLog2 + ) { + uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); + uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); + + return int_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1))); + } + + float_t __int2Float( + int_t val, + float_t minLog2, + float_t rangeLog2 + ) { + return val / rangeLog2 + minLog2; + } + + void sampleLuma( + NBL_CONST_REF_ARG(MeteringWindow) window, + NBL_REF_ARG(HistogramAccessor) histo, + NBL_REF_ARG(TexAccessor) tex, + NBL_REF_ARG(SharedAccessor) sdata, + float_t2 tileOffset, + float_t2 viewportSize + ) { + uint32_t tid = workgroup::SubgroupContiguousIndex(); + + for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) { + sdata.set(vid, 0); + } + + sdata.workgroupExecutionAndMemoryBarrier(); + + uint32_t2 coord = { + morton2d_decode_x(tid), + morton2d_decode_y(tid) + }; + + float_t luma = 0.0f; + float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; + luma = 
__computeLuma(window, tex, shiftedCoord); + + float_t binSize = (lumaMinMax.y - lumaMinMax.x) / BinCount; + uint32_t binIndex = (uint32_t)((luma - lumaMinMax.x) / binSize); + + sdata.atomicAdd(binIndex, __float2Int(luma, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); + + sdata.workgroupExecutionAndMemoryBarrier(); + + float_t histogram_value; + sdata.get(tid, histogram_value); + + sdata.workgroupExecutionAndMemoryBarrier(); + + float_t sum = __inclusive_scan(histogram_value, sdata); + histo.atomicAdd(tid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); + + const bool is_last_wg_invocation = tid == (GroupSize - 1); + const static uint32_t RoundedBinCount = 1 + (BinCount - 1) / GroupSize; + + for (uint32_t i = 1; i < RoundedBinCount; i++) { + uint32_t keyBucketStart = GroupSize * i; + uint32_t vid = tid + keyBucketStart; + + // no if statement about the last iteration needed + if (is_last_wg_invocation) { + float_t beforeSum; + sdata.get(keyBucketStart, beforeSum); + sdata.set(keyBucketStart, beforeSum + sum); + } + + // propagate last block tail to next block head and protect against subsequent scans stepping on each other's toes + sdata.workgroupExecutionAndMemoryBarrier(); + + // no aliasing anymore + float_t atVid; + sdata.get(vid, atVid); + sum = __inclusive_scan(atVid, sdata); + if (vid < BinCount) { + histo.atomicAdd(vid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); + } + } + } + + float_t gatherLuma( + NBL_REF_ARG(HistogramAccessor) histo, + NBL_REF_ARG(SharedAccessor) sdata + ) { + uint32_t tid = workgroup::SubgroupContiguousIndex(); + + for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) { + sdata.set( + vid, + histo.get(vid & (BinCount - 1)) + ); + } + + sdata.workgroupExecutionAndMemoryBarrier(); + + uint32_t percentile40, percentile60; + sdata.get((BinCount * 2) / 5, percentile40); + sdata.get((BinCount * 3) / 5, percentile60); + + return (__int2Float(percentile40, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x) + __int2Float(percentile60,
lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)) / 2; + } + + float_t2 lumaMinMax; +}; + +} +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/math/morton.hlsl b/include/nbl/builtin/hlsl/math/morton.hlsl new file mode 100644 index 0000000000..c0769fc88b --- /dev/null +++ b/include/nbl/builtin/hlsl/math/morton.hlsl @@ -0,0 +1,160 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_MORTON_INCLUDED_ +#define _NBL_BUILTIN_HLSL_MORTON_INCLUDED_ +#ifdef __HLSL_VERSION +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#else +#include +#endif + +namespace nbl +{ +namespace hlsl +{ + +namespace impl +{ + +template +NBL_CONSTEXPR_FUNC T morton2d_mask(uint16_t _n) +{ + const static uint64_t mask[5] = + { + 0x5555555555555555ull, + 0x3333333333333333ull, + 0x0F0F0F0F0F0F0F0Full, + 0x00FF00FF00FF00FFull, + 0x0000FFFF0000FFFFull + }; + return nbl::hlsl::_static_cast(mask[_n]); +} + +template +NBL_CONSTEXPR_FUNC T morton3d_mask(uint16_t _n) +{ + const static uint64_t mask[5] = + { + 0x1249249249249249ull, + 0x10C30C30C30C30C3ull, + 0x100F00F00F00F00Full, + 0x001F0000FF0000FFull, + 0x001F00000000FFFFull + }; + return nbl::hlsl::_static_cast(mask[_n]); +} + template +NBL_CONSTEXPR_FUNC T morton4d_mask(uint16_t _n) +{ + const static uint64_t mask[4] = + { + 0x1111111111111111ull, + 0x0303030303030303ull, + 0x000F000F000F000Full, + 0x000000FF000000FFull + }; + return nbl::hlsl::_static_cast(mask[_n]); +} + +template +inline T morton2d_decode(T x) +{ + x = x & morton2d_mask(0); + x = (x | (x >> 1)) & morton2d_mask(1); + x = (x | (x >> 2)) & morton2d_mask(2); + if (bitDepth > 8u) + { + x = (x | (x >> 4)) & morton2d_mask(3); + } + if (bitDepth > 16u) + { + x = (x | (x >> 8)) & morton2d_mask(4); + } + if (bitDepth > 32u) + { + x = (x | (x >> 16)); + } + return x; +} + +//! 
Puts bits on even positions filling gaps with 0s +template +inline T separate_bits_2d(T x) +{ + if (bitDepth > 32u) + { + x = (x | (x << 16)) & morton2d_mask(4); + } + if (bitDepth > 16u) + { + x = (x | (x << 8)) & morton2d_mask(3); + } + if (bitDepth > 8u) + { + x = (x | (x << 4)) & morton2d_mask(2); + } + x = (x | (x << 2)) & morton2d_mask(1); + x = (x | (x << 1)) & morton2d_mask(0); + + return x; +} +template +inline T separate_bits_3d(T x) +{ + if (bitDepth > 32u) + { + x = (x | (x << 32)) & morton3d_mask(4); + } + if (bitDepth > 16u) + { + x = (x | (x << 16)) & morton3d_mask(3); + } + if (bitDepth > 8u) + { + x = (x | (x << 8)) & morton3d_mask(2); + } + x = (x | (x << 4)) & morton3d_mask(1); + x = (x | (x << 2)) & morton3d_mask(0); + + return x; +} +template +inline T separate_bits_4d(T x) +{ + if (bitDepth > 32u) + { + x = (x | (x << 24)) & morton4d_mask(3); + } + if (bitDepth > 16u) + { + x = (x | (x << 12)) & morton4d_mask(2); + } + if (bitDepth > 8u) + { + x = (x | (x << 6)) & morton4d_mask(1); + } + x = (x | (x << 3)) & morton4d_mask(0); + + return x; +} +} + +template +T morton2d_decode_x(T _morton) { return impl::morton2d_decode(_morton); } +template +T morton2d_decode_y(T _morton) { return impl::morton2d_decode(_morton >> 1); } + +template +T morton2d_encode(T x, T y) { return impl::separate_bits_2d(x) | (impl::separate_bits_2d(y) << 1); } +template +T morton3d_encode(T x, T y, T z) { return impl::separate_bits_3d(x) | (impl::separate_bits_3d(y) << 1) | (impl::separate_bits_3d(z) << 2); } +template +T morton4d_encode(T x, T y, T z, T w) { return impl::separate_bits_4d(x) | (impl::separate_bits_4d(y) << 1) | (impl::separate_bits_4d(z) << 2) | (impl::separate_bits_4d(w) << 3); } + +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl index 2ecb08cdb2..973a313e9c 100644 --- a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl +++ 
b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl @@ -61,37 +61,45 @@ pointer_t copyObject([[vk::ext_reference]] T v); // Here's the thing with atomics, it's not only the data type that dictates whether you can do an atomic or not. // It's the storage class that has the most effect (shared vs storage vs image) and we can't check that easily template // integers operate on 2s complement so same op for signed and unsigned +[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]] [[vk::ext_instruction(spv::OpAtomicIAdd)]] enable_if_t || is_same_v, T> atomicIAdd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); template // DXC Workaround +[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]] [[vk::ext_instruction(spv::OpAtomicIAdd)]] enable_if_t && (is_same_v || is_same_v), T> atomicIAdd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); template // integers operate on 2s complement so same op for signed and unsigned +[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]] [[vk::ext_capability(spv::CapabilityInt64Atomics)]] [[vk::ext_instruction(spv::OpAtomicIAdd)]] enable_if_t || is_same_v, T> atomicIAdd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); template // DXC Workaround +[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]] [[vk::ext_capability(spv::CapabilityInt64Atomics)]] [[vk::ext_instruction(spv::OpAtomicIAdd)]] enable_if_t && (is_same_v || is_same_v), T> atomicIAdd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); template // integers operate on 2s complement so same op for signed and unsigned +[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]] [[vk::ext_instruction(spv::OpAtomicISub)]] enable_if_t || is_same_v, T> atomicISub([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); template // DXC Workaround 
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]] [[vk::ext_instruction(spv::OpAtomicISub)]] enable_if_t && (is_same_v || is_same_v), T> atomicISub(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); template // integers operate on 2s complement so same op for signed and unsigned +[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]] [[vk::ext_capability(spv::CapabilityInt64Atomics)]] [[vk::ext_instruction(spv::OpAtomicISub)]] enable_if_t || is_same_v, T> atomicISub([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); template // DXC Workaround +[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]] [[vk::ext_capability(spv::CapabilityInt64Atomics)]] [[vk::ext_instruction(spv::OpAtomicISub)]] enable_if_t && (is_same_v || is_same_v), T> atomicISub(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl new file mode 100644 index 0000000000..46d241c76c --- /dev/null +++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl @@ -0,0 +1,106 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". 
+// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_ +#define _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_ + +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/type_traits.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace tonemapper +{ + +template +struct Reinhard +{ + using float_t = enable_if_t::value, T>; + using float_t3 = vector; + using this_t = Reinhard; + + static this_t create(float_t EV, float_t key = 0.18f, float_t WhitePointRelToEV = 16.f) + { + this_t retval; + + const float_t unit = 1.0; + retval.keyAndManualLinearExposure = key * exp2(EV); + retval.rcpWhite2 = unit / (WhitePointRelToEV * WhitePointRelToEV); + + return retval; + } + + float_t3 operator()(float_t3 rawCIEXYZcolor) { + const float_t unit = 1.0; + float_t exposureFactors = keyAndManualLinearExposure; + float_t exposedLuma = rawCIEXYZcolor.y * exposureFactors; + float_t colorMultiplier = (exposureFactors * (unit + exposedLuma * rcpWhite2) / (unit + exposedLuma)); + return rawCIEXYZcolor * colorMultiplier; + } + + float_t keyAndManualLinearExposure; + float_t rcpWhite2; +}; + +template +struct ACES +{ + using float_t = enable_if_t::value, T>; + using float_t3 = vector; + using float_t3x3 = matrix; + + using this_t = ACES; + static this_t create(float_t EV, float_t key = 0.18f, float_t Contrast = 1.f) { + this_t retval; + retval.gamma = Contrast; + const float_t reinhardMatchCorrection = 0.77321666f; // middle grays get exposed to different values between tonemappers given the same key + retval.exposure = EV + log2(key * reinhardMatchCorrection); + return retval; + } + + float_t3 operator()(float_t3 rawCIEXYZcolor) { + const float_t unit = 1.0; + float_t3 tonemapped = rawCIEXYZcolor; + if (tonemapped.y > bit_cast(numeric_limits::min)) + tonemapped *= exp2(log2(tonemapped.y) * (gamma - unit) + (exposure) * gamma); + + // XYZ => RRT_SAT + // this seems to be a matrix for some 
hybrid colorspace, coefficients are somewhere inbetween BT2020 and ACEScc(t) + const float_t3x3 XYZ_RRT_Input = float_t3x3( + float_t3(1.594168310, -0.262608051, -0.231993079), + float_t3(-0.6332771780, 1.5840380200, 0.0164147373), + float_t3(0.00892840419, 0.03648501260, 0.87711471300) + ); + + // this is obviously fitted to some particular simulated sensor/film and display + float_t3 v = mul(XYZ_RRT_Input, tonemapped); + float_t3 a = v * (v + promote(0.0245786)) - promote(0.000090537); + float_t3 b = v * (v * promote(0.983729) + promote(0.4329510)) + promote(0.238081); + v = a / b; + + // ODT_SAT => XYZ + // this seems to be a matrix for some hybrid colorspace, coefficients are similar to AdobeRGB,BT2020 and ACEScc(t) + const float_t3x3 ODT_XYZ_Output = float_t3x3( + float_t3(0.624798000, 0.164064825, 0.161605373), + float_t3(0.268048108, 0.674283803, 0.057667464), + float_t3(0.0157514643, 0.0526682511, 1.0204007600) + ); + return mul(ODT_XYZ_Output, v); + } + + float_t gamma; // 1.0 + float_t exposure; // actualExposure+midGrayLog2 +}; + +// ideas for more operators https://web.archive.org/web/20191226154550/http://cs.columbia.edu/CAVE/software/softlib/dorf.php +// or get proper ACES RRT and ODTs +// https://partnerhelp.netflixstudios.com/hc/en-us/articles/360000622487-I-m-using-ACES-Which-Output-Transform-should-I-use- + +} +} +} + +#endif \ No newline at end of file diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index 227f9780ff..fe51f17fbb 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -34,6 +34,11 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/barycentric/utils.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ref.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ptr.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/bda_accessor.hlsl") +# luma metering +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/common.hlsl") 
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/luma_meter.hlsl") +# tonemapper +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tonemapper/operators.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/legacy_bda_accessor.hlsl") # bump mapping LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bump_mapping/fragment.glsl") # TODO: rename to `frag.glsl` @@ -292,6 +297,8 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/equations/quartic.hlsl") #extra math LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/quadrature/gauss_legendre/impl.hlsl") +#morton +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/morton.hlsl") #acceleration structures LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/acceleration_structures.hlsl") #colorspace